diff options
author | Daniel Genrich <daniel.genrich@gmx.net> | 2014-10-23 17:12:28 +0400 |
---|---|---|
committer | Daniel Genrich <daniel.genrich@gmx.net> | 2014-10-23 17:12:28 +0400 |
commit | 9ff1ebed52e0f858a395eeea4caf89304e068b2d (patch) | |
tree | b05d0f4b229de61b088a128ad412dd7bba347928 /intern/cycles | |
parent | a2ed11c6eeab5fab8cb81e32e1c68fdafdd5dbbc (diff) | |
parent | eaaeae469968c5c78a5d7e6d202f1af00b382a79 (diff) |
Merge remote-tracking branch 'origin/master' into soc-2014-fluid
Conflicts:
.gitignore
intern/cycles/CMakeLists.txt
source/blender/blenkernel/intern/smoke.c
source/blender/python/intern/bpy_interface.c
source/creator/CMakeLists.txt
Diffstat (limited to 'intern/cycles')
184 files changed, 8593 insertions, 6415 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt index 504d5a7b831..7de1182282d 100644 --- a/intern/cycles/CMakeLists.txt +++ b/intern/cycles/CMakeLists.txt @@ -14,14 +14,18 @@ include(cmake/external_libs.cmake) # todo: refactor this code to match scons # note: CXX_HAS_SSE is needed in case passing SSE flags fails altogether (gcc-arm) -if(WIN32 AND MSVC) +if(NOT WITH_CPU_SSE) + set(CXX_HAS_SSE FALSE) +elseif(WIN32 AND MSVC) set(CXX_HAS_SSE TRUE) # /arch:AVX for VC2012 and above if(NOT MSVC_VERSION LESS 1700) set(CYCLES_AVX_ARCH_FLAGS "/arch:AVX") + set(CYCLES_AVX2_ARCH_FLAGS "/arch:AVX /arch:AVX2") elseif(NOT CMAKE_CL_64) set(CYCLES_AVX_ARCH_FLAGS "/arch:SSE2") + set(CYCLES_AVX2_ARCH_FLAGS "/arch:SSE2") endif() # there is no /arch:SSE3, but intrinsics are available anyway @@ -30,11 +34,13 @@ if(WIN32 AND MSVC) set(CYCLES_SSE3_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") set(CYCLES_SSE41_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") + set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") else() set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") set(CYCLES_SSE3_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") set(CYCLES_SSE41_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") + set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") @@ -47,7 +53,8 @@ elseif(CMAKE_COMPILER_IS_GNUCC) set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2 -mfpmath=sse") set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse") set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 
-mssse3 -msse4.1 -mfpmath=sse") - set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mfpmath=sse") + set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mfpmath=sse") + set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mfpmath=sse") endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math") elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang") @@ -56,7 +63,8 @@ elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang") set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2") set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3") set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1") - set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1") + set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx") + set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2") endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math") endif() @@ -67,14 +75,16 @@ if(CXX_HAS_SSE) -DWITH_KERNEL_SSE3 -DWITH_KERNEL_SSE41 -DWITH_KERNEL_AVX + -DWITH_KERNEL_AVX2 ) endif() -# for OSL -if(WIN32 AND MSVC) - set(RTTI_DISABLE_FLAGS "/GR- -DBOOST_NO_RTTI -DBOOST_NO_TYPEID") -elseif(CMAKE_COMPILER_IS_GNUCC OR (CMAKE_C_COMPILER_ID MATCHES "Clang")) - set(RTTI_DISABLE_FLAGS "-fno-rtti -DBOOST_NO_RTTI -DBOOST_NO_TYPEID") +if(WITH_CYCLES_OSL) + if(WIN32 AND MSVC) + set(RTTI_DISABLE_FLAGS "/GR- -DBOOST_NO_RTTI -DBOOST_NO_TYPEID") + elseif(CMAKE_COMPILER_IS_GNUCC OR (CMAKE_C_COMPILER_ID MATCHES "Clang")) + set(RTTI_DISABLE_FLAGS "-fno-rtti -DBOOST_NO_RTTI -DBOOST_NO_TYPEID") + endif() endif() # Definitions and Includes @@ -108,7 +118,10 @@ endif() if(WITH_CYCLES_OSL) add_definitions(-DWITH_OSL) add_definitions(-DOSL_STATIC_LIBRARY) - include_directories(${OSL_INCLUDES}) + include_directories( + SYSTEM + ${OSL_INCLUDES} + ) endif() 
add_definitions( @@ -117,6 +130,30 @@ add_definitions( -DWITH_MULTI ) +# Logging capabilities using GLog library. +if(WITH_CYCLES_LOGGING) + add_definitions(-DWITH_CYCLES_LOGGING) + add_definitions(-DGOOGLE_GLOG_DLL_DECL=) + if(WIN32) + include_directories( + SYSTEM + ../../extern/libmv/third_party/glog/src/windows + ../../extern/libmv/third_party/gflags + ) + else() + include_directories( + SYSTEM + ../../extern/libmv/third_party/glog/src + ../../extern/libmv/third_party/gflags + ) + endif() +endif() + +# Debugging capabilities (debug passes etc). +if(WITH_CYCLES_DEBUG) + add_definitions(-DWITH_CYCLES_DEBUG) +endif() + include_directories( SYSTEM ${BOOST_INCLUDE_DIR} @@ -130,7 +167,9 @@ include_directories( # Warnings if(CMAKE_COMPILER_IS_GNUCXX) ADD_CHECK_CXX_COMPILER_FLAG(CMAKE_CXX_FLAGS _has_cxxflag_float_conversion "-Werror=float-conversion") + ADD_CHECK_CXX_COMPILER_FLAG(CMAKE_CXX_FLAGS _has_cxxflag_double_promotion "-Werror=double-promotion") unset(_has_cxxflag_float_conversion) + unset(_has_cxxflag_double_promotion) endif() diff --git a/intern/cycles/SConscript b/intern/cycles/SConscript index 532238b9d7e..b399844534d 100644 --- a/intern/cycles/SConscript +++ b/intern/cycles/SConscript @@ -39,12 +39,13 @@ sources.remove(path.join('kernel', 'kernel_sse2.cpp')) sources.remove(path.join('kernel', 'kernel_sse3.cpp')) sources.remove(path.join('kernel', 'kernel_sse41.cpp')) sources.remove(path.join('kernel', 'kernel_avx.cpp')) +sources.remove(path.join('kernel', 'kernel_avx2.cpp')) incs = [] defs = [] cxxflags = Split(env['CXXFLAGS']) -defs.append('GLEW_STATIC') +defs += env['BF_GL_DEFINITIONS'] defs.append('CCL_NAMESPACE_BEGIN=namespace ccl {') defs.append('CCL_NAMESPACE_END=}') @@ -58,10 +59,18 @@ if env['WITH_BF_CYCLES_OSL']: defs.append('OSL_STATIC_LIBRARY') incs.append(cycles['BF_OSL_INC']) +if env['WITH_BF_CYCLES_DEBUG']: + defs.append('WITH_CYCLES_DEBUG') + incs.extend('. 
bvh render device kernel kernel/osl kernel/svm util subd'.split()) incs.extend('#intern/guardedalloc #source/blender/makesrna #source/blender/makesdna #source/blender/blenlib'.split()) incs.extend('#source/blender/blenloader ../../source/blender/makesrna/intern'.split()) -incs.extend('#extern/glew/include #intern/mikktspace'.split()) + +incs.append(env['BF_GLEW_INC']) +incs.append('#/intern/glew-mx') +incs.append('#intern/mikktspace') +incs.extend('#extern/glew/include #extern/clew/include #extern/cuew/include #intern/mikktspace'.split()) + incs.append(cycles['BF_OIIO_INC']) incs.append(cycles['BF_BOOST_INC']) incs.append(cycles['BF_OPENEXR_INC'].split()) @@ -95,9 +104,10 @@ elif env['OURPLATFORM'] == 'win64-vc': kernel_flags['sse2'] = '-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /GS-' kernel_flags['sse3'] = kernel_flags['sse2'] - if env['MSVC_VERSION'] in ('11.0', '12.0'): + if env['MSVC_VERSION'] >= '12.0': kernel_flags['sse41'] = kernel_flags['sse3'] kernel_flags['avx'] = kernel_flags['sse41'] + ' /arch:AVX' + kernel_flags['avx2'] = kernel_flags['sse41'] + ' /arch:AVX /arch:AVX2' else: # -mavx only available with relatively new gcc/clang kernel_flags['sse2'] = '-ffast-math -msse -msse2 -mfpmath=sse' @@ -106,6 +116,7 @@ else: if (env['C_COMPILER_ID'] == 'gcc' and env['CCVERSION'] >= '4.6') or (env['C_COMPILER_ID'] == 'clang' and env['CCVERSION'] >= '3.1'): kernel_flags['avx'] = kernel_flags['sse41'] + ' -mavx' + kernel_flags['avx2'] = kernel_flags['avx'] + ' -mavx2 -mfma -mlzcnt -mbmi -mbmi2' for kernel_type in kernel_flags.keys(): defs.append('WITH_KERNEL_' + kernel_type.upper()) diff --git a/intern/cycles/app/CMakeLists.txt b/intern/cycles/app/CMakeLists.txt index 52806b0804b..c8464899725 100644 --- a/intern/cycles/app/CMakeLists.txt +++ b/intern/cycles/app/CMakeLists.txt @@ -21,15 +21,20 @@ set(LIBRARIES cycles_util ${BOOST_LIBRARIES} ${OPENEXR_LIBRARIES} - ${OPENGL_LIBRARIES} - ${CYCLES_GLEW_LIBRARY} + ${BLENDER_GL_LIBRARIES} + bf_intern_glew_mx + 
${CYCLES_APP_GLEW_LIBRARY} ${OPENIMAGEIO_LIBRARIES} ${PNG_LIBRARIES} ${JPEG_LIBRARIES} ${ZLIB_LIBRARIES} ${TIFF_LIBRARY} + extern_clew + extern_cuew ) +add_definitions(${GL_DEFINITIONS}) + if(WIN32) list(APPEND LIBRARIES ${PTHREADS_LIBRARIES}) endif() diff --git a/intern/cycles/app/cycles_standalone.cpp b/intern/cycles/app/cycles_standalone.cpp index 7ea1ca2d8fb..90333eb3fc5 100644 --- a/intern/cycles/app/cycles_standalone.cpp +++ b/intern/cycles/app/cycles_standalone.cpp @@ -373,9 +373,9 @@ static void options_parse(int argc, const char **argv) } if(ssname == "osl") - options.scene_params.shadingsystem = SceneParams::OSL; + options.scene_params.shadingsystem = SHADINGSYSTEM_OSL; else if(ssname == "svm") - options.scene_params.shadingsystem = SceneParams::SVM; + options.scene_params.shadingsystem = SHADINGSYSTEM_SVM; #ifndef WITH_CYCLES_STANDALONE_GUI options.session_params.background = true; @@ -408,7 +408,7 @@ static void options_parse(int argc, const char **argv) fprintf(stderr, "Unknown shading system: %s\n", ssname.c_str()); exit(EXIT_FAILURE); } - else if(options.scene_params.shadingsystem == SceneParams::OSL && options.session_params.device.type != DEVICE_CPU) { + else if(options.scene_params.shadingsystem == SHADINGSYSTEM_OSL && options.session_params.device.type != DEVICE_CPU) { fprintf(stderr, "OSL shading system only works with CPU device\n"); exit(EXIT_FAILURE); } diff --git a/intern/cycles/app/cycles_xml.cpp b/intern/cycles/app/cycles_xml.cpp index d5ef30e5c6f..431796e106b 100644 --- a/intern/cycles/app/cycles_xml.cpp +++ b/intern/cycles/app/cycles_xml.cpp @@ -304,7 +304,8 @@ static void xml_read_integrator(const XMLReadState& state, pugi::xml_node node) xml_read_int(&integrator->volume_max_steps, node, "volume_max_steps"); /* Various Settings */ - xml_read_bool(&integrator->no_caustics, node, "no_caustics"); + xml_read_bool(&integrator->caustics_reflective, node, "caustics_reflective"); + xml_read_bool(&integrator->caustics_refractive, node, 
"caustics_refractive"); xml_read_float(&integrator->filter_glossy, node, "filter_glossy"); xml_read_int(&integrator->seed, node, "seed"); @@ -329,6 +330,7 @@ static void xml_read_camera(const XMLReadState& state, pugi::xml_node node) xml_read_float(&cam->aperturesize, node, "aperturesize"); // 0.5*focallength/fstop xml_read_float(&cam->focaldistance, node, "focaldistance"); xml_read_float(&cam->shuttertime, node, "shuttertime"); + xml_read_float(&cam->aperture_ratio, node, "aperture_ratio"); if(xml_equal_string(node, "type", "orthographic")) cam->type = CAMERA_ORTHOGRAPHIC; @@ -509,8 +511,10 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug else if(string_iequals(node.name(), "mapping")) { snode = new MappingNode(); } - else if(string_iequals(node.name(), "ward_bsdf")) { - snode = new WardBsdfNode(); + else if(string_iequals(node.name(), "anisotropic_bsdf")) { + AnisotropicBsdfNode *aniso = new AnisotropicBsdfNode(); + xml_read_enum(&aniso->distribution, AnisotropicBsdfNode::distribution_enum, node, "distribution"); + snode = aniso; } else if(string_iequals(node.name(), "diffuse_bsdf")) { snode = new DiffuseBsdfNode(); @@ -550,9 +554,7 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug snode = hair; } else if(string_iequals(node.name(), "emission")) { - EmissionNode *emission = new EmissionNode(); - xml_read_bool(&emission->total_power, node, "total_power"); - snode = emission; + snode = new EmissionNode(); } else if(string_iequals(node.name(), "ambient_occlusion")) { snode = new AmbientOcclusionNode(); @@ -635,6 +637,12 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug else if(string_iequals(node.name(), "separate_hsv")) { snode = new SeparateHSVNode(); } + else if(string_iequals(node.name(), "combine_xyz")) { + snode = new CombineHSVNode(); + } + else if(string_iequals(node.name(), "separate_xyz")) { + snode = new SeparateHSVNode(); + } else 
if(string_iequals(node.name(), "hsv")) { snode = new HSVNode(); } diff --git a/intern/cycles/app/io_export_cycles_xml.py b/intern/cycles/app/io_export_cycles_xml.py index e310d928b26..ad8fb9d3dd3 100644 --- a/intern/cycles/app/io_export_cycles_xml.py +++ b/intern/cycles/app/io_export_cycles_xml.py @@ -111,19 +111,29 @@ class ExportCyclesXML(bpy.types.Operator, ExportHelper): # generate mesh node nverts = "" verts = "" + uvs = "" P = "" for v in mesh.vertices: P += "%f %f %f " % (v.co[0], v.co[1], v.co[2]) - for i, f in enumerate(mesh.tessfaces): - nverts += str(len(f.vertices)) + " " + verts_and_uvs = zip(mesh.tessfaces, mesh.tessface_uv_textures.active.data) + + for f, uvf in verts_and_uvs: + vcount = len(f.vertices) + nverts += str(vcount) + " " for v in f.vertices: verts += str(v) + " " - verts += " " - - node = etree.Element('mesh', attrib={'nverts': nverts, 'verts': verts, 'P': P}) + + uvs += str(uvf.uv1[0]) + " " + str(uvf.uv1[1]) + " " + uvs += str(uvf.uv2[0]) + " " + str(uvf.uv2[1]) + " " + uvs += str(uvf.uv3[0]) + " " + str(uvf.uv3[1]) + " " + if vcount==4: + uvs += " " + str(uvf.uv4[0]) + " " + str(uvf.uv4[1]) + " " + + + node = etree.Element('mesh', attrib={'nverts': nverts.strip(), 'verts': verts.strip(), 'P': P, 'UV' : uvs.strip()}) # write to file write(node, filepath) @@ -139,3 +149,4 @@ def unregister(): if __name__ == "__main__": register() + diff --git a/intern/cycles/blender/CCL_api.h b/intern/cycles/blender/CCL_api.h index 2772b9ac8a7..cfd0c3ef264 100644 --- a/intern/cycles/blender/CCL_api.h +++ b/intern/cycles/blender/CCL_api.h @@ -36,6 +36,10 @@ CCLDeviceInfo *CCL_compute_device_list(int device_type); void *CCL_python_module_init(void); +void CCL_init_logging(const char *argv0); +void CCL_start_debug_logging(void); +void CCL_logging_verbosity_set(int verbosity); + #ifdef __cplusplus } #endif diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt index 9a60152841e..e1d592d32b4 100644 --- 
a/intern/cycles/blender/CMakeLists.txt +++ b/intern/cycles/blender/CMakeLists.txt @@ -6,6 +6,7 @@ set(INC ../kernel/svm ../util ../subd + ../../glew-mx ../../guardedalloc ../../mikktspace ../../../source/blender/makesdna @@ -25,6 +26,7 @@ set(SRC blender_object.cpp blender_particles.cpp blender_curves.cpp + blender_logging.cpp blender_python.cpp blender_session.cpp blender_shader.cpp @@ -43,9 +45,10 @@ set(ADDON_FILES addon/presets.py addon/properties.py addon/ui.py + addon/version_update.py ) -add_definitions(-DGLEW_STATIC) +add_definitions(${GL_DEFINITIONS}) blender_add_lib(bf_intern_cycles "${SRC}" "${INC}" "${INC_SYS}") diff --git a/intern/cycles/blender/addon/__init__.py b/intern/cycles/blender/addon/__init__.py index 27d986900c8..8c60ea31053 100644 --- a/intern/cycles/blender/addon/__init__.py +++ b/intern/cycles/blender/addon/__init__.py @@ -31,6 +31,7 @@ bl_info = { import bpy from . import engine +from . import version_update class CyclesRender(bpy.types.RenderEngine): @@ -100,12 +101,16 @@ def register(): presets.register() bpy.utils.register_module(__name__) + bpy.app.handlers.version_update.append(version_update.do_versions) + def unregister(): from . import ui from . import properties from . 
import presets + bpy.app.handlers.version_update.remove(version_update.do_versions) + ui.unregister() properties.unregister() presets.unregister() diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py index 25a9e97a99b..18235eca790 100644 --- a/intern/cycles/blender/addon/engine.py +++ b/intern/cycles/blender/addon/engine.py @@ -28,7 +28,7 @@ def init(): _cycles.init(path, user_path) -def create(engine, data, scene, region=0, v3d=0, rv3d=0, preview_osl=False): +def create(engine, data, scene, region=None, v3d=None, rv3d=None, preview_osl=False): import bpy import _cycles @@ -65,6 +65,7 @@ def bake(engine, obj, pass_type, pixel_array, num_pixels, depth, result): if session is not None: _cycles.bake(engine.session, obj.as_pointer(), pass_type, pixel_array.as_pointer(), num_pixels, depth, result.as_pointer()) + def reset(engine, data, scene): import _cycles data = data.as_pointer() diff --git a/intern/cycles/blender/addon/presets.py b/intern/cycles/blender/addon/presets.py index 9991fdb8e3b..2ec65d7183a 100644 --- a/intern/cycles/blender/addon/presets.py +++ b/intern/cycles/blender/addon/presets.py @@ -33,12 +33,16 @@ class AddPresetIntegrator(AddPresetBase, Operator): preset_values = [ "cycles.max_bounces", "cycles.min_bounces", - "cycles.no_caustics", "cycles.diffuse_bounces", "cycles.glossy_bounces", "cycles.transmission_bounces", + "cycles.volume_bounces", "cycles.transparent_min_bounces", - "cycles.transparent_max_bounces" + "cycles.transparent_max_bounces", + "cycles.use_transparent_shadows", + "cycles.caustics_reflective", + "cycles.caustics_refractive", + "cycles.blur_glossy" ] preset_subdir = "cycles/integrator" @@ -66,10 +70,13 @@ class AddPresetSampling(AddPresetBase, Operator): "cycles.mesh_light_samples", "cycles.subsurface_samples", "cycles.volume_samples", - "cycles.no_caustics", - "cycles.blur_glossy", "cycles.use_square_samples", - "cycles.progressive" + "cycles.progressive", + "cycles.seed", + 
"cycles.sample_clamp_direct", + "cycles.sample_clamp_indirect", + "cycles.sample_all_lights_direct", + "cycles.sample_all_lights_indirect", ] preset_subdir = "cycles/sampling" diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index 7205a272395..05a6f70d423 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -37,7 +37,7 @@ if _cycles.with_network: enum_feature_set = ( ('SUPPORTED', "Supported", "Only use finished and supported features"), - ('EXPERIMENTAL', "Experimental", "Use experimental and incomplete features that might be broken or change in the future"), + ('EXPERIMENTAL', "Experimental", "Use experimental and incomplete features that might be broken or change in the future", 'ERROR', 1), ) enum_displacement_methods = ( @@ -108,9 +108,15 @@ enum_integrator = ( ('PATH', "Path Tracing", "Pure path tracing integrator"), ) -enum_volume_homogeneous_sampling = ( - ('DISTANCE', "Distance", "Use Distance Sampling"), - ('EQUI_ANGULAR', "Equi-angular", "Use Equi-angular Sampling"), +enum_volume_sampling = ( + ('DISTANCE', "Distance", "Use distance sampling, best for dense volumes with lights far away"), + ('EQUIANGULAR', "Equiangular", "Use equiangular sampling, best for volumes with low density with light inside or near the volume"), + ('MULTIPLE_IMPORTANCE', "Multiple Importance", "Combine distance and equi-angular sampling for volumes where neither method is ideal"), + ) + +enum_volume_interpolation = ( + ('LINEAR', "Linear", "Good smoothness and speed"), + ('CUBIC', 'Cubic', 'Smoothed high quality interpolation, but slower') ) @@ -146,13 +152,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): default='PATH', ) - cls.volume_homogeneous_sampling = EnumProperty( - name="Homogeneous Sampling", - description="Sampling method to use for homogeneous volumes", - items=enum_volume_homogeneous_sampling, - default='DISTANCE', - ) - cls.use_square_samples = 
BoolProperty( name="Square Samples", description="Square sampling values for easier artist control", @@ -236,7 +235,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): name="Volume Samples", description="Number of volume scattering samples to render for each AA sample", min=1, max=10000, - default=1, + default=0, ) cls.sampling_pattern = EnumProperty( @@ -265,11 +264,18 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): default=True, ) - cls.no_caustics = BoolProperty( - name="No Caustics", - description="Leave out caustics, resulting in a darker image with less noise", - default=False, + cls.caustics_reflective = BoolProperty( + name="Reflective Caustics", + description="Use reflective caustics, resulting in a brighter image (more noise but added realism)", + default=True, ) + + cls.caustics_refractive = BoolProperty( + name="Refractive Caustics", + description="Use refractive caustics, resulting in a brighter image (more noise but added realism)", + default=True, + ) + cls.blur_glossy = FloatProperty( name="Filter Glossy", description="Adaptively blur glossy shaders after blurry bounces, " @@ -315,7 +321,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): name="Volume Bounces", description="Maximum number of volumetric scattering events", min=0, max=1024, - default=1, + default=0, ) cls.transparent_min_bounces = IntProperty( @@ -550,6 +556,13 @@ class CyclesCameraSettings(bpy.types.PropertyGroup): subtype='ANGLE', default=0, ) + cls.aperture_ratio = FloatProperty( + name="Aperture Ratio", + description="Distortion to simulate anamorphic lens bokeh", + min=0.01, soft_min=1.0, soft_max=2.0, + default=1.0, + precision=4, + ) cls.panorama_type = EnumProperty( name="Panorama Type", description="Distortion to use for the calculation", @@ -602,6 +615,19 @@ class CyclesMaterialSettings(bpy.types.PropertyGroup): "(not using any textures), for faster rendering", default=False, ) + cls.volume_sampling = EnumProperty( + name="Volume Sampling", + 
description="Sampling method to use for volumes", + items=enum_volume_sampling, + default='DISTANCE', + ) + + cls.volume_interpolation = EnumProperty( + name="Volume Interpolation", + description="Interpolation method to use for volumes", + items=enum_volume_interpolation, + default='LINEAR', + ) @classmethod def unregister(cls): @@ -672,6 +698,19 @@ class CyclesWorldSettings(bpy.types.PropertyGroup): "(not using any textures), for faster rendering", default=False, ) + cls.volume_sampling = EnumProperty( + name="Volume Sampling", + description="Sampling method to use for volumes", + items=enum_volume_sampling, + default='EQUIANGULAR', + ) + + cls.volume_interpolation = EnumProperty( + name="Volume Interpolation", + description="Interpolation method to use for volumes", + items=enum_volume_interpolation, + default='LINEAR', + ) @classmethod def unregister(cls): @@ -718,6 +757,11 @@ class CyclesVisibilitySettings(bpy.types.PropertyGroup): description="Object visibility for shadow rays", default=True, ) + cls.scatter = BoolProperty( + name="Volume Scatter", + description="Object visibility for volume scatter rays", + default=True, + ) @classmethod def unregister(cls): diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index 5c8115b6612..6a08b47b01f 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -154,7 +154,7 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel): sub.prop(cscene, "subsurface_samples", text="Subsurface") sub.prop(cscene, "volume_samples", text="Volume") - if cscene.feature_set == 'EXPERIMENTAL' and use_cpu(context): + if use_cpu(context) or cscene.feature_set == 'EXPERIMENTAL': layout.row().prop(cscene, "sampling_pattern", text="Pattern") for rl in scene.render.layers: @@ -176,16 +176,11 @@ class CyclesRender_PT_volume_sampling(CyclesButtonsPanel, Panel): scene = context.scene cscene = scene.cycles - split = layout.split(align=True) - - sub = split.column(align=True) - 
sub.label("Heterogeneous:") - sub.prop(cscene, "volume_step_size") - sub.prop(cscene, "volume_max_steps") - - sub = split.column(align=True) - sub.label("Homogeneous:") - sub.prop(cscene, "volume_homogeneous_sampling", text="") + row = layout.row() + row.label("Heterogeneous:") + row = layout.row() + row.prop(cscene, "volume_step_size") + row.prop(cscene, "volume_max_steps") class CyclesRender_PT_light_paths(CyclesButtonsPanel, Panel): @@ -215,7 +210,8 @@ class CyclesRender_PT_light_paths(CyclesButtonsPanel, Panel): col.separator() - col.prop(cscene, "no_caustics") + col.prop(cscene, "caustics_reflective") + col.prop(cscene, "caustics_refractive") col.prop(cscene, "blur_glossy") col = split.column() @@ -473,6 +469,7 @@ class CyclesCamera_PT_dof(CyclesButtonsPanel, Panel): sub = col.column(align=True) sub.prop(ccam, "aperture_blades", text="Blades") sub.prop(ccam, "aperture_rotation", text="Rotation") + sub.prop(ccam, "aperture_ratio", text="Ratio") class Cycles_PT_context_material(CyclesButtonsPanel, Panel): @@ -570,8 +567,7 @@ class CyclesObject_PT_motion_blur(CyclesButtonsPanel, Panel): layout = self.layout rd = context.scene.render - scene = context.scene - # cscene = scene.cycles + # scene = context.scene layout.active = rd.use_motion_blur @@ -584,8 +580,7 @@ class CyclesObject_PT_motion_blur(CyclesButtonsPanel, Panel): layout = self.layout rd = context.scene.render - scene = context.scene - # cscene = scene.cycles + # scene = context.scene ob = context.object cob = ob.cycles @@ -624,6 +619,7 @@ class CyclesObject_PT_ray_visibility(CyclesButtonsPanel, Panel): flow.prop(visibility, "diffuse") flow.prop(visibility, "glossy") flow.prop(visibility, "transmission") + flow.prop(visibility, "scatter") if ob.type != 'LAMP': flow.prop(visibility, "shadow") @@ -636,7 +632,8 @@ class CYCLES_OT_use_shading_nodes(Operator): @classmethod def poll(cls, context): - return context.material or context.world or context.lamp + return (getattr(context, "material", False) or 
getattr(context, "world", False) or + getattr(context, "lamp", False)) def execute(self, context): if context.material: @@ -829,8 +826,6 @@ class CyclesWorld_PT_volume(CyclesButtonsPanel, Panel): world = context.world panel_node_draw(layout, world, 'OUTPUT_WORLD', 'Volume') - layout.prop(world.cycles, "homogeneous_volume") - class CyclesWorld_PT_ambient_occlusion(CyclesButtonsPanel, Panel): bl_label = "Ambient Occlusion" @@ -904,6 +899,7 @@ class CyclesWorld_PT_ray_visibility(CyclesButtonsPanel, Panel): flow.prop(visibility, "diffuse") flow.prop(visibility, "glossy") flow.prop(visibility, "transmission") + flow.prop(visibility, "scatter") class CyclesWorld_PT_settings(CyclesButtonsPanel, Panel): @@ -922,15 +918,27 @@ class CyclesWorld_PT_settings(CyclesButtonsPanel, Panel): cworld = world.cycles cscene = context.scene.cycles - col = layout.column() + split = layout.split() - col.prop(cworld, "sample_as_light") - sub = col.row(align=True) + col = split.column() + + col.label(text="Surface:") + col.prop(cworld, "sample_as_light", text="Multiple Importance") + + sub = col.column(align=True) sub.active = cworld.sample_as_light sub.prop(cworld, "sample_map_resolution") if cscene.progressive == 'BRANCHED_PATH': sub.prop(cworld, "samples") + col = split.column() + col.label(text="Volume:") + sub = col.column() + sub.active = use_cpu(context) + sub.prop(cworld, "volume_sampling", text="") + sub.prop(cworld, "volume_interpolation", text="") + col.prop(cworld, "homogeneous_volume", text="Homogeneous") + class CyclesMaterial_PT_preview(CyclesButtonsPanel, Panel): bl_label = "Preview" @@ -975,12 +983,10 @@ class CyclesMaterial_PT_volume(CyclesButtonsPanel, Panel): layout = self.layout mat = context.material - cmat = mat.cycles + # cmat = mat.cycles panel_node_draw(layout, mat, 'OUTPUT_MATERIAL', 'Volume') - layout.prop(cmat, "homogeneous_volume") - class CyclesMaterial_PT_displacement(CyclesButtonsPanel, Panel): bl_label = "Displacement" @@ -1023,10 +1029,21 @@ class 
CyclesMaterial_PT_settings(CyclesButtonsPanel, Panel): col.label() col.prop(mat, "pass_index") - col = layout.column() - col.prop(cmat, "sample_as_light") + split = layout.split() + + col = split.column() + col.label(text="Surface:") + col.prop(cmat, "sample_as_light", text="Multiple Importance") col.prop(cmat, "use_transparent_shadow") + col = split.column() + col.label(text="Volume:") + sub = col.column() + sub.active = use_cpu(context) + sub.prop(cmat, "volume_sampling", text="") + col.prop(cmat, "volume_interpolation", text="") + col.prop(cmat, "homogeneous_volume", text="Homogeneous") + class CyclesTexture_PT_context(CyclesButtonsPanel, Panel): bl_label = "" @@ -1194,8 +1211,6 @@ class CyclesRender_PT_CurveRendering(CyclesButtonsPanel, Panel): @classmethod def poll(cls, context): - scene = context.scene - # cscene = scene.cycles psys = context.particle_system return CyclesButtonsPanel.poll(context) and psys and psys.settings.type == 'HAIR' @@ -1238,38 +1253,39 @@ class CyclesRender_PT_bake(CyclesButtonsPanel, Panel): scene = context.scene cscene = scene.cycles - cbk = scene.render.bake - layout.operator("object.bake", icon='RENDER_STILL').type = \ - cscene.bake_type + layout.operator("object.bake", icon='RENDER_STILL').type = cscene.bake_type col = layout.column() col.prop(cscene, "bake_type") - col.separator() - split = layout.split() - sub = split.column() - sub.prop(cbk, "use_clear") - sub.prop(cbk, "margin") + split = layout.split() - sub = split.column() - sub.prop(cbk, "use_selected_to_active") - sub = sub.column() + col = split.column() + col.prop(cbk, "margin") + col.prop(cbk, "use_clear") + col = split.column() + col.prop(cbk, "use_selected_to_active") + sub = col.column() sub.active = cbk.use_selected_to_active - sub.prop(cbk, "cage_extrusion", text="Distance") - sub.prop_search(cbk, "cage", scene, "objects") + sub.prop(cbk, "use_cage", text="Cage") + if cbk.use_cage: + sub.prop(cbk, "cage_extrusion", text="Extrusion") + sub.prop_search(cbk, 
"cage_object", scene, "objects", text="") + else: + sub.prop(cbk, "cage_extrusion", text="Ray Distance") if cscene.bake_type == 'NORMAL': - col.separator() - box = col.box() + layout.separator() + box = layout.box() box.label(text="Normal Settings:") box.prop(cbk, "normal_space", text="Space") row = box.row(align=True) - row.label(text = "Swizzle:") + row.label(text="Swizzle:") row.prop(cbk, "normal_r", text="") row.prop(cbk, "normal_g", text="") row.prop(cbk, "normal_b", text="") @@ -1282,7 +1298,6 @@ class CyclesParticle_PT_CurveSettings(CyclesButtonsPanel, Panel): @classmethod def poll(cls, context): scene = context.scene - # cscene = scene.cycles ccscene = scene.cycles_curves psys = context.particle_system use_curves = ccscene.use_curves and psys @@ -1368,7 +1383,11 @@ def get_panels(): "RENDER_PT_encoding", "RENDER_PT_dimensions", "RENDER_PT_stamp", + "RENDER_PT_freestyle", "RENDERLAYER_PT_layers", + "RENDERLAYER_PT_freestyle", + "RENDERLAYER_PT_freestyle_lineset", + "RENDERLAYER_PT_freestyle_linestyle", "SCENE_PT_scene", "SCENE_PT_color_management", "SCENE_PT_custom_props", @@ -1406,6 +1425,7 @@ def get_panels(): "DATA_PT_custom_props_curve", "DATA_PT_custom_props_lattice", "DATA_PT_custom_props_metaball", + "TEXTURE_PT_preview", "TEXTURE_PT_custom_props", "TEXTURE_PT_clouds", "TEXTURE_PT_wood", @@ -1423,6 +1443,7 @@ def get_panels(): "TEXTURE_PT_pointdensity", "TEXTURE_PT_pointdensity_turbulence", "TEXTURE_PT_mapping", + "TEXTURE_PT_ocean", "TEXTURE_PT_influence", "TEXTURE_PT_colors", "PARTICLE_PT_context_particles", @@ -1444,6 +1465,7 @@ def get_panels(): "PARTICLE_PT_force_fields", "PARTICLE_PT_vertexgroups", "MATERIAL_PT_custom_props", + "MATERIAL_PT_freestyle_line", "BONE_PT_custom_props", "OBJECT_PT_custom_props", ] diff --git a/intern/cycles/blender/addon/version_update.py b/intern/cycles/blender/addon/version_update.py new file mode 100644 index 00000000000..eaeec703ff5 --- /dev/null +++ b/intern/cycles/blender/addon/version_update.py @@ -0,0 +1,59 @@ 
+# +# Copyright 2011-2014 Blender Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License +# + +# <pep8 compliant> + +import bpy + +from bpy.app.handlers import persistent + + +@persistent +def do_versions(self): + # We don't modify startup file because it assumes to + # have all the default values only. + if not bpy.data.is_saved: + return + + # Clamp Direct/Indirect separation in 270 + if bpy.data.version <= (2, 70, 0): + for scene in bpy.data.scenes: + cscene = scene.cycles + sample_clamp = cscene.get("sample_clamp", False) + if (sample_clamp and + not cscene.is_property_set("sample_clamp_direct") and + not cscene.is_property_set("sample_clamp_indirect")): + + cscene.sample_clamp_direct = sample_clamp + cscene.sample_clamp_indirect = sample_clamp + + # Change of Volume Bounces in 271 + if bpy.data.version <= (2, 71, 0): + for scene in bpy.data.scenes: + cscene = scene.cycles + if not cscene.is_property_set("volume_bounces"): + cscene.volume_bounces = 1 + + # Caustics Reflective/Refractive separation in 272 + if bpy.data.version <= (2, 72, 0): + for scene in bpy.data.scenes: + cscene = scene.cycles + if (cscene.get("no_caustics", False) and + not cscene.is_property_set("caustics_reflective") and + not cscene.is_property_set("caustics_refractive")): + + cscene.caustics_reflective = False + cscene.caustics_refractive = False diff --git a/intern/cycles/blender/blender_camera.cpp b/intern/cycles/blender/blender_camera.cpp index 1a85561c6d5..ce8c64c4819 100644 --- 
a/intern/cycles/blender/blender_camera.cpp +++ b/intern/cycles/blender/blender_camera.cpp @@ -46,6 +46,8 @@ struct BlenderCamera { float2 pixelaspect; + float aperture_ratio; + PanoramaType panorama_type; float fisheye_fov; float fisheye_lens; @@ -167,6 +169,7 @@ static void blender_camera_from_object(BlenderCamera *bcam, BL::Object b_ob, boo bcam->apertureblades = RNA_int_get(&ccamera, "aperture_blades"); bcam->aperturerotation = RNA_float_get(&ccamera, "aperture_rotation"); bcam->focaldistance = blender_camera_focal_distance(b_ob, b_camera); + bcam->aperture_ratio = RNA_float_get(&ccamera, "aperture_ratio"); bcam->shift.x = b_camera.shift_x(); bcam->shift.y = b_camera.shift_y(); @@ -328,6 +331,9 @@ static void blender_camera_sync(Camera *cam, BlenderCamera *bcam, int width, int cam->fisheye_fov = bcam->fisheye_fov; cam->fisheye_lens = bcam->fisheye_lens; + /* anamorphic lens bokeh */ + cam->aperture_ratio = bcam->aperture_ratio; + /* perspective */ cam->fov = 2.0f * atanf((0.5f * sensor_size) / bcam->lens / aspectratio); cam->focaldistance = bcam->focaldistance; diff --git a/intern/cycles/blender/blender_curves.cpp b/intern/cycles/blender/blender_curves.cpp index 22de7b64273..8cfaea59a06 100644 --- a/intern/cycles/blender/blender_curves.cpp +++ b/intern/cycles/blender/blender_curves.cpp @@ -15,10 +15,11 @@ */ #include "attribute.h" +#include "camera.h" +#include "curves.h" #include "mesh.h" #include "object.h" #include "scene.h" -#include "curves.h" #include "blender_sync.h" #include "blender_util.h" @@ -39,10 +40,11 @@ bool ObtainCacheParticleUV(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, Parti bool ObtainCacheParticleVcol(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, ParticleCurveData *CData, bool background, int vcol_num); bool ObtainCacheParticleData(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, ParticleCurveData *CData, bool background); void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CData); -void ExportCurveTrianglePlanes(Mesh 
*mesh, ParticleCurveData *CData, float3 RotCam); +void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData, + float3 RotCam, bool is_ortho); void ExportCurveTriangleGeometry(Mesh *mesh, ParticleCurveData *CData, int resolution); void ExportCurveTriangleUV(Mesh *mesh, ParticleCurveData *CData, int vert_offset, int resol, float3 *uvdata); -void ExportCurveTriangleVcol(Mesh *mesh, ParticleCurveData *CData, int vert_offset, int resol, float3 *fdata); +void ExportCurveTriangleVcol(Mesh *mesh, ParticleCurveData *CData, int vert_offset, int resol, uchar4 *cdata); ParticleCurveData::ParticleCurveData() { @@ -328,7 +330,8 @@ static void set_resolution(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, BL::S } } -void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData, float3 RotCam) +void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData, + float3 RotCam, bool is_ortho) { int vertexno = mesh->verts.size(); int vertexindex = vertexno; @@ -362,7 +365,10 @@ void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData, float3 RotC float3 ickey_loc = CData->curvekey_co[CData->curve_firstkey[curve]]; float radius = shaperadius(CData->psys_shape[sys], CData->psys_rootradius[sys], CData->psys_tipradius[sys], 0.0f); v1 = CData->curvekey_co[CData->curve_firstkey[curve] + 1] - CData->curvekey_co[CData->curve_firstkey[curve]]; - xbasis = normalize(cross(RotCam - ickey_loc,v1)); + if(is_ortho) + xbasis = normalize(cross(RotCam, v1)); + else + xbasis = normalize(cross(RotCam - ickey_loc, v1)); float3 ickey_loc_shfl = ickey_loc - radius * xbasis; float3 ickey_loc_shfr = ickey_loc + radius * xbasis; mesh->verts.push_back(ickey_loc_shfl); @@ -386,7 +392,10 @@ void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData, float3 RotC if(CData->psys_closetip[sys] && (curvekey == CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1)) radius = shaperadius(CData->psys_shape[sys], CData->psys_rootradius[sys], 0.0f, 0.95f); - xbasis = 
normalize(cross(RotCam - ickey_loc,v1)); + if(is_ortho) + xbasis = normalize(cross(RotCam, v1)); + else + xbasis = normalize(cross(RotCam - ickey_loc, v1)); float3 ickey_loc_shfl = ickey_loc - radius * xbasis; float3 ickey_loc_shfr = ickey_loc + radius * xbasis; mesh->verts.push_back(ickey_loc_shfl); @@ -726,9 +735,9 @@ void ExportCurveTriangleUV(Mesh *mesh, ParticleCurveData *CData, int vert_offset } } -void ExportCurveTriangleVcol(Mesh *mesh, ParticleCurveData *CData, int vert_offset, int resol, float3 *fdata) +void ExportCurveTriangleVcol(Mesh *mesh, ParticleCurveData *CData, int vert_offset, int resol, uchar4 *cdata) { - if(fdata == NULL) + if(cdata == NULL) return; int vertexindex = vert_offset; @@ -740,17 +749,17 @@ void ExportCurveTriangleVcol(Mesh *mesh, ParticleCurveData *CData, int vert_offs for(int curvekey = CData->curve_firstkey[curve]; curvekey < CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1; curvekey++) { for(int section = 0; section < resol; section++) { - fdata[vertexindex] = color_srgb_to_scene_linear(CData->curve_vcol[curve]); + cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve])); vertexindex++; - fdata[vertexindex] = color_srgb_to_scene_linear(CData->curve_vcol[curve]); + cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve])); vertexindex++; - fdata[vertexindex] = color_srgb_to_scene_linear(CData->curve_vcol[curve]); + cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve])); vertexindex++; - fdata[vertexindex] = color_srgb_to_scene_linear(CData->curve_vcol[curve]); + cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve])); vertexindex++; - fdata[vertexindex] = color_srgb_to_scene_linear(CData->curve_vcol[curve]); + cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve])); vertexindex++; - fdata[vertexindex] = 
color_srgb_to_scene_linear(CData->curve_vcol[curve]); + cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve])); vertexindex++; } } @@ -858,20 +867,26 @@ void BlenderSync::sync_curves(Mesh *mesh, BL::Mesh b_mesh, BL::Object b_ob, bool ObtainCacheParticleData(mesh, &b_mesh, &b_ob, &CData, !preview); - /* obtain camera parameters */ - BL::Object b_CamOb = b_scene.camera(); - float3 RotCam = make_float3(0.0f, 0.0f, 0.0f); - if(b_CamOb) { - Transform ctfm = get_transform(b_CamOb.matrix_world()); - Transform tfm = get_transform(b_ob.matrix_world()); - Transform itfm = transform_quick_inverse(tfm); - RotCam = transform_point(&itfm, make_float3(ctfm.x.w, ctfm.y.w, ctfm.z.w)); - } - /* add hair geometry to mesh */ if(primitive == CURVE_TRIANGLES) { - if(triangle_method == CURVE_CAMERA_TRIANGLES) - ExportCurveTrianglePlanes(mesh, &CData, RotCam); + if(triangle_method == CURVE_CAMERA_TRIANGLES) { + /* obtain camera parameters */ + float3 RotCam; + Camera *camera = scene->camera; + Transform &ctfm = camera->matrix; + if(camera->type == CAMERA_ORTHOGRAPHIC) { + RotCam = -make_float3(ctfm.x.z, ctfm.y.z, ctfm.z.z); + } + else { + Transform tfm = get_transform(b_ob.matrix_world()); + Transform itfm = transform_quick_inverse(tfm); + RotCam = transform_point(&itfm, make_float3(ctfm.x.w, + ctfm.y.w, + ctfm.z.w)); + } + bool is_ortho = camera->type == CAMERA_ORTHOGRAPHIC; + ExportCurveTrianglePlanes(mesh, &CData, RotCam, is_ortho); + } else { ExportCurveTriangleGeometry(mesh, &CData, resolution); used_res = resolution; @@ -923,13 +938,12 @@ void BlenderSync::sync_curves(Mesh *mesh, BL::Mesh b_mesh, BL::Object b_ob, bool ObtainCacheParticleVcol(mesh, &b_mesh, &b_ob, &CData, !preview, vcol_num); if(primitive == CURVE_TRIANGLES) { - Attribute *attr_vcol = mesh->attributes.add( - ustring(l->name().c_str()), TypeDesc::TypeColor, ATTR_ELEMENT_CORNER); + ustring(l->name().c_str()), TypeDesc::TypeColor, ATTR_ELEMENT_CORNER_BYTE); - float3 *fdata = 
attr_vcol->data_float3(); + uchar4 *cdata = attr_vcol->data_uchar4(); - ExportCurveTriangleVcol(mesh, &CData, tri_num * 3, used_res, fdata); + ExportCurveTriangleVcol(mesh, &CData, tri_num * 3, used_res, cdata); } else { Attribute *attr_vcol = mesh->curve_attributes.add( diff --git a/intern/cycles/blender/blender_logging.cpp b/intern/cycles/blender/blender_logging.cpp new file mode 100644 index 00000000000..d3f1accf099 --- /dev/null +++ b/intern/cycles/blender/blender_logging.cpp @@ -0,0 +1,65 @@ +/* + * Copyright 2011-2014 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License + */ + +#include "CCL_api.h" + +#include <stdio.h> + +#include "util_logging.h" + +#ifdef _MSC_VER +# define snprintf _snprintf +#endif + +void CCL_init_logging(const char *argv0) +{ +#ifdef WITH_CYCLES_LOGGING + /* Make it so FATAL messages are always print into console. 
*/ + char severity_fatal[32]; + snprintf(severity_fatal, sizeof(severity_fatal), "%d", + google::GLOG_FATAL); + + google::InitGoogleLogging(argv0); + google::SetCommandLineOption("logtostderr", "1"); + google::SetCommandLineOption("v", "0"); + google::SetCommandLineOption("stderrthreshold", severity_fatal); + google::SetCommandLineOption("minloglevel", severity_fatal); +#else + (void) argv0; +#endif +} + +void CCL_start_debug_logging(void) +{ +#ifdef WITH_CYCLES_LOGGING + google::SetCommandLineOption("logtostderr", "1"); + google::SetCommandLineOption("v", "2"); + google::SetCommandLineOption("stderrthreshold", "1"); + google::SetCommandLineOption("minloglevel", "0"); +#endif +} + +void CCL_logging_verbosity_set(int verbosity) +{ +#ifdef WITH_CYCLES_LOGGING + char val[10]; + snprintf(val, sizeof(val), "%d", verbosity); + + google::SetCommandLineOption("v", val); +#else + (void) verbosity; +#endif +} diff --git a/intern/cycles/blender/blender_mesh.cpp b/intern/cycles/blender/blender_mesh.cpp index 83514879477..a5e4b7bd2ae 100644 --- a/intern/cycles/blender/blender_mesh.cpp +++ b/intern/cycles/blender/blender_mesh.cpp @@ -35,14 +35,14 @@ CCL_NAMESPACE_BEGIN /* Tangent Space */ struct MikkUserData { - MikkUserData(const BL::Mesh mesh_, const BL::MeshTextureFaceLayer layer_, int num_faces_) + MikkUserData(const BL::Mesh mesh_, BL::MeshTextureFaceLayer *layer_, int num_faces_) : mesh(mesh_), layer(layer_), num_faces(num_faces_) { tangent.resize(num_faces*4); } BL::Mesh mesh; - BL::MeshTextureFaceLayer layer; + BL::MeshTextureFaceLayer *layer; int num_faces; vector<float4> tangent; }; @@ -78,26 +78,34 @@ static void mikk_get_position(const SMikkTSpaceContext *context, float P[3], con static void mikk_get_texture_coordinate(const SMikkTSpaceContext *context, float uv[2], const int face_num, const int vert_num) { MikkUserData *userdata = (MikkUserData*)context->m_pUserData; - BL::MeshTextureFace tf = userdata->layer.data[face_num]; - float3 tfuv; - - switch (vert_num) { - 
case 0: - tfuv = get_float3(tf.uv1()); - break; - case 1: - tfuv = get_float3(tf.uv2()); - break; - case 2: - tfuv = get_float3(tf.uv3()); - break; - default: - tfuv = get_float3(tf.uv4()); - break; + if(userdata->layer != NULL) { + BL::MeshTextureFace tf = userdata->layer->data[face_num]; + float3 tfuv; + + switch (vert_num) { + case 0: + tfuv = get_float3(tf.uv1()); + break; + case 1: + tfuv = get_float3(tf.uv2()); + break; + case 2: + tfuv = get_float3(tf.uv3()); + break; + default: + tfuv = get_float3(tf.uv4()); + break; + } + + uv[0] = tfuv.x; + uv[1] = tfuv.y; + } + else { + int vert_idx = userdata->mesh.tessfaces[face_num].vertices()[vert_num]; + float3 orco = + get_float3(userdata->mesh.vertices[vert_idx].undeformed_co()); + map_to_sphere(&uv[0], &uv[1], orco[0], orco[1], orco[2]); } - - uv[0] = tfuv.x; - uv[1] = tfuv.y; } static void mikk_get_normal(const SMikkTSpaceContext *context, float N[3], const int face_num, const int vert_num) @@ -127,7 +135,7 @@ static void mikk_set_tangent_space(const SMikkTSpaceContext *context, const floa userdata->tangent[face*4 + vert] = make_float4(T[0], T[1], T[2], sign); } -static void mikk_compute_tangents(BL::Mesh b_mesh, BL::MeshTextureFaceLayer b_layer, Mesh *mesh, vector<int>& nverts, bool need_sign, bool active_render) +static void mikk_compute_tangents(BL::Mesh b_mesh, BL::MeshTextureFaceLayer *b_layer, Mesh *mesh, vector<int>& nverts, bool need_sign, bool active_render) { /* setup userdata */ MikkUserData userdata(b_mesh, b_layer, nverts.size()); @@ -153,7 +161,11 @@ static void mikk_compute_tangents(BL::Mesh b_mesh, BL::MeshTextureFaceLayer b_la /* create tangent attributes */ Attribute *attr; - ustring name = ustring((string(b_layer.name().c_str()) + ".tangent").c_str()); + ustring name; + if(b_layer != NULL) + name = ustring((string(b_layer->name().c_str()) + ".tangent").c_str()); + else + name = ustring("orco.tangent"); if(active_render) attr = mesh->attributes.add(ATTR_STD_UV_TANGENT, name); @@ -167,7 +179,11 
@@ static void mikk_compute_tangents(BL::Mesh b_mesh, BL::MeshTextureFaceLayer b_la if(need_sign) { Attribute *attr_sign; - ustring name_sign = ustring((string(b_layer.name().c_str()) + ".tangent_sign").c_str()); + ustring name_sign; + if(b_layer != NULL) + name_sign = ustring((string(b_layer->name().c_str()) + ".tangent_sign").c_str()); + else + name_sign = ustring("orco.tangent_sign"); if(active_render) attr_sign = mesh->attributes.add(ATTR_STD_UV_TANGENT_SIGN, name_sign); @@ -208,7 +224,7 @@ static void mikk_compute_tangents(BL::Mesh b_mesh, BL::MeshTextureFaceLayer b_la /* Create Volume Attribute */ -static void create_mesh_volume_attribute(BL::Object b_ob, Mesh *mesh, ImageManager *image_manager, AttributeStandard std) +static void create_mesh_volume_attribute(BL::Object b_ob, Mesh *mesh, ImageManager *image_manager, AttributeStandard std, float frame) { BL::SmokeDomainSettings b_domain = object_smoke_domain_find(b_ob); @@ -222,22 +238,22 @@ static void create_mesh_volume_attribute(BL::Object b_ob, Mesh *mesh, ImageManag volume_data->manager = image_manager; volume_data->slot = image_manager->add_image(Attribute::standard_name(std), - b_ob.ptr.data, animated, is_float, is_linear, INTERPOLATION_LINEAR, true); + b_ob.ptr.data, animated, frame, is_float, is_linear, INTERPOLATION_LINEAR, true); } -static void create_mesh_volume_attributes(Scene *scene, BL::Object b_ob, Mesh *mesh) +static void create_mesh_volume_attributes(Scene *scene, BL::Object b_ob, Mesh *mesh, float frame) { /* for smoke volume rendering */ if(mesh->need_attribute(scene, ATTR_STD_VOLUME_DENSITY)) - create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_DENSITY); + create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_DENSITY, frame); if(mesh->need_attribute(scene, ATTR_STD_VOLUME_COLOR)) - create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_COLOR); + create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, 
ATTR_STD_VOLUME_COLOR, frame); if(mesh->need_attribute(scene, ATTR_STD_VOLUME_FLAME)) - create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_FLAME); + create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_FLAME, frame); if(mesh->need_attribute(scene, ATTR_STD_VOLUME_HEAT)) - create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_HEAT); + create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_HEAT, frame); if(mesh->need_attribute(scene, ATTR_STD_VOLUME_VELOCITY)) - create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_VELOCITY); + create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_VELOCITY, frame); } /* Create Mesh */ @@ -347,31 +363,31 @@ static void create_mesh(Scene *scene, Mesh *mesh, BL::Mesh b_mesh, const vector< continue; Attribute *attr = mesh->attributes.add( - ustring(l->name().c_str()), TypeDesc::TypeColor, ATTR_ELEMENT_CORNER); + ustring(l->name().c_str()), TypeDesc::TypeColor, ATTR_ELEMENT_CORNER_BYTE); BL::MeshColorLayer::data_iterator c; - float3 *fdata = attr->data_float3(); + uchar4 *cdata = attr->data_uchar4(); size_t i = 0; for(l->data.begin(c); c != l->data.end(); ++c, ++i) { - fdata[0] = color_srgb_to_scene_linear(get_float3(c->color1())); - fdata[1] = color_srgb_to_scene_linear(get_float3(c->color2())); - fdata[2] = color_srgb_to_scene_linear(get_float3(c->color3())); + cdata[0] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color1()))); + cdata[1] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color2()))); + cdata[2] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color3()))); if(nverts[i] == 4) { - fdata[3] = fdata[0]; - fdata[4] = fdata[2]; - fdata[5] = color_srgb_to_scene_linear(get_float3(c->color4())); - fdata += 6; + cdata[3] = cdata[0]; + cdata[4] = cdata[2]; + cdata[5] = 
color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color4()))); + cdata += 6; } else - fdata += 3; + cdata += 3; } } } /* create uv map attributes */ - { + if (b_mesh.tessface_uv_textures.length() != 0) { BL::Mesh::tessface_uv_textures_iterator l; for(b_mesh.tessface_uv_textures.begin(l); l != b_mesh.tessface_uv_textures.end(); ++l) { @@ -416,10 +432,14 @@ static void create_mesh(Scene *scene, Mesh *mesh, BL::Mesh b_mesh, const vector< name = ustring((string(l->name().c_str()) + ".tangent_sign").c_str()); bool need_sign = (mesh->need_attribute(scene, name) || mesh->need_attribute(scene, std)); - mikk_compute_tangents(b_mesh, *l, mesh, nverts, need_sign, active_render); + mikk_compute_tangents(b_mesh, &(*l), mesh, nverts, need_sign, active_render); } } } + else if(mesh->need_attribute(scene, ATTR_STD_UV_TANGENT)) { + bool need_sign = mesh->need_attribute(scene, ATTR_STD_UV_TANGENT_SIGN); + mikk_compute_tangents(b_mesh, NULL, mesh, nverts, need_sign, true); + } /* for volume objects, create a matrix to transform from object space to * mesh texture space. 
this does not work with deformations but that can @@ -505,15 +525,16 @@ Mesh *BlenderSync::sync_mesh(BL::Object b_ob, bool object_updated, bool hide_tri } /* test if we need to sync */ + bool use_mesh_geometry = render_layer.use_surfaces || render_layer.use_hair; Mesh *mesh; if(!mesh_map.sync(&mesh, key)) { - /* if transform was applied to mesh, need full update */ if(object_updated && mesh->transform_applied); /* test if shaders changed, these can be object level so mesh * does not get tagged for recalc */ else if(mesh->used_shaders != used_shaders); + else if(use_mesh_geometry != mesh->geometry_synced); else { /* even if not tagged for recalc, we may need to sync anyway * because the shader needs different mesh attributes */ @@ -540,15 +561,21 @@ Mesh *BlenderSync::sync_mesh(BL::Object b_ob, bool object_updated, bool hide_tri vector<Mesh::Triangle> oldtriangle = mesh->triangles; /* compares curve_keys rather than strands in order to handle quick hair - * adjustsments in dynamic BVH - other methods could probably do this better*/ + * adjustments in dynamic BVH - other methods could probably do this better*/ vector<float4> oldcurve_keys = mesh->curve_keys; mesh->clear(); mesh->used_shaders = used_shaders; mesh->name = ustring(b_ob_data.name().c_str()); - if(render_layer.use_surfaces || render_layer.use_hair) { - if(preview) + if(use_mesh_geometry) { + /* mesh objects does have special handle in the dependency graph, + * they're ensured to have properly updated. + * + * updating meshes here will end up having derived mesh referencing + * freed data from the blender side. 
+ */ + if(preview && b_ob.type() != BL::Object::type_MESH) b_ob.update_from_editmode(); bool need_undeformed = mesh->need_attribute(scene, ATTR_STD_GENERATED); @@ -561,7 +588,7 @@ Mesh *BlenderSync::sync_mesh(BL::Object b_ob, bool object_updated, bool hide_tri else create_mesh(scene, mesh, b_mesh, used_shaders); - create_mesh_volume_attributes(scene, b_ob, mesh); + create_mesh_volume_attributes(scene, b_ob, mesh, b_scene.frame_current()); } if(render_layer.use_hair) @@ -570,6 +597,7 @@ Mesh *BlenderSync::sync_mesh(BL::Object b_ob, bool object_updated, bool hide_tri /* free derived mesh */ b_data.meshes.remove(b_mesh); } + mesh->geometry_synced = true; } /* displacement method */ @@ -616,6 +644,11 @@ void BlenderSync::sync_mesh_motion(BL::Object b_ob, Object *object, float motion mesh_motion_synced.insert(mesh); + /* ensure we only motion sync meshes that also had mesh synced, to avoid + * unnecessary work and to ensure that its attributes were clear */ + if(mesh_synced.find(mesh) == mesh_synced.end()) + return; + /* for motion pass always compute, for motion blur it can be disabled */ int time_index = 0; diff --git a/intern/cycles/blender/blender_object.cpp b/intern/cycles/blender/blender_object.cpp index 167647608a5..1e07c5f9c96 100644 --- a/intern/cycles/blender/blender_object.cpp +++ b/intern/cycles/blender/blender_object.cpp @@ -82,6 +82,7 @@ static uint object_ray_visibility(BL::Object b_ob) flag |= get_boolean(cvisibility, "glossy")? PATH_RAY_GLOSSY: 0; flag |= get_boolean(cvisibility, "transmission")? PATH_RAY_TRANSMIT: 0; flag |= get_boolean(cvisibility, "shadow")? PATH_RAY_SHADOW: 0; + flag |= get_boolean(cvisibility, "scatter")? 
PATH_RAY_VOLUME_SCATTER: 0; return flag; } @@ -172,6 +173,7 @@ void BlenderSync::sync_light(BL::Object b_parent, int persistent_id[OBJECT_PERSI light->use_diffuse = (visibility & PATH_RAY_DIFFUSE) != 0; light->use_glossy = (visibility & PATH_RAY_GLOSSY) != 0; light->use_transmission = (visibility & PATH_RAY_TRANSMIT) != 0; + light->use_scatter = (visibility & PATH_RAY_VOLUME_SCATTER) != 0; /* tag */ light->tag_update(scene); @@ -289,7 +291,6 @@ Object *BlenderSync::sync_object(BL::Object b_parent, int persistent_id[OBJECT_P uint visibility = object_ray_visibility(b_ob) & PATH_RAY_ALL_VISIBILITY; if(b_parent.ptr.data != b_ob.ptr.data) { visibility &= object_ray_visibility(b_parent); - object->random_id ^= hash_int(hash_string(b_parent.name().c_str())); } /* make holdout objects on excluded layer invisible for non-camera rays */ @@ -446,7 +447,6 @@ void BlenderSync::sync_objects(BL::SpaceView3D b_v3d, float motion_time) light_map.pre_sync(); mesh_map.pre_sync(); object_map.pre_sync(); - mesh_synced.clear(); particle_system_map.pre_sync(); motion_times.clear(); } @@ -458,10 +458,10 @@ void BlenderSync::sync_objects(BL::SpaceView3D b_v3d, float motion_time) BL::Scene::object_bases_iterator b_base; BL::Scene b_sce = b_scene; /* modifier result type (not exposed as enum in C++ API) - * 1 : eModifierMode_Realtime - * 2 : eModifierMode_Render - */ - int dupli_settings = preview ? 1 : 2; + * 1 : DAG_EVAL_PREVIEW + * 2 : DAG_EVAL_RENDER + */ + int dupli_settings = preview ? 
1 : 2; bool cancel = false; @@ -536,7 +536,6 @@ void BlenderSync::sync_objects(BL::SpaceView3D b_v3d, float motion_time) scene->object_manager->tag_update(scene); if(particle_system_map.post_sync()) scene->particle_system_manager->tag_update(scene); - mesh_synced.clear(); } if(motion) @@ -578,7 +577,7 @@ void BlenderSync::sync_motion(BL::SpaceView3D b_v3d, BL::Object b_override, void /* change frame */ python_thread_state_restore(python_thread_state); - b_scene.frame_set(frame, subframe); + b_engine.frame_set(frame, subframe); python_thread_state_save(python_thread_state); /* sync camera, only supports two times at the moment */ @@ -593,7 +592,7 @@ void BlenderSync::sync_motion(BL::SpaceView3D b_v3d, BL::Object b_override, void * function assumes it is being executed from python and will * try to save the thread state */ python_thread_state_restore(python_thread_state); - b_scene.frame_set(frame_center, 0.0f); + b_engine.frame_set(frame_center, 0.0f); python_thread_state_save(python_thread_state); /* tag camera for motion update */ diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp index 872f891cc2a..8e5a6c13f44 100644 --- a/intern/cycles/blender/blender_python.cpp +++ b/intern/cycles/blender/blender_python.cpp @@ -35,6 +35,13 @@ CCL_NAMESPACE_BEGIN +static void *pylong_as_voidptr_typesafe(PyObject *object) +{ + if(object == Py_None) + return NULL; + return PyLong_AsVoidPtr(object); +} + void python_thread_state_save(void **python_thread_state) { *python_thread_state = (void*)PyEval_SaveThread(); @@ -46,14 +53,36 @@ void python_thread_state_restore(void **python_thread_state) *python_thread_state = NULL; } +static const char *PyC_UnicodeAsByte(PyObject *py_str, PyObject **coerce) +{ +#ifdef WIN32 + /* bug [#31856] oddly enough, Python3.2 --> 3.3 on Windows will throw an + * exception here this needs to be fixed in python: + * see: bugs.python.org/issue15859 */ + if(!PyUnicode_Check(py_str)) { + PyErr_BadArgument(); + 
return ""; + } +#endif + if((*coerce = PyUnicode_EncodeFSDefault(py_str))) { + return PyBytes_AS_STRING(*coerce); + } + return ""; +} + static PyObject *init_func(PyObject *self, PyObject *args) { - const char *path, *user_path; + PyObject *path, *user_path; - if(!PyArg_ParseTuple(args, "ss", &path, &user_path)) + if(!PyArg_ParseTuple(args, "OO", &path, &user_path)) { return NULL; - - path_init(path, user_path); + } + + PyObject *path_coerce = NULL, *user_path_coerce = NULL; + path_init(PyC_UnicodeAsByte(path, &path_coerce), + PyC_UnicodeAsByte(user_path, &user_path_coerce)); + Py_XDECREF(path_coerce); + Py_XDECREF(user_path_coerce); Py_RETURN_NONE; } @@ -84,15 +113,15 @@ static PyObject *create_func(PyObject *self, PyObject *args) BL::Scene scene(sceneptr); PointerRNA regionptr; - RNA_id_pointer_create((ID*)PyLong_AsVoidPtr(pyregion), &regionptr); + RNA_id_pointer_create((ID*)pylong_as_voidptr_typesafe(pyregion), &regionptr); BL::Region region(regionptr); PointerRNA v3dptr; - RNA_id_pointer_create((ID*)PyLong_AsVoidPtr(pyv3d), &v3dptr); + RNA_id_pointer_create((ID*)pylong_as_voidptr_typesafe(pyv3d), &v3dptr); BL::SpaceView3D v3d(v3dptr); PointerRNA rv3dptr; - RNA_id_pointer_create((ID*)PyLong_AsVoidPtr(pyrv3d), &rv3dptr); + RNA_id_pointer_create((ID*)pylong_as_voidptr_typesafe(pyrv3d), &rv3dptr); BL::RegionView3D rv3d(rv3dptr); /* create session */ @@ -158,8 +187,6 @@ static PyObject *bake_func(PyObject *self, PyObject *args) if(!PyArg_ParseTuple(args, "OOsOiiO", &pysession, &pyobject, &pass_type, &pypixel_array, &num_pixels, &depth, &pyresult)) return NULL; - Py_BEGIN_ALLOW_THREADS - BlenderSession *session = (BlenderSession*)PyLong_AsVoidPtr(pysession); PointerRNA objectptr; @@ -172,9 +199,11 @@ static PyObject *bake_func(PyObject *self, PyObject *args) RNA_id_pointer_create((ID*)PyLong_AsVoidPtr(pypixel_array), &bakepixelptr); BL::BakePixel b_bake_pixel(bakepixelptr); - session->bake(b_object, pass_type, b_bake_pixel, num_pixels, depth, (float *)b_result); + 
python_thread_state_save(&session->python_thread_state); + + session->bake(b_object, pass_type, b_bake_pixel, (size_t)num_pixels, depth, (float *)b_result); - Py_END_ALLOW_THREADS + python_thread_state_restore(&session->python_thread_state); Py_RETURN_NONE; } @@ -356,7 +385,12 @@ static PyObject *osl_update_node_func(PyObject *self, PyObject *args) /* find socket socket */ BL::NodeSocket b_sock(PointerRNA_NULL); if (param->isoutput) { +#if OSL_LIBRARY_VERSION_CODE < 10500 b_sock = b_node.outputs[param->name]; +#else + b_sock = b_node.outputs[param->name.string()]; +#endif + /* remove if type no longer matches */ if(b_sock && b_sock.bl_idname() != socket_type) { @@ -365,7 +399,11 @@ static PyObject *osl_update_node_func(PyObject *self, PyObject *args) } } else { +#if OSL_LIBRARY_VERSION_CODE < 10500 b_sock = b_node.inputs[param->name]; +#else + b_sock = b_node.inputs[param->name.string()]; +#endif /* remove if type no longer matches */ if(b_sock && b_sock.bl_idname() != socket_type) { diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp index 01a5acd8982..57ffea4b1a9 100644 --- a/intern/cycles/blender/blender_session.cpp +++ b/intern/cycles/blender/blender_session.cpp @@ -88,6 +88,7 @@ void BlenderSession::create_session() { SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background); SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background); + bool session_pause = BlenderSync::get_session_pause(b_scene, background); /* reset status/progress */ last_status = ""; @@ -107,15 +108,17 @@ void BlenderSession::create_session() session->scene = scene; session->progress.set_update_callback(function_bind(&BlenderSession::tag_redraw, this)); session->progress.set_cancel_callback(function_bind(&BlenderSession::test_cancel, this)); - session->set_pause(BlenderSync::get_session_pause(b_scene, background)); + session->set_pause(session_pause); /* create sync */ sync = 
new BlenderSync(b_engine, b_data, b_scene, scene, !background, session->progress, session_params.device.type == DEVICE_CPU); if(b_v3d) { - /* full data sync */ - sync->sync_data(b_v3d, b_engine.camera_override(), &python_thread_state); - sync->sync_view(b_v3d, b_rv3d, width, height); + if(session_pause == false) { + /* full data sync */ + sync->sync_view(b_v3d, b_rv3d, width, height); + sync->sync_data(b_v3d, b_engine.camera_override(), &python_thread_state); + } } else { /* for final render we will do full data sync per render layer, only @@ -258,6 +261,14 @@ static PassType get_pass_type(BL::RenderPass b_pass) case BL::RenderPass::type_SPECULAR: case BL::RenderPass::type_REFLECTION: return PASS_NONE; +#ifdef WITH_CYCLES_DEBUG + case BL::RenderPass::type_DEBUG: + { + if(b_pass.debug_type() == BL::RenderPass::debug_type_BVH_TRAVERSAL_STEPS) + return PASS_BVH_TRAVERSAL_STEPS; + break; + } +#endif } return PASS_NONE; @@ -420,6 +431,9 @@ void BlenderSession::render() /* add passes */ vector<Pass> passes; Pass::add(PASS_COMBINED, passes); +#ifdef WITH_CYCLES_DEBUG + Pass::add(PASS_BVH_TRAVERSAL_STEPS, passes); +#endif if(session_params.device.advanced_shading) { @@ -492,38 +506,24 @@ static void populate_bake_data(BakeData *data, BL::BakePixel pixel_array, const } } -static bool is_light_pass(ShaderEvalType type) -{ - switch (type) { - case SHADER_EVAL_AO: - case SHADER_EVAL_COMBINED: - case SHADER_EVAL_SHADOW: - case SHADER_EVAL_DIFFUSE_DIRECT: - case SHADER_EVAL_GLOSSY_DIRECT: - case SHADER_EVAL_TRANSMISSION_DIRECT: - case SHADER_EVAL_SUBSURFACE_DIRECT: - case SHADER_EVAL_DIFFUSE_INDIRECT: - case SHADER_EVAL_GLOSSY_INDIRECT: - case SHADER_EVAL_TRANSMISSION_INDIRECT: - case SHADER_EVAL_SUBSURFACE_INDIRECT: - return true; - default: - return false; - } -} - -void BlenderSession::bake(BL::Object b_object, const string& pass_type, BL::BakePixel pixel_array, int num_pixels, int depth, float result[]) +void BlenderSession::bake(BL::Object b_object, const string& pass_type, 
BL::BakePixel pixel_array, const size_t num_pixels, const int depth, float result[]) { ShaderEvalType shader_type = get_shader_type(pass_type); size_t object_index = OBJECT_NONE; int tri_offset = 0; + /* ensure kernels are loaded before we do any scene updates */ + session->load_kernels(); + + if(session->progress.get_cancel()) + return; + if(shader_type == SHADER_EVAL_UV) { /* force UV to be available */ Pass::add(PASS_UV, scene->film->passes); } - if(is_light_pass(shader_type)) { + if(BakeManager::is_light_pass(shader_type)) { /* force use_light_pass to be true */ Pass::add(PASS_LIGHT, scene->film->passes); } @@ -540,6 +540,7 @@ void BlenderSession::bake(BL::Object b_object, const string& pass_type, BL::Bake SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background); BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_scene, b_v3d, b_rv3d, scene->camera, width, height); + scene->bake_manager->set_shader_limit((size_t)b_engine.tile_x(), (size_t)b_engine.tile_y()); scene->bake_manager->set_baking(true); /* set number of samples */ @@ -568,6 +569,8 @@ void BlenderSession::bake(BL::Object b_object, const string& pass_type, BL::Bake session->reset(buffer_params, session_params.samples); session->update_scene(); + session->progress.set_update_callback(function_bind(&BlenderSession::update_bake_progress, this)); + scene->bake_manager->bake(scene->device, &scene->dscene, scene, session->progress, shader_type, bake_data, result); /* free all memory used (host and device), so we wouldn't leave render @@ -639,6 +642,7 @@ void BlenderSession::synchronize() /* on session/scene parameter changes, we recreate session entirely */ SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background); SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background); + bool session_pause = BlenderSync::get_session_pause(b_scene, background); 
if(session->params.modified(session_params) || scene->params.modified(scene_params)) @@ -651,12 +655,18 @@ void BlenderSession::synchronize() /* increase samples, but never decrease */ session->set_samples(session_params.samples); - session->set_pause(BlenderSync::get_session_pause(b_scene, background)); + session->set_pause(session_pause); /* copy recalc flags, outside of mutex so we can decide to do the real * synchronization at a later time to not block on running updates */ sync->sync_recalc(); + /* don't do synchronization if on pause */ + if(session_pause) { + tag_update(); + return; + } + /* try to acquire mutex. if we don't want to or can't, come back later */ if(!session->ready_to_reset() || !session->scene->mutex.try_lock()) { tag_update(); @@ -732,10 +742,12 @@ bool BlenderSession::draw(int w, int h) if(reset) { SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background); BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_scene, b_v3d, b_rv3d, scene->camera, width, height); + bool session_pause = BlenderSync::get_session_pause(b_scene, background); - session->reset(buffer_params, session_params.samples); - - start_resize_time = 0.0; + if(session_pause == false) { + session->reset(buffer_params, session_params.samples); + start_resize_time = 0.0; + } } } else { @@ -779,6 +791,26 @@ void BlenderSession::get_progress(float& progress, double& total_time) progress = 0.0; } +void BlenderSession::update_bake_progress() +{ + float progress; + int sample, samples_per_task, parts_total; + + sample = session->progress.get_sample(); + samples_per_task = scene->bake_manager->num_samples; + parts_total = scene->bake_manager->num_parts; + + if(samples_per_task) + progress = ((float)sample / (float)(parts_total * samples_per_task)); + else + progress = 0.0; + + if(progress != last_progress) { + b_engine.update_progress(progress); + last_progress = progress; + } +} + void 
BlenderSession::update_status_progress() { string timestatus, status, substatus; @@ -798,7 +830,7 @@ void BlenderSession::update_status_progress() if(background) { if(progress>0) - remaining_time = (1-progress) * (total_time / progress); + remaining_time = (1.0 - (double)progress) * (total_time / (double)progress); scene += " | " + b_scene.name(); if(b_rlay_name != "") @@ -817,7 +849,7 @@ void BlenderSession::update_status_progress() timestatus += "Remaining:" + string(time_str) + " | "; } - timestatus += string_printf("Mem:%.2fM, Peak:%.2fM", mem_used, mem_peak); + timestatus += string_printf("Mem:%.2fM, Peak:%.2fM", (double)mem_used, (double)mem_peak); if(status.size() > 0) status = " | " + status; diff --git a/intern/cycles/blender/blender_session.h b/intern/cycles/blender/blender_session.h index 0e44493d674..ac685118b3d 100644 --- a/intern/cycles/blender/blender_session.h +++ b/intern/cycles/blender/blender_session.h @@ -52,7 +52,7 @@ public: /* offline render */ void render(); - void bake(BL::Object b_object, const string& pass_type, BL::BakePixel pixel_array, int num_pixels, int depth, float pixels[]); + void bake(BL::Object b_object, const string& pass_type, BL::BakePixel pixel_array, const size_t num_pixels, const int depth, float pixels[]); void write_render_result(BL::RenderResult b_rr, BL::RenderLayer b_rlay, RenderTile& rtile); void write_render_tile(RenderTile& rtile); @@ -73,6 +73,7 @@ public: void get_progress(float& progress, double& total_time); void test_cancel(); void update_status_progress(); + void update_bake_progress(); bool background; Session *session; diff --git a/intern/cycles/blender/blender_shader.cpp b/intern/cycles/blender/blender_shader.cpp index ddbb40da7db..27c2e9e9ae8 100644 --- a/intern/cycles/blender/blender_shader.cpp +++ b/intern/cycles/blender/blender_shader.cpp @@ -53,13 +53,13 @@ void BlenderSync::find_shader(BL::ID id, vector<uint>& used_shaders, int default static BL::NodeSocket get_node_output(BL::Node b_node, const 
string& name) { BL::Node::outputs_iterator b_out; - + for(b_node.outputs.begin(b_out); b_out != b_node.outputs.end(); ++b_out) if(b_out->name() == name) return *b_out; - + assert(0); - + return *b_out; } @@ -229,7 +229,11 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen BL::ShaderNodeMixRGB b_mix_node(b_node); MixNode *mix = new MixNode(); mix->type = MixNode::type_enum[b_mix_node.blend_type()]; - mix->use_clamp = b_mix_node.use_clamp(); + /* Tag if it's Mix */ + if(b_mix_node.blend_type() == 0) + mix->special_type = SHADER_SPECIAL_TYPE_MIX_RGB; + + mix->use_clamp = b_mix_node.use_clamp(); node = mix; } else if (b_node.is_a(&RNA_ShaderNodeSeparateRGB)) { @@ -244,6 +248,12 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen else if (b_node.is_a(&RNA_ShaderNodeCombineHSV)) { node = new CombineHSVNode(); } + else if (b_node.is_a(&RNA_ShaderNodeSeparateXYZ)) { + node = new SeparateXYZNode(); + } + else if (b_node.is_a(&RNA_ShaderNodeCombineXYZ)) { + node = new CombineXYZNode(); + } else if (b_node.is_a(&RNA_ShaderNodeHueSaturation)) { node = new HSVNode(); } @@ -254,7 +264,7 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen BL::ShaderNodeMath b_math_node(b_node); MathNode *math = new MathNode(); math->type = MathNode::type_enum[b_math_node.operation()]; - math->use_clamp = b_math_node.use_clamp(); + math->use_clamp = b_math_node.use_clamp(); node = math; } else if (b_node.is_a(&RNA_ShaderNodeVectorMath)) { @@ -274,7 +284,7 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen else if (b_node.is_a(&RNA_ShaderNodeNormal)) { BL::Node::outputs_iterator out_it; b_node.outputs.begin(out_it); - + NormalNode *norm = new NormalNode(); norm->direction = get_node_output_vector(b_node, "Normal"); node = norm; @@ -282,9 +292,9 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen else if 
(b_node.is_a(&RNA_ShaderNodeMapping)) { BL::ShaderNodeMapping b_mapping_node(b_node); MappingNode *mapping = new MappingNode(); - + get_tex_mapping(&mapping->tex_mapping, b_mapping_node); - + node = mapping; } else if (b_node.is_a(&RNA_ShaderNodeFresnel)) { @@ -312,7 +322,23 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen node = new HoldoutNode(); } else if (b_node.is_a(&RNA_ShaderNodeBsdfAnisotropic)) { - node = new WardBsdfNode(); + BL::ShaderNodeBsdfAnisotropic b_aniso_node(b_node); + AnisotropicBsdfNode *aniso = new AnisotropicBsdfNode(); + + switch (b_aniso_node.distribution()) + { + case BL::ShaderNodeBsdfAnisotropic::distribution_BECKMANN: + aniso->distribution = ustring("Beckmann"); + break; + case BL::ShaderNodeBsdfAnisotropic::distribution_GGX: + aniso->distribution = ustring("GGX"); + break; + case BL::ShaderNodeBsdfAnisotropic::distribution_ASHIKHMIN_SHIRLEY: + aniso->distribution = ustring("Ashikhmin-Shirley"); + break; + } + + node = aniso; } else if (b_node.is_a(&RNA_ShaderNodeBsdfDiffuse)) { node = new DiffuseBsdfNode(); @@ -347,6 +373,9 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen case BL::ShaderNodeBsdfGlossy::distribution_GGX: glossy->distribution = ustring("GGX"); break; + case BL::ShaderNodeBsdfGlossy::distribution_ASHIKHMIN_SHIRLEY: + glossy->distribution = ustring("Ashikhmin-Shirley"); + break; } node = glossy; } @@ -471,7 +500,7 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen /* create script node */ BL::ShaderNodeScript b_script_node(b_node); OSLScriptNode *script_node = new OSLScriptNode(); - + /* Generate inputs/outputs from node sockets * * Note: the node sockets are generated from OSL parameters, @@ -480,38 +509,38 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen * Note 2: ShaderInput/ShaderOutput store shallow string copies only! * Socket names must be stored in the extra lists instead. 
*/ BL::Node::inputs_iterator b_input; - + for (b_script_node.inputs.begin(b_input); b_input != b_script_node.inputs.end(); ++b_input) { script_node->input_names.push_back(ustring(b_input->name())); ShaderInput *input = script_node->add_input(script_node->input_names.back().c_str(), convert_socket_type(*b_input)); set_default_value(input, b_node, *b_input, b_data, b_ntree); } - + BL::Node::outputs_iterator b_output; - + for (b_script_node.outputs.begin(b_output); b_output != b_script_node.outputs.end(); ++b_output) { script_node->output_names.push_back(ustring(b_output->name())); script_node->add_output(script_node->output_names.back().c_str(), convert_socket_type(*b_output)); } - + /* load bytecode or filepath */ OSLShaderManager *manager = (OSLShaderManager*)scene->shader_manager; string bytecode_hash = b_script_node.bytecode_hash(); - + if(!bytecode_hash.empty()) { /* loaded bytecode if not already done */ if(!manager->shader_test_loaded(bytecode_hash)) manager->shader_load_bytecode(bytecode_hash, b_script_node.bytecode()); - + script_node->bytecode_hash = bytecode_hash; } else { /* set filepath */ script_node->filepath = blender_absolute_path(b_data, b_ntree, b_script_node.filepath()); } - + node = script_node; } #endif @@ -547,6 +576,13 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen image->animated = b_image_node.image_user().use_auto_refresh(); image->use_alpha = b_image.use_alpha(); + + /* TODO(sergey): Does not work properly when we change builtin type. 
*/ + if (b_image.is_updated()) { + scene->image_manager->tag_reload_image(image->filename, + image->builtin_data, + (InterpolationType)b_image_node.interpolation()); + } } image->color_space = ImageTextureNode::color_space_enum[(int)b_image_node.color_space()]; image->projection = ImageTextureNode::projection_enum[(int)b_image_node.projection()]; @@ -577,6 +613,13 @@ static ShaderNode *add_node(Scene *scene, BL::BlendData b_data, BL::Scene b_scen } env->use_alpha = b_image.use_alpha(); + + /* TODO(sergey): Does not work properly when we change builtin type. */ + if (b_image.is_updated()) { + scene->image_manager->tag_reload_image(env->filename, + env->builtin_data, + INTERPOLATION_LINEAR); + } } env->color_space = EnvironmentTextureNode::color_space_enum[(int)b_env_node.color_space()]; env->projection = EnvironmentTextureNode::projection_enum[(int)b_env_node.projection()]; @@ -689,7 +732,7 @@ static bool node_use_modified_socket_name(ShaderNode *node) { if (node->special_type == SHADER_SPECIAL_TYPE_SCRIPT) return false; - + return true; } @@ -701,57 +744,57 @@ static ShaderInput *node_find_input_by_name(ShaderNode *node, BL::Node b_node, B BL::Node::inputs_iterator b_input; bool found = false; int counter = 0, total = 0; - + for (b_node.inputs.begin(b_input); b_input != b_node.inputs.end(); ++b_input) { if (b_input->name() == name) { if (!found) counter++; total++; } - + if(b_input->ptr.data == b_socket.ptr.data) found = true; } - + /* rename if needed */ if (name == "Shader") name = "Closure"; - + if (total > 1) name = string_printf("%s%d", name.c_str(), counter); } - + return node->input(name.c_str()); } static ShaderOutput *node_find_output_by_name(ShaderNode *node, BL::Node b_node, BL::NodeSocket b_socket) { string name = b_socket.name(); - + if (node_use_modified_socket_name(node)) { BL::Node::outputs_iterator b_output; bool found = false; int counter = 0, total = 0; - + for (b_node.outputs.begin(b_output); b_output != b_node.outputs.end(); ++b_output) { if 
(b_output->name() == name) { if (!found) counter++; total++; } - + if(b_output->ptr.data == b_socket.ptr.data) found = true; } - + /* rename if needed */ if (name == "Shader") name = "Closure"; - + if (total > 1) name = string_printf("%s%d", name.c_str(), counter); } - + return node->output(name.c_str()); } @@ -762,7 +805,7 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha BL::ShaderNodeTree::nodes_iterator b_node; PtrInputMap input_map; PtrOutputMap output_map; - + BL::Node::inputs_iterator b_input; BL::Node::outputs_iterator b_output; @@ -792,10 +835,10 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha BL::Node::internal_links_iterator b_link; for (b_node->internal_links.begin(b_link); b_link != b_node->internal_links.end(); ++b_link) { ProxyNode *proxy = new ProxyNode(convert_socket_type(b_link->to_socket())); - + input_map[b_link->from_socket().ptr.data] = proxy->inputs[0]; output_map[b_link->to_socket().ptr.data] = proxy->outputs[0]; - + graph->add(proxy); } } @@ -807,7 +850,7 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha else b_group_ntree = BL::ShaderNodeTree(((BL::NodeCustomGroup)(*b_node)).node_tree()); ProxyMap group_proxy_input_map, group_proxy_output_map; - + /* Add a proxy node for each socket * Do this even if the node group has no internal tree, * so that links have something to connect to and assert won't fail. 
@@ -815,21 +858,21 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha for(b_node->inputs.begin(b_input); b_input != b_node->inputs.end(); ++b_input) { ProxyNode *proxy = new ProxyNode(convert_socket_type(*b_input)); graph->add(proxy); - + /* register the proxy node for internal binding */ group_proxy_input_map[b_input->identifier()] = proxy; - + input_map[b_input->ptr.data] = proxy->inputs[0]; - + set_default_value(proxy->inputs[0], *b_node, *b_input, b_data, b_ntree); } for(b_node->outputs.begin(b_output); b_output != b_node->outputs.end(); ++b_output) { ProxyNode *proxy = new ProxyNode(convert_socket_type(*b_output)); graph->add(proxy); - + /* register the proxy node for internal binding */ group_proxy_output_map[b_output->identifier()] = proxy; - + output_map[b_output->ptr.data] = proxy->outputs[0]; } @@ -842,7 +885,7 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha ProxyMap::const_iterator proxy_it = proxy_input_map.find(b_output->identifier()); if (proxy_it != proxy_input_map.end()) { ProxyNode *proxy = proxy_it->second; - + output_map[b_output->ptr.data] = proxy->outputs[0]; } } @@ -856,9 +899,9 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha ProxyMap::const_iterator proxy_it = proxy_output_map.find(b_input->identifier()); if (proxy_it != proxy_output_map.end()) { ProxyNode *proxy = proxy_it->second; - + input_map[b_input->ptr.data] = proxy->inputs[0]; - + set_default_value(proxy->inputs[0], *b_node, *b_input, b_data, b_ntree); } } @@ -875,17 +918,25 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha else { node = add_node(scene, b_data, b_scene, graph, b_ntree, BL::ShaderNode(*b_node)); } - + if(node) { /* map node sockets for linking */ for(b_node->inputs.begin(b_input); b_input != b_node->inputs.end(); ++b_input) { ShaderInput *input = node_find_input_by_name(node, *b_node, *b_input); + if (!input) { + /* XXX should not 
happen, report error? */ + continue; + } input_map[b_input->ptr.data] = input; - + set_default_value(input, *b_node, *b_input, b_data, b_ntree); } for(b_node->outputs.begin(b_output); b_output != b_node->outputs.end(); ++b_output) { ShaderOutput *output = node_find_output_by_name(node, *b_node, *b_output); + if (!output) { + /* XXX should not happen, report error? */ + continue; + } output_map[b_output->ptr.data] = output; } } @@ -902,7 +953,7 @@ static void add_nodes(Scene *scene, BL::BlendData b_data, BL::Scene b_scene, Sha ShaderOutput *output = 0; ShaderInput *input = 0; - + PtrOutputMap::iterator output_it = output_map.find(b_from_sock.ptr.data); if (output_it != output_map.end()) output = output_it->second; @@ -934,7 +985,7 @@ void BlenderSync::sync_materials(bool update_all) for(b_data.materials.begin(b_mat); b_mat != b_data.materials.end(); ++b_mat) { Shader *shader; - + /* test if we need to sync */ if(shader_map.sync(&shader, *b_mat) || update_all) { ShaderGraph *graph = new ShaderGraph(); @@ -963,6 +1014,8 @@ void BlenderSync::sync_materials(bool update_all) shader->use_mis = get_boolean(cmat, "sample_as_light"); shader->use_transparent_shadow = get_boolean(cmat, "use_transparent_shadow"); shader->heterogeneous_volume = !get_boolean(cmat, "homogeneous_volume"); + shader->volume_sampling_method = (VolumeSampling)RNA_enum_get(&cmat, "volume_sampling"); + shader->volume_interpolation_method = (VolumeInterpolation)RNA_enum_get(&cmat, "volume_interpolation"); shader->set_graph(graph); shader->tag_update(scene); @@ -988,10 +1041,12 @@ void BlenderSync::sync_world(bool update_all) BL::ShaderNodeTree b_ntree(b_world.node_tree()); add_nodes(scene, b_data, b_scene, graph, b_ntree); - + /* volume */ PointerRNA cworld = RNA_pointer_get(&b_world.ptr, "cycles"); shader->heterogeneous_volume = !get_boolean(cworld, "homogeneous_volume"); + shader->volume_sampling_method = (VolumeSampling)RNA_enum_get(&cworld, "volume_sampling"); + shader->volume_interpolation_method = 
(VolumeInterpolation)RNA_enum_get(&cworld, "volume_interpolation"); } else if(b_world) { ShaderNode *closure, *out; @@ -1022,6 +1077,7 @@ void BlenderSync::sync_world(bool update_all) visibility |= get_boolean(cvisibility, "diffuse")? PATH_RAY_DIFFUSE: 0; visibility |= get_boolean(cvisibility, "glossy")? PATH_RAY_GLOSSY: 0; visibility |= get_boolean(cvisibility, "transmission")? PATH_RAY_TRANSMIT: 0; + visibility |= get_boolean(cvisibility, "scatter")? PATH_RAY_VOLUME_SCATTER: 0; background->visibility = visibility; } @@ -1059,7 +1115,7 @@ void BlenderSync::sync_lamps(bool update_all) for(b_data.lamps.begin(b_lamp); b_lamp != b_data.lamps.end(); ++b_lamp) { Shader *shader; - + /* test if we need to sync */ if(shader_map.sync(&shader, *b_lamp) || update_all) { ShaderGraph *graph = new ShaderGraph(); diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp index 1f5e32a1123..2ac90b34fd7 100644 --- a/intern/cycles/blender/blender_sync.cpp +++ b/intern/cycles/blender/blender_sync.cpp @@ -142,8 +142,13 @@ void BlenderSync::sync_data(BL::SpaceView3D b_v3d, BL::Object b_override, void * sync_film(); sync_shaders(); sync_curve_settings(); + + mesh_synced.clear(); /* use for objects and motion sync */ + sync_objects(b_v3d); sync_motion(b_v3d, b_override, python_thread_state); + + mesh_synced.clear(); } /* Integrator */ @@ -172,14 +177,15 @@ void BlenderSync::sync_integrator() integrator->transparent_min_bounce = get_int(cscene, "transparent_min_bounces"); integrator->transparent_shadows = get_boolean(cscene, "use_transparent_shadows"); - integrator->volume_homogeneous_sampling = RNA_enum_get(&cscene, "volume_homogeneous_sampling"); integrator->volume_max_steps = get_int(cscene, "volume_max_steps"); integrator->volume_step_size = get_float(cscene, "volume_step_size"); - integrator->no_caustics = get_boolean(cscene, "no_caustics"); + integrator->caustics_reflective = get_boolean(cscene, "caustics_reflective"); + 
integrator->caustics_refractive = get_boolean(cscene, "caustics_refractive"); integrator->filter_glossy = get_float(cscene, "blur_glossy"); integrator->seed = get_int(cscene, "seed"); + integrator->sampling_pattern = (SamplingPattern)RNA_enum_get(&cscene, "sampling_pattern"); integrator->layer_flag = render_layer.layer; @@ -227,10 +233,6 @@ void BlenderSync::sync_integrator() integrator->subsurface_samples = subsurface_samples; integrator->volume_samples = volume_samples; } - - - if(experimental) - integrator->sampling_pattern = (SamplingPattern)RNA_enum_get(&cscene, "sampling_pattern"); if(integrator->modified(previntegrator)) integrator->tag_update(scene); @@ -312,6 +314,8 @@ void BlenderSync::sync_render_layers(BL::SpaceView3D b_v3d, const char *layer) BL::RenderSettings::layers_iterator b_rlay; int use_layer_samples = RNA_enum_get(&cscene, "use_layer_samples"); bool first_layer = true; + uint layer_override = get_layer(b_engine.layer_override()); + uint scene_layers = layer_override ? 
layer_override : get_layer(b_scene.layers()); for(r.layers.begin(b_rlay); b_rlay != r.layers.end(); ++b_rlay) { if((!layer && first_layer) || (layer && b_rlay->name() == layer)) { @@ -320,7 +324,7 @@ void BlenderSync::sync_render_layers(BL::SpaceView3D b_v3d, const char *layer) render_layer.holdout_layer = get_layer(b_rlay->layers_zmask()); render_layer.exclude_layer = get_layer(b_rlay->layers_exclude()); - render_layer.scene_layer = get_layer(b_scene.layers()) & ~render_layer.exclude_layer; + render_layer.scene_layer = scene_layers & ~render_layer.exclude_layer; render_layer.scene_layer |= render_layer.exclude_layer & render_layer.holdout_layer; render_layer.layer = get_layer(b_rlay->layers()); @@ -357,9 +361,9 @@ SceneParams BlenderSync::get_scene_params(BL::Scene b_scene, bool background) const bool shadingsystem = RNA_boolean_get(&cscene, "shading_system"); if(shadingsystem == 0) - params.shadingsystem = SceneParams::SVM; + params.shadingsystem = SHADINGSYSTEM_SVM; else if(shadingsystem == 1) - params.shadingsystem = SceneParams::OSL; + params.shadingsystem = SHADINGSYSTEM_OSL; if(background) params.bvh_type = SceneParams::BVH_STATIC; @@ -369,7 +373,7 @@ SceneParams BlenderSync::get_scene_params(BL::Scene b_scene, bool background) params.use_bvh_spatial_split = RNA_boolean_get(&cscene, "debug_use_spatial_splits"); params.use_bvh_cache = (background)? 
RNA_boolean_get(&cscene, "use_cache"): false; - if(background && params.shadingsystem != SceneParams::OSL) + if(background && params.shadingsystem != SHADINGSYSTEM_OSL) params.persistent_data = r.use_persistent_data(); else params.persistent_data = false; @@ -506,9 +510,9 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine b_engine, BL::Use const bool shadingsystem = RNA_boolean_get(&cscene, "shading_system"); if(shadingsystem == 0) - params.shadingsystem = SessionParams::SVM; + params.shadingsystem = SHADINGSYSTEM_SVM; else if(shadingsystem == 1) - params.shadingsystem = SessionParams::OSL; + params.shadingsystem = SHADINGSYSTEM_OSL; /* color managagement */ params.display_buffer_linear = GLEW_ARB_half_float_pixel && b_engine.support_display_space_shader(b_scene); diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp index 3c0c5c021c8..15bd814b8d5 100644 --- a/intern/cycles/bvh/bvh.cpp +++ b/intern/cycles/bvh/bvh.cpp @@ -103,18 +103,30 @@ bool BVH::cache_read(CacheData& key) if(Cache::global.lookup(key, value)) { cache_filename = key.get_filename(); - value.read(pack.root_index); - value.read(pack.SAH); - - value.read(pack.nodes); - value.read(pack.object_node); - value.read(pack.tri_woop); - value.read(pack.prim_type); - value.read(pack.prim_visibility); - value.read(pack.prim_index); - value.read(pack.prim_object); - value.read(pack.is_leaf); - + if(!(value.read(pack.root_index) && + value.read(pack.SAH) && + value.read(pack.nodes) && + value.read(pack.object_node) && + value.read(pack.tri_woop) && + value.read(pack.prim_type) && + value.read(pack.prim_visibility) && + value.read(pack.prim_index) && + value.read(pack.prim_object) && + value.read(pack.is_leaf))) + { + /* Clear the pack if load failed. 
*/ + pack.root_index = 0; + pack.SAH = 0.0f; + pack.nodes.clear(); + pack.object_node.clear(); + pack.tri_woop.clear(); + pack.prim_type.clear(); + pack.prim_visibility.clear(); + pack.prim_index.clear(); + pack.prim_object.clear(); + pack.is_leaf.clear(); + return false; + } return true; } diff --git a/intern/cycles/bvh/bvh_params.h b/intern/cycles/bvh/bvh_params.h index ed67690a07f..e073b69472e 100644 --- a/intern/cycles/bvh/bvh_params.h +++ b/intern/cycles/bvh/bvh_params.h @@ -123,7 +123,7 @@ protected: /* BVH Range * * Build range used during construction, to indicate the bounds and place in - * the reference array of a subset of pirmitives Again uses trickery to pack + * the reference array of a subset of primitives Again uses trickery to pack * integers into BoundBox for alignment purposes. */ class BVHRange diff --git a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake index 8753ff4bf84..4f02b93f04a 100644 --- a/intern/cycles/cmake/external_libs.cmake +++ b/intern/cycles/cmake/external_libs.cmake @@ -1,4 +1,3 @@ - ########################################################################### # GLUT @@ -8,13 +7,17 @@ if(WITH_CYCLES_STANDALONE AND WITH_CYCLES_STANDALONE_GUI) find_package(GLUT) message(STATUS "GLUT_FOUND=${GLUT_FOUND}") - include_directories(${GLUT_INCLUDE_DIR}) + include_directories( + SYSTEM + ${GLUT_INCLUDE_DIR} + ) endif() -if(WITH_SYSTEM_GLEW) - set(CYCLES_GLEW_LIBRARY ${GLEW_LIBRARY}) -else() - set(CYCLES_GLEW_LIBRARY extern_glew) +########################################################################### +# GLEW + +if(WITH_CYCLES_STANDALONE AND WITH_CYCLES_STANDALONE_GUI) + set(CYCLES_APP_GLEW_LIBRARY ${BLENDER_GLEW_LIBRARIES}) endif() ########################################################################### @@ -29,4 +32,3 @@ if(WITH_CYCLES_CUDA_BINARIES) set(WITH_CYCLES_CUDA_BINARIES OFF) endif() endif() - diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index 
ae3309df3d9..998b35351e3 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -6,11 +6,13 @@ set(INC ../kernel/osl ../util ../render + ../../glew-mx ) set(INC_SYS - ${OPENGL_INCLUDE_DIR} ${GLEW_INCLUDE_PATH} + ../../../extern/cuew/include + ../../../extern/clew/include ) set(SRC @@ -36,7 +38,7 @@ set(SRC_HEADERS device_task.h ) -add_definitions(-DGLEW_STATIC) +add_definitions(${GL_DEFINITIONS}) include_directories(${INC}) include_directories(SYSTEM ${INC_SYS}) diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index 7fd1b79f6bc..efdfa98cfb5 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -20,12 +20,13 @@ #include "device.h" #include "device_intern.h" -#include "util_cuda.h" +#include "cuew.h" +#include "clew.h" + #include "util_debug.h" #include "util_foreach.h" #include "util_half.h" #include "util_math.h" -#include "util_opencl.h" #include "util_opengl.h" #include "util_time.h" #include "util_types.h" @@ -66,7 +67,7 @@ void Device::draw_pixels(device_memory& rgba, int y, int w, int h, int dy, int w glColor3f(1.0f, 1.0f, 1.0f); if(rgba.data_type == TYPE_HALF) { - /* for multi devices, this assumes the ineffecient method that we allocate + /* for multi devices, this assumes the inefficient method that we allocate * all pixels on the device even though we only render to a subset */ GLhalf *data_pointer = (GLhalf*)rgba.data_pointer; data_pointer += 4*y*w; @@ -141,7 +142,7 @@ Device *Device::create(DeviceInfo& info, Stats &stats, bool background) break; #ifdef WITH_CUDA case DEVICE_CUDA: - if(cuLibraryInit()) + if(device_cuda_init()) device = device_cuda_create(info, stats, background); else device = NULL; @@ -159,7 +160,7 @@ Device *Device::create(DeviceInfo& info, Stats &stats, bool background) #endif #ifdef WITH_OPENCL case DEVICE_OPENCL: - if(clLibraryInit()) + if(device_opencl_init()) device = device_opencl_create(info, stats, background); else device = 
NULL; @@ -213,12 +214,12 @@ vector<DeviceType>& Device::available_types() types.push_back(DEVICE_CPU); #ifdef WITH_CUDA - if(cuLibraryInit()) + if(device_cuda_init()) types.push_back(DEVICE_CUDA); #endif #ifdef WITH_OPENCL - if(clLibraryInit()) + if(device_opencl_init()) types.push_back(DEVICE_OPENCL); #endif @@ -242,12 +243,12 @@ vector<DeviceInfo>& Device::available_devices() if(!devices_init) { #ifdef WITH_CUDA - if(cuLibraryInit()) + if(device_cuda_init()) device_cuda_info(devices); #endif #ifdef WITH_OPENCL - if(clLibraryInit()) + if(device_opencl_init()) device_opencl_info(devices); #endif diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index bcddd4f73e2..20ebfd391d6 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -122,6 +122,7 @@ public: virtual bool load_kernels(bool experimental) { return true; } /* tasks */ + virtual int get_split_task_count(DeviceTask& task) = 0; virtual void task_add(DeviceTask& task) = 0; virtual void task_wait() = 0; virtual void task_cancel() = 0; diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index c9cc7592028..c9b8a5b726b 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -17,6 +17,11 @@ #include <stdlib.h> #include <string.h> +/* So ImathMath is included before our kernel_cpu_compat. 
*/ +#ifdef WITH_OSL +# include <OSL/oslexec.h> +#endif + #include "device.h" #include "device_intern.h" @@ -62,6 +67,7 @@ public: system_cpu_support_sse3(); system_cpu_support_sse41(); system_cpu_support_avx(); + system_cpu_support_avx2(); } ~CPUDevice() @@ -72,8 +78,8 @@ public: void mem_alloc(device_memory& mem, MemoryType type) { mem.device_pointer = mem.data_pointer; - - stats.mem_alloc(mem.memory_size()); + mem.device_size = mem.memory_size(); + stats.mem_alloc(mem.device_size); } void mem_copy_to(device_memory& mem) @@ -93,9 +99,11 @@ public: void mem_free(device_memory& mem) { - mem.device_pointer = 0; - - stats.mem_free(mem.memory_size()); + if(mem.device_pointer) { + mem.device_pointer = 0; + stats.mem_free(mem.device_size); + mem.device_size = 0; + } } void const_copy_to(const char *name, void *host, size_t size) @@ -107,15 +115,17 @@ public: { kernel_tex_copy(&kernel_globals, name, mem.data_pointer, mem.data_width, mem.data_height, mem.data_depth, interpolation); mem.device_pointer = mem.data_pointer; - - stats.mem_alloc(mem.memory_size()); + mem.device_size = mem.memory_size(); + stats.mem_alloc(mem.device_size); } void tex_free(device_memory& mem) { - mem.device_pointer = 0; - - stats.mem_free(mem.memory_size()); + if(mem.device_pointer) { + mem.device_pointer = 0; + stats.mem_free(mem.device_size); + mem.device_size = 0; + } } void *osl_memory() @@ -167,6 +177,28 @@ public: int start_sample = tile.start_sample; int end_sample = tile.start_sample + tile.num_samples; +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + if(system_cpu_support_avx2()) { + for(int sample = start_sample; sample < end_sample; sample++) { + if (task.get_cancel() || task_pool.canceled()) { + if(task.need_finish_queue == false) + break; + } + + for(int y = tile.y; y < tile.y + tile.h; y++) { + for(int x = tile.x; x < tile.x + tile.w; x++) { + kernel_cpu_avx2_path_trace(&kg, render_buffer, rng_state, + sample, x, y, tile.offset, tile.stride); + } + } + + tile.sample = sample + 1; + + 
task.update_progress(&tile); + } + } + else +#endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX if(system_cpu_support_avx()) { for(int sample = start_sample; sample < end_sample; sample++) { @@ -184,7 +216,7 @@ public: tile.sample = sample + 1; - task.update_progress(tile); + task.update_progress(&tile); } } else @@ -206,7 +238,7 @@ public: tile.sample = sample + 1; - task.update_progress(tile); + task.update_progress(&tile); } } else @@ -228,7 +260,7 @@ public: tile.sample = sample + 1; - task.update_progress(tile); + task.update_progress(&tile); } } else @@ -250,7 +282,7 @@ public: tile.sample = sample + 1; - task.update_progress(tile); + task.update_progress(&tile); } } else @@ -271,7 +303,7 @@ public: tile.sample = sample + 1; - task.update_progress(tile); + task.update_progress(&tile); } } @@ -293,6 +325,15 @@ public: float sample_scale = 1.0f/(task.sample + 1); if(task.rgba_half) { +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + if(system_cpu_support_avx2()) { + for(int y = task.y; y < task.y + task.h; y++) + for(int x = task.x; x < task.x + task.w; x++) + kernel_cpu_avx2_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); + } + else +#endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX if(system_cpu_support_avx()) { for(int y = task.y; y < task.y + task.h; y++) @@ -337,6 +378,15 @@ public: } } else { +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + if(system_cpu_support_avx2()) { + for(int y = task.y; y < task.y + task.h; y++) + for(int x = task.x; x < task.x + task.w; x++) + kernel_cpu_avx2_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); + } + else +#endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX if(system_cpu_support_avx()) { for(int y = task.y; y < task.y + task.h; y++) @@ -390,56 +440,91 @@ public: OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); #endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + 
if(system_cpu_support_avx2()) { + for(int sample = 0; sample < task.num_samples; sample++) { + for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) + kernel_cpu_avx2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, + task.shader_eval_type, x, task.offset, sample); + + if(task.get_cancel() || task_pool.canceled()) + break; + + task.update_progress(NULL); + } + } + else +#endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX if(system_cpu_support_avx()) { - for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) { - kernel_cpu_avx_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x); + for(int sample = 0; sample < task.num_samples; sample++) { + for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) + kernel_cpu_avx_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, + task.shader_eval_type, x, task.offset, sample); if(task.get_cancel() || task_pool.canceled()) break; + + task.update_progress(NULL); } } else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 if(system_cpu_support_sse41()) { - for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) { - kernel_cpu_sse41_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x); + for(int sample = 0; sample < task.num_samples; sample++) { + for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) + kernel_cpu_sse41_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, + task.shader_eval_type, x, task.offset, sample); if(task.get_cancel() || task_pool.canceled()) break; + + task.update_progress(NULL); } } else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 if(system_cpu_support_sse3()) { - for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) { - kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x); + for(int sample = 0; sample < task.num_samples; sample++) { + for(int x = 
task.shader_x; x < task.shader_x + task.shader_w; x++) + kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, + task.shader_eval_type, x, task.offset, sample); if(task.get_cancel() || task_pool.canceled()) break; + + task.update_progress(NULL); } } else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 if(system_cpu_support_sse2()) { - for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) { - kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x); + for(int sample = 0; sample < task.num_samples; sample++) { + for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) + kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, + task.shader_eval_type, x, task.offset, sample); if(task.get_cancel() || task_pool.canceled()) break; + + task.update_progress(NULL); } } else #endif { - for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) { - kernel_cpu_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x); + for(int sample = 0; sample < task.num_samples; sample++) { + for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) + kernel_cpu_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, + task.shader_eval_type, x, task.offset, sample); if(task.get_cancel() || task_pool.canceled()) break; + + task.update_progress(NULL); } } @@ -448,11 +533,23 @@ public: #endif } + int get_split_task_count(DeviceTask& task) + { + if (task.type == DeviceTask::SHADER) + return task.get_subtask_count(TaskScheduler::num_threads(), 256); + else + return task.get_subtask_count(TaskScheduler::num_threads()); + } + void task_add(DeviceTask& task) { /* split task into smaller ones */ list<DeviceTask> tasks; - task.split(tasks, TaskScheduler::num_threads()); + + if(task.type == DeviceTask::SHADER) + task.split(tasks, TaskScheduler::num_threads(), 256); + else + task.split(tasks, 
TaskScheduler::num_threads()); foreach(DeviceTask& task, tasks) task_pool.push(new CPUDeviceTask(this, task)); diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 68955211146..844fb3b8d50 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -23,7 +23,7 @@ #include "buffers.h" -#include "util_cuda.h" +#include "cuew.h" #include "util_debug.h" #include "util_map.h" #include "util_opengl.h" @@ -41,14 +41,11 @@ public: CUdevice cuDevice; CUcontext cuContext; CUmodule cuModule; - CUstream cuStream; - CUevent tileDone; map<device_ptr, bool> tex_interp_map; int cuDevId; int cuDevArchitecture; bool first_error; bool use_texture_storage; - unsigned int target_update_frequency; struct PixelMem { GLuint cuPBO; @@ -64,53 +61,10 @@ public: return (CUdeviceptr)mem; } - static const char *cuda_error_string(CUresult result) + static bool have_precompiled_kernels() { - switch(result) { - case CUDA_SUCCESS: return "No errors"; - case CUDA_ERROR_INVALID_VALUE: return "Invalid value"; - case CUDA_ERROR_OUT_OF_MEMORY: return "Out of memory"; - case CUDA_ERROR_NOT_INITIALIZED: return "Driver not initialized"; - case CUDA_ERROR_DEINITIALIZED: return "Driver deinitialized"; - - case CUDA_ERROR_NO_DEVICE: return "No CUDA-capable device available"; - case CUDA_ERROR_INVALID_DEVICE: return "Invalid device"; - - case CUDA_ERROR_INVALID_IMAGE: return "Invalid kernel image"; - case CUDA_ERROR_INVALID_CONTEXT: return "Invalid context"; - case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: return "Context already current"; - case CUDA_ERROR_MAP_FAILED: return "Map failed"; - case CUDA_ERROR_UNMAP_FAILED: return "Unmap failed"; - case CUDA_ERROR_ARRAY_IS_MAPPED: return "Array is mapped"; - case CUDA_ERROR_ALREADY_MAPPED: return "Already mapped"; - case CUDA_ERROR_NO_BINARY_FOR_GPU: return "No binary for GPU"; - case CUDA_ERROR_ALREADY_ACQUIRED: return "Already acquired"; - case CUDA_ERROR_NOT_MAPPED: return "Not mapped"; - 
case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: return "Mapped resource not available for access as an array"; - case CUDA_ERROR_NOT_MAPPED_AS_POINTER: return "Mapped resource not available for access as a pointer"; - case CUDA_ERROR_ECC_UNCORRECTABLE: return "Uncorrectable ECC error detected"; - case CUDA_ERROR_UNSUPPORTED_LIMIT: return "CUlimit not supported by device"; - - case CUDA_ERROR_INVALID_SOURCE: return "Invalid source"; - case CUDA_ERROR_FILE_NOT_FOUND: return "File not found"; - case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: return "Link to a shared object failed to resolve"; - case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: return "Shared object initialization failed"; - - case CUDA_ERROR_INVALID_HANDLE: return "Invalid handle"; - - case CUDA_ERROR_NOT_FOUND: return "Not found"; - - case CUDA_ERROR_NOT_READY: return "CUDA not ready"; - - case CUDA_ERROR_LAUNCH_FAILED: return "Launch failed"; - case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return "Launch exceeded resources"; - case CUDA_ERROR_LAUNCH_TIMEOUT: return "Launch exceeded timeout"; - case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: return "Launch with incompatible texturing"; - - case CUDA_ERROR_UNKNOWN: return "Unknown error"; - - default: return "Unknown CUDA error value"; - } + string cubins_path = path_get("lib"); + return path_exists(cubins_path); } /*#ifdef NDEBUG @@ -132,7 +86,7 @@ public: CUresult result = stmt; \ \ if(result != CUDA_SUCCESS) { \ - string message = string_printf("CUDA error: %s in %s", cuda_error_string(result), #stmt); \ + string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \ if(error_msg == "") \ error_msg = message; \ fprintf(stderr, "%s\n", message.c_str()); \ @@ -146,7 +100,7 @@ public: if(result == CUDA_SUCCESS) return false; - string message = string_printf("CUDA error at %s: %s", stmt.c_str(), cuda_error_string(result)); + string message = string_printf("CUDA error at %s: %s", stmt.c_str(), cuewErrorString(result)); if(error_msg == "") error_msg 
= message; fprintf(stderr, "%s\n", message.c_str()); @@ -180,8 +134,6 @@ public: first_error = true; background = background_; use_texture_storage = true; - /* we try an update / sync every 1000 ms */ - target_update_frequency = 1000; cuDevId = info.num; cuDevice = 0; @@ -212,9 +164,6 @@ public: if(cuda_error_(result, "cuCtxCreate")) return; - cuda_assert(cuStreamCreate(&cuStream, 0)); - cuda_assert(cuEventCreate(&tileDone, 0x1)); - int major, minor; cuDeviceComputeCapability(&major, &minor, cuDevId); cuDevArchitecture = major*100 + minor*10; @@ -231,12 +180,10 @@ public: { task_pool.stop(); - cuda_assert(cuEventDestroy(tileDone)); - cuda_assert(cuStreamDestroy(cuStream)); cuda_assert(cuCtxDestroy(cuContext)); } - bool support_device(bool experimental, bool branched) + bool support_device(bool experimental) { int major, minor; cuDeviceComputeCapability(&major, &minor, cuDevId); @@ -250,14 +197,22 @@ public: return true; } - string compile_kernel() + string compile_kernel(bool experimental) { /* compute cubin name */ int major, minor; cuDeviceComputeCapability(&major, &minor, cuDevId); + + /* workaround to make sm_52 cards work, until we bundle kernel */ + if(major == 5 && minor == 2) + minor = 0; /* attempt to use kernel provided with blender */ - string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", major, minor)); + string cubin; + if(experimental) + cubin = path_get(string_printf("lib/kernel_experimental_sm_%d%d.cubin", major, minor)); + else + cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", major, minor)); if(path_exists(cubin)) return cubin; @@ -265,7 +220,10 @@ public: string kernel_path = path_get("kernel"); string md5 = path_files_md5_hash(kernel_path); - cubin = string_printf("cycles_kernel_sm%d%d_%s.cubin", major, minor, md5.c_str()); + if(experimental) + cubin = string_printf("cycles_kernel_experimental_sm%d%d_%s.cubin", major, minor, md5.c_str()); + else + cubin = string_printf("cycles_kernel_sm%d%d_%s.cubin", major, minor, 
md5.c_str()); cubin = path_user_get(path_join("cache", cubin)); /* if exists already, use it */ @@ -273,7 +231,7 @@ public: return cubin; #ifdef _WIN32 - if(cuHavePrecompiledKernels()) { + if(have_precompiled_kernels()) { if(major < 2) cuda_error_message(string_printf("CUDA device requires compute capability 2.0 or up, found %d.%d. Your GPU is not supported.", major, minor)); else @@ -283,25 +241,25 @@ public: #endif /* if not, find CUDA compiler */ - string nvcc = cuCompilerPath(); + const char *nvcc = cuewCompilerPath(); - if(nvcc == "") { + if(nvcc == NULL) { cuda_error_message("CUDA nvcc compiler not found. Install CUDA toolkit in default location."); return ""; } - int cuda_version = cuCompilerVersion(); + int cuda_version = cuewCompilerVersion(); if(cuda_version == 0) { cuda_error_message("CUDA nvcc compiler version could not be parsed."); return ""; } - if(cuda_version < 50) { - printf("Unsupported CUDA version %d.%d detected, you need CUDA 6.0.\n", cuda_version/10, cuda_version%10); + if(cuda_version < 60) { + printf("Unsupported CUDA version %d.%d detected, you need CUDA 6.5.\n", cuda_version/10, cuda_version%10); return ""; } - else if(cuda_version != 60) - printf("CUDA version %d.%d detected, build may succeed but only CUDA 6.0 is officially supported.\n", cuda_version/10, cuda_version%10); + else if(cuda_version != 65) + printf("CUDA version %d.%d detected, build may succeed but only CUDA 6.5 is officially supported.\n", cuda_version/10, cuda_version%10); /* compile */ string kernel = path_join(kernel_path, "kernel.cu"); @@ -315,7 +273,14 @@ public: string command = string_printf("\"%s\" -arch=sm_%d%d -m%d --cubin \"%s\" " "-o \"%s\" --ptxas-options=\"-v\" -I\"%s\" -DNVCC -D__KERNEL_CUDA_VERSION__=%d", - nvcc.c_str(), major, minor, machine, kernel.c_str(), cubin.c_str(), include.c_str(), cuda_version); + nvcc, major, minor, machine, kernel.c_str(), cubin.c_str(), include.c_str(), cuda_version); + + if(experimental) + command += " 
-D__KERNEL_CUDA_EXPERIMENTAL__"; + +#ifdef WITH_CYCLES_DEBUG + command += " -D__KERNEL_DEBUG__"; +#endif printf("%s\n", command.c_str()); @@ -342,11 +307,11 @@ public: return false; /* check if GPU is supported */ - if(!support_device(experimental, false)) + if(!support_device(experimental)) return false; /* get kernel */ - string cubin = compile_kernel(); + string cubin = compile_kernel(experimental); if(cubin == "") return false; @@ -377,6 +342,7 @@ public: size_t size = mem.memory_size(); cuda_assert(cuMemAlloc(&device_pointer, size)); mem.device_pointer = (device_ptr)device_pointer; + mem.device_size = size; stats.mem_alloc(size); cuda_pop_context(); } @@ -397,7 +363,7 @@ public: cuda_push_context(); if(mem.device_pointer) { cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset, - (CUdeviceptr)((uchar*)mem.device_pointer + offset), size)); + (CUdeviceptr)(mem.device_pointer + offset), size)); } else { memset((char*)mem.data_pointer + offset, 0, size); @@ -424,7 +390,8 @@ public: mem.device_pointer = 0; - stats.mem_free(mem.memory_size()); + stats.mem_free(mem.device_size); + mem.device_size = 0; } } @@ -516,6 +483,7 @@ public: cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES)); mem.device_pointer = (device_ptr)handle; + mem.device_size = size; stats.mem_alloc(size); } @@ -583,7 +551,8 @@ public: tex_interp_map.erase(tex_interp_map.find(mem.device_pointer)); mem.device_pointer = 0; - stats.mem_free(mem.memory_size()); + stats.mem_free(mem.device_size); + mem.device_size = 0; } else { tex_interp_map.erase(tex_interp_map.find(mem.device_pointer)); @@ -604,7 +573,7 @@ public: CUdeviceptr d_rng_state = cuda_device_ptr(rtile.rng_state); /* get kernel function */ - if(branched && support_device(true, branched)) { + if(branched) { cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace")); } else { @@ -613,40 +582,17 @@ public: if(have_error()) return; - - /* pass in parameters */ - int offset = 0; - - 
cuda_assert(cuParamSetv(cuPathTrace, offset, &d_buffer, sizeof(d_buffer))); - offset += sizeof(d_buffer); - - cuda_assert(cuParamSetv(cuPathTrace, offset, &d_rng_state, sizeof(d_rng_state))); - offset += sizeof(d_rng_state); - - offset = align_up(offset, __alignof(sample)); - - cuda_assert(cuParamSeti(cuPathTrace, offset, sample)); - offset += sizeof(sample); - - cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.x)); - offset += sizeof(rtile.x); - - cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.y)); - offset += sizeof(rtile.y); - - cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.w)); - offset += sizeof(rtile.w); - - cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.h)); - offset += sizeof(rtile.h); - - cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.offset)); - offset += sizeof(rtile.offset); - cuda_assert(cuParamSeti(cuPathTrace, offset, rtile.stride)); - offset += sizeof(rtile.stride); - - cuda_assert(cuParamSetSize(cuPathTrace, offset)); + /* pass in parameters */ + void *args[] = {&d_buffer, + &d_rng_state, + &sample, + &rtile.x, + &rtile.y, + &rtile.w, + &rtile.h, + &rtile.offset, + &rtile.stride}; /* launch kernel */ int threads_per_block; @@ -664,16 +610,13 @@ public: int yblocks = (rtile.h + ythreads - 1)/ythreads; cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetBlockShape(cuPathTrace, xthreads, ythreads, 1)); - if(info.display_device) { - /* don't use async for device used for display, locks up UI too much */ - cuda_assert(cuLaunchGrid(cuPathTrace, xblocks, yblocks)); - cuda_assert(cuCtxSynchronize()); - } - else { - cuda_assert(cuLaunchGridAsync(cuPathTrace, xblocks, yblocks, cuStream)); - } + cuda_assert(cuLaunchKernel(cuPathTrace, + xblocks , yblocks, 1, /* blocks */ + xthreads, ythreads, 1, /* threads */ + 0, 0, args, 0)); + + cuda_assert(cuCtxSynchronize()); cuda_pop_context(); } @@ -697,40 +640,19 @@ public: cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, 
"kernel_cuda_convert_to_byte")); } - /* pass in parameters */ - int offset = 0; - - cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_rgba, sizeof(d_rgba))); - offset += sizeof(d_rgba); - - cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_buffer, sizeof(d_buffer))); - offset += sizeof(d_buffer); float sample_scale = 1.0f/(task.sample + 1); - offset = align_up(offset, __alignof(sample_scale)); - - cuda_assert(cuParamSetf(cuFilmConvert, offset, sample_scale)); - offset += sizeof(sample_scale); - - cuda_assert(cuParamSeti(cuFilmConvert, offset, task.x)); - offset += sizeof(task.x); - - cuda_assert(cuParamSeti(cuFilmConvert, offset, task.y)); - offset += sizeof(task.y); - - cuda_assert(cuParamSeti(cuFilmConvert, offset, task.w)); - offset += sizeof(task.w); - cuda_assert(cuParamSeti(cuFilmConvert, offset, task.h)); - offset += sizeof(task.h); - - cuda_assert(cuParamSeti(cuFilmConvert, offset, task.offset)); - offset += sizeof(task.offset); - - cuda_assert(cuParamSeti(cuFilmConvert, offset, task.stride)); - offset += sizeof(task.stride); - - cuda_assert(cuParamSetSize(cuFilmConvert, offset)); + /* pass in parameters */ + void *args[] = {&d_rgba, + &d_buffer, + &sample_scale, + &task.x, + &task.y, + &task.w, + &task.h, + &task.offset, + &task.stride}; /* launch kernel */ int threads_per_block; @@ -742,8 +664,11 @@ public: int yblocks = (task.h + ythreads - 1)/ythreads; cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetBlockShape(cuFilmConvert, xthreads, ythreads, 1)); - cuda_assert(cuLaunchGrid(cuFilmConvert, xblocks, yblocks)); + + cuda_assert(cuLaunchKernel(cuFilmConvert, + xblocks , yblocks, 1, /* blocks */ + xthreads, ythreads, 1, /* threads */ + 0, 0, args, 0)); unmap_pixels((rgba_byte)? 
rgba_byte: rgba_half); @@ -762,49 +687,54 @@ public: CUdeviceptr d_output = cuda_device_ptr(task.shader_output); /* get kernel function */ - cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_shader")); + if(task.shader_eval_type >= SHADER_EVAL_BAKE) { + cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake")); + } + else { + cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_shader")); + } /* do tasks in smaller chunks, so we can cancel it */ const int shader_chunk_size = 65536; const int start = task.shader_x; const int end = task.shader_x + task.shader_w; + int offset = task.offset; + + bool canceled = false; + for(int sample = 0; sample < task.num_samples && !canceled; sample++) { + for(int shader_x = start; shader_x < end; shader_x += shader_chunk_size) { + int shader_w = min(shader_chunk_size, end - shader_x); + + /* pass in parameters */ + void *args[] = {&d_input, + &d_output, + &task.shader_eval_type, + &shader_x, + &shader_w, + &offset, + &sample}; + + /* launch kernel */ + int threads_per_block; + cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader)); + + int xblocks = (shader_w + threads_per_block - 1)/threads_per_block; + + cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuLaunchKernel(cuShader, + xblocks , 1, 1, /* blocks */ + threads_per_block, 1, 1, /* threads */ + 0, 0, args, 0)); + + cuda_assert(cuCtxSynchronize()); + + if(task.get_cancel()) { + canceled = false; + break; + } + } - for(int shader_x = start; shader_x < end; shader_x += shader_chunk_size) { - if(task.get_cancel()) - break; - - /* pass in parameters */ - int offset = 0; - - cuda_assert(cuParamSetv(cuShader, offset, &d_input, sizeof(d_input))); - offset += sizeof(d_input); - - cuda_assert(cuParamSetv(cuShader, offset, &d_output, sizeof(d_output))); - offset += sizeof(d_output); - - int shader_eval_type = task.shader_eval_type; - offset = align_up(offset, 
__alignof(shader_eval_type)); - - cuda_assert(cuParamSeti(cuShader, offset, task.shader_eval_type)); - offset += sizeof(task.shader_eval_type); - - cuda_assert(cuParamSeti(cuShader, offset, shader_x)); - offset += sizeof(shader_x); - - cuda_assert(cuParamSetSize(cuShader, offset)); - - /* launch kernel */ - int threads_per_block; - cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader)); - - int shader_w = min(shader_chunk_size, end - shader_x); - int xblocks = (shader_w + threads_per_block - 1)/threads_per_block; - - cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetBlockShape(cuShader, threads_per_block, 1, 1)); - cuda_assert(cuLaunchGrid(cuShader, xblocks, 1)); - - cuda_assert(cuCtxSynchronize()); + task.update_progress(NULL); } cuda_pop_context(); @@ -872,7 +802,8 @@ public: mem.device_pointer = pmem.cuTexId; pixel_mem_map[mem.device_pointer] = pmem; - stats.mem_alloc(mem.memory_size()); + mem.device_size = mem.memory_size(); + stats.mem_alloc(mem.device_size); return; } @@ -929,7 +860,8 @@ public: pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer)); mem.device_pointer = 0; - stats.mem_free(mem.memory_size()); + stats.mem_free(mem.device_size); + mem.device_size = 0; return; } @@ -946,7 +878,7 @@ public: cuda_push_context(); - /* for multi devices, this assumes the ineffecient method that we allocate + /* for multi devices, this assumes the inefficient method that we allocate * all pixels on the device even though we only render to a subset */ size_t offset = 4*y*w; @@ -1024,10 +956,6 @@ public: int start_sample = tile.start_sample; int end_sample = tile.start_sample + tile.num_samples; - boost::posix_time::ptime start_time(boost::posix_time::microsec_clock::local_time()); - boost::posix_time::ptime last_time = start_time; - int sync_sample = 10; - for(int sample = start_sample; sample < end_sample; sample++) { if (task->get_cancel()) { if(task->need_finish_queue 
== false) @@ -1037,28 +965,8 @@ public: path_trace(tile, sample, branched); tile.sample = sample + 1; - task->update_progress(tile); - if(!info.display_device && sample == sync_sample) { - cuda_push_context(); - cuda_assert(cuEventRecord(tileDone, cuStream)); - cuda_assert(cuEventSynchronize(tileDone)); - - /* Do some time keeping to find out if we need to sync less */ - boost::posix_time::ptime current_time(boost::posix_time::microsec_clock::local_time()); - boost::posix_time::time_duration sample_duration = current_time - last_time; - - long msec = sample_duration.total_milliseconds(); - float scaling_factor = (float)target_update_frequency / (float)msec; - - /* sync at earliest next sample and probably later */ - sync_sample = (sample + 1) + sync_sample * (int)ceil(scaling_factor); - - sync_sample = min(end_sample - 1, sync_sample); // make sure we sync the last sample always - - last_time = current_time; - cuda_pop_context(); - } + task->update_progress(&tile); } task->release_tile(tile); @@ -1082,6 +990,11 @@ public: } }; + int get_split_task_count(DeviceTask& task) + { + return 1; + } + void task_add(DeviceTask& task) { if(task.type == DeviceTask::FILM_CONVERT) { @@ -1108,6 +1021,28 @@ public: } }; +bool device_cuda_init(void) +{ + static bool initialized = false; + static bool result = false; + + if (initialized) + return result; + + initialized = true; + + if (cuewInit() == CUEW_SUCCESS) { + if(CUDADevice::have_precompiled_kernels()) + result = true; +#ifndef _WIN32 + else if(cuewCompilerPath() != NULL) + result = true; +#endif + } + + return result; +} + Device *device_cuda_create(DeviceInfo& info, Stats &stats, bool background) { return new CUDADevice(info, stats, background); @@ -1121,13 +1056,13 @@ void device_cuda_info(vector<DeviceInfo>& devices) result = cuInit(0); if(result != CUDA_SUCCESS) { if(result != CUDA_ERROR_NO_DEVICE) - fprintf(stderr, "CUDA cuInit: %s\n", CUDADevice::cuda_error_string(result)); + fprintf(stderr, "CUDA cuInit: %s\n", 
cuewErrorString(result)); return; } result = cuDeviceGetCount(&count); if(result != CUDA_SUCCESS) { - fprintf(stderr, "CUDA cuDeviceGetCount: %s\n", CUDADevice::cuda_error_string(result)); + fprintf(stderr, "CUDA cuDeviceGetCount: %s\n", cuewErrorString(result)); return; } diff --git a/intern/cycles/device/device_intern.h b/intern/cycles/device/device_intern.h index 7eb66c25a81..80f1e2441a5 100644 --- a/intern/cycles/device/device_intern.h +++ b/intern/cycles/device/device_intern.h @@ -22,7 +22,9 @@ CCL_NAMESPACE_BEGIN class Device; Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background); +bool device_opencl_init(void); Device *device_opencl_create(DeviceInfo& info, Stats &stats, bool background); +bool device_cuda_init(void); Device *device_cuda_create(DeviceInfo& info, Stats &stats, bool background); Device *device_network_create(DeviceInfo& info, Stats &stats, const char *address); Device *device_multi_create(DeviceInfo& info, Stats &stats, bool background); diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h index 8d6f4a49a9c..07a6eb36a3c 100644 --- a/intern/cycles/device/device_memory.h +++ b/intern/cycles/device/device_memory.h @@ -167,6 +167,7 @@ public: int data_elements; device_ptr data_pointer; size_t data_size; + size_t device_size; size_t data_width; size_t data_height; size_t data_depth; @@ -194,6 +195,7 @@ public: data_elements = device_type_traits<T>::num_elements; data_pointer = 0; data_size = 0; + device_size = 0; data_width = 0; data_height = 0; data_depth = 0; @@ -258,6 +260,11 @@ public: return data.size(); } + T* get_data() + { + return &data[0]; + } + private: array<T> data; }; diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp index c866ebaaea2..7f055c79491 100644 --- a/intern/cycles/device/device_multi.cpp +++ b/intern/cycles/device/device_multi.cpp @@ -278,6 +278,22 @@ public: return -1; } + int get_split_task_count(DeviceTask& task) + { + int 
total_tasks = 0; + list<DeviceTask> tasks; + task.split(tasks, devices.size()); + foreach(SubDevice& sub, devices) { + if(!tasks.empty()) { + DeviceTask subtask = tasks.front(); + tasks.pop_front(); + + total_tasks += sub.device->get_split_task_count(subtask); + } + } + return total_tasks; + } + void task_add(DeviceTask& task) { list<DeviceTask> tasks; diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp index af051076009..dca9bf29e70 100644 --- a/intern/cycles/device/device_network.cpp +++ b/intern/cycles/device/device_network.cpp @@ -299,6 +299,11 @@ public: snd.write(); } + int get_split_task_count(DeviceTask& task) + { + return 1; + } + private: NetworkError error_func; }; diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp index 694ec9db036..58b2bcafb82 100644 --- a/intern/cycles/device/device_opencl.cpp +++ b/intern/cycles/device/device_opencl.cpp @@ -25,11 +25,12 @@ #include "buffers.h" +#include "clew.h" + #include "util_foreach.h" #include "util_map.h" #include "util_math.h" #include "util_md5.h" -#include "util_opencl.h" #include "util_opengl.h" #include "util_path.h" #include "util_time.h" @@ -101,7 +102,11 @@ static string opencl_kernel_build_options(const string& platform, const string * if(opencl_kernel_use_debug()) build_options += "-D__KERNEL_OPENCL_DEBUG__ "; - + +#ifdef WITH_CYCLES_DEBUG + build_options += "-D__KERNEL_DEBUG__ "; +#endif + return build_options; } @@ -321,6 +326,7 @@ public: cl_kernel ckFilmConvertByteKernel; cl_kernel ckFilmConvertHalfFloatKernel; cl_kernel ckShaderKernel; + cl_kernel ckBakeKernel; cl_int ciErr; typedef map<string, device_vector<uchar>*> ConstMemMap; @@ -333,63 +339,10 @@ public: bool device_initialized; string platform_name; - const char *opencl_error_string(cl_int err) - { - switch (err) { - case CL_SUCCESS: return "Success!"; - case CL_DEVICE_NOT_FOUND: return "Device not found."; - case CL_DEVICE_NOT_AVAILABLE: return "Device not 
available"; - case CL_COMPILER_NOT_AVAILABLE: return "Compiler not available"; - case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "Memory object allocation failure"; - case CL_OUT_OF_RESOURCES: return "Out of resources"; - case CL_OUT_OF_HOST_MEMORY: return "Out of host memory"; - case CL_PROFILING_INFO_NOT_AVAILABLE: return "Profiling information not available"; - case CL_MEM_COPY_OVERLAP: return "Memory copy overlap"; - case CL_IMAGE_FORMAT_MISMATCH: return "Image format mismatch"; - case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "Image format not supported"; - case CL_BUILD_PROGRAM_FAILURE: return "Program build failure"; - case CL_MAP_FAILURE: return "Map failure"; - case CL_INVALID_VALUE: return "Invalid value"; - case CL_INVALID_DEVICE_TYPE: return "Invalid device type"; - case CL_INVALID_PLATFORM: return "Invalid platform"; - case CL_INVALID_DEVICE: return "Invalid device"; - case CL_INVALID_CONTEXT: return "Invalid context"; - case CL_INVALID_QUEUE_PROPERTIES: return "Invalid queue properties"; - case CL_INVALID_COMMAND_QUEUE: return "Invalid command queue"; - case CL_INVALID_HOST_PTR: return "Invalid host pointer"; - case CL_INVALID_MEM_OBJECT: return "Invalid memory object"; - case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "Invalid image format descriptor"; - case CL_INVALID_IMAGE_SIZE: return "Invalid image size"; - case CL_INVALID_SAMPLER: return "Invalid sampler"; - case CL_INVALID_BINARY: return "Invalid binary"; - case CL_INVALID_BUILD_OPTIONS: return "Invalid build options"; - case CL_INVALID_PROGRAM: return "Invalid program"; - case CL_INVALID_PROGRAM_EXECUTABLE: return "Invalid program executable"; - case CL_INVALID_KERNEL_NAME: return "Invalid kernel name"; - case CL_INVALID_KERNEL_DEFINITION: return "Invalid kernel definition"; - case CL_INVALID_KERNEL: return "Invalid kernel"; - case CL_INVALID_ARG_INDEX: return "Invalid argument index"; - case CL_INVALID_ARG_VALUE: return "Invalid argument value"; - case CL_INVALID_ARG_SIZE: return "Invalid argument 
size"; - case CL_INVALID_KERNEL_ARGS: return "Invalid kernel arguments"; - case CL_INVALID_WORK_DIMENSION: return "Invalid work dimension"; - case CL_INVALID_WORK_GROUP_SIZE: return "Invalid work group size"; - case CL_INVALID_WORK_ITEM_SIZE: return "Invalid work item size"; - case CL_INVALID_GLOBAL_OFFSET: return "Invalid global offset"; - case CL_INVALID_EVENT_WAIT_LIST: return "Invalid event wait list"; - case CL_INVALID_EVENT: return "Invalid event"; - case CL_INVALID_OPERATION: return "Invalid operation"; - case CL_INVALID_GL_OBJECT: return "Invalid OpenGL object"; - case CL_INVALID_BUFFER_SIZE: return "Invalid buffer size"; - case CL_INVALID_MIP_LEVEL: return "Invalid mip-map level"; - default: return "Unknown"; - } - } - bool opencl_error(cl_int err) { if(err != CL_SUCCESS) { - string message = string_printf("OpenCL error (%d): %s", err, opencl_error_string(err)); + string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err)); if(error_msg == "") error_msg = message; fprintf(stderr, "%s\n", message.c_str()); @@ -411,7 +364,7 @@ public: cl_int err = stmt; \ \ if(err != CL_SUCCESS) { \ - string message = string_printf("OpenCL error: %s in %s", opencl_error_string(err), #stmt); \ + string message = string_printf("OpenCL error: %s in %s", clewErrorString(err), #stmt); \ if(error_msg == "") \ error_msg = message; \ fprintf(stderr, "%s\n", message.c_str()); \ @@ -421,7 +374,7 @@ public: void opencl_assert_err(cl_int err, const char* where) { if(err != CL_SUCCESS) { - string message = string_printf("OpenCL error (%d): %s in %s", err, opencl_error_string(err), where); + string message = string_printf("OpenCL error (%d): %s in %s", err, clewErrorString(err), where); if(error_msg == "") error_msg = message; fprintf(stderr, "%s\n", message.c_str()); @@ -443,6 +396,7 @@ public: ckFilmConvertByteKernel = NULL; ckFilmConvertHalfFloatKernel = NULL; ckShaderKernel = NULL; + ckBakeKernel = NULL; null_mem = 0; device_initialized = false; @@ -550,7 +504,7 
@@ public: device_initialized = true; } - static void context_notify_callback(const char *err_info, + static void CL_CALLBACK context_notify_callback(const char *err_info, const void *private_info, size_t cb, void *user_data) { char name[256]; @@ -791,6 +745,10 @@ public: if(opencl_error(ciErr)) return false; + ckBakeKernel = clCreateKernel(cpProgram, "kernel_ocl_bake", &ciErr); + if(opencl_error(ciErr)) + return false; + return true; } @@ -840,6 +798,7 @@ public: opencl_assert_err(ciErr, "clCreateBuffer"); stats.mem_alloc(size); + mem.device_size = size; } void mem_copy_to(device_memory& mem) @@ -871,7 +830,8 @@ public: opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer))); mem.device_pointer = 0; - stats.mem_free(mem.memory_size()); + stats.mem_free(mem.device_size); + mem.device_size = 0; } } @@ -1050,23 +1010,43 @@ public: cl_int d_shader_eval_type = task.shader_eval_type; cl_int d_shader_x = task.shader_x; cl_int d_shader_w = task.shader_w; + cl_int d_offset = task.offset; /* sample arguments */ cl_uint narg = 0; - opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_data), (void*)&d_data)); - opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_input), (void*)&d_input)); - opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_output), (void*)&d_output)); + cl_kernel kernel; + + if(task.shader_eval_type >= SHADER_EVAL_BAKE) + kernel = ckBakeKernel; + else + kernel = ckShaderKernel; + + for(int sample = 0; sample < task.num_samples; sample++) { + + if(task.get_cancel()) + break; + + cl_int d_sample = sample; + + opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_data), (void*)&d_data)); + opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_input), (void*)&d_input)); + opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_output), (void*)&d_output)); #define KERNEL_TEX(type, ttype, name) \ - set_kernel_arg_mem(ckShaderKernel, &narg, #name); + set_kernel_arg_mem(kernel, &narg, #name); #include "kernel_textures.h" - 
opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_shader_eval_type), (void*)&d_shader_eval_type)); - opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_shader_x), (void*)&d_shader_x)); - opencl_assert(clSetKernelArg(ckShaderKernel, narg++, sizeof(d_shader_w), (void*)&d_shader_w)); + opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_eval_type), (void*)&d_shader_eval_type)); + opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_x), (void*)&d_shader_x)); + opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_w), (void*)&d_shader_w)); + opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_offset), (void*)&d_offset)); + opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_sample), (void*)&d_sample)); + + enqueue_kernel(kernel, task.shader_w, 1); - enqueue_kernel(ckShaderKernel, task.shader_w, 1); + task.update_progress(NULL); + } } void thread_run(DeviceTask *task) @@ -1095,7 +1075,7 @@ public: tile.sample = sample + 1; - task->update_progress(tile); + task->update_progress(&tile); } task->release_tile(tile); @@ -1112,6 +1092,11 @@ public: } }; + int get_split_task_count(DeviceTask& task) + { + return 1; + } + void task_add(DeviceTask& task) { task_pool.push(new OpenCLDeviceTask(this, task)); @@ -1133,6 +1118,26 @@ Device *device_opencl_create(DeviceInfo& info, Stats &stats, bool background) return new OpenCLDevice(info, stats, background); } +bool device_opencl_init(void) { + static bool initialized = false; + static bool result = false; + + if (initialized) + return result; + + initialized = true; + + // OpenCL disabled for now, only works with this environment variable set + if(!getenv("CYCLES_OPENCL_TEST")) { + result = false; + } + else { + result = clewInit() == CLEW_SUCCESS; + } + + return result; +} + void device_opencl_info(vector<DeviceInfo>& devices) { vector<cl_device_id> device_ids; diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp index 7d0eeab780d..dc124f8cf37 
100644 --- a/intern/cycles/device/device_task.cpp +++ b/intern/cycles/device/device_task.cpp @@ -35,26 +35,39 @@ DeviceTask::DeviceTask(Type type_) last_update_time = time_dt(); } -void DeviceTask::split_max_size(list<DeviceTask>& tasks, int max_size) +int DeviceTask::get_subtask_count(int num, int max_size) { - int num; + if(max_size != 0) { + int max_size_num; + + if(type == SHADER) { + max_size_num = (shader_w + max_size - 1)/max_size; + } + else { + max_size = max(1, max_size/w); + max_size_num = (h + max_size - 1)/max_size; + } + + num = max(max_size_num, num); + } if(type == SHADER) { - num = (shader_w + max_size - 1)/max_size; + num = min(shader_w, num); + } + else if(type == PATH_TRACE) { } else { - max_size = max(1, max_size/w); - num = (h + max_size - 1)/max_size; + num = min(h, num); } - split(tasks, num); + return num; } -void DeviceTask::split(list<DeviceTask>& tasks, int num) +void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size) { - if(type == SHADER) { - num = min(shader_w, num); + num = get_subtask_count(num, max_size); + if(type == SHADER) { for(int i = 0; i < num; i++) { int tx = shader_x + (shader_w/num)*i; int tw = (i == num-1)? shader_w - i*(shader_w/num): shader_w/num; @@ -72,8 +85,6 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num) tasks.push_back(*this); } else { - num = min(h, num); - for(int i = 0; i < num; i++) { int ty = y + (h/num)*i; int th = (i == num-1)? 
h - i*(h/num): h/num; @@ -88,9 +99,10 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num) } } -void DeviceTask::update_progress(RenderTile &rtile) +void DeviceTask::update_progress(RenderTile *rtile) { - if (type != PATH_TRACE) + if((type != PATH_TRACE) && + (type != SHADER)) return; if(update_progress_sample) @@ -100,7 +112,7 @@ void DeviceTask::update_progress(RenderTile &rtile) double current_time = time_dt(); if (current_time - last_update_time >= 1.0) { - update_tile_sample(rtile); + update_tile_sample(*rtile); last_update_time = current_time; } diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h index c1bd39b70ca..50216adefe2 100644 --- a/intern/cycles/device/device_task.h +++ b/intern/cycles/device/device_task.h @@ -52,10 +52,10 @@ public: DeviceTask(Type type = PATH_TRACE); - void split(list<DeviceTask>& tasks, int num); - void split_max_size(list<DeviceTask>& tasks, int max_size); + int get_subtask_count(int num, int max_size = 0); + void split(list<DeviceTask>& tasks, int num, int max_size = 0); - void update_progress(RenderTile &rtile); + void update_progress(RenderTile *rtile); boost::function<bool(Device *device, RenderTile&)> acquire_tile; boost::function<void(void)> update_progress_sample; diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index d18f4fa2998..c521e1383a4 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -19,12 +19,13 @@ set(SRC set(SRC_HEADERS kernel.h kernel_accumulate.h + kernel_bake.h kernel_camera.h kernel_compat_cpu.h kernel_compat_cuda.h kernel_compat_opencl.h + kernel_debug.h kernel_differential.h - kernel_displace.h kernel_emission.h kernel_film.h kernel_globals.h @@ -35,6 +36,8 @@ set(SRC_HEADERS kernel_passes.h kernel_path.h kernel_path_state.h + kernel_path_surface.h + kernel_path_volume.h kernel_projection.h kernel_random.h kernel_shader.h @@ -58,8 +61,7 @@ set(SRC_CLOSURE_HEADERS closure/bsdf_toon.h 
closure/bsdf_transparent.h closure/bsdf_util.h - closure/bsdf_ward.h - closure/bsdf_westin.h + closure/bsdf_ashikhmin_shirley.h closure/bsdf_hair.h closure/bssrdf.h closure/emissive.h @@ -95,8 +97,8 @@ set(SRC_SVM_HEADERS svm/svm_noisetex.h svm/svm_normal.h svm/svm_ramp.h - svm/svm_sepcomb_rgb.h svm/svm_sepcomb_hsv.h + svm/svm_sepcomb_vector.h svm/svm_sky.h svm/svm_tex_coord.h svm/svm_texture.h @@ -111,8 +113,10 @@ set(SRC_GEOM_HEADERS geom/geom.h geom/geom_attribute.h geom/geom_bvh.h + geom/geom_bvh_shadow.h geom/geom_bvh_subsurface.h geom/geom_bvh_traversal.h + geom/geom_bvh_volume.h geom/geom_curve.h geom/geom_motion_curve.h geom/geom_motion_triangle.h @@ -146,50 +150,69 @@ if(WITH_CYCLES_CUDA_BINARIES) set(CUDA_VERSION "${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}") # warn for other versions - if(CUDA_VERSION MATCHES "60") + if(CUDA_VERSION MATCHES "65") else() message(WARNING "CUDA version ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} detected, " - "build may succeed but only CUDA 6.0 is officially supported") + "build may succeed but only CUDA 6.5 is officially supported") endif() # build for each arch set(cuda_sources kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS}) set(cuda_cubins) - foreach(arch ${CYCLES_CUDA_BINARIES_ARCH}) - set(cuda_cubin kernel_${arch}.cubin) + macro(CYCLES_CUDA_KERNEL_ADD arch experimental) + if(${experimental}) + set(cuda_extra_flags "-D__KERNEL_CUDA_EXPERIMENTAL__") + set(cuda_cubin kernel_experimental_${arch}.cubin) + else() + set(cuda_extra_flags "") + set(cuda_cubin kernel_${arch}.cubin) + endif() + + if(WITH_CYCLES_DEBUG) + set(cuda_debug_flags "-D__KERNEL_DEBUG__") + else() + set(cuda_debug_flags "") + endif() set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${CUDA_VERSION}") set(cuda_math_flags "--use_fast_math") - if(CUDA_VERSION LESS 60 AND ${arch} MATCHES "sm_50") - message(WARNING "Can't build kernel for CUDA sm_50 architecture, skipping") - elseif(CUDA_VERSION LESS 
50 AND ${arch} MATCHES "sm_35") - message(WARNING "Can't build kernel for CUDA sm_35 architecture, skipping") - else() - add_custom_command( - OUTPUT ${cuda_cubin} - COMMAND ${CUDA_NVCC_EXECUTABLE} - -arch=${arch} - -m${CUDA_BITS} - --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu - -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} - --ptxas-options="-v" - ${cuda_arch_flags} - ${cuda_version_flags} - ${cuda_math_flags} - -I${CMAKE_CURRENT_SOURCE_DIR}/../util - -I${CMAKE_CURRENT_SOURCE_DIR}/svm - -DCCL_NAMESPACE_BEGIN= - -DCCL_NAMESPACE_END= - -DNVCC - - DEPENDS ${cuda_sources}) - - delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib) - list(APPEND cuda_cubins ${cuda_cubin}) - endif() + add_custom_command( + OUTPUT ${cuda_cubin} + COMMAND ${CUDA_NVCC_EXECUTABLE} + -arch=${arch} + -m${CUDA_BITS} + --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu + -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} + --ptxas-options="-v" + ${cuda_arch_flags} + ${cuda_version_flags} + ${cuda_math_flags} + ${cuda_extra_flags} + ${cuda_debug_flags} + -I${CMAKE_CURRENT_SOURCE_DIR}/../util + -I${CMAKE_CURRENT_SOURCE_DIR}/svm + -DCCL_NAMESPACE_BEGIN= + -DCCL_NAMESPACE_END= + -DNVCC + + DEPENDS ${cuda_sources}) + + delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib) + list(APPEND cuda_cubins ${cuda_cubin}) + + unset(cuda_extra_flags) + unset(cuda_debug_flags) + endmacro() + + foreach(arch ${CYCLES_CUDA_BINARIES_ARCH}) + # Compile regular kernel + CYCLES_CUDA_KERNEL_ADD(${arch} FALSE) + + # Compile experimental kernel + CYCLES_CUDA_KERNEL_ADD(${arch} TRUE) endforeach() add_custom_target(cycles_kernel_cuda ALL DEPENDS ${cuda_cubins}) @@ -213,12 +236,14 @@ if(CXX_HAS_SSE) kernel_sse3.cpp kernel_sse41.cpp kernel_avx.cpp + kernel_avx2.cpp ) set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") 
set_source_files_properties(kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") set_source_files_properties(kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") + set_source_files_properties(kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") endif() diff --git a/intern/cycles/kernel/SConscript b/intern/cycles/kernel/SConscript index 04e1bad7538..c0d969e24ae 100644 --- a/intern/cycles/kernel/SConscript +++ b/intern/cycles/kernel/SConscript @@ -30,6 +30,7 @@ import subprocess import sys import os import Blender as B +import btools def normpath(path): return os.path.abspath(os.path.normpath(path)) @@ -64,49 +65,56 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']: closure_dir = os.path.join(source_dir, "../closure") # get CUDA version - nvcc_pipe = subprocess.Popen([nvcc, "--version"],stdout=subprocess.PIPE,stderr=subprocess.PIPE) - output, erroroutput = nvcc_pipe.communicate() + output = btools.get_command_output([nvcc, "--version"]) cuda_major_minor = re.findall(r'release (\d+).(\d+)', output)[0] cuda_version = int(cuda_major_minor[0])*10 + int(cuda_major_minor[1]) - if cuda_version != 60: - print("CUDA version %d.%d detected, build may succeed but only CUDA 6.0 is officially supported." % (cuda_version/10, cuda_version%10)) + if cuda_version != 65: + print("CUDA version %d.%d detected, build may succeed but only CUDA 6.5 is officially supported." 
% (cuda_version/10, cuda_version%10)) # nvcc flags nvcc_flags = "-m%s" % (bits) - nvcc_flags += " --cubin --ptxas-options=\"-v\"" + nvcc_flags += " --cubin --ptxas-options=\"-v\" --use_fast_math" nvcc_flags += " -D__KERNEL_CUDA_VERSION__=%d" % (cuda_version) nvcc_flags += " -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC" nvcc_flags += " -I \"%s\" -I \"%s\" -I \"%s\" -I \"%s\"" % (util_dir, svm_dir, geom_dir, closure_dir) + if env['WITH_BF_CYCLES_DEBUG']: + nvcc_flags += " -D__KERNEL_DEBUG__" + # dependencies dependencies = ['kernel.cu'] + kernel.Glob('*.h') + kernel.Glob('../util/*.h') + kernel.Glob('svm/*.h') + kernel.Glob('geom/*.h') + kernel.Glob('closure/*.h') last_cubin_file = None + configs = (("kernel_%s.cubin", ''), + ("kernel_experimental_%s.cubin", ' -D__KERNEL_CUDA_EXPERIMENTAL__')) + # add command for each cuda architecture for arch in cuda_archs: - if cuda_version < 60 and arch == "sm_50": - print("Can't build kernel for CUDA sm_50 architecture, skipping") - continue - - cubin_file = os.path.join(build_dir, "kernel_%s.cubin" % arch) - - if env['BF_CYCLES_CUDA_ENV']: - MS_SDK = "C:\\Program Files\\Microsoft SDKs\\Windows\\v7.1\\Bin\\SetEnv.cmd" - command = "\"%s\" & \"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (MS_SDK, nvcc, arch, nvcc_flags, kernel_file, cubin_file) - else: - command = "\"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (nvcc, arch, nvcc_flags, kernel_file, cubin_file) - - kernel.Command(cubin_file, 'kernel.cu', command) - kernel.Depends(cubin_file, dependencies) - - kernel_binaries.append(cubin_file) - - if not env['WITH_BF_CYCLES_CUDA_THREADED_COMPILE']: - # trick to compile one kernel at a time to reduce memory usage - if last_cubin_file: - kernel.Depends(cubin_file, last_cubin_file) - last_cubin_file = cubin_file + for config in configs: + # TODO(sergey): Use dict instead ocouple in order to increase readability? 
+ name = config[0] + extra_flags = config[1] + + cubin_file = os.path.join(build_dir, name % arch) + current_flags = nvcc_flags + extra_flags + + if env['BF_CYCLES_CUDA_ENV']: + MS_SDK = "C:\\Program Files\\Microsoft SDKs\\Windows\\v7.1\\Bin\\SetEnv.cmd" + command = "\"%s\" & \"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (MS_SDK, nvcc, arch, current_flags, kernel_file, cubin_file) + else: + command = "\"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (nvcc, arch, current_flags, kernel_file, cubin_file) + + kernel.Command(cubin_file, 'kernel.cu', command) + kernel.Depends(cubin_file, dependencies) + + kernel_binaries.append(cubin_file) + + if not env['WITH_BF_CYCLES_CUDA_THREADED_COMPILE']: + # trick to compile one kernel at a time to reduce memory usage + if last_cubin_file: + kernel.Depends(cubin_file, last_cubin_file) + last_cubin_file = cubin_file Return('kernel_binaries') diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h index 24b54cd9d9e..7d4783b0f3c 100644 --- a/intern/cycles/kernel/closure/bsdf.h +++ b/intern/cycles/kernel/closure/bsdf.h @@ -23,10 +23,7 @@ #include "../closure/bsdf_reflection.h" #include "../closure/bsdf_refraction.h" #include "../closure/bsdf_transparent.h" -#ifdef __ANISOTROPIC__ -#include "../closure/bsdf_ward.h" -#endif -#include "../closure/bsdf_westin.h" +#include "../closure/bsdf_ashikhmin_shirley.h" #include "../closure/bsdf_toon.h" #include "../closure/bsdf_hair.h" #ifdef __SUBSURFACE__ @@ -83,21 +80,22 @@ ccl_device int bsdf_sample(KernelGlobals *kg, const ShaderData *sd, const Shader eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: + case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: - label = bsdf_microfacet_ggx_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_microfacet_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); 
break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: + case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: - label = bsdf_microfacet_beckmann_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_microfacet_beckmann_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; -#ifdef __ANISOTROPIC__ - case CLOSURE_BSDF_WARD_ID: - label = bsdf_ward_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: + case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: + label = bsdf_ashikhmin_shirley_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; -#endif case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: label = bsdf_ashikhmin_velvet_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); @@ -110,14 +108,6 @@ ccl_device int bsdf_sample(KernelGlobals *kg, const ShaderData *sd, const Shader label = bsdf_glossy_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; - case CLOSURE_BSDF_WESTIN_BACKSCATTER_ID: - label = bsdf_westin_backscatter_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, - eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); - break; - case CLOSURE_BSDF_WESTIN_SHEEN_ID: - label = bsdf_westin_sheen_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, - eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); - break; case CLOSURE_BSDF_HAIR_REFLECTION_ID: label = bsdf_hair_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); @@ -178,18 +168,19 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade eval = bsdf_transparent_eval_reflect(sc, sd->I, omega_in, pdf); break; case 
CLOSURE_BSDF_MICROFACET_GGX_ID: + case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: eval = bsdf_microfacet_ggx_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: + case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: eval = bsdf_microfacet_beckmann_eval_reflect(sc, sd->I, omega_in, pdf); break; -#ifdef __ANISOTROPIC__ - case CLOSURE_BSDF_WARD_ID: - eval = bsdf_ward_eval_reflect(sc, sd->I, omega_in, pdf); + case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: + case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: + eval = bsdf_ashikhmin_shirley_eval_reflect(sc, sd->I, omega_in, pdf); break; -#endif case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: eval = bsdf_ashikhmin_velvet_eval_reflect(sc, sd->I, omega_in, pdf); break; @@ -199,12 +190,6 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade case CLOSURE_BSDF_GLOSSY_TOON_ID: eval = bsdf_glossy_toon_eval_reflect(sc, sd->I, omega_in, pdf); break; - case CLOSURE_BSDF_WESTIN_BACKSCATTER_ID: - eval = bsdf_westin_backscatter_eval_reflect(sc, sd->I, omega_in, pdf); - break; - case CLOSURE_BSDF_WESTIN_SHEEN_ID: - eval = bsdf_westin_sheen_eval_reflect(sc, sd->I, omega_in, pdf); - break; case CLOSURE_BSDF_HAIR_REFLECTION_ID: eval = bsdf_hair_reflection_eval_reflect(sc, sd->I, omega_in, pdf); break; @@ -245,18 +230,19 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade eval = bsdf_transparent_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: + case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: eval = bsdf_microfacet_ggx_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: + case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: eval = bsdf_microfacet_beckmann_eval_transmit(sc, sd->I, omega_in, pdf); break; -#ifdef 
__ANISOTROPIC__ - case CLOSURE_BSDF_WARD_ID: - eval = bsdf_ward_eval_transmit(sc, sd->I, omega_in, pdf); + case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: + case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: + eval = bsdf_ashikhmin_shirley_eval_transmit(sc, sd->I, omega_in, pdf); break; -#endif case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: eval = bsdf_ashikhmin_velvet_eval_transmit(sc, sd->I, omega_in, pdf); break; @@ -266,12 +252,6 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade case CLOSURE_BSDF_GLOSSY_TOON_ID: eval = bsdf_glossy_toon_eval_transmit(sc, sd->I, omega_in, pdf); break; - case CLOSURE_BSDF_WESTIN_BACKSCATTER_ID: - eval = bsdf_westin_backscatter_eval_transmit(sc, sd->I, omega_in, pdf); - break; - case CLOSURE_BSDF_WESTIN_SHEEN_ID: - eval = bsdf_westin_sheen_eval_transmit(sc, sd->I, omega_in, pdf); - break; case CLOSURE_BSDF_HAIR_REFLECTION_ID: eval = bsdf_hair_reflection_eval_transmit(sc, sd->I, omega_in, pdf); break; @@ -330,18 +310,19 @@ ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness) bsdf_transparent_blur(sc, roughness); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: + case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: bsdf_microfacet_ggx_blur(sc, roughness); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: + case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: bsdf_microfacet_beckmann_blur(sc, roughness); break; -#ifdef __ANISOTROPIC__ - case CLOSURE_BSDF_WARD_ID: - bsdf_ward_blur(sc, roughness); + case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: + case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: + bsdf_ashikhmin_shirley_blur(sc, roughness); break; -#endif case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: bsdf_ashikhmin_velvet_blur(sc, roughness); break; @@ -351,12 +332,6 @@ ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness) case CLOSURE_BSDF_GLOSSY_TOON_ID: bsdf_glossy_toon_blur(sc, roughness); break; - case 
CLOSURE_BSDF_WESTIN_BACKSCATTER_ID: - bsdf_westin_backscatter_blur(sc, roughness); - break; - case CLOSURE_BSDF_WESTIN_SHEEN_ID: - bsdf_westin_sheen_blur(sc, roughness); - break; case CLOSURE_BSDF_HAIR_REFLECTION_ID: case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: bsdf_hair_reflection_blur(sc, roughness); diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h new file mode 100644 index 00000000000..ad7864cb8ea --- /dev/null +++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h @@ -0,0 +1,210 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License + */ + +#ifndef __BSDF_ASHIKHMIN_SHIRLEY_H__ +#define __BSDF_ASHIKHMIN_SHIRLEY_H__ + +/* +ASHIKHMIN SHIRLEY BSDF + +Implementation of +Michael Ashikhmin and Peter Shirley: "An Anisotropic Phong BRDF Model" (2000) + +The Fresnel factor is missing to get a separable bsdf (intensity*color), as is +the case with all other microfacet-based BSDF implementations in Cycles. + +Other than that, the implementation directly follows the paper. +*/ + +CCL_NAMESPACE_BEGIN + +ccl_device int bsdf_ashikhmin_shirley_setup(ShaderClosure *sc) +{ + /* store roughness. could already convert to exponent to save some cycles + * in eval, but this is more consistent with other bsdfs and shader_blur. 
*/ + sc->data0 = clamp(sc->data0, 1e-4f, 1.0f); + sc->data1 = sc->data0; + + sc->type = CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID; + return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_GLOSSY; +} + +ccl_device int bsdf_ashikhmin_shirley_aniso_setup(ShaderClosure *sc) +{ + /* store roughness. could already convert to exponent to save some cycles + * in eval, but this is more consistent with other bsdfs and shader_blur. */ + sc->data0 = clamp(sc->data0, 1e-4f, 1.0f); + sc->data1 = clamp(sc->data1, 1e-4f, 1.0f); + + sc->type = CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID; + return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_GLOSSY; +} + +ccl_device void bsdf_ashikhmin_shirley_blur(ShaderClosure *sc, float roughness) +{ + sc->data0 = fmaxf(roughness, sc->data0); /* clamp roughness */ + sc->data1 = fmaxf(roughness, sc->data1); +} + +ccl_device_inline float bsdf_ashikhmin_shirley_roughness_to_exponent(float roughness) +{ + return 2.0f / (roughness*roughness) - 2.0f; +} + +ccl_device float3 bsdf_ashikhmin_shirley_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) +{ + float3 N = sc->N; + + float NdotI = dot(N, I); /* in Cycles/OSL convention I is omega_out */ + float NdotO = dot(N, omega_in); /* and consequently we use for O omaga_in ;) */ + + float out = 0.0f; + + if (NdotI > 0.0f && NdotO > 0.0f) { + NdotI = fmaxf(NdotI, 1e-6f); + NdotO = fmaxf(NdotO, 1e-6f); + float3 H = normalize(omega_in + I); + float HdotI = fmaxf(fabsf(dot(H, I)), 1e-6f); + float HdotN = fmaxf(dot(H, N), 1e-6f); + + float pump = 1.0f / fmaxf(1e-6f, (HdotI*fmaxf(NdotO, NdotI))); /* pump from original paper (first derivative disc., but cancels the HdotI in the pdf nicely) */ + /*float pump = 1.0f / fmaxf(1e-4f, ((NdotO + NdotI) * (NdotO*NdotI))); */ /* pump from d-brdf paper */ + + float n_x = bsdf_ashikhmin_shirley_roughness_to_exponent(sc->data0); + float n_y = bsdf_ashikhmin_shirley_roughness_to_exponent(sc->data1); + + if (n_x == n_y) { /* => isotropic case */ + float e = n_x; + float 
lobe = powf(HdotN, e); + float norm = (n_x + 1.0f) / (8.0f * M_PI_F); + + out = NdotO * norm * lobe * pump; + *pdf = norm * lobe / HdotI; /* this is p_h / 4(H.I) (conversion from 'wh measure' to 'wi measure', eq. 8 in paper) */ + } + else { /* => ANisotropic case */ + float3 X, Y; + make_orthonormals_tangent(N, sc->T, &X, &Y); + + float HdotX = dot(H, X); + float HdotY = dot(H, Y); + float e = (n_x * HdotX*HdotX + n_y * HdotY*HdotY) / (1.0f - HdotN*HdotN); + float lobe = powf(HdotN, e); + float norm = sqrtf((n_x + 1.0f)*(n_y + 1.0f)) / (8.0f * M_PI_F); + + out = NdotO * norm * lobe * pump; + *pdf = norm * lobe / HdotI; + } + } + + return make_float3(out, out, out); +} + +ccl_device float3 bsdf_ashikhmin_shirley_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) +{ + return make_float3(0.0f, 0.0f, 0.0f); +} + +ccl_device_inline void bsdf_ashikhmin_shirley_sample_first_quadrant(float n_x, float n_y, float randu, float randv, float *phi, float *cos_theta) +{ + *phi = atanf(sqrtf((n_x + 1.0f) / (n_y + 1.0f)) * tanf(M_PI_2_F * randu)); + float cos_phi = cosf(*phi); + float sin_phi = sinf(*phi); + *cos_theta = powf(randv, 1.0f / (n_x * cos_phi*cos_phi + n_y * sin_phi*sin_phi + 1.0f)); +} + +ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf) +{ + float3 N = sc->N; + + float NdotI = dot(N, I); + if (NdotI > 0.0f) { + + float n_x = bsdf_ashikhmin_shirley_roughness_to_exponent(sc->data0); + float n_y = bsdf_ashikhmin_shirley_roughness_to_exponent(sc->data1); + + /* get x,y basis on the surface for anisotropy */ + float3 X, Y; + + if(n_x == n_y) + make_orthonormals(N, &X, &Y); + else + make_orthonormals_tangent(N, sc->T, &X, &Y); + + /* sample spherical coords for h in tangent space */ + float phi; + float cos_theta; + if (n_x == n_y) { /* => simple isotropic 
sampling */ + phi = M_2PI_F * randu; + cos_theta = powf(randv, 1.0f / (n_x + 1.0f)); + } + else { /* => more complex anisotropic sampling */ + if (randu < 0.25f) { /* first quadrant */ + float remapped_randu = 4.0f * randu; + bsdf_ashikhmin_shirley_sample_first_quadrant(n_x, n_y, remapped_randu, randv, &phi, &cos_theta); + } + else if (randu < 0.5f) { /* second quadrant */ + float remapped_randu = 4.0f * (.5f - randu); + bsdf_ashikhmin_shirley_sample_first_quadrant(n_x, n_y, remapped_randu, randv, &phi, &cos_theta); + phi = M_PI_F - phi; + } + else if (randu < 0.75f) { /* third quadrant */ + float remapped_randu = 4.0f * (randu - 0.5f); + bsdf_ashikhmin_shirley_sample_first_quadrant(n_x, n_y, remapped_randu, randv, &phi, &cos_theta); + phi = M_PI_F + phi; + } + else { /* fourth quadrant */ + float remapped_randu = 4.0f * (1.0f - randu); + bsdf_ashikhmin_shirley_sample_first_quadrant(n_x, n_y, remapped_randu, randv, &phi, &cos_theta); + phi = 2.0f * M_PI_F - phi; + } + } + + /* get half vector in tangent space */ + float sin_theta = sqrtf(fmaxf(0.0f, 1.0f - cos_theta*cos_theta)); + float cos_phi = cosf(phi); + float sin_phi = sinf(phi); /* no sqrt(1-cos^2) here b/c it causes artifacts */ + float3 h = make_float3( + sin_theta * cos_phi, + sin_theta * sin_phi, + cos_theta + ); + + /* half vector to world space */ + float3 H = h.x*X + h.y*Y + h.z*N; + float HdotI = dot(H, I); + if (HdotI < 0.0f) H = -H; + + /* reflect I on H to get omega_in */ + *omega_in = -I + (2.0f * HdotI) * H; + + /* leave the rest to eval_reflect */ + /* (could maybe optimize a few things by manual inlining, but I doubt it would make much difference) */ + *eval = bsdf_ashikhmin_shirley_eval_reflect(sc, I, *omega_in, pdf); + +#ifdef __RAY_DIFFERENTIALS__ + /* just do the reflection thing for now */ + *domega_in_dx = (2.0f * dot(N, dIdx)) * N - dIdx; + *domega_in_dy = (2.0f * dot(N, dIdy)) * N - dIdy; +#endif + } + + return LABEL_REFLECT | LABEL_GLOSSY; +} + + +CCL_NAMESPACE_END + +#endif /* 
__BSDF_ASHIKHMIN_SHIRLEY_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_hair.h b/intern/cycles/kernel/closure/bsdf_hair.h index 19cdb773255..e0b5454592b 100644 --- a/intern/cycles/kernel/closure/bsdf_hair.h +++ b/intern/cycles/kernel/closure/bsdf_hair.h @@ -63,7 +63,7 @@ ccl_device int bsdf_hair_transmission_setup(ShaderClosure *sc) ccl_device float3 bsdf_hair_reflection_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) { #ifdef __HAIR__ - float offset = sc->offset; + float offset = sc->data2; float3 Tg = sc->T; #else float offset = 0.0f; @@ -120,7 +120,7 @@ ccl_device float3 bsdf_hair_reflection_eval_transmit(const ShaderClosure *sc, co ccl_device float3 bsdf_hair_transmission_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) { #ifdef __HAIR__ - float offset = sc->offset; + float offset = sc->data2; float3 Tg = sc->T; #else float offset = 0.0f; @@ -166,7 +166,7 @@ ccl_device float3 bsdf_hair_transmission_eval_transmit(const ShaderClosure *sc, ccl_device int bsdf_hair_reflection_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf) { #ifdef __HAIR__ - float offset = sc->offset; + float offset = sc->data2; float3 Tg = sc->T; #else float offset = 0.0f; @@ -221,7 +221,7 @@ ccl_device int bsdf_hair_reflection_sample(const ShaderClosure *sc, float3 Ng, f ccl_device int bsdf_hair_transmission_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf) { #ifdef __HAIR__ - float offset = sc->offset; + float offset = sc->data2; float3 Tg = sc->T; #else float offset = 0.0f; diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h index 1ec35e444fe..8737b0e2d94 100644 --- 
a/intern/cycles/kernel/closure/bsdf_microfacet.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet.h @@ -35,20 +35,293 @@ CCL_NAMESPACE_BEGIN -/* GGX */ +/* Approximate erf and erfinv implementations. + * Implementation comes straight from Wikipedia: + * + * http://en.wikipedia.org/wiki/Error_function + * + * Some constants are baked into the code. + */ + +ccl_device_inline float approx_erff_do(float x) +{ + /* Such a clamp doesn't give much distortion to the output value + * and gives quite a few of the speedup. + */ + if(x > 3.0f) { + return 1.0f; + } + float t = 1.0f / (1.0f + 0.47047f*x); + return (1.0f - + t*(0.3480242f + t*(-0.0958798f + t*0.7478556f)) * expf(-x*x)); +} + +ccl_device_inline float approx_erff(float x) +{ + if(x >= 0.0f) { + return approx_erff_do(x); + } + else { + return -approx_erff_do(-x); + } +} + +ccl_device_inline float approx_erfinvf_do(float x) +{ + if(x <= 0.7f) { + const float x2 = x * x; + const float a1 = 0.886226899f; + const float a2 = -1.645349621f; + const float a3 = 0.914624893f; + const float a4 = -0.140543331f; + const float b1 = -2.118377725f; + const float b2 = 1.442710462f; + const float b3 = -0.329097515f; + const float b4 = 0.012229801f; + return x * (((a4 * x2 + a3) * x2 + a2) * x2 + a1) / + ((((b4 * x2 + b3) * x2 + b2) * x2 + b1) * x2 + 1.0f); + } + else { + const float c1 = -1.970840454f; + const float c2 = -1.624906493f; + const float c3 = 3.429567803f; + const float c4 = 1.641345311f; + const float d1 = 3.543889200f; + const float d2 = 1.637067800f; + const float z = sqrtf(-logf((1.0f - x) * 0.5f)); + return (((c4 * z + c3) * z + c2) * z + c1) / + ((d2 * z + d1) * z + 1.0f); + } +} + +ccl_device_inline float approx_erfinvf(float x) +{ + if(x >= 0.0f) { + return approx_erfinvf_do(x); + } + else { + return -approx_erfinvf_do(-x); + } +} + +/* Beckmann and GGX microfacet importance sampling from: + * + * Importance Sampling Microfacet-Based BSDFs using the Distribution of Visible Normals. + * E. Heitz and E. 
d'Eon, EGSR 2014 */ + +ccl_device_inline void microfacet_beckmann_sample_slopes( + KernelGlobals *kg, + const float cos_theta_i, const float sin_theta_i, + float randu, float randv, float *slope_x, float *slope_y, + float *G1i) +{ + /* special case (normal incidence) */ + if(cos_theta_i >= 0.99999f) { + const float r = sqrtf(-logf(randu)); + const float phi = M_2PI_F * randv; + *slope_x = r * cosf(phi); + *slope_y = r * sinf(phi); + *G1i = 1.0f; + return; + } + + /* precomputations */ + const float tan_theta_i = sin_theta_i/cos_theta_i; + const float inv_a = tan_theta_i; + const float a = 1.0f/inv_a; + const float erf_a = approx_erff(a); + const float exp_a2 = expf(-a*a); + const float SQRT_PI_INV = 0.56418958354f; + const float Lambda = 0.5f*(erf_a - 1.0f) + (0.5f*SQRT_PI_INV)*(exp_a2*inv_a); + const float G1 = 1.0f/(1.0f + Lambda); /* masking */ + + *G1i = G1; + +#if 0 + const float C = 1.0f - G1 * erf_a; + + /* sample slope X */ + if(randu < C) { + /* rescale randu */ + randu = randu / C; + const float w_1 = 0.5f * SQRT_PI_INV * sin_theta_i * exp_a2; + const float w_2 = cos_theta_i * (0.5f - 0.5f*erf_a); + const float p = w_1 / (w_1 + w_2); + + if(randu < p) { + randu = randu / p; + *slope_x = -sqrtf(-logf(randu*exp_a2)); + } + else { + randu = (randu - p) / (1.0f - p); + *slope_x = approx_erfinvf(randu - 1.0f - randu*erf_a); + } + } + else { + /* rescale randu */ + randu = (randu - C) / (1.0f - C); + *slope_x = approx_erfinvf((-1.0f + 2.0f*randu)*erf_a); + + const float p = (-(*slope_x)*sin_theta_i + cos_theta_i) / (2.0f*cos_theta_i); + + if(randv > p) { + *slope_x = -(*slope_x); + randv = (randv - p) / (1.0f - p); + } + else + randv = randv / p; + } + + /* sample slope Y */ + *slope_y = approx_erfinvf(2.0f*randv - 1.0f); +#else + /* use precomputed table, because it better preserves stratification + * of the random number pattern */ + int beckmann_table_offset = kernel_data.tables.beckmann_offset; + + *slope_x = lookup_table_read_2D(kg, randu, cos_theta_i, + 
beckmann_table_offset, BECKMANN_TABLE_SIZE, BECKMANN_TABLE_SIZE); + *slope_y = approx_erfinvf(2.0f*randv - 1.0f); +#endif + +} + +ccl_device_inline void microfacet_ggx_sample_slopes( + const float cos_theta_i, const float sin_theta_i, + float randu, float randv, float *slope_x, float *slope_y, + float *G1i) +{ + /* special case (normal incidence) */ + if(cos_theta_i >= 0.99999f) { + const float r = sqrtf(randu/(1.0f - randu)); + const float phi = M_2PI_F * randv; + *slope_x = r * cosf(phi); + *slope_y = r * sinf(phi); + *G1i = 1.0f; + + return; + } + + /* precomputations */ + const float tan_theta_i = sin_theta_i/cos_theta_i; + const float G1_inv = 0.5f * (1.0f + safe_sqrtf(1.0f + tan_theta_i*tan_theta_i)); + + *G1i = 1.0f/G1_inv; + + /* sample slope_x */ + const float A = 2.0f*randu*G1_inv - 1.0f; + const float AA = A*A; + const float tmp = 1.0f/(AA - 1.0f); + const float B = tan_theta_i; + const float BB = B*B; + const float D = safe_sqrtf(BB*(tmp*tmp) - (AA - BB)*tmp); + const float slope_x_1 = B*tmp - D; + const float slope_x_2 = B*tmp + D; + *slope_x = (A < 0.0f || slope_x_2*tan_theta_i > 1.0f)? slope_x_1: slope_x_2; + + /* sample slope_y */ + float S; + + if(randv > 0.5f) { + S = 1.0f; + randv = 2.0f*(randv - 0.5f); + } + else { + S = -1.0f; + randv = 2.0f*(0.5f - randv); + } + + const float z = (randv*(randv*(randv*0.27385f - 0.73369f) + 0.46341f)) / (randv*(randv*(randv*0.093073f + 0.309420f) - 1.000000f) + 0.597999f); + *slope_y = S * z * safe_sqrtf(1.0f + (*slope_x)*(*slope_x)); +} + +ccl_device_inline float3 microfacet_sample_stretched( + KernelGlobals *kg, const float3 omega_i, + const float alpha_x, const float alpha_y, + const float randu, const float randv, + bool beckmann, float *G1i) +{ + /* 1. 
stretch omega_i */ + float3 omega_i_ = make_float3(alpha_x * omega_i.x, alpha_y * omega_i.y, omega_i.z); + omega_i_ = normalize(omega_i_); + + /* get polar coordinates of omega_i_ */ + float costheta_ = 1.0f; + float sintheta_ = 0.0f; + float cosphi_ = 1.0f; + float sinphi_ = 0.0f; + + if(omega_i_.z < 0.99999f) { + costheta_ = omega_i_.z; + sintheta_ = safe_sqrtf(1.0f - costheta_*costheta_); + + float invlen = 1.0f/sintheta_; + cosphi_ = omega_i_.x * invlen; + sinphi_ = omega_i_.y * invlen; + } + + /* 2. sample P22_{omega_i}(x_slope, y_slope, 1, 1) */ + float slope_x, slope_y; + + if(beckmann) { + microfacet_beckmann_sample_slopes(kg, costheta_, sintheta_, + randu, randv, &slope_x, &slope_y, G1i); + } + else { + microfacet_ggx_sample_slopes(costheta_, sintheta_, + randu, randv, &slope_x, &slope_y, G1i); + } + + /* 3. rotate */ + float tmp = cosphi_*slope_x - sinphi_*slope_y; + slope_y = sinphi_*slope_x + cosphi_*slope_y; + slope_x = tmp; + + /* 4. unstretch */ + slope_x = alpha_x * slope_x; + slope_y = alpha_y * slope_y; + + /* 5. compute normal */ + return normalize(make_float3(-slope_x, -slope_y, 1.0f)); +} + +/* GGX microfacet with Smith shadow-masking from: + * + * Microfacet Models for Refraction through Rough Surfaces + * B. Walter, S. R. Marschner, H. Li, K. E. Torrance, EGSR 2007 + * + * Anisotropic from: + * + * Understanding the Masking-Shadowing Function in Microfacet-Based BRDFs + * E. Heitz, Research Report 2014 + * + * Anisotropy is only supported for reflection currently, but adding it for + * transmission is just a matter of copying code from reflection if needed. 
*/ ccl_device int bsdf_microfacet_ggx_setup(ShaderClosure *sc) { - sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* m_ag */ + sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */ + sc->data1 = sc->data0; /* alpha_y */ sc->type = CLOSURE_BSDF_MICROFACET_GGX_ID; return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY; } +ccl_device int bsdf_microfacet_ggx_aniso_setup(ShaderClosure *sc) +{ + sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */ + sc->data1 = clamp(sc->data1, 0.0f, 1.0f); /* alpha_y */ + + sc->type = CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID; + + return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY; +} + ccl_device int bsdf_microfacet_ggx_refraction_setup(ShaderClosure *sc) { - sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* m_ag */ + sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */ + sc->data1 = sc->data0; /* alpha_y */ sc->type = CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID; @@ -57,136 +330,250 @@ ccl_device int bsdf_microfacet_ggx_refraction_setup(ShaderClosure *sc) ccl_device void bsdf_microfacet_ggx_blur(ShaderClosure *sc, float roughness) { - sc->data0 = fmaxf(roughness, sc->data0); /* m_ag */ + sc->data0 = fmaxf(roughness, sc->data0); /* alpha_x */ + sc->data1 = fmaxf(roughness, sc->data1); /* alpha_y */ } ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) { - float m_ag = max(sc->data0, 1e-4f); + float alpha_x = sc->data0; + float alpha_y = sc->data1; int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID; float3 N = sc->N; - if(m_refractive || m_ag <= 1e-4f) - return make_float3 (0, 0, 0); + if(m_refractive || fmaxf(alpha_x, alpha_y) <= 1e-4f) + return make_float3(0, 0, 0); + float cosNO = dot(N, I); float cosNI = dot(N, omega_in); + if(cosNI > 0 && cosNO > 0) { - // get half vector - float3 Hr = normalize(omega_in + I); - // eq. 20: (F*G*D)/(4*in*on) - // eq. 
33: first we calculate D(m) with m=Hr: - float alpha2 = m_ag * m_ag; - float cosThetaM = dot(N, Hr); - float cosThetaM2 = cosThetaM * cosThetaM; - float tanThetaM2 = (1 - cosThetaM2) / cosThetaM2; - float cosThetaM4 = cosThetaM2 * cosThetaM2; - float D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); - // eq. 34: now calculate G1(i,m) and G1(o,m) - float G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO))); - float G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI))); + /* get half vector */ + float3 m = normalize(omega_in + I); + float alpha2 = alpha_x * alpha_y; + float D, G1o, G1i; + + if(alpha_x == alpha_y) { + /* isotropic + * eq. 20: (F*G*D)/(4*in*on) + * eq. 33: first we calculate D(m) */ + float cosThetaM = dot(N, m); + float cosThetaM2 = cosThetaM * cosThetaM; + float cosThetaM4 = cosThetaM2 * cosThetaM2; + float tanThetaM2 = (1 - cosThetaM2) / cosThetaM2; + D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); + + /* eq. 
34: now calculate G1(i,m) and G1(o,m) */ + G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO))); + G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI))); + } + else { + /* anisotropic */ + float3 X, Y, Z = N; + make_orthonormals_tangent(Z, sc->T, &X, &Y); + + /* distribution */ + float3 local_m = make_float3(dot(X, m), dot(Y, m), dot(Z, m)); + float slope_x = -local_m.x/(local_m.z*alpha_x); + float slope_y = -local_m.y/(local_m.z*alpha_y); + float slope_len = 1 + slope_x*slope_x + slope_y*slope_y; + + float cosThetaM = local_m.z; + float cosThetaM2 = cosThetaM * cosThetaM; + float cosThetaM4 = cosThetaM2 * cosThetaM2; + + D = 1 / ((slope_len * slope_len) * M_PI_F * alpha2 * cosThetaM4); + + /* G1(i,m) and G1(o,m) */ + float tanThetaO2 = (1 - cosNO * cosNO) / (cosNO * cosNO); + float cosPhiO = dot(I, X); + float sinPhiO = dot(I, Y); + + float alphaO2 = (cosPhiO*cosPhiO)*(alpha_x*alpha_x) + (sinPhiO*sinPhiO)*(alpha_y*alpha_y); + alphaO2 /= cosPhiO*cosPhiO + sinPhiO*sinPhiO; + + G1o = 2 / (1 + safe_sqrtf(1 + alphaO2 * tanThetaO2)); + + float tanThetaI2 = (1 - cosNI * cosNI) / (cosNI * cosNI); + float cosPhiI = dot(omega_in, X); + float sinPhiI = dot(omega_in, Y); + + float alphaI2 = (cosPhiI*cosPhiI)*(alpha_x*alpha_x) + (sinPhiI*sinPhiI)*(alpha_y*alpha_y); + alphaI2 /= cosPhiI*cosPhiI + sinPhiI*sinPhiI; + + G1i = 2 / (1 + safe_sqrtf(1 + alphaI2 * tanThetaI2)); + } + float G = G1o * G1i; - float out = (G * D) * 0.25f / cosNO; - // eq. 24 - float pm = D * cosThetaM; - // convert into pdf of the sampled direction - // eq. 38 - but see also: - // eq. 17 in http://www.graphics.cornell.edu/~bjw/wardnotes.pdf - *pdf = pm * 0.25f / dot(Hr, I); - return make_float3 (out, out, out); + + /* eq. 20 */ + float common = D * 0.25f / cosNO; + float out = G * common; + + /* eq. 2 in distribution of visible normals sampling + * pm = Dw = G1o * dot(m, I) * D / dot(N, I); */ + + /* eq. 38 - but see also: + * eq. 
17 in http://www.graphics.cornell.edu/~bjw/wardnotes.pdf + * pdf = pm * 0.25 / dot(m, I); */ + *pdf = G1o * common; + + return make_float3(out, out, out); } - return make_float3 (0, 0, 0); + + return make_float3(0, 0, 0); } ccl_device float3 bsdf_microfacet_ggx_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) { - float m_ag = max(sc->data0, 1e-4f); - float m_eta = sc->data1; + float alpha_x = sc->data0; + float alpha_y = sc->data1; + float m_eta = sc->data2; int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID; float3 N = sc->N; - if(!m_refractive || m_ag <= 1e-4f) - return make_float3 (0, 0, 0); + if(!m_refractive || fmaxf(alpha_x, alpha_y) <= 1e-4f) + return make_float3(0, 0, 0); + float cosNO = dot(N, I); float cosNI = dot(N, omega_in); + if(cosNO <= 0 || cosNI >= 0) - return make_float3 (0, 0, 0); // vectors on same side -- not possible - // compute half-vector of the refraction (eq. 16) + return make_float3(0, 0, 0); /* vectors on same side -- not possible */ + + /* compute half-vector of the refraction (eq. 16) */ float3 ht = -(m_eta * omega_in + I); float3 Ht = normalize(ht); float cosHO = dot(Ht, I); - float cosHI = dot(Ht, omega_in); - // eq. 33: first we calculate D(m) with m=Ht: - float alpha2 = m_ag * m_ag; + + /* those situations makes chi+ terms in eq. 33, 34 be zero */ + if(dot(Ht, N) <= 0.0f || cosHO * cosNO <= 0.0f || cosHI * cosNI <= 0.0f) + return make_float3(0.0f, 0.0f, 0.0f); + + float D, G1o, G1i; + + /* eq. 33: first we calculate D(m) with m=Ht: */ + float alpha2 = alpha_x * alpha_y; float cosThetaM = dot(N, Ht); float cosThetaM2 = cosThetaM * cosThetaM; float tanThetaM2 = (1 - cosThetaM2) / cosThetaM2; float cosThetaM4 = cosThetaM2 * cosThetaM2; - float D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); - // eq. 
34: now calculate G1(i,m) and G1(o,m) - float G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO))); - float G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI))); + D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); + + /* eq. 34: now calculate G1(i,m) and G1(o,m) */ + G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO))); + G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI))); + float G = G1o * G1i; - // probability - float invHt2 = 1 / dot(ht, ht); - *pdf = D * fabsf(cosThetaM) * (fabsf(cosHI) * (m_eta * m_eta)) * invHt2; - float out = (fabsf(cosHI * cosHO) * (m_eta * m_eta) * (G * D) * invHt2) / cosNO; - return make_float3 (out, out, out); + + /* probability */ + float Ht2 = dot(ht, ht); + + /* eq. 2 in distribution of visible normals sampling + * pm = Dw = G1o * dot(m, I) * D / dot(N, I); */ + + /* out = fabsf(cosHI * cosHO) * (m_eta * m_eta) * G * D / (cosNO * Ht2) + * pdf = pm * (m_eta * m_eta) * fabsf(cosHI) / Ht2 */ + float common = D * (m_eta * m_eta) / (cosNO * Ht2); + float out = G * fabsf(cosHI * cosHO) * common; + *pdf = G1o * cosHO * fabsf(cosHI) * common; + + return make_float3(out, out, out); } -ccl_device int bsdf_microfacet_ggx_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf) +ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf) { - float m_ag = sc->data0; + float alpha_x = sc->data0; + float alpha_y = sc->data1; int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID; float3 N = sc->N; float cosNO = dot(N, I); if(cosNO > 0) { float3 X, Y, Z = N; - 
make_orthonormals(Z, &X, &Y); - // generate a random microfacet normal m - // eq. 35,36: - // we take advantage of cos(atan(x)) == 1/sqrt(1+x^2) - //tttt and sin(atan(x)) == x/sqrt(1+x^2) - float alpha2 = m_ag * m_ag; - float tanThetaM2 = alpha2 * randu / (1 - randu); - float cosThetaM = 1 / safe_sqrtf(1 + tanThetaM2); - float sinThetaM = cosThetaM * safe_sqrtf(tanThetaM2); - float phiM = M_2PI_F * randv; - float3 m = (cosf(phiM) * sinThetaM) * X + - (sinf(phiM) * sinThetaM) * Y + - ( cosThetaM) * Z; + + if(alpha_x == alpha_y) + make_orthonormals(Z, &X, &Y); + else + make_orthonormals_tangent(Z, sc->T, &X, &Y); + + /* importance sampling with distribution of visible normals. vectors are + * transformed to local space before and after */ + float3 local_I = make_float3(dot(X, I), dot(Y, I), cosNO); + float3 local_m; + float G1o; + + local_m = microfacet_sample_stretched(kg, local_I, alpha_x, alpha_y, + randu, randv, false, &G1o); + + float3 m = X*local_m.x + Y*local_m.y + Z*local_m.z; + float cosThetaM = local_m.z; + + /* reflection or refraction? */ if(!m_refractive) { float cosMO = dot(m, I); + if(cosMO > 0) { - // eq. 39 - compute actual reflected direction + /* eq. 39 - compute actual reflected direction */ *omega_in = 2 * cosMO * m - I; + if(dot(Ng, *omega_in) > 0) { - if (m_ag <= 1e-4f) { - // some high number for MIS + if(fmaxf(alpha_x, alpha_y) <= 1e-4f) { + /* some high number for MIS */ *pdf = 1e6f; *eval = make_float3(1e6f, 1e6f, 1e6f); } else { - // microfacet normal is visible to this ray - // eq. 33 - float cosThetaM2 = cosThetaM * cosThetaM; - float cosThetaM4 = cosThetaM2 * cosThetaM2; - float D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); - // eq. 24 - float pm = D * cosThetaM; - // convert into pdf of the sampled direction - // eq. 38 - but see also: - // eq. 17 in http://www.graphics.cornell.edu/~bjw/wardnotes.pdf - *pdf = pm * 0.25f / cosMO; - // eval BRDF*cosNI - float cosNI = dot(N, *omega_in); - // eq. 
34: now calculate G1(i,m) and G1(o,m) - float G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO))); - float G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI))); - float G = G1o * G1i; - // eq. 20: (F*G*D)/(4*in*on) - float out = (G * D) * 0.25f / cosNO; + /* microfacet normal is visible to this ray */ + /* eq. 33 */ + float alpha2 = alpha_x * alpha_y; + float D, G1i; + + if(alpha_x == alpha_y) { + /* isotropic */ + float cosThetaM2 = cosThetaM * cosThetaM; + float cosThetaM4 = cosThetaM2 * cosThetaM2; + float tanThetaM2 = 1/(cosThetaM2) - 1; + D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); + + /* eval BRDF*cosNI */ + float cosNI = dot(N, *omega_in); + + /* eq. 34: now calculate G1(i,m) */ + G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI))); + } + else { + /* anisotropic distribution */ + float3 local_m = make_float3(dot(X, m), dot(Y, m), dot(Z, m)); + float slope_x = -local_m.x/(local_m.z*alpha_x); + float slope_y = -local_m.y/(local_m.z*alpha_y); + float slope_len = 1 + slope_x*slope_x + slope_y*slope_y; + + float cosThetaM = local_m.z; + float cosThetaM2 = cosThetaM * cosThetaM; + float cosThetaM4 = cosThetaM2 * cosThetaM2; + + D = 1 / ((slope_len * slope_len) * M_PI_F * alpha2 * cosThetaM4); + + /* calculate G1(i,m) */ + float cosNI = dot(N, *omega_in); + + float tanThetaI2 = (1 - cosNI * cosNI) / (cosNI * cosNI); + float cosPhiI = dot(*omega_in, X); + float sinPhiI = dot(*omega_in, Y); + + float alphaI2 = (cosPhiI*cosPhiI)*(alpha_x*alpha_x) + (sinPhiI*sinPhiI)*(alpha_y*alpha_y); + alphaI2 /= cosPhiI*cosPhiI + sinPhiI*sinPhiI; + + G1i = 2 / (1 + safe_sqrtf(1 + alphaI2 * tanThetaI2)); + } + + /* see eval function for derivation */ + float common = (G1o * D) * 0.25f / cosNO; + float out = G1i * common; + *pdf = common; + *eval = make_float3(out, out, out); } @@ -198,14 +585,15 @@ ccl_device int bsdf_microfacet_ggx_sample(const ShaderClosure *sc, 
float3 Ng, fl } } else { - // CAUTION: the i and o variables are inverted relative to the paper - // eq. 39 - compute actual refractive direction + /* CAUTION: the i and o variables are inverted relative to the paper + * eq. 39 - compute actual refractive direction */ float3 R, T; #ifdef __RAY_DIFFERENTIALS__ float3 dRdx, dRdy, dTdx, dTdy; #endif - float m_eta = sc->data1; + float m_eta = sc->data2; bool inside; + fresnel_dielectric(m_eta, m, I, &R, &T, #ifdef __RAY_DIFFERENTIALS__ dIdx, dIdy, &dRdx, &dRdy, &dTdx, &dTdy, @@ -213,38 +601,43 @@ ccl_device int bsdf_microfacet_ggx_sample(const ShaderClosure *sc, float3 Ng, fl &inside); if(!inside) { + *omega_in = T; #ifdef __RAY_DIFFERENTIALS__ *domega_in_dx = dTdx; *domega_in_dy = dTdy; #endif - if (m_ag <= 1e-4f || fabsf(m_eta - 1.0f) < 1e-4f) { - // some high number for MIS + if(fmaxf(alpha_x, alpha_y) <= 1e-4f || fabsf(m_eta - 1.0f) < 1e-4f) { + /* some high number for MIS */ *pdf = 1e6f; *eval = make_float3(1e6f, 1e6f, 1e6f); } else { - // eq. 33 + /* eq. 33 */ + float alpha2 = alpha_x * alpha_y; float cosThetaM2 = cosThetaM * cosThetaM; float cosThetaM4 = cosThetaM2 * cosThetaM2; + float tanThetaM2 = 1/(cosThetaM2) - 1; float D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); - // eq. 24 - float pm = D * cosThetaM; - // eval BRDF*cosNI + + /* eval BRDF*cosNI */ float cosNI = dot(N, *omega_in); - // eq. 34: now calculate G1(i,m) and G1(o,m) - float G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO))); + + /* eq. 34: now calculate G1(i,m) */ float G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI))); - float G = G1o * G1i; - // eq. 21 + + /* eq. 21 */ float cosHI = dot(m, *omega_in); float cosHO = dot(m, I); float Ht2 = m_eta * cosHI + cosHO; Ht2 *= Ht2; - float out = (fabsf(cosHI * cosHO) * (m_eta * m_eta) * (G * D)) / (cosNO * Ht2); - // eq. 38 and eq. 
17 - *pdf = pm * (m_eta * m_eta) * fabsf(cosHI) / Ht2; + + /* see eval function for derivation */ + float common = (G1o * D) * (m_eta * m_eta) / (cosNO * Ht2); + float out = G1i * fabsf(cosHI * cosHO) * common; + *pdf = cosHO * fabsf(cosHI) * common; + *eval = make_float3(out, out, out); } } @@ -253,19 +646,33 @@ ccl_device int bsdf_microfacet_ggx_sample(const ShaderClosure *sc, float3 Ng, fl return (m_refractive) ? LABEL_TRANSMIT|LABEL_GLOSSY : LABEL_REFLECT|LABEL_GLOSSY; } -/* BECKMANN */ +/* Beckmann microfacet with Smith shadow-masking from: + * + * Microfacet Models for Refraction through Rough Surfaces + * B. Walter, S. R. Marschner, H. Li, K. E. Torrance, EGSR 2007 */ ccl_device int bsdf_microfacet_beckmann_setup(ShaderClosure *sc) { - sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* m_ab */ + sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */ + sc->data1 = sc->data0; /* alpha_y */ sc->type = CLOSURE_BSDF_MICROFACET_BECKMANN_ID; return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY; } +ccl_device int bsdf_microfacet_beckmann_aniso_setup(ShaderClosure *sc) +{ + sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */ + sc->data1 = clamp(sc->data1, 0.0f, 1.0f); /* alpha_y */ + + sc->type = CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID; + return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY; +} + ccl_device int bsdf_microfacet_beckmann_refraction_setup(ShaderClosure *sc) { - sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* m_ab */ + sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */ + sc->data1 = sc->data0; /* alpha_y */ sc->type = CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID; return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY; @@ -273,155 +680,257 @@ ccl_device int bsdf_microfacet_beckmann_refraction_setup(ShaderClosure *sc) ccl_device void bsdf_microfacet_beckmann_blur(ShaderClosure *sc, float roughness) { - sc->data0 = fmaxf(roughness, sc->data0); /* m_ab */ + sc->data0 = fmaxf(roughness, sc->data0); /* alpha_x */ + sc->data1 = fmaxf(roughness, sc->data1); /* alpha_y */ } 
ccl_device float3 bsdf_microfacet_beckmann_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) { - float m_ab = max(sc->data0, 1e-4f); + float alpha_x = sc->data0; + float alpha_y = sc->data1; int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID; float3 N = sc->N; - if(m_refractive || m_ab <= 1e-4f) - return make_float3 (0, 0, 0); + if(m_refractive || fmaxf(alpha_x, alpha_y) <= 1e-4f) + return make_float3(0, 0, 0); + float cosNO = dot(N, I); float cosNI = dot(N, omega_in); + if(cosNO > 0 && cosNI > 0) { - // get half vector - float3 Hr = normalize(omega_in + I); - // eq. 20: (F*G*D)/(4*in*on) - // eq. 25: first we calculate D(m) with m=Hr: - float alpha2 = m_ab * m_ab; - float cosThetaM = dot(N, Hr); - float cosThetaM2 = cosThetaM * cosThetaM; - float tanThetaM2 = (1 - cosThetaM2) / cosThetaM2; - float cosThetaM4 = cosThetaM2 * cosThetaM2; - float D = expf(-tanThetaM2 / alpha2) / (M_PI_F * alpha2 * cosThetaM4); - // eq. 26, 27: now calculate G1(i,m) and G1(o,m) - float ao = 1 / (m_ab * safe_sqrtf((1 - cosNO * cosNO) / (cosNO * cosNO))); - float ai = 1 / (m_ab * safe_sqrtf((1 - cosNI * cosNI) / (cosNI * cosNI))); - float G1o = ao < 1.6f ? (3.535f * ao + 2.181f * ao * ao) / (1 + 2.276f * ao + 2.577f * ao * ao) : 1.0f; - float G1i = ai < 1.6f ? (3.535f * ai + 2.181f * ai * ai) / (1 + 2.276f * ai + 2.577f * ai * ai) : 1.0f; - float G = G1o * G1i; - float out = (G * D) * 0.25f / cosNO; - // eq. 24 - float pm = D * cosThetaM; - // convert into pdf of the sampled direction - // eq. 38 - but see also: - // eq. 17 in http://www.graphics.cornell.edu/~bjw/wardnotes.pdf - *pdf = pm * 0.25f / dot(Hr, I); - return make_float3 (out, out, out); + /* get half vector */ + float3 m = normalize(omega_in + I); + + float alpha2 = alpha_x * alpha_y; + float D, G1o, G1i; + + if(alpha_x == alpha_y) { + /* isotropic + * eq. 20: (F*G*D)/(4*in*on) + * eq. 
25: first we calculate D(m) */ + float cosThetaM = dot(N, m); + float cosThetaM2 = cosThetaM * cosThetaM; + float tanThetaM2 = (1 - cosThetaM2) / cosThetaM2; + float cosThetaM4 = cosThetaM2 * cosThetaM2; + D = expf(-tanThetaM2 / alpha2) / (M_PI_F * alpha2 * cosThetaM4); + + /* eq. 26, 27: now calculate G1(i,m) and G1(o,m) */ + float ao = 1 / (alpha_x * safe_sqrtf((1 - cosNO * cosNO) / (cosNO * cosNO))); + float ai = 1 / (alpha_x * safe_sqrtf((1 - cosNI * cosNI) / (cosNI * cosNI))); + G1o = ao < 1.6f ? (3.535f * ao + 2.181f * ao * ao) / (1 + 2.276f * ao + 2.577f * ao * ao) : 1.0f; + G1i = ai < 1.6f ? (3.535f * ai + 2.181f * ai * ai) / (1 + 2.276f * ai + 2.577f * ai * ai) : 1.0f; + } + else { + /* anisotropic */ + float3 X, Y, Z = N; + make_orthonormals_tangent(Z, sc->T, &X, &Y); + + /* distribution */ + float3 local_m = make_float3(dot(X, m), dot(Y, m), dot(Z, m)); + float slope_x = -local_m.x/(local_m.z*alpha_x); + float slope_y = -local_m.y/(local_m.z*alpha_y); + + float cosThetaM = local_m.z; + float cosThetaM2 = cosThetaM * cosThetaM; + float cosThetaM4 = cosThetaM2 * cosThetaM2; + + D = expf(-slope_x*slope_x - slope_y*slope_y) / (M_PI_F * alpha2 * cosThetaM4); + + /* G1(i,m) and G1(o,m) */ + float tanThetaO2 = (1 - cosNO * cosNO) / (cosNO * cosNO); + float cosPhiO = dot(I, X); + float sinPhiO = dot(I, Y); + + float alphaO2 = (cosPhiO*cosPhiO)*(alpha_x*alpha_x) + (sinPhiO*sinPhiO)*(alpha_y*alpha_y); + alphaO2 /= cosPhiO*cosPhiO + sinPhiO*sinPhiO; + + float tanThetaI2 = (1 - cosNI * cosNI) / (cosNI * cosNI); + float cosPhiI = dot(omega_in, X); + float sinPhiI = dot(omega_in, Y); + + float alphaI2 = (cosPhiI*cosPhiI)*(alpha_x*alpha_x) + (sinPhiI*sinPhiI)*(alpha_y*alpha_y); + alphaI2 /= cosPhiI*cosPhiI + sinPhiI*sinPhiI; + + float ao = 1 / (safe_sqrtf(alphaO2 * tanThetaO2)); + float ai = 1 / (safe_sqrtf(alphaI2 * tanThetaI2)); + G1o = ao < 1.6f ? (3.535f * ao + 2.181f * ao * ao) / (1 + 2.276f * ao + 2.577f * ao * ao) : 1.0f; + G1i = ai < 1.6f ? 
(3.535f * ai + 2.181f * ai * ai) / (1 + 2.276f * ai + 2.577f * ai * ai) : 1.0f; + } + + float G = G1o * G1i; + + /* eq. 20 */ + float common = D * 0.25f / cosNO; + float out = G * common; + + /* eq. 2 in distribution of visible normals sampling + * pm = Dw = G1o * dot(m, I) * D / dot(N, I); */ + + /* eq. 38 - but see also: + * eq. 17 in http://www.graphics.cornell.edu/~bjw/wardnotes.pdf + * pdf = pm * 0.25 / dot(m, I); */ + *pdf = G1o * common; + + return make_float3(out, out, out); } - return make_float3 (0, 0, 0); + + return make_float3(0, 0, 0); } ccl_device float3 bsdf_microfacet_beckmann_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) { - float m_ab = max(sc->data0, 1e-4f); - float m_eta = sc->data1; + float alpha_x = sc->data0; + float alpha_y = sc->data1; + float m_eta = sc->data2; int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID; float3 N = sc->N; - if(!m_refractive || m_ab <= 1e-4f) - return make_float3 (0, 0, 0); + if(!m_refractive || fmaxf(alpha_x, alpha_y) <= 1e-4f) + return make_float3(0, 0, 0); + float cosNO = dot(N, I); float cosNI = dot(N, omega_in); + if(cosNO <= 0 || cosNI >= 0) - return make_float3 (0, 0, 0); - // compute half-vector of the refraction (eq. 16) + return make_float3(0, 0, 0); + + /* compute half-vector of the refraction (eq. 16) */ float3 ht = -(m_eta * omega_in + I); float3 Ht = normalize(ht); float cosHO = dot(Ht, I); - float cosHI = dot(Ht, omega_in); - // eq. 33: first we calculate D(m) with m=Ht: - float alpha2 = m_ab * m_ab; + + /* those situations makes chi+ terms in eq. 25, 27 be zero */ + if(dot(Ht, N) <= 0.0f || cosHO * cosNO <= 0.0f || cosHI * cosNI <= 0.0f) + return make_float3(0.0f, 0.0f, 0.0f); + + /* eq. 
25: first we calculate D(m) with m=Ht: */ + float alpha2 = alpha_x * alpha_y; float cosThetaM = min(dot(N, Ht), 1.0f); float cosThetaM2 = cosThetaM * cosThetaM; float tanThetaM2 = (1 - cosThetaM2) / cosThetaM2; float cosThetaM4 = cosThetaM2 * cosThetaM2; float D = expf(-tanThetaM2 / alpha2) / (M_PI_F * alpha2 * cosThetaM4); - // eq. 26, 27: now calculate G1(i,m) and G1(o,m) - float ao = 1 / (m_ab * safe_sqrtf((1 - cosNO * cosNO) / (cosNO * cosNO))); - float ai = 1 / (m_ab * safe_sqrtf((1 - cosNI * cosNI) / (cosNI * cosNI))); + + /* eq. 26, 27: now calculate G1(i,m) and G1(o,m) */ + float ao = 1 / (alpha_x * safe_sqrtf((1 - cosNO * cosNO) / (cosNO * cosNO))); + float ai = 1 / (alpha_x * safe_sqrtf((1 - cosNI * cosNI) / (cosNI * cosNI))); float G1o = ao < 1.6f ? (3.535f * ao + 2.181f * ao * ao) / (1 + 2.276f * ao + 2.577f * ao * ao) : 1.0f; float G1i = ai < 1.6f ? (3.535f * ai + 2.181f * ai * ai) / (1 + 2.276f * ai + 2.577f * ai * ai) : 1.0f; float G = G1o * G1i; - // probability - float invHt2 = 1 / dot(ht, ht); - *pdf = D * fabsf(cosThetaM) * (fabsf(cosHI) * (m_eta * m_eta)) * invHt2; - float out = (fabsf(cosHI * cosHO) * (m_eta * m_eta) * (G * D) * invHt2) / cosNO; - return make_float3 (out, out, out); + + /* probability */ + float Ht2 = dot(ht, ht); + + /* eq. 
2 in distribution of visible normals sampling + * pm = Dw = G1o * dot(m, I) * D / dot(N, I); */ + + /* out = fabsf(cosHI * cosHO) * (m_eta * m_eta) * G * D / (cosNO * Ht2) + * pdf = pm * (m_eta * m_eta) * fabsf(cosHI) / Ht2 */ + float common = D * (m_eta * m_eta) / (cosNO * Ht2); + float out = G * fabsf(cosHI * cosHO) * common; + *pdf = G1o * cosHO * fabsf(cosHI) * common; + + return make_float3(out, out, out); } -ccl_device int bsdf_microfacet_beckmann_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf) +ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf) { - float m_ab = sc->data0; + float alpha_x = sc->data0; + float alpha_y = sc->data1; int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID; float3 N = sc->N; float cosNO = dot(N, I); if(cosNO > 0) { float3 X, Y, Z = N; - make_orthonormals(Z, &X, &Y); - // generate a random microfacet normal m - // eq. 35,36: - // we take advantage of cos(atan(x)) == 1/sqrt(1+x^2) - //tttt and sin(atan(x)) == x/sqrt(1+x^2) - float alpha2 = m_ab * m_ab; - float tanThetaM, cosThetaM; - - if(alpha2 == 0.0f) { - tanThetaM = 0.0f; - cosThetaM = 1.0f; - } - else { - tanThetaM = safe_sqrtf(-alpha2 * logf(1 - randu)); - cosThetaM = 1 / safe_sqrtf(1 + tanThetaM * tanThetaM); - } - float sinThetaM = cosThetaM * tanThetaM; - float phiM = M_2PI_F * randv; - float3 m = (cosf(phiM) * sinThetaM) * X + - (sinf(phiM) * sinThetaM) * Y + - ( cosThetaM) * Z; + if(alpha_x == alpha_y) + make_orthonormals(Z, &X, &Y); + else + make_orthonormals_tangent(Z, sc->T, &X, &Y); + + /* importance sampling with distribution of visible normals. 
vectors are + * transformed to local space before and after */ + float3 local_I = make_float3(dot(X, I), dot(Y, I), cosNO); + float3 local_m; + float G1o; + local_m = microfacet_sample_stretched(kg, local_I, alpha_x, alpha_x, + randu, randv, true, &G1o); + + float3 m = X*local_m.x + Y*local_m.y + Z*local_m.z; + float cosThetaM = local_m.z; + + /* reflection or refraction? */ if(!m_refractive) { float cosMO = dot(m, I); + if(cosMO > 0) { - // eq. 39 - compute actual reflected direction + /* eq. 39 - compute actual reflected direction */ *omega_in = 2 * cosMO * m - I; + if(dot(Ng, *omega_in) > 0) { - if (m_ab <= 1e-4f) { - // some high number for MIS + if(fmaxf(alpha_x, alpha_y) <= 1e-4f) { + /* some high number for MIS */ *pdf = 1e6f; *eval = make_float3(1e6f, 1e6f, 1e6f); } else { - // microfacet normal is visible to this ray - // eq. 25 - float cosThetaM2 = cosThetaM * cosThetaM; - float tanThetaM2 = tanThetaM * tanThetaM; - float cosThetaM4 = cosThetaM2 * cosThetaM2; - float D = expf(-tanThetaM2 / alpha2) / (M_PI_F * alpha2 * cosThetaM4); - // eq. 24 - float pm = D * cosThetaM; - // convert into pdf of the sampled direction - // eq. 38 - but see also: - // eq. 17 in http://www.graphics.cornell.edu/~bjw/wardnotes.pdf - *pdf = pm * 0.25f / cosMO; - // Eval BRDF*cosNI - float cosNI = dot(N, *omega_in); - // eq. 26, 27: now calculate G1(i,m) and G1(o,m) - float ao = 1 / (m_ab * safe_sqrtf((1 - cosNO * cosNO) / (cosNO * cosNO))); - float ai = 1 / (m_ab * safe_sqrtf((1 - cosNI * cosNI) / (cosNI * cosNI))); - float G1o = ao < 1.6f ? (3.535f * ao + 2.181f * ao * ao) / (1 + 2.276f * ao + 2.577f * ao * ao) : 1.0f; - float G1i = ai < 1.6f ? (3.535f * ai + 2.181f * ai * ai) / (1 + 2.276f * ai + 2.577f * ai * ai) : 1.0f; + /* microfacet normal is visible to this ray + * eq. 
25 */ + float alpha2 = alpha_x * alpha_y; + float D, G1i; + + if(alpha_x == alpha_y) { + /* istropic distribution */ + float cosThetaM2 = cosThetaM * cosThetaM; + float cosThetaM4 = cosThetaM2 * cosThetaM2; + float tanThetaM2 = 1/(cosThetaM2) - 1; + D = expf(-tanThetaM2 / alpha2) / (M_PI_F * alpha2 * cosThetaM4); + + /* eval BRDF*cosNI */ + float cosNI = dot(N, *omega_in); + + /* eq. 26, 27: now calculate G1(i,m) */ + float ai = 1 / (alpha_x * safe_sqrtf((1 - cosNI * cosNI) / (cosNI * cosNI))); + G1i = ai < 1.6f ? (3.535f * ai + 2.181f * ai * ai) / (1 + 2.276f * ai + 2.577f * ai * ai) : 1.0f; + } + else { + /* anisotropic distribution */ + float3 local_m = make_float3(dot(X, m), dot(Y, m), dot(Z, m)); + float slope_x = -local_m.x/(local_m.z*alpha_x); + float slope_y = -local_m.y/(local_m.z*alpha_y); + + float cosThetaM = local_m.z; + float cosThetaM2 = cosThetaM * cosThetaM; + float cosThetaM4 = cosThetaM2 * cosThetaM2; + + D = expf(-slope_x*slope_x - slope_y*slope_y) / (M_PI_F * alpha2 * cosThetaM4); + + /* G1(i,m) */ + float cosNI = dot(N, *omega_in); + float tanThetaI2 = (1 - cosNI * cosNI) / (cosNI * cosNI); + float cosPhiI = dot(*omega_in, X); + float sinPhiI = dot(*omega_in, Y); + + float alphaI2 = (cosPhiI*cosPhiI)*(alpha_x*alpha_x) + (sinPhiI*sinPhiI)*(alpha_y*alpha_y); + alphaI2 /= cosPhiI*cosPhiI + sinPhiI*sinPhiI; + + float ai = 1 / (safe_sqrtf(alphaI2 * tanThetaI2)); + G1i = ai < 1.6f ? (3.535f * ai + 2.181f * ai * ai) / (1 + 2.276f * ai + 2.577f * ai * ai) : 1.0f; + } + float G = G1o * G1i; - // eq. 
20: (F*G*D)/(4*in*on) - float out = (G * D) * 0.25f / cosNO; + + /* see eval function for derivation */ + float common = D * 0.25f / cosNO; + float out = G * common; + *pdf = G1o * common; + *eval = make_float3(out, out, out); } + #ifdef __RAY_DIFFERENTIALS__ *domega_in_dx = (2 * dot(m, dIdx)) * m - dIdx; *domega_in_dy = (2 * dot(m, dIdy)) * m - dIdy; @@ -430,14 +939,15 @@ ccl_device int bsdf_microfacet_beckmann_sample(const ShaderClosure *sc, float3 N } } else { - // CAUTION: the i and o variables are inverted relative to the paper - // eq. 39 - compute actual refractive direction + /* CAUTION: the i and o variables are inverted relative to the paper + * eq. 39 - compute actual refractive direction */ float3 R, T; #ifdef __RAY_DIFFERENTIALS__ float3 dRdx, dRdy, dTdx, dTdy; #endif - float m_eta = sc->data1; + float m_eta = sc->data2; bool inside; + fresnel_dielectric(m_eta, m, I, &R, &T, #ifdef __RAY_DIFFERENTIALS__ dIdx, dIdy, &dRdx, &dRdy, &dTdx, &dTdy, @@ -446,39 +956,44 @@ ccl_device int bsdf_microfacet_beckmann_sample(const ShaderClosure *sc, float3 N if(!inside) { *omega_in = T; + #ifdef __RAY_DIFFERENTIALS__ *domega_in_dx = dTdx; *domega_in_dy = dTdy; #endif - if (m_ab <= 1e-4f || fabsf(m_eta - 1.0f) < 1e-4f) { - // some high number for MIS + + if(fmaxf(alpha_x, alpha_y) <= 1e-4f || fabsf(m_eta - 1.0f) < 1e-4f) { + /* some high number for MIS */ *pdf = 1e6f; *eval = make_float3(1e6f, 1e6f, 1e6f); } else { - // eq. 33 + /* eq. 33 */ + float alpha2 = alpha_x * alpha_y; float cosThetaM2 = cosThetaM * cosThetaM; - float tanThetaM2 = tanThetaM * tanThetaM; float cosThetaM4 = cosThetaM2 * cosThetaM2; + float tanThetaM2 = 1/(cosThetaM2) - 1; float D = expf(-tanThetaM2 / alpha2) / (M_PI_F * alpha2 * cosThetaM4); - // eq. 24 - float pm = D * cosThetaM; - // eval BRDF*cosNI + + /* eval BRDF*cosNI */ float cosNI = dot(N, *omega_in); - // eq. 
26, 27: now calculate G1(i,m) and G1(o,m) - float ao = 1 / (m_ab * safe_sqrtf((1 - cosNO * cosNO) / (cosNO * cosNO))); - float ai = 1 / (m_ab * safe_sqrtf((1 - cosNI * cosNI) / (cosNI * cosNI))); - float G1o = ao < 1.6f ? (3.535f * ao + 2.181f * ao * ao) / (1 + 2.276f * ao + 2.577f * ao * ao) : 1.0f; + + /* eq. 26, 27: now calculate G1(i,m) */ + float ai = 1 / (alpha_x * safe_sqrtf((1 - cosNI * cosNI) / (cosNI * cosNI))); float G1i = ai < 1.6f ? (3.535f * ai + 2.181f * ai * ai) / (1 + 2.276f * ai + 2.577f * ai * ai) : 1.0f; float G = G1o * G1i; - // eq. 21 + + /* eq. 21 */ float cosHI = dot(m, *omega_in); float cosHO = dot(m, I); float Ht2 = m_eta * cosHI + cosHO; Ht2 *= Ht2; - float out = (fabsf(cosHI * cosHO) * (m_eta * m_eta) * (G * D)) / (cosNO * Ht2); - // eq. 38 and eq. 17 - *pdf = pm * (m_eta * m_eta) * fabsf(cosHI) / Ht2; + + /* see eval function for derivation */ + float common = D * (m_eta * m_eta) / (cosNO * Ht2); + float out = G * fabsf(cosHI * cosHO) * common; + *pdf = G1o * cosHO * fabsf(cosHI) * common; + *eval = make_float3(out, out, out); } } diff --git a/intern/cycles/kernel/closure/bsdf_util.h b/intern/cycles/kernel/closure/bsdf_util.h index b3dcb9dcc38..05816bac2c1 100644 --- a/intern/cycles/kernel/closure/bsdf_util.h +++ b/intern/cycles/kernel/closure/bsdf_util.h @@ -111,16 +111,20 @@ ccl_device float fresnel_dielectric_cos(float cosi, float eta) return 1.0f; // TIR(no refracted component) } -ccl_device float fresnel_conductor(float cosi, float eta, float k) +#if 0 +ccl_device float3 fresnel_conductor(float cosi, const float3 eta, const float3 k) { - float tmp_f = eta * eta + k * k; - float tmp = tmp_f * cosi * cosi; - float Rparl2 = (tmp - (2.0f * eta * cosi) + 1)/ - (tmp + (2.0f * eta * cosi) + 1); - float Rperp2 = (tmp_f - (2.0f * eta * cosi) + cosi * cosi)/ - (tmp_f + (2.0f * eta * cosi) + cosi * cosi); + float3 cosi2 = make_float3(cosi*cosi); + float3 one = make_float3(1.0f, 1.0f, 1.0f); + float3 tmp_f = eta * eta + k * k; + float3 tmp = 
tmp_f * cosi2; + float3 Rparl2 = (tmp - (2.0f * eta * cosi) + one) / + (tmp + (2.0f * eta * cosi) + one); + float3 Rperp2 = (tmp_f - (2.0f * eta * cosi) + cosi2) / + (tmp_f + (2.0f * eta * cosi) + cosi2); return(Rparl2 + Rperp2) * 0.5f; } +#endif ccl_device float smooth_step(float edge0, float edge1, float x) { diff --git a/intern/cycles/kernel/closure/bsdf_ward.h b/intern/cycles/kernel/closure/bsdf_ward.h deleted file mode 100644 index c9de615a011..00000000000 --- a/intern/cycles/kernel/closure/bsdf_ward.h +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Adapted from Open Shading Language with this license: - * - * Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al. - * All Rights Reserved. - * - * Modifications Copyright 2011, Blender Foundation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Sony Pictures Imageworks nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __BSDF_WARD_H__ -#define __BSDF_WARD_H__ - -CCL_NAMESPACE_BEGIN - -/* WARD */ - -ccl_device int bsdf_ward_setup(ShaderClosure *sc) -{ - sc->data0 = clamp(sc->data0, 1e-4f, 1.0f); /* m_ax */ - sc->data1 = clamp(sc->data1, 1e-4f, 1.0f); /* m_ay */ - - sc->type = CLOSURE_BSDF_WARD_ID; - return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY; -} - -ccl_device void bsdf_ward_blur(ShaderClosure *sc, float roughness) -{ - sc->data0 = fmaxf(roughness, sc->data0); /* m_ax */ - sc->data1 = fmaxf(roughness, sc->data1); /* m_ay */ -} - -ccl_device float3 bsdf_ward_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) -{ - float m_ax = sc->data0; - float m_ay = sc->data1; - float3 N = sc->N; - float3 T = sc->T; - - float cosNO = dot(N, I); - float cosNI = dot(N, omega_in); - - if(cosNI > 0.0f && cosNO > 0.0f) { - cosNO = max(cosNO, 1e-4f); - cosNI = max(cosNI, 1e-4f); - - // get half vector and get x,y basis on the surface for anisotropy - float3 H = normalize(omega_in + I); // normalize needed for pdf - float3 X, Y; - make_orthonormals_tangent(N, T, &X, &Y); - // eq. 
4 - float dotx = dot(H, X) / m_ax; - float doty = dot(H, Y) / m_ay; - float dotn = dot(H, N); - float exp_arg = (dotx * dotx + doty * doty) / (dotn * dotn); - float denom = (M_4PI_F * m_ax * m_ay * sqrtf(cosNO * cosNI)); - float exp_val = expf(-exp_arg); - float out = cosNI * exp_val / denom; - float oh = dot(H, I); - denom = M_4PI_F * m_ax * m_ay * oh * dotn * dotn * dotn; - *pdf = exp_val / denom; - return make_float3 (out, out, out); - } - - return make_float3 (0, 0, 0); -} - -ccl_device float3 bsdf_ward_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) -{ - return make_float3(0.0f, 0.0f, 0.0f); -} - -ccl_device int bsdf_ward_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf) -{ - float m_ax = sc->data0; - float m_ay = sc->data1; - float3 N = sc->N; - float3 T = sc->T; - - float cosNO = dot(N, I); - if(cosNO > 0.0f) { - // get x,y basis on the surface for anisotropy - float3 X, Y; - make_orthonormals_tangent(N, T, &X, &Y); - // generate random angles for the half vector - // eq. 
7 (taking care around discontinuities to keep - //ttoutput angle in the right quadrant) - // we take advantage of cos(atan(x)) == 1/sqrt(1+x^2) - //tttt and sin(atan(x)) == x/sqrt(1+x^2) - float alphaRatio = m_ay / m_ax; - float cosPhi, sinPhi; - if(randu < 0.25f) { - float val = 4 * randu; - float tanPhi = alphaRatio * tanf(M_PI_2_F * val); - cosPhi = 1 / sqrtf(1 + tanPhi * tanPhi); - sinPhi = tanPhi * cosPhi; - } - else if(randu < 0.5f) { - float val = 1 - 4 * (0.5f - randu); - float tanPhi = alphaRatio * tanf(M_PI_2_F * val); - // phi = M_PI_F - phi; - cosPhi = -1 / sqrtf(1 + tanPhi * tanPhi); - sinPhi = -tanPhi * cosPhi; - } - else if(randu < 0.75f) { - float val = 4 * (randu - 0.5f); - float tanPhi = alphaRatio * tanf(M_PI_2_F * val); - //phi = M_PI_F + phi; - cosPhi = -1 / sqrtf(1 + tanPhi * tanPhi); - sinPhi = tanPhi * cosPhi; - } - else { - float val = 1 - 4 * (1 - randu); - float tanPhi = alphaRatio * tanf(M_PI_2_F * val); - // phi = M_2PI_F - phi; - cosPhi = 1 / sqrtf(1 + tanPhi * tanPhi); - sinPhi = -tanPhi * cosPhi; - } - // eq. 6 - // we take advantage of cos(atan(x)) == 1/sqrt(1+x^2) - //tttt and sin(atan(x)) == x/sqrt(1+x^2) - float thetaDenom = (cosPhi * cosPhi) / (m_ax * m_ax) + (sinPhi * sinPhi) / (m_ay * m_ay); - float tanTheta2 = -logf(1 - randv) / thetaDenom; - float cosTheta = 1 / sqrtf(1 + tanTheta2); - float sinTheta = cosTheta * sqrtf(tanTheta2); - - float3 h; // already normalized becaused expressed from spherical coordinates - h.x = sinTheta * cosPhi; - h.y = sinTheta * sinPhi; - h.z = cosTheta; - // compute terms that are easier in local space - float dotx = h.x / m_ax; - float doty = h.y / m_ay; - float dotn = h.z; - // transform to world space - h = h.x * X + h.y * Y + h.z * N; - // generate the final sample - float oh = dot(h, I); - *omega_in = 2.0f * oh * h - I; - if(dot(Ng, *omega_in) > 0) { - float cosNI = dot(N, *omega_in); - if(cosNI > 0) { - cosNO = max(cosNO, 1e-4f); - cosNI = max(cosNI, 1e-4f); - - // eq. 
9 - float exp_arg = (dotx * dotx + doty * doty) / (dotn * dotn); - float denom = M_4PI_F * m_ax * m_ay * oh * dotn * dotn * dotn; - *pdf = expf(-exp_arg) / denom; - // compiler will reuse expressions already computed - denom = (M_4PI_F * m_ax * m_ay * sqrtf(cosNO * cosNI)); - float power = cosNI * expf(-exp_arg) / denom; - *eval = make_float3(power, power, power); -#ifdef __RAY_DIFFERENTIALS__ - *domega_in_dx = (2 * dot(N, dIdx)) * N - dIdx; - *domega_in_dy = (2 * dot(N, dIdy)) * N - dIdy; -#endif - } - } - } - return LABEL_REFLECT|LABEL_GLOSSY; -} - -CCL_NAMESPACE_END - -#endif /* __BSDF_WARD_H__ */ - diff --git a/intern/cycles/kernel/closure/bsdf_westin.h b/intern/cycles/kernel/closure/bsdf_westin.h deleted file mode 100644 index 9dc1c00bb3d..00000000000 --- a/intern/cycles/kernel/closure/bsdf_westin.h +++ /dev/null @@ -1,180 +0,0 @@ -/* - * Adapted from Open Shading Language with this license: - * - * Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al. - * All Rights Reserved. - * - * Modifications Copyright 2011, Blender Foundation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Sony Pictures Imageworks nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __BSDF_WESTIN_H__ -#define __BSDF_WESTIN_H__ - -CCL_NAMESPACE_BEGIN - -/* WESTIN BACKSCATTER */ - -ccl_device int bsdf_westin_backscatter_setup(ShaderClosure *sc) -{ - float roughness = sc->data0; - roughness = clamp(roughness, 1e-5f, 1.0f); - float m_invroughness = 1.0f/roughness; - - sc->type = CLOSURE_BSDF_WESTIN_BACKSCATTER_ID; - sc->data0 = m_invroughness; - - return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY; -} - -ccl_device void bsdf_westin_backscatter_blur(ShaderClosure *sc, float roughness) -{ - float m_invroughness = sc->data0; - m_invroughness = min(1.0f/roughness, m_invroughness); - sc->data0 = m_invroughness; -} - -ccl_device float3 bsdf_westin_backscatter_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) -{ - float m_invroughness = sc->data0; - float3 N = sc->N; - - // pdf is implicitly 0 (no indirect sampling) - float cosNO = dot(N, I); - float cosNI = dot(N, omega_in); - if(cosNO > 0 && cosNI > 0) { - float cosine = dot(I, omega_in); - *pdf = cosine > 0 ? 
(m_invroughness + 1) * powf(cosine, m_invroughness) : 0; - *pdf *= 0.5f * M_1_PI_F; - return make_float3 (*pdf, *pdf, *pdf); - } - return make_float3 (0, 0, 0); -} - -ccl_device float3 bsdf_westin_backscatter_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) -{ - return make_float3(0.0f, 0.0f, 0.0f); -} - -ccl_device int bsdf_westin_backscatter_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf) -{ - float m_invroughness = sc->data0; - float3 N = sc->N; - - float cosNO = dot(N, I); - if(cosNO > 0) { -#ifdef __RAY_DIFFERENTIALS__ - *domega_in_dx = dIdx; - *domega_in_dy = dIdy; -#endif - float3 T, B; - make_orthonormals (I, &T, &B); - float phi = M_2PI_F * randu; - float cosTheta = powf(randv, 1 / (m_invroughness + 1)); - float sinTheta2 = 1 - cosTheta * cosTheta; - float sinTheta = sinTheta2 > 0 ? sqrtf(sinTheta2) : 0; - *omega_in = (cosf(phi) * sinTheta) * T + - (sinf(phi) * sinTheta) * B + - (cosTheta) * I; - if(dot(Ng, *omega_in) > 0) { - // common terms for pdf and eval - float cosNI = dot(N, *omega_in); - // make sure the direction we chose is still in the right hemisphere - if(cosNI > 0) - { - *pdf = 0.5f * M_1_PI_F * powf(cosTheta, m_invroughness); - *pdf = (m_invroughness + 1) * (*pdf); - *eval = make_float3(*pdf, *pdf, *pdf); - } - } - } - return LABEL_REFLECT|LABEL_GLOSSY; -} - -/* WESTIN SHEEN */ - -ccl_device int bsdf_westin_sheen_setup(ShaderClosure *sc) -{ - /* float edginess = sc->data0; */ - sc->type = CLOSURE_BSDF_WESTIN_SHEEN_ID; - return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY; -} - -ccl_device void bsdf_westin_sheen_blur(ShaderClosure *sc, float roughness) -{ -} - -ccl_device float3 bsdf_westin_sheen_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) -{ - float m_edginess = sc->data0; - float3 N = sc->N; - - // pdf is implicitly 0 
(no indirect sampling) - float cosNO = dot(N, I); - float cosNI = dot(N, omega_in); - if(cosNO > 0 && cosNI > 0) { - float sinNO2 = 1 - cosNO * cosNO; - *pdf = cosNI * M_1_PI_F; - float westin = sinNO2 > 0 ? powf(sinNO2, 0.5f * m_edginess) * (*pdf) : 0; - return make_float3 (westin, westin, westin); - } - return make_float3 (0, 0, 0); -} - -ccl_device float3 bsdf_westin_sheen_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) -{ - return make_float3(0.0f, 0.0f, 0.0f); -} - -ccl_device int bsdf_westin_sheen_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf) -{ - float m_edginess = sc->data0; - float3 N = sc->N; - - // we are viewing the surface from the right side - send a ray out with cosine - // distribution over the hemisphere - sample_cos_hemisphere(N, randu, randv, omega_in, pdf); - if(dot(Ng, *omega_in) > 0) { - // TODO: account for sheen when sampling - float cosNO = dot(N, I); - float sinNO2 = 1 - cosNO * cosNO; - float westin = sinNO2 > 0 ? 
powf(sinNO2, 0.5f * m_edginess) * (*pdf) : 0; - *eval = make_float3(westin, westin, westin); -#ifdef __RAY_DIFFERENTIALS__ - // TODO: find a better approximation for the diffuse bounce - *domega_in_dx = (2 * dot(N, dIdx)) * N - dIdx; - *domega_in_dy = (2 * dot(N, dIdy)) * N - dIdy; -#endif - } - else { - pdf = 0; - } - return LABEL_REFLECT|LABEL_DIFFUSE; -} - -CCL_NAMESPACE_END - -#endif /* __BSDF_WESTIN_H__ */ - diff --git a/intern/cycles/kernel/geom/geom_bvh.h b/intern/cycles/kernel/geom/geom_bvh.h index dd7c25d581d..c5336e086b7 100644 --- a/intern/cycles/kernel/geom/geom_bvh.h +++ b/intern/cycles/kernel/geom/geom_bvh.h @@ -28,6 +28,13 @@ CCL_NAMESPACE_BEGIN +/* Don't inline intersect functions on GPU, this is faster */ +#ifdef __KERNEL_GPU__ +#define ccl_device_intersect ccl_device_noinline +#else +#define ccl_device_intersect ccl_device_inline +#endif + /* BVH intersection function variations */ #define BVH_INSTANCING 1 @@ -35,6 +42,8 @@ CCL_NAMESPACE_BEGIN #define BVH_HAIR 4 #define BVH_HAIR_MINIMUM_WIDTH 8 +/* Regular BVH traversal */ + #define BVH_FUNCTION_NAME bvh_intersect #define BVH_FUNCTION_FEATURES 0 #include "geom_bvh_traversal.h" @@ -63,6 +72,8 @@ CCL_NAMESPACE_BEGIN #include "geom_bvh_traversal.h" #endif +/* Subsurface scattering BVH traversal */ + #if defined(__SUBSURFACE__) #define BVH_FUNCTION_NAME bvh_intersect_subsurface #define BVH_FUNCTION_FEATURES 0 @@ -93,47 +104,72 @@ CCL_NAMESPACE_BEGIN #include "geom_bvh_subsurface.h" #endif +/* Record all BVH intersection for shadows */ + #if defined(__SHADOW_RECORD_ALL__) #define BVH_FUNCTION_NAME bvh_intersect_shadow_all #define BVH_FUNCTION_FEATURES 0 #include "geom_bvh_shadow.h" #endif -#if defined(__SUBSURFACE__) && defined(__INSTANCING__) +#if defined(__SHADOW_RECORD_ALL__) && defined(__INSTANCING__) #define BVH_FUNCTION_NAME bvh_intersect_shadow_all_instancing #define BVH_FUNCTION_FEATURES BVH_INSTANCING #include "geom_bvh_shadow.h" #endif -#if defined(__SUBSURFACE__) && defined(__HAIR__) +#if 
defined(__SHADOW_RECORD_ALL__) && defined(__HAIR__) #define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair #define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR #include "geom_bvh_shadow.h" #endif -#if defined(__SUBSURFACE__) && defined(__OBJECT_MOTION__) +#if defined(__SHADOW_RECORD_ALL__) && defined(__OBJECT_MOTION__) #define BVH_FUNCTION_NAME bvh_intersect_shadow_all_motion #define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION #include "geom_bvh_shadow.h" #endif -#if defined(__SUBSURFACE__) && defined(__HAIR__) && defined(__OBJECT_MOTION__) +#if defined(__SHADOW_RECORD_ALL__) && defined(__HAIR__) && defined(__OBJECT_MOTION__) #define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair_motion #define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_MOTION #include "geom_bvh_shadow.h" #endif -/* to work around titan bug when using arrays instead of textures */ -#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__) -ccl_device_inline -#else -ccl_device_noinline +/* Camera inside Volume BVH intersection */ + +#if defined(__VOLUME__) +#define BVH_FUNCTION_NAME bvh_intersect_volume +#define BVH_FUNCTION_FEATURES 0 +#include "geom_bvh_volume.h" #endif -#ifdef __HAIR__ -bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect, uint *lcg_state, float difl, float extmax) -#else -bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect) + +#if defined(__VOLUME__) && defined(__INSTANCING__) +#define BVH_FUNCTION_NAME bvh_intersect_volume_instancing +#define BVH_FUNCTION_FEATURES BVH_INSTANCING +#include "geom_bvh_volume.h" +#endif + +#if defined(__VOLUME__) && defined(__HAIR__) +#define BVH_FUNCTION_NAME bvh_intersect_volume_hair +#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH +#include "geom_bvh_volume.h" +#endif + +#if defined(__VOLUME__) && defined(__OBJECT_MOTION__) +#define BVH_FUNCTION_NAME bvh_intersect_volume_motion +#define 
BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION +#include "geom_bvh_volume.h" +#endif + +#if defined(__VOLUME__) && defined(__HAIR__) && defined(__OBJECT_MOTION__) +#define BVH_FUNCTION_NAME bvh_intersect_volume_hair_motion +#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION +#include "geom_bvh_volume.h" #endif + +ccl_device_intersect bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect, + uint *lcg_state, float difl, float extmax) { #ifdef __OBJECT_MOTION__ if(kernel_data.bvh.have_motion) { @@ -170,14 +206,8 @@ bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, I #endif /* __KERNEL_CPU__ */ } -/* to work around titan bug when using arrays instead of textures */ #ifdef __SUBSURFACE__ -#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__) -ccl_device_inline -#else -ccl_device_noinline -#endif -uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection *isect, int subsurface_object, uint *lcg_state, int max_hits) +ccl_device_intersect uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection *isect, int subsurface_object, uint *lcg_state, int max_hits) { #ifdef __OBJECT_MOTION__ if(kernel_data.bvh.have_motion) { @@ -215,14 +245,8 @@ uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection } #endif -/* to work around titan bug when using arrays instead of textures */ #ifdef __SHADOW_RECORD_ALL__ -#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__) -ccl_device_inline -#else -ccl_device_noinline -#endif -uint scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection *isect, uint max_hits, uint *num_hits) +ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection *isect, uint max_hits, uint *num_hits) { #ifdef __OBJECT_MOTION__ if(kernel_data.bvh.have_motion) { @@ -240,20 +264,50 @@ uint 
scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection return bvh_intersect_shadow_all_hair(kg, ray, isect, max_hits, num_hits); #endif /* __HAIR__ */ -#ifdef __KERNEL_CPU__ - #ifdef __INSTANCING__ if(kernel_data.bvh.have_instancing) return bvh_intersect_shadow_all_instancing(kg, ray, isect, max_hits, num_hits); #endif /* __INSTANCING__ */ return bvh_intersect_shadow_all(kg, ray, isect, max_hits, num_hits); +} +#endif + +#ifdef __VOLUME__ +ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg, + const Ray *ray, + Intersection *isect) +{ +#ifdef __OBJECT_MOTION__ + if(kernel_data.bvh.have_motion) { +#ifdef __HAIR__ + if(kernel_data.bvh.have_curves) + return bvh_intersect_volume_hair_motion(kg, ray, isect); +#endif /* __HAIR__ */ + + return bvh_intersect_volume_motion(kg, ray, isect); + } +#endif /* __OBJECT_MOTION__ */ + +#ifdef __HAIR__ + if(kernel_data.bvh.have_curves) + return bvh_intersect_volume_hair(kg, ray, isect); +#endif /* __HAIR__ */ + +#ifdef __KERNEL_CPU__ + +#ifdef __INSTANCING__ + if(kernel_data.bvh.have_instancing) + return bvh_intersect_volume_instancing(kg, ray, isect); +#endif /* __INSTANCING__ */ + + return bvh_intersect_volume(kg, ray, isect); #else /* __KERNEL_CPU__ */ #ifdef __INSTANCING__ - return bvh_intersect_shadow_all_instancing(kg, ray, isect, max_hits, num_hits); + return bvh_intersect_volume_instancing(kg, ray, isect); #else - return bvh_intersect_shadow_all(kg, ray, isect, max_hits, num_hits); + return bvh_intersect_volume(kg, ray, isect); #endif /* __INSTANCING__ */ #endif /* __KERNEL_CPU__ */ diff --git a/intern/cycles/kernel/geom/geom_bvh_shadow.h b/intern/cycles/kernel/geom/geom_bvh_shadow.h index 98bf82b3b2d..aee4097d77e 100644 --- a/intern/cycles/kernel/geom/geom_bvh_shadow.h +++ b/intern/cycles/kernel/geom/geom_bvh_shadow.h @@ -68,15 +68,15 @@ ccl_device bool BVH_FUNCTION_NAME const shuffle_swap_t shuf_identity = shuffle_swap_identity(); const shuffle_swap_t shuf_swap = shuffle_swap_swap(); - 
const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0)); - __m128 Psplat[3], idirsplat[3]; + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); + ssef Psplat[3], idirsplat[3]; shuffle_swap_t shufflexyz[3]; - Psplat[0] = _mm_set_ps1(P.x); - Psplat[1] = _mm_set_ps1(P.y); - Psplat[2] = _mm_set_ps1(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); - __m128 tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f); + ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t); gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif @@ -132,27 +132,27 @@ ccl_device bool BVH_FUNCTION_NAME /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ /* fetch node data */ - const __m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; + const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; const float4 cnodes = ((float4*)bvh_nodes)[3]; /* intersect ray against child nodes */ - const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]); - const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]); - const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]); + const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; + const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; + const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; /* calculate { c0min, c1min, -c0max, -c1max} */ - __m128 minmax = _mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat)); - const __m128 tminmax = _mm_xor_ps(minmax, pn); - const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax)); + const ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); + const 
ssef tminmax = minmax ^ pn; + const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); /* decide which nodes to traverse next */ #ifdef __VISIBILITY_FLAG__ /* this visibility test gives a 5% performance hit, how to solve? */ - traverseChild0 = (_mm_movemask_ps(lrhit) & 1) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW); - traverseChild1 = (_mm_movemask_ps(lrhit) & 2) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW); + traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW); + traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW); #else - traverseChild0 = (_mm_movemask_ps(lrhit) & 1); - traverseChild1 = (_mm_movemask_ps(lrhit) & 2); + traverseChild0 = (movemask(lrhit) & 1); + traverseChild1 = (movemask(lrhit) & 2); #endif #endif // __KERNEL_SSE2__ @@ -164,9 +164,7 @@ ccl_device bool BVH_FUNCTION_NAME #if !defined(__KERNEL_SSE2__) bool closestChild1 = (c1min < c0min); #else - union { __m128 m128; float v[4]; } uminmax; - uminmax.m128 = tminmax; - bool closestChild1 = uminmax.v[1] < uminmax.v[0]; + bool closestChild1 = tminmax[1] < tminmax[0]; #endif if(closestChild1) { @@ -254,8 +252,7 @@ ccl_device bool BVH_FUNCTION_NAME if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE) #endif { - float4 Ns = kernel_tex_fetch(__tri_normal, prim); - shader = __float_as_int(Ns.w); + shader = kernel_tex_fetch(__tri_shader, prim); } #ifdef __HAIR__ else { @@ -301,12 +298,12 @@ ccl_device bool BVH_FUNCTION_NAME num_hits_in_instance = 0; #if defined(__KERNEL_SSE2__) - Psplat[0] = _mm_set_ps1(P.x); - Psplat[1] = _mm_set_ps1(P.y); - Psplat[2] = _mm_set_ps1(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); isect_array->t = isect_t; - tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f); + tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif @@ -348,13 +345,13 @@ ccl_device bool 
BVH_FUNCTION_NAME } #if defined(__KERNEL_SSE2__) - Psplat[0] = _mm_set_ps1(P.x); - Psplat[1] = _mm_set_ps1(P.y); - Psplat[2] = _mm_set_ps1(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); isect_t = tmax; isect_array->t = isect_t; - tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f); + tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif diff --git a/intern/cycles/kernel/geom/geom_bvh_subsurface.h b/intern/cycles/kernel/geom/geom_bvh_subsurface.h index a19f05dd371..a8f57cffa78 100644 --- a/intern/cycles/kernel/geom/geom_bvh_subsurface.h +++ b/intern/cycles/kernel/geom/geom_bvh_subsurface.h @@ -65,15 +65,15 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio const shuffle_swap_t shuf_identity = shuffle_swap_identity(); const shuffle_swap_t shuf_swap = shuffle_swap_swap(); - const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0)); - __m128 Psplat[3], idirsplat[3]; + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); + ssef Psplat[3], idirsplat[3]; shuffle_swap_t shufflexyz[3]; - Psplat[0] = _mm_set_ps1(P.x); - Psplat[1] = _mm_set_ps1(P.y); - Psplat[2] = _mm_set_ps1(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); - __m128 tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f); + ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t); gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif @@ -131,25 +131,27 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ /* fetch node data */ - const __m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; + const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; const float4 cnodes = ((float4*)bvh_nodes)[3]; /* intersect ray against child nodes */ - const __m128 
tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]); - const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]); - const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]); + const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; + const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; + const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; - const __m128 tminmax = _mm_xor_ps(_mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat)), pn); - const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax)); + /* calculate { c0min, c1min, -c0max, -c1max} */ + const ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); + const ssef tminmax = minmax ^ pn; + const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); /* decide which nodes to traverse next */ #ifdef __VISIBILITY_FLAG__ /* this visibility test gives a 5% performance hit, how to solve? 
*/ - traverseChild0 = (_mm_movemask_ps(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility); - traverseChild1 = (_mm_movemask_ps(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility); + traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility); + traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility); #else - traverseChild0 = (_mm_movemask_ps(lrhit) & 1); - traverseChild1 = (_mm_movemask_ps(lrhit) & 2); + traverseChild0 = (movemask(lrhit) & 1); + traverseChild1 = (movemask(lrhit) & 2); #endif #endif // __KERNEL_SSE2__ @@ -161,9 +163,7 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio #if !defined(__KERNEL_SSE2__) bool closestChild1 = (c1min < c0min); #else - union { __m128 m128; float v[4]; } uminmax; - uminmax.m128 = tminmax; - bool closestChild1 = uminmax.v[1] < uminmax.v[0]; + bool closestChild1 = tminmax[1] < tminmax[0]; #endif if(closestChild1) { @@ -243,11 +243,11 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio #endif #if defined(__KERNEL_SSE2__) - Psplat[0] = _mm_set_ps1(P.x); - Psplat[1] = _mm_set_ps1(P.y); - Psplat[2] = _mm_set_ps1(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); - tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f); + tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif @@ -279,11 +279,11 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio #endif #if defined(__KERNEL_SSE2__) - Psplat[0] = _mm_set_ps1(P.x); - Psplat[1] = _mm_set_ps1(P.y); - Psplat[2] = _mm_set_ps1(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); - tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f); + tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif diff --git 
a/intern/cycles/kernel/geom/geom_bvh_traversal.h b/intern/cycles/kernel/geom/geom_bvh_traversal.h index 9fd40f91471..114d30a479d 100644 --- a/intern/cycles/kernel/geom/geom_bvh_traversal.h +++ b/intern/cycles/kernel/geom/geom_bvh_traversal.h @@ -63,24 +63,28 @@ ccl_device bool BVH_FUNCTION_NAME #endif isect->t = ray->t; - isect->object = OBJECT_NONE; - isect->prim = PRIM_NONE; isect->u = 0.0f; isect->v = 0.0f; + isect->prim = PRIM_NONE; + isect->object = OBJECT_NONE; + +#if defined(__KERNEL_DEBUG__) + isect->num_traversal_steps = 0; +#endif #if defined(__KERNEL_SSE2__) const shuffle_swap_t shuf_identity = shuffle_swap_identity(); const shuffle_swap_t shuf_swap = shuffle_swap_swap(); - const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0)); - __m128 Psplat[3], idirsplat[3]; + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); + ssef Psplat[3], idirsplat[3]; shuffle_swap_t shufflexyz[3]; - Psplat[0] = _mm_set_ps1(P.x); - Psplat[1] = _mm_set_ps1(P.y); - Psplat[2] = _mm_set_ps1(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); - __m128 tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f); + ssef tsplat(0.0f, 0.0f, -isect->t, -isect->t); gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif @@ -151,17 +155,17 @@ ccl_device bool BVH_FUNCTION_NAME /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ /* fetch node data */ - const __m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; + const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; const float4 cnodes = ((float4*)bvh_nodes)[3]; /* intersect ray against child nodes */ - const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]); - const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]); - const __m128 tminmaxz = 
_mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]); + const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; + const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; + const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; /* calculate { c0min, c1min, -c0max, -c1max} */ - __m128 minmax = _mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat)); - const __m128 tminmax = _mm_xor_ps(minmax, pn); + ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); + const ssef tminmax = minmax ^ pn; #if FEATURE(BVH_HAIR_MINIMUM_WIDTH) if(difl != 0.0f) { @@ -182,16 +186,16 @@ ccl_device bool BVH_FUNCTION_NAME } #endif - const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax)); + const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); /* decide which nodes to traverse next */ #ifdef __VISIBILITY_FLAG__ /* this visibility test gives a 5% performance hit, how to solve? 
*/ - traverseChild0 = (_mm_movemask_ps(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility); - traverseChild1 = (_mm_movemask_ps(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility); + traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility); + traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility); #else - traverseChild0 = (_mm_movemask_ps(lrhit) & 1); - traverseChild1 = (_mm_movemask_ps(lrhit) & 2); + traverseChild0 = (movemask(lrhit) & 1); + traverseChild1 = (movemask(lrhit) & 2); #endif #endif // __KERNEL_SSE2__ @@ -203,9 +207,7 @@ ccl_device bool BVH_FUNCTION_NAME #if !defined(__KERNEL_SSE2__) bool closestChild1 = (c1min < c0min); #else - union { __m128 m128; float v[4]; } uminmax; - uminmax.m128 = tminmax; - bool closestChild1 = uminmax.v[1] < uminmax.v[0]; + bool closestChild1 = tminmax[1] < tminmax[0]; #endif if(closestChild1) { @@ -228,6 +230,10 @@ ccl_device bool BVH_FUNCTION_NAME --stackPtr; } } + +#if defined(__KERNEL_DEBUG__) + isect->num_traversal_steps++; +#endif } /* if node is leaf, fetch triangle list */ @@ -276,13 +282,17 @@ ccl_device bool BVH_FUNCTION_NAME } } +#if defined(__KERNEL_DEBUG__) + isect->num_traversal_steps++; +#endif + /* shadow ray early termination */ #if defined(__KERNEL_SSE2__) if(hit) { if(visibility == PATH_RAY_SHADOW_OPAQUE) return true; - tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f); + tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); } #else if(hit && visibility == PATH_RAY_SHADOW_OPAQUE) @@ -304,11 +314,11 @@ ccl_device bool BVH_FUNCTION_NAME #endif #if defined(__KERNEL_SSE2__) - Psplat[0] = _mm_set_ps1(P.x); - Psplat[1] = _mm_set_ps1(P.y); - Psplat[2] = _mm_set_ps1(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); - tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f); + tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif @@ -334,11 +344,11 
@@ ccl_device bool BVH_FUNCTION_NAME #endif #if defined(__KERNEL_SSE2__) - Psplat[0] = _mm_set_ps1(P.x); - Psplat[1] = _mm_set_ps1(P.y); - Psplat[2] = _mm_set_ps1(P.z); + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); - tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f); + tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif diff --git a/intern/cycles/kernel/geom/geom_bvh_volume.h b/intern/cycles/kernel/geom/geom_bvh_volume.h new file mode 100644 index 00000000000..9dd8d226f5b --- /dev/null +++ b/intern/cycles/kernel/geom/geom_bvh_volume.h @@ -0,0 +1,322 @@ +/* + * Adapted from code Copyright 2009-2010 NVIDIA Corporation, + * and code copyright 2009-2012 Intel Corporation + * + * Modifications Copyright 2011-2014, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This is a template BVH traversal function for volumes, where + * various features can be enabled/disabled. This way we can compile optimized + * versions for each case without new features slowing things down. 
+ * + * BVH_INSTANCING: object instancing + * BVH_HAIR: hair curve rendering + * BVH_MOTION: motion blur rendering + * + */ + +#define FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0) + +ccl_device bool BVH_FUNCTION_NAME(KernelGlobals *kg, + const Ray *ray, + Intersection *isect) +{ + /* todo: + * - test if pushing distance on the stack helps (for non shadow rays) + * - separate version for shadow rays + * - likely and unlikely for if() statements + * - test restrict attribute for pointers + */ + + /* traversal stack in CUDA thread-local memory */ + int traversalStack[BVH_STACK_SIZE]; + traversalStack[0] = ENTRYPOINT_SENTINEL; + + /* traversal variables in registers */ + int stackPtr = 0; + int nodeAddr = kernel_data.bvh.root; + + /* ray parameters in registers */ + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; + + const uint visibility = PATH_RAY_ALL_VISIBILITY; + +#if FEATURE(BVH_MOTION) + Transform ob_tfm; +#endif + + isect->t = ray->t; + isect->u = 0.0f; + isect->v = 0.0f; + isect->prim = PRIM_NONE; + isect->object = OBJECT_NONE; + +#if defined(__KERNEL_SSE2__) + const shuffle_swap_t shuf_identity = shuffle_swap_identity(); + const shuffle_swap_t shuf_swap = shuffle_swap_swap(); + + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); + ssef Psplat[3], idirsplat[3]; + shuffle_swap_t shufflexyz[3]; + + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); + + ssef tsplat(0.0f, 0.0f, -isect->t, -isect->t); + + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); +#endif + + /* traversal loop */ + do { + do { + /* traverse internal nodes */ + while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { + bool traverseChild0, traverseChild1; + int nodeAddrChild1; + +#if !defined(__KERNEL_SSE2__) + /* Intersect two child bounding boxes, non-SSE version */ + float t = isect->t; + + /* fetch node data */ + float4 node0 = 
kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0); + float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1); + float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2); + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3); + + /* intersect ray against child nodes */ + NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x; + NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x; + NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y; + NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y; + NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z; + NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z; + NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); + NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); + + NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x; + NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x; + NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y; + NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y; + NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z; + NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z; + NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); + NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); + + /* decide which nodes to traverse next */ +#ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? 
*/ + traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & visibility); + traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & visibility); +#else + traverseChild0 = (c0max >= c0min); + traverseChild1 = (c1max >= c1min); +#endif + +#else // __KERNEL_SSE2__ + /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ + + /* fetch node data */ + const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; + const float4 cnodes = ((float4*)bvh_nodes)[3]; + + /* intersect ray against child nodes */ + const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; + const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; + const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; + + /* calculate { c0min, c1min, -c0max, -c1max} */ + ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); + const ssef tminmax = minmax ^ pn; + + const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); + + /* decide which nodes to traverse next */ +#ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? 
*/ + traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility); + traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility); +#else + traverseChild0 = (movemask(lrhit) & 1); + traverseChild1 = (movemask(lrhit) & 2); +#endif +#endif // __KERNEL_SSE2__ + + nodeAddr = __float_as_int(cnodes.x); + nodeAddrChild1 = __float_as_int(cnodes.y); + + if(traverseChild0 && traverseChild1) { + /* both children were intersected, push the farther one */ +#if !defined(__KERNEL_SSE2__) + bool closestChild1 = (c1min < c0min); +#else + bool closestChild1 = tminmax[1] < tminmax[0]; +#endif + + if(closestChild1) { + int tmp = nodeAddr; + nodeAddr = nodeAddrChild1; + nodeAddrChild1 = tmp; + } + + ++stackPtr; + traversalStack[stackPtr] = nodeAddrChild1; + } + else { + /* one child was intersected */ + if(traverseChild1) { + nodeAddr = nodeAddrChild1; + } + else if(!traverseChild0) { + /* neither child was intersected */ + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + } + } + } + + /* if node is leaf, fetch triangle list */ + if(nodeAddr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_NODE_SIZE+(BVH_NODE_SIZE-1)); + int primAddr = __float_as_int(leaf.x); + +#if FEATURE(BVH_INSTANCING) + if(primAddr >= 0) { +#endif + int primAddr2 = __float_as_int(leaf.y); + + /* pop */ + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + + /* primitive intersection */ + for(; primAddr < primAddr2; primAddr++) { + /* only primitives from volume object */ + uint tri_object = (object == OBJECT_NONE)? 
kernel_tex_fetch(__prim_object, primAddr): object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + + if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + continue; + } + + /* intersect ray against primitive */ + uint type = kernel_tex_fetch(__prim_type, primAddr); + + switch(type & PRIMITIVE_ALL) { + case PRIMITIVE_TRIANGLE: { + triangle_intersect(kg, isect, P, dir, visibility, object, primAddr); + break; + } +#if FEATURE(BVH_MOTION) + case PRIMITIVE_MOTION_TRIANGLE: { + motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr); + break; + } +#endif +#if FEATURE(BVH_HAIR) + case PRIMITIVE_CURVE: + case PRIMITIVE_MOTION_CURVE: { + if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) + bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0); + else + bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0); + break; + } +#endif + default: { + break; + } + } + } + } +#if FEATURE(BVH_INSTANCING) + else { + /* instance push */ + object = kernel_tex_fetch(__prim_object, -primAddr-1); + int object_flag = kernel_tex_fetch(__object_flag, object); + + if(object_flag & SD_OBJECT_HAS_VOLUME) { + +#if FEATURE(BVH_MOTION) + bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm); +#else + bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t); +#endif + +#if defined(__KERNEL_SSE2__) + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); + + tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); + + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); +#endif + + ++stackPtr; + traversalStack[stackPtr] = ENTRYPOINT_SENTINEL; + + nodeAddr = kernel_tex_fetch(__object_node, object); + } + else { + /* pop */ + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + } + } + } +#endif + } while(nodeAddr != ENTRYPOINT_SENTINEL); + +#if FEATURE(BVH_INSTANCING) + if(stackPtr >= 0) { + 
kernel_assert(object != OBJECT_NONE); + + /* instance pop */ +#if FEATURE(BVH_MOTION) + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm); +#else + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t); +#endif + +#if defined(__KERNEL_SSE2__) + Psplat[0] = ssef(P.x); + Psplat[1] = ssef(P.y); + Psplat[2] = ssef(P.z); + + tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); + + gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); +#endif + + object = OBJECT_NONE; + nodeAddr = traversalStack[stackPtr]; + --stackPtr; + } +#endif + } while(nodeAddr != ENTRYPOINT_SENTINEL); + + return (isect->prim != PRIM_NONE); +} + +#undef FEATURE +#undef BVH_FUNCTION_NAME +#undef BVH_FUNCTION_FEATURES + diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h index e1d225436a6..b6d21c91916 100644 --- a/intern/cycles/kernel/geom/geom_curve.h +++ b/intern/cycles/kernel/geom/geom_curve.h @@ -214,9 +214,9 @@ ccl_device_inline void curvebounds(float *lower, float *upper, float *extremta, } #ifdef __KERNEL_SSE2__ -ccl_device_inline __m128 transform_point_T3(const __m128 t[3], const __m128 &a) +ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a) { - return fma(broadcast<0>(a), t[0], fma(broadcast<1>(a), t[1], _mm_mul_ps(broadcast<2>(a), t[2]))); + return madd(shuffle<0>(a), t[0], madd(shuffle<1>(a), t[1], shuffle<2>(a) * t[2])); } #endif @@ -238,16 +238,16 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect int prim = kernel_tex_fetch(__prim_index, curveAddr); #ifdef __KERNEL_SSE2__ - __m128 vdir = load_m128(dir); - __m128 vcurve_coef[4]; + ssef vdir = load4f(dir); + ssef vcurve_coef[4]; const float3 *curve_coef = (float3 *)vcurve_coef; { - __m128 dtmp = _mm_mul_ps(vdir, vdir); - __m128 d_ss = _mm_sqrt_ss(_mm_add_ss(dtmp, broadcast<2>(dtmp))); - __m128 rd_ss = _mm_div_ss(_mm_set_ss(1.0f), d_ss); + ssef dtmp = vdir * vdir; + ssef d_ss = 
mm_sqrt(dtmp + shuffle<2>(dtmp)); + ssef rd_ss = load1f_first(1.0f) / d_ss; - __m128i v00vec = _mm_load_si128((__m128i *)&kg->__curves.data[prim]); + ssei v00vec = load4i((ssei *)&kg->__curves.data[prim]); int2 &v00 = (int2 &)v00vec; int k0 = v00.x + segment; @@ -255,44 +255,44 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect int ka = max(k0 - 1, v00.x); int kb = min(k1 + 1, v00.x + v00.y - 1); - __m128 P_curve[4]; + ssef P_curve[4]; if(type & PRIMITIVE_CURVE) { - P_curve[0] = _mm_load_ps(&kg->__curve_keys.data[ka].x); - P_curve[1] = _mm_load_ps(&kg->__curve_keys.data[k0].x); - P_curve[2] = _mm_load_ps(&kg->__curve_keys.data[k1].x); - P_curve[3] = _mm_load_ps(&kg->__curve_keys.data[kb].x); + P_curve[0] = load4f(&kg->__curve_keys.data[ka].x); + P_curve[1] = load4f(&kg->__curve_keys.data[k0].x); + P_curve[2] = load4f(&kg->__curve_keys.data[k1].x); + P_curve[3] = load4f(&kg->__curve_keys.data[kb].x); } else { int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object; motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, (float4*)&P_curve); } - __m128 rd_sgn = set_sign_bit<0, 1, 1, 1>(broadcast<0>(rd_ss)); - __m128 mul_zxxy = _mm_mul_ps(shuffle<2, 0, 0, 1>(vdir), rd_sgn); - __m128 mul_yz = _mm_mul_ps(shuffle<1, 2, 1, 2>(vdir), mul_zxxy); - __m128 mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz); - __m128 vdir0 = _mm_and_ps(vdir, _mm_castsi128_ps(_mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0))); + ssef rd_sgn = set_sign_bit<0, 1, 1, 1>(shuffle<0>(rd_ss)); + ssef mul_zxxy = shuffle<2, 0, 0, 1>(vdir) * rd_sgn; + ssef mul_yz = shuffle<1, 2, 1, 2>(vdir) * mul_zxxy; + ssef mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz); + ssef vdir0 = vdir & cast(ssei(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0)); - __m128 htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0); - __m128 htfm1 = shuffle<1, 0, 1, 3>(_mm_set_ss(_mm_cvtss_f32(d_ss)), vdir0); - __m128 htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0); + ssef 
htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0); + ssef htfm1 = shuffle<1, 0, 1, 3>(load1f_first(extract<0>(d_ss)), vdir0); + ssef htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0); - __m128 htfm[] = { htfm0, htfm1, htfm2 }; - __m128 vP = load_m128(P); - __m128 p0 = transform_point_T3(htfm, _mm_sub_ps(P_curve[0], vP)); - __m128 p1 = transform_point_T3(htfm, _mm_sub_ps(P_curve[1], vP)); - __m128 p2 = transform_point_T3(htfm, _mm_sub_ps(P_curve[2], vP)); - __m128 p3 = transform_point_T3(htfm, _mm_sub_ps(P_curve[3], vP)); + ssef htfm[] = { htfm0, htfm1, htfm2 }; + ssef vP = load4f(P); + ssef p0 = transform_point_T3(htfm, P_curve[0] - vP); + ssef p1 = transform_point_T3(htfm, P_curve[1] - vP); + ssef p2 = transform_point_T3(htfm, P_curve[2] - vP); + ssef p3 = transform_point_T3(htfm, P_curve[3] - vP); float fc = 0.71f; - __m128 vfc = _mm_set1_ps(fc); - __m128 vfcxp3 = _mm_mul_ps(vfc, p3); + ssef vfc = ssef(fc); + ssef vfcxp3 = vfc * p3; vcurve_coef[0] = p1; - vcurve_coef[1] = _mm_mul_ps(vfc, _mm_sub_ps(p2, p0)); - vcurve_coef[2] = fma(_mm_set1_ps(fc * 2.0f), p0, fma(_mm_set1_ps(fc - 3.0f), p1, fms(_mm_set1_ps(3.0f - 2.0f * fc), p2, vfcxp3))); - vcurve_coef[3] = fms(_mm_set1_ps(fc - 2.0f), _mm_sub_ps(p2, p1), fms(vfc, p0, vfcxp3)); + vcurve_coef[1] = vfc * (p2 - p0); + vcurve_coef[2] = madd(ssef(fc * 2.0f), p0, madd(ssef(fc - 3.0f), p1, msub(ssef(3.0f - 2.0f * fc), p2, vfcxp3))); + vcurve_coef[3] = msub(ssef(fc - 2.0f), p2 - p1, msub(vfc, p0, vfcxp3)); r_st = ((float4 &)P_curve[1]).w; r_en = ((float4 &)P_curve[2]).w; @@ -386,12 +386,12 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect float i_st = tree * resol; float i_en = i_st + (level * resol); #ifdef __KERNEL_SSE2__ - __m128 vi_st = _mm_set1_ps(i_st), vi_en = _mm_set1_ps(i_en); - __m128 vp_st = fma(fma(fma(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]); - __m128 vp_en = fma(fma(fma(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]), vi_en, 
vcurve_coef[0]); + ssef vi_st = ssef(i_st), vi_en = ssef(i_en); + ssef vp_st = madd(madd(madd(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]); + ssef vp_en = madd(madd(madd(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]), vi_en, vcurve_coef[0]); - __m128 vbmin = _mm_min_ps(vp_st, vp_en); - __m128 vbmax = _mm_max_ps(vp_st, vp_en); + ssef vbmin = min(vp_st, vp_en); + ssef vbmax = max(vp_st, vp_en); float3 &bmin = (float3 &)vbmin, &bmax = (float3 &)vbmax; float &bminx = bmin.x, &bminy = bmin.y, &bminz = bmin.z; @@ -600,13 +600,12 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect #endif { /* record intersection */ + isect->t = t; + isect->u = u; + isect->v = gd; isect->prim = curveAddr; isect->object = object; isect->type = type; - isect->u = u; - isect->v = gd; - /*isect->transparency = 1.0f - coverage; */ - isect->t = t; hit = true; } @@ -679,38 +678,38 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec float sphere_b_tmp = dot3(dir, sphere_dif1); float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir; #else - __m128 P_curve[2]; + ssef P_curve[2]; if(type & PRIMITIVE_CURVE) { - P_curve[0] = _mm_load_ps(&kg->__curve_keys.data[k0].x); - P_curve[1] = _mm_load_ps(&kg->__curve_keys.data[k1].x); + P_curve[0] = load4f(&kg->__curve_keys.data[k0].x); + P_curve[1] = load4f(&kg->__curve_keys.data[k1].x); } else { int fobject = (object == OBJECT_NONE)? 
kernel_tex_fetch(__prim_object, curveAddr): object; motion_curve_keys(kg, fobject, prim, time, k0, k1, (float4*)&P_curve); } - const __m128 or12 = shuffle<3, 3, 3, 3>(P_curve[0], P_curve[1]); + const ssef or12 = shuffle<3, 3, 3, 3>(P_curve[0], P_curve[1]); - __m128 r12 = or12; - const __m128 vP = load_m128(P); - const __m128 dif = _mm_sub_ps(vP, P_curve[0]); - const __m128 dif_second = _mm_sub_ps(vP, P_curve[1]); + ssef r12 = or12; + const ssef vP = load4f(P); + const ssef dif = vP - P_curve[0]; + const ssef dif_second = vP - P_curve[1]; if(difl != 0.0f) { - const __m128 len1_sq = len3_squared_splat(dif); - const __m128 len2_sq = len3_squared_splat(dif_second); - const __m128 len12 = _mm_sqrt_ps(shuffle<0, 0, 0, 0>(len1_sq, len2_sq)); - const __m128 pixelsize12 = _mm_min_ps(_mm_mul_ps(len12, _mm_set1_ps(difl)), _mm_set1_ps(extmax)); - r12 = _mm_max_ps(or12, pixelsize12); + const ssef len1_sq = len3_squared_splat(dif); + const ssef len2_sq = len3_squared_splat(dif_second); + const ssef len12 = mm_sqrt(shuffle<0, 0, 0, 0>(len1_sq, len2_sq)); + const ssef pixelsize12 = min(len12 * difl, ssef(extmax)); + r12 = max(or12, pixelsize12); } - float or1 = _mm_cvtss_f32(or12), or2 = _mm_cvtss_f32(broadcast<2>(or12)); - float r1 = _mm_cvtss_f32(r12), r2 = _mm_cvtss_f32(broadcast<2>(r12)); - - const __m128 p21_diff = _mm_sub_ps(P_curve[1], P_curve[0]); - const __m128 sphere_dif1 = _mm_mul_ps(_mm_add_ps(dif, dif_second), _mm_set1_ps(0.5f)); - const __m128 dir = load_m128(direction); - const __m128 sphere_b_tmp = dot3_splat(dir, sphere_dif1); - const __m128 sphere_dif2 = fnma(sphere_b_tmp, dir, sphere_dif1); + float or1 = extract<0>(or12), or2 = extract<0>(shuffle<2>(or12)); + float r1 = extract<0>(r12), r2 = extract<0>(shuffle<2>(r12)); + + const ssef p21_diff = P_curve[1] - P_curve[0]; + const ssef sphere_dif1 = (dif + dif_second) * 0.5f; + const ssef dir = load4f(direction); + const ssef sphere_b_tmp = dot3_splat(dir, sphere_dif1); + const ssef sphere_dif2 = 
nmsub(sphere_b_tmp, dir, sphere_dif1); #endif float mr = max(r1, r2); @@ -728,7 +727,7 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec #ifndef __KERNEL_SSE2__ float3 tg = p21_diff * invl; #else - const __m128 tg = _mm_mul_ps(p21_diff, _mm_set1_ps(invl)); + const ssef tg = p21_diff * invl; #endif float gd = (r2 - r1) * invl; @@ -752,7 +751,7 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec float3 cprod = cross(tg, dir); float cprod2sq = len3_squared(cross(tg, dif)); #else - const __m128 cprod = cross(tg, dir); + const ssef cprod = cross(tg, dir); float cprod2sq = len3_squared(cross_zxy(tg, dif)); #endif float cprodsq = len3_squared(cprod); @@ -770,7 +769,7 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec #ifndef __KERNEL_SSE2__ float3 tdif = dif + tcentre * dir; #else - const __m128 tdif = fma(_mm_set1_ps(tcentre), dir, dif); + const ssef tdif = madd(ssef(tcentre), dir, dif); #endif float tdifz = dot3(tdif, tg); float tdifma = tdifz*gd + r1; @@ -836,13 +835,12 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec #endif { /* record intersection */ + isect->t = t; + isect->u = z*invl; + isect->v = gd; isect->prim = curveAddr; isect->object = object; isect->type = type; - isect->u = z*invl; - isect->v = gd; - /*isect->transparency = 1.0f - adjradius;*/ - isect->t = t; return true; } @@ -938,9 +936,10 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con sd->u = isect->u; sd->v = 0.0f; #endif - + + tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3])); + if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) { - tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3])); sd->Ng = normalize(-(D - tg * (dot(tg, D)))); } else { @@ -952,7 +951,6 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con float gd = isect->v; if(gd != 0.0f) { - tg = 
normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3])); sd->Ng = sd->Ng - gd * tg; sd->Ng = normalize(sd->Ng); } @@ -1012,10 +1010,6 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con sd->dPdv = cross(tg, sd->Ng); #endif - /*add fading parameter for minimum pixel width with transparency bsdf*/ - /*sd->curve_transparency = isect->transparency;*/ - /*sd->curve_radius = sd->u * gd * l + r1;*/ - if(isect->object != OBJECT_NONE) { #ifdef __OBJECT_MOTION__ Transform tfm = sd->ob_tfm; diff --git a/intern/cycles/kernel/geom/geom_motion_triangle.h b/intern/cycles/kernel/geom/geom_motion_triangle.h index 73338bb6b3b..3a4b20e61aa 100644 --- a/intern/cycles/kernel/geom/geom_motion_triangle.h +++ b/intern/cycles/kernel/geom/geom_motion_triangle.h @@ -233,8 +233,7 @@ ccl_device_inline float3 motion_triangle_refine_subsurface(KernelGlobals *kg, Sh ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, bool subsurface) { /* get shader */ - float4 Ns = kernel_tex_fetch(__tri_normal, sd->prim); - sd->shader = __float_as_int(Ns.w); + sd->shader = kernel_tex_fetch(__tri_shader, sd->prim); /* get motion info */ int numsteps, numverts; @@ -273,7 +272,11 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderD #endif /* compute face normal */ - float3 Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0])); + float3 Ng; + if(sd->flag & SD_NEGATIVE_SCALE_APPLIED) + Ng = normalize(cross(verts[2] - verts[0], verts[1] - verts[0])); + else + Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0])); sd->Ng = Ng; sd->N = Ng; @@ -327,14 +330,21 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, Intersection float t, u, v; if(ray_triangle_intersect_uv(P, dir, isect->t, verts[2], verts[0], verts[1], &u, &v, &t)) { - isect->prim = triAddr; - isect->object = object; - isect->type = PRIMITIVE_MOTION_TRIANGLE; - isect->u = u; - 
isect->v = v; - isect->t = t; +#ifdef __VISIBILITY_FLAG__ + /* visibility flag test. we do it here under the assumption + * that most triangles are culled by node flags */ + if(kernel_tex_fetch(__prim_visibility, triAddr) & visibility) +#endif + { + isect->t = t; + isect->u = u; + isect->v = v; + isect->prim = triAddr; + isect->object = object; + isect->type = PRIMITIVE_MOTION_TRIANGLE; - return true; + return true; + } } return false; @@ -378,12 +388,12 @@ ccl_device_inline void motion_triangle_intersect_subsurface(KernelGlobals *kg, I /* record intersection */ Intersection *isect = &isect_array[hit]; + isect->t = t; + isect->u = u; + isect->v = v; isect->prim = triAddr; isect->object = object; isect->type = PRIMITIVE_MOTION_TRIANGLE; - isect->u = u; - isect->v = v; - isect->t = t; } } #endif diff --git a/intern/cycles/kernel/geom/geom_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h index 533973621d7..5df6c75df86 100644 --- a/intern/cycles/kernel/geom/geom_primitive.h +++ b/intern/cycles/kernel/geom/geom_primitive.h @@ -143,6 +143,7 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd) /* center position */ float3 center; +#ifdef __HAIR__ if(sd->type & PRIMITIVE_ALL_CURVE) { center = curve_motion_center_location(kg, sd); @@ -150,6 +151,7 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd) object_position_transform(kg, sd, ¢er); } else +#endif center = sd->P; float3 motion_pre = center, motion_post = center; diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h index 355e36fef0c..c08a82ee038 100644 --- a/intern/cycles/kernel/geom/geom_triangle.h +++ b/intern/cycles/kernel/geom/geom_triangle.h @@ -18,7 +18,7 @@ /* Triangle Primitive * * Basic triangle with 3 vertices is used to represent mesh surfaces. 
For BVH - * ray intersection we use a precomputed triangle storage to accelarate + * ray intersection we use a precomputed triangle storage to accelerate * intersection at the cost of more memory usage */ CCL_NAMESPACE_BEGIN @@ -116,11 +116,28 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg, ShaderDat #endif } +/* normal on triangle */ +ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd) +{ + /* load triangle vertices */ + float4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); + + float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x))); + float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y))); + float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z))); + + /* return normal */ + if(sd->flag & SD_NEGATIVE_SCALE_APPLIED) + return normalize(cross(v2 - v0, v1 - v0)); + else + return normalize(cross(v1 - v0, v2 - v0)); +} + /* point and normal on triangle */ -ccl_device_inline void triangle_point_normal(KernelGlobals *kg, int prim, float u, float v, float3 *P, float3 *Ng, int *shader) +ccl_device_inline void triangle_point_normal(KernelGlobals *kg, int object, int prim, float u, float v, float3 *P, float3 *Ng, int *shader) { /* load triangle vertices */ - float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim)); + float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x))); float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y))); @@ -130,16 +147,24 @@ ccl_device_inline void triangle_point_normal(KernelGlobals *kg, int prim, float float t = 1.0f - u - v; *P = (u*v0 + v*v1 + t*v2); - float4 Nm = kernel_tex_fetch(__tri_normal, prim); - *Ng = make_float3(Nm.x, Nm.y, Nm.z); - *shader = __float_as_int(Nm.w); + /* get object flags, instance-aware */ + int object_flag = 
kernel_tex_fetch(__object_flag, object >= 0 ? object : ~object); + + /* compute normal */ + if(object_flag & SD_NEGATIVE_SCALE_APPLIED) + *Ng = normalize(cross(v2 - v0, v1 - v0)); + else + *Ng = normalize(cross(v1 - v0, v2 - v0)); + + /* shader`*/ + *shader = kernel_tex_fetch(__tri_shader, prim); } /* Triangle vertex locations */ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3]) { - float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim)); + float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); P[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x))); P[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y))); @@ -151,7 +176,7 @@ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3 ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, float u, float v) { /* load triangle vertices */ - float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim)); + float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); float3 n0 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.x))); float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.y))); @@ -165,7 +190,7 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, int prim, float3 *dPdu, float3 *dPdv) { /* fetch triangle vertex coordinates */ - float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim)); + float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); float3 p0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x))); float3 p1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y))); @@ -187,7 +212,7 @@ ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *s return kernel_tex_fetch(__attributes_float, offset + 
sd->prim); } else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) { - float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim)); + float4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); float f0 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.x)); float f1 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.y)); @@ -230,7 +255,7 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + sd->prim)); } else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) { - float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim)); + float4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.x))); float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.y))); @@ -243,11 +268,20 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; } - else if(elem == ATTR_ELEMENT_CORNER) { + else if(elem == ATTR_ELEMENT_CORNER || elem == ATTR_ELEMENT_CORNER_BYTE) { int tri = offset + sd->prim*3; - float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 0)); - float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 1)); - float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 2)); + float3 f0, f1, f2; + + if(elem == ATTR_ELEMENT_CORNER) { + f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 0)); + f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 1)); + f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 2)); + } + else { + f0 = color_byte_to_float(kernel_tex_fetch(__attributes_uchar4, tri + 0)); + f1 = 
color_byte_to_float(kernel_tex_fetch(__attributes_uchar4, tri + 1)); + f2 = color_byte_to_float(kernel_tex_fetch(__attributes_uchar4, tri + 2)); + } #ifdef __RAY_DIFFERENTIALS__ if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; @@ -300,12 +334,12 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg, Intersection *isect #endif { /* record intersection */ + isect->t = t; + isect->u = u; + isect->v = v; isect->prim = triAddr; isect->object = object; isect->type = PRIMITIVE_TRIANGLE; - isect->u = u; - isect->v = v; - isect->t = t; return true; } } @@ -363,12 +397,12 @@ ccl_device_inline void triangle_intersect_subsurface(KernelGlobals *kg, Intersec /* record intersection */ Intersection *isect = &isect_array[hit]; + isect->t = t; + isect->u = u; + isect->v = v; isect->prim = triAddr; isect->object = object; isect->type = PRIMITIVE_TRIANGLE; - isect->u = u; - isect->v = v; - isect->t = t; } } } diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h index 963d6cbee9c..3cb6d168f80 100644 --- a/intern/cycles/kernel/geom/geom_volume.h +++ b/intern/cycles/kernel/geom/geom_volume.h @@ -49,7 +49,15 @@ ccl_device float3 volume_normalized_position(KernelGlobals *kg, const ShaderData ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int id, float *dx, float *dy) { float3 P = volume_normalized_position(kg, sd, sd->P); - float4 r = kernel_tex_image_interp_3d(id, P.x, P.y, P.z); +#ifdef __KERNEL_GPU__ + float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f); +#else + float4 r; + if(sd->flag & SD_VOLUME_CUBIC) + r = kernel_tex_image_interp_3d_ex(id, P.x, P.y, P.z, INTERPOLATION_CUBIC); + else + r = kernel_tex_image_interp_3d(id, P.x, P.y, P.z); +#endif if(dx) *dx = 0.0f; if(dx) *dy = 0.0f; @@ -61,7 +69,15 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd, ccl_device float3 volume_attribute_float3(KernelGlobals *kg, const ShaderData *sd, 
AttributeElement elem, int id, float3 *dx, float3 *dy) { float3 P = volume_normalized_position(kg, sd, sd->P); - float4 r = kernel_tex_image_interp_3d(id, P.x, P.y, P.z); +#ifdef __KERNEL_GPU__ + float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f); +#else + float4 r; + if(sd->flag & SD_VOLUME_CUBIC) + r = kernel_tex_image_interp_3d_ex(id, P.x, P.y, P.z, INTERPOLATION_CUBIC); + else + r = kernel_tex_image_interp_3d(id, P.x, P.y, P.z); +#endif if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f); if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f); diff --git a/intern/cycles/kernel/kernel.cl b/intern/cycles/kernel/kernel.cl index 6988ad6027f..4f20ef9ca15 100644 --- a/intern/cycles/kernel/kernel.cl +++ b/intern/cycles/kernel/kernel.cl @@ -23,7 +23,7 @@ #include "kernel_film.h" #include "kernel_path.h" -#include "kernel_displace.h" +#include "kernel_bake.h" __kernel void kernel_ocl_path_trace( ccl_constant KernelData *data, @@ -115,7 +115,7 @@ __kernel void kernel_ocl_shader( ccl_global type *name, #include "kernel_textures.h" - int type, int sx, int sw) + int type, int sx, int sw, int offset, int sample) { KernelGlobals kglobals, *kg = &kglobals; @@ -128,6 +128,31 @@ __kernel void kernel_ocl_shader( int x = sx + get_global_id(0); if(x < sx + sw) - kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x); + kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x, sample); +} + +__kernel void kernel_ocl_bake( + ccl_constant KernelData *data, + ccl_global uint4 *input, + ccl_global float4 *output, + +#define KERNEL_TEX(type, ttype, name) \ + ccl_global type *name, +#include "kernel_textures.h" + + int type, int sx, int sw, int offset, int sample) +{ + KernelGlobals kglobals, *kg = &kglobals; + + kg->data = data; + +#define KERNEL_TEX(type, ttype, name) \ + kg->name = name; +#include "kernel_textures.h" + + int x = sx + get_global_id(0); + + if(x < sx + sw) + kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x, offset, sample); } diff --git 
a/intern/cycles/kernel/kernel.cpp b/intern/cycles/kernel/kernel.cpp index 173028d50c8..fa2113fbb46 100644 --- a/intern/cycles/kernel/kernel.cpp +++ b/intern/cycles/kernel/kernel.cpp @@ -23,7 +23,7 @@ #include "kernel_globals.h" #include "kernel_film.h" #include "kernel_path.h" -#include "kernel_displace.h" +#include "kernel_bake.h" CCL_NAMESPACE_BEGIN @@ -120,9 +120,12 @@ void kernel_cpu_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *bu /* Shader Evaluation */ -void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i) +void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample) { - kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i); + if(type >= SHADER_EVAL_BAKE) + kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample); + else + kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel.cu b/intern/cycles/kernel/kernel.cu index 636e48b5456..489daacddde 100644 --- a/intern/cycles/kernel/kernel.cu +++ b/intern/cycles/kernel/kernel.cu @@ -22,7 +22,7 @@ #include "kernel_globals.h" #include "kernel_film.h" #include "kernel_path.h" -#include "kernel_displace.h" +#include "kernel_bake.h" /* device data taken from CUDA occupancy calculator */ @@ -52,8 +52,20 @@ #define CUDA_KERNEL_MAX_REGISTERS 63 #define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 -/* 5.0 */ -#elif __CUDA_ARCH__ == 500 +/* 3.2 */ +#elif __CUDA_ARCH__ == 320 +#define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768 +#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 +#define CUDA_BLOCK_MAX_THREADS 1024 +#define CUDA_THREAD_MAX_REGISTERS 63 + +/* tunable parameters */ +#define CUDA_THREADS_BLOCK_WIDTH 16 +#define CUDA_KERNEL_MAX_REGISTERS 63 +#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 + +/* 5.0 and 5.2 */ +#elif __CUDA_ARCH__ == 500 || __CUDA_ARCH__ == 520 #define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 #define 
CUDA_MULTIPROCESSOR_MAX_BLOCKS 32 #define CUDA_BLOCK_MAX_THREADS 1024 @@ -61,12 +73,12 @@ /* tunable parameters */ #define CUDA_THREADS_BLOCK_WIDTH 16 -#define CUDA_KERNEL_MAX_REGISTERS 63 +#define CUDA_KERNEL_MAX_REGISTERS 40 #define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 /* unknown architecture */ #else -#error "Unknown or unuspported CUDA architecture, can't determine launch bounds" +#error "Unknown or unsupported CUDA architecture, can't determine launch bounds" #endif /* compute number of threads per block and minimum blocks per multiprocessor @@ -146,11 +158,22 @@ kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scal extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx) +kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx, int sw, int offset, int sample) +{ + int x = sx + blockDim.x*blockIdx.x + threadIdx.x; + + if(x < sx + sw) + kernel_shader_evaluate(NULL, input, output, (ShaderEvalType)type, x, sample); +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_bake(uint4 *input, float4 *output, int type, int sx, int sw, int offset, int sample) { int x = sx + blockDim.x*blockIdx.x + threadIdx.x; - kernel_shader_evaluate(NULL, input, output, (ShaderEvalType)type, x); + if(x < sx + sw) + kernel_bake_evaluate(NULL, input, output, (ShaderEvalType)type, x, offset, sample); } #endif diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h index c4a08646bab..19e06b88797 100644 --- a/intern/cycles/kernel/kernel.h +++ b/intern/cycles/kernel/kernel.h @@ -41,7 +41,7 @@ void kernel_cpu_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, void kernel_cpu_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride); void kernel_cpu_shader(KernelGlobals *kg, uint4 
*input, float4 *output, - int type, int i); + int type, int i, int offset, int sample); #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 void kernel_cpu_sse2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, @@ -51,7 +51,7 @@ void kernel_cpu_sse2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buf void kernel_cpu_sse2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride); void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, - int type, int i); + int type, int i, int offset, int sample); #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 @@ -62,7 +62,7 @@ void kernel_cpu_sse3_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buf void kernel_cpu_sse3_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride); void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, - int type, int i); + int type, int i, int offset, int sample); #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 @@ -73,7 +73,7 @@ void kernel_cpu_sse41_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *bu void kernel_cpu_sse41_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride); void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output, - int type, int i); + int type, int i, int offset, int sample); #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX @@ -84,7 +84,18 @@ void kernel_cpu_avx_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buff void kernel_cpu_avx_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride); void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output, - int type, int i); + int type, int i, int offset, int sample); +#endif + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 +void 
kernel_cpu_avx2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, + int sample, int x, int y, int offset, int stride); +void kernel_cpu_avx2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, + float sample_scale, int x, int y, int offset, int stride); +void kernel_cpu_avx2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, + float sample_scale, int x, int y, int offset, int stride); +void kernel_cpu_avx2_shader(KernelGlobals *kg, uint4 *input, float4 *output, + int type, int i, int offset, int sample); #endif CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h index b4f6dcdace9..b0efcdc66a7 100644 --- a/intern/cycles/kernel/kernel_accumulate.h +++ b/intern/cycles/kernel/kernel_accumulate.h @@ -32,10 +32,11 @@ ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 v eval->transmission = make_float3(0.0f, 0.0f, 0.0f); eval->transparent = make_float3(0.0f, 0.0f, 0.0f); eval->subsurface = make_float3(0.0f, 0.0f, 0.0f); + eval->scatter = make_float3(0.0f, 0.0f, 0.0f); if(type == CLOSURE_BSDF_TRANSPARENT_ID) eval->transparent = value; - else if(CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_PHASE(type)) + else if(CLOSURE_IS_BSDF_DIFFUSE(type)) eval->diffuse = value; else if(CLOSURE_IS_BSDF_GLOSSY(type)) eval->glossy = value; @@ -43,6 +44,8 @@ ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 v eval->transmission = value; else if(CLOSURE_IS_BSDF_BSSRDF(type)) eval->subsurface = value; + else if(CLOSURE_IS_PHASE(type)) + eval->scatter = value; } else eval->diffuse = value; @@ -51,11 +54,17 @@ ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 v #endif } -ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value) +/* TODO(sergey): This is just a workaround for annoying 6.5 compiler bug. 
*/ +#if !defined(__KERNEL_CUDA__) || __CUDA_ARCH__ < 500 +ccl_device_inline +#else +ccl_device_noinline +#endif +void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value) { #ifdef __PASSES__ if(eval->use_light_pass) { - if(CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_PHASE(type)) + if(CLOSURE_IS_BSDF_DIFFUSE(type)) eval->diffuse += value; else if(CLOSURE_IS_BSDF_GLOSSY(type)) eval->glossy += value; @@ -63,6 +72,8 @@ ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 eval->transmission += value; else if(CLOSURE_IS_BSDF_BSSRDF(type)) eval->subsurface += value; + else if(CLOSURE_IS_PHASE(type)) + eval->scatter += value; /* skipping transparent, this function is used by for eval(), will be zero then */ } @@ -81,7 +92,8 @@ ccl_device_inline bool bsdf_eval_is_zero(BsdfEval *eval) && is_zero(eval->glossy) && is_zero(eval->transmission) && is_zero(eval->transparent) - && is_zero(eval->subsurface); + && is_zero(eval->subsurface) + && is_zero(eval->scatter); } else return is_zero(eval->diffuse); @@ -98,6 +110,7 @@ ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float3 value) eval->glossy *= value; eval->transmission *= value; eval->subsurface *= value; + eval->scatter *= value; /* skipping transparent, this function is used by for eval(), will be zero then */ } @@ -111,7 +124,7 @@ ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float3 value) /* Path Radiance * * We accumulate different render passes separately. After summing at the end - * to get the combined result, it should be identical. We definte directly + * to get the combined result, it should be identical. We definite directly * visible as the first non-transparent hit, while indirectly visible are the * bounces after that. 
*/ @@ -130,21 +143,25 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass) L->color_glossy = make_float3(0.0f, 0.0f, 0.0f); L->color_transmission = make_float3(0.0f, 0.0f, 0.0f); L->color_subsurface = make_float3(0.0f, 0.0f, 0.0f); + L->color_scatter = make_float3(0.0f, 0.0f, 0.0f); L->direct_diffuse = make_float3(0.0f, 0.0f, 0.0f); L->direct_glossy = make_float3(0.0f, 0.0f, 0.0f); L->direct_transmission = make_float3(0.0f, 0.0f, 0.0f); L->direct_subsurface = make_float3(0.0f, 0.0f, 0.0f); + L->direct_scatter = make_float3(0.0f, 0.0f, 0.0f); L->indirect_diffuse = make_float3(0.0f, 0.0f, 0.0f); L->indirect_glossy = make_float3(0.0f, 0.0f, 0.0f); L->indirect_transmission = make_float3(0.0f, 0.0f, 0.0f); L->indirect_subsurface = make_float3(0.0f, 0.0f, 0.0f); + L->indirect_scatter = make_float3(0.0f, 0.0f, 0.0f); L->path_diffuse = make_float3(0.0f, 0.0f, 0.0f); L->path_glossy = make_float3(0.0f, 0.0f, 0.0f); L->path_transmission = make_float3(0.0f, 0.0f, 0.0f); L->path_subsurface = make_float3(0.0f, 0.0f, 0.0f); + L->path_scatter = make_float3(0.0f, 0.0f, 0.0f); L->emission = make_float3(0.0f, 0.0f, 0.0f); L->background = make_float3(0.0f, 0.0f, 0.0f); @@ -174,14 +191,16 @@ ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, float3 *throug L->path_glossy = bsdf_eval->glossy*value; L->path_transmission = bsdf_eval->transmission*value; L->path_subsurface = bsdf_eval->subsurface*value; + L->path_scatter = bsdf_eval->scatter*value; - *throughput = L->path_diffuse + L->path_glossy + L->path_transmission + L->path_subsurface; + *throughput = L->path_diffuse + L->path_glossy + L->path_transmission + L->path_subsurface + L->path_scatter; L->direct_throughput = *throughput; } else { /* transparent bounce before first hit, or indirectly visible through BSDF */ - float3 sum = (bsdf_eval->diffuse + bsdf_eval->glossy + bsdf_eval->transmission + bsdf_eval->transparent + bsdf_eval->subsurface)*inverse_pdf; + float3 sum = (bsdf_eval->diffuse 
+ bsdf_eval->glossy + bsdf_eval->transmission + bsdf_eval->transparent + + bsdf_eval->subsurface + bsdf_eval->scatter) * inverse_pdf; *throughput *= sum; } } @@ -241,6 +260,7 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through L->direct_glossy += throughput*bsdf_eval->glossy*shadow; L->direct_transmission += throughput*bsdf_eval->transmission*shadow; L->direct_subsurface += throughput*bsdf_eval->subsurface*shadow; + L->direct_scatter += throughput*bsdf_eval->scatter*shadow; if(is_lamp) { L->shadow.x += shadow.x*shadow_fac; @@ -250,7 +270,7 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through } else { /* indirectly visible lighting after BSDF bounce */ - float3 sum = bsdf_eval->diffuse + bsdf_eval->glossy + bsdf_eval->transmission + bsdf_eval->subsurface; + float3 sum = bsdf_eval->diffuse + bsdf_eval->glossy + bsdf_eval->transmission + bsdf_eval->subsurface + bsdf_eval->scatter; L->indirect += throughput*sum*shadow; } } @@ -291,12 +311,14 @@ ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L) L->direct_glossy += L->path_glossy*L->direct_emission; L->direct_transmission += L->path_transmission*L->direct_emission; L->direct_subsurface += L->path_subsurface*L->direct_emission; + L->direct_scatter += L->path_scatter*L->direct_emission; L->indirect = safe_divide_color(L->indirect, L->direct_throughput); L->indirect_diffuse += L->path_diffuse*L->indirect; L->indirect_glossy += L->path_glossy*L->indirect; L->indirect_transmission += L->path_transmission*L->indirect; L->indirect_subsurface += L->path_subsurface*L->indirect; + L->indirect_scatter += L->path_scatter*L->indirect; } #endif } @@ -309,6 +331,7 @@ ccl_device_inline void path_radiance_reset_indirect(PathRadiance *L) L->path_glossy = make_float3(0.0f, 0.0f, 0.0f); L->path_transmission = make_float3(0.0f, 0.0f, 0.0f); L->path_subsurface = make_float3(0.0f, 0.0f, 0.0f); + L->path_scatter = make_float3(0.0f, 0.0f, 0.0f); L->direct_emission = 
make_float3(0.0f, 0.0f, 0.0f); L->indirect = make_float3(0.0f, 0.0f, 0.0f); @@ -327,8 +350,8 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi if(L->use_light_pass) { path_radiance_sum_indirect(L); - L_direct = L->direct_diffuse + L->direct_glossy + L->direct_transmission + L->direct_subsurface + L->emission; - L_indirect = L->indirect_diffuse + L->indirect_glossy + L->indirect_transmission + L->indirect_subsurface; + L_direct = L->direct_diffuse + L->direct_glossy + L->direct_transmission + L->direct_subsurface + L->direct_scatter + L->emission; + L_indirect = L->indirect_diffuse + L->indirect_glossy + L->indirect_transmission + L->indirect_subsurface + L->indirect_scatter; if(!kernel_data.background.transparent) L_direct += L->background; @@ -344,11 +367,13 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi L->direct_glossy = make_float3(0.0f, 0.0f, 0.0f); L->direct_transmission = make_float3(0.0f, 0.0f, 0.0f); L->direct_subsurface = make_float3(0.0f, 0.0f, 0.0f); + L->direct_scatter = make_float3(0.0f, 0.0f, 0.0f); L->indirect_diffuse = make_float3(0.0f, 0.0f, 0.0f); L->indirect_glossy = make_float3(0.0f, 0.0f, 0.0f); L->indirect_transmission = make_float3(0.0f, 0.0f, 0.0f); L->indirect_subsurface = make_float3(0.0f, 0.0f, 0.0f); + L->indirect_scatter = make_float3(0.0f, 0.0f, 0.0f); L->emission = make_float3(0.0f, 0.0f, 0.0f); } @@ -368,6 +393,7 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi L->direct_glossy *= scale; L->direct_transmission *= scale; L->direct_subsurface *= scale; + L->direct_scatter *= scale; L->emission *= scale; L->background *= scale; } @@ -382,6 +408,7 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi L->indirect_glossy *= scale; L->indirect_transmission *= scale; L->indirect_subsurface *= scale; + L->indirect_scatter *= scale; } /* Sum again, after clamping */ @@ -416,11 +443,13 @@ ccl_device_inline 
void path_radiance_accum_sample(PathRadiance *L, PathRadiance L->direct_glossy += L_sample->direct_glossy*fac; L->direct_transmission += L_sample->direct_transmission*fac; L->direct_subsurface += L_sample->direct_subsurface*fac; + L->direct_scatter += L_sample->direct_scatter*fac; L->indirect_diffuse += L_sample->indirect_diffuse*fac; L->indirect_glossy += L_sample->indirect_glossy*fac; L->indirect_transmission += L_sample->indirect_transmission*fac; L->indirect_subsurface += L_sample->indirect_subsurface*fac; + L->indirect_scatter += L_sample->indirect_scatter*fac; L->emission += L_sample->emission*fac; L->background += L_sample->background*fac; diff --git a/intern/cycles/kernel/kernel_avx.cpp b/intern/cycles/kernel/kernel_avx.cpp index 354214c406e..e7ff21a6f09 100644 --- a/intern/cycles/kernel/kernel_avx.cpp +++ b/intern/cycles/kernel/kernel_avx.cpp @@ -24,6 +24,7 @@ #define __KERNEL_SSE3__ #define __KERNEL_SSSE3__ #define __KERNEL_SSE41__ +#define __KERNEL_AVX__ #endif #include "util_optimization.h" @@ -37,7 +38,7 @@ #include "kernel_globals.h" #include "kernel_film.h" #include "kernel_path.h" -#include "kernel_displace.h" +#include "kernel_bake.h" CCL_NAMESPACE_BEGIN @@ -67,9 +68,12 @@ void kernel_cpu_avx_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float /* Shader Evaluate */ -void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i) +void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample) { - kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i); + if(type >= SHADER_EVAL_BAKE) + kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample); + else + kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_avx2.cpp b/intern/cycles/kernel/kernel_avx2.cpp new file mode 100644 index 00000000000..cb1662bbfbe --- /dev/null +++ 
b/intern/cycles/kernel/kernel_avx2.cpp @@ -0,0 +1,87 @@ +/* + * Copyright 2011-2014 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +#define __KERNEL_SSE2__ +#define __KERNEL_SSE3__ +#define __KERNEL_SSSE3__ +#define __KERNEL_SSE41__ +#define __KERNEL_AVX__ +#define __KERNEL_AVX2__ +#endif + +#include "util_optimization.h" + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + +#include "kernel.h" +#include "kernel_compat_cpu.h" +#include "kernel_math.h" +#include "kernel_types.h" +#include "kernel_globals.h" +#include "kernel_film.h" +#include "kernel_path.h" +#include "kernel_bake.h" + +CCL_NAMESPACE_BEGIN + +/* Path Tracing */ + +void kernel_cpu_avx2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride) +{ +#ifdef __BRANCHED_PATH__ + if(kernel_data.integrator.branched) + kernel_branched_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); + else +#endif + kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); +} + +/* Film */ + +void kernel_cpu_avx2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float 
sample_scale, int x, int y, int offset, int stride) +{ + kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride); +} + +void kernel_cpu_avx2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride) +{ + kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride); +} + +/* Shader Evaluate */ + +void kernel_cpu_avx2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample) +{ + if(type >= SHADER_EVAL_BAKE) + kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample); + else + kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample); +} + +CCL_NAMESPACE_END +#else + +/* needed for some linkers in combination with scons making empty compilation unit in a library */ +void __dummy_function_cycles_avx2(void); +void __dummy_function_cycles_avx2(void) {} + +#endif diff --git a/intern/cycles/kernel/kernel_displace.h b/intern/cycles/kernel/kernel_bake.h index b8c64af658f..a1ec080e3d3 100644 --- a/intern/cycles/kernel/kernel_displace.h +++ b/intern/cycles/kernel/kernel_bake.h @@ -17,65 +17,125 @@ CCL_NAMESPACE_BEGIN ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, RNG rng, - bool is_combined, bool is_ao, bool is_sss) + const bool is_combined, const bool is_ao, const bool is_sss, int sample) { - int samples = kernel_data.integrator.aa_samples; - /* initialize master radiance accumulator */ kernel_assert(kernel_data.film.use_light_pass); path_radiance_init(L, kernel_data.film.use_light_pass); - /* take multiple samples */ - for(int sample = 0; sample < samples; sample++) { - PathRadiance L_sample; - PathState state; - Ray ray; - float3 throughput = make_float3(1.0f, 1.0f, 1.0f); + PathRadiance L_sample; + PathState state; + Ray ray; + float3 throughput = make_float3(1.0f, 1.0f, 1.0f); + bool is_sss_sample = is_sss; - /* init radiance */ - 
path_radiance_init(&L_sample, kernel_data.film.use_light_pass); + /* init radiance */ + path_radiance_init(&L_sample, kernel_data.film.use_light_pass); - /* init path state */ - path_state_init(kg, &state, &rng, sample); - state.num_samples = samples; + /* init path state */ + path_state_init(kg, &state, &rng, sample, NULL); - /* evaluate surface shader */ - float rbsdf = path_state_rng_1D(kg, &rng, &state, PRNG_BSDF); - shader_eval_surface(kg, sd, rbsdf, state.flag, SHADER_CONTEXT_MAIN); + /* evaluate surface shader */ + float rbsdf = path_state_rng_1D(kg, &rng, &state, PRNG_BSDF); + shader_eval_surface(kg, sd, rbsdf, state.flag, SHADER_CONTEXT_MAIN); - /* TODO, disable the closures we won't need */ + /* TODO, disable the closures we won't need */ + +#ifdef __BRANCHED_PATH__ + if(!kernel_data.integrator.branched) { + /* regular path tracer */ +#endif /* sample ambient occlusion */ if(is_combined || is_ao) { kernel_path_ao(kg, sd, &L_sample, &state, &rng, throughput); } - /* sample subsurface scattering */ - if((is_combined || is_sss) && (sd->flag & SD_BSSRDF)) { #ifdef __SUBSURFACE__ + /* sample subsurface scattering */ + if((is_combined || is_sss_sample) && (sd->flag & SD_BSSRDF)) { /* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */ if (kernel_path_subsurface_scatter(kg, sd, &L_sample, &state, &rng, &ray, &throughput)) - is_sss = true; -#endif + is_sss_sample = true; } +#endif /* sample light and BSDF */ - if((!is_sss) && (!is_ao)) { - if(kernel_path_integrate_lighting(kg, &rng, sd, &throughput, &state, &L_sample, &ray)) { + if((!is_sss_sample) && (!is_ao)) { + + if(sd->flag & SD_EMISSION) { + float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf); + path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce); + } + + kernel_path_surface_connect_light(kg, &rng, sd, throughput, &state, &L_sample); + + if(kernel_path_surface_bounce(kg, &rng, sd, &throughput, &state, 
&L_sample, &ray)) { #ifdef __LAMP_MIS__ state.ray_t = 0.0f; #endif /* compute indirect light */ - kernel_path_indirect(kg, &rng, ray, throughput, state.num_samples, state, &L_sample); + kernel_path_indirect(kg, &rng, ray, throughput, 1, state, &L_sample); /* sum and reset indirect light pass variables for the next samples */ path_radiance_sum_indirect(&L_sample); path_radiance_reset_indirect(&L_sample); } } +#ifdef __BRANCHED_PATH__ + } + else { + /* branched path tracer */ + + /* sample ambient occlusion */ + if(is_combined || is_ao) { + kernel_branched_path_ao(kg, sd, &L_sample, &state, &rng, throughput); + } + +#ifdef __SUBSURFACE__ + /* sample subsurface scattering */ + if((is_combined || is_sss_sample) && (sd->flag & SD_BSSRDF)) { + /* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */ + kernel_branched_path_subsurface_scatter(kg, sd, &L_sample, &state, &rng, &ray, throughput); + } +#endif + + /* sample light and BSDF */ + if((!is_sss_sample) && (!is_ao)) { + + if(sd->flag & SD_EMISSION) { + float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf); + path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce); + } + +#if defined(__EMISSION__) + /* direct light */ + if(kernel_data.integrator.use_direct_light) { + bool all = kernel_data.integrator.sample_all_lights_direct; + kernel_branched_path_surface_connect_light(kg, &rng, + sd, &state, throughput, 1.0f, &L_sample, all); + } +#endif + + /* indirect light */ + kernel_branched_path_surface_indirect_light(kg, &rng, + sd, throughput, 1.0f, &state, &L_sample); + } + } +#endif + + /* accumulate into master L */ + path_radiance_accum_sample(L, &L_sample, 1); +} - /* accumulate into master L */ - path_radiance_accum_sample(L, &L_sample, samples); +ccl_device bool is_aa_pass(ShaderEvalType type) +{ + switch(type) { + case SHADER_EVAL_UV: + case SHADER_EVAL_NORMAL: + return false; + default: + return true; } } @@ -99,7 +159,21 
@@ ccl_device bool is_light_pass(ShaderEvalType type) } } -ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i) +#if 0 +ccl_device_inline float bake_clamp_mirror_repeat(float u) +{ + /* use mirror repeat (like opengl texture) so that if the barycentric + * coordinate goes past the end of the triangle it is not always clamped + * to the same value, gives ugly patterns */ + float fu = floorf(u); + u = u - fu; + + return (((int)fu) & 1)? 1.0f - u: u; +} +#endif + +ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, + ShaderEvalType type, int i, int offset, int sample) { ShaderData sd; uint4 in = input[i * 2]; @@ -121,10 +195,28 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, float dvdx = __uint_as_float(diff.z); float dvdy = __uint_as_float(diff.w); + int num_samples = kernel_data.integrator.aa_samples; + + /* random number generator */ + RNG rng = cmj_hash(offset + i, 0); + +#if 0 + uint rng_state = cmj_hash(i, 0); + float filter_x, filter_y; + path_rng_init(kg, &rng_state, sample, num_samples, &rng, 0, 0, &filter_x, &filter_y); + + /* subpixel u/v offset */ + if(sample > 0) { + u = bake_clamp_mirror_repeat(u + dudx*(filter_x - 0.5f) + dudy*(filter_y - 0.5f)); + v = bake_clamp_mirror_repeat(v + dvdx*(filter_x - 0.5f) + dvdy*(filter_y - 0.5f)); + } +#endif + + /* triangle */ int shader; float3 P, Ng; - triangle_point_normal(kg, prim, u, v, &P, &Ng, &shader); + triangle_point_normal(kg, object, prim, u, v, &P, &Ng, &shader); /* dummy initilizations copied from SHADER_EVAL_DISPLACE */ float3 I = Ng; @@ -147,12 +239,14 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, sd.dv.dx = dvdx; sd.dv.dy = dvdy; + /* light passes */ if(is_light_pass(type)) { - RNG rng = cmj_hash(i, 0); - compute_light_pass(kg, &sd, &L, rng, (type == SHADER_EVAL_COMBINED), - (type == 
SHADER_EVAL_AO), - (type == SHADER_EVAL_SUBSURFACE_DIRECT || - type == SHADER_EVAL_SUBSURFACE_INDIRECT)); + compute_light_pass(kg, &sd, &L, rng, + (type == SHADER_EVAL_COMBINED), + (type == SHADER_EVAL_AO), + (type == SHADER_EVAL_SUBSURFACE_DIRECT || + type == SHADER_EVAL_SUBSURFACE_INDIRECT), + sample); } switch (type) { @@ -307,17 +401,16 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, } /* write output */ - output[i] = make_float4(out.x, out.y, out.z, 1.0f); - return; + float output_fac = is_aa_pass(type)? 1.0f/num_samples: 1.0f; + + if(sample == 0) + output[i] = make_float4(out.x, out.y, out.z, 1.0f) * output_fac; + else + output[i] += make_float4(out.x, out.y, out.z, 1.0f) * output_fac; } -ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i) +ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i, int sample) { - if(type >= SHADER_EVAL_BAKE) { - kernel_bake_evaluate(kg, input, output, type, i); - return; - } - ShaderData sd; uint4 in = input[i]; float3 out; @@ -363,7 +456,10 @@ ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *inpu } /* write output */ - output[i] = make_float4(out.x, out.y, out.z, 0.0f); + if(sample == 0) + output[i] = make_float4(out.x, out.y, out.z, 0.0f); + else + output[i] += make_float4(out.x, out.y, out.z, 0.0f); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h index 7fc66a9fdee..5c83358a56d 100644 --- a/intern/cycles/kernel/kernel_camera.h +++ b/intern/cycles/kernel/kernel_camera.h @@ -21,16 +21,22 @@ CCL_NAMESPACE_BEGIN ccl_device float2 camera_sample_aperture(KernelGlobals *kg, float u, float v) { float blades = kernel_data.cam.blades; + float2 bokeh; if(blades == 0.0f) { /* sample disk */ - return concentric_sample_disk(u, v); + bokeh = 
concentric_sample_disk(u, v); } else { /* sample polygon */ float rotation = kernel_data.cam.bladesrotation; - return regular_polygon_sample(blades, rotation, u, v); + bokeh = regular_polygon_sample(blades, rotation, u, v); } + + /* anamorphic lens bokeh */ + bokeh.x *= kernel_data.cam.inv_aperture_ratio; + + return bokeh; } ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray) @@ -183,7 +189,8 @@ ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float /* calculate orthonormal coordinates perpendicular to D */ float3 U, V; - make_orthonormals(D, &U, &V); + U = normalize(make_float3(1.0f, 0.0f, 0.0f) - D.x * D); + V = normalize(cross(D, U)); /* update ray for effect of lens */ ray->P = U * lensuv.x + V * lensuv.y; @@ -262,6 +269,20 @@ ccl_device_inline float camera_distance(KernelGlobals *kg, float3 P) return len(P - camP); } +ccl_device_inline float3 camera_direction_from_point(KernelGlobals *kg, float3 P) +{ + Transform cameratoworld = kernel_data.cam.cameratoworld; + + if(kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) { + float3 camD = make_float3(cameratoworld.x.z, cameratoworld.y.z, cameratoworld.z.z); + return -camD; + } + else { + float3 camP = make_float3(cameratoworld.x.w, cameratoworld.y.w, cameratoworld.z.w); + return normalize(camP - P); + } +} + ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd, float3 P) { if(kernel_data.cam.type != CAMERA_PANORAMA) { diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h index d027bb62ebe..37cba03ff97 100644 --- a/intern/cycles/kernel/kernel_compat_cpu.h +++ b/intern/cycles/kernel/kernel_compat_cpu.h @@ -25,6 +25,13 @@ #include "util_half.h" #include "util_types.h" +/* On 64bit linux single precision exponent is really slow comparing to the + * double precision version, even with float<->double conversion involved. 
+ */ +#if !defined(__KERNEL_GPU__) && defined(__linux__) && defined(__x86_64__) +# define expf(x) ((float)exp((double)(x))) +#endif + CCL_NAMESPACE_BEGIN /* Assertions inside the kernel only work for the CPU device, so we wrap it in @@ -44,16 +51,16 @@ template<typename T> struct texture { } #if 0 - ccl_always_inline __m128 fetch_m128(int index) + ccl_always_inline ssef fetch_ssef(int index) { kernel_assert(index >= 0 && index < width); - return ((__m128*)data)[index]; + return ((ssef*)data)[index]; } - ccl_always_inline __m128i fetch_m128i(int index) + ccl_always_inline ssei fetch_ssei(int index) { kernel_assert(index >= 0 && index < width); - return ((__m128i*)data)[index]; + return ((ssei*)data)[index]; } #endif @@ -144,6 +151,13 @@ template<typename T> struct texture_image { ccl_always_inline float4 interp_3d(float x, float y, float z, bool periodic = false) { + return interp_3d_ex(x, y, z, interpolation, periodic); + } + + ccl_always_inline float4 interp_3d_ex(float x, float y, float z, + int interpolation = INTERPOLATION_LINEAR, + bool periodic = false) + { if(UNLIKELY(!data)) return make_float4(0.0f, 0.0f, 0.0f, 0.0f); @@ -167,7 +181,7 @@ template<typename T> struct texture_image { return read(data[ix + iy*width + iz*width*height]); } - else { + else if(interpolation == INTERPOLATION_LINEAR) { float tx = frac(x*(float)width - 0.5f, &ix); float ty = frac(y*(float)height - 0.5f, &iy); float tz = frac(z*(float)depth - 0.5f, &iz); @@ -205,6 +219,93 @@ template<typename T> struct texture_image { return r; } + else { + /* Tricubic b-spline interpolation. 
*/ + const float tx = frac(x*(float)width - 0.5f, &ix); + const float ty = frac(y*(float)height - 0.5f, &iy); + const float tz = frac(z*(float)depth - 0.5f, &iz); + int pix, piy, piz, nnix, nniy, nniz; + + if(periodic) { + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + iz = wrap_periodic(iz, depth); + + pix = wrap_periodic(ix-1, width); + piy = wrap_periodic(iy-1, height); + piz = wrap_periodic(iz-1, depth); + + nix = wrap_periodic(ix+1, width); + niy = wrap_periodic(iy+1, height); + niz = wrap_periodic(iz+1, depth); + + nnix = wrap_periodic(ix+2, width); + nniy = wrap_periodic(iy+2, height); + nniz = wrap_periodic(iz+2, depth); + } + else { + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + iz = wrap_clamp(iz, depth); + + pix = wrap_clamp(ix-1, width); + piy = wrap_clamp(iy-1, height); + piz = wrap_clamp(iz-1, depth); + + nix = wrap_clamp(ix+1, width); + niy = wrap_clamp(iy+1, height); + niz = wrap_clamp(iz+1, depth); + + nnix = wrap_clamp(ix+2, width); + nniy = wrap_clamp(iy+2, height); + nniz = wrap_clamp(iz+2, depth); + } + + const int xc[4] = {pix, ix, nix, nnix}; + const int yc[4] = {width * piy, + width * iy, + width * niy, + width * nniy}; + const int zc[4] = {width * height * piz, + width * height * iz, + width * height * niz, + width * height * nniz}; + float u[4], v[4], w[4]; + + /* Some helper macro to keep code reasonable size, + * let compiler to inline all the matrix multiplications. 
+ */ +#define SET_SPLINE_WEIGHTS(u, t) \ + { \ + u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \ + u[1] = (( 0.5f * t - 1.0f) * t ) * t + (2.0f/3.0f); \ + u[2] = (( -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \ + u[3] = (1.0f / 6.0f) * t * t * t; \ + } (void)0 +#define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]])) +#define COL_TERM(col, row) \ + (v[col] * (u[0] * DATA(0, col, row) + \ + u[1] * DATA(1, col, row) + \ + u[2] * DATA(2, col, row) + \ + u[3] * DATA(3, col, row))) +#define ROW_TERM(row) \ + (w[row] * (COL_TERM(0, row) + \ + COL_TERM(1, row) + \ + COL_TERM(2, row) + \ + COL_TERM(3, row))) + + SET_SPLINE_WEIGHTS(u, tx); + SET_SPLINE_WEIGHTS(v, ty); + SET_SPLINE_WEIGHTS(w, tz); + + /* Actual interpolation. */ + return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3); + +#undef COL_TERM +#undef ROW_TERM +#undef DATA +#undef SET_SPLINE_WEIGHTS + } } ccl_always_inline void dimensions_set(int width_, int height_, int depth_) @@ -232,11 +333,12 @@ typedef texture_image<uchar4> texture_image_uchar4; /* Macros to handle different memory storage on different devices */ #define kernel_tex_fetch(tex, index) (kg->tex.fetch(index)) -#define kernel_tex_fetch_m128(tex, index) (kg->tex.fetch_m128(index)) -#define kernel_tex_fetch_m128i(tex, index) (kg->tex.fetch_m128i(index)) +#define kernel_tex_fetch_ssef(tex, index) (kg->tex.fetch_ssef(index)) +#define kernel_tex_fetch_ssei(tex, index) (kg->tex.fetch_ssei(index)) #define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size)) #define kernel_tex_image_interp(tex, x, y) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp(x, y) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp(x, y)) #define kernel_tex_image_interp_3d(tex, x, y, z) ((tex < MAX_FLOAT_IMAGES) ? 
kg->texture_float_images[tex].interp_3d(x, y, z) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp_3d(x, y, z)) +#define kernel_tex_image_interp_3d_ex(tex, x, y, z, interpolation) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp_3d_ex(x, y, z, interpolation) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp_3d_ex(x, y, z, interpolation)) #define kernel_data (kg->__data) diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index e4c20d26ff1..f14f3262274 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -75,12 +75,11 @@ typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4; /* Use fast math functions */ -#define cosf(x) __cosf(((float)x)) -#define sinf(x) __sinf(((float)x)) -#define powf(x, y) __powf(((float)x), ((float)y)) -#define tanf(x) __tanf(((float)x)) -#define logf(x) __logf(((float)x)) -#define expf(x) __expf(((float)x)) +#define cosf(x) __cosf(((float)(x))) +#define sinf(x) __sinf(((float)(x))) +#define powf(x, y) __powf(((float)(x)), ((float)(y))) +#define tanf(x) __tanf(((float)(x))) +#define logf(x) __logf(((float)(x))) +#define expf(x) __expf(((float)(x))) #endif /* __KERNEL_COMPAT_CUDA_H__ */ - diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h index 8346b09619e..58031a41b78 100644 --- a/intern/cycles/kernel/kernel_compat_opencl.h +++ b/intern/cycles/kernel/kernel_compat_opencl.h @@ -24,14 +24,6 @@ #define CCL_NAMESPACE_BEGIN #define CCL_NAMESPACE_END -#ifdef __KERNEL_OPENCL_AMD__ -#define __CL_NO_FLOAT3__ -#endif - -#ifdef __CL_NO_FLOAT3__ -#define float3 float4 -#endif - #ifdef __CL_NOINLINE__ #define ccl_noinline __attribute__((noinline)) #else @@ -68,51 +60,51 @@ #ifdef make_int4 #undef make_int4 #endif +#ifdef make_uchar4 +#undef make_uchar4 +#endif #define make_float2(x, y) ((float2)(x, y)) -#ifdef __CL_NO_FLOAT3__ -#define make_float3(x, y, 
z) ((float4)(x, y, z, 0.0f)) -#else #define make_float3(x, y, z) ((float3)(x, y, z)) -#endif #define make_float4(x, y, z, w) ((float4)(x, y, z, w)) #define make_int2(x, y) ((int2)(x, y)) #define make_int3(x, y, z) ((int3)(x, y, z)) #define make_int4(x, y, z, w) ((int4)(x, y, z, w)) +#define make_uchar4(x, y, z, w) ((uchar4)(x, y, z, w)) /* math functions */ #define __uint_as_float(x) as_float(x) #define __float_as_uint(x) as_uint(x) #define __int_as_float(x) as_float(x) #define __float_as_int(x) as_int(x) -#define powf(x, y) pow(((float)x), ((float)y)) -#define fabsf(x) fabs(((float)x)) -#define copysignf(x, y) copysign(((float)x), ((float)y)) -#define asinf(x) asin(((float)x)) -#define acosf(x) acos(((float)x)) -#define atanf(x) atan(((float)x)) -#define floorf(x) floor(((float)x)) -#define ceilf(x) ceil(((float)x)) -#define hypotf(x, y) hypot(((float)x), ((float)y)) -#define atan2f(x, y) atan2(((float)x), ((float)y)) -#define fmaxf(x, y) fmax(((float)x), ((float)y)) -#define fminf(x, y) fmin(((float)x), ((float)y)) -#define fmodf(x, y) fmod((float)x, (float)y) +#define powf(x, y) pow(((float)(x)), ((float)(y))) +#define fabsf(x) fabs(((float)(x))) +#define copysignf(x, y) copysign(((float)(x)), ((float)(y))) +#define asinf(x) asin(((float)(x))) +#define acosf(x) acos(((float)(x))) +#define atanf(x) atan(((float)(x))) +#define floorf(x) floor(((float)(x))) +#define ceilf(x) ceil(((float)(x))) +#define hypotf(x, y) hypot(((float)(x)), ((float)(y))) +#define atan2f(x, y) atan2(((float)(x)), ((float)(y))) +#define fmaxf(x, y) fmax(((float)(x)), ((float)(y))) +#define fminf(x, y) fmin(((float)(x)), ((float)(y))) +#define fmodf(x, y) fmod((float)(x), (float)(y)) #ifndef __CL_USE_NATIVE__ -#define sinf(x) native_sin(((float)x)) -#define cosf(x) native_cos(((float)x)) -#define tanf(x) native_tan(((float)x)) -#define expf(x) native_exp(((float)x)) -#define sqrtf(x) native_sqrt(((float)x)) -#define logf(x) native_log(((float)x)) +#define sinf(x) native_sin(((float)(x))) 
+#define cosf(x) native_cos(((float)(x))) +#define tanf(x) native_tan(((float)(x))) +#define expf(x) native_exp(((float)(x))) +#define sqrtf(x) native_sqrt(((float)(x))) +#define logf(x) native_log(((float)(x))) #else -#define sinf(x) sin(((float)x)) -#define cosf(x) cos(((float)x)) -#define tanf(x) tan(((float)x)) -#define expf(x) exp(((float)x)) -#define sqrtf(x) sqrt(((float)x)) -#define logf(x) log(((float)x)) +#define sinf(x) sin(((float)(x))) +#define cosf(x) cos(((float)(x))) +#define tanf(x) tan(((float)(x))) +#define expf(x) exp(((float)(x))) +#define sqrtf(x) sqrt(((float)(x))) +#define logf(x) log(((float)(x))) #endif /* data lookup defines */ diff --git a/intern/cycles/kernel/kernel_debug.h b/intern/cycles/kernel/kernel_debug.h new file mode 100644 index 00000000000..bf1bc0e9db8 --- /dev/null +++ b/intern/cycles/kernel/kernel_debug.h @@ -0,0 +1,38 @@ +/* + * Copyright 2011-2014 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License + */ + +CCL_NAMESPACE_BEGIN + +ccl_device_inline void debug_data_init(DebugData *debug_data) +{ + debug_data->num_bvh_traversal_steps = 0; +} + +ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg, + ccl_global float *buffer, + PathState *state, + DebugData *debug_data, + int sample) +{ + int flag = kernel_data.film.pass_flag; + if(flag & PASS_BVH_TRAVERSAL_STEPS) { + kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversal_steps, + sample, + debug_data->num_bvh_traversal_steps); + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h index deffa7f2ba2..4b2bb723ab6 100644 --- a/intern/cycles/kernel/kernel_emission.h +++ b/intern/cycles/kernel/kernel_emission.h @@ -63,32 +63,18 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg, return eval; } -ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd, int lindex, - float randt, float randu, float randv, Ray *ray, BsdfEval *eval, - bool *is_lamp, int bounce, int transparent_bounce) +ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd, + LightSample *ls, Ray *ray, BsdfEval *eval, bool *is_lamp, + int bounce, int transparent_bounce) { - LightSample ls; - -#ifdef __BRANCHED_PATH__ - if(lindex != LAMP_NONE) { - /* sample position on a specified light */ - light_select(kg, lindex, randu, randv, sd->P, &ls); - } - else -#endif - { - /* sample a light and position on int */ - light_sample(kg, randt, randu, randv, sd->time, sd->P, &ls); - } - - if(ls.pdf == 0.0f) + if(ls->pdf == 0.0f) return false; /* todo: implement */ differential3 dD = differential3_zero(); /* evaluate closure */ - float3 light_eval = direct_emissive_eval(kg, &ls, -ls.D, dD, ls.t, sd->time, bounce, transparent_bounce); + float3 light_eval = direct_emissive_eval(kg, ls, -ls->D, dD, ls->t, sd->time, 
bounce, transparent_bounce); if(is_zero(light_eval)) return false; @@ -98,49 +84,51 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd, int #ifdef __VOLUME__ if(sd->prim != PRIM_NONE) - shader_bsdf_eval(kg, sd, ls.D, eval, &bsdf_pdf); + shader_bsdf_eval(kg, sd, ls->D, eval, &bsdf_pdf); else - shader_volume_phase_eval(kg, sd, ls.D, eval, &bsdf_pdf); + shader_volume_phase_eval(kg, sd, ls->D, eval, &bsdf_pdf); #else - shader_bsdf_eval(kg, sd, ls.D, eval, &bsdf_pdf); + shader_bsdf_eval(kg, sd, ls->D, eval, &bsdf_pdf); #endif - if(ls.shader & SHADER_USE_MIS) { + if(ls->shader & SHADER_USE_MIS) { /* multiple importance sampling */ - float mis_weight = power_heuristic(ls.pdf, bsdf_pdf); + float mis_weight = power_heuristic(ls->pdf, bsdf_pdf); light_eval *= mis_weight; } - bsdf_eval_mul(eval, light_eval/ls.pdf); + bsdf_eval_mul(eval, light_eval/ls->pdf); #ifdef __PASSES__ /* use visibility flag to skip lights */ - if(ls.shader & SHADER_EXCLUDE_ANY) { - if(ls.shader & SHADER_EXCLUDE_DIFFUSE) + if(ls->shader & SHADER_EXCLUDE_ANY) { + if(ls->shader & SHADER_EXCLUDE_DIFFUSE) eval->diffuse = make_float3(0.0f, 0.0f, 0.0f); - if(ls.shader & SHADER_EXCLUDE_GLOSSY) + if(ls->shader & SHADER_EXCLUDE_GLOSSY) eval->glossy = make_float3(0.0f, 0.0f, 0.0f); - if(ls.shader & SHADER_EXCLUDE_TRANSMIT) + if(ls->shader & SHADER_EXCLUDE_TRANSMIT) eval->transmission = make_float3(0.0f, 0.0f, 0.0f); + if(ls->shader & SHADER_EXCLUDE_SCATTER) + eval->scatter = make_float3(0.0f, 0.0f, 0.0f); } #endif if(bsdf_eval_is_zero(eval)) return false; - if(ls.shader & SHADER_CAST_SHADOW) { + if(ls->shader & SHADER_CAST_SHADOW) { /* setup ray */ - bool transmit = (dot(sd->Ng, ls.D) < 0.0f); + bool transmit = (dot(sd->Ng, ls->D) < 0.0f); ray->P = ray_offset(sd->P, (transmit)? 
-sd->Ng: sd->Ng); - if(ls.t == FLT_MAX) { + if(ls->t == FLT_MAX) { /* distant light */ - ray->D = ls.D; - ray->t = ls.t; + ray->D = ls->D; + ray->t = ls->t; } else { /* other lights, avoid self-intersection */ - ray->D = ray_offset(ls.P, ls.Ng) - ray->P; + ray->D = ray_offset(ls->P, ls->Ng) - ray->P; ray->D = normalize_len(ray->D, &ray->t); } @@ -153,7 +141,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd, int } /* return if it's a lamp for shadow pass */ - *is_lamp = (ls.prim == PRIM_NONE && ls.type != LIGHT_BACKGROUND); + *is_lamp = (ls->prim == PRIM_NONE && ls->type != LIGHT_BACKGROUND); return true; } @@ -201,13 +189,25 @@ ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *st if(ls.shader & SHADER_EXCLUDE_ANY) { if(((ls.shader & SHADER_EXCLUDE_DIFFUSE) && (state->flag & PATH_RAY_DIFFUSE)) || ((ls.shader & SHADER_EXCLUDE_GLOSSY) && (state->flag & PATH_RAY_GLOSSY)) || - ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT))) + ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)) || + ((ls.shader & SHADER_EXCLUDE_SCATTER) && (state->flag & PATH_RAY_VOLUME_SCATTER))) continue; } #endif float3 L = direct_emissive_eval(kg, &ls, -ray->D, ray->dD, ls.t, ray->time, state->bounce, state->transparent_bounce); +#ifdef __VOLUME__ + if(state->volume_stack[0].shader != SHADER_NONE) { + /* shadow attenuation */ + Ray volume_ray = *ray; + volume_ray.t = ls.t; + float3 volume_tp = make_float3(1.0f, 1.0f, 1.0f); + kernel_volume_shadow(kg, state, &volume_ray, &volume_tp); + L *= volume_tp; + } +#endif + if(!(state->flag & PATH_RAY_MIS_SKIP)) { /* multiple importance sampling, get regular light pdf, * and compute weight with respect to BSDF pdf */ @@ -234,7 +234,8 @@ ccl_device_noinline float3 indirect_background(KernelGlobals *kg, PathState *sta if(((shader & SHADER_EXCLUDE_DIFFUSE) && (state->flag & PATH_RAY_DIFFUSE)) || ((shader & SHADER_EXCLUDE_GLOSSY) && (state->flag & 
PATH_RAY_GLOSSY)) || ((shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)) || - ((shader & SHADER_EXCLUDE_CAMERA) && (state->flag & PATH_RAY_CAMERA))) + ((shader & SHADER_EXCLUDE_CAMERA) && (state->flag & PATH_RAY_CAMERA)) || + ((shader & SHADER_EXCLUDE_SCATTER) && (state->flag & PATH_RAY_VOLUME_SCATTER))) return make_float3(0.0f, 0.0f, 0.0f); } diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h index 7a850844bf2..2a5b7689e57 100644 --- a/intern/cycles/kernel/kernel_jitter.h +++ b/intern/cycles/kernel/kernel_jitter.h @@ -14,6 +14,8 @@ * limitations under the License */ +/* TODO(sergey): Consider moving portable ctz/clz stuff to util. */ + CCL_NAMESPACE_BEGIN /* "Correlated Multi-Jittered Sampling" @@ -35,8 +37,16 @@ ccl_device_inline int cmj_fast_mod_pow2(int a, int b) /* a must be > 0 and b must be > 1 */ ccl_device_inline int cmj_fast_div_pow2(int a, int b) { -#if defined(__KERNEL_SSE2__) && !defined(_MSC_VER) + kernel_assert(a > 0); + kernel_assert(b > 1); +#if defined(__KERNEL_SSE2__) +# ifdef _MSC_VER + unsigned long ctz; + _BitScanForward(&ctz, b); + return a >> ctz; +# else return a >> __builtin_ctz(b); +# endif #else return a/b; #endif @@ -44,8 +54,15 @@ ccl_device_inline int cmj_fast_div_pow2(int a, int b) ccl_device_inline uint cmj_w_mask(uint w) { -#if defined(__KERNEL_SSE2__) && !defined(_MSC_VER) + kernel_assert(w > 1); +#if defined(__KERNEL_SSE2__) +# ifdef _MSC_VER + unsigned long leading_zero; + _BitScanReverse(&leading_zero, w); + return ((1 << (1 + leading_zero)) - 1); +# else return ((1 << (32 - __builtin_clz(w))) - 1); +# endif #else w |= w >> 1; w |= w >> 2; @@ -165,7 +182,8 @@ ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy) smodm = cmj_fast_mod_pow2(s, m); } else { - sdivm = float_to_int(s * invm); + /* Doing s*inmv gives precision issues here. 
*/ + sdivm = s / m; smodm = s - sdivm*m; } diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h index ac432d3fe04..b18f67ad524 100644 --- a/intern/cycles/kernel/kernel_light.h +++ b/intern/cycles/kernel/kernel_light.h @@ -27,7 +27,7 @@ typedef struct LightSample { float pdf; /* light sampling probability density function */ float eval_fac; /* intensity multiplier */ int object; /* object id for triangle/curve lights */ - int prim; /* primitive id for triangle/curve ligths */ + int prim; /* primitive id for triangle/curve lights */ int shader; /* shader id */ int lamp; /* lamp id */ LightType type; /* type of light */ @@ -167,12 +167,137 @@ ccl_device float3 sphere_light_sample(float3 P, float3 center, float radius, flo return disk_light_sample(normalize(P - center), randu, randv)*radius; } -ccl_device float3 area_light_sample(float3 axisu, float3 axisv, float randu, float randv) +/* Uses the following paper: + * + * Carlos Urena et al. + * An Area-Preserving Parametrization for Spherical Rectangles. + * + * https://www.solidangle.com/research/egsr2013_spherical_rectangle.pdf + */ +ccl_device float3 area_light_sample(float3 P, + float3 light_p, + float3 axisu, float3 axisv, + float randu, float randv, + float *pdf) { - randu = randu - 0.5f; - randv = randv - 0.5f; + /* In our name system we're using P for the center, + * which is o in the paper. + */ + + float3 corner = light_p - axisu * 0.5f - axisv * 0.5f; + float axisu_len, axisv_len; + /* Compute local reference system R. */ + float3 x = normalize_len(axisu, &axisu_len); + float3 y = normalize_len(axisv, &axisv_len); + float3 z = cross(x, y); + /* Compute rectangle coords in local reference system. */ + float3 dir = corner - P; + float z0 = dot(dir, z); + /* Flip 'z' to make it point against Q. 
*/ + if(z0 > 0.0f) { + z *= -1.0f; + z0 *= -1.0f; + } + float z0sq = z0 * z0; + float x0 = dot(dir, x); + float y0 = dot(dir, y); + float x1 = x0 + axisu_len; + float y1 = y0 + axisv_len; + float y0sq = y0 * y0; + float y1sq = y1 * y1; + /* Create vectors to four vertices. */ + float3 v00 = make_float3(x0, y0, z0); + float3 v01 = make_float3(x0, y1, z0); + float3 v10 = make_float3(x1, y0, z0); + float3 v11 = make_float3(x1, y1, z0); + /* Compute normals to edges. */ + float3 n0 = normalize(cross(v00, v10)); + float3 n1 = normalize(cross(v10, v11)); + float3 n2 = normalize(cross(v11, v01)); + float3 n3 = normalize(cross(v01, v00)); + /* Compute internal angles (gamma_i). */ + float g0 = acosf(-dot(n0, n1)); + float g1 = acosf(-dot(n1, n2)); + float g2 = acosf(-dot(n2, n3)); + float g3 = acosf(-dot(n3, n0)); + /* Compute predefined constants. */ + float b0 = n0.z; + float b1 = n2.z; + float b0sq = b0 * b0; + float k = M_2PI_F - g2 - g3; + /* Compute solid angle from internal angles. */ + float S = g0 + g1 - k; + + /* Compute cu. */ + float au = randu * S + k; + float fu = (cosf(au) * b0 - b1) / sinf(au); + float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f); + cu = clamp(cu, -1.0f, 1.0f); + /* Compute xu. */ + float xu = -(cu * z0) / sqrtf(1.0f - cu * cu); + xu = clamp(xu, x0, x1); + /* Compute yv. */ + float d = sqrtf(xu * xu + z0sq); + float h0 = y0 / sqrtf(d * d + y0sq); + float h1 = y1 / sqrtf(d * d + y1sq); + float hv = h0 + randv * (h1 - h0), hv2 = hv * hv; + float yv = (hv2 < 1.0f - 1e-6f) ? (hv * d) / sqrtf(1.0f - hv2) : y1; + + *pdf = 1.0f / S; + + /* Transform (xu, yv, z0) to world coords. */ + return P + xu * x + yv * y + z0 * z; +} - return axisu*randu + axisv*randv; +/* TODO(sergey): This is actually a duplicated code from above, but how to avoid + * this without having some nasty function with loads of parameters? 
+ */ +ccl_device float area_light_pdf(float3 P, + float3 light_p, + float3 axisu, float3 axisv) +{ + /* In our name system we're using P for the center, + * which is o in the paper. + */ + + float3 corner = light_p - axisu * 0.5f - axisv * 0.5f; + float axisu_len, axisv_len; + /* Compute local reference system R. */ + float3 x = normalize_len(axisu, &axisu_len); + float3 y = normalize_len(axisv, &axisv_len); + float3 z = cross(x, y); + /* Compute rectangle coords in local reference system. */ + float3 dir = corner - P; + float z0 = dot(dir, z); + /* Flip 'z' to make it point against Q. */ + if(z0 > 0.0f) { + z *= -1.0f; + z0 *= -1.0f; + } + float x0 = dot(dir, x); + float y0 = dot(dir, y); + float x1 = x0 + axisu_len; + float y1 = y0 + axisv_len; + /* Create vectors to four vertices. */ + float3 v00 = make_float3(x0, y0, z0); + float3 v01 = make_float3(x0, y1, z0); + float3 v10 = make_float3(x1, y0, z0); + float3 v11 = make_float3(x1, y1, z0); + /* Compute normals to edges. */ + float3 n0 = normalize(cross(v00, v10)); + float3 n1 = normalize(cross(v10, v11)); + float3 n2 = normalize(cross(v11, v01)); + float3 n3 = normalize(cross(v01, v00)); + /* Compute internal angles (gamma_i). */ + float g0 = acosf(-dot(n0, n1)); + float g1 = acosf(-dot(n1, n2)); + float g2 = acosf(-dot(n2, n3)); + float g3 = acosf(-dot(n3, n0)); + /* Compute predefined constants. */ + float k = M_2PI_F - g2 - g3; + /* Compute solid angle from internal angles. 
*/ + float S = g0 + g1 - k; + return 1.0f / S; } ccl_device float spot_light_attenuation(float4 data1, float4 data2, LightSample *ls) @@ -276,6 +401,7 @@ ccl_device void lamp_light_sample(KernelGlobals *kg, int lamp, float4 data2 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 2); ls->eval_fac *= spot_light_attenuation(data1, data2, ls); } + ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t); } else { /* area light */ @@ -286,18 +412,22 @@ ccl_device void lamp_light_sample(KernelGlobals *kg, int lamp, float3 axisv = make_float3(data2.y, data2.z, data2.w); float3 D = make_float3(data3.y, data3.z, data3.w); - ls->P += area_light_sample(axisu, axisv, randu, randv); + ls->P = area_light_sample(P, ls->P, + axisu, axisv, + randu, randv, + &ls->pdf); + ls->Ng = D; ls->D = normalize_len(ls->P - P, &ls->t); float invarea = data2.x; - ls->eval_fac = 0.25f*invarea; - ls->pdf = invarea; + + if(dot(ls->D, D) > 0.0f) + ls->pdf = 0.0f; } ls->eval_fac *= kernel_data.integrator.inv_pdf_lights; - ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t); } } @@ -355,8 +485,12 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D, ls->D = D; ls->t = FLT_MAX; + /* compute pdf */ float invarea = data1.w; ls->pdf = invarea/(costheta*costheta*costheta); + if(ls->t != FLT_MAX) + ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t); + ls->eval_fac = ls->pdf; } else if(type == LIGHT_POINT || type == LIGHT_SPOT) { @@ -386,6 +520,10 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D, if(ls->eval_fac == 0.0f) return false; } + + /* compute pdf */ + if(ls->t != FLT_MAX) + ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t); } else if(type == LIGHT_AREA) { /* area light */ @@ -412,16 +550,12 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D, ls->D = D; ls->Ng = Ng; - ls->pdf = invarea; - ls->eval_fac = 0.25f*ls->pdf; + ls->pdf = area_light_pdf(P, ls->P, axisu, axisv); + ls->eval_fac = 0.25f*invarea; } 
else return false; - /* compute pdf */ - if(ls->t != FLT_MAX) - ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t); - return true; } @@ -457,7 +591,7 @@ ccl_device void triangle_light_sample(KernelGlobals *kg, int prim, int object, v = randv*randu; /* triangle, so get position, normal, shader */ - triangle_point_normal(kg, prim, u, v, &ls->P, &ls->Ng, &ls->shader); + triangle_point_normal(kg, object, prim, u, v, &ls->P, &ls->Ng, &ls->shader); ls->object = object; ls->prim = prim; ls->lamp = LAMP_NONE; @@ -546,11 +680,6 @@ ccl_device int light_select_num_samples(KernelGlobals *kg, int index) return __float_as_int(data3.x); } -ccl_device void light_select(KernelGlobals *kg, int index, float randu, float randv, float3 P, LightSample *ls) -{ - lamp_light_sample(kg, index, randu, randv, P, ls); -} - ccl_device int lamp_light_eval_sample(KernelGlobals *kg, float randt) { /* sample index */ diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index a80a0033712..c03229f0a3a 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -29,7 +29,6 @@ #include "kernel_accumulate.h" #include "kernel_shader.h" #include "kernel_light.h" -#include "kernel_emission.h" #include "kernel_passes.h" #ifdef __SUBSURFACE__ @@ -42,177 +41,15 @@ #include "kernel_path_state.h" #include "kernel_shadow.h" +#include "kernel_emission.h" +#include "kernel_path_surface.h" +#include "kernel_path_volume.h" -CCL_NAMESPACE_BEGIN - -#ifdef __VOLUME__ - -ccl_device_inline bool kernel_path_integrate_scatter_lighting(KernelGlobals *kg, RNG *rng, - ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray, - float num_samples_adjust) -{ -#ifdef __EMISSION__ - if(kernel_data.integrator.use_direct_light) { - /* sample illumination from lights to find path contribution */ - if(sd->flag & SD_BSDF_HAS_EVAL) { - float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); - float light_u, light_v; - path_state_rng_2D(kg, 
rng, state, PRNG_LIGHT_U, &light_u, &light_v); - - Ray light_ray; - BsdfEval L_light; - bool is_lamp; - -#ifdef __OBJECT_MOTION__ - light_ray.time = sd->time; -#endif - - if(direct_emission(kg, sd, LAMP_NONE, light_t, light_u, light_v, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { - /* trace shadow ray */ - float3 shadow; - - if(!shadow_blocked(kg, state, &light_ray, &shadow)) { - /* accumulate */ - path_radiance_accum_light(L, *throughput * num_samples_adjust, &L_light, shadow, 1.0f, state->bounce, is_lamp); - } - } - } - } -#endif - - /* sample phase function */ - float phase_pdf; - BsdfEval phase_eval; - float3 phase_omega_in; - differential3 phase_domega_in; - float phase_u, phase_v; - path_state_rng_2D(kg, rng, state, PRNG_PHASE_U, &phase_u, &phase_v); - int label; - - label = shader_volume_phase_sample(kg, sd, phase_u, phase_v, &phase_eval, - &phase_omega_in, &phase_domega_in, &phase_pdf); - - if(phase_pdf == 0.0f || bsdf_eval_is_zero(&phase_eval)) - return false; - - /* modify throughput */ - path_radiance_bsdf_bounce(L, throughput, &phase_eval, phase_pdf, state->bounce, label); - - /* set labels */ - state->ray_pdf = phase_pdf; -#ifdef __LAMP_MIS__ - state->ray_t = 0.0f; -#endif - state->min_ray_pdf = fminf(phase_pdf, state->min_ray_pdf); - - /* update path state */ - path_state_next(kg, state, label); - - /* setup ray */ - ray->P = sd->P; - ray->D = phase_omega_in; - ray->t = FLT_MAX; - -#ifdef __RAY_DIFFERENTIALS__ - ray->dP = sd->dP; - ray->dD = phase_domega_in; -#endif - - return true; -} - +#ifdef __KERNEL_DEBUG__ +#include "kernel_debug.h" #endif -#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__) - -ccl_device void kernel_branched_path_integrate_direct_lighting(KernelGlobals *kg, RNG *rng, - ShaderData *sd, PathState *state, float3 throughput, float num_samples_adjust, PathRadiance *L, bool sample_all_lights) -{ - /* sample illumination from lights to find path contribution */ - if(sd->flag & SD_BSDF_HAS_EVAL) { 
- Ray light_ray; - BsdfEval L_light; - bool is_lamp; - -#ifdef __OBJECT_MOTION__ - light_ray.time = sd->time; -#endif - - if(sample_all_lights) { - /* lamp sampling */ - for(int i = 0; i < kernel_data.integrator.num_all_lights; i++) { - int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i)); - float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights); - RNG lamp_rng = cmj_hash(*rng, i); - - if(kernel_data.integrator.pdf_triangles != 0.0f) - num_samples_inv *= 0.5f; - - for(int j = 0; j < num_samples; j++) { - float light_u, light_v; - path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); - - if(direct_emission(kg, sd, i, 0.0f, light_u, light_v, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { - /* trace shadow ray */ - float3 shadow; - - if(!shadow_blocked(kg, state, &light_ray, &shadow)) { - /* accumulate */ - path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); - } - } - } - } - - /* mesh light sampling */ - if(kernel_data.integrator.pdf_triangles != 0.0f) { - int num_samples = ceil_to_int(num_samples_adjust*kernel_data.integrator.mesh_light_samples); - float num_samples_inv = num_samples_adjust/num_samples; - - if(kernel_data.integrator.num_all_lights) - num_samples_inv *= 0.5f; - - for(int j = 0; j < num_samples; j++) { - float light_t = path_branched_rng_1D(kg, rng, state, j, num_samples, PRNG_LIGHT); - float light_u, light_v; - path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); - - /* only sample triangle lights */ - if(kernel_data.integrator.num_all_lights) - light_t = 0.5f*light_t; - - if(direct_emission(kg, sd, LAMP_NONE, light_t, light_u, light_v, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { - /* trace shadow ray */ - float3 shadow; - - if(!shadow_blocked(kg, state, &light_ray, &shadow)) { - /* 
accumulate */ - path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); - } - } - } - } - } - else { - float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); - float light_u, light_v; - path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); - - /* sample random light */ - if(direct_emission(kg, sd, LAMP_NONE, light_t, light_u, light_v, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { - /* trace shadow ray */ - float3 shadow; - - if(!shadow_blocked(kg, state, &light_ray, &shadow)) { - /* accumulate */ - path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp); - } - } - } - } -} - -#endif +CCL_NAMESPACE_BEGIN ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray, float3 throughput, int num_samples, PathState state, PathRadiance *L) @@ -222,11 +59,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray, /* intersect scene */ Intersection isect; uint visibility = path_state_ray_visibility(kg, &state); -#ifdef __HAIR__ bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f); -#else - bool hit = scene_intersect(kg, &ray, visibility, &isect); -#endif #ifdef __LAMP_MIS__ if(kernel_data.integrator.use_lamp_mis && !(state.flag & PATH_RAY_CAMERA)) { @@ -255,15 +88,81 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray, Ray volume_ray = ray; volume_ray.t = (hit)? 
isect.t: FLT_MAX; - ShaderData volume_sd; - VolumeIntegrateResult result = kernel_volume_integrate(kg, &state, - &volume_sd, &volume_ray, L, &throughput, rng); + bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack); - if(result == VOLUME_PATH_SCATTERED) { - if(kernel_path_integrate_scatter_lighting(kg, rng, &volume_sd, &throughput, &state, L, &ray, 1.0f)) - continue; - else - break; +#ifdef __VOLUME_DECOUPLED__ + int sampling_method = volume_stack_sampling_method(kg, state.volume_stack); + bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, false, sampling_method); + + if(decoupled) { + /* cache steps along volume for repeated sampling */ + VolumeSegment volume_segment; + ShaderData volume_sd; + + shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce); + kernel_volume_decoupled_record(kg, &state, + &volume_ray, &volume_sd, &volume_segment, heterogeneous); + + volume_segment.sampling_method = sampling_method; + + /* emission */ + if(volume_segment.closure_flag & SD_EMISSION) + path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce); + + /* scattering */ + VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED; + + if(volume_segment.closure_flag & SD_SCATTER) { + bool all = kernel_data.integrator.sample_all_lights_indirect; + + /* direct light sampling */ + kernel_branched_path_volume_connect_light(kg, rng, &volume_sd, + throughput, &state, L, 1.0f, all, &volume_ray, &volume_segment); + + /* indirect sample. 
if we use distance sampling and take just + * one sample for direct and indirect light, we could share + * this computation, but makes code a bit complex */ + float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE); + float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE); + + result = kernel_volume_decoupled_scatter(kg, + &state, &volume_ray, &volume_sd, &throughput, + rphase, rscatter, &volume_segment, NULL, true); + } + + if(result != VOLUME_PATH_SCATTERED) + throughput *= volume_segment.accum_transmittance; + + /* free cached steps */ + kernel_volume_decoupled_free(kg, &volume_segment); + + if(result == VOLUME_PATH_SCATTERED) { + if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, L, &ray)) + continue; + else + break; + } + } + else +#endif + { + /* integrate along volume segment with distance sampling */ + ShaderData volume_sd; + VolumeIntegrateResult result = kernel_volume_integrate( + kg, &state, &volume_sd, &volume_ray, L, &throughput, rng, heterogeneous); + +#ifdef __VOLUME_SCATTER__ + if(result == VOLUME_PATH_SCATTERED) { + /* direct lighting */ + kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, L); + + /* indirect light bounce */ + if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, L, &ray)) + continue; + else + break; + } +#endif } } #endif @@ -281,7 +180,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray, /* setup shading */ ShaderData sd; shader_setup_from_ray(kg, &sd, &isect, &ray, state.bounce, state.transparent_bounce); - float rbsdf = path_state_rng_1D(kg, rng, &state, PRNG_BSDF); + float rbsdf = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_BSDF); shader_eval_surface(kg, &sd, rbsdf, state.flag, SHADER_CONTEXT_INDIRECT); #ifdef __BRANCHED_PATH__ shader_merge_closures(&sd); @@ -315,7 +214,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray, break; } else if(probability != 
1.0f) { - float terminate = path_state_rng_1D(kg, rng, &state, PRNG_TERMINATE); + float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE); if(terminate >= probability) break; @@ -383,187 +282,12 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray, #if defined(__EMISSION__) && defined(__BRANCHED_PATH__) if(kernel_data.integrator.use_direct_light) { bool all = kernel_data.integrator.sample_all_lights_indirect; - kernel_branched_path_integrate_direct_lighting(kg, rng, &sd, &state, throughput, 1.0f, L, all); - } -#endif - - /* no BSDF? we can stop here */ - if(sd.flag & SD_BSDF) { - /* sample BSDF */ - float bsdf_pdf; - BsdfEval bsdf_eval; - float3 bsdf_omega_in; - differential3 bsdf_domega_in; - float bsdf_u, bsdf_v; - path_state_rng_2D(kg, rng, &state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); - int label; - - label = shader_bsdf_sample(kg, &sd, bsdf_u, bsdf_v, &bsdf_eval, - &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf); - - if(bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval)) - break; - - /* modify throughput */ - path_radiance_bsdf_bounce(L, &throughput, &bsdf_eval, bsdf_pdf, state.bounce, label); - - /* set labels */ - if(!(label & LABEL_TRANSPARENT)) { - state.ray_pdf = bsdf_pdf; -#ifdef __LAMP_MIS__ - state.ray_t = 0.0f; -#endif - state.min_ray_pdf = fminf(bsdf_pdf, state.min_ray_pdf); - } - - /* update path state */ - path_state_next(kg, &state, label); - - /* setup ray */ - ray.P = ray_offset(sd.P, (label & LABEL_TRANSMIT)? -sd.Ng: sd.Ng); - ray.D = bsdf_omega_in; - ray.t = FLT_MAX; -#ifdef __RAY_DIFFERENTIALS__ - ray.dP = sd.dP; - ray.dD = bsdf_domega_in; -#endif - -#ifdef __VOLUME__ - /* enter/exit volume */ - if(label & LABEL_TRANSMIT) - kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack); -#endif + kernel_branched_path_surface_connect_light(kg, rng, &sd, &state, throughput, 1.0f, L, all); } -#ifdef __VOLUME__ - else if(sd.flag & SD_HAS_ONLY_VOLUME) { - /* no surface shader but have a volume shader? 
act transparent */ - - /* update path state, count as transparent */ - path_state_next(kg, &state, LABEL_TRANSPARENT); - - /* setup ray position, direction stays unchanged */ - ray.P = ray_offset(sd.P, -sd.Ng); -#ifdef __RAY_DIFFERENTIALS__ - ray.dP = sd.dP; #endif - /* enter/exit volume */ - kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack); - } -#endif - else { - /* no bsdf or volume? we're done */ + if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, L, &ray)) break; - } - } -} - -ccl_device_inline bool kernel_path_integrate_lighting(KernelGlobals *kg, RNG *rng, - ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray) -{ -#ifdef __EMISSION__ - if(kernel_data.integrator.use_direct_light) { - /* sample illumination from lights to find path contribution */ - if(sd->flag & SD_BSDF_HAS_EVAL) { - float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); - float light_u, light_v; - path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); - - Ray light_ray; - BsdfEval L_light; - bool is_lamp; - -#ifdef __OBJECT_MOTION__ - light_ray.time = sd->time; -#endif - - if(direct_emission(kg, sd, LAMP_NONE, light_t, light_u, light_v, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { - /* trace shadow ray */ - float3 shadow; - - if(!shadow_blocked(kg, state, &light_ray, &shadow)) { - /* accumulate */ - path_radiance_accum_light(L, *throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp); - } - } - } - } -#endif - - /* no BSDF? 
we can stop here */ - if(sd->flag & SD_BSDF) { - /* sample BSDF */ - float bsdf_pdf; - BsdfEval bsdf_eval; - float3 bsdf_omega_in; - differential3 bsdf_domega_in; - float bsdf_u, bsdf_v; - path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); - int label; - - label = shader_bsdf_sample(kg, sd, bsdf_u, bsdf_v, &bsdf_eval, - &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf); - - if(bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval)) - return false; - - /* modify throughput */ - path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label); - - /* set labels */ - if(!(label & LABEL_TRANSPARENT)) { - state->ray_pdf = bsdf_pdf; -#ifdef __LAMP_MIS__ - state->ray_t = 0.0f; -#endif - state->min_ray_pdf = fminf(bsdf_pdf, state->min_ray_pdf); - } - - /* update path state */ - path_state_next(kg, state, label); - - /* setup ray */ - ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng); - ray->D = bsdf_omega_in; - - if(state->bounce == 0) - ray->t -= sd->ray_length; /* clipping works through transparent */ - else - ray->t = FLT_MAX; - -#ifdef __RAY_DIFFERENTIALS__ - ray->dP = sd->dP; - ray->dD = bsdf_domega_in; -#endif - -#ifdef __VOLUME__ - /* enter/exit volume */ - if(label & LABEL_TRANSMIT) - kernel_volume_stack_enter_exit(kg, sd, state->volume_stack); -#endif - return true; - } -#ifdef __VOLUME__ - else if(sd->flag & SD_HAS_ONLY_VOLUME) { - /* no surface shader but have a volume shader? act transparent */ - - /* update path state, count as transparent */ - path_state_next(kg, state, LABEL_TRANSPARENT); - - /* setup ray position, direction stays unchanged */ - ray->P = ray_offset(sd->P, -sd->Ng); -#ifdef __RAY_DIFFERENTIALS__ - ray->dP = sd->dP; -#endif - - /* enter/exit volume */ - kernel_volume_stack_enter_exit(kg, sd, state->volume_stack); - return true; - } -#endif - else { - /* no bsdf or volume? 
*/ - return false; } } @@ -601,7 +325,68 @@ ccl_device void kernel_path_ao(KernelGlobals *kg, ShaderData *sd, PathRadiance * } } +ccl_device void kernel_branched_path_ao(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, PathState *state, RNG *rng, float3 throughput) +{ + int num_samples = kernel_data.integrator.ao_samples; + float num_samples_inv = 1.0f/num_samples; + float ao_factor = kernel_data.background.ao_factor; + float3 ao_N; + float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N); + float3 ao_alpha = shader_bsdf_alpha(kg, sd); + + for(int j = 0; j < num_samples; j++) { + float bsdf_u, bsdf_v; + path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v); + + float3 ao_D; + float ao_pdf; + + sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); + + if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { + Ray light_ray; + float3 ao_shadow; + + light_ray.P = ray_offset(sd->P, sd->Ng); + light_ray.D = ao_D; + light_ray.t = kernel_data.background.ao_distance; +#ifdef __OBJECT_MOTION__ + light_ray.time = sd->time; +#endif + light_ray.dP = sd->dP; + light_ray.dD = differential3_zero(); + + if(!shadow_blocked(kg, state, &light_ray, &ao_shadow)) + path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce); + } + } +} + #ifdef __SUBSURFACE__ + +#ifdef __VOLUME__ +ccl_device void kernel_path_subsurface_update_volume_stack(KernelGlobals *kg, + Ray *ray, + VolumeStack *stack) +{ + kernel_assert(kernel_data.integrator.use_volumes); + + Ray volume_ray = *ray; + Intersection isect; + + while(scene_intersect_volume(kg, &volume_ray, &isect)) + { + ShaderData sd; + shader_setup_from_ray(kg, &sd, &isect, &volume_ray, 0, 0); + kernel_volume_stack_enter_exit(kg, &sd, stack); + + /* Move ray forward. 
*/ + volume_ray.P = ray_offset(sd.P, -sd.Ng); + volume_ray.t -= sd.ray_length; + } +} +#endif + ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, PathState *state, RNG *rng, Ray *ray, float3 *throughput) { float bssrdf_probability; @@ -618,6 +403,11 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd float bssrdf_u, bssrdf_v; path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); int num_hits = subsurface_scatter_multi_step(kg, sd, bssrdf_sd, state->flag, sc, &lcg_state, bssrdf_u, bssrdf_v, false); +#ifdef __VOLUME__ + Ray volume_ray = *ray; + bool need_update_volume_stack = kernel_data.integrator.use_volumes && + sd->flag & SD_OBJECT_INTERSECTS_VOLUME; +#endif /* compute lighting with the BSDF closure */ for(int hit = 0; hit < num_hits; hit++) { @@ -627,12 +417,30 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd hit_state.flag |= PATH_RAY_BSSRDF_ANCESTOR; hit_state.rng_offset += PRNG_BOUNCE_NUM; + + kernel_path_surface_connect_light(kg, rng, &bssrdf_sd[hit], tp, state, L); - if(kernel_path_integrate_lighting(kg, rng, &bssrdf_sd[hit], &tp, &hit_state, L, &hit_ray)) { + if(kernel_path_surface_bounce(kg, rng, &bssrdf_sd[hit], &tp, &hit_state, L, &hit_ray)) { #ifdef __LAMP_MIS__ hit_state.ray_t = 0.0f; #endif +#ifdef __VOLUME__ + if(need_update_volume_stack) { + /* Setup ray from previous surface point to the new one. */ + volume_ray.D = normalize_len(hit_ray.P - volume_ray.P, + &volume_ray.t); + + kernel_path_subsurface_update_volume_stack( + kg, + &volume_ray, + hit_state.volume_stack); + + /* Move volume ray forward. 
*/ + volume_ray.P = hit_ray.P; + } +#endif + kernel_path_indirect(kg, rng, hit_ray, tp, state->num_samples, hit_state, L); /* for render passes, sum and reset indirect light pass variables @@ -657,7 +465,12 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample, path_radiance_init(&L, kernel_data.film.use_light_pass); PathState state; - path_state_init(kg, &state, rng, sample); + path_state_init(kg, &state, rng, sample, &ray); + +#ifdef __KERNEL_DEBUG__ + DebugData debug_data; + debug_data_init(&debug_data); +#endif /* path iteration */ for(;;) { @@ -682,7 +495,13 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample, bool hit = scene_intersect(kg, &ray, visibility, &isect, &lcg_state, difl, extmax); #else - bool hit = scene_intersect(kg, &ray, visibility, &isect); + bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f); +#endif + +#ifdef __KERNEL_DEBUG__ + if(state.flag & PATH_RAY_CAMERA) { + debug_data.num_bvh_traversal_steps += isect.num_traversal_steps; + } #endif #ifdef __LAMP_MIS__ @@ -712,15 +531,81 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray volume_ray = ray; volume_ray.t = (hit)? 
isect.t: FLT_MAX; - ShaderData volume_sd; - VolumeIntegrateResult result = kernel_volume_integrate(kg, &state, - &volume_sd, &volume_ray, &L, &throughput, rng); + bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack); - if(result == VOLUME_PATH_SCATTERED) { - if(kernel_path_integrate_scatter_lighting(kg, rng, &volume_sd, &throughput, &state, &L, &ray, 1.0f)) - continue; - else - break; +#ifdef __VOLUME_DECOUPLED__ + int sampling_method = volume_stack_sampling_method(kg, state.volume_stack); + bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, true, sampling_method); + + if(decoupled) { + /* cache steps along volume for repeated sampling */ + VolumeSegment volume_segment; + ShaderData volume_sd; + + shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce); + kernel_volume_decoupled_record(kg, &state, + &volume_ray, &volume_sd, &volume_segment, heterogeneous); + + volume_segment.sampling_method = sampling_method; + + /* emission */ + if(volume_segment.closure_flag & SD_EMISSION) + path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce); + + /* scattering */ + VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED; + + if(volume_segment.closure_flag & SD_SCATTER) { + bool all = false; + + /* direct light sampling */ + kernel_branched_path_volume_connect_light(kg, rng, &volume_sd, + throughput, &state, &L, 1.0f, all, &volume_ray, &volume_segment); + + /* indirect sample. 
if we use distance sampling and take just + * one sample for direct and indirect light, we could share + * this computation, but makes code a bit complex */ + float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE); + float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE); + + result = kernel_volume_decoupled_scatter(kg, + &state, &volume_ray, &volume_sd, &throughput, + rphase, rscatter, &volume_segment, NULL, true); + } + + if(result != VOLUME_PATH_SCATTERED) + throughput *= volume_segment.accum_transmittance; + + /* free cached steps */ + kernel_volume_decoupled_free(kg, &volume_segment); + + if(result == VOLUME_PATH_SCATTERED) { + if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray)) + continue; + else + break; + } + } + else +#endif + { + /* integrate along volume segment with distance sampling */ + ShaderData volume_sd; + VolumeIntegrateResult result = kernel_volume_integrate( + kg, &state, &volume_sd, &volume_ray, &L, &throughput, rng, heterogeneous); + +#ifdef __VOLUME_SCATTER__ + if(result == VOLUME_PATH_SCATTERED) { + /* direct lighting */ + kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, &L); + + /* indirect light bounce */ + if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray)) + continue; + else + break; + } +#endif } } #endif @@ -748,7 +633,7 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample, /* setup shading */ ShaderData sd; shader_setup_from_ray(kg, &sd, &isect, &ray, state.bounce, state.transparent_bounce); - float rbsdf = path_state_rng_1D(kg, rng, &state, PRNG_BSDF); + float rbsdf = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_BSDF); shader_eval_surface(kg, &sd, rbsdf, state.flag, SHADER_CONTEXT_MAIN); /* holdout */ @@ -803,7 +688,7 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample, break; } else if(probability != 1.0f) { - float terminate = 
path_state_rng_1D(kg, rng, &state, PRNG_TERMINATE); + float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE); if(terminate >= probability) break; @@ -826,134 +711,33 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample, break; } #endif - - /* Same as kernel_path_integrate_lighting(kg, rng, &sd, &throughput, &state, &L, &ray), - but for CUDA the function call is slower. */ -#ifdef __EMISSION__ - if(kernel_data.integrator.use_direct_light) { - /* sample illumination from lights to find path contribution */ - if(sd.flag & SD_BSDF_HAS_EVAL) { - float light_t = path_state_rng_1D(kg, rng, &state, PRNG_LIGHT); - float light_u, light_v; - path_state_rng_2D(kg, rng, &state, PRNG_LIGHT_U, &light_u, &light_v); - - Ray light_ray; - BsdfEval L_light; - bool is_lamp; - -#ifdef __OBJECT_MOTION__ - light_ray.time = sd.time; -#endif - - if(direct_emission(kg, &sd, LAMP_NONE, light_t, light_u, light_v, &light_ray, &L_light, &is_lamp, state.bounce, state.transparent_bounce)) { - /* trace shadow ray */ - float3 shadow; - - if(!shadow_blocked(kg, &state, &light_ray, &shadow)) { - /* accumulate */ - path_radiance_accum_light(&L, throughput, &L_light, shadow, 1.0f, state.bounce, is_lamp); - } - } - } - } -#endif - - if(sd.flag & SD_BSDF) { - /* sample BSDF */ - float bsdf_pdf; - BsdfEval bsdf_eval; - float3 bsdf_omega_in; - differential3 bsdf_domega_in; - float bsdf_u, bsdf_v; - path_state_rng_2D(kg, rng, &state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); - int label; - - label = shader_bsdf_sample(kg, &sd, bsdf_u, bsdf_v, &bsdf_eval, - &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf); - - if(bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval)) - break; - - /* modify throughput */ - path_radiance_bsdf_bounce(&L, &throughput, &bsdf_eval, bsdf_pdf, state.bounce, label); - - /* set labels */ - if(!(label & LABEL_TRANSPARENT)) { - state.ray_pdf = bsdf_pdf; -#ifdef __LAMP_MIS__ - state.ray_t = 0.0f; -#endif - state.min_ray_pdf = fminf(bsdf_pdf, 
state.min_ray_pdf); - } - - /* update path state */ - path_state_next(kg, &state, label); - - /* setup ray */ - ray.P = ray_offset(sd.P, (label & LABEL_TRANSMIT)? -sd.Ng: sd.Ng); - ray.D = bsdf_omega_in; - -#ifdef __RAY_DIFFERENTIALS__ - ray.dP = sd.dP; - ray.dD = bsdf_domega_in; -#endif - -#ifdef __VOLUME__ - /* enter/exit volume */ - if(label & LABEL_TRANSMIT) - kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack); -#endif - } -#ifdef __VOLUME__ - else if(sd.flag & SD_HAS_ONLY_VOLUME) { - /* no surface shader but have a volume shader? act transparent */ + /* direct lighting */ + kernel_path_surface_connect_light(kg, rng, &sd, throughput, &state, &L); - /* update path state, count as transparent */ - path_state_next(kg, &state, LABEL_TRANSPARENT); - - /* setup ray position, direction stays unchanged */ - ray.P = ray_offset(sd.P, -sd.Ng); -#ifdef __RAY_DIFFERENTIALS__ - ray.dP = sd.dP; -#endif - - /* enter/exit volume */ - kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack); - } -#endif - else { - /* no bsdf or volume? 
we're done */ + /* compute direct lighting and next bounce */ + if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, &L, &ray)) break; - } - - /* adjust ray distance for clipping */ - if(state.bounce == 0) - ray.t -= sd.ray_length; /* clipping works through transparent */ - else - ray.t = FLT_MAX; } float3 L_sum = path_radiance_clamp_and_sum(kg, &L); kernel_write_light_passes(kg, buffer, &L, sample); +#ifdef __KERNEL_DEBUG__ + kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample); +#endif + return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent); } #ifdef __BRANCHED_PATH__ -ccl_device_noinline void kernel_branched_path_integrate_lighting(KernelGlobals *kg, +/* branched path tracing: bounce off surface and integrate indirect light */ +ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg, RNG *rng, ShaderData *sd, float3 throughput, float num_samples_adjust, - PathState *state, PathRadiance *L, ccl_global float *buffer) + PathState *state, PathRadiance *L) { -#ifdef __EMISSION__ - if(kernel_data.integrator.use_direct_light) { - bool all = kernel_data.integrator.sample_all_lights_direct; - kernel_branched_path_integrate_direct_lighting(kg, rng, sd, state, throughput, num_samples_adjust, L, all); - } -#endif - for(int i = 0; i< sd->num_closure; i++) { const ShaderClosure *sc = &sd->closure[i]; @@ -980,68 +764,102 @@ ccl_device_noinline void kernel_branched_path_integrate_lighting(KernelGlobals * RNG bsdf_rng = cmj_hash(*rng, i); for(int j = 0; j < num_samples; j++) { - /* sample BSDF */ - float bsdf_pdf; - BsdfEval bsdf_eval; - float3 bsdf_omega_in; - differential3 bsdf_domega_in; - float bsdf_u, bsdf_v; - path_branched_rng_2D(kg, &bsdf_rng, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v); - int label; - - label = shader_bsdf_sample_closure(kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval, - &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf); + PathState ps = *state; + float3 tp = throughput; + Ray bsdf_ray; 
- if(bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval)) + if(!kernel_branched_path_surface_bounce(kg, &bsdf_rng, sd, sc, j, num_samples, &tp, &ps, L, &bsdf_ray)) continue; - /* modify throughput */ - float3 tp = throughput; - path_radiance_bsdf_bounce(L, &tp, &bsdf_eval, bsdf_pdf, state->bounce, label); + kernel_path_indirect(kg, rng, bsdf_ray, tp*num_samples_inv, num_samples, ps, L); - /* modify path state */ - PathState ps = *state; - path_state_next(kg, &ps, label); + /* for render passes, sum and reset indirect light pass variables + * for the next samples */ + path_radiance_sum_indirect(L); + path_radiance_reset_indirect(L); + } + } +} - /* setup ray */ - Ray bsdf_ray; +#ifdef __SUBSURFACE__ +ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, + ShaderData *sd, + PathRadiance *L, + PathState *state, + RNG *rng, + Ray *ray, + float3 throughput) +{ + for(int i = 0; i< sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; - bsdf_ray.P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? 
-sd->Ng: sd->Ng); - bsdf_ray.D = bsdf_omega_in; - bsdf_ray.t = FLT_MAX; -#ifdef __RAY_DIFFERENTIALS__ - bsdf_ray.dP = sd->dP; - bsdf_ray.dD = bsdf_domega_in; -#endif -#ifdef __OBJECT_MOTION__ - bsdf_ray.time = sd->time; -#endif + if(!CLOSURE_IS_BSSRDF(sc->type)) + continue; + + /* set up random number generator */ + uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb); + int num_samples = kernel_data.integrator.subsurface_samples; + float num_samples_inv = 1.0f/num_samples; + RNG bssrdf_rng = cmj_hash(*rng, i); + + state->flag |= PATH_RAY_BSSRDF_ANCESTOR; + /* do subsurface scatter step with copy of shader data, this will + * replace the BSSRDF with a diffuse BSDF closure */ + for(int j = 0; j < num_samples; j++) { + ShaderData bssrdf_sd[BSSRDF_MAX_HITS]; + float bssrdf_u, bssrdf_v; + path_branched_rng_2D(kg, &bssrdf_rng, state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); + int num_hits = subsurface_scatter_multi_step(kg, sd, bssrdf_sd, state->flag, sc, &lcg_state, bssrdf_u, bssrdf_v, true); #ifdef __VOLUME__ - /* enter/exit volume */ - if(label & LABEL_TRANSMIT) - kernel_volume_stack_enter_exit(kg, sd, ps.volume_stack); + Ray volume_ray = *ray; + bool need_update_volume_stack = kernel_data.integrator.use_volumes && + sd->flag & SD_OBJECT_INTERSECTS_VOLUME; #endif - /* branch RNG state */ - path_state_branch(&ps, j, num_samples); + /* compute lighting with the BSDF closure */ + for(int hit = 0; hit < num_hits; hit++) { + PathState hit_state = *state; - /* set MIS state */ - ps.min_ray_pdf = fminf(bsdf_pdf, FLT_MAX); - ps.ray_pdf = bsdf_pdf; -#ifdef __LAMP_MIS__ - ps.ray_t = 0.0f; + path_state_branch(&hit_state, j, num_samples); + +#ifdef __VOLUME__ + if(need_update_volume_stack) { + /* Setup ray from previous surface point to the new one. 
*/ + float3 P = ray_offset(bssrdf_sd[hit].P, -bssrdf_sd[hit].Ng); + volume_ray.D = normalize_len(P - volume_ray.P, + &volume_ray.t); + + kernel_path_subsurface_update_volume_stack( + kg, + &volume_ray, + hit_state.volume_stack); + + /* Move volume ray forward. */ + volume_ray.P = P; + } #endif - kernel_path_indirect(kg, rng, bsdf_ray, tp*num_samples_inv, num_samples, ps, L); +#if defined(__EMISSION__) && defined(__BRANCHED_PATH__) + /* direct light */ + if(kernel_data.integrator.use_direct_light) { + bool all = kernel_data.integrator.sample_all_lights_direct; + kernel_branched_path_surface_connect_light(kg, rng, + &bssrdf_sd[hit], &hit_state, throughput, num_samples_inv, L, all); + } +#endif - /* for render passes, sum and reset indirect light pass variables - * for the next samples */ - path_radiance_sum_indirect(L); - path_radiance_reset_indirect(L); + /* indirect light */ + kernel_branched_path_surface_indirect_light(kg, rng, + &bssrdf_sd[hit], throughput, num_samples_inv, + &hit_state, L); + } } + + state->flag &= ~PATH_RAY_BSSRDF_ANCESTOR; } } +#endif ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer) { @@ -1053,7 +871,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in path_radiance_init(&L, kernel_data.film.use_light_pass); PathState state; - path_state_init(kg, &state, rng, sample); + path_state_init(kg, &state, rng, sample, &ray); + +#ifdef __KERNEL_DEBUG__ + DebugData debug_data; + debug_data_init(&debug_data); +#endif for(;;) { /* intersect scene */ @@ -1077,7 +900,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in bool hit = scene_intersect(kg, &ray, visibility, &isect, &lcg_state, difl, extmax); #else - bool hit = scene_intersect(kg, &ray, visibility, &isect); + bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f); +#endif + +#ifdef __KERNEL_DEBUG__ + if(state.flag & PATH_RAY_CAMERA) { + 
debug_data.num_bvh_traversal_steps += isect.num_traversal_steps; + } #endif #ifdef __VOLUME__ @@ -1085,10 +914,11 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in if(state.volume_stack[0].shader != SHADER_NONE) { Ray volume_ray = ray; volume_ray.t = (hit)? isect.t: FLT_MAX; + + bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack); -#ifdef __KERNEL_CPU__ +#ifdef __VOLUME_DECOUPLED__ /* decoupled ray marching only supported on CPU */ - bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack); /* cache steps along volume for repeated sampling */ VolumeSegment volume_segment; @@ -1098,29 +928,45 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in kernel_volume_decoupled_record(kg, &state, &volume_ray, &volume_sd, &volume_segment, heterogeneous); - /* sample scattering */ - int num_samples = kernel_data.integrator.volume_samples; - float num_samples_inv = 1.0f/num_samples; + /* direct light sampling */ + if(volume_segment.closure_flag & SD_SCATTER) { + volume_segment.sampling_method = volume_stack_sampling_method(kg, state.volume_stack); - for(int j = 0; j < num_samples; j++) { - /* workaround to fix correlation bug in T38710, can find better solution - * in random number generator later, for now this is done here to not impact - * performance of rendering without volumes */ - RNG tmp_rng = cmj_hash(*rng, state.rng_offset); + bool all = kernel_data.integrator.sample_all_lights_direct; - PathState ps = state; - Ray pray = ray; - float3 tp = throughput; + kernel_branched_path_volume_connect_light(kg, rng, &volume_sd, + throughput, &state, &L, 1.0f, all, &volume_ray, &volume_segment); - /* branch RNG state */ - path_state_branch(&ps, j, num_samples); + /* indirect light sampling */ + int num_samples = kernel_data.integrator.volume_samples; + float num_samples_inv = 1.0f/num_samples; - VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, - &ps, 
&volume_ray, &volume_sd, &tp, &tmp_rng, &volume_segment); - - if(result == VOLUME_PATH_SCATTERED) { - /* todo: use all-light sampling */ - if(kernel_path_integrate_scatter_lighting(kg, rng, &volume_sd, &tp, &ps, &L, &pray, num_samples_inv)) { + for(int j = 0; j < num_samples; j++) { + /* workaround to fix correlation bug in T38710, can find better solution + * in random number generator later, for now this is done here to not impact + * performance of rendering without volumes */ + RNG tmp_rng = cmj_hash(*rng, state.rng_offset); + + PathState ps = state; + Ray pray = ray; + float3 tp = throughput; + + /* branch RNG state */ + path_state_branch(&ps, j, num_samples); + + /* scatter sample. if we use distance sampling and take just one + * sample for direct and indirect light, we could share this + * computation, but makes code a bit complex */ + float rphase = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_PHASE); + float rscatter = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_SCATTER_DISTANCE); + + VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, + &ps, &pray, &volume_sd, &tp, rphase, rscatter, &volume_segment, NULL, false); + + (void)result; + kernel_assert(result == VOLUME_PATH_SCATTERED); + + if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray)) { kernel_path_indirect(kg, rng, pray, tp*num_samples_inv, num_samples, ps, &L); /* for render passes, sum and reset indirect light pass variables @@ -1150,18 +996,22 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in PathState ps = state; Ray pray = ray; ShaderData volume_sd; - float3 tp = throughput; + float3 tp = throughput * num_samples_inv; /* branch RNG state */ path_state_branch(&ps, j, num_samples); - VolumeIntegrateResult result = kernel_volume_integrate(kg, &ps, - &volume_sd, &volume_ray, &L, &tp, rng); + VolumeIntegrateResult result = kernel_volume_integrate( + kg, &ps, &volume_sd, &volume_ray, &L, &tp, rng, heterogeneous); 
+#ifdef __VOLUME_SCATTER__ if(result == VOLUME_PATH_SCATTERED) { - /* todo: use all-light sampling */ - if(kernel_path_integrate_scatter_lighting(kg, rng, &volume_sd, &tp, &ps, &L, &pray, num_samples_inv)) { - kernel_path_indirect(kg, rng, pray, tp*num_samples_inv, num_samples, ps, &L); + /* todo: support equiangular, MIS and all light sampling. + * alternatively get decoupled ray marching working on the GPU */ + kernel_path_volume_connect_light(kg, rng, &volume_sd, tp, &state, &L); + + if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray)) { + kernel_path_indirect(kg, rng, pray, tp, num_samples, ps, &L); /* for render passes, sum and reset indirect light pass variables * for the next samples */ @@ -1169,6 +1019,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in path_radiance_reset_indirect(&L); } } +#endif } /* todo: avoid this calculation using decoupled ray marching */ @@ -1205,7 +1056,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in /* holdout */ #ifdef __HOLDOUT__ - if((sd.flag & (SD_HOLDOUT|SD_HOLDOUT_MASK))) { + if(sd.flag & (SD_HOLDOUT|SD_HOLDOUT_MASK)) { if(kernel_data.background.transparent) { float3 holdout_weight; @@ -1245,7 +1096,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in break; } else if(probability != 1.0f) { - float terminate = path_state_rng_1D(kg, rng, &state, PRNG_TERMINATE); + float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE); if(terminate >= probability) break; @@ -1257,90 +1108,33 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in #ifdef __AO__ /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) { - int num_samples = kernel_data.integrator.ao_samples; - float num_samples_inv = 1.0f/num_samples; - float ao_factor = kernel_data.background.ao_factor; - float3 ao_N; - float3 ao_bsdf = shader_bsdf_ao(kg, &sd, 
ao_factor, &ao_N); - float3 ao_alpha = shader_bsdf_alpha(kg, &sd); - - for(int j = 0; j < num_samples; j++) { - float bsdf_u, bsdf_v; - path_branched_rng_2D(kg, rng, &state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v); - - float3 ao_D; - float ao_pdf; - - sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); - - if(dot(sd.Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { - Ray light_ray; - float3 ao_shadow; - - light_ray.P = ray_offset(sd.P, sd.Ng); - light_ray.D = ao_D; - light_ray.t = kernel_data.background.ao_distance; -#ifdef __OBJECT_MOTION__ - light_ray.time = sd.time; -#endif - light_ray.dP = sd.dP; - light_ray.dD = differential3_zero(); - - if(!shadow_blocked(kg, &state, &light_ray, &ao_shadow)) - path_radiance_accum_ao(&L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state.bounce); - } - } + kernel_branched_path_ao(kg, &sd, &L, &state, rng, throughput); } #endif #ifdef __SUBSURFACE__ /* bssrdf scatter to a different location on the same object */ if(sd.flag & SD_BSSRDF) { - for(int i = 0; i< sd.num_closure; i++) { - ShaderClosure *sc = &sd.closure[i]; - - if(!CLOSURE_IS_BSSRDF(sc->type)) - continue; - - /* set up random number generator */ - uint lcg_state = lcg_state_init(rng, &state, 0x68bc21eb); - int num_samples = kernel_data.integrator.subsurface_samples; - float num_samples_inv = 1.0f/num_samples; - RNG bssrdf_rng = cmj_hash(*rng, i); - - state.flag |= PATH_RAY_BSSRDF_ANCESTOR; - - /* do subsurface scatter step with copy of shader data, this will - * replace the BSSRDF with a diffuse BSDF closure */ - for(int j = 0; j < num_samples; j++) { - ShaderData bssrdf_sd[BSSRDF_MAX_HITS]; - float bssrdf_u, bssrdf_v; - path_branched_rng_2D(kg, &bssrdf_rng, &state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); - int num_hits = subsurface_scatter_multi_step(kg, &sd, bssrdf_sd, state.flag, sc, &lcg_state, bssrdf_u, bssrdf_v, true); - - /* compute lighting with the BSDF closure */ - for(int hit = 0; hit < num_hits; hit++) { - PathState hit_state = 
state; - - path_state_branch(&hit_state, j, num_samples); - - kernel_branched_path_integrate_lighting(kg, rng, - &bssrdf_sd[hit], throughput, num_samples_inv, - &hit_state, &L, buffer); - } - } - - state.flag &= ~PATH_RAY_BSSRDF_ANCESTOR; - } + kernel_branched_path_subsurface_scatter(kg, &sd, &L, &state, + rng, &ray, throughput); } #endif if(!(sd.flag & SD_HAS_ONLY_VOLUME)) { PathState hit_state = state; - /* lighting */ - kernel_branched_path_integrate_lighting(kg, rng, - &sd, throughput, 1.0f, &hit_state, &L, buffer); +#ifdef __EMISSION__ + /* direct light */ + if(kernel_data.integrator.use_direct_light) { + bool all = kernel_data.integrator.sample_all_lights_direct; + kernel_branched_path_surface_connect_light(kg, rng, + &sd, &hit_state, throughput, 1.0f, &L, all); + } +#endif + + /* indirect light */ + kernel_branched_path_surface_indirect_light(kg, rng, + &sd, throughput, 1.0f, &hit_state, &L); /* continue in case of transparency */ throughput *= shader_bsdf_transparency(kg, &sd); @@ -1353,6 +1147,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in ray.P = ray_offset(sd.P, -sd.Ng); ray.t -= sd.ray_length; /* clipping works through transparent */ + +#ifdef __RAY_DIFFERENTIALS__ + ray.dP = sd.dP; + ray.dD.dx = -sd.dI.dx; + ray.dD.dy = -sd.dI.dy; +#endif + #ifdef __VOLUME__ /* enter/exit volume */ kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack); @@ -1363,6 +1164,10 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in kernel_write_light_passes(kg, buffer, &L, sample); +#ifdef __KERNEL_DEBUG__ + kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample); +#endif + return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent); } @@ -1372,11 +1177,8 @@ ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg, ccl_global uin { float filter_u; float filter_v; -#ifdef __CMJ__ + int num_samples = kernel_data.integrator.aa_samples; -#else - int num_samples = 0; -#endif 
path_rng_init(kg, rng_state, sample, num_samples, rng, x, y, &filter_u, &filter_v); diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h index 406654c1741..f29168642a4 100644 --- a/intern/cycles/kernel/kernel_path_state.h +++ b/intern/cycles/kernel/kernel_path_state.h @@ -16,17 +16,13 @@ CCL_NAMESPACE_BEGIN -ccl_device_inline void path_state_init(KernelGlobals *kg, PathState *state, RNG *rng, int sample) +ccl_device_inline void path_state_init(KernelGlobals *kg, PathState *state, RNG *rng, int sample, Ray *ray) { - state->flag = PATH_RAY_CAMERA|PATH_RAY_SINGULAR|PATH_RAY_MIS_SKIP; + state->flag = PATH_RAY_CAMERA|PATH_RAY_MIS_SKIP; state->rng_offset = PRNG_BASE_NUM; state->sample = sample; -#ifdef __CMJ__ state->num_samples = kernel_data.integrator.aa_samples; -#else - state->num_samples = 0; -#endif state->bounce = 0; state->diffuse_bounce = 0; @@ -45,7 +41,7 @@ ccl_device_inline void path_state_init(KernelGlobals *kg, PathState *state, RNG if(kernel_data.integrator.use_volumes) { /* initialize volume stack with volume we are inside of */ - kernel_volume_stack_init(kg, state->volume_stack); + kernel_volume_stack_init(kg, ray, state->volume_stack); /* seed RNG for cases where we can't use stratified samples */ state->rng_congruential = lcg_init(*rng + sample*0x51633e2d); } @@ -63,8 +59,8 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, PathState *state, int state->flag |= PATH_RAY_TRANSPARENT; state->transparent_bounce++; - /* random number generator next bounce */ - state->rng_offset += PRNG_BOUNCE_NUM; + /* don't increase random number generator offset here, to avoid some + * unwanted patterns, see path_state_rng_1D_for_decision */ if(!kernel_data.integrator.transparent_shadows) state->flag |= PATH_RAY_MIS_SKIP; diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h new file mode 100644 index 00000000000..9553c2da0df --- /dev/null +++ 
b/intern/cycles/kernel/kernel_path_surface.h @@ -0,0 +1,299 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License + */ + +CCL_NAMESPACE_BEGIN + +#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__) + +/* branched path tracing: connect path directly to position on one or more lights and add it to L */ +ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RNG *rng, + ShaderData *sd, PathState *state, float3 throughput, float num_samples_adjust, PathRadiance *L, bool sample_all_lights) +{ +#ifdef __EMISSION__ + /* sample illumination from lights to find path contribution */ + if(!(sd->flag & SD_BSDF_HAS_EVAL)) + return; + + Ray light_ray; + BsdfEval L_light; + bool is_lamp; + +#ifdef __OBJECT_MOTION__ + light_ray.time = sd->time; +#endif + + if(sample_all_lights) { + /* lamp sampling */ + for(int i = 0; i < kernel_data.integrator.num_all_lights; i++) { + int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i)); + float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights); + RNG lamp_rng = cmj_hash(*rng, i); + + if(kernel_data.integrator.pdf_triangles != 0.0f) + num_samples_inv *= 0.5f; + + for(int j = 0; j < num_samples; j++) { + float light_u, light_v; + path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); + + LightSample ls; + lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls); + + 
if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { + /* trace shadow ray */ + float3 shadow; + + if(!shadow_blocked(kg, state, &light_ray, &shadow)) { + /* accumulate */ + path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + } + } + } + } + + /* mesh light sampling */ + if(kernel_data.integrator.pdf_triangles != 0.0f) { + int num_samples = ceil_to_int(num_samples_adjust*kernel_data.integrator.mesh_light_samples); + float num_samples_inv = num_samples_adjust/num_samples; + + if(kernel_data.integrator.num_all_lights) + num_samples_inv *= 0.5f; + + for(int j = 0; j < num_samples; j++) { + float light_t = path_branched_rng_1D(kg, rng, state, j, num_samples, PRNG_LIGHT); + float light_u, light_v; + path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); + + /* only sample triangle lights */ + if(kernel_data.integrator.num_all_lights) + light_t = 0.5f*light_t; + + LightSample ls; + light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls); + + if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { + /* trace shadow ray */ + float3 shadow; + + if(!shadow_blocked(kg, state, &light_ray, &shadow)) { + /* accumulate */ + path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + } + } + } + } + } + else { + /* sample one light at random */ + float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); + float light_u, light_v; + path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); + + LightSample ls; + light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls); + + /* sample random light */ + if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { + /* trace shadow ray */ + float3 shadow; + + if(!shadow_blocked(kg, state, 
&light_ray, &shadow)) { + /* accumulate */ + path_radiance_accum_light(L, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp); + } + } + } +#endif +} + +/* branched path tracing: bounce off or through surface to with new direction stored in ray */ +ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng, + ShaderData *sd, const ShaderClosure *sc, int sample, int num_samples, + float3 *throughput, PathState *state, PathRadiance *L, Ray *ray) +{ + /* sample BSDF */ + float bsdf_pdf; + BsdfEval bsdf_eval; + float3 bsdf_omega_in; + differential3 bsdf_domega_in; + float bsdf_u, bsdf_v; + path_branched_rng_2D(kg, rng, state, sample, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v); + int label; + + label = shader_bsdf_sample_closure(kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval, + &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf); + + if(bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval)) + return false; + + /* modify throughput */ + path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label); + + /* modify path state */ + path_state_next(kg, state, label); + + /* setup ray */ + ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? 
-sd->Ng: sd->Ng); + ray->D = bsdf_omega_in; + ray->t = FLT_MAX; +#ifdef __RAY_DIFFERENTIALS__ + ray->dP = sd->dP; + ray->dD = bsdf_domega_in; +#endif +#ifdef __OBJECT_MOTION__ + ray->time = sd->time; +#endif + +#ifdef __VOLUME__ + /* enter/exit volume */ + if(label & LABEL_TRANSMIT) + kernel_volume_stack_enter_exit(kg, sd, state->volume_stack); +#endif + + /* branch RNG state */ + path_state_branch(state, sample, num_samples); + + /* set MIS state */ + state->min_ray_pdf = fminf(bsdf_pdf, FLT_MAX); + state->ray_pdf = bsdf_pdf; +#ifdef __LAMP_MIS__ + state->ray_t = 0.0f; +#endif + + return true; +} + +#endif + +/* path tracing: connect path directly to position on a light and add it to L */ +ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG *rng, + ShaderData *sd, float3 throughput, PathState *state, PathRadiance *L) +{ +#ifdef __EMISSION__ + if(!(kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL))) + return; + + /* sample illumination from lights to find path contribution */ + float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); + float light_u, light_v; + path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); + + Ray light_ray; + BsdfEval L_light; + bool is_lamp; + +#ifdef __OBJECT_MOTION__ + light_ray.time = sd->time; +#endif + + LightSample ls; + light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls); + + if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { + /* trace shadow ray */ + float3 shadow; + + if(!shadow_blocked(kg, state, &light_ray, &shadow)) { + /* accumulate */ + path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp); + } + } +#endif +} + +/* path tracing: bounce off or through surface to with new direction stored in ray */ +ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, RNG *rng, + ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, 
Ray *ray) +{ + /* no BSDF? we can stop here */ + if(sd->flag & SD_BSDF) { + /* sample BSDF */ + float bsdf_pdf; + BsdfEval bsdf_eval; + float3 bsdf_omega_in; + differential3 bsdf_domega_in; + float bsdf_u, bsdf_v; + path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); + int label; + + label = shader_bsdf_sample(kg, sd, bsdf_u, bsdf_v, &bsdf_eval, + &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf); + + if(bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval)) + return false; + + /* modify throughput */ + path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label); + + /* set labels */ + if(!(label & LABEL_TRANSPARENT)) { + state->ray_pdf = bsdf_pdf; +#ifdef __LAMP_MIS__ + state->ray_t = 0.0f; +#endif + state->min_ray_pdf = fminf(bsdf_pdf, state->min_ray_pdf); + } + + /* update path state */ + path_state_next(kg, state, label); + + /* setup ray */ + ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng); + ray->D = bsdf_omega_in; + + if(state->bounce == 0) + ray->t -= sd->ray_length; /* clipping works through transparent */ + else + ray->t = FLT_MAX; + +#ifdef __RAY_DIFFERENTIALS__ + ray->dP = sd->dP; + ray->dD = bsdf_domega_in; +#endif + +#ifdef __VOLUME__ + /* enter/exit volume */ + if(label & LABEL_TRANSMIT) + kernel_volume_stack_enter_exit(kg, sd, state->volume_stack); +#endif + return true; + } +#ifdef __VOLUME__ + else if(sd->flag & SD_HAS_ONLY_VOLUME) { + /* no surface shader but have a volume shader? act transparent */ + + /* update path state, count as transparent */ + path_state_next(kg, state, LABEL_TRANSPARENT); + + /* setup ray position, direction stays unchanged */ + ray->P = ray_offset(sd->P, -sd->Ng); +#ifdef __RAY_DIFFERENTIALS__ + ray->dP = sd->dP; +#endif + + /* enter/exit volume */ + kernel_volume_stack_enter_exit(kg, sd, state->volume_stack); + return true; + } +#endif + else { + /* no bsdf or volume? 
*/ + return false; + } +} + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h new file mode 100644 index 00000000000..d8143832294 --- /dev/null +++ b/intern/cycles/kernel/kernel_path_volume.h @@ -0,0 +1,267 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License + */ + +CCL_NAMESPACE_BEGIN + +#ifdef __VOLUME_SCATTER__ + +ccl_device void kernel_path_volume_connect_light(KernelGlobals *kg, RNG *rng, + ShaderData *sd, float3 throughput, PathState *state, PathRadiance *L) +{ +#ifdef __EMISSION__ + if(!kernel_data.integrator.use_direct_light) + return; + + /* sample illumination from lights to find path contribution */ + float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); + float light_u, light_v; + path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); + + Ray light_ray; + BsdfEval L_light; + LightSample ls; + bool is_lamp; + + /* connect to light from given point where shader has been evaluated */ +#ifdef __OBJECT_MOTION__ + light_ray.time = sd->time; +#endif + + light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls); + if(ls.pdf == 0.0f) + return; + + if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { + /* trace shadow ray */ + float3 shadow; + + if(!shadow_blocked(kg, state, &light_ray, &shadow)) { + /* accumulate */ + path_radiance_accum_light(L, 
throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp); + } + } +#endif +} + +#ifdef __KERNEL_GPU__ +ccl_device_noinline +#else +ccl_device +#endif +bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng, + ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray) +{ + /* sample phase function */ + float phase_pdf; + BsdfEval phase_eval; + float3 phase_omega_in; + differential3 phase_domega_in; + float phase_u, phase_v; + path_state_rng_2D(kg, rng, state, PRNG_PHASE_U, &phase_u, &phase_v); + int label; + + label = shader_volume_phase_sample(kg, sd, phase_u, phase_v, &phase_eval, + &phase_omega_in, &phase_domega_in, &phase_pdf); + + if(phase_pdf == 0.0f || bsdf_eval_is_zero(&phase_eval)) + return false; + + /* modify throughput */ + path_radiance_bsdf_bounce(L, throughput, &phase_eval, phase_pdf, state->bounce, label); + + /* set labels */ + state->ray_pdf = phase_pdf; +#ifdef __LAMP_MIS__ + state->ray_t = 0.0f; +#endif + state->min_ray_pdf = fminf(phase_pdf, state->min_ray_pdf); + + /* update path state */ + path_state_next(kg, state, label); + + /* setup ray */ + ray->P = sd->P; + ray->D = phase_omega_in; + ray->t = FLT_MAX; + +#ifdef __RAY_DIFFERENTIALS__ + ray->dP = sd->dP; + ray->dD = phase_domega_in; +#endif + + return true; +} + +ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG *rng, + ShaderData *sd, float3 throughput, PathState *state, PathRadiance *L, + float num_samples_adjust, bool sample_all_lights, Ray *ray, const VolumeSegment *segment) +{ +#ifdef __EMISSION__ + if(!kernel_data.integrator.use_direct_light) + return; + + Ray light_ray; + BsdfEval L_light; + bool is_lamp; + +#ifdef __OBJECT_MOTION__ + light_ray.time = sd->time; +#endif + + if(sample_all_lights) { + /* lamp sampling */ + for(int i = 0; i < kernel_data.integrator.num_all_lights; i++) { + int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i)); + float num_samples_inv = 
num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights); + RNG lamp_rng = cmj_hash(*rng, i); + + if(kernel_data.integrator.pdf_triangles != 0.0f) + num_samples_inv *= 0.5f; + + for(int j = 0; j < num_samples; j++) { + /* sample random position on given light */ + float light_u, light_v; + path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); + + LightSample ls; + lamp_light_sample(kg, i, light_u, light_v, ray->P, &ls); + + float3 tp = throughput; + + /* sample position on volume segment */ + float rphase = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_PHASE); + float rscatter = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_SCATTER_DISTANCE); + + VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, + state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false); + + (void)result; + kernel_assert(result == VOLUME_PATH_SCATTERED); + + /* todo: split up light_sample so we don't have to call it again with new position */ + lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls); + + if(ls.pdf == 0.0f) + continue; + + if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { + /* trace shadow ray */ + float3 shadow; + + if(!shadow_blocked(kg, state, &light_ray, &shadow)) { + /* accumulate */ + path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + } + } + } + } + + /* mesh light sampling */ + if(kernel_data.integrator.pdf_triangles != 0.0f) { + int num_samples = ceil_to_int(num_samples_adjust*kernel_data.integrator.mesh_light_samples); + float num_samples_inv = num_samples_adjust/num_samples; + + if(kernel_data.integrator.num_all_lights) + num_samples_inv *= 0.5f; + + for(int j = 0; j < num_samples; j++) { + /* sample random position on random triangle */ + float light_t = path_branched_rng_1D_for_decision(kg, rng, state, j, 
num_samples, PRNG_LIGHT); + float light_u, light_v; + path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); + + /* only sample triangle lights */ + if(kernel_data.integrator.num_all_lights) + light_t = 0.5f*light_t; + + LightSample ls; + light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, &ls); + + float3 tp = throughput; + + /* sample position on volume segment */ + float rphase = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_PHASE); + float rscatter = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_SCATTER_DISTANCE); + + VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, + state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false); + + (void)result; + kernel_assert(result == VOLUME_PATH_SCATTERED); + + /* todo: split up light_sample so we don't have to call it again with new position */ + light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls); + + if(ls.pdf == 0.0f) + continue; + + if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { + /* trace shadow ray */ + float3 shadow; + + if(!shadow_blocked(kg, state, &light_ray, &shadow)) { + /* accumulate */ + path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + } + } + } + } + } + else { + /* sample random position on random light */ + float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); + float light_u, light_v; + path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); + + LightSample ls; + light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, &ls); + + float3 tp = throughput; + + /* sample position on volume segment */ + float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE); + float rscatter = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE); + + VolumeIntegrateResult result = 
kernel_volume_decoupled_scatter(kg, + state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false); + + (void)result; + kernel_assert(result == VOLUME_PATH_SCATTERED); + + /* todo: split up light_sample so we don't have to call it again with new position */ + light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls); + + if(ls.pdf == 0.0f) + return; + + /* sample random light */ + if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { + /* trace shadow ray */ + float3 shadow; + + if(!shadow_blocked(kg, state, &light_ray, &shadow)) { + /* accumulate */ + path_radiance_accum_light(L, tp, &L_light, shadow, 1.0f, state->bounce, is_lamp); + } + } + } +#endif +} + +#endif + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h index 31cb6ff6abd..236f74c0a82 100644 --- a/intern/cycles/kernel/kernel_random.h +++ b/intern/cycles/kernel/kernel_random.h @@ -261,22 +261,41 @@ ccl_device uint lcg_init(uint seed) * For branches in the path we must be careful not to reuse the same number * in a sequence and offset accordingly. */ -ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, RNG *rng, PathState *state, int dimension) +ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension) { return path_rng_1D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension); } -ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, RNG *rng, PathState *state, int dimension, float *fx, float *fy) +ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension) +{ + /* the rng_offset is not increased for transparent bounces. if we do then + * fully transparent objects can become subtly visible by the different + * sampling patterns used where the transparent object is. 
+ * + * however for some random numbers that will determine if we next bounce + * is transparent we do need to increase the offset to avoid always making + * the same decision */ + int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM; + return path_rng_1D(kg, rng, state->sample, state->num_samples, rng_offset + dimension); +} + +ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension, float *fx, float *fy) { path_rng_2D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension, fx, fy); } -ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, RNG *rng, PathState *state, int branch, int num_branches, int dimension) +ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension) { return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension); } -ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, RNG *rng, PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy) +ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension) +{ + int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM; + return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, rng_offset + dimension); +} + +ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy) { path_rng_2D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension, fx, fy); } @@ -290,7 +309,7 @@ ccl_device_inline void path_state_branch(PathState *state, int branch, int num_b state->num_samples = state->num_samples*num_branches; } 
-ccl_device_inline uint lcg_state_init(RNG *rng, PathState *state, uint scramble) +ccl_device_inline uint lcg_state_init(RNG *rng, const PathState *state, uint scramble) { return lcg_init(*rng + state->rng_offset + state->sample*scramble); } diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h index 58cec090410..db08c328d7e 100644 --- a/intern/cycles/kernel/kernel_shader.h +++ b/intern/cycles/kernel/kernel_shader.h @@ -86,9 +86,8 @@ ccl_device void shader_setup_from_ray(KernelGlobals *kg, ShaderData *sd, #endif if(sd->type & PRIMITIVE_TRIANGLE) { /* static triangle */ - float4 Ns = kernel_tex_fetch(__tri_normal, sd->prim); - float3 Ng = make_float3(Ns.x, Ns.y, Ns.z); - sd->shader = __float_as_int(Ns.w); + float3 Ng = triangle_normal(kg, sd); + sd->shader = kernel_tex_fetch(__tri_shader, sd->prim); /* vectors */ sd->P = triangle_refine(kg, sd, isect, ray); @@ -166,9 +165,8 @@ ccl_device_inline void shader_setup_from_subsurface(KernelGlobals *kg, ShaderDat /* fetch triangle data */ if(sd->type == PRIMITIVE_TRIANGLE) { - float4 Ns = kernel_tex_fetch(__tri_normal, sd->prim); - float3 Ng = make_float3(Ns.x, Ns.y, Ns.z); - sd->shader = __float_as_int(Ns.w); + float3 Ng = triangle_normal(kg, sd); + sd->shader = kernel_tex_fetch(__tri_shader, sd->prim); /* static triangle */ sd->P = triangle_refine_subsurface(kg, sd, isect, ray); @@ -342,7 +340,7 @@ ccl_device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd, float3 P, Ng, I = make_float3(0.0f, 0.0f, 0.0f); int shader; - triangle_point_normal(kg, prim, u, v, &P, &Ng, &shader); + triangle_point_normal(kg, object, prim, u, v, &P, &Ng, &shader); /* force smooth shading for displacement */ shader |= SHADER_SMOOTH_NORMAL; @@ -609,6 +607,9 @@ ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughn ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd) { + if(sd->flag & SD_HAS_ONLY_VOLUME) + return make_float3(1.0f, 1.0f, 1.0f); + 
float3 eval = make_float3(0.0f, 0.0f, 0.0f); for(int i = 0; i< sd->num_closure; i++) { @@ -797,8 +798,8 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, #ifdef __SVM__ svm_eval_nodes(kg, sd, SHADER_TYPE_SURFACE, path_flag); #else - sd->closure.weight = make_float3(0.8f, 0.8f, 0.8f); - sd->closure.N = sd->N; + sd->closure->weight = make_float3(0.8f, 0.8f, 0.8f); + sd->closure->N = sd->N; sd->flag |= bsdf_diffuse_setup(&sd->closure); #endif } @@ -857,7 +858,7 @@ ccl_device_inline void _shader_volume_phase_multi_eval(const ShaderData *sd, con if(phase_pdf != 0.0f) { bsdf_eval_accum(result_eval, sc->type, eval); - sum_pdf += phase_pdf; + sum_pdf += phase_pdf*sc->sample_weight; } sum_sample_weight += sc->sample_weight; @@ -1025,8 +1026,7 @@ ccl_device bool shader_transparent_shadow(KernelGlobals *kg, Intersection *isect #ifdef __HAIR__ if(kernel_tex_fetch(__prim_type, isect->prim) & PRIMITIVE_ALL_TRIANGLE) { #endif - float4 Ns = kernel_tex_fetch(__tri_normal, prim); - shader = __float_as_int(Ns.w); + shader = kernel_tex_fetch(__tri_shader, prim); #ifdef __HAIR__ } else { diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h index ab7524c411a..61954282c28 100644 --- a/intern/cycles/kernel/kernel_shadow.h +++ b/intern/cycles/kernel/kernel_shadow.h @@ -64,18 +64,21 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray * bool blocked; if(kernel_data.integrator.transparent_shadows) { + /* check transparent bounces here, for volume scatter which can do + * lighting before surface path termination is checked */ + if(state->transparent_bounce >= kernel_data.integrator.transparent_max_bounce) + return true; + /* intersect to find an opaque surface, or record all transparent surface hits */ Intersection hits_stack[STACK_MAX_HITS]; - Intersection *hits; + Intersection *hits = hits_stack; uint max_hits = kernel_data.integrator.transparent_max_bounce - state->transparent_bounce - 1; /* prefer to 
use stack but use dynamic allocation if too deep max hits * we need max_hits + 1 storage space due to the logic in * scene_intersect_shadow_all which will first store and then check if * the limit is exceeded */ - if(max_hits + 1 <= STACK_MAX_HITS) - hits = hits_stack; - else + if(max_hits + 1 > STACK_MAX_HITS) hits = (Intersection*)malloc(sizeof(Intersection)*(max_hits + 1)); uint num_hits; @@ -152,7 +155,11 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray * kernel_volume_shadow(kg, &ps, ray, &throughput); #endif - *shadow *= throughput; + *shadow = throughput; + + if(hits != hits_stack) + free(hits); + return is_zero(throughput); } /* free dynamic storage */ @@ -161,11 +168,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray * } else { Intersection isect; -#ifdef __HAIR__ blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f); -#else - blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect); -#endif } #ifdef __VOLUME__ @@ -178,6 +181,8 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray * return blocked; } +#undef STACK_MAX_HITS + #else /* Shadow function to compute how much light is blocked, GPU variation. 
@@ -196,11 +201,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray * return false; Intersection isect; -#ifdef __HAIR__ bool blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f); -#else - bool blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect); -#endif #ifdef __TRANSPARENT_SHADOWS__ if(blocked && kernel_data.integrator.transparent_shadows) { @@ -216,11 +217,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray * if(bounce >= kernel_data.integrator.transparent_max_bounce) return true; -#ifdef __HAIR__ if(!scene_intersect(kg, ray, PATH_RAY_SHADOW_TRANSPARENT, &isect, NULL, 0.0f, 0.0f)) -#else - if(!scene_intersect(kg, ray, PATH_RAY_SHADOW_TRANSPARENT, &isect)) -#endif { #ifdef __VOLUME__ diff --git a/intern/cycles/kernel/kernel_sse2.cpp b/intern/cycles/kernel/kernel_sse2.cpp index 2d5f6091908..740998e8c92 100644 --- a/intern/cycles/kernel/kernel_sse2.cpp +++ b/intern/cycles/kernel/kernel_sse2.cpp @@ -34,7 +34,7 @@ #include "kernel_globals.h" #include "kernel_film.h" #include "kernel_path.h" -#include "kernel_displace.h" +#include "kernel_bake.h" CCL_NAMESPACE_BEGIN @@ -64,9 +64,12 @@ void kernel_cpu_sse2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, floa /* Shader Evaluate */ -void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i) +void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample) { - kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i); + if(type >= SHADER_EVAL_BAKE) + kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample); + else + kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_sse3.cpp b/intern/cycles/kernel/kernel_sse3.cpp index 1062fd0c990..da73a3a1c97 100644 --- a/intern/cycles/kernel/kernel_sse3.cpp +++ 
b/intern/cycles/kernel/kernel_sse3.cpp @@ -36,7 +36,7 @@ #include "kernel_globals.h" #include "kernel_film.h" #include "kernel_path.h" -#include "kernel_displace.h" +#include "kernel_bake.h" CCL_NAMESPACE_BEGIN @@ -66,9 +66,12 @@ void kernel_cpu_sse3_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, floa /* Shader Evaluate */ -void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i) +void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample) { - kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i); + if(type >= SHADER_EVAL_BAKE) + kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample); + else + kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_sse41.cpp b/intern/cycles/kernel/kernel_sse41.cpp index ba3b4887650..5704f60e138 100644 --- a/intern/cycles/kernel/kernel_sse41.cpp +++ b/intern/cycles/kernel/kernel_sse41.cpp @@ -37,7 +37,7 @@ #include "kernel_globals.h" #include "kernel_film.h" #include "kernel_path.h" -#include "kernel_displace.h" +#include "kernel_bake.h" CCL_NAMESPACE_BEGIN @@ -67,9 +67,12 @@ void kernel_cpu_sse41_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, flo /* Shader Evaluate */ -void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i) +void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample) { - kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i); + if(type >= SHADER_EVAL_BAKE) + kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample); + else + kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h index b07075c6c95..ef46b2f707f 100644 --- 
a/intern/cycles/kernel/kernel_textures.h +++ b/intern/cycles/kernel/kernel_textures.h @@ -36,7 +36,7 @@ KERNEL_TEX(float4, texture_float4, __objects) KERNEL_TEX(float4, texture_float4, __objects_vector) /* triangles */ -KERNEL_TEX(float4, texture_float4, __tri_normal) +KERNEL_TEX(uint, texture_uint, __tri_shader) KERNEL_TEX(float4, texture_float4, __tri_vnormal) KERNEL_TEX(float4, texture_float4, __tri_vindex) KERNEL_TEX(float4, texture_float4, __tri_verts) @@ -49,6 +49,7 @@ KERNEL_TEX(float4, texture_float4, __curve_keys) KERNEL_TEX(uint4, texture_uint4, __attributes_map) KERNEL_TEX(float, texture_float, __attributes_float) KERNEL_TEX(float4, texture_float4, __attributes_float3) +KERNEL_TEX(uchar4, texture_uchar4, __attributes_uchar4) /* lights */ KERNEL_TEX(float4, texture_float4, __light_distribution) @@ -172,10 +173,9 @@ KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_095) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_096) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_097) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_098) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_099) /* Kepler and above */ -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_099) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_100) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_101) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_102) @@ -227,7 +227,6 @@ KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_147) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_148) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_149) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_150) -#endif /* packed image (opencl) */ KERNEL_TEX(uchar4, texture_uchar4, __tex_image_packed) diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 11445aa1c93..cfac8d1e905 100644 --- 
a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -38,12 +38,14 @@ CCL_NAMESPACE_BEGIN #define BSSRDF_MIN_RADIUS 1e-8f #define BSSRDF_MAX_HITS 4 -#define BB_DRAPPER 800.0f +#define BB_DRAPER 800.0f #define BB_MAX_TABLE_RANGE 12000.0f #define BB_TABLE_XPOWER 1.5f #define BB_TABLE_YPOWER 5.0f #define BB_TABLE_SPACING 2.0f +#define BECKMANN_TABLE_SIZE 256 + #define TEX_NUM_FLOAT_IMAGES 5 #define SHADER_NONE (~0) @@ -64,6 +66,8 @@ CCL_NAMESPACE_BEGIN #define __SUBSURFACE__ #define __CMJ__ #define __VOLUME__ +#define __VOLUME_DECOUPLED__ +#define __VOLUME_SCATTER__ #define __SHADOW_RECORD_ALL__ #endif @@ -71,10 +75,15 @@ CCL_NAMESPACE_BEGIN #define __KERNEL_SHADING__ #define __KERNEL_ADV_SHADING__ #define __BRANCHED_PATH__ +#define __VOLUME__ +#define __VOLUME_SCATTER__ /* Experimental on GPU */ -//#define __VOLUME__ -//#define __SUBSURFACE__ +#ifdef __KERNEL_CUDA_EXPERIMENTAL__ +#define __SUBSURFACE__ +#define __CMJ__ +#endif + #endif #ifdef __KERNEL_OPENCL__ @@ -101,7 +110,6 @@ CCL_NAMESPACE_BEGIN #define __BACKGROUND_MIS__ #define __LAMP_MIS__ #define __AO__ -#define __ANISOTROPIC__ //#define __CAMERA_MOTION__ //#define __OBJECT_MOTION__ //#define __HAIR__ @@ -132,11 +140,9 @@ CCL_NAMESPACE_BEGIN #ifdef __KERNEL_SHADING__ #define __SVM__ #define __EMISSION__ -#define __PROCEDURAL_TEXTURES__ -#define __IMAGE_TEXTURES__ +#define __TEXTURES__ #define __EXTRA_NODES__ #define __HOLDOUT__ -#define __NORMAL_MAP__ #endif #ifdef __KERNEL_ADV_SHADING__ @@ -146,12 +152,15 @@ CCL_NAMESPACE_BEGIN #define __BACKGROUND_MIS__ #define __LAMP_MIS__ #define __AO__ -#define __ANISOTROPIC__ #define __CAMERA_MOTION__ #define __OBJECT_MOTION__ #define __HAIR__ #endif +#ifdef WITH_CYCLES_DEBUG +# define __KERNEL_DEBUG__ +#endif + /* Random Numbers */ typedef uint RNG; @@ -221,10 +230,9 @@ enum PathTraceDimension { PRNG_PHASE_V = 9, PRNG_PHASE = 10, PRNG_SCATTER_DISTANCE = 11, - PRNG_BOUNCE_NUM = 12, -#else - PRNG_BOUNCE_NUM = 8, #endif + + PRNG_BOUNCE_NUM = 
12, }; enum SamplingPattern { @@ -250,17 +258,17 @@ enum PathRayFlag { PATH_RAY_SHADOW_TRANSPARENT = 256, PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT), - PATH_RAY_CURVE = 512, /* visibility flag to define curve segments*/ + PATH_RAY_CURVE = 512, /* visibility flag to define curve segments */ + PATH_RAY_VOLUME_SCATTER = 1024, /* volume scattering */ /* note that these can use maximum 12 bits, the other are for layers */ - PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512), + PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512|1024), - PATH_RAY_MIS_SKIP = 1024, - PATH_RAY_DIFFUSE_ANCESTOR = 2048, - PATH_RAY_GLOSSY_ANCESTOR = 4096, - PATH_RAY_BSSRDF_ANCESTOR = 8192, - PATH_RAY_SINGLE_PASS_DONE = 16384, - PATH_RAY_VOLUME_SCATTER = 32768, + PATH_RAY_MIS_SKIP = 2048, + PATH_RAY_DIFFUSE_ANCESTOR = 4096, + PATH_RAY_GLOSSY_ANCESTOR = 8192, + PATH_RAY_BSSRDF_ANCESTOR = 16384, + PATH_RAY_SINGLE_PASS_DONE = 32768, /* we need layer member flags to be the 20 upper bits */ PATH_RAY_LAYER_SHIFT = (32-20) @@ -283,32 +291,35 @@ typedef enum ClosureLabel { typedef enum PassType { PASS_NONE = 0, - PASS_COMBINED = 1, - PASS_DEPTH = 2, - PASS_NORMAL = 4, - PASS_UV = 8, - PASS_OBJECT_ID = 16, - PASS_MATERIAL_ID = 32, - PASS_DIFFUSE_COLOR = 64, - PASS_GLOSSY_COLOR = 128, - PASS_TRANSMISSION_COLOR = 256, - PASS_DIFFUSE_INDIRECT = 512, - PASS_GLOSSY_INDIRECT = 1024, - PASS_TRANSMISSION_INDIRECT = 2048, - PASS_DIFFUSE_DIRECT = 4096, - PASS_GLOSSY_DIRECT = 8192, - PASS_TRANSMISSION_DIRECT = 16384, - PASS_EMISSION = 32768, - PASS_BACKGROUND = 65536, - PASS_AO = 131072, - PASS_SHADOW = 262144, - PASS_MOTION = 524288, - PASS_MOTION_WEIGHT = 1048576, - PASS_MIST = 2097152, - PASS_SUBSURFACE_DIRECT = 4194304, - PASS_SUBSURFACE_INDIRECT = 8388608, - PASS_SUBSURFACE_COLOR = 16777216, - PASS_LIGHT = 33554432, /* no real pass, used to force use_light_pass */ + PASS_COMBINED = (1 << 0), + PASS_DEPTH = (1 << 1), + PASS_NORMAL = (1 << 2), + PASS_UV = (1 << 3), + 
PASS_OBJECT_ID = (1 << 4), + PASS_MATERIAL_ID = (1 << 5), + PASS_DIFFUSE_COLOR = (1 << 6), + PASS_GLOSSY_COLOR = (1 << 7), + PASS_TRANSMISSION_COLOR = (1 << 8), + PASS_DIFFUSE_INDIRECT = (1 << 9), + PASS_GLOSSY_INDIRECT = (1 << 10), + PASS_TRANSMISSION_INDIRECT = (1 << 11), + PASS_DIFFUSE_DIRECT = (1 << 12), + PASS_GLOSSY_DIRECT = (1 << 13), + PASS_TRANSMISSION_DIRECT = (1 << 14), + PASS_EMISSION = (1 << 15), + PASS_BACKGROUND = (1 << 16), + PASS_AO = (1 << 17), + PASS_SHADOW = (1 << 18), + PASS_MOTION = (1 << 19), + PASS_MOTION_WEIGHT = (1 << 20), + PASS_MIST = (1 << 21), + PASS_SUBSURFACE_DIRECT = (1 << 22), + PASS_SUBSURFACE_INDIRECT = (1 << 23), + PASS_SUBSURFACE_COLOR = (1 << 24), + PASS_LIGHT = (1 << 25), /* no real pass, used to force use_light_pass */ +#ifdef __KERNEL_DEBUG__ + PASS_BVH_TRAVERSAL_STEPS = (1 << 26), +#endif } PassType; #define PASS_ALL (~0) @@ -330,21 +341,25 @@ typedef struct PathRadiance { float3 color_glossy; float3 color_transmission; float3 color_subsurface; + float3 color_scatter; float3 direct_diffuse; float3 direct_glossy; float3 direct_transmission; float3 direct_subsurface; + float3 direct_scatter; float3 indirect_diffuse; float3 indirect_glossy; float3 indirect_transmission; float3 indirect_subsurface; + float3 indirect_scatter; float3 path_diffuse; float3 path_glossy; float3 path_transmission; float3 path_subsurface; + float3 path_scatter; float4 shadow; float mist; @@ -358,6 +373,7 @@ typedef struct BsdfEval { float3 transmission; float3 transparent; float3 subsurface; + float3 scatter; } BsdfEval; #else @@ -378,7 +394,8 @@ typedef enum ShaderFlag { SHADER_EXCLUDE_GLOSSY = (1 << 26), SHADER_EXCLUDE_TRANSMIT = (1 << 25), SHADER_EXCLUDE_CAMERA = (1 << 24), - SHADER_EXCLUDE_ANY = (SHADER_EXCLUDE_DIFFUSE|SHADER_EXCLUDE_GLOSSY|SHADER_EXCLUDE_TRANSMIT|SHADER_EXCLUDE_CAMERA), + SHADER_EXCLUDE_SCATTER = (1 << 23), + SHADER_EXCLUDE_ANY = 
(SHADER_EXCLUDE_DIFFUSE|SHADER_EXCLUDE_GLOSSY|SHADER_EXCLUDE_TRANSMIT|SHADER_EXCLUDE_CAMERA|SHADER_EXCLUDE_SCATTER), SHADER_MASK = ~(SHADER_SMOOTH_NORMAL|SHADER_CAST_SHADOW|SHADER_AREA_LIGHT|SHADER_USE_MIS|SHADER_EXCLUDE_ANY) } ShaderFlag; @@ -390,10 +407,8 @@ typedef enum LightType { LIGHT_DISTANT, LIGHT_BACKGROUND, LIGHT_AREA, - LIGHT_AO, LIGHT_SPOT, - LIGHT_TRIANGLE, - LIGHT_STRAND + LIGHT_TRIANGLE } LightType; /* Camera Type */ @@ -445,6 +460,10 @@ typedef struct Intersection { int prim; int object; int type; + +#ifdef __KERNEL_DEBUG__ + int num_traversal_steps; +#endif } Intersection; /* Primitives */ @@ -478,6 +497,7 @@ typedef enum AttributeElement { ATTR_ELEMENT_VERTEX, ATTR_ELEMENT_VERTEX_MOTION, ATTR_ELEMENT_CORNER, + ATTR_ELEMENT_CORNER_BYTE, ATTR_ELEMENT_CURVE, ATTR_ELEMENT_CURVE_KEY, ATTR_ELEMENT_CURVE_KEY_MOTION, @@ -519,24 +539,32 @@ typedef enum AttributeStandard { #define MAX_CLOSURE 1 #endif +/* TODO(sergey): This is rather nasty bug happening in here, which + * could be simply a compilers bug for which we can't find a generic + * platform independent workaround. Also even if it's a compiler + * issue, it's not so simple to upgrade the compiler in the release + * environment for linux and doing it so closer to the release is + * rather a risky business. + * + * For this release it's probably safer to stick with such a rather + * dirty solution, and look for a cleaner fix during the next release + * cycle. + */ typedef struct ShaderClosure { ClosureType type; float3 weight; - +#ifndef __APPLE__ float sample_weight; - +#endif float data0; float data1; + float data2; float3 N; -#if defined(__ANISOTROPIC__) || defined(__SUBSURFACE__) || defined(__HAIR__) float3 T; +#ifdef __APPLE__ + float sample_weight; #endif - -#ifdef __HAIR__ - float offset; -#endif - #ifdef __OSL__ void *prim; #endif @@ -563,37 +591,49 @@ typedef enum ShaderContext { enum ShaderDataFlag { /* runtime flags */ - SD_BACKFACING = 1, /* backside of surface? 
*/ - SD_EMISSION = 2, /* have emissive closure? */ - SD_BSDF = 4, /* have bsdf closure? */ - SD_BSDF_HAS_EVAL = 8, /* have non-singular bsdf closure? */ - SD_PHASE_HAS_EVAL = 8, /* have non-singular phase closure? */ - SD_BSDF_GLOSSY = 16, /* have glossy bsdf */ - SD_BSSRDF = 32, /* have bssrdf */ - SD_HOLDOUT = 64, /* have holdout closure? */ - SD_ABSORPTION = 128, /* have volume absorption closure? */ - SD_SCATTER = 256, /* have volume phase closure? */ - SD_AO = 512, /* have ao closure? */ - SD_TRANSPARENT = 1024, /* have transparent closure? */ - - SD_CLOSURE_FLAGS = (SD_EMISSION|SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY|SD_BSSRDF|SD_HOLDOUT|SD_ABSORPTION|SD_SCATTER|SD_AO), + SD_BACKFACING = (1 << 0), /* backside of surface? */ + SD_EMISSION = (1 << 1), /* have emissive closure? */ + SD_BSDF = (1 << 2), /* have bsdf closure? */ + SD_BSDF_HAS_EVAL = (1 << 3), /* have non-singular bsdf closure? */ + SD_PHASE_HAS_EVAL = (1 << 3), /* have non-singular phase closure? */ + SD_BSDF_GLOSSY = (1 << 4), /* have glossy bsdf */ + SD_BSSRDF = (1 << 5), /* have bssrdf */ + SD_HOLDOUT = (1 << 6), /* have holdout closure? */ + SD_ABSORPTION = (1 << 7), /* have volume absorption closure? */ + SD_SCATTER = (1 << 8), /* have volume phase closure? */ + SD_AO = (1 << 9), /* have ao closure? */ + SD_TRANSPARENT = (1 << 10), /* have transparent closure? 
*/ + + SD_CLOSURE_FLAGS = (SD_EMISSION|SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY| + SD_BSSRDF|SD_HOLDOUT|SD_ABSORPTION|SD_SCATTER|SD_AO), /* shader flags */ - SD_USE_MIS = 2048, /* direct light sample */ - SD_HAS_TRANSPARENT_SHADOW = 4096, /* has transparent shadow */ - SD_HAS_VOLUME = 8192, /* has volume shader */ - SD_HAS_ONLY_VOLUME = 16384, /* has only volume shader, no surface */ - SD_HETEROGENEOUS_VOLUME = 32768, /* has heterogeneous volume */ - SD_HAS_BSSRDF_BUMP = 65536, /* bssrdf normal uses bump */ - - SD_SHADER_FLAGS = (SD_USE_MIS|SD_HAS_TRANSPARENT_SHADOW|SD_HAS_VOLUME|SD_HAS_ONLY_VOLUME|SD_HETEROGENEOUS_VOLUME|SD_HAS_BSSRDF_BUMP), + SD_USE_MIS = (1 << 11), /* direct light sample */ + SD_HAS_TRANSPARENT_SHADOW = (1 << 12), /* has transparent shadow */ + SD_HAS_VOLUME = (1 << 13), /* has volume shader */ + SD_HAS_ONLY_VOLUME = (1 << 14), /* has only volume shader, no surface */ + SD_HETEROGENEOUS_VOLUME = (1 << 15), /* has heterogeneous volume */ + SD_HAS_BSSRDF_BUMP = (1 << 16), /* bssrdf normal uses bump */ + SD_VOLUME_EQUIANGULAR = (1 << 17), /* use equiangular sampling */ + SD_VOLUME_MIS = (1 << 18), /* use multiple importance sampling */ + SD_VOLUME_CUBIC = (1 << 19), /* use cubic interpolation for voxels */ + + SD_SHADER_FLAGS = (SD_USE_MIS|SD_HAS_TRANSPARENT_SHADOW|SD_HAS_VOLUME| + SD_HAS_ONLY_VOLUME|SD_HETEROGENEOUS_VOLUME| + SD_HAS_BSSRDF_BUMP|SD_VOLUME_EQUIANGULAR|SD_VOLUME_MIS| + SD_VOLUME_CUBIC), /* object flags */ - SD_HOLDOUT_MASK = 131072, /* holdout for camera rays */ - SD_OBJECT_MOTION = 262144, /* has object motion blur */ - SD_TRANSFORM_APPLIED = 524288, /* vertices have transform applied */ - - SD_OBJECT_FLAGS = (SD_HOLDOUT_MASK|SD_OBJECT_MOTION|SD_TRANSFORM_APPLIED) + SD_HOLDOUT_MASK = (1 << 20), /* holdout for camera rays */ + SD_OBJECT_MOTION = (1 << 21), /* has object motion blur */ + SD_TRANSFORM_APPLIED = (1 << 22), /* vertices have transform applied */ + SD_NEGATIVE_SCALE_APPLIED = (1 << 23), /* vertices have negative scale 
applied */ + SD_OBJECT_HAS_VOLUME = (1 << 24), /* object has a volume shader */ + SD_OBJECT_INTERSECTS_VOLUME = (1 << 25), /* object intersects AABB of an object with volume shader */ + + SD_OBJECT_FLAGS = (SD_HOLDOUT_MASK|SD_OBJECT_MOTION|SD_TRANSFORM_APPLIED| + SD_NEGATIVE_SCALE_APPLIED|SD_OBJECT_HAS_VOLUME| + SD_OBJECT_INTERSECTS_VOLUME) }; struct KernelGlobals; @@ -686,9 +726,10 @@ typedef struct PathState { int flag; /* random number generator state */ - int rng_offset; /* dimension offset */ - int sample; /* path sample number */ - int num_samples; /* total number of times this path will be sampled */ + int rng_offset; /* dimension offset */ + int rng_offset_bsdf; /* dimension offset for picking bsdf */ + int sample; /* path sample number */ + int num_samples; /* total number of times this path will be sampled */ /* bounce counting */ int bounce; @@ -756,9 +797,12 @@ typedef struct KernelCamera { /* render size */ float width, height; int resolution; - int pad1; + + /* anamorphic lens bokeh */ + float inv_aperture_ratio; + + int is_inside_volume; int pad2; - int pad3; /* more matrices */ Transform screentoworld; @@ -819,6 +863,11 @@ typedef struct KernelFilm { float mist_start; float mist_inv_depth; float mist_falloff; + +#ifdef __KERNEL_DEBUG__ + int pass_bvh_traversal_steps; + int pass_pad3, pass_pad4, pass_pad5; +#endif } KernelFilm; typedef struct KernelBackground { @@ -860,7 +909,8 @@ typedef struct KernelIntegrator { int transparent_shadows; /* caustics */ - int no_caustics; + int caustics_reflective; + int caustics_refractive; float filter_glossy; /* seed */ @@ -892,7 +942,6 @@ typedef struct KernelIntegrator { int aa_samples; /* volume render */ - int volume_homogeneous_sampling; int use_volumes; int volume_max_steps; float volume_step_size; @@ -922,7 +971,6 @@ typedef enum CurveFlag { } CurveFlag; typedef struct KernelCurves { - /* strand intersect and normal parameters - many can be changed to flags */ int curveflags; int subdivisions; @@ -930,11 
+978,11 @@ typedef struct KernelCurves { float maximum_width; } KernelCurves; -typedef struct KernelBlackbody { - int table_offset; - int pad1, pad2, pad3; -} KernelBlackbody; - +typedef struct KernelTables { + int blackbody_offset; + int beckmann_offset; + int pad1, pad2; +} KernelTables; typedef struct KernelData { KernelCamera cam; @@ -943,9 +991,17 @@ typedef struct KernelData { KernelIntegrator integrator; KernelBVH bvh; KernelCurves curve; - KernelBlackbody blackbody; + KernelTables tables; } KernelData; +#ifdef __KERNEL_DEBUG__ +typedef struct DebugData { + // Total number of BVH node travesal steps and primitives intersections + // for the camera rays. + int num_bvh_traversal_steps; +} DebugData; +#endif + CCL_NAMESPACE_END #endif /* __KERNEL_TYPES_H__ */ diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h index faaa68e3309..ce20f20e75a 100644 --- a/intern/cycles/kernel/kernel_volume.h +++ b/intern/cycles/kernel/kernel_volume.h @@ -116,6 +116,36 @@ ccl_device bool volume_stack_is_heterogeneous(KernelGlobals *kg, VolumeStack *st return false; } +ccl_device int volume_stack_sampling_method(KernelGlobals *kg, VolumeStack *stack) +{ + if(kernel_data.integrator.num_all_lights == 0) + return 0; + + int method = -1; + + for(int i = 0; stack[i].shader != SHADER_NONE; i++) { + int shader_flag = kernel_tex_fetch(__shader_flag, (stack[i].shader & SHADER_MASK)*2); + + if(shader_flag & SD_VOLUME_MIS) { + return SD_VOLUME_MIS; + } + else if(shader_flag & SD_VOLUME_EQUIANGULAR) { + if(method == 0) + return SD_VOLUME_MIS; + + method = SD_VOLUME_EQUIANGULAR; + } + else { + if(method == SD_VOLUME_EQUIANGULAR) + return SD_VOLUME_MIS; + + method = 0; + } + } + + return method; +} + /* Volume Shadows * * These functions are used to attenuate shadow rays to lights. 
Both absorption @@ -136,7 +166,7 @@ ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg, PathState *s ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, float3 *throughput) { float3 tp = *throughput; - const float tp_eps = 1e-10f; /* todo: this is likely not the right value */ + const float tp_eps = 1e-6f; /* todo: this is likely not the right value */ /* prepare for stepping */ int max_steps = kernel_data.integrator.volume_max_steps; @@ -146,6 +176,8 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState /* compute extinction at the start */ float t = 0.0f; + float3 sum = make_float3(0.0f, 0.0f, 0.0f); + for(int i = 0; i < max_steps; i++) { /* advance to new position */ float new_t = min(ray->t, (i+1) * step); @@ -160,20 +192,26 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState /* compute attenuation over segment */ if(volume_shader_extinction_sample(kg, sd, state, new_P, &sigma_t)) { - /* todo: we could avoid computing expf() for each step by summing, - * because exp(a)*exp(b) = exp(a+b), but we still want a quick - * tp_eps check too */ - tp *= volume_color_transmittance(sigma_t, new_t - t); - - /* stop if nearly all light blocked */ - if(tp.x < tp_eps && tp.y < tp_eps && tp.z < tp_eps) - break; + /* Compute expf() only for every Nth step, to save some calculations + * because exp(a)*exp(b) = exp(a+b), also do a quick tp_eps check then. */ + + sum += (-sigma_t * (new_t - t)); + if((i & 0x07) == 0) { /* ToDo: Other interval? 
*/ + tp = *throughput * make_float3(expf(sum.x), expf(sum.y), expf(sum.z)); + + /* stop if nearly all light is blocked */ + if(tp.x < tp_eps && tp.y < tp_eps && tp.z < tp_eps) + break; + } } /* stop if at the end of the volume */ t = new_t; - if(t == ray->t) + if(t == ray->t) { + /* Update throughput in case we haven't done it above */ + tp = *throughput * make_float3(expf(sum.x), expf(sum.y), expf(sum.z)); break; + } } *throughput = tp; @@ -226,33 +264,6 @@ ccl_device float kernel_volume_equiangular_pdf(Ray *ray, float3 light_P, float s return pdf; } -ccl_device bool kernel_volume_equiangular_light_position(KernelGlobals *kg, PathState *state, Ray *ray, RNG *rng, float3 *light_P) -{ - /* light RNGs */ - float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); - float light_u, light_v; - path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); - - /* light sample */ - LightSample ls; - light_sample(kg, light_t, light_u, light_v, ray->time, ray->P, &ls); - if(ls.pdf == 0.0f) - return false; - - *light_P = ls.P; - return true; -} - -ccl_device float kernel_volume_decoupled_equiangular_pdf(KernelGlobals *kg, PathState *state, Ray *ray, RNG *rng, float sample_t) -{ - float3 light_P; - - if(!kernel_volume_equiangular_light_position(kg, state, ray, rng, &light_P)) - return 0.0f; - - return kernel_volume_equiangular_pdf(ray, light_P, sample_t); -} - /* Distance sampling */ ccl_device float kernel_volume_distance_sample(float max_t, float3 sigma_t, int channel, float xi, float3 *transmittance, float3 *pdf) @@ -312,7 +323,7 @@ ccl_device float3 kernel_volume_emission_integrate(VolumeShaderCoefficients *coe * the volume shading coefficient for the entire line segment */ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput, - RNG *rng) + RNG *rng, bool probalistic_scatter) { VolumeShaderCoefficients coeff; @@ -323,6 +334,7 @@ ccl_device 
VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba float t = ray->t; float3 new_tp; +#ifdef __VOLUME_SCATTER__ /* randomly scatter, and if we do t is shortened */ if(closure_flag & SD_SCATTER) { /* extinction coefficient */ @@ -330,43 +342,41 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba /* pick random color channel, we use the Veach one-sample * model with balance heuristic for the channels */ - float rphase = path_state_rng_1D(kg, rng, state, PRNG_PHASE); + float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE); int channel = (int)(rphase*3.0f); sd->randb_closure = rphase*3.0f - channel; - float xi = path_state_rng_1D(kg, rng, state, PRNG_SCATTER_DISTANCE); - /* decide if we will hit or miss */ - float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel); - float sample_transmittance = expf(-sample_sigma_t * t); + bool scatter = true; + float xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE); + + if(probalistic_scatter) { + float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel); + float sample_transmittance = expf(-sample_sigma_t * t); + + if(1.0f - xi >= sample_transmittance) { + scatter = true; + + /* rescale random number so we can reuse it */ + xi = 1.0f - (1.0f - xi - sample_transmittance)/(1.0f - sample_transmittance); - if(xi >= sample_transmittance) { + } + else + scatter = false; + } + + if(scatter) { /* scattering */ float3 pdf; float3 transmittance; float sample_t; - /* rescale random number so we can reuse it */ - xi = (xi - sample_transmittance)/(1.0f - sample_transmittance); - - if(kernel_data.integrator.volume_homogeneous_sampling == 0 || !kernel_data.integrator.num_all_lights) { - /* distance sampling */ - sample_t = kernel_volume_distance_sample(ray->t, sigma_t, channel, xi, &transmittance, &pdf); - } - else { - /* equiangular sampling */ - float3 light_P; - float equi_pdf; - if(!kernel_volume_equiangular_light_position(kg, 
state, ray, rng, &light_P)) - return VOLUME_PATH_MISSED; - - sample_t = kernel_volume_equiangular_sample(ray, light_P, xi, &equi_pdf); - transmittance = volume_color_transmittance(sigma_t, sample_t); - pdf = make_float3(equi_pdf, equi_pdf, equi_pdf); - } + /* distance sampling */ + sample_t = kernel_volume_distance_sample(ray->t, sigma_t, channel, xi, &transmittance, &pdf); /* modifiy pdf for hit/miss decision */ - pdf *= make_float3(1.0f, 1.0f, 1.0f) - volume_color_transmittance(sigma_t, t); + if(probalistic_scatter) + pdf *= make_float3(1.0f, 1.0f, 1.0f) - volume_color_transmittance(sigma_t, t); new_tp = *throughput * coeff.sigma_s * transmittance / average(pdf); t = sample_t; @@ -378,14 +388,16 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba new_tp = *throughput * transmittance / pdf; } } - else if(closure_flag & SD_ABSORPTION) { + else +#endif + if(closure_flag & SD_ABSORPTION) { /* absorption only, no sampling needed */ float3 transmittance = volume_color_transmittance(coeff.sigma_a, t); new_tp = *throughput * transmittance; } /* integrate emission attenuated by extinction */ - if(closure_flag & SD_EMISSION) { + if(L && (closure_flag & SD_EMISSION)) { float3 sigma_t = coeff.sigma_a + coeff.sigma_s; float3 transmittance = volume_color_transmittance(sigma_t, ray->t); float3 emission = kernel_volume_emission_integrate(&coeff, closure_flag, transmittance, ray->t); @@ -408,13 +420,15 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba return VOLUME_PATH_ATTENUATED; } -/* heterogeneous volume: integrate stepping through the volume until we - * reach the end, get absorbed entirely, or run out of iterations */ -ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlobals *kg, +/* heterogeneous volume distance sampling: integrate stepping through the + * volume until we reach the end, get absorbed entirely, or run out of + * iterations. 
this does probalistically scatter or get transmitted through + * for path tracing where we don't want to branch. */ +ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput, RNG *rng) { float3 tp = *throughput; - const float tp_eps = 1e-10f; /* todo: this is likely not the right value */ + const float tp_eps = 1e-6f; /* todo: this is likely not the right value */ /* prepare for stepping */ int max_steps = kernel_data.integrator.volume_max_steps; @@ -425,9 +439,12 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo float t = 0.0f; float3 accum_transmittance = make_float3(1.0f, 1.0f, 1.0f); - /* cache some constant variables */ - float xi; - int channel = -1; + /* pick random color channel, we use the Veach one-sample + * model with balance heuristic for the channels */ + float xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE); + float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE); + int channel = (int)(rphase*3.0f); + sd->randb_closure = rphase*3.0f - channel; bool has_scatter = false; for(int i = 0; i < max_steps; i++) { @@ -449,25 +466,14 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo float3 transmittance; bool scatter = false; - /* randomly scatter, and if we do dt and new_t are shortened */ + /* distance sampling */ +#ifdef __VOLUME_SCATTER__ if((closure_flag & SD_SCATTER) || (has_scatter && (closure_flag & SD_ABSORPTION))) { has_scatter = true; - /* average sigma_t and sigma_s over segment */ float3 sigma_t = coeff.sigma_a + coeff.sigma_s; float3 sigma_s = coeff.sigma_s; - /* lazily set up variables for sampling */ - if(channel == -1) { - /* pick random color channel, we use the Veach one-sample - * model with balance heuristic for the channels */ - xi = path_state_rng_1D(kg, rng, state, PRNG_SCATTER_DISTANCE); - - float 
rphase = path_state_rng_1D(kg, rng, state, PRNG_PHASE); - channel = (int)(rphase*3.0f); - sd->randb_closure = rphase*3.0f - channel; - } - /* compute transmittance over full step */ transmittance = volume_color_transmittance(sigma_t, dt); @@ -480,10 +486,12 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo float new_dt = -logf(1.0f - xi)/sample_sigma_t; new_t = t + new_dt; - /* transmittance, throughput */ + /* transmittance and pdf */ float3 new_transmittance = volume_color_transmittance(sigma_t, new_dt); - float pdf = average(sigma_t * new_transmittance); - new_tp = tp * sigma_s * new_transmittance / pdf; + float3 pdf = sigma_t * new_transmittance; + + /* throughput */ + new_tp = tp * sigma_s * new_transmittance / average(pdf); scatter = true; } else { @@ -495,7 +503,9 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo xi = 1.0f - (1.0f - xi)/sample_transmittance; } } - else if(closure_flag & SD_ABSORPTION) { + else +#endif + if(closure_flag & SD_ABSORPTION) { /* absorption only, no sampling needed */ float3 sigma_a = coeff.sigma_a; @@ -504,7 +514,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo } /* integrate emission attenuated by absorption */ - if(closure_flag & SD_EMISSION) { + if(L && (closure_flag & SD_EMISSION)) { float3 emission = kernel_volume_emission_integrate(&coeff, closure_flag, transmittance, dt); path_radiance_accum_emission(L, tp, emission, state->bounce); } @@ -518,19 +528,19 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo tp = make_float3(0.0f, 0.0f, 0.0f); break; } + } - /* prepare to scatter to new direction */ - if(scatter) { - /* adjust throughput and move to new location */ - sd->P = ray->P + new_t*ray->D; - *throughput = tp; + /* prepare to scatter to new direction */ + if(scatter) { + /* adjust throughput and move to new location */ + sd->P = ray->P + new_t*ray->D; + *throughput = tp; - return 
VOLUME_PATH_SCATTERED; - } - else { - /* accumulate transmittance */ - accum_transmittance *= transmittance; - } + return VOLUME_PATH_SCATTERED; + } + else { + /* accumulate transmittance */ + accum_transmittance *= transmittance; } } @@ -545,14 +555,34 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo return VOLUME_PATH_ATTENUATED; } +/* get the volume attenuation and emission over line segment defined by + * ray, with the assumption that there are no surfaces blocking light + * between the endpoints. distance sampling is used to decide if we will + * scatter or not. */ +ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals *kg, + PathState *state, ShaderData *sd, Ray *ray, PathRadiance *L, float3 *throughput, RNG *rng, bool heterogeneous) +{ + /* workaround to fix correlation bug in T38710, can find better solution + * in random number generator later, for now this is done here to not impact + * performance of rendering without volumes */ + RNG tmp_rng = cmj_hash(*rng, state->rng_offset); + + shader_setup_from_volume(kg, sd, ray, state->bounce, state->transparent_bounce); + + if(heterogeneous) + return kernel_volume_integrate_heterogeneous_distance(kg, state, ray, sd, L, throughput, &tmp_rng); + else + return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, &tmp_rng, true); +} + /* Decoupled Volume Sampling * * VolumeSegment is list of coefficients and transmittance stored at all steps * through a volume. This can then latter be used for decoupled sampling as in: - * "Importance Sampling Techniques for Path Tracing in Participating Media" */ - -/* CPU only because of malloc/free */ -#ifdef __KERNEL_CPU__ + * "Importance Sampling Techniques for Path Tracing in Participating Media" + * + * On the GPU this is only supported for homogeneous volumes (1 step), due to + * no support for malloc/free and too much stack usage with a fix size array. 
*/ typedef struct VolumeStep { float3 sigma_s; /* scatter coefficient */ @@ -571,6 +601,8 @@ typedef struct VolumeSegment { float3 accum_emission; /* accumulated emission at end of segment */ float3 accum_transmittance; /* accumulated transmittance at end of segment */ + + int sampling_method; /* volume sampling method */ } VolumeSegment; /* record volume steps to the end of the volume. @@ -578,10 +610,12 @@ typedef struct VolumeSegment { * it would be nice if we could only record up to the point that we need to scatter, * but the entire segment is needed to do always scattering, rather than probalistically * hitting or missing the volume. if we don't know the transmittance at the end of the - * volume we can't generate stratitied distance samples up to that transmittance */ + * volume we can't generate stratified distance samples up to that transmittance */ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, VolumeSegment *segment, bool heterogeneous) { + const float tp_eps = 1e-6f; /* todo: this is likely not the right value */ + /* prepare for volume stepping */ int max_steps; float step_size, random_jitter_offset; @@ -608,6 +642,7 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta segment->closure_flag = 0; segment->numsteps = 0; + segment->steps = (VolumeStep*)malloc(sizeof(VolumeStep)*max_steps); VolumeStep *step = segment->steps; @@ -669,6 +704,10 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta t = new_t; if(t == ray->t) break; + + /* stop if nearly all light blocked */ + if(accum_transmittance.x < tp_eps && accum_transmittance.y < tp_eps && accum_transmittance.z < tp_eps) + break; } /* store total emission and transmittance */ @@ -698,35 +737,70 @@ ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *s * scattering, they always scatter if there is any non-zero scattering * coefficient. 
* - * these also do not do emission or modify throughput. */ + * these also do not do emission or modify throughput. + * + * function is expected to return VOLUME_PATH_SCATTERED when probalistic_scatter is false */ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter( KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, - float3 *throughput, RNG *rng, VolumeSegment *segment) + float3 *throughput, float rphase, float rscatter, + const VolumeSegment *segment, const float3 *light_P, bool probalistic_scatter) { - int closure_flag = segment->closure_flag; - - if(!(closure_flag & SD_SCATTER)) - return VOLUME_PATH_MISSED; + kernel_assert(segment->closure_flag & SD_SCATTER); /* pick random color channel, we use the Veach one-sample * model with balance heuristic for the channels */ - float rphase = path_state_rng_1D(kg, rng, state, PRNG_PHASE); int channel = (int)(rphase*3.0f); sd->randb_closure = rphase*3.0f - channel; + float xi = rscatter; - float xi = path_state_rng_1D(kg, rng, state, PRNG_SCATTER_DISTANCE); + /* probalistic scattering decision based on transmittance */ + if(probalistic_scatter) { + float sample_transmittance = kernel_volume_channel_get(segment->accum_transmittance, channel); + + if(1.0f - xi >= sample_transmittance) { + /* rescale random number so we can reuse it */ + xi = 1.0f - (1.0f - xi - sample_transmittance)/(1.0f - sample_transmittance); + } + else { + *throughput /= sample_transmittance; + return VOLUME_PATH_MISSED; + } + } VolumeStep *step; float3 transmittance; float pdf, sample_t; + float mis_weight = 1.0f; + bool distance_sample = true; + bool use_mis = false; + + if(segment->sampling_method && light_P) { + if(segment->sampling_method == SD_VOLUME_MIS) { + /* multiple importance sample: randomly pick between + * equiangular and distance sampling strategy */ + if(xi < 0.5f) { + xi *= 2.0f; + } + else { + xi = (xi - 0.5f)*2.0f; + distance_sample = false; + } + + use_mis = true; + } + else { + /* only equiangular 
sampling */ + distance_sample = false; + } + } /* distance sampling */ - if(kernel_data.integrator.volume_homogeneous_sampling == 0 || !kernel_data.integrator.num_all_lights) { + if(distance_sample) { /* find step in cdf */ step = segment->steps; float prev_t = 0.0f; - float3 step_pdf = make_float3(1.0f, 1.0f, 1.0f); + float3 step_pdf_distance = make_float3(1.0f, 1.0f, 1.0f); if(segment->numsteps > 1) { float prev_cdf = 0.0f; @@ -749,7 +823,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter( xi = (xi - prev_cdf)/(step_cdf - prev_cdf); /* pdf for picking step */ - step_pdf = step->cdf_distance - prev_cdf_distance; + step_pdf_distance = step->cdf_distance - prev_cdf_distance; } /* determine range in which we will sample */ @@ -758,35 +832,77 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter( /* sample distance and compute transmittance */ float3 distance_pdf; sample_t = prev_t + kernel_volume_distance_sample(step_t, step->sigma_t, channel, xi, &transmittance, &distance_pdf); - pdf = average(distance_pdf * step_pdf); + + /* modifiy pdf for hit/miss decision */ + if(probalistic_scatter) + distance_pdf *= make_float3(1.0f, 1.0f, 1.0f) - segment->accum_transmittance; + + pdf = average(distance_pdf * step_pdf_distance); + + /* multiple importance sampling */ + if(use_mis) { + float equi_pdf = kernel_volume_equiangular_pdf(ray, *light_P, sample_t); + mis_weight = 2.0f*power_heuristic(pdf, equi_pdf); + } } /* equi-angular sampling */ else { - /* pick position on light */ - float3 light_P; - if(!kernel_volume_equiangular_light_position(kg, state, ray, rng, &light_P)) - return VOLUME_PATH_MISSED; - /* sample distance */ - sample_t = kernel_volume_equiangular_sample(ray, light_P, xi, &pdf); + sample_t = kernel_volume_equiangular_sample(ray, *light_P, xi, &pdf); /* find step in which sampled distance is located */ step = segment->steps; float prev_t = 0.0f; + float3 step_pdf_distance = make_float3(1.0f, 1.0f, 1.0f); if(segment->numsteps > 1) 
{ - /* todo: optimize using binary search */ - for(int i = 0; i < segment->numsteps-1; i++, step++) { - if(sample_t < step->t) + float3 prev_cdf_distance = make_float3(0.0f, 0.0f, 0.0f); + + int numsteps = segment->numsteps; + int high = numsteps - 1; + int low = 0; + int mid; + + while(low < high) { + mid = (low + high) >> 1; + + if(sample_t < step[mid].t) + high = mid; + else if(sample_t >= step[mid + 1].t) + low = mid + 1; + else { + /* found our interval in step[mid] .. step[mid+1] */ + prev_t = step[mid].t; + prev_cdf_distance = step[mid].cdf_distance; + step += mid+1; break; + } + } - prev_t = step->t; + if(low >= numsteps - 1) { + prev_t = step[numsteps - 1].t; + prev_cdf_distance = step[numsteps-1].cdf_distance; + step += numsteps - 1; } + + /* pdf for picking step with distance sampling */ + step_pdf_distance = step->cdf_distance - prev_cdf_distance; } - + + /* determine range in which we will sample */ + float step_t = step->t - prev_t; + float step_sample_t = sample_t - prev_t; + /* compute transmittance */ - transmittance = volume_color_transmittance(step->sigma_t, sample_t - prev_t); + transmittance = volume_color_transmittance(step->sigma_t, step_sample_t); + + /* multiple importance sampling */ + if(use_mis) { + float3 distance_pdf3 = kernel_volume_distance_pdf(step_t, step->sigma_t, step_sample_t); + float distance_pdf = average(distance_pdf3 * step_pdf_distance); + mis_weight = 2.0f*power_heuristic(pdf, distance_pdf); + } } /* compute transmittance up to this step */ @@ -794,7 +910,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter( transmittance *= (step-1)->accum_transmittance; /* modify throughput */ - *throughput *= step->sigma_s * transmittance / pdf; + *throughput *= step->sigma_s * transmittance * (mis_weight / pdf); /* evaluate shader to create closures at shading point */ if(segment->numsteps > 1) { @@ -810,40 +926,27 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter( return VOLUME_PATH_SCATTERED; } 
-#endif - -/* get the volume attenuation and emission over line segment defined by - * ray, with the assumption that there are no surfaces blocking light - * between the endpoints */ -ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals *kg, - PathState *state, ShaderData *sd, Ray *ray, PathRadiance *L, float3 *throughput, RNG *rng) +/* decide if we need to use decoupled or not */ +ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg, bool heterogeneous, bool direct, int sampling_method) { - /* workaround to fix correlation bug in T38710, can find better solution - * in random number generator later, for now this is done here to not impact - * performance of rendering without volumes */ - RNG tmp_rng = cmj_hash(*rng, state->rng_offset); - bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack); - -#if 0 - /* debugging code to compare decoupled ray marching */ - VolumeSegment segment; - - shader_setup_from_volume(kg, sd, ray, state->bounce, state->transparent_bounce); - kernel_volume_decoupled_record(kg, state, ray, sd, &segment, heterogeneous); - - VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, state, ray, sd, throughput, &tmp_rng, &segment); - - kernel_volume_decoupled_free(kg, &segment); + /* decoupled ray marching for heterogenous volumes not supported on the GPU, + * which also means equiangular and multiple importance sampling is not + * support for that case */ +#ifdef __KERNEL_GPU__ + if(heterogeneous) + return false; +#endif - return result; -#else - shader_setup_from_volume(kg, sd, ray, state->bounce, state->transparent_bounce); + /* equiangular and multiple importance sampling only implemented for decoupled */ + if(sampling_method != 0) + return true; - if(heterogeneous) - return kernel_volume_integrate_heterogeneous(kg, state, ray, sd, L, throughput, &tmp_rng); + /* for all light sampling use decoupled, reusing shader evaluations is + * typically faster in that case */ + 
if(direct) + return kernel_data.integrator.sample_all_lights_direct; else - return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, &tmp_rng); -#endif + return kernel_data.integrator.sample_all_lights_indirect; } /* Volume Stack @@ -851,17 +954,88 @@ ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals * This is an array of object/shared ID's that the current segment of the path * is inside of. */ -ccl_device void kernel_volume_stack_init(KernelGlobals *kg, VolumeStack *stack) +ccl_device void kernel_volume_stack_init(KernelGlobals *kg, + Ray *ray, + VolumeStack *stack) { - /* todo: this assumes camera is always in air, need to detect when it isn't */ - if(kernel_data.background.volume_shader == SHADER_NONE) { - stack[0].shader = SHADER_NONE; + /* NULL ray happens in the baker, does it need proper initialization of + * camera in volume? + */ + if(!kernel_data.cam.is_inside_volume || ray == NULL) { + /* Camera is guaranteed to be in the air, only take background volume + * into account in this case. + */ + if(kernel_data.background.volume_shader != SHADER_NONE) { + stack[0].shader = kernel_data.background.volume_shader; + stack[0].object = PRIM_NONE; + stack[1].shader = SHADER_NONE; + } + else { + stack[0].shader = SHADER_NONE; + } + return; } - else { + + Ray volume_ray = *ray; + volume_ray.t = FLT_MAX; + + int stack_index = 0, enclosed_index = 0; + int enclosed_volumes[VOLUME_STACK_SIZE]; + + while(stack_index < VOLUME_STACK_SIZE - 1 && + enclosed_index < VOLUME_STACK_SIZE - 1) + { + Intersection isect; + if(!scene_intersect_volume(kg, &volume_ray, &isect)) { + break; + } + + ShaderData sd; + shader_setup_from_ray(kg, &sd, &isect, &volume_ray, 0, 0); + if(sd.flag & SD_HAS_VOLUME) { + if(sd.flag & SD_BACKFACING) { + /* If ray exited the volume and never entered to that volume + * it means that camera is inside such a volume. 
+ */ + bool is_enclosed = false; + for(int i = 0; i < enclosed_index; ++i) { + if(enclosed_volumes[i] == sd.object) { + is_enclosed = true; + break; + } + } + if(is_enclosed == false) { + stack[stack_index].object = sd.object; + stack[stack_index].shader = sd.shader; + ++stack_index; + } + } + else { + /* If ray from camera enters the volume, this volume shouldn't + * be added to the stak on exit. + */ + enclosed_volumes[enclosed_index++] = sd.object; + } + } + + /* Move ray forward. */ + volume_ray.P = ray_offset(sd.P, -sd.Ng); + } + /* stack_index of 0 means quick checks outside of the kernel gave false + * positive, nothing to worry about, just we've wasted quite a few of + * ticks just to come into conclusion that camera is in the air. + * + * In this case we're doing the same above -- check whether background has + * volume. + */ + if(stack_index == 0 && kernel_data.background.volume_shader == SHADER_NONE) { stack[0].shader = kernel_data.background.volume_shader; stack[0].object = PRIM_NONE; stack[1].shader = SHADER_NONE; } + else { + stack[stack_index].shader = SHADER_NONE; + } } ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd, VolumeStack *stack) @@ -910,4 +1084,3 @@ ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd } CCL_NAMESPACE_END - diff --git a/intern/cycles/kernel/osl/SConscript b/intern/cycles/kernel/osl/SConscript index 4685bb7753e..d721edbaf6e 100644 --- a/intern/cycles/kernel/osl/SConscript +++ b/intern/cycles/kernel/osl/SConscript @@ -43,6 +43,9 @@ defs.append('CCL_NAMESPACE_BEGIN=namespace ccl {') defs.append('CCL_NAMESPACE_END=}') defs.append('WITH_OSL') +if env['WITH_BF_CYCLES_DEBUG']: + defs.append('WITH_CYCLES_DEBUG') + if env['OURPLATFORM'] in ('win32-vc', 'win64-vc'): cxxflags.append('-DBOOST_NO_RTTI -DBOOST_NO_TYPEID /fp:fast'.split()) incs.append(env['BF_PTHREADS_INC']) diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp index 
94337290d20..84ef85e089d 100644 --- a/intern/cycles/kernel/osl/osl_bssrdf.cpp +++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp @@ -66,18 +66,6 @@ ClosureParam *closure_bssrdf_cubic_params() static ClosureParam params[] = { CLOSURE_FLOAT3_PARAM(CubicBSSRDFClosure, sc.N), CLOSURE_FLOAT3_PARAM(CubicBSSRDFClosure, radius), - //CLOSURE_FLOAT_PARAM(CubicBSSRDFClosure, sc.data1), - CLOSURE_STRING_KEYPARAM("label"), - CLOSURE_FINISH_PARAM(CubicBSSRDFClosure) - }; - return params; -} - -ClosureParam *closure_bssrdf_cubic_extended_params() -{ - static ClosureParam params[] = { - CLOSURE_FLOAT3_PARAM(CubicBSSRDFClosure, sc.N), - CLOSURE_FLOAT3_PARAM(CubicBSSRDFClosure, radius), CLOSURE_FLOAT_PARAM(CubicBSSRDFClosure, sc.data1), CLOSURE_FLOAT_PARAM(CubicBSSRDFClosure, sc.T.x), CLOSURE_STRING_KEYPARAM("label"), @@ -107,18 +95,6 @@ ClosureParam *closure_bssrdf_gaussian_params() static ClosureParam params[] = { CLOSURE_FLOAT3_PARAM(GaussianBSSRDFClosure, sc.N), CLOSURE_FLOAT3_PARAM(GaussianBSSRDFClosure, radius), - //CLOSURE_FLOAT_PARAM(GaussianBSSRDFClosure, sc.data1), - CLOSURE_STRING_KEYPARAM("label"), - CLOSURE_FINISH_PARAM(GaussianBSSRDFClosure) - }; - return params; -} - -ClosureParam *closure_bssrdf_gaussian_extended_params() -{ - static ClosureParam params[] = { - CLOSURE_FLOAT3_PARAM(GaussianBSSRDFClosure, sc.N), - CLOSURE_FLOAT3_PARAM(GaussianBSSRDFClosure, radius), CLOSURE_FLOAT_PARAM(GaussianBSSRDFClosure, sc.data1), CLOSURE_STRING_KEYPARAM("label"), CLOSURE_FINISH_PARAM(GaussianBSSRDFClosure) diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp index a96c0e2b1fb..cc9942b024e 100644 --- a/intern/cycles/kernel/osl/osl_closures.cpp +++ b/intern/cycles/kernel/osl/osl_closures.cpp @@ -41,6 +41,8 @@ #include "util_param.h" #include "kernel_types.h" +#include "kernel_compat_cpu.h" +#include "kernel_globals.h" #include "kernel_montecarlo.h" #include "closure/bsdf_util.h" @@ -51,8 +53,7 @@ #include "closure/bsdf_reflection.h" 
#include "closure/bsdf_refraction.h" #include "closure/bsdf_transparent.h" -#include "closure/bsdf_ward.h" -#include "closure/bsdf_westin.h" +#include "closure/bsdf_ashikhmin_shirley.h" #include "closure/bsdf_toon.h" #include "closure/bsdf_hair.h" #include "closure/volume.h" @@ -85,16 +86,6 @@ BSDF_CLOSURE_CLASS_BEGIN(Refraction, refraction, refraction, LABEL_SINGULAR) CLOSURE_FLOAT_PARAM(RefractionClosure, sc.data0), BSDF_CLOSURE_CLASS_END(Refraction, refraction) -BSDF_CLOSURE_CLASS_BEGIN(WestinBackscatter, westin_backscatter, westin_backscatter, LABEL_GLOSSY) - CLOSURE_FLOAT3_PARAM(WestinBackscatterClosure, sc.N), - CLOSURE_FLOAT_PARAM(WestinBackscatterClosure, sc.data0), -BSDF_CLOSURE_CLASS_END(WestinBackscatter, westin_backscatter) - -BSDF_CLOSURE_CLASS_BEGIN(WestinSheen, westin_sheen, westin_sheen, LABEL_DIFFUSE) - CLOSURE_FLOAT3_PARAM(WestinSheenClosure, sc.N), - CLOSURE_FLOAT_PARAM(WestinSheenClosure, sc.data0), -BSDF_CLOSURE_CLASS_END(WestinSheen, westin_sheen) - BSDF_CLOSURE_CLASS_BEGIN(Transparent, transparent, transparent, LABEL_SINGULAR) BSDF_CLOSURE_CLASS_END(Transparent, transparent) @@ -103,12 +94,12 @@ BSDF_CLOSURE_CLASS_BEGIN(AshikhminVelvet, ashikhmin_velvet, ashikhmin_velvet, LA CLOSURE_FLOAT_PARAM(AshikhminVelvetClosure, sc.data0), BSDF_CLOSURE_CLASS_END(AshikhminVelvet, ashikhmin_velvet) -BSDF_CLOSURE_CLASS_BEGIN(Ward, ward, ward, LABEL_GLOSSY) - CLOSURE_FLOAT3_PARAM(WardClosure, sc.N), - CLOSURE_FLOAT3_PARAM(WardClosure, sc.T), - CLOSURE_FLOAT_PARAM(WardClosure, sc.data0), - CLOSURE_FLOAT_PARAM(WardClosure, sc.data1), -BSDF_CLOSURE_CLASS_END(Ward, ward) +BSDF_CLOSURE_CLASS_BEGIN(AshikhminShirley, ashikhmin_shirley_aniso, ashikhmin_shirley, LABEL_GLOSSY|LABEL_REFLECT) + CLOSURE_FLOAT3_PARAM(AshikhminShirleyClosure, sc.N), + CLOSURE_FLOAT3_PARAM(AshikhminShirleyClosure, sc.T), + CLOSURE_FLOAT_PARAM(AshikhminShirleyClosure, sc.data0), + CLOSURE_FLOAT_PARAM(AshikhminShirleyClosure, sc.data1), +BSDF_CLOSURE_CLASS_END(AshikhminShirley, 
ashikhmin_shirley_aniso) BSDF_CLOSURE_CLASS_BEGIN(DiffuseToon, diffuse_toon, diffuse_toon, LABEL_DIFFUSE) CLOSURE_FLOAT3_PARAM(DiffuseToonClosure, sc.N), @@ -122,26 +113,40 @@ BSDF_CLOSURE_CLASS_BEGIN(GlossyToon, glossy_toon, glossy_toon, LABEL_GLOSSY) CLOSURE_FLOAT_PARAM(GlossyToonClosure, sc.data1), BSDF_CLOSURE_CLASS_END(GlossyToon, glossy_toon) -BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGX, microfacet_ggx, microfacet_ggx, LABEL_GLOSSY) +BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGX, microfacet_ggx, microfacet_ggx, LABEL_GLOSSY|LABEL_REFLECT) CLOSURE_FLOAT3_PARAM(MicrofacetGGXClosure, sc.N), CLOSURE_FLOAT_PARAM(MicrofacetGGXClosure, sc.data0), BSDF_CLOSURE_CLASS_END(MicrofacetGGX, microfacet_ggx) -BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmann, microfacet_beckmann, microfacet_beckmann, LABEL_GLOSSY) +BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGXAniso, microfacet_ggx_aniso, microfacet_ggx, LABEL_GLOSSY|LABEL_REFLECT) + CLOSURE_FLOAT3_PARAM(MicrofacetGGXAnisoClosure, sc.N), + CLOSURE_FLOAT3_PARAM(MicrofacetGGXAnisoClosure, sc.T), + CLOSURE_FLOAT_PARAM(MicrofacetGGXAnisoClosure, sc.data0), + CLOSURE_FLOAT_PARAM(MicrofacetGGXAnisoClosure, sc.data1), +BSDF_CLOSURE_CLASS_END(MicrofacetGGXAniso, microfacet_ggx_aniso) + +BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmann, microfacet_beckmann, microfacet_beckmann, LABEL_GLOSSY|LABEL_REFLECT) CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannClosure, sc.N), CLOSURE_FLOAT_PARAM(MicrofacetBeckmannClosure, sc.data0), BSDF_CLOSURE_CLASS_END(MicrofacetBeckmann, microfacet_beckmann) -BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGXRefraction, microfacet_ggx_refraction, microfacet_ggx, LABEL_GLOSSY) +BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannAniso, microfacet_beckmann_aniso, microfacet_beckmann, LABEL_GLOSSY|LABEL_REFLECT) + CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannAnisoClosure, sc.N), + CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannAnisoClosure, sc.T), + CLOSURE_FLOAT_PARAM(MicrofacetBeckmannAnisoClosure, sc.data0), + CLOSURE_FLOAT_PARAM(MicrofacetBeckmannAnisoClosure, sc.data1), 
+BSDF_CLOSURE_CLASS_END(MicrofacetBeckmannAniso, microfacet_beckmann_aniso) + +BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGXRefraction, microfacet_ggx_refraction, microfacet_ggx, LABEL_GLOSSY|LABEL_TRANSMIT) CLOSURE_FLOAT3_PARAM(MicrofacetGGXRefractionClosure, sc.N), CLOSURE_FLOAT_PARAM(MicrofacetGGXRefractionClosure, sc.data0), - CLOSURE_FLOAT_PARAM(MicrofacetGGXRefractionClosure, sc.data1), + CLOSURE_FLOAT_PARAM(MicrofacetGGXRefractionClosure, sc.data2), BSDF_CLOSURE_CLASS_END(MicrofacetGGXRefraction, microfacet_ggx_refraction) -BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannRefraction, microfacet_beckmann_refraction, microfacet_beckmann, LABEL_GLOSSY) +BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannRefraction, microfacet_beckmann_refraction, microfacet_beckmann, LABEL_GLOSSY|LABEL_TRANSMIT) CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannRefractionClosure, sc.N), CLOSURE_FLOAT_PARAM(MicrofacetBeckmannRefractionClosure, sc.data0), - CLOSURE_FLOAT_PARAM(MicrofacetBeckmannRefractionClosure, sc.data1), + CLOSURE_FLOAT_PARAM(MicrofacetBeckmannRefractionClosure, sc.data2), BSDF_CLOSURE_CLASS_END(MicrofacetBeckmannRefraction, microfacet_beckmann_refraction) BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, hair_reflection, LABEL_GLOSSY) @@ -150,7 +155,7 @@ BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, hair_reflection, LABEL CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data1), #ifdef __HAIR__ CLOSURE_FLOAT3_PARAM(HairReflectionClosure, sc.T), - CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.offset), + CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data2), #else CLOSURE_FLOAT3_PARAM(HairReflectionClosure, sc.N), CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data1), @@ -163,7 +168,7 @@ BSDF_CLOSURE_CLASS_BEGIN(HairTransmission, hair_transmission, hair_transmission, CLOSURE_FLOAT_PARAM(HairTransmissionClosure, sc.data1), #ifdef __HAIR__ CLOSURE_FLOAT3_PARAM(HairReflectionClosure, sc.T), - CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.offset), + 
CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data2), #else CLOSURE_FLOAT3_PARAM(HairReflectionClosure, sc.N), CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data1), @@ -210,26 +215,24 @@ void OSLShader::register_closures(OSLShadingSystem *ss_) bsdf_transparent_params(), bsdf_transparent_prepare); register_closure(ss, "microfacet_ggx", id++, bsdf_microfacet_ggx_params(), bsdf_microfacet_ggx_prepare); + register_closure(ss, "microfacet_ggx_aniso", id++, + bsdf_microfacet_ggx_aniso_params(), bsdf_microfacet_ggx_aniso_prepare); register_closure(ss, "microfacet_ggx_refraction", id++, bsdf_microfacet_ggx_refraction_params(), bsdf_microfacet_ggx_refraction_prepare); register_closure(ss, "microfacet_beckmann", id++, bsdf_microfacet_beckmann_params(), bsdf_microfacet_beckmann_prepare); + register_closure(ss, "microfacet_beckmann_aniso", id++, + bsdf_microfacet_beckmann_aniso_params(), bsdf_microfacet_beckmann_aniso_prepare); register_closure(ss, "microfacet_beckmann_refraction", id++, bsdf_microfacet_beckmann_refraction_params(), bsdf_microfacet_beckmann_refraction_prepare); - register_closure(ss, "ward", id++, - bsdf_ward_params(), bsdf_ward_prepare); + register_closure(ss, "ashikhmin_shirley", id++, + bsdf_ashikhmin_shirley_aniso_params(), bsdf_ashikhmin_shirley_aniso_prepare); register_closure(ss, "ashikhmin_velvet", id++, bsdf_ashikhmin_velvet_params(), bsdf_ashikhmin_velvet_prepare); register_closure(ss, "diffuse_toon", id++, bsdf_diffuse_toon_params(), bsdf_diffuse_toon_prepare); register_closure(ss, "glossy_toon", id++, bsdf_glossy_toon_params(), bsdf_glossy_toon_prepare); - register_closure(ss, "specular_toon", id++, - bsdf_glossy_toon_params(), bsdf_glossy_toon_prepare); - register_closure(ss, "westin_backscatter", id++, - bsdf_westin_backscatter_params(), bsdf_westin_backscatter_prepare); - register_closure(ss, "westin_sheen", id++, - bsdf_westin_sheen_params(), bsdf_westin_sheen_prepare); register_closure(ss, "emission", id++, closure_emission_params(), 
closure_emission_prepare); @@ -247,10 +250,6 @@ void OSLShader::register_closures(OSLShadingSystem *ss_) closure_bssrdf_cubic_params(), closure_bssrdf_cubic_prepare); register_closure(ss, "bssrdf_gaussian", id++, closure_bssrdf_gaussian_params(), closure_bssrdf_gaussian_prepare); - register_closure(ss, "bssrdf_cubic", id++, - closure_bssrdf_cubic_extended_params(), closure_bssrdf_cubic_prepare); - register_closure(ss, "bssrdf_gaussian", id++, - closure_bssrdf_gaussian_extended_params(), closure_bssrdf_gaussian_prepare); register_closure(ss, "hair_reflection", id++, bsdf_hair_reflection_params(), bsdf_hair_reflection_prepare); diff --git a/intern/cycles/kernel/osl/osl_closures.h b/intern/cycles/kernel/osl/osl_closures.h index 218cf1c19cc..5e833d738d8 100644 --- a/intern/cycles/kernel/osl/osl_closures.h +++ b/intern/cycles/kernel/osl/osl_closures.h @@ -48,12 +48,8 @@ OSL::ClosureParam *closure_holdout_params(); OSL::ClosureParam *closure_ambient_occlusion_params(); OSL::ClosureParam *closure_bsdf_diffuse_ramp_params(); OSL::ClosureParam *closure_bsdf_phong_ramp_params(); -OSL::ClosureParam *closure_westin_backscatter_params(); -OSL::ClosureParam *closure_westin_sheen_params(); OSL::ClosureParam *closure_bssrdf_cubic_params(); OSL::ClosureParam *closure_bssrdf_gaussian_params(); -OSL::ClosureParam *closure_bssrdf_cubic_extended_params(); -OSL::ClosureParam *closure_bssrdf_gaussian_extended_params(); OSL::ClosureParam *closure_henyey_greenstein_volume_params(); void closure_emission_prepare(OSL::RendererServices *, int id, void *data); @@ -62,8 +58,6 @@ void closure_holdout_prepare(OSL::RendererServices *, int id, void *data); void closure_ambient_occlusion_prepare(OSL::RendererServices *, int id, void *data); void closure_bsdf_diffuse_ramp_prepare(OSL::RendererServices *, int id, void *data); void closure_bsdf_phong_ramp_prepare(OSL::RendererServices *, int id, void *data); -void closure_westin_backscatter_prepare(OSL::RendererServices *, int id, void *data); -void 
closure_westin_sheen_prepare(OSL::RendererServices *, int id, void *data); void closure_bssrdf_cubic_prepare(OSL::RendererServices *, int id, void *data); void closure_bssrdf_gaussian_prepare(OSL::RendererServices *, int id, void *data); void closure_henyey_greenstein_volume_prepare(OSL::RendererServices *, int id, void *data); @@ -149,17 +143,18 @@ public: \ \ void blur(float roughness) \ { \ - bsdf_##svmlower##_blur(&sc, roughness); \ } \ \ float3 eval_reflect(const float3 &omega_out, const float3 &omega_in, float& pdf) const \ { \ - return bsdf_##svmlower##_eval_reflect(&sc, omega_out, omega_in, &pdf); \ + pdf = 0; \ + return make_float3(0, 0, 0); \ } \ \ float3 eval_transmit(const float3 &omega_out, const float3 &omega_in, float& pdf) const \ { \ - return bsdf_##svmlower##_eval_transmit(&sc, omega_out, omega_in, &pdf); \ + pdf = 0; \ + return make_float3(0, 0, 0); \ } \ \ int sample(const float3 &Ng, \ @@ -168,8 +163,8 @@ public: \ float3 &omega_in, float3 &domega_in_dx, float3 &domega_in_dy, \ float &pdf, float3 &eval) const \ { \ - return bsdf_##svmlower##_sample(&sc, Ng, omega_out, domega_out_dx, domega_out_dy, \ - randu, randv, &eval, &omega_in, &domega_in_dx, &domega_in_dy, &pdf); \ + pdf = 0; \ + return LABEL_NONE; \ } \ }; \ \ diff --git a/intern/cycles/kernel/osl/osl_globals.h b/intern/cycles/kernel/osl/osl_globals.h index 5a658d8244a..9c3134e41c9 100644 --- a/intern/cycles/kernel/osl/osl_globals.h +++ b/intern/cycles/kernel/osl/osl_globals.h @@ -20,7 +20,6 @@ #ifdef WITH_OSL #include <OSL/oslexec.h> -#include <cmath> #include "util_map.h" #include "util_param.h" diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp index 54894ea19eb..a9694651e14 100644 --- a/intern/cycles/kernel/osl/osl_services.cpp +++ b/intern/cycles/kernel/osl/osl_services.cpp @@ -126,7 +126,7 @@ void OSLRenderServices::thread_init(KernelGlobals *kernel_globals_, OSL::Texture osl_ts = osl_ts_; } -bool 
OSLRenderServices::get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time) +bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform, float time) { /* this is only used for shader and object space, we don't really have * a concept of shader space, so we just use object space for both. */ @@ -156,7 +156,7 @@ bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr return false; } -bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time) +bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform, float time) { /* this is only used for shader and object space, we don't really have * a concept of shader space, so we just use object space for both. */ @@ -186,7 +186,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, OSL::Transform return false; } -bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, ustring from, float time) +bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring from, float time) { KernelGlobals *kg = kernel_globals; @@ -218,7 +218,7 @@ bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, ustring from, float ti return false; } -bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, ustring to, float time) +bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring to, float time) { KernelGlobals *kg = kernel_globals; @@ -250,7 +250,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, ustring to, fl return false; } -bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform) +bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform) { /* this is only used for shader and object space, we don't really have * a concept of shader space, so 
we just use object space for both. */ @@ -275,7 +275,7 @@ bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr return false; } -bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform) +bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform) { /* this is only used for shader and object space, we don't really have * a concept of shader space, so we just use object space for both. */ @@ -300,7 +300,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, OSL::Transform return false; } -bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, ustring from) +bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring from) { KernelGlobals *kg = kernel_globals; @@ -328,7 +328,7 @@ bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, ustring from) return false; } -bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, ustring to) +bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring to) { KernelGlobals *kg = kernel_globals; @@ -356,7 +356,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, ustring to) return false; } -bool OSLRenderServices::get_array_attribute(void *renderstate, bool derivatives, +bool OSLRenderServices::get_array_attribute(OSL::ShaderGlobals *sg, bool derivatives, ustring object, TypeDesc type, ustring name, int index, void *val) { @@ -479,7 +479,7 @@ static bool set_attribute_int(int i, TypeDesc type, bool derivatives, void *val) static bool set_attribute_string(ustring str, TypeDesc type, bool derivatives, void *val) { - if(type.basetype == TypeDesc::INT && type.aggregate == TypeDesc::SCALAR && type.arraylen == 0) { + if(type.basetype == TypeDesc::STRING && type.aggregate == TypeDesc::SCALAR && type.arraylen == 0) { ustring *sval = (ustring *)val; sval[0] = str; @@ -718,7 +718,7 @@ bool 
OSLRenderServices::get_background_attribute(KernelGlobals *kg, ShaderData * return set_attribute_int(f, type, derivatives, val); } else if (name == u_path_transparent_depth) { - /* Ray Depth */ + /* Transparent Ray Depth */ int f = sd->transparent_depth; return set_attribute_int(f, type, derivatives, val); } @@ -751,14 +751,22 @@ bool OSLRenderServices::get_background_attribute(KernelGlobals *kg, ShaderData * return false; } -bool OSLRenderServices::get_attribute(void *renderstate, bool derivatives, ustring object_name, +bool OSLRenderServices::get_attribute(OSL::ShaderGlobals *sg, bool derivatives, ustring object_name, + TypeDesc type, ustring name, void *val) +{ + if (sg->renderstate == NULL) + return false; + + ShaderData *sd = (ShaderData *)(sg->renderstate); + return get_attribute(sd, derivatives, object_name, type, name, val); +} + +bool OSLRenderServices::get_attribute(ShaderData *sd, bool derivatives, ustring object_name, TypeDesc type, ustring name, void *val) { - ShaderData *sd = (ShaderData *)renderstate; KernelGlobals *kg = sd->osl_globals; bool is_curve; int object; - // int prim; /* lookup of attribute on another object */ if (object_name != u_empty) { @@ -768,12 +776,10 @@ bool OSLRenderServices::get_attribute(void *renderstate, bool derivatives, ustri return false; object = it->second; - // prim = PRIM_NONE; is_curve = false; } else { object = sd->object; - // prim = sd->prim; is_curve = (sd->type & PRIMITIVE_ALL_CURVE) != 0; if (object == OBJECT_NONE) @@ -815,12 +821,12 @@ bool OSLRenderServices::get_attribute(void *renderstate, bool derivatives, ustri } bool OSLRenderServices::get_userdata(bool derivatives, ustring name, TypeDesc type, - void *renderstate, void *val) + OSL::ShaderGlobals *sg, void *val) { return false; /* disabled by lockgeom */ } -bool OSLRenderServices::has_userdata(ustring name, TypeDesc type, void *renderstate) +bool OSLRenderServices::has_userdata(ustring name, TypeDesc type, OSL::ShaderGlobals *sg) { return false; /* never 
called by OSL */ } @@ -871,14 +877,30 @@ bool OSLRenderServices::texture(ustring filename, TextureOpt &options, return true; } #endif + bool status; - OSLThreadData *tdata = kg->osl_tdata; - OIIO::TextureSystem::Perthread *thread_info = tdata->oiio_thread_info; + if(filename[0] == '@' && filename.find('.') == -1) { + int slot = atoi(filename.c_str() + 1); + float4 rgba = kernel_tex_image_interp(slot, s, 1.0f - t); - OIIO::TextureSystem::TextureHandle *th = ts->get_texture_handle(filename, thread_info); + result[0] = rgba[0]; + if(options.nchannels > 1) + result[1] = rgba[1]; + if(options.nchannels > 2) + result[2] = rgba[2]; + if(options.nchannels > 3) + result[3] = rgba[3]; + status = true; + } + else { + OSLThreadData *tdata = kg->osl_tdata; + OIIO::TextureSystem::Perthread *thread_info = tdata->oiio_thread_info; - bool status = ts->texture(th, thread_info, - options, s, t, dsdx, dtdx, dsdy, dtdy, result); + OIIO::TextureSystem::TextureHandle *th = ts->get_texture_handle(filename, thread_info); + + status = ts->texture(th, thread_info, + options, s, t, dsdx, dtdx, dsdy, dtdy, result); + } if(!status) { if(options.nchannels == 3 || options.nchannels == 4) { @@ -953,7 +975,7 @@ bool OSLRenderServices::environment(ustring filename, TextureOpt &options, return status; } -bool OSLRenderServices::get_texture_info(ustring filename, int subimage, +bool OSLRenderServices::get_texture_info(OSL::ShaderGlobals *sg, ustring filename, int subimage, ustring dataname, TypeDesc datatype, void *data) { @@ -996,7 +1018,7 @@ bool OSLRenderServices::trace(TraceOpt &options, OSL::ShaderGlobals *sg, ray.P = TO_FLOAT3(P); ray.D = TO_FLOAT3(R); - ray.t = (options.maxdist == 1.0e30)? FLT_MAX: options.maxdist - options.mindist; + ray.t = (options.maxdist == 1.0e30f)? 
FLT_MAX: options.maxdist - options.mindist; ray.time = sd->time; if(options.mindist == 0.0f) { @@ -1025,11 +1047,7 @@ bool OSLRenderServices::trace(TraceOpt &options, OSL::ShaderGlobals *sg, tracedata->sd.osl_globals = sd->osl_globals; /* raytrace */ -#ifdef __HAIR__ return scene_intersect(sd->osl_globals, &ray, PATH_RAY_ALL_VISIBILITY, &tracedata->isect, NULL, 0.0f, 0.0f); -#else - return scene_intersect(sd->osl_globals, &ray, PATH_RAY_ALL_VISIBILITY, &tracedata->isect); -#endif } diff --git a/intern/cycles/kernel/osl/osl_services.h b/intern/cycles/kernel/osl/osl_services.h index 069722d81b6..6f928a0d103 100644 --- a/intern/cycles/kernel/osl/osl_services.h +++ b/intern/cycles/kernel/osl/osl_services.h @@ -49,27 +49,29 @@ public: void thread_init(KernelGlobals *kernel_globals, OSL::TextureSystem *ts); - bool get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time); - bool get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time); + bool get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform, float time); + bool get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform, float time); - bool get_matrix(OSL::Matrix44 &result, ustring from, float time); - bool get_inverse_matrix(OSL::Matrix44 &result, ustring to, float time); + bool get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring from, float time); + bool get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring to, float time); - bool get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform); - bool get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform); + bool get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform); + bool get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform); - bool get_matrix(OSL::Matrix44 &result, ustring from); - bool get_inverse_matrix(OSL::Matrix44 
&result, ustring from); + bool get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring from); + bool get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring from); - bool get_array_attribute(void *renderstate, bool derivatives, + bool get_array_attribute(OSL::ShaderGlobals *sg, bool derivatives, ustring object, TypeDesc type, ustring name, int index, void *val); - bool get_attribute(void *renderstate, bool derivatives, ustring object, + bool get_attribute(OSL::ShaderGlobals *sg, bool derivatives, ustring object, + TypeDesc type, ustring name, void *val); + bool get_attribute(ShaderData *sd, bool derivatives, ustring object_name, TypeDesc type, ustring name, void *val); bool get_userdata(bool derivatives, ustring name, TypeDesc type, - void *renderstate, void *val); - bool has_userdata(ustring name, TypeDesc type, void *renderstate); + OSL::ShaderGlobals *sg, void *val); + bool has_userdata(ustring name, TypeDesc type, OSL::ShaderGlobals *sg); int pointcloud_search(OSL::ShaderGlobals *sg, ustring filename, const OSL::Vec3 ¢er, float radius, int max_points, bool sort, size_t *out_indices, @@ -106,7 +108,7 @@ public: OSL::ShaderGlobals *sg, const OSL::Vec3 &R, const OSL::Vec3 &dRdx, const OSL::Vec3 &dRdy, float *result); - bool get_texture_info(ustring filename, int subimage, + bool get_texture_info(OSL::ShaderGlobals *sg, ustring filename, int subimage, ustring dataname, TypeDesc datatype, void *data); static bool get_background_attribute(KernelGlobals *kg, ShaderData *sd, ustring name, @@ -157,6 +159,70 @@ public: static ustring u_v; static ustring u_empty; +#if OSL_LIBRARY_VERSION_CODE < 10500 + bool get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time) { + return get_matrix(NULL, result, xform, time); + } + + bool get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time) { + return get_inverse_matrix(NULL, result, xform, time); + } + + bool get_matrix(OSL::Matrix44 &result, ustring from, 
float time) { + return get_matrix(NULL, result, from, time); + } + + bool get_inverse_matrix(OSL::Matrix44 &result, ustring to, float time) { + return get_inverse_matrix(NULL, result, to, time); + } + + bool get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform) { + return get_matrix(NULL, result, xform); + } + + bool get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform) { + return get_inverse_matrix(NULL, result, xform); + } + + bool get_matrix(OSL::Matrix44 &result, ustring from) { + return get_matrix(NULL, result, from); + } + + bool get_inverse_matrix(OSL::Matrix44 &result, ustring to) { + return get_inverse_matrix(NULL, result, to); + } + + bool get_array_attribute(void *renderstate, bool derivatives, + ustring object, TypeDesc type, ustring name, + int index, void *val) { + OSL::ShaderGlobals sg; + sg.renderstate = renderstate; + return get_array_attribute(&sg, derivatives, + object, type, name, + index, val); + } + + bool get_attribute(void *renderstate, bool derivatives, ustring object_name, + TypeDesc type, ustring name, void *val) { + OSL::ShaderGlobals sg; + sg.renderstate = renderstate; + return get_attribute(&sg, derivatives, object_name, type, name, val); + } + + bool has_userdata(ustring name, TypeDesc type, void *renderstate) { + return has_userdata(name, type, (OSL::ShaderGlobals *) renderstate); + } + + bool get_userdata(bool derivatives, ustring name, TypeDesc type, + void *renderstate, void *val) { + return get_userdata(derivatives, name, type, (OSL::ShaderGlobals *) renderstate, val); + } + + bool get_texture_info(ustring filename, int subimage, + ustring dataname, TypeDesc datatype, void *data) { + return get_texture_info(NULL, filename, subimage, dataname, datatype, data); + } +#endif private: KernelGlobals *kernel_globals; OSL::TextureSystem *osl_ts; diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp index 843dcdd0985..ca0c2cc4415 100644 --- 
a/intern/cycles/kernel/osl/osl_shader.cpp +++ b/intern/cycles/kernel/osl/osl_shader.cpp @@ -14,6 +14,8 @@ * limitations under the License */ +#include <OSL/oslexec.h> + #include "kernel_compat_cpu.h" #include "kernel_montecarlo.h" #include "kernel_types.h" @@ -34,7 +36,6 @@ #include "attribute.h" -#include <OSL/oslexec.h> CCL_NAMESPACE_BEGIN @@ -164,11 +165,14 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag, CBSDFClosure *bsdf = (CBSDFClosure *)prim; int scattering = bsdf->scattering(); - /* no caustics option */ - if(scattering == LABEL_GLOSSY && (path_flag & PATH_RAY_DIFFUSE)) { + /* caustic options */ + if((scattering & LABEL_GLOSSY) && (path_flag & PATH_RAY_DIFFUSE)) { KernelGlobals *kg = sd->osl_globals; - if(kernel_data.integrator.no_caustics) + + if((!kernel_data.integrator.caustics_reflective && (scattering & LABEL_REFLECT)) || + (!kernel_data.integrator.caustics_refractive && (scattering & LABEL_TRANSMIT))) { return; + } } /* sample weight */ @@ -181,12 +185,9 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag, sc.T = bsdf->sc.T; sc.data0 = bsdf->sc.data0; sc.data1 = bsdf->sc.data1; + sc.data2 = bsdf->sc.data2; sc.prim = bsdf->sc.prim; -#ifdef __HAIR__ - sc.offset = bsdf->sc.offset; -#endif - /* add */ if(sc.sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure < MAX_CLOSURE) { sd->closure[sd->num_closure++] = sc; @@ -202,6 +203,7 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag, sc.type = CLOSURE_EMISSION_ID; sc.data0 = 0.0f; sc.data1 = 0.0f; + sc.data2 = 0.0f; sc.prim = NULL; /* flag */ @@ -219,6 +221,7 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag, sc.type = CLOSURE_AMBIENT_OCCLUSION_ID; sc.data0 = 0.0f; sc.data1 = 0.0f; + sc.data2 = 0.0f; sc.prim = NULL; if(sd->num_closure < MAX_CLOSURE) { @@ -232,6 +235,7 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag, sc.type = CLOSURE_HOLDOUT_ID; sc.data0 = 0.0f; sc.data1 = 0.0f; + 
sc.data2 = 0.0f; sc.prim = NULL; if(sd->num_closure < MAX_CLOSURE) { diff --git a/intern/cycles/kernel/shaders/CMakeLists.txt b/intern/cycles/kernel/shaders/CMakeLists.txt index 5518d652bf9..0b735ede701 100644 --- a/intern/cycles/kernel/shaders/CMakeLists.txt +++ b/intern/cycles/kernel/shaders/CMakeLists.txt @@ -4,6 +4,7 @@ set(SRC_OSL node_add_closure.osl node_ambient_occlusion.osl + node_anisotropic_bsdf.osl node_attribute.osl node_background.osl node_brick_texture.osl @@ -13,6 +14,7 @@ set(SRC_OSL node_checker_texture.osl node_combine_rgb.osl node_combine_hsv.osl + node_combine_xyz.osl node_convert_from_color.osl node_convert_from_float.osl node_convert_from_int.osl @@ -57,6 +59,7 @@ set(SRC_OSL node_rgb_ramp.osl node_separate_rgb.osl node_separate_hsv.osl + node_separate_xyz.osl node_set_normal.osl node_sky_texture.osl node_subsurface_scattering.osl @@ -71,7 +74,6 @@ set(SRC_OSL node_vector_transform.osl node_velvet_bsdf.osl node_voronoi_texture.osl - node_ward_bsdf.osl node_wavelength.osl node_blackbody.osl node_wave_texture.osl diff --git a/intern/cycles/kernel/shaders/node_ward_bsdf.osl b/intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl index 2d360d594f2..da1e4f77107 100644 --- a/intern/cycles/kernel/shaders/node_ward_bsdf.osl +++ b/intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl @@ -16,8 +16,9 @@ #include "stdosl.h" -shader node_ward_bsdf( +shader node_anisotropic_bsdf( color Color = 0.0, + string distribution = "GGX", float Roughness = 0.0, float Anisotropy = 0.0, float Rotation = 0.0, @@ -44,6 +45,13 @@ shader node_ward_bsdf( RoughnessV = Roughness / (1.0 - aniso); } - BSDF = Color * ward(Normal, T, RoughnessU, RoughnessV); + if (distribution == "Sharp") + BSDF = Color * reflection(Normal); + else if (distribution == "Beckmann") + BSDF = Color * microfacet_beckmann_aniso(Normal, T, RoughnessU, RoughnessV); + else if (distribution == "GGX") + BSDF = Color * microfacet_ggx_aniso(Normal, T, RoughnessU, RoughnessV); + else + BSDF = Color * 
ashikhmin_shirley(Normal, T, RoughnessU, RoughnessV); } diff --git a/intern/cycles/kernel/shaders/node_brick_texture.osl b/intern/cycles/kernel/shaders/node_brick_texture.osl index 70a6a6ea7ce..c9fb3542aef 100644 --- a/intern/cycles/kernel/shaders/node_brick_texture.osl +++ b/intern/cycles/kernel/shaders/node_brick_texture.osl @@ -93,6 +93,6 @@ shader node_brick_texture( Col[2] = facm * (Color1[2]) + tint * Color2[2]; } - Color = (Fac == 1.0) ? Mortar: Col; + Color = (Fac == 1.0) ? Mortar : Col; } diff --git a/intern/cycles/kernel/shaders/node_checker_texture.osl b/intern/cycles/kernel/shaders/node_checker_texture.osl index 6723076723c..a6d21fd36f3 100644 --- a/intern/cycles/kernel/shaders/node_checker_texture.osl +++ b/intern/cycles/kernel/shaders/node_checker_texture.osl @@ -21,9 +21,9 @@ float checker(point p) { - p[0] = (p[0] + 0.00001) * 0.9999; - p[1] = (p[1] + 0.00001) * 0.9999; - p[2] = (p[2] + 0.00001) * 0.9999; + p[0] = (p[0] + 0.000001) * 0.999999; + p[1] = (p[1] + 0.000001) * 0.999999; + p[2] = (p[2] + 0.000001) * 0.999999; int xi = (int)fabs(floor(p[0])); int yi = (int)fabs(floor(p[1])); diff --git a/intern/cycles/kernel/shaders/node_combine_xyz.osl b/intern/cycles/kernel/shaders/node_combine_xyz.osl new file mode 100644 index 00000000000..933dee5bd78 --- /dev/null +++ b/intern/cycles/kernel/shaders/node_combine_xyz.osl @@ -0,0 +1,27 @@ +/* + * Copyright 2011-2014 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License + */ + +#include "stdosl.h" + +shader node_combine_xyz( + float X = 0.0, + float Y = 0.0, + float Z = 0.0, + output vector Vector = 0.8) +{ + Vector = vector(X, Y, Z); +} + diff --git a/intern/cycles/kernel/shaders/node_emission.osl b/intern/cycles/kernel/shaders/node_emission.osl index 2428da5ef4e..b28d731c19f 100644 --- a/intern/cycles/kernel/shaders/node_emission.osl +++ b/intern/cycles/kernel/shaders/node_emission.osl @@ -17,14 +17,10 @@ #include "stdosl.h" shader node_emission( - int TotalPower = 0, color Color = 0.8, float Strength = 1.0, output closure color Emission = 0) { - if (TotalPower) - Emission = ((Strength / surfacearea()) * Color) * emission(); - else - Emission = (Strength * Color) * emission(); + Emission = (Strength * Color) * emission(); } diff --git a/intern/cycles/kernel/shaders/node_fresnel.h b/intern/cycles/kernel/shaders/node_fresnel.h index 447a84255ef..d192c5d02de 100644 --- a/intern/cycles/kernel/shaders/node_fresnel.h +++ b/intern/cycles/kernel/shaders/node_fresnel.h @@ -34,3 +34,16 @@ float fresnel_dielectric_cos(float cosi, float eta) return result; } +color fresnel_conductor(float cosi, color eta, color k) +{ + color cosi2 = color(cosi * cosi); + color one = color(1, 1, 1); + color tmp_f = eta * eta + k * k; + color tmp = tmp_f * cosi2; + color Rparl2 = (tmp - (2.0 * eta * cosi) + one) / + (tmp + (2.0 * eta * cosi) + one); + color Rperp2 = (tmp_f - (2.0 * eta * cosi) + cosi2) / + (tmp_f + (2.0 * eta * cosi) + cosi2); + return (Rparl2 + Rperp2) * 0.5; +} + diff --git a/intern/cycles/kernel/shaders/node_geometry.osl b/intern/cycles/kernel/shaders/node_geometry.osl index dbdf55802ae..cd68f07b21e 100644 --- a/intern/cycles/kernel/shaders/node_geometry.osl +++ b/intern/cycles/kernel/shaders/node_geometry.osl @@ -49,12 +49,8 @@ shader node_geometry( /* try to create spherical tangent from generated coordinates */ if 
(getattribute("geom:generated", generated)) { - matrix project = matrix(0.0, 1.0, 0.0, 0.0, - -1.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, - 0.5, -0.5, 0.0, 1.0); - - vector T = transform("object", "world", transform(project, generated)); + normal data = normal(-(generated[1] - 0.5), (generated[0] - 0.5), 0.0); + vector T = transform("object", "world", data); Tangent = cross(Normal, normalize(cross(T, Normal))); } else { diff --git a/intern/cycles/kernel/shaders/node_glossy_bsdf.osl b/intern/cycles/kernel/shaders/node_glossy_bsdf.osl index b4e0fe62223..5c727ca6917 100644 --- a/intern/cycles/kernel/shaders/node_glossy_bsdf.osl +++ b/intern/cycles/kernel/shaders/node_glossy_bsdf.osl @@ -19,7 +19,7 @@ shader node_glossy_bsdf( color Color = 0.8, - string distribution = "Beckmann", + string distribution = "GGX", float Roughness = 0.2, normal Normal = N, output closure color BSDF = 0) @@ -30,6 +30,8 @@ shader node_glossy_bsdf( BSDF = Color * microfacet_beckmann(Normal, Roughness); else if (distribution == "GGX") BSDF = Color * microfacet_ggx(Normal, Roughness); + else + BSDF = Color * ashikhmin_shirley(Normal, vector(0, 0, 0), Roughness, Roughness); } diff --git a/intern/cycles/kernel/shaders/node_image_texture.osl b/intern/cycles/kernel/shaders/node_image_texture.osl index 7238a1e8862..18b5fb4b31f 100644 --- a/intern/cycles/kernel/shaders/node_image_texture.osl +++ b/intern/cycles/kernel/shaders/node_image_texture.osl @@ -113,6 +113,10 @@ shader node_image_texture( weight[2] = ((2.0 - limit) * Nob[2] + (limit - 1.0)) / (2.0 * limit - 1.0); } } + else { + /* Desperate mode, no valid choice anyway, fallback to one side.*/ + weight[0] = 1.0; + } Color = color(0.0, 0.0, 0.0); Alpha = 0.0; diff --git a/intern/cycles/kernel/shaders/node_musgrave_texture.osl b/intern/cycles/kernel/shaders/node_musgrave_texture.osl index 60762539002..a32c3d4b1b8 100644 --- a/intern/cycles/kernel/shaders/node_musgrave_texture.osl +++ b/intern/cycles/kernel/shaders/node_musgrave_texture.osl @@ 
-35,14 +35,14 @@ float noise_musgrave_fBm(point p, string basis, float H, float lacunarity, float int i; for (i = 0; i < (int)octaves; i++) { - value += safe_noise(p, 0) * pwr; + value += safe_noise(p, "signed") * pwr; pwr *= pwHL; p *= lacunarity; } rmd = octaves - floor(octaves); if (rmd != 0.0) - value += rmd * safe_noise(p, 0) * pwr; + value += rmd * safe_noise(p, "signed") * pwr; return value; } @@ -63,14 +63,14 @@ float noise_musgrave_multi_fractal(point p, string basis, float H, float lacunar int i; for (i = 0; i < (int)octaves; i++) { - value *= (pwr * safe_noise(p, 0) + 1.0); + value *= (pwr * safe_noise(p, "signed") + 1.0); pwr *= pwHL; p *= lacunarity; } rmd = octaves - floor(octaves); if (rmd != 0.0) - value *= (rmd * pwr * safe_noise(p, 0) + 1.0); /* correct? */ + value *= (rmd * pwr * safe_noise(p, "signed") + 1.0); /* correct? */ return value; } @@ -91,11 +91,11 @@ float noise_musgrave_hetero_terrain(point p, string basis, float H, float lacuna int i; /* first unscaled octave of function; later octaves are scaled */ - value = offset + safe_noise(p, 0); + value = offset + safe_noise(p, "signed"); p *= lacunarity; for (i = 1; i < (int)octaves; i++) { - increment = (safe_noise(p, 0) + offset) * pwr * value; + increment = (safe_noise(p, "signed") + offset) * pwr * value; value += increment; pwr *= pwHL; p *= lacunarity; @@ -103,7 +103,7 @@ float noise_musgrave_hetero_terrain(point p, string basis, float H, float lacuna rmd = octaves - floor(octaves); if (rmd != 0.0) { - increment = (safe_noise(p, 0) + offset) * pwr * value; + increment = (safe_noise(p, "signed") + offset) * pwr * value; value += rmd * increment; } @@ -126,7 +126,7 @@ float noise_musgrave_hybrid_multi_fractal(point p, string basis, float H, float pwr = pwHL; int i; - result = safe_noise(p, 0) + offset; + result = safe_noise(p, "signed") + offset; weight = gain * result; p *= lacunarity; @@ -134,7 +134,7 @@ float noise_musgrave_hybrid_multi_fractal(point p, string basis, float H, if 
(weight > 1.0) weight = 1.0; - signal = (safe_noise(p, 0) + offset) * pwr; + signal = (safe_noise(p, "signed") + offset) * pwr; pwr *= pwHL; result += weight * signal; weight *= gain * signal; @@ -143,7 +143,7 @@ float noise_musgrave_hybrid_multi_fractal(point p, string basis, float H, rmd = octaves - floor(octaves); if (rmd != 0.0) - result += rmd * ((safe_noise(p, 0) + offset) * pwr); + result += rmd * ((safe_noise(p, "signed") + offset) * pwr); return result; } @@ -164,7 +164,7 @@ float noise_musgrave_ridged_multi_fractal(point p, string basis, float H, float pwr = pwHL; int i; - signal = offset - fabs(safe_noise(p, 0)); + signal = offset - fabs(safe_noise(p, "signed")); signal *= signal; result = signal; weight = 1.0; @@ -172,7 +172,7 @@ float noise_musgrave_ridged_multi_fractal(point p, string basis, float H, for (i = 1; i < (int)octaves; i++) { p *= lacunarity; weight = clamp(signal * gain, 0.0, 1.0); - signal = offset - fabs(safe_noise(p, 0)); + signal = offset - fabs(safe_noise(p, "signed")); signal *= signal; signal *= weight; result += signal * pwr; diff --git a/intern/cycles/kernel/shaders/node_separate_xyz.osl b/intern/cycles/kernel/shaders/node_separate_xyz.osl new file mode 100644 index 00000000000..63725cb9995 --- /dev/null +++ b/intern/cycles/kernel/shaders/node_separate_xyz.osl @@ -0,0 +1,28 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License + */ + +#include "stdosl.h" + +shader node_separate_xyz( + vector Vector = 0.8, + output float X = 0.0, + output float Y = 0.0, + output float Z = 0.0) +{ + X = Vector[0]; + Y = Vector[1]; + Z = Vector[2]; +} diff --git a/intern/cycles/kernel/shaders/node_texture.h b/intern/cycles/kernel/shaders/node_texture.h index de51559f297..2710eed414a 100644 --- a/intern/cycles/kernel/shaders/node_texture.h +++ b/intern/cycles/kernel/shaders/node_texture.h @@ -153,12 +153,12 @@ float voronoi_CrS(point p) { return 2.0 * voronoi_Cr(p) - 1.0; } /* Noise Bases */ -float safe_noise(point p, int type) +float safe_noise(point p, string type) { float f = 0.0; /* Perlin noise in range -1..1 */ - if (type == 0) + if (type == "signed") f = noise("perlin", p); /* Perlin noise in range 0..1 */ @@ -175,7 +175,7 @@ float safe_noise(point p, int type) float noise_basis(point p, string basis) { if (basis == "Perlin") - return safe_noise(p, 1); + return safe_noise(p, "unsigned"); if (basis == "Voronoi F1") return voronoi_F1S(p); if (basis == "Voronoi F2") diff --git a/intern/cycles/kernel/shaders/stdosl.h b/intern/cycles/kernel/shaders/stdosl.h index 6f824ea8ebd..1ff8f363b49 100644 --- a/intern/cycles/kernel/shaders/stdosl.h +++ b/intern/cycles/kernel/shaders/stdosl.h @@ -476,17 +476,17 @@ closure color diffuse_ramp(normal N, color colors[8]) BUILTIN; closure color phong_ramp(normal N, float exponent, color colors[8]) BUILTIN; closure color diffuse_toon(normal N, float size, float smooth) BUILTIN; closure color glossy_toon(normal N, float size, float smooth) BUILTIN; -closure color westin_backscatter(normal N, float roughness) BUILTIN; -closure color westin_sheen(normal N, float edginess) BUILTIN; closure color translucent(normal N) BUILTIN; closure color reflection(normal N) BUILTIN; closure color refraction(normal N, float eta) BUILTIN; closure color transparent() BUILTIN; closure color 
microfacet_ggx(normal N, float ag) BUILTIN; +closure color microfacet_ggx_aniso(normal N, vector T, float ax, float ay) BUILTIN; closure color microfacet_ggx_refraction(normal N, float ag, float eta) BUILTIN; closure color microfacet_beckmann(normal N, float ab) BUILTIN; +closure color microfacet_beckmann_aniso(normal N, vector T, float ax, float ay) BUILTIN; closure color microfacet_beckmann_refraction(normal N, float ab, float eta) BUILTIN; -closure color ward(normal N, vector T,float ax, float ay) BUILTIN; +closure color ashikhmin_shirley(normal N, vector T,float ax, float ay) BUILTIN; closure color ashikhmin_velvet(normal N, float sigma) BUILTIN; closure color emission() BUILTIN; closure color background() BUILTIN; @@ -505,12 +505,8 @@ closure color hair_transmission(normal N, float roughnessu, float roughnessv, ve closure color henyey_greenstein(float g) BUILTIN; closure color absorption() BUILTIN; -// Backwards compatibility -closure color bssrdf_cubic(normal N, vector radius) BUILTIN; -closure color bssrdf_gaussian(normal N, vector radius) BUILTIN; -closure color specular_toon(normal N, float size, float smooth) BUILTIN; - // Renderer state +int backfacing () BUILTIN; int raytype (string typename) BUILTIN; // the individual 'isFOOray' functions are deprecated int iscameraray () { return raytype("camera"); } diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h index dbf59c60cb0..c13eae813d6 100644 --- a/intern/cycles/kernel/svm/svm.h +++ b/intern/cycles/kernel/svm/svm.h @@ -167,8 +167,8 @@ CCL_NAMESPACE_END #include "svm_math.h" #include "svm_mix.h" #include "svm_ramp.h" -#include "svm_sepcomb_rgb.h" #include "svm_sepcomb_hsv.h" +#include "svm_sepcomb_vector.h" #include "svm_musgrave.h" #include "svm_sky.h" #include "svm_tex_coord.h" @@ -236,7 +236,7 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade if(stack_load_float(stack, node.z) == 1.0f) offset += node.y; break; -#ifdef __IMAGE_TEXTURES__ +#ifdef 
__TEXTURES__ case NODE_TEX_IMAGE: svm_node_tex_image(kg, sd, stack, node); break; @@ -246,8 +246,6 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade case NODE_TEX_ENVIRONMENT: svm_node_tex_environment(kg, sd, stack, node); break; -#endif -#ifdef __PROCEDURAL_TEXTURES__ case NODE_TEX_SKY: svm_node_tex_sky(kg, sd, stack, node, &offset); break; @@ -327,11 +325,11 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade case NODE_MIX: svm_node_mix(kg, sd, stack, node.y, node.z, node.w, &offset); break; - case NODE_SEPARATE_RGB: - svm_node_separate_rgb(sd, stack, node.y, node.z, node.w); + case NODE_SEPARATE_VECTOR: + svm_node_separate_vector(sd, stack, node.y, node.z, node.w); break; - case NODE_COMBINE_RGB: - svm_node_combine_rgb(sd, stack, node.y, node.z, node.w); + case NODE_COMBINE_VECTOR: + svm_node_combine_vector(sd, stack, node.y, node.z, node.w); break; case NODE_SEPARATE_HSV: svm_node_separate_hsv(kg, sd, stack, node.y, node.z, node.w, &offset); @@ -407,12 +405,7 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade break; case NODE_CLOSURE_SET_NORMAL: svm_node_set_normal(kg, sd, stack, node.y, node.z ); - break; -#endif - case NODE_EMISSION_SET_WEIGHT_TOTAL: - svm_node_emission_set_weight_total(kg, sd, node.y, node.z, node.w); break; -#ifdef __EXTRA_NODES__ case NODE_RGB_RAMP: svm_node_rgb_ramp(kg, sd, stack, node, &offset); break; @@ -425,17 +418,13 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade case NODE_LIGHT_FALLOFF: svm_node_light_falloff(sd, stack, node); break; -#endif -#ifdef __ANISOTROPIC__ +#endif case NODE_TANGENT: svm_node_tangent(kg, sd, stack, node); break; -#endif -#ifdef __NORMAL_MAP__ case NODE_NORMAL_MAP: svm_node_normal_map(kg, sd, stack, node); - break; -#endif + break; case NODE_END: default: return; diff --git a/intern/cycles/kernel/svm/svm_blackbody.h b/intern/cycles/kernel/svm/svm_blackbody.h index 
63dbf27d35e..1e40e868e14 100644 --- a/intern/cycles/kernel/svm/svm_blackbody.h +++ b/intern/cycles/kernel/svm/svm_blackbody.h @@ -42,7 +42,7 @@ ccl_device void svm_node_blackbody(KernelGlobals *kg, ShaderData *sd, float *sta /* Input */ float temperature = stack_load_float(stack, temperature_offset); - if (temperature < BB_DRAPPER) { + if (temperature < BB_DRAPER) { /* just return very very dim red */ color_rgb = make_float3(1.0e-6f,0.0f,0.0f); } @@ -53,9 +53,9 @@ ccl_device void svm_node_blackbody(KernelGlobals *kg, ShaderData *sd, float *sta /* reconstruct a proper index for the table lookup, compared to OSL we don't look up two colors just one (the OSL-lerp is also automatically done for us by "lookup_table_read") */ - float t = powf((temperature - BB_DRAPPER) * (1.0f / BB_TABLE_SPACING), (1.0f / BB_TABLE_XPOWER)); + float t = powf((temperature - BB_DRAPER) * (1.0f / BB_TABLE_SPACING), (1.0f / BB_TABLE_XPOWER)); - int blackbody_table_offset = kernel_data.blackbody.table_offset; + int blackbody_table_offset = kernel_data.tables.blackbody_offset; /* Retrieve colors from the lookup table */ float lutval = t*lookuptablenormalize; diff --git a/intern/cycles/kernel/svm/svm_checker.h b/intern/cycles/kernel/svm/svm_checker.h index 8d1a1a40449..e0408ad334a 100644 --- a/intern/cycles/kernel/svm/svm_checker.h +++ b/intern/cycles/kernel/svm/svm_checker.h @@ -21,9 +21,9 @@ CCL_NAMESPACE_BEGIN ccl_device_noinline float svm_checker(float3 p) { /* avoid precision issues on unit coordinates */ - p.x = (p.x + 0.00001f)*0.9999f; - p.y = (p.y + 0.00001f)*0.9999f; - p.z = (p.z + 0.00001f)*0.9999f; + p.x = (p.x + 0.000001f)*0.999999f; + p.y = (p.y + 0.000001f)*0.999999f; + p.z = (p.z + 0.000001f)*0.999999f; int xi = float_to_int(fabsf(floorf(p.x))); int yi = float_to_int(fabsf(floorf(p.y))); diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h index a3770877544..30110db3ef9 100644 --- a/intern/cycles/kernel/svm/svm_closure.h +++ 
b/intern/cycles/kernel/svm/svm_closure.h @@ -24,6 +24,7 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type if(refract) { sc->data0 = eta; sc->data1 = 0.0f; + sc->data2 = 0.0f; sd->flag |= bsdf_refraction_setup(sc); } else @@ -31,7 +32,8 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type } else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID) { sc->data0 = roughness; - sc->data1 = eta; + sc->data1 = roughness; + sc->data2 = eta; if(refract) sd->flag |= bsdf_microfacet_beckmann_refraction_setup(sc); @@ -40,7 +42,8 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type } else { sc->data0 = roughness; - sc->data1 = eta; + sc->data1 = roughness; + sc->data2 = eta; if(refract) sd->flag |= bsdf_microfacet_ggx_refraction_setup(sc); @@ -135,11 +138,13 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(roughness == 0.0f) { sc->data0 = 0.0f; sc->data1 = 0.0f; + sc->data2 = 0.0f; sd->flag |= bsdf_diffuse_setup(sc); } else { sc->data0 = roughness; sc->data1 = 0.0f; + sc->data2 = 0.0f; sd->flag |= bsdf_oren_nayar_setup(sc); } } @@ -151,6 +156,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(sc) { sc->data0 = 0.0f; sc->data1 = 0.0f; + sc->data2 = 0.0f; sc->N = N; sd->flag |= bsdf_translucent_setup(sc); } @@ -162,6 +168,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(sc) { sc->data0 = 0.0f; sc->data1 = 0.0f; + sc->data2 = 0.0f; sc->N = N; sd->flag |= bsdf_transparent_setup(sc); } @@ -169,9 +176,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * } case CLOSURE_BSDF_REFLECTION_ID: case CLOSURE_BSDF_MICROFACET_GGX_ID: - case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: { + case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: + case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: { #ifdef __CAUSTICS_TRICKS__ - if(kernel_data.integrator.no_caustics && (path_flag & 
PATH_RAY_DIFFUSE)) + if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE)) break; #endif ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight); @@ -179,15 +187,18 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(sc) { sc->N = N; sc->data0 = param1; - sc->data1 = 0.0f; + sc->data1 = param1; + sc->data2 = 0.0f; /* setup bsdf */ if(type == CLOSURE_BSDF_REFLECTION_ID) sd->flag |= bsdf_reflection_setup(sc); else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ID) sd->flag |= bsdf_microfacet_beckmann_setup(sc); - else + else if(type == CLOSURE_BSDF_MICROFACET_GGX_ID) sd->flag |= bsdf_microfacet_ggx_setup(sc); + else + sd->flag |= bsdf_ashikhmin_shirley_setup(sc); } break; @@ -196,25 +207,35 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: { #ifdef __CAUSTICS_TRICKS__ - if(kernel_data.integrator.no_caustics && (path_flag & PATH_RAY_DIFFUSE)) + if(!kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE)) break; #endif ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight); if(sc) { sc->N = N; - sc->data0 = param1; float eta = fmaxf(param2, 1e-5f); - sc->data1 = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta; + eta = (sd->flag & SD_BACKFACING)? 
1.0f/eta: eta; /* setup bsdf */ - if(type == CLOSURE_BSDF_REFRACTION_ID) + if(type == CLOSURE_BSDF_REFRACTION_ID) { + sc->data0 = eta; + sc->data1 = 0.0f; + sc->data2 = 0.0f; + sd->flag |= bsdf_refraction_setup(sc); - else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID) - sd->flag |= bsdf_microfacet_beckmann_refraction_setup(sc); - else - sd->flag |= bsdf_microfacet_ggx_refraction_setup(sc); + } + else { + sc->data0 = param1; + sc->data1 = param1; + sc->data2 = eta; + + if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID) + sd->flag |= bsdf_microfacet_beckmann_refraction_setup(sc); + else + sd->flag |= bsdf_microfacet_ggx_refraction_setup(sc); + } } break; @@ -223,8 +244,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * case CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID: { #ifdef __CAUSTICS_TRICKS__ - if(kernel_data.integrator.no_caustics && (path_flag & PATH_RAY_DIFFUSE)) + if(!kernel_data.integrator.caustics_reflective && + !kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE)) { break; + } #endif /* index of refraction */ float eta = fmaxf(param2, 1e-5f); @@ -241,12 +264,21 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * float sample_weight = sc->sample_weight; sc = svm_node_closure_get_bsdf(sd, mix_weight*fresnel); - - if(sc) { - sc->N = N; - svm_node_glass_setup(sd, sc, type, eta, roughness, false); +#ifdef __CAUSTICS_TRICKS__ + if(kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0) +#endif + { + if(sc) { + sc->N = N; + svm_node_glass_setup(sd, sc, type, eta, roughness, false); + } } +#ifdef __CAUSTICS_TRICKS__ + if(!kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE)) + break; +#endif + /* refraction */ sc = &sd->closure[sd->num_closure]; sc->weight = weight; @@ -261,9 +293,11 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, 
float * break; } - case CLOSURE_BSDF_WARD_ID: { + case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: + case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: + case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: { #ifdef __CAUSTICS_TRICKS__ - if(kernel_data.integrator.no_caustics && (path_flag & PATH_RAY_DIFFUSE)) + if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE)) break; #endif ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight); @@ -271,7 +305,6 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(sc) { sc->N = N; -#ifdef __ANISOTROPIC__ sc->T = stack_load_float3(stack, data_node.y); /* rotate tangent */ @@ -293,10 +326,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->data1 = roughness/(1.0f - anisotropy); } - sd->flag |= bsdf_ward_setup(sc); -#else - sd->flag |= bsdf_diffuse_setup(sc); -#endif + sc->data2 = 0.0f; + + if (type == CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID) + sd->flag |= bsdf_microfacet_beckmann_aniso_setup(sc); + else if (type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID) + sd->flag |= bsdf_microfacet_ggx_aniso_setup(sc); + else + sd->flag |= bsdf_ashikhmin_shirley_aniso_setup(sc); } break; } @@ -309,6 +346,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * /* sigma */ sc->data0 = clamp(param1, 0.0f, 1.0f); sc->data1 = 0.0f; + sc->data2 = 0.0f; sd->flag |= bsdf_ashikhmin_velvet_setup(sc); } break; @@ -322,6 +360,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->N = N; sc->data0 = param1; sc->data1 = param2; + sc->data2 = 0.0f; if (type == CLOSURE_BSDF_DIFFUSE_TOON_ID) sd->flag |= bsdf_diffuse_toon_setup(sc); @@ -339,7 +378,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(sc) { /* todo: giving a fixed weight here will cause issues when - * mixing multiple BSDFS. energey will not be conserved and + * mixing multiple BSDFS. 
energy will not be conserved and * the throughput can blow up after multiple bounces. we * better figure out a way to skip backfaces from rays * spawned by transmission from the front */ @@ -356,11 +395,11 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->N = N; sc->data0 = param1; sc->data1 = param2; - sc->offset = -stack_load_float(stack, data_node.z); + sc->data2 = -stack_load_float(stack, data_node.z); if(!(sd->type & PRIMITIVE_ALL_CURVE)) { sc->T = normalize(sd->dPdv); - sc->offset = 0.0f; + sc->data2 = 0.0f; } else sc->T = sd->dPdu; @@ -405,6 +444,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->sample_weight = sample_weight; sc->data0 = radius.x; sc->data1 = texture_blur; + sc->data2 = 0.0f; sc->T.x = sharpness; #ifdef __OSL__ sc->prim = NULL; @@ -421,6 +461,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->sample_weight = sample_weight; sc->data0 = radius.y; sc->data1 = texture_blur; + sc->data2 = 0.0f; sc->T.x = sharpness; #ifdef __OSL__ sc->prim = NULL; @@ -437,6 +478,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->sample_weight = sample_weight; sc->data0 = radius.z; sc->data1 = texture_blur; + sc->data2 = 0.0f; sc->T.x = sharpness; #ifdef __OSL__ sc->prim = NULL; @@ -582,16 +624,6 @@ ccl_device void svm_node_closure_set_weight(ShaderData *sd, uint r, uint g, uint svm_node_closure_store_weight(sd, weight); } -ccl_device void svm_node_emission_set_weight_total(KernelGlobals *kg, ShaderData *sd, uint r, uint g, uint b) -{ - float3 weight = make_float3(__uint_as_float(r), __uint_as_float(g), __uint_as_float(b)); - - if(sd->object != OBJECT_NONE) - weight /= object_surface_area(kg, sd->object); - - svm_node_closure_store_weight(sd, weight); -} - ccl_device void svm_node_closure_weight(ShaderData *sd, float *stack, uint weight_offset) { float3 weight = stack_load_float3(stack, weight_offset); @@ 
-603,14 +635,10 @@ ccl_device void svm_node_emission_weight(KernelGlobals *kg, ShaderData *sd, floa { uint color_offset = node.y; uint strength_offset = node.z; - uint total_power = node.w; float strength = stack_load_float(stack, strength_offset); float3 weight = stack_load_float3(stack, color_offset)*strength; - if(total_power && sd->object != OBJECT_NONE) - weight /= object_surface_area(kg, sd->object); - svm_node_closure_store_weight(sd, weight); } diff --git a/intern/cycles/kernel/svm/svm_convert.h b/intern/cycles/kernel/svm/svm_convert.h index 2503912c5c6..b221e0728ec 100644 --- a/intern/cycles/kernel/svm/svm_convert.h +++ b/intern/cycles/kernel/svm/svm_convert.h @@ -45,13 +45,13 @@ ccl_device void svm_node_convert(ShaderData *sd, float *stack, uint type, uint f } case NODE_CONVERT_VF: { float3 f = stack_load_float3(stack, from); - float g = (f.x + f.y + f.z)*(1.0f/3.0f); + float g = average(f); stack_store_float(stack, to, g); break; } case NODE_CONVERT_VI: { float3 f = stack_load_float3(stack, from); - int i = (int)((f.x + f.y + f.z)*(1.0f/3.0f)); + int i = (int)average(f); stack_store_int(stack, to, i); break; } diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h index daf7c6652d2..8a256c9bda5 100644 --- a/intern/cycles/kernel/svm/svm_image.h +++ b/intern/cycles/kernel/svm/svm_image.h @@ -134,8 +134,8 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, { #ifdef __KERNEL_CPU__ #ifdef __KERNEL_SSE2__ - __m128 r_m128; - float4 &r = (float4 &)r_m128; + ssef r_ssef; + float4 &r = (float4 &)r_ssef; r = kernel_tex_image_interp(id, x, y); #else float4 r = kernel_tex_image_interp(id, x, y); @@ -252,9 +252,9 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, case 96: r = kernel_tex_image_interp(__tex_image_096, x, y); break; case 97: r = kernel_tex_image_interp(__tex_image_097, x, y); break; case 98: r = kernel_tex_image_interp(__tex_image_098, x, y); break; - case 
99: r = kernel_tex_image_interp(__tex_image_099, x, y); break; #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300) + case 99: r = kernel_tex_image_interp(__tex_image_099, x, y); break; case 100: r = kernel_tex_image_interp(__tex_image_100, x, y); break; case 101: r = kernel_tex_image_interp(__tex_image_101, x, y); break; case 102: r = kernel_tex_image_interp(__tex_image_102, x, y); break; @@ -318,14 +318,14 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, float alpha = r.w; if(use_alpha && alpha != 1.0f && alpha != 0.0f) { - r_m128 = _mm_div_ps(r_m128, _mm_set1_ps(alpha)); + r_ssef = r_ssef / ssef(alpha); if(id >= TEX_NUM_FLOAT_IMAGES) - r_m128 = _mm_min_ps(r_m128, _mm_set1_ps(1.0f)); + r_ssef = min(r_ssef, ssef(1.0f)); r.w = alpha; } if(srgb) { - r_m128 = color_srgb_to_scene_linear(r_m128); + r_ssef = color_srgb_to_scene_linear(r_ssef); r.w = alpha; } #else @@ -435,6 +435,10 @@ ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float weight.z = ((2.0f - limit)*N.z + (limit - 1.0f))/(2.0f*limit - 1.0f); } } + else { + /* Desperate mode, no valid choice anyway, fallback to one side.*/ + weight.x = 1.0f; + } /* now fetch textures */ uint co_offset, out_offset, alpha_offset, srgb; diff --git a/intern/cycles/kernel/svm/svm_noise.h b/intern/cycles/kernel/svm/svm_noise.h index 91dda8972f9..c77c2a1c482 100644 --- a/intern/cycles/kernel/svm/svm_noise.h +++ b/intern/cycles/kernel/svm/svm_noise.h @@ -38,11 +38,11 @@ ccl_device int quick_floor(float x) return float_to_int(x) - ((x < 0) ? 
1 : 0); } #else -ccl_device_inline __m128i quick_floor_sse(const __m128& x) +ccl_device_inline ssei quick_floor_sse(const ssef& x) { - __m128i b = _mm_cvttps_epi32(x); - __m128i isneg = _mm_castps_si128(_mm_cmplt_ps(x, _mm_set1_ps(0.0f))); - return _mm_add_epi32(b, isneg); // unsaturated add 0xffffffff is the same as subtract -1 + ssei b = truncatei(x); + ssei isneg = cast((x < ssef(0.0f)).m128); + return b + isneg; // unsaturated add 0xffffffff is the same as subtract -1 } #endif @@ -52,9 +52,9 @@ ccl_device float bits_to_01(uint bits) return bits * (1.0f/(float)0xFFFFFFFF); } #else -ccl_device_inline __m128 bits_to_01_sse(const __m128i& bits) +ccl_device_inline ssef bits_to_01_sse(const ssei& bits) { - return _mm_mul_ps(uint32_to_float(bits), _mm_set1_ps(1.0f/(float)0xFFFFFFFF)); + return uint32_to_float(bits) * ssef(1.0f/(float)0xFFFFFFFF); } #endif @@ -88,16 +88,16 @@ ccl_device uint hash(uint kx, uint ky, uint kz) } #ifdef __KERNEL_SSE2__ -ccl_device_inline __m128i hash_sse(const __m128i& kx, const __m128i& ky, const __m128i& kz) +ccl_device_inline ssei hash_sse(const ssei& kx, const ssei& ky, const ssei& kz) { -#define rot(x,k) _mm_or_si128(_mm_slli_epi32((x), (k)), _mm_srli_epi32((x), 32-(k))) -#define xor_rot(a, b, c) do {a = _mm_xor_si128(a, b); a = _mm_sub_epi32(a, rot(b, c));} while(0) +#define rot(x,k) (((x)<<(k)) | (srl(x, 32-(k)))) +#define xor_rot(a, b, c) do {a = a^b; a = a - rot(b, c);} while(0) uint len = 3; - __m128i magic = _mm_set1_epi32(0xdeadbeef + (len << 2) + 13); - __m128i a = _mm_add_epi32(magic, kx); - __m128i b = _mm_add_epi32(magic, ky); - __m128i c = _mm_add_epi32(magic, kz); + ssei magic = ssei(0xdeadbeef + (len << 2) + 13); + ssei a = magic + kx; + ssei b = magic + ky; + ssei c = magic + kz; xor_rot(c, b, 14); xor_rot(a, c, 11); @@ -133,10 +133,10 @@ ccl_device float floorfrac(float x, int* i) return x - *i; } #else -ccl_device_inline __m128 floorfrac_sse(const __m128& x, __m128i *i) +ccl_device_inline ssef floorfrac_sse(const ssef& 
x, ssei *i) { *i = quick_floor_sse(x); - return _mm_sub_ps(x, _mm_cvtepi32_ps(*i)); + return x - ssef(*i); } #endif @@ -146,11 +146,11 @@ ccl_device float fade(float t) return t * t * t * (t * (t * 6.0f - 15.0f) + 10.0f); } #else -ccl_device_inline __m128 fade_sse(const __m128 *t) +ccl_device_inline ssef fade_sse(const ssef *t) { - __m128 a = fma(*t, _mm_set1_ps(6.0f), _mm_set1_ps(-15.0f)); - __m128 b = fma(*t, a, _mm_set1_ps(10.0f)); - return _mm_mul_ps(_mm_mul_ps(*t, *t), _mm_mul_ps(*t, b)); + ssef a = madd(*t, ssef(6.0f), ssef(-15.0f)); + ssef b = madd(*t, a, ssef(10.0f)); + return ((*t) * (*t)) * ((*t) * b); } #endif @@ -160,10 +160,10 @@ ccl_device float nerp(float t, float a, float b) return (1.0f - t) * a + t * b; } #else -ccl_device_inline __m128 nerp_sse(const __m128& t, const __m128& a, const __m128& b) +ccl_device_inline ssef nerp_sse(const ssef& t, const ssef& a, const ssef& b) { - __m128 x1 = _mm_mul_ps(_mm_sub_ps(_mm_set1_ps(1.0f), t), a); - return fma(t, b, x1); + ssef x1 = (ssef(1.0f) - t) * a; + return madd(t, b, x1); } #endif @@ -178,35 +178,35 @@ ccl_device float grad(int hash, float x, float y, float z) return ((h&1) ? -u : u) + ((h&2) ? -v : v); } #else -ccl_device_inline __m128 grad_sse(const __m128i& hash, const __m128& x, const __m128& y, const __m128& z) +ccl_device_inline ssef grad_sse(const ssei& hash, const ssef& x, const ssef& y, const ssef& z) { - __m128i c1 = _mm_set1_epi32(1); - __m128i c2 = _mm_set1_epi32(2); + ssei c1 = ssei(1); + ssei c2 = ssei(2); - __m128i h = _mm_and_si128(hash, _mm_set1_epi32(15)); // h = hash & 15 + ssei h = hash & ssei(15); // h = hash & 15 - __m128i case_ux = _mm_cmplt_epi32(h, _mm_set1_epi32(8)); // 0xffffffff if h < 8 else 0 + sseb case_ux = h < ssei(8); // 0xffffffff if h < 8 else 0 - __m128 u = blend(_mm_castsi128_ps(case_ux), x, y); // u = h<8 ? x : y + ssef u = select(case_ux, x, y); // u = h<8 ? 
x : y - __m128i case_vy = _mm_cmplt_epi32(h, _mm_set1_epi32(4)); // 0xffffffff if h < 4 else 0 + sseb case_vy = h < ssei(4); // 0xffffffff if h < 4 else 0 - __m128i case_h12 = _mm_cmpeq_epi32(h, _mm_set1_epi32(12)); // 0xffffffff if h == 12 else 0 - __m128i case_h14 = _mm_cmpeq_epi32(h, _mm_set1_epi32(14)); // 0xffffffff if h == 14 else 0 + sseb case_h12 = h == ssei(12); // 0xffffffff if h == 12 else 0 + sseb case_h14 = h == ssei(14); // 0xffffffff if h == 14 else 0 - __m128i case_vx = _mm_or_si128(case_h12, case_h14); // 0xffffffff if h == 12 or h == 14 else 0 + sseb case_vx = case_h12 | case_h14; // 0xffffffff if h == 12 or h == 14 else 0 - __m128 v = blend(_mm_castsi128_ps(case_vy), y, blend(_mm_castsi128_ps(case_vx), x, z)); // v = h<4 ? y : h == 12 || h == 14 ? x : z + ssef v = select(case_vy, y, select(case_vx, x, z)); // v = h<4 ? y : h == 12 || h == 14 ? x : z - __m128i case_uneg = _mm_slli_epi32(_mm_and_si128(h, c1), 31); // 1<<31 if h&1 else 0 - __m128 case_uneg_mask = _mm_castsi128_ps(case_uneg); // -0.0 if h&1 else +0.0 - __m128 ru = _mm_xor_ps(u, case_uneg_mask); // -u if h&1 else u (copy float sign) + ssei case_uneg = (h & c1) << 31; // 1<<31 if h&1 else 0 + ssef case_uneg_mask = cast(case_uneg); // -0.0 if h&1 else +0.0 + ssef ru = u ^ case_uneg_mask; // -u if h&1 else u (copy float sign) - __m128i case_vneg = _mm_slli_epi32(_mm_and_si128(h, c2), 30); // 2<<30 if h&2 else 0 - __m128 case_vneg_mask = _mm_castsi128_ps(case_vneg); // -0.0 if h&2 else +0.0 - __m128 rv = _mm_xor_ps(v, case_vneg_mask); // -v if h&2 else v (copy float sign) + ssei case_vneg = (h & c2) << 30; // 2<<30 if h&2 else 0 + ssef case_vneg_mask = cast(case_vneg); // -0.0 if h&2 else +0.0 + ssef rv = v ^ case_vneg_mask; // -v if h&2 else v (copy float sign) - __m128 r = _mm_add_ps(ru, rv); // ((h&1) ? -u : u) + ((h&2) ? -v : v) + ssef r = ru + rv; // ((h&1) ? -u : u) + ((h&2) ? 
-v : v) return r; } #endif @@ -217,9 +217,9 @@ ccl_device float scale3(float result) return 0.9820f * result; } #else -ccl_device_inline __m128 scale3_sse(const __m128& result) +ccl_device_inline ssef scale3_sse(const ssef& result) { - return _mm_mul_ps(_mm_set1_ps(0.9820f), result); + return ssef(0.9820f) * result; } #endif @@ -252,75 +252,41 @@ ccl_device_noinline float perlin(float x, float y, float z) #else ccl_device_noinline float perlin(float x, float y, float z) { - __m128 xyz = _mm_setr_ps(x, y, z, 0.0f); - __m128i XYZ; + ssef xyz = ssef(x, y, z, 0.0f); + ssei XYZ; - __m128 fxyz = floorfrac_sse(xyz, &XYZ); + ssef fxyz = floorfrac_sse(xyz, &XYZ); - __m128 uvw = fade_sse(&fxyz); - __m128 u = broadcast<0>(uvw), v = broadcast<1>(uvw), w = broadcast<2>(uvw); + ssef uvw = fade_sse(&fxyz); + ssef u = shuffle<0>(uvw), v = shuffle<1>(uvw), w = shuffle<2>(uvw); - __m128i XYZ_ofc = _mm_add_epi32(XYZ, _mm_set1_epi32(1)); - __m128i vdy = shuffle<1, 1, 1, 1>(XYZ, XYZ_ofc); // +0, +0, +1, +1 - __m128i vdz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ_ofc)); // +0, +1, +0, +1 + ssei XYZ_ofc = XYZ + ssei(1); + ssei vdy = shuffle<1, 1, 1, 1>(XYZ, XYZ_ofc); // +0, +0, +1, +1 + ssei vdz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ_ofc)); // +0, +1, +0, +1 - __m128i h1 = hash_sse(broadcast<0>(XYZ), vdy, vdz); // hash directions 000, 001, 010, 011 - __m128i h2 = hash_sse(broadcast<0>(XYZ_ofc), vdy, vdz); // hash directions 100, 101, 110, 111 + ssei h1 = hash_sse(shuffle<0>(XYZ), vdy, vdz); // hash directions 000, 001, 010, 011 + ssei h2 = hash_sse(shuffle<0>(XYZ_ofc), vdy, vdz); // hash directions 100, 101, 110, 111 - __m128 fxyz_ofc = _mm_sub_ps(fxyz, _mm_set1_ps(1.0f)); - __m128 vfy = shuffle<1, 1, 1, 1>(fxyz, fxyz_ofc); - __m128 vfz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz_ofc)); + ssef fxyz_ofc = fxyz - ssef(1.0f); + ssef vfy = shuffle<1, 1, 1, 1>(fxyz, fxyz_ofc); + ssef vfz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz_ofc)); - __m128 g1 = 
grad_sse(h1, broadcast<0>(fxyz), vfy, vfz); - __m128 g2 = grad_sse(h2, broadcast<0>(fxyz_ofc), vfy, vfz); - __m128 n1 = nerp_sse(u, g1, g2); + ssef g1 = grad_sse(h1, shuffle<0>(fxyz), vfy, vfz); + ssef g2 = grad_sse(h2, shuffle<0>(fxyz_ofc), vfy, vfz); + ssef n1 = nerp_sse(u, g1, g2); - __m128 n1_half = shuffle<2, 3, 2, 3>(n1); // extract 2 floats to a separate vector - __m128 n2 = nerp_sse(v, n1, n1_half); // process nerp([a, b, _, _], [c, d, _, _]) -> [a', b', _, _] + ssef n1_half = shuffle<2, 3, 2, 3>(n1); // extract 2 floats to a separate vector + ssef n2 = nerp_sse(v, n1, n1_half); // process nerp([a, b, _, _], [c, d, _, _]) -> [a', b', _, _] - __m128 n2_second = broadcast<1>(n2); // extract b to a separate vector - __m128 result = nerp_sse(w, n2, n2_second); // process nerp([a', _, _, _], [b', _, _, _]) -> [a'', _, _, _] + ssef n2_second = shuffle<1>(n2); // extract b to a separate vector + ssef result = nerp_sse(w, n2, n2_second); // process nerp([a', _, _, _], [b', _, _, _]) -> [a'', _, _, _] - __m128 r = scale3_sse(result); + ssef r = scale3_sse(result); - __m128 infmask = _mm_castsi128_ps(_mm_set1_epi32(0x7f800000)); - __m128 rinfmask = _mm_cmpeq_ps(_mm_and_ps(r, infmask), infmask); // 0xffffffff if r is inf/-inf/nan else 0 - __m128 rfinite = _mm_andnot_ps(rinfmask, r); // 0 if r is inf/-inf/nan else r - return _mm_cvtss_f32(rfinite); -} -#endif - -#if 0 // unused -ccl_device_noinline float perlin_periodic(float x, float y, float z, float3 pperiod) -{ - int X; float fx = floorfrac(x, &X); - int Y; float fy = floorfrac(y, &Y); - int Z; float fz = floorfrac(z, &Z); - - int3 p; - - p.x = max(quick_floor(pperiod.x), 1); - p.y = max(quick_floor(pperiod.y), 1); - p.z = max(quick_floor(pperiod.z), 1); - - float u = fade(fx); - float v = fade(fy); - float w = fade(fz); - - float result; - - result = nerp (w, nerp (v, nerp (u, grad (phash (X , Y , Z , p), fx , fy , fz ), - grad (phash (X+1, Y , Z , p), fx-1.0f, fy , fz )), - nerp (u, grad (phash (X , Y+1, Z , p), 
fx , fy-1.0f, fz ), - grad (phash (X+1, Y+1, Z , p), fx-1.0f, fy-1.0f, fz ))), - nerp (v, nerp (u, grad (phash (X , Y , Z+1, p), fx , fy , fz-1.0f ), - grad (phash (X+1, Y , Z+1, p), fx-1.0f, fy , fz-1.0f )), - nerp (u, grad (phash (X , Y+1, Z+1, p), fx , fy-1.0f, fz-1.0f ), - grad (phash (X+1, Y+1, Z+1, p), fx-1.0f, fy-1.0f, fz-1.0f )))); - float r = scale3(result); - - /* can happen for big coordinates, things even out to 0.0 then anyway */ - return (isfinite(r))? r: 0.0f; + ssef infmask = cast(ssei(0x7f800000)); + ssef rinfmask = ((r & infmask) == infmask).m128; // 0xffffffff if r is inf/-inf/nan else 0 + ssef rfinite = andnot(rinfmask, r); // 0 if r is inf/-inf/nan else r + return extract<0>(rfinite); } #endif @@ -357,30 +323,15 @@ ccl_device float3 cellnoise_color(float3 p) return make_float3(r, g, b); } #else -ccl_device __m128 cellnoise_color(const __m128& p) +ccl_device ssef cellnoise_color(const ssef& p) { - __m128i ip = quick_floor_sse(p); - __m128i ip_yxz = shuffle<1, 0, 2, 3>(ip); - __m128i ip_xyy = shuffle<0, 1, 1, 3>(ip); - __m128i ip_zzx = shuffle<2, 2, 0, 3>(ip); + ssei ip = quick_floor_sse(p); + ssei ip_yxz = shuffle<1, 0, 2, 3>(ip); + ssei ip_xyy = shuffle<0, 1, 1, 3>(ip); + ssei ip_zzx = shuffle<2, 2, 0, 3>(ip); return bits_to_01_sse(hash_sse(ip_xyy, ip_yxz, ip_zzx)); } #endif -#if 0 // unused -/* periodic perlin noise in range 0..1 */ -ccl_device float pnoise(float3 p, float3 pperiod) -{ - float r = perlin_periodic(p.x, p.y, p.z, pperiod); - return 0.5f*r + 0.5f; -} - -/* periodic perlin noise in range -1..1 */ -ccl_device float psnoise(float3 p, float3 pperiod) -{ - return perlin_periodic(p.x, p.y, p.z, pperiod); -} -#endif - CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_sepcomb_rgb.h b/intern/cycles/kernel/svm/svm_sepcomb_rgb.h deleted file mode 100644 index 34c4449ecdb..00000000000 --- a/intern/cycles/kernel/svm/svm_sepcomb_rgb.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under 
the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License - */ - -CCL_NAMESPACE_BEGIN - -ccl_device void svm_node_combine_rgb(ShaderData *sd, float *stack, uint in_offset, uint color_index, uint out_offset) -{ - float color = stack_load_float(stack, in_offset); - - if (stack_valid(out_offset)) - stack_store_float(stack, out_offset+color_index, color); -} - -ccl_device void svm_node_separate_rgb(ShaderData *sd, float *stack, uint icolor_offset, uint color_index, uint out_offset) -{ - float3 color = stack_load_float3(stack, icolor_offset); - - if (stack_valid(out_offset)) { - if (color_index == 0) - stack_store_float(stack, out_offset, color.x); - else if (color_index == 1) - stack_store_float(stack, out_offset, color.y); - else - stack_store_float(stack, out_offset, color.z); - } -} - -CCL_NAMESPACE_END - diff --git a/intern/cycles/kernel/svm/svm_sepcomb_vector.h b/intern/cycles/kernel/svm/svm_sepcomb_vector.h new file mode 100644 index 00000000000..c8e7e34f87d --- /dev/null +++ b/intern/cycles/kernel/svm/svm_sepcomb_vector.h @@ -0,0 +1,44 @@ +/* + * Copyright 2011-2014 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License + */ + +CCL_NAMESPACE_BEGIN + +/* Vector combine / separate, used for the RGB and XYZ nodes */ + +ccl_device void svm_node_combine_vector(ShaderData *sd, float *stack, uint in_offset, uint vector_index, uint out_offset) +{ + float vector = stack_load_float(stack, in_offset); + + if (stack_valid(out_offset)) + stack_store_float(stack, out_offset+vector_index, vector); +} + +ccl_device void svm_node_separate_vector(ShaderData *sd, float *stack, uint ivector_offset, uint vector_index, uint out_offset) +{ + float3 vector = stack_load_float3(stack, ivector_offset); + + if (stack_valid(out_offset)) { + if (vector_index == 0) + stack_store_float(stack, out_offset, vector.x); + else if (vector_index == 1) + stack_store_float(stack, out_offset, vector.y); + else + stack_store_float(stack, out_offset, vector.z); + } +} + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/svm/svm_texture.h b/intern/cycles/kernel/svm/svm_texture.h index 5fd9204cbf6..d97c85db36a 100644 --- a/intern/cycles/kernel/svm/svm_texture.h +++ b/intern/cycles/kernel/svm/svm_texture.h @@ -140,15 +140,15 @@ ccl_device float voronoi_F1_distance(float3 p) } } #else - __m128 vec_p = load_m128(p); - __m128i xyzi = quick_floor_sse(vec_p); + ssef vec_p = load4f(p); + ssei xyzi = quick_floor_sse(vec_p); for (int xx = -1; xx <= 1; xx++) { for (int yy = -1; yy <= 1; yy++) { for (int zz = -1; zz <= 1; zz++) { - __m128 ip = _mm_cvtepi32_ps(_mm_add_epi32(xyzi, _mm_setr_epi32(xx, yy, zz, 0))); - __m128 vp = _mm_add_ps(ip, cellnoise_color(ip)); - float d = len_squared<1, 1, 1, 0>(_mm_sub_ps(vec_p, 
vp)); + ssef ip = ssef(xyzi + ssei(xx, yy, zz, 0)); + ssef vp = ip + cellnoise_color(ip); + float d = len_squared<1, 1, 1, 0>(vec_p - vp); da = min(d, da); } } @@ -184,15 +184,15 @@ ccl_device float3 voronoi_F1_color(float3 p) return cellnoise_color(pa); #else - __m128 pa, vec_p = load_m128(p); - __m128i xyzi = quick_floor_sse(vec_p); + ssef pa, vec_p = load4f(p); + ssei xyzi = quick_floor_sse(vec_p); for (int xx = -1; xx <= 1; xx++) { for (int yy = -1; yy <= 1; yy++) { for (int zz = -1; zz <= 1; zz++) { - __m128 ip = _mm_cvtepi32_ps(_mm_add_epi32(xyzi, _mm_setr_epi32(xx, yy, zz, 0))); - __m128 vp = _mm_add_ps(ip, cellnoise_color(ip)); - float d = len_squared<1, 1, 1, 0>(_mm_sub_ps(vec_p, vp)); + ssef ip = ssef(xyzi + ssei(xx, yy, zz, 0)); + ssef vp = ip + cellnoise_color(ip); + float d = len_squared<1, 1, 1, 0>(vec_p - vp); if(d < da) { da = d; @@ -202,7 +202,7 @@ ccl_device float3 voronoi_F1_color(float3 p) } } - __m128 color = cellnoise_color(pa); + ssef color = cellnoise_color(pa); return (float3 &)color; #endif } diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h index 80972ec82bc..fbe669c1fab 100644 --- a/intern/cycles/kernel/svm/svm_types.h +++ b/intern/cycles/kernel/svm/svm_types.h @@ -72,15 +72,14 @@ typedef enum NodeType { NODE_TEX_COORD, NODE_TEX_COORD_BUMP_DX, NODE_TEX_COORD_BUMP_DY, - NODE_EMISSION_SET_WEIGHT_TOTAL, NODE_ATTR_BUMP_DX, NODE_ATTR_BUMP_DY, NODE_TEX_ENVIRONMENT, NODE_CLOSURE_HOLDOUT, NODE_LAYER_WEIGHT, NODE_CLOSURE_VOLUME, - NODE_SEPARATE_RGB, - NODE_COMBINE_RGB, + NODE_SEPARATE_VECTOR, + NODE_COMBINE_VECTOR, NODE_SEPARATE_HSV, NODE_COMBINE_HSV, NODE_HSV, @@ -349,7 +348,6 @@ typedef enum ClosureType { /* Diffuse */ CLOSURE_BSDF_DIFFUSE_ID, CLOSURE_BSDF_OREN_NAYAR_ID, - CLOSURE_BSDF_WESTIN_SHEEN_ID, CLOSURE_BSDF_DIFFUSE_RAMP_ID, CLOSURE_BSDF_DIFFUSE_TOON_ID, @@ -358,9 +356,11 @@ typedef enum ClosureType { CLOSURE_BSDF_REFLECTION_ID, CLOSURE_BSDF_MICROFACET_GGX_ID, CLOSURE_BSDF_MICROFACET_BECKMANN_ID, - 
CLOSURE_BSDF_WARD_ID, + CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID, + CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID, + CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID, + CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID, CLOSURE_BSDF_ASHIKHMIN_VELVET_ID, - CLOSURE_BSDF_WESTIN_BACKSCATTER_ID, CLOSURE_BSDF_PHONG_RAMP_ID, CLOSURE_BSDF_GLOSSY_TOON_ID, CLOSURE_BSDF_HAIR_REFLECTION_ID, @@ -404,7 +404,7 @@ typedef enum ClosureType { #define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_GLOSSY_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID) #define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSMISSION_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID) #define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID) -#define CLOSURE_IS_BSDF_ANISOTROPIC(type) (type == CLOSURE_BSDF_WARD_ID) +#define CLOSURE_IS_BSDF_ANISOTROPIC(type) (type >= CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID) #define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_GAUSSIAN_ID) #define CLOSURE_IS_BSSRDF(type) (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_GAUSSIAN_ID) #define CLOSURE_IS_VOLUME(type) (type >= CLOSURE_VOLUME_ID && type <= CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) diff --git a/intern/cycles/render/CMakeLists.txt b/intern/cycles/render/CMakeLists.txt index 449c1391980..c3907da39d0 100644 --- a/intern/cycles/render/CMakeLists.txt +++ b/intern/cycles/render/CMakeLists.txt @@ -7,6 +7,7 @@ set(INC ../kernel/osl ../bvh ../util + ../../glew-mx ) set(INC_SYS @@ -76,5 +77,6 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${RTTI_DISABLE_FLAGS}") include_directories(${INC}) include_directories(SYSTEM ${INC_SYS}) -add_library(cycles_render ${SRC} ${SRC_HEADERS}) +add_definitions(${GL_DEFINITIONS}) +add_library(cycles_render ${SRC} ${SRC_HEADERS}) diff --git a/intern/cycles/render/attribute.cpp b/intern/cycles/render/attribute.cpp index 14805b6f11a..8abf869a775 100644 --- a/intern/cycles/render/attribute.cpp +++ b/intern/cycles/render/attribute.cpp @@ -69,6 
+69,15 @@ void Attribute::add(const float& f) buffer.push_back(data[i]); } +void Attribute::add(const uchar4& f) +{ + char *data = (char*)&f; + size_t size = sizeof(f); + + for(size_t i = 0; i < size; i++) + buffer.push_back(data[i]); +} + void Attribute::add(const float3& f) { char *data = (char*)&f; @@ -136,6 +145,7 @@ size_t Attribute::element_size(int numverts, int numtris, int numsteps, int numc size = numtris; break; case ATTR_ELEMENT_CORNER: + case ATTR_ELEMENT_CORNER_BYTE: size = numtris*3; break; case ATTR_ELEMENT_CURVE: @@ -263,11 +273,19 @@ Attribute *AttributeSet::add(ustring name, TypeDesc type, AttributeElement eleme remove(name); } - attributes.push_back(Attribute()); +#if __cplusplus >= 201103L + attributes.emplace_back(); attr = &attributes.back(); - attr->set(name, type, element); - +#else + { + Attribute attr_temp; + attr_temp.set(name, type, element); + attributes.push_back(attr_temp); + attr = &attributes.back(); + } +#endif + /* this is weak .. */ if(triangle_mesh) attr->reserve(triangle_mesh->verts.size(), triangle_mesh->triangles.size(), triangle_mesh->motion_steps, 0, 0, resize); diff --git a/intern/cycles/render/attribute.h b/intern/cycles/render/attribute.h index 9fc32db8444..f5227ebde52 100644 --- a/intern/cycles/render/attribute.h +++ b/intern/cycles/render/attribute.h @@ -68,6 +68,7 @@ public: float3 *data_float3() { return (float3*)data(); } float4 *data_float4() { return (float4*)data(); } float *data_float() { return (float*)data(); } + uchar4 *data_uchar4() { return (uchar4*)data(); } Transform *data_transform() { return (Transform*)data(); } VoxelAttribute *data_voxel() { return ( VoxelAttribute*)data(); } @@ -80,6 +81,7 @@ public: void add(const float& f); void add(const float3& f); + void add(const uchar4& f); void add(const Transform& f); void add(const VoxelAttribute& f); void add(const char *data); diff --git a/intern/cycles/render/background.cpp b/intern/cycles/render/background.cpp index a877c52fbed..3926ecb99d6 100644 --- 
a/intern/cycles/render/background.cpp +++ b/intern/cycles/render/background.cpp @@ -78,6 +78,8 @@ void Background::device_update(Device *device, DeviceScene *dscene, Scene *scene kbackground->surface_shader |= SHADER_EXCLUDE_GLOSSY; if(!(visibility & PATH_RAY_TRANSMIT)) kbackground->surface_shader |= SHADER_EXCLUDE_TRANSMIT; + if(!(visibility & PATH_RAY_VOLUME_SCATTER)) + kbackground->surface_shader |= SHADER_EXCLUDE_SCATTER; if(!(visibility & PATH_RAY_CAMERA)) kbackground->surface_shader |= SHADER_EXCLUDE_CAMERA; diff --git a/intern/cycles/render/bake.cpp b/intern/cycles/render/bake.cpp index aa317ab672f..5723a22dd84 100644 --- a/intern/cycles/render/bake.cpp +++ b/intern/cycles/render/bake.cpp @@ -15,10 +15,11 @@ */ #include "bake.h" +#include "integrator.h" CCL_NAMESPACE_BEGIN -BakeData::BakeData(const int object, const int tri_offset, const int num_pixels): +BakeData::BakeData(const int object, const size_t tri_offset, const size_t num_pixels): m_object(object), m_tri_offset(tri_offset), m_num_pixels(num_pixels) @@ -59,7 +60,7 @@ int BakeData::object() return m_object; } -int BakeData::size() +size_t BakeData::size() { return m_num_pixels; } @@ -94,6 +95,7 @@ BakeManager::BakeManager() m_bake_data = NULL; m_is_baking = false; need_update = true; + m_shader_limit = 512 * 512; } BakeManager::~BakeManager() @@ -112,75 +114,105 @@ void BakeManager::set_baking(const bool value) m_is_baking = value; } -BakeData *BakeManager::init(const int object, const int tri_offset, const int num_pixels) +BakeData *BakeManager::init(const int object, const size_t tri_offset, const size_t num_pixels) { m_bake_data = new BakeData(object, tri_offset, num_pixels); return m_bake_data; } +void BakeManager::set_shader_limit(const size_t x, const size_t y) +{ + m_shader_limit = x * y; + m_shader_limit = (size_t)pow(2, ceil(log(m_shader_limit)/log(2))); +} + bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress, ShaderEvalType shader_type, BakeData 
*bake_data, float result[]) { - size_t limit = bake_data->size(); + size_t num_pixels = bake_data->size(); + + progress.reset_sample(); + this->num_parts = 0; + + /* calculate the total parts for the progress bar */ + for(size_t shader_offset = 0; shader_offset < num_pixels; shader_offset += m_shader_limit) { + size_t shader_size = (size_t)fminf(num_pixels - shader_offset, m_shader_limit); - /* setup input for device task */ - device_vector<uint4> d_input; - uint4 *d_input_data = d_input.resize(limit * 2); - size_t d_input_size = 0; + DeviceTask task(DeviceTask::SHADER); + task.shader_w = shader_size; - for(size_t i = 0; i < limit; i++) { - d_input_data[d_input_size++] = bake_data->data(i); - d_input_data[d_input_size++] = bake_data->differentials(i); + this->num_parts += device->get_split_task_count(task); } - if(d_input_size == 0) - return false; + this->num_samples = is_aa_pass(shader_type)? scene->integrator->aa_samples : 1; - /* run device task */ - device_vector<float4> d_output; - d_output.resize(limit); + for(size_t shader_offset = 0; shader_offset < num_pixels; shader_offset += m_shader_limit) { + size_t shader_size = (size_t)fminf(num_pixels - shader_offset, m_shader_limit); - /* needs to be up to data for attribute access */ - device->const_copy_to("__data", &dscene->data, sizeof(dscene->data)); + /* setup input for device task */ + device_vector<uint4> d_input; + uint4 *d_input_data = d_input.resize(shader_size * 2); + size_t d_input_size = 0; - device->mem_alloc(d_input, MEM_READ_ONLY); - device->mem_copy_to(d_input); - device->mem_alloc(d_output, MEM_WRITE_ONLY); + for(size_t i = shader_offset; i < (shader_offset + shader_size); i++) { + d_input_data[d_input_size++] = bake_data->data(i); + d_input_data[d_input_size++] = bake_data->differentials(i); + } - DeviceTask task(DeviceTask::SHADER); - task.shader_input = d_input.device_pointer; - task.shader_output = d_output.device_pointer; - task.shader_eval_type = shader_type; - task.shader_x = 0; - 
task.shader_w = d_output.size(); - task.get_cancel = function_bind(&Progress::get_cancel, &progress); + if(d_input_size == 0) { + m_is_baking = false; + return false; + } - device->task_add(task); - device->task_wait(); + /* run device task */ + device_vector<float4> d_output; + d_output.resize(shader_size); + + /* needs to be up to data for attribute access */ + device->const_copy_to("__data", &dscene->data, sizeof(dscene->data)); + + device->mem_alloc(d_input, MEM_READ_ONLY); + device->mem_copy_to(d_input); + device->mem_alloc(d_output, MEM_WRITE_ONLY); + + DeviceTask task(DeviceTask::SHADER); + task.shader_input = d_input.device_pointer; + task.shader_output = d_output.device_pointer; + task.shader_eval_type = shader_type; + task.shader_x = 0; + task.offset = shader_offset; + task.shader_w = d_output.size(); + task.num_samples = this->num_samples; + task.get_cancel = function_bind(&Progress::get_cancel, &progress); + task.update_progress_sample = function_bind(&Progress::increment_sample_update, &progress); + + device->task_add(task); + device->task_wait(); + + if(progress.get_cancel()) { + device->mem_free(d_input); + device->mem_free(d_output); + m_is_baking = false; + return false; + } - if(progress.get_cancel()) { + device->mem_copy_from(d_output, 0, 1, d_output.size(), sizeof(float4)); device->mem_free(d_input); device->mem_free(d_output); - m_is_baking = false; - return false; - } - device->mem_copy_from(d_output, 0, 1, d_output.size(), sizeof(float4)); - device->mem_free(d_input); - device->mem_free(d_output); + /* read result */ + int k = 0; - /* read result */ - int k = 0; + float4 *offset = (float4*)d_output.data_pointer; - float4 *offset = (float4*)d_output.data_pointer; + size_t depth = 4; + for(size_t i=shader_offset; i < (shader_offset + shader_size); i++) { + size_t index = i * depth; + float4 out = offset[k++]; - size_t depth = 4; - for(size_t i = 0; i < limit; i++) { - size_t index = i * depth; - float4 out = offset[k++]; - - 
if(bake_data->is_valid(i)) { - for(size_t j=0; j < 4; j++) { - result[index + j] = out[j]; + if(bake_data->is_valid(i)) { + for(size_t j=0; j < 4; j++) { + result[index + j] = out[j]; + } } } } @@ -203,4 +235,35 @@ void BakeManager::device_free(Device *device, DeviceScene *dscene) { } +bool BakeManager::is_aa_pass(ShaderEvalType type) +{ + switch(type) { + case SHADER_EVAL_UV: + case SHADER_EVAL_NORMAL: + return false; + default: + return true; + } +} + +bool BakeManager::is_light_pass(ShaderEvalType type) +{ + switch(type) { + case SHADER_EVAL_AO: + case SHADER_EVAL_COMBINED: + case SHADER_EVAL_SHADOW: + case SHADER_EVAL_DIFFUSE_DIRECT: + case SHADER_EVAL_GLOSSY_DIRECT: + case SHADER_EVAL_TRANSMISSION_DIRECT: + case SHADER_EVAL_SUBSURFACE_DIRECT: + case SHADER_EVAL_DIFFUSE_INDIRECT: + case SHADER_EVAL_GLOSSY_INDIRECT: + case SHADER_EVAL_TRANSMISSION_INDIRECT: + case SHADER_EVAL_SUBSURFACE_INDIRECT: + return true; + default: + return false; + } +} + CCL_NAMESPACE_END diff --git a/intern/cycles/render/bake.h b/intern/cycles/render/bake.h index ea403f7d39a..186fbbeea4d 100644 --- a/intern/cycles/render/bake.h +++ b/intern/cycles/render/bake.h @@ -17,29 +17,30 @@ #ifndef __BAKE_H__ #define __BAKE_H__ -#include "util_vector.h" #include "device.h" #include "scene.h" -#include "session.h" + +#include "util_progress.h" +#include "util_vector.h" CCL_NAMESPACE_BEGIN class BakeData { public: - BakeData(const int object, const int tri_offset, const int num_pixels); + BakeData(const int object, const size_t tri_offset, const size_t num_pixels); ~BakeData(); void set(int i, int prim, float uv[2], float dudx, float dudy, float dvdx, float dvdy); int object(); - int size(); + size_t size(); uint4 data(int i); uint4 differentials(int i); bool is_valid(int i); private: int m_object; - int m_tri_offset; - int m_num_pixels; + size_t m_tri_offset; + size_t m_num_pixels; vector<int>m_primitive; vector<float>m_u; vector<float>m_v; @@ -57,18 +58,27 @@ public: bool get_baking(); void 
set_baking(const bool value); - BakeData *init(const int object, const int tri_offset, const int num_pixels); + BakeData *init(const int object, const size_t tri_offset, const size_t num_pixels); + + void set_shader_limit(const size_t x, const size_t y); bool bake(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress, ShaderEvalType shader_type, BakeData *bake_data, float result[]); void device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress); void device_free(Device *device, DeviceScene *dscene); + static bool is_light_pass(ShaderEvalType type); + static bool is_aa_pass(ShaderEvalType type); + bool need_update; + int num_samples; + int num_parts; + private: BakeData *m_bake_data; bool m_is_baking; + size_t m_shader_limit; }; CCL_NAMESPACE_END diff --git a/intern/cycles/render/blackbody.cpp b/intern/cycles/render/blackbody.cpp index 89af714e8ec..6e2cb7c62b6 100644 --- a/intern/cycles/render/blackbody.cpp +++ b/intern/cycles/render/blackbody.cpp @@ -100,7 +100,7 @@ vector<float> blackbody_table() /* ToDo: bring this back to what OSL does with the lastTemperature limit ? 
*/ for (int i = 0; i <= 317; ++i) { - double Temperature = pow((double)i, (double)BB_TABLE_XPOWER) * (double)BB_TABLE_SPACING + (double)BB_DRAPPER; + double Temperature = pow((double)i, (double)BB_TABLE_XPOWER) * (double)BB_TABLE_SPACING + (double)BB_DRAPER; X = 0; Y = 0; Z = 0; diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp index fc65922fc87..756e16b38b5 100644 --- a/intern/cycles/render/buffers.cpp +++ b/intern/cycles/render/buffers.cpp @@ -190,6 +190,14 @@ bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int pixels[0] = clamp(f*scale_exposure, 0.0f, 1.0f); } } +#ifdef WITH_CYCLES_DEBUG + else if(type == PASS_BVH_TRAVERSAL_STEPS) { + for(int i = 0; i < size; i++, in += pass_stride, pixels++) { + float f = *in; + pixels[0] = f; + } + } +#endif else { for(int i = 0; i < size; i++, in += pass_stride, pixels++) { float f = *in; diff --git a/intern/cycles/render/camera.cpp b/intern/cycles/render/camera.cpp index 8659fe4f7a3..110adb4d036 100644 --- a/intern/cycles/render/camera.cpp +++ b/intern/cycles/render/camera.cpp @@ -15,10 +15,13 @@ */ #include "camera.h" +#include "mesh.h" +#include "object.h" #include "scene.h" #include "device.h" +#include "util_foreach.h" #include "util_vector.h" CCL_NAMESPACE_BEGIN @@ -38,6 +41,8 @@ Camera::Camera() motion.post = transform_identity(); use_motion = false; + aperture_ratio = 1.0f; + type = CAMERA_PERSPECTIVE; panorama_type = PANORAMA_EQUIRECTANGULAR; fisheye_fov = M_PI_F; @@ -241,6 +246,9 @@ void Camera::device_update(Device *device, DeviceScene *dscene, Scene *scene) /* type */ kcam->type = type; + /* anamorphic lens bokeh */ + kcam->inv_aperture_ratio = 1.0f / aperture_ratio; + /* panorama */ kcam->panorama_type = panorama_type; kcam->fisheye_fov = fisheye_fov; @@ -265,6 +273,20 @@ void Camera::device_update(Device *device, DeviceScene *dscene, Scene *scene) need_device_update = false; previous_need_motion = need_motion; + + /* Camera in volume. 
*/ + kcam->is_inside_volume = 0; + BoundBox viewplane_boundbox = viewplane_bounds_get(); + for(size_t i = 0; i < scene->objects.size(); ++i) { + Object *object = scene->objects[i]; + if(object->mesh->has_volume && + viewplane_boundbox.intersects(object->bounds)) + { + /* TODO(sergey): Consider adding more grained check. */ + kcam->is_inside_volume = 1; + break; + } + } } void Camera::device_free(Device *device, DeviceScene *dscene) @@ -291,6 +313,7 @@ bool Camera::modified(const Camera& cam) (viewplane == cam.viewplane) && (border == cam.border) && (matrix == cam.matrix) && + (aperture_ratio == cam.aperture_ratio) && (panorama_type == cam.panorama_type) && (fisheye_fov == cam.fisheye_fov) && (fisheye_lens == cam.fisheye_lens)); @@ -307,5 +330,62 @@ void Camera::tag_update() need_update = true; } +float3 Camera::transform_raster_to_world(float raster_x, float raster_y) +{ + float3 D, P; + if(type == CAMERA_PERSPECTIVE) { + D = transform_perspective(&rastertocamera, + make_float3(raster_x, raster_y, 0.0f)); + P = make_float3(0.0f, 0.0f, 0.0f); + /* TODO(sergey): Aperture support? */ + P = transform_point(&cameratoworld, P); + D = normalize(transform_direction(&cameratoworld, D)); + /* TODO(sergey): Clipping is conditional in kernel, and hence it could + * be mistakes in here, currently leading to wrong camera-in-volume + * detection. + */ + P += nearclip * D; + } + else if (type == CAMERA_ORTHOGRAPHIC) { + D = make_float3(0.0f, 0.0f, 1.0f); + /* TODO(sergey): Aperture support? */ + P = transform_perspective(&rastertocamera, + make_float3(raster_x, raster_y, 0.0f)); + P = transform_point(&cameratoworld, P); + D = normalize(transform_direction(&cameratoworld, D)); + } + else { + assert(!"unsupported camera type"); + } + return P; +} + +BoundBox Camera::viewplane_bounds_get() +{ + /* TODO(sergey): This is all rather stupid, but is there a way to perform + * checks we need in a more clear and smart fasion? 
+ */ + BoundBox bounds = BoundBox::empty; + + if(type == CAMERA_PANORAMA) { + bounds.grow(make_float3(cameratoworld.w.x, + cameratoworld.w.y, + cameratoworld.w.z)); + } + else { + bounds.grow(transform_raster_to_world(0.0f, 0.0f)); + bounds.grow(transform_raster_to_world(0.0f, (float)height)); + bounds.grow(transform_raster_to_world((float)width, (float)height)); + bounds.grow(transform_raster_to_world((float)width, 0.0f)); + if(type == CAMERA_PERSPECTIVE) { + /* Center point has the most distancei in local Z axis, + * use it to construct bounding box/ + */ + bounds.grow(transform_raster_to_world(0.5f*width, 0.5f*height)); + } + } + return bounds; +} + CCL_NAMESPACE_END diff --git a/intern/cycles/render/camera.h b/intern/cycles/render/camera.h index c28670bc55f..788ae7b9bb6 100644 --- a/intern/cycles/render/camera.h +++ b/intern/cycles/render/camera.h @@ -54,6 +54,9 @@ public: float fisheye_fov; float fisheye_lens; + /* anamorphic lens bokeh */ + float aperture_ratio; + /* sensor */ float sensorwidth; float sensorheight; @@ -113,6 +116,9 @@ public: bool modified(const Camera& cam); bool motion_modified(const Camera& cam); void tag_update(); + + BoundBox viewplane_bounds_get(); + float3 transform_raster_to_world(float raster_x, float raster_y); }; CCL_NAMESPACE_END diff --git a/intern/cycles/render/curves.cpp b/intern/cycles/render/curves.cpp index 2c96ffa655e..dc7665fe144 100644 --- a/intern/cycles/render/curves.cpp +++ b/intern/cycles/render/curves.cpp @@ -46,8 +46,9 @@ void curvebounds(float *lower, float *upper, float3 *p, int dim) float discroot = curve_coef[2] * curve_coef[2] - 3 * curve_coef[3] * curve_coef[1]; float ta = -1.0f; float tb = -1.0f; + if(discroot >= 0) { - discroot = sqrt(discroot); + discroot = sqrtf(discroot); ta = (-curve_coef[2] - discroot) / (3 * curve_coef[3]); tb = (-curve_coef[2] + discroot) / (3 * curve_coef[3]); ta = (ta > 1.0f || ta < 0.0f) ? 
-1.0f : ta; @@ -56,20 +57,21 @@ void curvebounds(float *lower, float *upper, float3 *p, int dim) *upper = max(p1[dim],p2[dim]); *lower = min(p1[dim],p2[dim]); + float exa = p1[dim]; float exb = p2[dim]; - float t2; - float t3; + if(ta >= 0.0f) { - t2 = ta * ta; - t3 = t2 * ta; + float t2 = ta * ta; + float t3 = t2 * ta; exa = curve_coef[3] * t3 + curve_coef[2] * t2 + curve_coef[1] * ta + curve_coef[0]; } if(tb >= 0.0f) { - t2 = tb * tb; - t3 = t2 * tb; + float t2 = tb * tb; + float t3 = t2 * tb; exb = curve_coef[3] * t3 + curve_coef[2] * t2 + curve_coef[1] * tb + curve_coef[0]; } + *upper = max(*upper, max(exa,exb)); *lower = min(*lower, min(exa,exb)); } diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp index c1aefbcfbbc..19f959d4ea1 100644 --- a/intern/cycles/render/film.cpp +++ b/intern/cycles/render/film.cpp @@ -80,22 +80,13 @@ void Pass::add(PassType type, vector<Pass>& passes) pass.components = 1; break; case PASS_OBJECT_ID: - pass.components = 1; - pass.filter = false; - break; case PASS_MATERIAL_ID: pass.components = 1; pass.filter = false; break; case PASS_DIFFUSE_COLOR: - pass.components = 4; - break; case PASS_GLOSSY_COLOR: - pass.components = 4; - break; case PASS_TRANSMISSION_COLOR: - pass.components = 4; - break; case PASS_SUBSURFACE_COLOR: pass.components = 4; break; @@ -141,9 +132,6 @@ void Pass::add(PassType type, vector<Pass>& passes) break; case PASS_EMISSION: - pass.components = 4; - pass.exposure = true; - break; case PASS_BACKGROUND: pass.components = 4; pass.exposure = true; @@ -158,6 +146,12 @@ void Pass::add(PassType type, vector<Pass>& passes) case PASS_LIGHT: /* ignores */ break; +#ifdef WITH_CYCLES_DEBUG + case PASS_BVH_TRAVERSAL_STEPS: + pass.components = 1; + pass.exposure = false; + break; +#endif } passes.push_back(pass); @@ -400,6 +394,13 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) case PASS_LIGHT: kfilm->use_light_pass = 1; break; + +#ifdef WITH_CYCLES_DEBUG + case 
PASS_BVH_TRAVERSAL_STEPS: + kfilm->pass_bvh_traversal_steps = kfilm->pass_stride; + break; +#endif + case PASS_NONE: break; } diff --git a/intern/cycles/render/graph.cpp b/intern/cycles/render/graph.cpp index 0ff904d06e7..45b08832fea 100644 --- a/intern/cycles/render/graph.cpp +++ b/intern/cycles/render/graph.cpp @@ -320,20 +320,20 @@ void ShaderGraph::remove_unneeded_nodes() { vector<bool> removed(num_node_ids, false); bool any_node_removed = false; - + /* find and unlink proxy nodes */ foreach(ShaderNode *node, nodes) { if(node->special_type == SHADER_SPECIAL_TYPE_PROXY) { ProxyNode *proxy = static_cast<ProxyNode*>(node); ShaderInput *input = proxy->inputs[0]; ShaderOutput *output = proxy->outputs[0]; - + /* temp. copy of the output links list. * output->links is modified when we disconnect! */ vector<ShaderInput*> links(output->links); ShaderOutput *from = input->link; - + /* bypass the proxy node */ if(from) { disconnect(input); @@ -391,6 +391,8 @@ void ShaderGraph::remove_unneeded_nodes() if(output) connect(output, input); } + removed[mix->id] = true; + any_node_removed = true; } /* remove unused mix closure input when factor is 0.0 or 1.0 */ @@ -400,7 +402,7 @@ void ShaderGraph::remove_unneeded_nodes() if(mix->inputs[0]->value.x == 0.0f) { ShaderOutput *output = mix->inputs[1]->link; vector<ShaderInput*> inputs = mix->outputs[0]->links; - + foreach(ShaderInput *sock, mix->inputs) if(sock->link) disconnect(sock); @@ -410,6 +412,8 @@ void ShaderGraph::remove_unneeded_nodes() if(output) connect(output, input); } + removed[mix->id] = true; + any_node_removed = true; } /* factor 1.0 */ else if(mix->inputs[0]->value.x == 1.0f) { @@ -425,13 +429,57 @@ void ShaderGraph::remove_unneeded_nodes() if(output) connect(output, input); } + removed[mix->id] = true; + any_node_removed = true; + } + } + } + else if(node->special_type == SHADER_SPECIAL_TYPE_MIX_RGB) { + MixNode *mix = static_cast<MixNode*>(node); + + /* remove unused Mix RGB inputs when factor is 0.0 or 1.0 */ + 
/* check for color links and make sure factor link is disconnected */ + if(mix->outputs[0]->links.size() && mix->inputs[1]->link && mix->inputs[2]->link && !mix->inputs[0]->link) { + /* factor 0.0 */ + if(mix->inputs[0]->value.x == 0.0f) { + ShaderOutput *output = mix->inputs[1]->link; + vector<ShaderInput*> inputs = mix->outputs[0]->links; + + foreach(ShaderInput *sock, mix->inputs) + if(sock->link) + disconnect(sock); + + foreach(ShaderInput *input, inputs) { + disconnect(input); + if(output) + connect(output, input); + } + removed[mix->id] = true; + any_node_removed = true; + } + /* factor 1.0 */ + else if(mix->inputs[0]->value.x == 1.0f) { + ShaderOutput *output = mix->inputs[2]->link; + vector<ShaderInput*> inputs = mix->outputs[0]->links; + + foreach(ShaderInput *sock, mix->inputs) + if(sock->link) + disconnect(sock); + + foreach(ShaderInput *input, inputs) { + disconnect(input); + if(output) + connect(output, input); + } + removed[mix->id] = true; + any_node_removed = true; } } } } /* remove nodes */ - if (any_node_removed) { + if(any_node_removed) { list<ShaderNode*> newnodes; foreach(ShaderNode *node, nodes) { @@ -787,5 +835,47 @@ void ShaderGraph::transform_multi_closure(ShaderNode *node, ShaderOutput *weight } } +void ShaderGraph::dump_graph(const char *filename) +{ + FILE *fd = fopen(filename, "w"); + + if(fd == NULL) { + printf("Error opening file for dumping the graph: %s\n", filename); + return; + } + + fprintf(fd, "digraph dependencygraph {\n"); + fprintf(fd, "ranksep=1.5\n"); + fprintf(fd, "splines=false\n"); + + foreach(ShaderNode *node, nodes) { + fprintf(fd, "// NODE: %p\n", node); + fprintf(fd, + "\"%p\" [shape=record,label=\"%s\"]\n", + node, + node->name.c_str()); + } + + foreach(ShaderNode *node, nodes) { + foreach(ShaderOutput *output, node->outputs) { + foreach(ShaderInput *input, output->links) { + fprintf(fd, + "// CONNECTION: %p->%p (%s:%s)\n", + output, + input, + output->name, input->name); + fprintf(fd, + "\"%p\":s -> \"%p\":n 
[label=\"%s:%s\"]\n", + output->parent, + input->parent, + output->name, input->name); + } + } + } + + fprintf(fd, "}\n"); + fclose(fd); +} + CCL_NAMESPACE_END diff --git a/intern/cycles/render/graph.h b/intern/cycles/render/graph.h index 89a066195d6..7b95703d3aa 100644 --- a/intern/cycles/render/graph.h +++ b/intern/cycles/render/graph.h @@ -76,6 +76,7 @@ enum ShaderNodeSpecialType { SHADER_SPECIAL_TYPE_NONE, SHADER_SPECIAL_TYPE_PROXY, SHADER_SPECIAL_TYPE_MIX_CLOSURE, + SHADER_SPECIAL_TYPE_MIX_RGB, /* Only Mix subtype */ SHADER_SPECIAL_TYPE_AUTOCONVERT, SHADER_SPECIAL_TYPE_GEOMETRY, SHADER_SPECIAL_TYPE_SCRIPT @@ -249,6 +250,8 @@ public: void remove_unneeded_nodes(); void finalize(bool do_bump = false, bool do_osl = false); + void dump_graph(const char *filename); + protected: typedef pair<ShaderNode* const, ShaderNode*> NodePair; diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp index 86755badc42..eb2c3333c44 100644 --- a/intern/cycles/render/image.cpp +++ b/intern/cycles/render/image.cpp @@ -135,6 +135,7 @@ bool ImageManager::is_float_image(const string& filename, void *builtin_data, bo (colorspace == "" && (strcmp(in->format_name(), "png") == 0 || strcmp(in->format_name(), "tiff") == 0 || + strcmp(in->format_name(), "dpx") == 0 || strcmp(in->format_name(), "jpeg2000") == 0))); } else { @@ -157,7 +158,8 @@ static bool image_equals(ImageManager::Image *image, const string& filename, voi image->interpolation == interpolation; } -int ImageManager::add_image(const string& filename, void *builtin_data, bool animated, bool& is_float, bool& is_linear, InterpolationType interpolation, bool use_alpha) +int ImageManager::add_image(const string& filename, void *builtin_data, bool animated, float frame, + bool& is_float, bool& is_linear, InterpolationType interpolation, bool use_alpha) { Image *img; size_t slot; @@ -168,8 +170,17 @@ int ImageManager::add_image(const string& filename, void *builtin_data, bool ani if(is_float) { /* find existing 
image */ for(slot = 0; slot < float_images.size(); slot++) { - if(float_images[slot] && image_equals(float_images[slot], filename, builtin_data, interpolation)) { - float_images[slot]->users++; + img = float_images[slot]; + if(img && image_equals(img, filename, builtin_data, interpolation)) { + if(img->frame != frame) { + img->frame = frame; + img->need_load = true; + } + if(img->use_alpha != use_alpha) { + img->use_alpha = use_alpha; + img->need_load = true; + } + img->users++; return slot; } } @@ -197,6 +208,7 @@ int ImageManager::add_image(const string& filename, void *builtin_data, bool ani img->builtin_data = builtin_data; img->need_load = true; img->animated = animated; + img->frame = frame; img->interpolation = interpolation; img->users = 1; img->use_alpha = use_alpha; @@ -205,8 +217,17 @@ int ImageManager::add_image(const string& filename, void *builtin_data, bool ani } else { for(slot = 0; slot < images.size(); slot++) { - if(images[slot] && image_equals(images[slot], filename, builtin_data, interpolation)) { - images[slot]->users++; + img = images[slot]; + if(img && image_equals(img, filename, builtin_data, interpolation)) { + if(img->frame != frame) { + img->frame = frame; + img->need_load = true; + } + if(img->use_alpha != use_alpha) { + img->use_alpha = use_alpha; + img->need_load = true; + } + img->users++; return slot+tex_image_byte_start; } } @@ -234,6 +255,7 @@ int ImageManager::add_image(const string& filename, void *builtin_data, bool ani img->builtin_data = builtin_data; img->need_load = true; img->animated = animated; + img->frame = frame; img->interpolation = interpolation; img->users = 1; img->use_alpha = use_alpha; @@ -242,6 +264,7 @@ int ImageManager::add_image(const string& filename, void *builtin_data, bool ani slot += tex_image_byte_start; } + need_update = true; return slot; @@ -299,6 +322,32 @@ void ImageManager::remove_image(const string& filename, void *builtin_data, Inte } } +/* TODO(sergey): Deduplicate with the iteration above, 
but make it pretty, + * without bunch of arguments passing around making code readability even + * more cluttered. + */ +void ImageManager::tag_reload_image(const string& filename, void *builtin_data, InterpolationType interpolation) +{ + size_t slot; + + for(slot = 0; slot < images.size(); slot++) { + if(images[slot] && image_equals(images[slot], filename, builtin_data, interpolation)) { + images[slot]->need_load = true; + break; + } + } + + if(slot == images.size()) { + /* see if it's in a float texture slot */ + for(slot = 0; slot < float_images.size(); slot++) { + if(float_images[slot] && image_equals(float_images[slot], filename, builtin_data, interpolation)) { + float_images[slot]->need_load = true; + break; + } + } + } +} + bool ImageManager::file_load_image(Image *img, device_vector<uchar4>& tex_img) { if(img->filename == "") @@ -351,6 +400,7 @@ bool ImageManager::file_load_image(Image *img, device_vector<uchar4>& tex_img) /* read RGBA pixels */ uchar *pixels = (uchar*)tex_img.resize(width, height, depth); + bool cmyk = false; if(in) { if(depth <= 1) { @@ -366,6 +416,8 @@ bool ImageManager::file_load_image(Image *img, device_vector<uchar4>& tex_img) in->read_image(TypeDesc::UINT8, (uchar*)pixels); } + cmyk = strcmp(in->format_name(), "jpeg") == 0 && components == 4; + in->close(); delete in; } @@ -373,7 +425,17 @@ bool ImageManager::file_load_image(Image *img, device_vector<uchar4>& tex_img) builtin_image_pixels_cb(img->filename, img->builtin_data, pixels); } - if(components == 2) { + if(cmyk) { + /* CMYK */ + for(int i = width*height*depth-1; i >= 0; i--) { + pixels[i*4+2] = (pixels[i*4+2]*pixels[i*4+3])/255; + pixels[i*4+1] = (pixels[i*4+1]*pixels[i*4+3])/255; + pixels[i*4+0] = (pixels[i*4+0]*pixels[i*4+3])/255; + pixels[i*4+3] = 255; + } + } + else if(components == 2) { + /* grayscale + alpha */ for(int i = width*height*depth-1; i >= 0; i--) { pixels[i*4+3] = pixels[i*2+1]; pixels[i*4+2] = pixels[i*2+0]; @@ -382,6 +444,7 @@ bool 
ImageManager::file_load_image(Image *img, device_vector<uchar4>& tex_img) } } else if(components == 3) { + /* RGB */ for(int i = width*height*depth-1; i >= 0; i--) { pixels[i*4+3] = 255; pixels[i*4+2] = pixels[i*3+2]; @@ -390,6 +453,7 @@ bool ImageManager::file_load_image(Image *img, device_vector<uchar4>& tex_img) } } else if(components == 1) { + /* grayscale */ for(int i = width*height*depth-1; i >= 0; i--) { pixels[i*4+3] = 255; pixels[i*4+2] = pixels[i]; @@ -448,7 +512,7 @@ bool ImageManager::file_load_float_image(Image *img, device_vector<float4>& tex_ builtin_image_info_cb(img->filename, img->builtin_data, is_float, width, height, depth, components); } - if(!(components >= 1 && components <= 4)) { + if(components < 1 || width == 0 || height == 0) { if(in) { in->close(); delete in; @@ -458,21 +522,43 @@ bool ImageManager::file_load_float_image(Image *img, device_vector<float4>& tex_ /* read RGBA pixels */ float *pixels = (float*)tex_img.resize(width, height, depth); + bool cmyk = false; if(in) { + float *readpixels = pixels; + vector<float> tmppixels; + + if(components > 4) { + tmppixels.resize(width*height*components); + readpixels = &tmppixels[0]; + } + if(depth <= 1) { int scanlinesize = width*components*sizeof(float); in->read_image(TypeDesc::FLOAT, - (uchar*)pixels + (height-1)*scanlinesize, + (uchar*)readpixels + (height-1)*scanlinesize, AutoStride, -scanlinesize, AutoStride); } else { - in->read_image(TypeDesc::FLOAT, (uchar*)pixels); + in->read_image(TypeDesc::FLOAT, (uchar*)readpixels); + } + + if(components > 4) { + for(int i = width*height-1; i >= 0; i--) { + pixels[i*4+3] = tmppixels[i*components+3]; + pixels[i*4+2] = tmppixels[i*components+2]; + pixels[i*4+1] = tmppixels[i*components+1]; + pixels[i*4+0] = tmppixels[i*components+0]; + } + + tmppixels.clear(); } + cmyk = strcmp(in->format_name(), "jpeg") == 0 && components == 4; + in->close(); delete in; } @@ -480,7 +566,17 @@ bool ImageManager::file_load_float_image(Image *img, 
device_vector<float4>& tex_ builtin_image_float_pixels_cb(img->filename, img->builtin_data, pixels); } - if(components == 2) { + if(cmyk) { + /* CMYK */ + for(int i = width*height*depth-1; i >= 0; i--) { + pixels[i*4+3] = 255; + pixels[i*4+2] = (pixels[i*4+2]*pixels[i*4+3])/255; + pixels[i*4+1] = (pixels[i*4+1]*pixels[i*4+3])/255; + pixels[i*4+0] = (pixels[i*4+0]*pixels[i*4+3])/255; + } + } + else if(components == 2) { + /* grayscale + alpha */ for(int i = width*height*depth-1; i >= 0; i--) { pixels[i*4+3] = pixels[i*2+1]; pixels[i*4+2] = pixels[i*2+0]; @@ -489,6 +585,7 @@ bool ImageManager::file_load_float_image(Image *img, device_vector<float4>& tex_ } } else if(components == 3) { + /* RGB */ for(int i = width*height*depth-1; i >= 0; i--) { pixels[i*4+3] = 1.0f; pixels[i*4+2] = pixels[i*3+2]; @@ -497,6 +594,7 @@ bool ImageManager::file_load_float_image(Image *img, device_vector<float4>& tex_ } } else if(components == 1) { + /* grayscale */ for(int i = width*height*depth-1; i >= 0; i--) { pixels[i*4+3] = 1.0f; pixels[i*4+2] = pixels[i]; @@ -557,7 +655,8 @@ void ImageManager::device_load_image(Device *device, DeviceScene *dscene, int sl string name; - if(slot >= 10) name = string_printf("__tex_image_float_0%d", slot); + if(slot >= 100) name = string_printf("__tex_image_float_%d", slot); + else if(slot >= 10) name = string_printf("__tex_image_float_0%d", slot); else name = string_printf("__tex_image_float_00%d", slot); if(!pack_images) { @@ -588,7 +687,8 @@ void ImageManager::device_load_image(Device *device, DeviceScene *dscene, int sl string name; - if(slot >= 10) name = string_printf("__tex_image_0%d", slot); + if(slot >= 100) name = string_printf("__tex_image_%d", slot); + else if(slot >= 10) name = string_printf("__tex_image_0%d", slot); else name = string_printf("__tex_image_00%d", slot); if(!pack_images) { @@ -744,6 +844,17 @@ void ImageManager::device_pack_images(Device *device, DeviceScene *dscene, Progr } } +void ImageManager::device_free_builtin(Device 
*device, DeviceScene *dscene) +{ + for(size_t slot = 0; slot < images.size(); slot++) + if(images[slot] && images[slot]->builtin_data) + device_free_image(device, dscene, slot + tex_image_byte_start); + + for(size_t slot = 0; slot < float_images.size(); slot++) + if(float_images[slot] && float_images[slot]->builtin_data) + device_free_image(device, dscene, slot); +} + void ImageManager::device_free(Device *device, DeviceScene *dscene) { for(size_t slot = 0; slot < images.size(); slot++) diff --git a/intern/cycles/render/image.h b/intern/cycles/render/image.h index 561550fe0d2..535f0ff156d 100644 --- a/intern/cycles/render/image.h +++ b/intern/cycles/render/image.h @@ -29,7 +29,7 @@ CCL_NAMESPACE_BEGIN /* generic */ -#define TEX_NUM_IMAGES 95 +#define TEX_NUM_IMAGES 94 #define TEX_IMAGE_BYTE_START TEX_NUM_FLOAT_IMAGES /* extended gpu */ @@ -55,13 +55,16 @@ public: ImageManager(); ~ImageManager(); - int add_image(const string& filename, void *builtin_data, bool animated, bool& is_float, bool& is_linear, InterpolationType interpolation, bool use_alpha); + int add_image(const string& filename, void *builtin_data, bool animated, float frame, + bool& is_float, bool& is_linear, InterpolationType interpolation, bool use_alpha); void remove_image(int slot); void remove_image(const string& filename, void *builtin_data, InterpolationType interpolation); + void tag_reload_image(const string& filename, void *builtin_data, InterpolationType interpolation); bool is_float_image(const string& filename, void *builtin_data, bool& is_linear); void device_update(Device *device, DeviceScene *dscene, Progress& progress); void device_free(Device *device, DeviceScene *dscene); + void device_free_builtin(Device *device, DeviceScene *dscene); void set_osl_texture_system(void *texture_system); void set_pack_images(bool pack_images_); @@ -81,6 +84,7 @@ public: bool use_alpha; bool need_load; bool animated; + float frame; InterpolationType interpolation; int users; diff --git 
a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp index 59a0de07e5a..03a8cd5d2d3 100644 --- a/intern/cycles/render/integrator.cpp +++ b/intern/cycles/render/integrator.cpp @@ -43,7 +43,8 @@ Integrator::Integrator() volume_max_steps = 1024; volume_step_size = 0.1f; - no_caustics = false; + caustics_reflective = true; + caustics_refractive = true; filter_glossy = 0.0f; seed = 0; layer_flag = ~0; @@ -86,22 +87,33 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene kintegrator->max_diffuse_bounce = max_diffuse_bounce + 1; kintegrator->max_glossy_bounce = max_glossy_bounce + 1; kintegrator->max_transmission_bounce = max_transmission_bounce + 1; - - if(kintegrator->use_volumes) - kintegrator->max_volume_bounce = max_volume_bounce + 1; - else - kintegrator->max_volume_bounce = 1; + kintegrator->max_volume_bounce = max_volume_bounce + 1; kintegrator->transparent_max_bounce = transparent_max_bounce + 1; kintegrator->transparent_min_bounce = transparent_min_bounce + 1; - kintegrator->transparent_shadows = transparent_shadows; + /* Transparent Shadows + * We only need to enable transparent shadows, if we actually have + * transparent shaders in the scene. Otherwise we can disable it + * to improve performance a bit. 
*/ + if(transparent_shadows) { + foreach(Shader *shader, scene->shaders) { + /* keep this in sync with SD_HAS_TRANSPARENT_SHADOW in shader.cpp */ + if((shader->has_surface_transparent && shader->use_transparent_shadow) || shader->has_volume) { + kintegrator->transparent_shadows = true; + break; + } + } + } + else { + kintegrator->transparent_shadows = false; + } - kintegrator->volume_homogeneous_sampling = volume_homogeneous_sampling; kintegrator->volume_max_steps = volume_max_steps; kintegrator->volume_step_size = volume_step_size; - kintegrator->no_caustics = no_caustics; + kintegrator->caustics_reflective = caustics_reflective; + kintegrator->caustics_refractive = caustics_refractive; kintegrator->filter_glossy = (filter_glossy == 0.0f)? FLT_MAX: 1.0f/filter_glossy; kintegrator->seed = hash_int(seed); @@ -121,8 +133,15 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene kintegrator->mesh_light_samples = mesh_light_samples; kintegrator->subsurface_samples = subsurface_samples; kintegrator->volume_samples = volume_samples; - kintegrator->sample_all_lights_direct = sample_all_lights_direct; - kintegrator->sample_all_lights_indirect = sample_all_lights_indirect; + + if(method == BRANCHED_PATH) { + kintegrator->sample_all_lights_direct = sample_all_lights_direct; + kintegrator->sample_all_lights_indirect = sample_all_lights_indirect; + } + else { + kintegrator->sample_all_lights_direct = false; + kintegrator->sample_all_lights_indirect = false; + } kintegrator->sampling_pattern = sampling_pattern; kintegrator->aa_samples = aa_samples; @@ -173,7 +192,8 @@ bool Integrator::modified(const Integrator& integrator) volume_homogeneous_sampling == integrator.volume_homogeneous_sampling && volume_max_steps == integrator.volume_max_steps && volume_step_size == integrator.volume_step_size && - no_caustics == integrator.no_caustics && + caustics_reflective == integrator.caustics_reflective && + caustics_refractive == integrator.caustics_refractive 
&& filter_glossy == integrator.filter_glossy && layer_flag == integrator.layer_flag && seed == integrator.seed && diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h index 380c1a65722..13c10e8ca94 100644 --- a/intern/cycles/render/integrator.h +++ b/intern/cycles/render/integrator.h @@ -43,7 +43,8 @@ public: int volume_max_steps; float volume_step_size; - bool no_caustics; + bool caustics_reflective; + bool caustics_refractive; float filter_glossy; int seed; diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp index 7bdb1fbf8af..1f006637e67 100644 --- a/intern/cycles/render/light.cpp +++ b/intern/cycles/render/light.cpp @@ -66,11 +66,12 @@ static void shade_background_pixels(Device *device, DeviceScene *dscene, int res main_task.shader_eval_type = SHADER_EVAL_BACKGROUND; main_task.shader_x = 0; main_task.shader_w = width*height; + main_task.num_samples = 1; main_task.get_cancel = function_bind(&Progress::get_cancel, &progress); /* disabled splitting for now, there's an issue with multi-GPU mem_copy_from */ list<DeviceTask> split_tasks; - main_task.split_max_size(split_tasks, 128*128); + main_task.split(split_tasks, 1, 128*128); foreach(DeviceTask& task, split_tasks) { device->task_add(task); @@ -120,6 +121,7 @@ Light::Light() use_diffuse = true; use_glossy = true; use_transmission = true; + use_scatter = true; shader = 0; samples = 1; @@ -205,8 +207,10 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen } /* skip motion blurred deforming meshes, not supported yet */ - if(mesh->has_motion_blur()) + if(mesh->has_motion_blur()) { + j++; continue; + } /* skip if we have no emission shaders */ foreach(uint sindex, mesh->used_shaders) { @@ -240,6 +244,10 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen shader_flag |= SHADER_EXCLUDE_TRANSMIT; use_light_visibility = true; } + if(!(object->visibility & PATH_RAY_VOLUME_SCATTER)) { + shader_flag |= 
SHADER_EXCLUDE_SCATTER; + use_light_visibility = true; + } for(size_t i = 0; i < mesh->triangles.size(); i++) { Shader *shader = scene->shaders[mesh->shader[i]]; @@ -497,6 +505,10 @@ void LightManager::device_update_points(Device *device, DeviceScene *dscene, Sce shader_id |= SHADER_EXCLUDE_TRANSMIT; use_light_visibility = true; } + if(!light->use_scatter) { + shader_id |= SHADER_EXCLUDE_SCATTER; + use_light_visibility = true; + } if(light->type == LIGHT_POINT) { shader_id &= ~SHADER_AREA_LIGHT; @@ -551,6 +563,10 @@ void LightManager::device_update_points(Device *device, DeviceScene *dscene, Sce shader_id |= SHADER_EXCLUDE_TRANSMIT; use_light_visibility = true; } + if(!(visibility & PATH_RAY_VOLUME_SCATTER)) { + shader_id |= SHADER_EXCLUDE_SCATTER; + use_light_visibility = true; + } light_data[i*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), 0.0f, 0.0f, 0.0f); light_data[i*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), 0.0f, 0.0f, 0.0f); diff --git a/intern/cycles/render/light.h b/intern/cycles/render/light.h index 82308cf3e88..89091bb5f9e 100644 --- a/intern/cycles/render/light.h +++ b/intern/cycles/render/light.h @@ -54,6 +54,7 @@ public: bool use_diffuse; bool use_glossy; bool use_transmission; + bool use_scatter; int shader; int samples; diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp index 9c5ddd55010..6137f7d4fdc 100644 --- a/intern/cycles/render/mesh.cpp +++ b/intern/cycles/render/mesh.cpp @@ -93,6 +93,8 @@ Mesh::Mesh() attributes.triangle_mesh = this; curve_attributes.curve_mesh = this; + + has_volume = false; } Mesh::~Mesh() @@ -132,6 +134,7 @@ void Mesh::clear() transform_applied = false; transform_negative_scaled = false; transform_normal = transform_identity(); + geometry_synced = false; } int Mesh::split_vertex(int vertex) @@ -377,14 +380,12 @@ void Mesh::add_vertex_normals() } } -void Mesh::pack_normals(Scene *scene, float4 *normal, float4 *vnormal) +void Mesh::pack_normals(Scene *scene, uint 
*tri_shader, float4 *vnormal) { - Attribute *attr_fN = attributes.find(ATTR_STD_FACE_NORMAL); Attribute *attr_vN = attributes.find(ATTR_STD_VERTEX_NORMAL); - float3 *fN = attr_fN->data_float3(); float3 *vN = attr_vN->data_float3(); - int shader_id = 0; + uint shader_id = 0; uint last_shader = -1; bool last_smooth = false; @@ -394,24 +395,15 @@ void Mesh::pack_normals(Scene *scene, float4 *normal, float4 *vnormal) bool do_transform = transform_applied; Transform ntfm = transform_normal; + /* save shader */ for(size_t i = 0; i < triangles_size; i++) { - float3 fNi = fN[i]; - - if(do_transform) - fNi = normalize(transform_direction(&ntfm, fNi)); - - normal[i].x = fNi.x; - normal[i].y = fNi.y; - normal[i].z = fNi.z; - - /* stuff shader id in here too */ if(shader_ptr[i] != last_shader || last_smooth != smooth[i]) { last_shader = shader_ptr[i]; last_smooth = smooth[i]; shader_id = scene->shader_manager->get_shader_id(last_shader, this, last_smooth); } - normal[i].w = __int_as_float(shader_id); + tri_shader[i] = shader_id; } size_t verts_size = verts.size(); @@ -756,7 +748,7 @@ void MeshManager::update_svm_attributes(Device *device, DeviceScene *dscene, Sce device->tex_alloc("__attributes_map", dscene->attributes_map); } -static void update_attribute_element_offset(Mesh *mesh, vector<float>& attr_float, vector<float4>& attr_float3, +static void update_attribute_element_offset(Mesh *mesh, vector<float>& attr_float, vector<float4>& attr_float3, vector<uchar4>& attr_uchar4, Attribute *mattr, TypeDesc& type, int& offset, AttributeElement& element) { if(mattr) { @@ -777,6 +769,15 @@ static void update_attribute_element_offset(Mesh *mesh, vector<float>& attr_floa VoxelAttribute *voxel_data = mattr->data_voxel(); offset = voxel_data->slot; } + else if(mattr->element == ATTR_ELEMENT_CORNER_BYTE) { + uchar4 *data = mattr->data_uchar4(); + offset = attr_uchar4.size(); + + attr_uchar4.resize(attr_uchar4.size() + size); + + for(size_t k = 0; k < size; k++) + attr_uchar4[offset+k] = 
data[k]; + } else if(mattr->type == TypeDesc::TypeFloat) { float *data = mattr->data_float(); offset = attr_float.size(); @@ -813,7 +814,7 @@ static void update_attribute_element_offset(Mesh *mesh, vector<float>& attr_floa offset -= mesh->vert_offset; else if(element == ATTR_ELEMENT_FACE) offset -= mesh->tri_offset; - else if(element == ATTR_ELEMENT_CORNER) + else if(element == ATTR_ELEMENT_CORNER || element == ATTR_ELEMENT_CORNER_BYTE) offset -= 3*mesh->tri_offset; else if(element == ATTR_ELEMENT_CURVE) offset -= mesh->curve_offset; @@ -854,6 +855,7 @@ void MeshManager::device_update_attributes(Device *device, DeviceScene *dscene, * maps next */ vector<float> attr_float; vector<float4> attr_float3; + vector<uchar4> attr_uchar4; for(size_t i = 0; i < scene->meshes.size(); i++) { Mesh *mesh = scene->meshes[i]; @@ -874,10 +876,10 @@ void MeshManager::device_update_attributes(Device *device, DeviceScene *dscene, memcpy(triangle_mattr->data_float3(), &mesh->verts[0], sizeof(float3)*mesh->verts.size()); } - update_attribute_element_offset(mesh, attr_float, attr_float3, triangle_mattr, + update_attribute_element_offset(mesh, attr_float, attr_float3, attr_uchar4, triangle_mattr, req.triangle_type, req.triangle_offset, req.triangle_element); - update_attribute_element_offset(mesh, attr_float, attr_float3, curve_mattr, + update_attribute_element_offset(mesh, attr_float, attr_float3, attr_uchar4, curve_mattr, req.curve_type, req.curve_offset, req.curve_element); if(progress.get_cancel()) return; @@ -903,6 +905,10 @@ void MeshManager::device_update_attributes(Device *device, DeviceScene *dscene, dscene->attributes_float3.copy(&attr_float3[0], attr_float3.size()); device->tex_alloc("__attributes_float3", dscene->attributes_float3); } + if(attr_uchar4.size()) { + dscene->attributes_uchar4.copy(&attr_uchar4[0], attr_uchar4.size()); + device->tex_alloc("__attributes_uchar4", dscene->attributes_uchar4); + } } void MeshManager::device_update_mesh(Device *device, DeviceScene 
*dscene, Scene *scene, Progress& progress) @@ -932,13 +938,13 @@ void MeshManager::device_update_mesh(Device *device, DeviceScene *dscene, Scene /* normals */ progress.set_status("Updating Mesh", "Computing normals"); - float4 *normal = dscene->tri_normal.resize(tri_size); + uint *tri_shader = dscene->tri_shader.resize(tri_size); float4 *vnormal = dscene->tri_vnormal.resize(vert_size); float4 *tri_verts = dscene->tri_verts.resize(vert_size); float4 *tri_vindex = dscene->tri_vindex.resize(tri_size); foreach(Mesh *mesh, scene->meshes) { - mesh->pack_normals(scene, &normal[mesh->tri_offset], &vnormal[mesh->vert_offset]); + mesh->pack_normals(scene, &tri_shader[mesh->tri_offset], &vnormal[mesh->vert_offset]); mesh->pack_verts(&tri_verts[mesh->vert_offset], &tri_vindex[mesh->tri_offset], mesh->vert_offset); if(progress.get_cancel()) return; @@ -947,7 +953,7 @@ void MeshManager::device_update_mesh(Device *device, DeviceScene *dscene, Scene /* vertex coordinates */ progress.set_status("Updating Mesh", "Copying Mesh to device"); - device->tex_alloc("__tri_normal", dscene->tri_normal); + device->tex_alloc("__tri_shader", dscene->tri_shader); device->tex_alloc("__tri_vnormal", dscene->tri_vnormal); device->tex_alloc("__tri_verts", dscene->tri_verts); device->tex_alloc("__tri_vindex", dscene->tri_vindex); @@ -1028,11 +1034,16 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen if(!need_update) return; - /* update normals */ + /* update normals and flags */ foreach(Mesh *mesh, scene->meshes) { - foreach(uint shader, mesh->used_shaders) + mesh->has_volume = false; + foreach(uint shader, mesh->used_shaders) { if(scene->shaders[shader]->need_update_attributes) mesh->need_update = true; + if(scene->shaders[shader]->has_volume) { + mesh->has_volume = true; + } + } if(mesh->need_update) { mesh->add_face_normals(); @@ -1100,6 +1111,8 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen bool motion_blur = false; #endif + 
/* update obejcts */ + vector<Object *> volume_objects; foreach(Object *object, scene->objects) object->compute_bounds(motion_blur); @@ -1119,7 +1132,7 @@ void MeshManager::device_free(Device *device, DeviceScene *dscene) device->tex_free(dscene->prim_visibility); device->tex_free(dscene->prim_index); device->tex_free(dscene->prim_object); - device->tex_free(dscene->tri_normal); + device->tex_free(dscene->tri_shader); device->tex_free(dscene->tri_vnormal); device->tex_free(dscene->tri_vindex); device->tex_free(dscene->tri_verts); @@ -1128,6 +1141,7 @@ void MeshManager::device_free(Device *device, DeviceScene *dscene) device->tex_free(dscene->attributes_map); device->tex_free(dscene->attributes_float); device->tex_free(dscene->attributes_float3); + device->tex_free(dscene->attributes_uchar4); dscene->bvh_nodes.clear(); dscene->object_node.clear(); @@ -1136,7 +1150,7 @@ void MeshManager::device_free(Device *device, DeviceScene *dscene) dscene->prim_visibility.clear(); dscene->prim_index.clear(); dscene->prim_object.clear(); - dscene->tri_normal.clear(); + dscene->tri_shader.clear(); dscene->tri_vnormal.clear(); dscene->tri_vindex.clear(); dscene->tri_verts.clear(); @@ -1145,6 +1159,7 @@ void MeshManager::device_free(Device *device, DeviceScene *dscene) dscene->attributes_map.clear(); dscene->attributes_float.clear(); dscene->attributes_float3.clear(); + dscene->attributes_uchar4.clear(); #ifdef WITH_OSL OSLGlobals *og = (OSLGlobals*)device->osl_memory(); diff --git a/intern/cycles/render/mesh.h b/intern/cycles/render/mesh.h index 247e3dd555e..7e34b761faf 100644 --- a/intern/cycles/render/mesh.h +++ b/intern/cycles/render/mesh.h @@ -71,11 +71,16 @@ public: ustring name; /* Mesh Data */ + bool geometry_synced; /* used to distinguish meshes with no verts + and meshed for which geometry is not created */ + vector<float3> verts; vector<Triangle> triangles; vector<uint> shader; vector<bool> smooth; + bool has_volume; /* Set in the device_update(). 
*/ + vector<float4> curve_keys; /* co + radius */ vector<Curve> curves; @@ -120,7 +125,7 @@ public: void add_face_normals(); void add_vertex_normals(); - void pack_normals(Scene *scene, float4 *normal, float4 *vnormal); + void pack_normals(Scene *scene, uint *shader, float4 *vnormal); void pack_verts(float4 *tri_verts, float4 *tri_vindex, size_t vert_offset); void pack_curves(Scene *scene, float4 *curve_key_co, float4 *curve_data, size_t curvekey_offset); void compute_bvh(SceneParams *params, Progress *progress, int n, int total); diff --git a/intern/cycles/render/mesh_displace.cpp b/intern/cycles/render/mesh_displace.cpp index 661fd9c66c1..4c0ee76299c 100644 --- a/intern/cycles/render/mesh_displace.cpp +++ b/intern/cycles/render/mesh_displace.cpp @@ -119,6 +119,7 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me task.shader_eval_type = SHADER_EVAL_DISPLACE; task.shader_x = 0; task.shader_w = d_output.size(); + task.num_samples = 1; task.get_cancel = function_bind(&Progress::get_cancel, &progress); device->task_add(task); diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp index a53e0b39435..e8476bfac4c 100644 --- a/intern/cycles/render/nodes.cpp +++ b/intern/cycles/render/nodes.cpp @@ -243,7 +243,9 @@ void ImageTextureNode::compile(SVMCompiler& compiler) image_manager = compiler.image_manager; if(is_float == -1) { bool is_float_bool; - slot = image_manager->add_image(filename, builtin_data, animated, is_float_bool, is_linear, interpolation, use_alpha); + slot = image_manager->add_image(filename, builtin_data, + animated, 0, is_float_bool, is_linear, + interpolation, use_alpha); is_float = (int)is_float_bool; } @@ -305,10 +307,32 @@ void ImageTextureNode::compile(OSLCompiler& compiler) tex_mapping.compile(compiler); - if(is_float == -1) - is_float = (int)image_manager->is_float_image(filename, NULL, is_linear); + image_manager = compiler.image_manager; + if(is_float == -1) { + if(builtin_data == NULL) { + 
is_float = (int)image_manager->is_float_image(filename, NULL, is_linear); + } + else { + bool is_float_bool; + slot = image_manager->add_image(filename, builtin_data, + animated, 0, is_float_bool, is_linear, + interpolation, use_alpha); + is_float = (int)is_float_bool; + } + } - compiler.parameter("filename", filename.c_str()); + if(slot == -1) { + compiler.parameter("filename", filename.c_str()); + } + else { + /* TODO(sergey): It's not so simple to pass custom attribute + * to the texture() function in order to make builtin images + * support more clear. So we use special file name which is + * "@<slot_number>" and check whether file name matches this + * mask in the OSLRenderServices::texture(). + */ + compiler.parameter("filename", string_printf("@%d", slot).c_str()); + } if(is_linear || color_space != "Color") compiler.parameter("color_space", "Linear"); else @@ -408,7 +432,9 @@ void EnvironmentTextureNode::compile(SVMCompiler& compiler) image_manager = compiler.image_manager; if(slot == -1) { bool is_float_bool; - slot = image_manager->add_image(filename, builtin_data, animated, is_float_bool, is_linear, INTERPOLATION_LINEAR, use_alpha); + slot = image_manager->add_image(filename, builtin_data, + animated, 0, is_float_bool, is_linear, + INTERPOLATION_LINEAR, use_alpha); is_float = (int)is_float_bool; } @@ -459,10 +485,29 @@ void EnvironmentTextureNode::compile(OSLCompiler& compiler) tex_mapping.compile(compiler); - if(is_float == -1) - is_float = (int)image_manager->is_float_image(filename, NULL, is_linear); + /* See comments in ImageTextureNode::compile about support + * of builtin images. 
+ */ + image_manager = compiler.image_manager; + if(is_float == -1) { + if(builtin_data == NULL) { + is_float = (int)image_manager->is_float_image(filename, NULL, is_linear); + } + else { + bool is_float_bool; + slot = image_manager->add_image(filename, builtin_data, + animated, 0, is_float_bool, is_linear, + INTERPOLATION_LINEAR, use_alpha); + is_float = (int)is_float_bool; + } + } - compiler.parameter("filename", filename.c_str()); + if(slot == -1) { + compiler.parameter("filename", filename.c_str()); + } + else { + compiler.parameter("filename", string_printf("@%d", slot).c_str()); + } compiler.parameter("projection", projection); if(is_linear || color_space != "Color") compiler.parameter("color_space", "Linear"); @@ -1543,11 +1588,24 @@ void BsdfNode::compile(OSLCompiler& compiler) assert(0); } -/* Ward BSDF Closure */ +/* Anisotropic BSDF Closure */ -WardBsdfNode::WardBsdfNode() +static ShaderEnum aniso_distribution_init() { - closure = CLOSURE_BSDF_WARD_ID; + ShaderEnum enm; + + enm.insert("Beckmann", CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID); + enm.insert("GGX", CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID); + enm.insert("Ashikhmin-Shirley", CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID); + + return enm; +} + +ShaderEnum AnisotropicBsdfNode::distribution_enum = aniso_distribution_init(); + +AnisotropicBsdfNode::AnisotropicBsdfNode() +{ + distribution = ustring("GGX"); add_input("Tangent", SHADER_SOCKET_VECTOR, ShaderInput::TANGENT); @@ -1556,7 +1614,7 @@ WardBsdfNode::WardBsdfNode() add_input("Rotation", SHADER_SOCKET_FLOAT, 0.0f); } -void WardBsdfNode::attributes(Shader *shader, AttributeRequestSet *attributes) +void AnisotropicBsdfNode::attributes(Shader *shader, AttributeRequestSet *attributes) { if(shader->has_surface) { ShaderInput *tangent_in = input("Tangent"); @@ -1568,14 +1626,17 @@ void WardBsdfNode::attributes(Shader *shader, AttributeRequestSet *attributes) ShaderNode::attributes(shader, attributes); } -void WardBsdfNode::compile(SVMCompiler& compiler) +void 
AnisotropicBsdfNode::compile(SVMCompiler& compiler) { + closure = (ClosureType)distribution_enum[distribution]; + BsdfNode::compile(compiler, input("Roughness"), input("Anisotropy"), input("Rotation")); } -void WardBsdfNode::compile(OSLCompiler& compiler) +void AnisotropicBsdfNode::compile(OSLCompiler& compiler) { - compiler.add(this, "node_ward_bsdf"); + compiler.parameter("distribution", distribution); + compiler.add(this, "node_anisotropic_bsdf"); } /* Glossy BSDF Closure */ @@ -1587,6 +1648,7 @@ static ShaderEnum glossy_distribution_init() enm.insert("Sharp", CLOSURE_BSDF_REFLECTION_ID); enm.insert("Beckmann", CLOSURE_BSDF_MICROFACET_BECKMANN_ID); enm.insert("GGX", CLOSURE_BSDF_MICROFACET_GGX_ID); + enm.insert("Ashikhmin-Shirley", CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID); return enm; } @@ -1595,7 +1657,7 @@ ShaderEnum GlossyBsdfNode::distribution_enum = glossy_distribution_init(); GlossyBsdfNode::GlossyBsdfNode() { - distribution = ustring("Beckmann"); + distribution = ustring("GGX"); add_input("Roughness", SHADER_SOCKET_FLOAT, 0.2f); } @@ -1850,8 +1912,6 @@ bool SubsurfaceScatteringNode::has_bssrdf_bump() EmissionNode::EmissionNode() : ShaderNode("emission") { - total_power = false; - add_input("Color", SHADER_SOCKET_COLOR, make_float3(0.8f, 0.8f, 0.8f)); add_input("Strength", SHADER_SOCKET_FLOAT, 10.0f); add_input("SurfaceMixWeight", SHADER_SOCKET_FLOAT, 0.0f, ShaderInput::USE_SVM); @@ -1867,10 +1927,8 @@ void EmissionNode::compile(SVMCompiler& compiler) if(color_in->link || strength_in->link) { compiler.stack_assign(color_in); compiler.stack_assign(strength_in); - compiler.add_node(NODE_EMISSION_WEIGHT, color_in->stack_offset, strength_in->stack_offset, total_power? 
1: 0); + compiler.add_node(NODE_EMISSION_WEIGHT, color_in->stack_offset, strength_in->stack_offset); } - else if(total_power) - compiler.add_node(NODE_EMISSION_SET_WEIGHT_TOTAL, color_in->value * strength_in->value.x); else compiler.add_node(NODE_CLOSURE_SET_WEIGHT, color_in->value * strength_in->value.x); @@ -1879,7 +1937,6 @@ void EmissionNode::compile(SVMCompiler& compiler) void EmissionNode::compile(OSLCompiler& compiler) { - compiler.parameter("TotalPower", (total_power)? 1: 0); compiler.add(this, "node_emission"); } @@ -3007,13 +3064,13 @@ void CombineRGBNode::compile(SVMCompiler& compiler) compiler.stack_assign(color_out); compiler.stack_assign(red_in); - compiler.add_node(NODE_COMBINE_RGB, red_in->stack_offset, 0, color_out->stack_offset); + compiler.add_node(NODE_COMBINE_VECTOR, red_in->stack_offset, 0, color_out->stack_offset); compiler.stack_assign(green_in); - compiler.add_node(NODE_COMBINE_RGB, green_in->stack_offset, 1, color_out->stack_offset); + compiler.add_node(NODE_COMBINE_VECTOR, green_in->stack_offset, 1, color_out->stack_offset); compiler.stack_assign(blue_in); - compiler.add_node(NODE_COMBINE_RGB, blue_in->stack_offset, 2, color_out->stack_offset); + compiler.add_node(NODE_COMBINE_VECTOR, blue_in->stack_offset, 2, color_out->stack_offset); } void CombineRGBNode::compile(OSLCompiler& compiler) @@ -3021,6 +3078,40 @@ void CombineRGBNode::compile(OSLCompiler& compiler) compiler.add(this, "node_combine_rgb"); } +/* Combine XYZ */ +CombineXYZNode::CombineXYZNode() +: ShaderNode("combine_xyz") +{ + add_input("X", SHADER_SOCKET_FLOAT); + add_input("Y", SHADER_SOCKET_FLOAT); + add_input("Z", SHADER_SOCKET_FLOAT); + add_output("Vector", SHADER_SOCKET_VECTOR); +} + +void CombineXYZNode::compile(SVMCompiler& compiler) +{ + ShaderInput *x_in = input("X"); + ShaderInput *y_in = input("Y"); + ShaderInput *z_in = input("Z"); + ShaderOutput *vector_out = output("Vector"); + + compiler.stack_assign(vector_out); + + compiler.stack_assign(x_in); + 
compiler.add_node(NODE_COMBINE_VECTOR, x_in->stack_offset, 0, vector_out->stack_offset); + + compiler.stack_assign(y_in); + compiler.add_node(NODE_COMBINE_VECTOR, y_in->stack_offset, 1, vector_out->stack_offset); + + compiler.stack_assign(z_in); + compiler.add_node(NODE_COMBINE_VECTOR, z_in->stack_offset, 2, vector_out->stack_offset); +} + +void CombineXYZNode::compile(OSLCompiler& compiler) +{ + compiler.add(this, "node_combine_xyz"); +} + /* Combine HSV */ CombineHSVNode::CombineHSVNode() : ShaderNode("combine_hsv") @@ -3131,13 +3222,13 @@ void SeparateRGBNode::compile(SVMCompiler& compiler) compiler.stack_assign(color_in); compiler.stack_assign(red_out); - compiler.add_node(NODE_SEPARATE_RGB, color_in->stack_offset, 0, red_out->stack_offset); + compiler.add_node(NODE_SEPARATE_VECTOR, color_in->stack_offset, 0, red_out->stack_offset); compiler.stack_assign(green_out); - compiler.add_node(NODE_SEPARATE_RGB, color_in->stack_offset, 1, green_out->stack_offset); + compiler.add_node(NODE_SEPARATE_VECTOR, color_in->stack_offset, 1, green_out->stack_offset); compiler.stack_assign(blue_out); - compiler.add_node(NODE_SEPARATE_RGB, color_in->stack_offset, 2, blue_out->stack_offset); + compiler.add_node(NODE_SEPARATE_VECTOR, color_in->stack_offset, 2, blue_out->stack_offset); } void SeparateRGBNode::compile(OSLCompiler& compiler) @@ -3145,6 +3236,40 @@ void SeparateRGBNode::compile(OSLCompiler& compiler) compiler.add(this, "node_separate_rgb"); } +/* Separate XYZ */ +SeparateXYZNode::SeparateXYZNode() +: ShaderNode("separate_xyz") +{ + add_input("Vector", SHADER_SOCKET_VECTOR); + add_output("X", SHADER_SOCKET_FLOAT); + add_output("Y", SHADER_SOCKET_FLOAT); + add_output("Z", SHADER_SOCKET_FLOAT); +} + +void SeparateXYZNode::compile(SVMCompiler& compiler) +{ + ShaderInput *vector_in = input("Vector"); + ShaderOutput *x_out = output("X"); + ShaderOutput *y_out = output("Y"); + ShaderOutput *z_out = output("Z"); + + compiler.stack_assign(vector_in); + + 
compiler.stack_assign(x_out); + compiler.add_node(NODE_SEPARATE_VECTOR, vector_in->stack_offset, 0, x_out->stack_offset); + + compiler.stack_assign(y_out); + compiler.add_node(NODE_SEPARATE_VECTOR, vector_in->stack_offset, 1, y_out->stack_offset); + + compiler.stack_assign(z_out); + compiler.add_node(NODE_SEPARATE_VECTOR, vector_in->stack_offset, 2, z_out->stack_offset); +} + +void SeparateXYZNode::compile(OSLCompiler& compiler) +{ + compiler.add(this, "node_separate_xyz"); +} + /* Separate HSV */ SeparateHSVNode::SeparateHSVNode() : ShaderNode("separate_hsv") @@ -4126,4 +4251,3 @@ void TangentNode::compile(OSLCompiler& compiler) } CCL_NAMESPACE_END - diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h index d94d8ce6033..31b6f4e50c4 100644 --- a/intern/cycles/render/nodes.h +++ b/intern/cycles/render/nodes.h @@ -218,9 +218,13 @@ public: bool scattering; }; -class WardBsdfNode : public BsdfNode { +class AnisotropicBsdfNode : public BsdfNode { public: - SHADER_NODE_CLASS(WardBsdfNode) + SHADER_NODE_CLASS(AnisotropicBsdfNode) + + ustring distribution; + static ShaderEnum distribution_enum; + void attributes(Shader *shader, AttributeRequestSet *attributes); }; @@ -294,8 +298,6 @@ public: bool has_surface_emission() { return true; } bool has_spatial_varying() { return true; } - - bool total_power; }; class BackgroundNode : public ShaderNode { @@ -453,6 +455,11 @@ public: SHADER_NODE_CLASS(CombineHSVNode) }; +class CombineXYZNode : public ShaderNode { +public: + SHADER_NODE_CLASS(CombineXYZNode) +}; + class GammaNode : public ShaderNode { public: SHADER_NODE_CLASS(GammaNode) @@ -473,6 +480,11 @@ public: SHADER_NODE_CLASS(SeparateHSVNode) }; +class SeparateXYZNode : public ShaderNode { +public: + SHADER_NODE_CLASS(SeparateXYZNode) +}; + class HSVNode : public ShaderNode { public: SHADER_NODE_CLASS(HSVNode) diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp index 027bfd71931..46ddab235d9 100644 --- 
a/intern/cycles/render/object.cpp +++ b/intern/cycles/render/object.cpp @@ -75,8 +75,14 @@ void Object::compute_bounds(bool motion_blur) bounds.grow(mbounds.transformed(&ttfm)); } } - else - bounds = mbounds.transformed(&tfm); + else { + if(mesh->transform_applied) { + bounds = mbounds; + } + else { + bounds = mbounds.transformed(&tfm); + } + } } void Object::apply_transform(bool apply_to_motion) @@ -372,8 +378,6 @@ void ObjectManager::device_update(Device *device, DeviceScene *dscene, Scene *sc device_free(device, dscene); - need_update = false; - if(scene->objects.size() == 0) return; @@ -392,6 +396,46 @@ void ObjectManager::device_update(Device *device, DeviceScene *dscene, Scene *sc progress.set_status("Updating Objects", "Applying Static Transformations"); apply_static_transforms(dscene, scene, object_flag, progress); } +} + +void ObjectManager::device_update_flags(Device *device, DeviceScene *dscene, + Scene *scene, Progress& progress) +{ + if(!need_update) + return; + + need_update = false; + + if(scene->objects.size() == 0) + return; + + /* object info flag */ + uint *object_flag = dscene->object_flag.get_data(); + + vector<Object *> volume_objects; + foreach(Object *object, scene->objects) { + if(object->mesh->has_volume) { + volume_objects.push_back(object); + } + } + + int object_index = 0; + foreach(Object *object, scene->objects) { + if(object->mesh->has_volume) { + object_flag[object_index] |= SD_OBJECT_HAS_VOLUME; + } + + foreach(Object *volume_object, volume_objects) { + if(object == volume_object) { + continue; + } + if(object->bounds.intersects(volume_object->bounds)) { + object_flag[object_index] |= SD_OBJECT_INTERSECTS_VOLUME; + break; + } + } + ++object_index; + } /* allocate object flag */ device->tex_alloc("__object_flag", dscene->object_flag); @@ -449,6 +493,8 @@ void ObjectManager::apply_static_transforms(DeviceScene *dscene, Scene *scene, u } object_flag[i] |= SD_TRANSFORM_APPLIED; + if(object->mesh->transform_negative_scaled) + 
object_flag[i] |= SD_NEGATIVE_SCALE_APPLIED; } else have_instancing = true; diff --git a/intern/cycles/render/object.h b/intern/cycles/render/object.h index 677526b715f..2c69b83a2e9 100644 --- a/intern/cycles/render/object.h +++ b/intern/cycles/render/object.h @@ -76,6 +76,7 @@ public: void device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress); void device_update_transforms(Device *device, DeviceScene *dscene, Scene *scene, uint *object_flag, Progress& progress); + void device_update_flags(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress); void device_free(Device *device, DeviceScene *dscene); void tag_update(Scene *scene); diff --git a/intern/cycles/render/osl.cpp b/intern/cycles/render/osl.cpp index 94866102f60..f57e16471a1 100644 --- a/intern/cycles/render/osl.cpp +++ b/intern/cycles/render/osl.cpp @@ -248,20 +248,27 @@ void OSLShaderManager::shading_system_free() bool OSLShaderManager::osl_compile(const string& inputfile, const string& outputfile) { - vector<string> options; +#if OSL_LIBRARY_VERSION_CODE < 10500 + typedef string string_view; +#endif + + vector<string_view> options; string stdosl_path; + string shader_path = path_get("shader"); /* specify output file name */ options.push_back("-o"); options.push_back(outputfile); /* specify standard include path */ - options.push_back("-I" + path_get("shader")); + options.push_back("-I"); + options.push_back(shader_path); + stdosl_path = path_get("shader/stdosl.h"); /* compile */ OSL::OSLCompiler *compiler = OSL::OSLCompiler::create(); - bool ok = compiler->compile(inputfile, options, stdosl_path); + bool ok = compiler->compile(string_view(inputfile), options, string_view(stdosl_path)); delete compiler; return ok; diff --git a/intern/cycles/render/scene.cpp b/intern/cycles/render/scene.cpp index 4f5ad439520..3662c29587e 100644 --- a/intern/cycles/render/scene.cpp +++ b/intern/cycles/render/scene.cpp @@ -61,7 +61,7 @@ Scene::Scene(const SceneParams& params_, 
const DeviceInfo& device_info_) if(device_info_.type == DEVICE_CPU) shader_manager = ShaderManager::create(this, params.shadingsystem); else - shader_manager = ShaderManager::create(this, SceneParams::SVM); + shader_manager = ShaderManager::create(this, SHADINGSYSTEM_SVM); /* Extended image limits for CPU and GPUs */ image_manager->set_extended_image_limits(device_info_); @@ -109,6 +109,8 @@ void Scene::free_memory(bool final) if(!params.persistent_data || final) image_manager->device_free(device, &dscene); + else + image_manager->device_free_builtin(device, &dscene); lookup_tables->device_free(device, &dscene); } @@ -139,7 +141,7 @@ void Scene::device_update(Device *device_, Progress& progress) * the different managers, using data computed by previous managers. * * - Image manager uploads images used by shaders. - * - Camera may be used for adapative subdivison. + * - Camera may be used for adaptive subdivision. * - Displacement shader must have all shader data available. * - Light manager needs lookup tables and final mesh data to compute emission CDF. 
* - Film needs light manager to run for use_light_visibility @@ -163,13 +165,18 @@ void Scene::device_update(Device *device_, Progress& progress) if(progress.get_cancel()) return; - progress.set_status("Updating Camera"); - camera->device_update(device, &dscene, this); + progress.set_status("Updating Objects"); + object_manager->device_update(device, &dscene, this, progress); if(progress.get_cancel()) return; - progress.set_status("Updating Objects"); - object_manager->device_update(device, &dscene, this, progress); + progress.set_status("Updating Meshes"); + mesh_manager->device_update(device, &dscene, this, progress); + + if(progress.get_cancel()) return; + + progress.set_status("Updating Objects Flags"); + object_manager->device_update_flags(device, &dscene, this, progress); if(progress.get_cancel()) return; @@ -183,8 +190,9 @@ void Scene::device_update(Device *device_, Progress& progress) if(progress.get_cancel()) return; - progress.set_status("Updating Meshes"); - mesh_manager->device_update(device, &dscene, this, progress); + /* TODO(sergey): Make sure camera is not needed above. 
*/ + progress.set_status("Updating Camera"); + camera->device_update(device, &dscene, this); if(progress.get_cancel()) return; @@ -269,7 +277,8 @@ bool Scene::need_reset() || shader_manager->need_update || particle_system_manager->need_update || curve_system_manager->need_update - || bake_manager->need_update); + || bake_manager->need_update + || film->need_update); } void Scene::reset() @@ -282,6 +291,11 @@ void Scene::reset() film->tag_update(this); background->tag_update(this); integrator->tag_update(this); + object_manager->tag_update(this); + mesh_manager->tag_update(this); + light_manager->tag_update(this); + particle_system_manager->tag_update(this); + curve_system_manager->tag_update(this); } void Scene::device_free() diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h index 0f0bb725823..5d205225d97 100644 --- a/intern/cycles/render/scene.h +++ b/intern/cycles/render/scene.h @@ -18,6 +18,7 @@ #define __SCENE_H__ #include "image.h" +#include "shader.h" #include "device_memory.h" @@ -68,7 +69,7 @@ public: device_vector<uint> prim_object; /* mesh */ - device_vector<float4> tri_normal; + device_vector<uint> tri_shader; device_vector<float4> tri_vnormal; device_vector<float4> tri_vindex; device_vector<float4> tri_verts; @@ -84,6 +85,7 @@ public: device_vector<uint4> attributes_map; device_vector<float> attributes_float; device_vector<float4> attributes_float3; + device_vector<uchar4> attributes_uchar4; /* lights */ device_vector<float4> light_distribution; @@ -120,7 +122,7 @@ public: class SceneParams { public: - enum { OSL, SVM } shadingsystem; + ShadingSystem shadingsystem; enum BVHType { BVH_DYNAMIC, BVH_STATIC } bvh_type; bool use_bvh_cache; bool use_bvh_spatial_split; @@ -129,7 +131,7 @@ public: SceneParams() { - shadingsystem = SVM; + shadingsystem = SHADINGSYSTEM_SVM; bvh_type = BVH_DYNAMIC; use_bvh_cache = false; use_bvh_spatial_split = false; diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp index 
28b44df6b36..9fcd9fa85f5 100644 --- a/intern/cycles/render/session.cpp +++ b/intern/cycles/render/session.cpp @@ -592,9 +592,10 @@ void Session::run_cpu() update_progressive_refine(true); } -void Session::run() +void Session::load_kernels() { - /* load kernels */ + thread_scoped_lock scene_lock(scene->mutex); + if(!kernels_loaded) { progress.set_status("Loading render kernels (may take a few minutes the first time)"); @@ -603,6 +604,7 @@ void Session::run() if(message.empty()) message = "Failed loading render kernel, see console for errors"; + progress.set_cancel(message); progress.set_status("Error", message); progress.set_update(); return; @@ -610,6 +612,12 @@ void Session::run() kernels_loaded = true; } +} + +void Session::run() +{ + /* load kernels */ + load_kernels(); /* session thread loop */ progress.set_status("Waiting for render to start"); diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h index 1e625158652..9da7a0aafa3 100644 --- a/intern/cycles/render/session.h +++ b/intern/cycles/render/session.h @@ -19,6 +19,7 @@ #include "buffers.h" #include "device.h" +#include "shader.h" #include "tile.h" #include "util_progress.h" @@ -59,7 +60,7 @@ public: double reset_timeout; double text_timeout; - enum { OSL, SVM } shadingsystem; + ShadingSystem shadingsystem; SessionParams() { @@ -80,7 +81,7 @@ public: reset_timeout = 0.1; text_timeout = 1.0; - shadingsystem = SVM; + shadingsystem = SHADINGSYSTEM_SVM; tile_order = TILE_CENTER; } @@ -137,7 +138,10 @@ public: void set_pause(bool pause); void update_scene(); + void load_kernels(); + void device_free(); + protected: struct DelayedReset { thread_mutex mutex; diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp index b25673b36c3..2a3969b6188 100644 --- a/intern/cycles/render/shader.cpp +++ b/intern/cycles/render/shader.cpp @@ -31,6 +31,100 @@ CCL_NAMESPACE_BEGIN +/* Beckmann sampling precomputed table, see bsdf_microfacet.h */ + +/* 2D slope distribution (alpha = 
1.0) */ +static float beckmann_table_P22(const float slope_x, const float slope_y) +{ + return expf(-(slope_x*slope_x + slope_y*slope_y)); +} + +/* maximal slope amplitude (range that contains 99.99% of the distribution) */ +static float beckmann_table_slope_max() +{ + return 6.0; +} + +/* Paper used: Importance Sampling Microfacet-Based BSDFs with the + * Distribution of Visible Normals. Supplemental Material 2/2. + * + * http://hal.inria.fr/docs/01/00/66/20/ANNEX/supplemental2.pdf + */ +static void beckmann_table_rows(float *table, int row_from, int row_to) +{ + /* allocate temporary data */ + const int DATA_TMP_SIZE = 512; + vector<double> slope_x(DATA_TMP_SIZE); + vector<double> CDF_P22_omega_i(DATA_TMP_SIZE); + + /* loop over incident directions */ + for(int index_theta = row_from; index_theta < row_to; index_theta++) { + /* incident vector */ + const float cos_theta = index_theta / (BECKMANN_TABLE_SIZE - 1.0f); + const float sin_theta = safe_sqrtf(1.0f - cos_theta*cos_theta); + + /* for a given incident vector + * integrate P22_{omega_i}(x_slope, 1, 1), Eq. (10) */ + slope_x[0] = -beckmann_table_slope_max(); + CDF_P22_omega_i[0] = 0; + + for(int index_slope_x = 1; index_slope_x < DATA_TMP_SIZE; ++index_slope_x) { + /* slope_x */ + slope_x[index_slope_x] = -beckmann_table_slope_max() + 2.0f * beckmann_table_slope_max() * index_slope_x/(DATA_TMP_SIZE - 1.0f); + + /* dot product with incident vector */ + float dot_product = fmaxf(0.0f, -(float)slope_x[index_slope_x]*sin_theta + cos_theta); + /* marginalize P22_{omega_i}(x_slope, 1, 1), Eq. (10) */ + float P22_omega_i = 0.0f; + + for(int j = 0; j < 100; ++j) { + float slope_y = -beckmann_table_slope_max() + 2.0f * beckmann_table_slope_max() * j * (1.0f/99.0f); + P22_omega_i += dot_product * beckmann_table_P22((float)slope_x[index_slope_x], slope_y); + } + + /* CDF of P22_{omega_i}(x_slope, 1, 1), Eq. 
(10) */ + CDF_P22_omega_i[index_slope_x] = CDF_P22_omega_i[index_slope_x - 1] + (double)P22_omega_i; + } + + /* renormalize CDF_P22_omega_i */ + for(int index_slope_x = 1; index_slope_x < DATA_TMP_SIZE; ++index_slope_x) + CDF_P22_omega_i[index_slope_x] /= CDF_P22_omega_i[DATA_TMP_SIZE - 1]; + + /* loop over random number U1 */ + int index_slope_x = 0; + + for(int index_U = 0; index_U < BECKMANN_TABLE_SIZE; ++index_U) { + const double U = 0.0000001 + 0.9999998 * index_U / (double)(BECKMANN_TABLE_SIZE - 1); + + /* inverse CDF_P22_omega_i, solve Eq.(11) */ + while(CDF_P22_omega_i[index_slope_x] <= U) + ++index_slope_x; + + const double interp = + (CDF_P22_omega_i[index_slope_x] - U) / + (CDF_P22_omega_i[index_slope_x] - CDF_P22_omega_i[index_slope_x - 1]); + + /* store value */ + table[index_U + index_theta*BECKMANN_TABLE_SIZE] = (float)( + interp * slope_x[index_slope_x - 1] + + (1.0 - interp) * slope_x[index_slope_x]); + } + } +} + +static void beckmann_table_build(vector<float>& table) +{ + table.resize(BECKMANN_TABLE_SIZE*BECKMANN_TABLE_SIZE); + + /* multithreaded build */ + TaskPool pool; + + for(int i = 0; i < BECKMANN_TABLE_SIZE; i+=8) + pool.push(function_bind(&beckmann_table_rows, &table[0], i, i+8)); + + pool.wait_work(); +} + /* Shader */ Shader::Shader() @@ -44,6 +138,8 @@ Shader::Shader() use_mis = true; use_transparent_shadow = true; heterogeneous_volume = true; + volume_sampling_method = VOLUME_SAMPLING_DISTANCE; + volume_interpolation_method = VOLUME_INTERPOLATION_LINEAR; has_surface = false; has_surface_transparent = false; @@ -137,6 +233,7 @@ ShaderManager::ShaderManager() { need_update = true; blackbody_table_offset = TABLE_OFFSET_INVALID; + beckmann_table_offset = TABLE_OFFSET_INVALID; } ShaderManager::~ShaderManager() @@ -148,7 +245,7 @@ ShaderManager *ShaderManager::create(Scene *scene, int shadingsystem) ShaderManager *manager; #ifdef WITH_OSL - if(shadingsystem == SceneParams::OSL) + if(shadingsystem == SHADINGSYSTEM_OSL) manager = new 
OSLShaderManager(); else #endif @@ -256,6 +353,12 @@ void ShaderManager::device_update_common(Device *device, DeviceScene *dscene, Sc flag |= SD_HAS_BSSRDF_BUMP; if(shader->has_converter_blackbody) has_converter_blackbody = true; + if(shader->volume_sampling_method == VOLUME_SAMPLING_EQUIANGULAR) + flag |= SD_VOLUME_EQUIANGULAR; + if(shader->volume_sampling_method == VOLUME_SAMPLING_MULTIPLE_IMPORTANCE) + flag |= SD_VOLUME_MIS; + if(shader->volume_interpolation_method == VOLUME_INTERPOLATION_CUBIC) + flag |= SD_VOLUME_CUBIC; /* regular shader */ shader_flag[i++] = flag; @@ -272,20 +375,29 @@ void ShaderManager::device_update_common(Device *device, DeviceScene *dscene, Sc device->tex_alloc("__shader_flag", dscene->shader_flag); /* blackbody lookup table */ - KernelBlackbody *kblackbody = &dscene->data.blackbody; + KernelTables *ktables = &dscene->data.tables; if(has_converter_blackbody && blackbody_table_offset == TABLE_OFFSET_INVALID) { vector<float> table = blackbody_table(); blackbody_table_offset = scene->lookup_tables->add_table(dscene, table); - kblackbody->table_offset = (int)blackbody_table_offset; + ktables->blackbody_offset = (int)blackbody_table_offset; } else if(!has_converter_blackbody && blackbody_table_offset != TABLE_OFFSET_INVALID) { scene->lookup_tables->remove_table(blackbody_table_offset); blackbody_table_offset = TABLE_OFFSET_INVALID; } - /* volumes */ + /* beckmann lookup table */ + if(beckmann_table_offset == TABLE_OFFSET_INVALID) { + vector<float> table; + beckmann_table_build(table); + beckmann_table_offset = scene->lookup_tables->add_table(dscene, table); + + ktables->beckmann_offset = (int)beckmann_table_offset; + } + + /* integrator */ KernelIntegrator *kintegrator = &dscene->data.integrator; kintegrator->use_volumes = has_volumes; } @@ -297,6 +409,11 @@ void ShaderManager::device_free_common(Device *device, DeviceScene *dscene, Scen blackbody_table_offset = TABLE_OFFSET_INVALID; } + if(beckmann_table_offset != TABLE_OFFSET_INVALID) { + 
scene->lookup_tables->remove_table(beckmann_table_offset); + beckmann_table_offset = TABLE_OFFSET_INVALID; + } + device->tex_free(dscene->shader_flag); dscene->shader_flag.clear(); } diff --git a/intern/cycles/render/shader.h b/intern/cycles/render/shader.h index 874e8face7a..b267731abe5 100644 --- a/intern/cycles/render/shader.h +++ b/intern/cycles/render/shader.h @@ -17,6 +17,10 @@ #ifndef __SHADER_H__ #define __SHADER_H__ +#ifdef WITH_OSL +# include <OSL/oslexec.h> +#endif + #include "attribute.h" #include "kernel_types.h" @@ -25,10 +29,6 @@ #include "util_string.h" #include "util_types.h" -#ifdef WITH_OSL -#include <OSL/oslexec.h> -#endif - CCL_NAMESPACE_BEGIN class Device; @@ -39,6 +39,23 @@ class Scene; class ShaderGraph; struct float3; +enum ShadingSystem { + SHADINGSYSTEM_OSL, + SHADINGSYSTEM_SVM +}; + +/* Keep those in sync with the python-defined enum. */ +enum VolumeSampling { + VOLUME_SAMPLING_DISTANCE = 0, + VOLUME_SAMPLING_EQUIANGULAR = 1, + VOLUME_SAMPLING_MULTIPLE_IMPORTANCE = 2, +}; + +enum VolumeInterpolation { + VOLUME_INTERPOLATION_LINEAR = 0, + VOLUME_INTERPOLATION_CUBIC = 1, +}; + /* Shader describing the appearance of a Mesh, Light or Background. 
* * While there is only a single shader graph, it has three outputs: surface, @@ -63,6 +80,8 @@ public: bool use_mis; bool use_transparent_shadow; bool heterogeneous_volume; + VolumeSampling volume_sampling_method; + int volume_interpolation_method; /* synchronization */ bool need_update; @@ -143,6 +162,7 @@ protected: AttributeIDMap unique_attribute_id; size_t blackbody_table_offset; + size_t beckmann_table_offset; }; CCL_NAMESPACE_END diff --git a/intern/cycles/render/svm.cpp b/intern/cycles/render/svm.cpp index 576c176759c..13c63d9420c 100644 --- a/intern/cycles/render/svm.cpp +++ b/intern/cycles/render/svm.cpp @@ -363,14 +363,17 @@ bool SVMCompiler::node_skip_input(ShaderNode *node, ShaderInput *input) return false; } -void SVMCompiler::find_dependencies(set<ShaderNode*>& dependencies, const set<ShaderNode*>& done, ShaderInput *input) +void SVMCompiler::find_dependencies(set<ShaderNode*>& dependencies, + const set<ShaderNode*>& done, + ShaderInput *input, + ShaderNode *skip_node) { ShaderNode *node = (input->link)? 
input->link->parent: NULL; - if(node && done.find(node) == done.end()) { + if(node && done.find(node) == done.end() && node != skip_node) { foreach(ShaderInput *in, node->inputs) if(!node_skip_input(node, in)) - find_dependencies(dependencies, done, in); + find_dependencies(dependencies, done, in, skip_node); dependencies.insert(node); } @@ -459,20 +462,28 @@ void SVMCompiler::generate_closure_node(ShaderNode *node, set<ShaderNode*>& done } } -void SVMCompiler::generated_shared_closure_nodes(ShaderNode *node, set<ShaderNode*>& done, set<ShaderNode*>& closure_done, const set<ShaderNode*>& shared) +void SVMCompiler::generated_shared_closure_nodes(ShaderNode *root_node, + ShaderNode *node, + set<ShaderNode*>& done, + set<ShaderNode*>& closure_done, + const set<ShaderNode*>& shared) { if(shared.find(node) != shared.end()) { - generate_multi_closure(node, done, closure_done); + generate_multi_closure(root_node, node, done, closure_done); } else { foreach(ShaderInput *in, node->inputs) { if(in->type == SHADER_SOCKET_CLOSURE && in->link) - generated_shared_closure_nodes(in->link->parent, done, closure_done, shared); + generated_shared_closure_nodes(root_node, in->link->parent, + done, closure_done, shared); } } } -void SVMCompiler::generate_multi_closure(ShaderNode *node, set<ShaderNode*>& done, set<ShaderNode*>& closure_done) +void SVMCompiler::generate_multi_closure(ShaderNode *root_node, + ShaderNode *node, + set<ShaderNode*>& done, + set<ShaderNode*>& closure_done) { /* only generate once */ if(closure_done.find(node) != closure_done.end()) @@ -509,12 +520,33 @@ void SVMCompiler::generate_multi_closure(ShaderNode *node, set<ShaderNode*>& don set_intersection(cl1deps.begin(), cl1deps.end(), cl2deps.begin(), cl2deps.end(), std::inserter(shareddeps, shareddeps.begin())); - + + /* it's possible some nodes are not shared between this mix node + * inputs, but still needed to be always executed, this mainly + * happens when a node of current subbranch is used by a parent + * 
node or so */ + if(root_node != node) { + foreach(ShaderInput *in, root_node->inputs) { + set<ShaderNode*> rootdeps; + find_dependencies(rootdeps, done, in, node); + set_intersection(rootdeps.begin(), rootdeps.end(), + cl1deps.begin(), cl1deps.end(), + std::inserter(shareddeps, shareddeps.begin())); + set_intersection(rootdeps.begin(), rootdeps.end(), + cl2deps.begin(), cl2deps.end(), + std::inserter(shareddeps, shareddeps.begin())); + } + } + if(!shareddeps.empty()) { - if(cl1in->link) - generated_shared_closure_nodes(cl1in->link->parent, done, closure_done, shareddeps); - if(cl2in->link) - generated_shared_closure_nodes(cl2in->link->parent, done, closure_done, shareddeps); + if(cl1in->link) { + generated_shared_closure_nodes(root_node, cl1in->link->parent, + done, closure_done, shareddeps); + } + if(cl2in->link) { + generated_shared_closure_nodes(root_node, cl2in->link->parent, + done, closure_done, shareddeps); + } generate_svm_nodes(shareddeps, done); } @@ -525,7 +557,7 @@ void SVMCompiler::generate_multi_closure(ShaderNode *node, set<ShaderNode*>& don svm_nodes.push_back(make_int4(NODE_JUMP_IF_ONE, 0, facin->stack_offset, 0)); int node_jump_skip_index = svm_nodes.size() - 1; - generate_multi_closure(cl1in->link->parent, done, closure_done); + generate_multi_closure(root_node, cl1in->link->parent, done, closure_done); /* fill in jump instruction location to be after closure */ svm_nodes[node_jump_skip_index].y = svm_nodes.size() - node_jump_skip_index - 1; @@ -537,7 +569,7 @@ void SVMCompiler::generate_multi_closure(ShaderNode *node, set<ShaderNode*>& don svm_nodes.push_back(make_int4(NODE_JUMP_IF_ZERO, 0, facin->stack_offset, 0)); int node_jump_skip_index = svm_nodes.size() - 1; - generate_multi_closure(cl2in->link->parent, done, closure_done); + generate_multi_closure(root_node, cl2in->link->parent, done, closure_done); /* fill in jump instruction location to be after closure */ svm_nodes[node_jump_skip_index].y = svm_nodes.size() - node_jump_skip_index - 1; 
@@ -551,9 +583,9 @@ void SVMCompiler::generate_multi_closure(ShaderNode *node, set<ShaderNode*>& don * to skip closures here because was already optimized due to * fixed weight or add closure that always needs both */ if(cl1in->link) - generate_multi_closure(cl1in->link->parent, done, closure_done); + generate_multi_closure(root_node, cl1in->link->parent, done, closure_done); if(cl2in->link) - generate_multi_closure(cl2in->link->parent, done, closure_done); + generate_multi_closure(root_node, cl2in->link->parent, done, closure_done); } } else { @@ -638,7 +670,8 @@ void SVMCompiler::compile_type(Shader *shader, ShaderGraph *graph, ShaderType ty if(generate) { set<ShaderNode*> done, closure_done; - generate_multi_closure(clin->link->parent, done, closure_done); + generate_multi_closure(clin->link->parent, clin->link->parent, + done, closure_done); } } diff --git a/intern/cycles/render/svm.h b/intern/cycles/render/svm.h index 45aa4d26926..c1dd96e4d80 100644 --- a/intern/cycles/render/svm.h +++ b/intern/cycles/render/svm.h @@ -123,15 +123,21 @@ protected: /* single closure */ void find_dependencies(set<ShaderNode*>& dependencies, - const set<ShaderNode*>& done, ShaderInput *input); + const set<ShaderNode*>& done, + ShaderInput *input, + ShaderNode *skip_node = NULL); void generate_node(ShaderNode *node, set<ShaderNode*>& done); void generate_closure_node(ShaderNode *node, set<ShaderNode*>& done); - void generated_shared_closure_nodes(ShaderNode *node, set<ShaderNode*>& done, + void generated_shared_closure_nodes(ShaderNode *root_node, ShaderNode *node, + set<ShaderNode*>& done, set<ShaderNode*>& closure_done, const set<ShaderNode*>& shared); void generate_svm_nodes(const set<ShaderNode*>& nodes, set<ShaderNode*>& done); /* multi closure */ - void generate_multi_closure(ShaderNode *node, set<ShaderNode*>& done, set<ShaderNode*>& closure_done); + void generate_multi_closure(ShaderNode *root_node, + ShaderNode *node, + set<ShaderNode*>& done, + set<ShaderNode*>& 
closure_done); /* compile */ void compile_type(Shader *shader, ShaderGraph *graph, ShaderType type); diff --git a/intern/cycles/render/tile.cpp b/intern/cycles/render/tile.cpp index 72bcdf966b5..e37d8e5f8a1 100644 --- a/intern/cycles/render/tile.cpp +++ b/intern/cycles/render/tile.cpp @@ -200,9 +200,9 @@ list<Tile>::iterator TileManager::next_background_tile(int device, TileOrder til switch (tile_order) { case TILE_CENTER: - distx = centx - (cur_tile.x + cur_tile.w); - disty = centy - (cur_tile.y + cur_tile.h); - distx = (int64_t) sqrt((double)distx * distx + disty * disty); + distx = centx - (cur_tile.x + (cur_tile.w / 2)); + disty = centy - (cur_tile.y + (cur_tile.h / 2)); + distx = (int64_t)sqrt((double)(distx * distx + disty * disty)); break; case TILE_RIGHT_TO_LEFT: distx = cordx - cur_tile.x; diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt index c1150d226ae..842d5efac79 100644 --- a/intern/cycles/util/CMakeLists.txt +++ b/intern/cycles/util/CMakeLists.txt @@ -1,21 +1,21 @@ set(INC . 
+ ../../glew-mx ) set(INC_SYS ${GLEW_INCLUDE_PATH} - ${OPENGL_INCLUDE_DIR} ) set(SRC util_cache.cpp - util_cuda.cpp util_dynlib.cpp + util_logging.cpp util_md5.cpp - util_opencl.cpp util_path.cpp util_string.cpp + util_simd.cpp util_system.cpp util_task.cpp util_time.cpp @@ -33,7 +33,6 @@ set(SRC_HEADERS util_args.h util_boundbox.h util_cache.h - util_cuda.h util_debug.h util_dynlib.h util_foreach.h @@ -42,10 +41,10 @@ set(SRC_HEADERS util_hash.h util_image.h util_list.h + util_logging.h util_map.h util_math.h util_md5.h - util_opencl.h util_opengl.h util_optimization.h util_param.h @@ -53,6 +52,9 @@ set(SRC_HEADERS util_progress.h util_set.h util_simd.h + util_sseb.h + util_ssef.h + util_ssei.h util_stats.h util_string.h util_system.h @@ -69,4 +71,6 @@ set(SRC_HEADERS include_directories(${INC}) include_directories(SYSTEM ${INC_SYS}) +add_definitions(${GL_DEFINITIONS}) + add_library(cycles_util ${SRC} ${SRC_HEADERS}) diff --git a/intern/cycles/util/util_boundbox.h b/intern/cycles/util/util_boundbox.h index 369082af60a..a71e0399619 100644 --- a/intern/cycles/util/util_boundbox.h +++ b/intern/cycles/util/util_boundbox.h @@ -167,6 +167,15 @@ public: return result; } + + __forceinline bool intersects(const BoundBox& other) + { + float3 center_diff = center() - other.center(), + total_size = (size() + other.size()) * 0.5f; + return fabsf(center_diff.x) <= total_size.x && + fabsf(center_diff.y) <= total_size.y && + fabsf(center_diff.z) <= total_size.z; + } }; __forceinline BoundBox merge(const BoundBox& bbox, const float3& pt) diff --git a/intern/cycles/util/util_cache.h b/intern/cycles/util/util_cache.h index 417f4a869b6..bfb2877a22b 100644 --- a/intern/cycles/util/util_cache.h +++ b/intern/cycles/util/util_cache.h @@ -25,7 +25,7 @@ * again into the appropriate data structures. * * This way we do not need to accurately track changes, compare dates and - * invalidate cache entries, at the cost of exta computation. 
If everything + * invalidate cache entries, at the cost of extra computation. If everything * is stored in a global cache, computations can perhaps even be shared between * different scenes where it may be hard to detect duplicate work. */ @@ -96,54 +96,70 @@ public: buffers.push_back(buffer); } - template<typename T> void read(array<T>& data) + template<typename T> bool read(array<T>& data) { size_t size; if(!fread(&size, sizeof(size), 1, f)) { fprintf(stderr, "Failed to read vector size from cache.\n"); - return; + return false; } if(!size) - return; + return false; data.resize(size/sizeof(T)); if(!fread(&data[0], size, 1, f)) { fprintf(stderr, "Failed to read vector data from cache (%lu).\n", (unsigned long)size); - return; + return false; } + return true; } - void read(int& data) + bool read(int& data) { size_t size; - if(!fread(&size, sizeof(size), 1, f)) + if(!fread(&size, sizeof(size), 1, f)) { fprintf(stderr, "Failed to read int size from cache.\n"); - if(!fread(&data, sizeof(data), 1, f)) + return false; + } + if(!fread(&data, sizeof(data), 1, f)) { fprintf(stderr, "Failed to read int from cache.\n"); + return false; + } + return true; } - void read(float& data) + bool read(float& data) { size_t size; - if(!fread(&size, sizeof(size), 1, f)) + if(!fread(&size, sizeof(size), 1, f)) { fprintf(stderr, "Failed to read float size from cache.\n"); - if(!fread(&data, sizeof(data), 1, f)) + return false; + } + if(!fread(&data, sizeof(data), 1, f)) { fprintf(stderr, "Failed to read float from cache.\n"); + return false; + } + return true; } - void read(size_t& data) + bool read(size_t& data) { size_t size; - if(!fread(&size, sizeof(size), 1, f)) + if(!fread(&size, sizeof(size), 1, f)) { fprintf(stderr, "Failed to read size_t size from cache.\n"); - if(!fread(&data, sizeof(data), 1, f)) + return false; + } + if(!fread(&data, sizeof(data), 1, f)) { fprintf(stderr, "Failed to read size_t from cache.\n"); + return false; + } + return true; } }; diff --git 
a/intern/cycles/util/util_color.h b/intern/cycles/util/util_color.h index b72cc6bc873..53b3d72de67 100644 --- a/intern/cycles/util/util_color.h +++ b/intern/cycles/util/util_color.h @@ -26,6 +26,27 @@ CCL_NAMESPACE_BEGIN +ccl_device uchar float_to_byte(float val) +{ + return ((val <= 0.0f) ? 0 : ((val > (1.0f - 0.5f / 255.0f)) ? 255 : (uchar)((255.0f * val) + 0.5f))); +} + +ccl_device uchar4 color_float_to_byte(float3 c) +{ + uchar r, g, b; + + r = float_to_byte(c.x); + g = float_to_byte(c.y); + b = float_to_byte(c.z); + + return make_uchar4(r, g, b, 0); +} + +ccl_device_inline float3 color_byte_to_float(uchar4 c) +{ + return make_float3(c.x*(1.0f/255.0f), c.y*(1.0f/255.0f), c.z*(1.0f/255.0f)); +} + ccl_device float color_srgb_to_scene_linear(float c) { if(c < 0.04045f) @@ -149,34 +170,34 @@ ccl_device float3 color_srgb_to_scene_linear(float3 c) #ifdef __KERNEL_SSE2__ /* * Calculate initial guess for arg^exp based on float representation - * This method gives a constant bias, which can be easily compensated by multiplicating with bias_coeff. + * This method gives a constant bias, which can be easily compensated by multiplication with bias_coeff. * Gives better results for exponents near 1 (e. g. 4/5). 
* exp = exponent, encoded as uint32_t * e2coeff = 2^(127/exponent - 127) * bias_coeff^(1/exponent), encoded as uint32_t */ template<unsigned exp, unsigned e2coeff> -ccl_device_inline __m128 fastpow(const __m128 &arg) +ccl_device_inline ssef fastpow(const ssef &arg) { - __m128 ret; - ret = _mm_mul_ps(arg, _mm_castsi128_ps(_mm_set1_epi32(e2coeff))); - ret = _mm_cvtepi32_ps(_mm_castps_si128(ret)); - ret = _mm_mul_ps(ret, _mm_castsi128_ps(_mm_set1_epi32(exp))); - ret = _mm_castsi128_ps(_mm_cvtps_epi32(ret)); + ssef ret; + ret = arg * cast(ssei(e2coeff)); + ret = ssef(cast(ret)); + ret = ret * cast(ssei(exp)); + ret = cast(ssei(ret)); return ret; } /* Improve x ^ 1.0f/5.0f solution with Newton-Raphson method */ -ccl_device_inline __m128 improve_5throot_solution(const __m128 &old_result, const __m128 &x) +ccl_device_inline ssef improve_5throot_solution(const ssef &old_result, const ssef &x) { - __m128 approx2 = _mm_mul_ps(old_result, old_result); - __m128 approx4 = _mm_mul_ps(approx2, approx2); - __m128 t = _mm_div_ps(x, approx4); - __m128 summ = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(4.0f), old_result), t); /* fma */ - return _mm_mul_ps(summ, _mm_set1_ps(1.0f/5.0f)); + ssef approx2 = old_result * old_result; + ssef approx4 = approx2 * approx2; + ssef t = x / approx4; + ssef summ = madd(ssef(4.0f), old_result, t); + return summ * ssef(1.0f/5.0f); } /* Calculate powf(x, 2.4). 
Working domain: 1e-10 < x < 1e+10 */ -ccl_device_inline __m128 fastpow24(const __m128 &arg) +ccl_device_inline ssef fastpow24(const ssef &arg) { /* max, avg and |avg| errors were calculated in gcc without FMA instructions * The final precision should be better than powf in glibc */ @@ -184,22 +205,22 @@ ccl_device_inline __m128 fastpow24(const __m128 &arg) /* Calculate x^4/5, coefficient 0.994 was constructed manually to minimize avg error */ /* 0x3F4CCCCD = 4/5 */ /* 0x4F55A7FB = 2^(127/(4/5) - 127) * 0.994^(1/(4/5)) */ - __m128 x = fastpow<0x3F4CCCCD, 0x4F55A7FB>(arg); // error max = 0.17 avg = 0.0018 |avg| = 0.05 - __m128 arg2 = _mm_mul_ps(arg, arg); - __m128 arg4 = _mm_mul_ps(arg2, arg2); + ssef x = fastpow<0x3F4CCCCD, 0x4F55A7FB>(arg); // error max = 0.17 avg = 0.0018 |avg| = 0.05 + ssef arg2 = arg * arg; + ssef arg4 = arg2 * arg2; x = improve_5throot_solution(x, arg4); /* error max = 0.018 avg = 0.0031 |avg| = 0.0031 */ x = improve_5throot_solution(x, arg4); /* error max = 0.00021 avg = 1.6e-05 |avg| = 1.6e-05 */ x = improve_5throot_solution(x, arg4); /* error max = 6.1e-07 avg = 5.2e-08 |avg| = 1.1e-07 */ - return _mm_mul_ps(x, _mm_mul_ps(x, x)); + return x * (x * x); } -ccl_device __m128 color_srgb_to_scene_linear(const __m128 &c) +ccl_device ssef color_srgb_to_scene_linear(const ssef &c) { - __m128 cmp = _mm_cmplt_ps(c, _mm_set1_ps(0.04045f)); - __m128 lt = _mm_max_ps(_mm_mul_ps(c, _mm_set1_ps(1.0f/12.92f)), _mm_set1_ps(0.0f)); - __m128 gtebase = _mm_mul_ps(_mm_add_ps(c, _mm_set1_ps(0.055f)), _mm_set1_ps(1.0f/1.055f)); /* fma */ - __m128 gte = fastpow24(gtebase); - return blend(cmp, lt, gte); + sseb cmp = c < ssef(0.04045f); + ssef lt = max(c * ssef(1.0f/12.92f), ssef(0.0f)); + ssef gtebase = (c + ssef(0.055f)) * ssef(1.0f/1.055f); /* fma */ + ssef gte = fastpow24(gtebase); + return select(cmp, lt, gte); } #endif diff --git a/intern/cycles/util/util_cuda.cpp b/intern/cycles/util/util_cuda.cpp deleted file mode 100644 index e9140633e4a..00000000000 --- 
a/intern/cycles/util/util_cuda.cpp +++ /dev/null @@ -1,495 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License - */ - -#include <iostream> - -#include <stdlib.h> -#include <stdio.h> - -#include "util_cuda.h" -#include "util_debug.h" -#include "util_dynlib.h" -#include "util_path.h" -#include "util_string.h" - -#ifdef _WIN32 -#define popen _popen -#define pclose _pclose -#endif - -/* function defininitions */ - -tcuInit *cuInit; -tcuDriverGetVersion *cuDriverGetVersion; -tcuDeviceGet *cuDeviceGet; -tcuDeviceGetCount *cuDeviceGetCount; -tcuDeviceGetName *cuDeviceGetName; -tcuDeviceComputeCapability *cuDeviceComputeCapability; -tcuDeviceTotalMem *cuDeviceTotalMem; -tcuDeviceGetProperties *cuDeviceGetProperties; -tcuDeviceGetAttribute *cuDeviceGetAttribute; -tcuCtxCreate *cuCtxCreate; -tcuCtxDestroy *cuCtxDestroy; -tcuCtxAttach *cuCtxAttach; -tcuCtxDetach *cuCtxDetach; -tcuCtxPushCurrent *cuCtxPushCurrent; -tcuCtxPopCurrent *cuCtxPopCurrent; -tcuCtxGetDevice *cuCtxGetDevice; -tcuCtxSynchronize *cuCtxSynchronize; -tcuModuleLoad *cuModuleLoad; -tcuModuleLoadData *cuModuleLoadData; -tcuModuleLoadDataEx *cuModuleLoadDataEx; -tcuModuleLoadFatBinary *cuModuleLoadFatBinary; -tcuModuleUnload *cuModuleUnload; -tcuModuleGetFunction *cuModuleGetFunction; -tcuModuleGetGlobal *cuModuleGetGlobal; -tcuModuleGetTexRef *cuModuleGetTexRef; -tcuModuleGetSurfRef *cuModuleGetSurfRef; -tcuMemGetInfo *cuMemGetInfo; -tcuMemAlloc 
*cuMemAlloc; -tcuMemAllocPitch *cuMemAllocPitch; -tcuMemFree *cuMemFree; -tcuMemGetAddressRange *cuMemGetAddressRange; -tcuMemAllocHost *cuMemAllocHost; -tcuMemFreeHost *cuMemFreeHost; -tcuMemHostAlloc *cuMemHostAlloc; -tcuMemHostGetDevicePointer *cuMemHostGetDevicePointer; -tcuMemHostGetFlags *cuMemHostGetFlags; -tcuMemcpyHtoD *cuMemcpyHtoD; -tcuMemcpyDtoH *cuMemcpyDtoH; -tcuMemcpyDtoD *cuMemcpyDtoD; -tcuMemcpyDtoA *cuMemcpyDtoA; -tcuMemcpyAtoD *cuMemcpyAtoD; -tcuMemcpyHtoA *cuMemcpyHtoA; -tcuMemcpyAtoH *cuMemcpyAtoH; -tcuMemcpyAtoA *cuMemcpyAtoA; -tcuMemcpy2D *cuMemcpy2D; -tcuMemcpy2DUnaligned *cuMemcpy2DUnaligned; -tcuMemcpy3D *cuMemcpy3D; -tcuMemcpyHtoDAsync *cuMemcpyHtoDAsync; -tcuMemcpyDtoHAsync *cuMemcpyDtoHAsync; -tcuMemcpyDtoDAsync *cuMemcpyDtoDAsync; -tcuMemcpyHtoAAsync *cuMemcpyHtoAAsync; -tcuMemcpyAtoHAsync *cuMemcpyAtoHAsync; -tcuMemcpy2DAsync *cuMemcpy2DAsync; -tcuMemcpy3DAsync *cuMemcpy3DAsync; -tcuMemsetD8 *cuMemsetD8; -tcuMemsetD16 *cuMemsetD16; -tcuMemsetD32 *cuMemsetD32; -tcuMemsetD2D8 *cuMemsetD2D8; -tcuMemsetD2D16 *cuMemsetD2D16; -tcuMemsetD2D32 *cuMemsetD2D32; -tcuFuncSetBlockShape *cuFuncSetBlockShape; -tcuFuncSetSharedSize *cuFuncSetSharedSize; -tcuFuncGetAttribute *cuFuncGetAttribute; -tcuFuncSetCacheConfig *cuFuncSetCacheConfig; -tcuArrayCreate *cuArrayCreate; -tcuArrayGetDescriptor *cuArrayGetDescriptor; -tcuArrayDestroy *cuArrayDestroy; -tcuArray3DCreate *cuArray3DCreate; -tcuArray3DGetDescriptor *cuArray3DGetDescriptor; -tcuTexRefCreate *cuTexRefCreate; -tcuTexRefDestroy *cuTexRefDestroy; -tcuTexRefSetArray *cuTexRefSetArray; -tcuTexRefSetAddress *cuTexRefSetAddress; -tcuTexRefSetAddress2D *cuTexRefSetAddress2D; -tcuTexRefSetFormat *cuTexRefSetFormat; -tcuTexRefSetAddressMode *cuTexRefSetAddressMode; -tcuTexRefSetFilterMode *cuTexRefSetFilterMode; -tcuTexRefSetFlags *cuTexRefSetFlags; -tcuTexRefGetAddress *cuTexRefGetAddress; -tcuTexRefGetArray *cuTexRefGetArray; -tcuTexRefGetAddressMode *cuTexRefGetAddressMode; -tcuTexRefGetFilterMode 
*cuTexRefGetFilterMode; -tcuTexRefGetFormat *cuTexRefGetFormat; -tcuTexRefGetFlags *cuTexRefGetFlags; -tcuSurfRefSetArray *cuSurfRefSetArray; -tcuSurfRefGetArray *cuSurfRefGetArray; -tcuParamSetSize *cuParamSetSize; -tcuParamSeti *cuParamSeti; -tcuParamSetf *cuParamSetf; -tcuParamSetv *cuParamSetv; -tcuParamSetTexRef *cuParamSetTexRef; -tcuLaunch *cuLaunch; -tcuLaunchGrid *cuLaunchGrid; -tcuLaunchGridAsync *cuLaunchGridAsync; -tcuEventCreate *cuEventCreate; -tcuEventRecord *cuEventRecord; -tcuEventQuery *cuEventQuery; -tcuEventSynchronize *cuEventSynchronize; -tcuEventDestroy *cuEventDestroy; -tcuEventElapsedTime *cuEventElapsedTime; -tcuStreamCreate *cuStreamCreate; -tcuStreamQuery *cuStreamQuery; -tcuStreamSynchronize *cuStreamSynchronize; -tcuStreamDestroy *cuStreamDestroy; -tcuGraphicsUnregisterResource *cuGraphicsUnregisterResource; -tcuGraphicsSubResourceGetMappedArray *cuGraphicsSubResourceGetMappedArray; -tcuGraphicsResourceGetMappedPointer *cuGraphicsResourceGetMappedPointer; -tcuGraphicsResourceSetMapFlags *cuGraphicsResourceSetMapFlags; -tcuGraphicsMapResources *cuGraphicsMapResources; -tcuGraphicsUnmapResources *cuGraphicsUnmapResources; -tcuGetExportTable *cuGetExportTable; -tcuCtxSetLimit *cuCtxSetLimit; -tcuCtxGetLimit *cuCtxGetLimit; -tcuGLCtxCreate *cuGLCtxCreate; -tcuGraphicsGLRegisterBuffer *cuGraphicsGLRegisterBuffer; -tcuGraphicsGLRegisterImage *cuGraphicsGLRegisterImage; -tcuCtxSetCurrent *cuCtxSetCurrent; - -CCL_NAMESPACE_BEGIN - -/* utility macros */ -#define CUDA_LIBRARY_FIND_CHECKED(name) \ - name = (t##name*)dynamic_library_find(lib, #name); - -#define CUDA_LIBRARY_FIND(name) \ - name = (t##name*)dynamic_library_find(lib, #name); \ - assert(name); - -#define CUDA_LIBRARY_FIND_V2(name) \ - name = (t##name*)dynamic_library_find(lib, #name "_v2"); \ - assert(name); - -/* initialization function */ - -bool cuLibraryInit() -{ - static bool initialized = false; - static bool result = false; - - if(initialized) - return result; - - initialized = 
true; - - /* library paths */ -#ifdef _WIN32 - /* expected in c:/windows/system or similar, no path needed */ - const char *path = "nvcuda.dll"; -#elif defined(__APPLE__) - /* default installation path */ - const char *path = "/usr/local/cuda/lib/libcuda.dylib"; -#else - const char *path = "libcuda.so"; -#endif - - /* load library */ - DynamicLibrary *lib = dynamic_library_open(path); - - if(lib == NULL) - return false; - - /* detect driver version */ - int driver_version = 1000; - - CUDA_LIBRARY_FIND_CHECKED(cuDriverGetVersion); - if(cuDriverGetVersion) - cuDriverGetVersion(&driver_version); - - /* we require version 4.0 */ - if(driver_version < 4000) - return false; - - /* fetch all function pointers */ - CUDA_LIBRARY_FIND(cuInit); - CUDA_LIBRARY_FIND(cuDeviceGet); - CUDA_LIBRARY_FIND(cuDeviceGetCount); - CUDA_LIBRARY_FIND(cuDeviceGetName); - CUDA_LIBRARY_FIND(cuDeviceComputeCapability); - CUDA_LIBRARY_FIND(cuDeviceTotalMem); - CUDA_LIBRARY_FIND(cuDeviceGetProperties); - CUDA_LIBRARY_FIND(cuDeviceGetAttribute); - CUDA_LIBRARY_FIND(cuCtxCreate); - CUDA_LIBRARY_FIND(cuCtxDestroy); - CUDA_LIBRARY_FIND(cuCtxAttach); - CUDA_LIBRARY_FIND(cuCtxDetach); - CUDA_LIBRARY_FIND(cuCtxPushCurrent); - CUDA_LIBRARY_FIND(cuCtxPopCurrent); - CUDA_LIBRARY_FIND(cuCtxGetDevice); - CUDA_LIBRARY_FIND(cuCtxSynchronize); - CUDA_LIBRARY_FIND(cuModuleLoad); - CUDA_LIBRARY_FIND(cuModuleLoadData); - CUDA_LIBRARY_FIND(cuModuleUnload); - CUDA_LIBRARY_FIND(cuModuleGetFunction); - CUDA_LIBRARY_FIND(cuModuleGetGlobal); - CUDA_LIBRARY_FIND(cuModuleGetTexRef); - CUDA_LIBRARY_FIND(cuMemGetInfo); - CUDA_LIBRARY_FIND(cuMemAlloc); - CUDA_LIBRARY_FIND(cuMemAllocPitch); - CUDA_LIBRARY_FIND(cuMemFree); - CUDA_LIBRARY_FIND(cuMemGetAddressRange); - CUDA_LIBRARY_FIND(cuMemAllocHost); - CUDA_LIBRARY_FIND(cuMemFreeHost); - CUDA_LIBRARY_FIND(cuMemHostAlloc); - CUDA_LIBRARY_FIND(cuMemHostGetDevicePointer); - CUDA_LIBRARY_FIND(cuMemcpyHtoD); - CUDA_LIBRARY_FIND(cuMemcpyDtoH); - CUDA_LIBRARY_FIND(cuMemcpyDtoD); - 
CUDA_LIBRARY_FIND(cuMemcpyDtoA); - CUDA_LIBRARY_FIND(cuMemcpyAtoD); - CUDA_LIBRARY_FIND(cuMemcpyHtoA); - CUDA_LIBRARY_FIND(cuMemcpyAtoH); - CUDA_LIBRARY_FIND(cuMemcpyAtoA); - CUDA_LIBRARY_FIND(cuMemcpy2D); - CUDA_LIBRARY_FIND(cuMemcpy2DUnaligned); - CUDA_LIBRARY_FIND(cuMemcpy3D); - CUDA_LIBRARY_FIND(cuMemcpyHtoDAsync); - CUDA_LIBRARY_FIND(cuMemcpyDtoHAsync); - CUDA_LIBRARY_FIND(cuMemcpyHtoAAsync); - CUDA_LIBRARY_FIND(cuMemcpyAtoHAsync); - CUDA_LIBRARY_FIND(cuMemcpy2DAsync); - CUDA_LIBRARY_FIND(cuMemcpy3DAsync); - CUDA_LIBRARY_FIND(cuMemsetD8); - CUDA_LIBRARY_FIND(cuMemsetD16); - CUDA_LIBRARY_FIND(cuMemsetD32); - CUDA_LIBRARY_FIND(cuMemsetD2D8); - CUDA_LIBRARY_FIND(cuMemsetD2D16); - CUDA_LIBRARY_FIND(cuMemsetD2D32); - CUDA_LIBRARY_FIND(cuFuncSetBlockShape); - CUDA_LIBRARY_FIND(cuFuncSetSharedSize); - CUDA_LIBRARY_FIND(cuFuncGetAttribute); - CUDA_LIBRARY_FIND(cuArrayCreate); - CUDA_LIBRARY_FIND(cuArrayGetDescriptor); - CUDA_LIBRARY_FIND(cuArrayDestroy); - CUDA_LIBRARY_FIND(cuArray3DCreate); - CUDA_LIBRARY_FIND(cuArray3DGetDescriptor); - CUDA_LIBRARY_FIND(cuTexRefCreate); - CUDA_LIBRARY_FIND(cuTexRefDestroy); - CUDA_LIBRARY_FIND(cuTexRefSetArray); - CUDA_LIBRARY_FIND(cuTexRefSetAddress); - CUDA_LIBRARY_FIND(cuTexRefSetAddress2D); - CUDA_LIBRARY_FIND(cuTexRefSetFormat); - CUDA_LIBRARY_FIND(cuTexRefSetAddressMode); - CUDA_LIBRARY_FIND(cuTexRefSetFilterMode); - CUDA_LIBRARY_FIND(cuTexRefSetFlags); - CUDA_LIBRARY_FIND(cuTexRefGetAddress); - CUDA_LIBRARY_FIND(cuTexRefGetArray); - CUDA_LIBRARY_FIND(cuTexRefGetAddressMode); - CUDA_LIBRARY_FIND(cuTexRefGetFilterMode); - CUDA_LIBRARY_FIND(cuTexRefGetFormat); - CUDA_LIBRARY_FIND(cuTexRefGetFlags); - CUDA_LIBRARY_FIND(cuParamSetSize); - CUDA_LIBRARY_FIND(cuParamSeti); - CUDA_LIBRARY_FIND(cuParamSetf); - CUDA_LIBRARY_FIND(cuParamSetv); - CUDA_LIBRARY_FIND(cuParamSetTexRef); - CUDA_LIBRARY_FIND(cuLaunch); - CUDA_LIBRARY_FIND(cuLaunchGrid); - CUDA_LIBRARY_FIND(cuLaunchGridAsync); - CUDA_LIBRARY_FIND(cuEventCreate); - 
CUDA_LIBRARY_FIND(cuEventRecord); - CUDA_LIBRARY_FIND(cuEventQuery); - CUDA_LIBRARY_FIND(cuEventSynchronize); - CUDA_LIBRARY_FIND(cuEventDestroy); - CUDA_LIBRARY_FIND(cuEventElapsedTime); - CUDA_LIBRARY_FIND(cuStreamCreate); - CUDA_LIBRARY_FIND(cuStreamQuery); - CUDA_LIBRARY_FIND(cuStreamSynchronize); - CUDA_LIBRARY_FIND(cuStreamDestroy); - - /* cuda 2.1 */ - CUDA_LIBRARY_FIND(cuModuleLoadDataEx); - CUDA_LIBRARY_FIND(cuModuleLoadFatBinary); - CUDA_LIBRARY_FIND(cuGLCtxCreate); - CUDA_LIBRARY_FIND(cuGraphicsGLRegisterBuffer); - CUDA_LIBRARY_FIND(cuGraphicsGLRegisterImage); - - /* cuda 2.3 */ - CUDA_LIBRARY_FIND(cuMemHostGetFlags); - CUDA_LIBRARY_FIND(cuGraphicsGLRegisterBuffer); - CUDA_LIBRARY_FIND(cuGraphicsGLRegisterImage); - - /* cuda 3.0 */ - CUDA_LIBRARY_FIND(cuMemcpyDtoDAsync); - CUDA_LIBRARY_FIND(cuFuncSetCacheConfig); - CUDA_LIBRARY_FIND(cuGraphicsUnregisterResource); - CUDA_LIBRARY_FIND(cuGraphicsSubResourceGetMappedArray); - CUDA_LIBRARY_FIND(cuGraphicsResourceGetMappedPointer); - CUDA_LIBRARY_FIND(cuGraphicsResourceSetMapFlags); - CUDA_LIBRARY_FIND(cuGraphicsMapResources); - CUDA_LIBRARY_FIND(cuGraphicsUnmapResources); - CUDA_LIBRARY_FIND(cuGetExportTable); - - /* cuda 3.1 */ - CUDA_LIBRARY_FIND(cuModuleGetSurfRef); - CUDA_LIBRARY_FIND(cuSurfRefSetArray); - CUDA_LIBRARY_FIND(cuSurfRefGetArray); - CUDA_LIBRARY_FIND(cuCtxSetLimit); - CUDA_LIBRARY_FIND(cuCtxGetLimit); - - /* functions which changed 3.1 -> 3.2 for 64 bit stuff, the cuda library - * has both the old ones for compatibility and new ones with _v2 postfix, - * we load the _v2 ones here. 
*/ - CUDA_LIBRARY_FIND_V2(cuDeviceTotalMem); - CUDA_LIBRARY_FIND_V2(cuCtxCreate); - CUDA_LIBRARY_FIND_V2(cuModuleGetGlobal); - CUDA_LIBRARY_FIND_V2(cuMemGetInfo); - CUDA_LIBRARY_FIND_V2(cuMemAlloc); - CUDA_LIBRARY_FIND_V2(cuMemAllocPitch); - CUDA_LIBRARY_FIND_V2(cuMemFree); - CUDA_LIBRARY_FIND_V2(cuMemGetAddressRange); - CUDA_LIBRARY_FIND_V2(cuMemAllocHost); - CUDA_LIBRARY_FIND_V2(cuMemHostGetDevicePointer); - CUDA_LIBRARY_FIND_V2(cuMemcpyHtoD); - CUDA_LIBRARY_FIND_V2(cuMemcpyDtoH); - CUDA_LIBRARY_FIND_V2(cuMemcpyDtoD); - CUDA_LIBRARY_FIND_V2(cuMemcpyDtoA); - CUDA_LIBRARY_FIND_V2(cuMemcpyAtoD); - CUDA_LIBRARY_FIND_V2(cuMemcpyHtoA); - CUDA_LIBRARY_FIND_V2(cuMemcpyAtoH); - CUDA_LIBRARY_FIND_V2(cuMemcpyAtoA); - CUDA_LIBRARY_FIND_V2(cuMemcpyHtoAAsync); - CUDA_LIBRARY_FIND_V2(cuMemcpyAtoHAsync); - CUDA_LIBRARY_FIND_V2(cuMemcpy2D); - CUDA_LIBRARY_FIND_V2(cuMemcpy2DUnaligned); - CUDA_LIBRARY_FIND_V2(cuMemcpy3D); - CUDA_LIBRARY_FIND_V2(cuMemcpyHtoDAsync); - CUDA_LIBRARY_FIND_V2(cuMemcpyDtoHAsync); - CUDA_LIBRARY_FIND_V2(cuMemcpyDtoDAsync); - CUDA_LIBRARY_FIND_V2(cuMemcpy2DAsync); - CUDA_LIBRARY_FIND_V2(cuMemcpy3DAsync); - CUDA_LIBRARY_FIND_V2(cuMemsetD8); - CUDA_LIBRARY_FIND_V2(cuMemsetD16); - CUDA_LIBRARY_FIND_V2(cuMemsetD32); - CUDA_LIBRARY_FIND_V2(cuMemsetD2D8); - CUDA_LIBRARY_FIND_V2(cuMemsetD2D16); - CUDA_LIBRARY_FIND_V2(cuMemsetD2D32); - CUDA_LIBRARY_FIND_V2(cuArrayCreate); - CUDA_LIBRARY_FIND_V2(cuArrayGetDescriptor); - CUDA_LIBRARY_FIND_V2(cuArray3DCreate); - CUDA_LIBRARY_FIND_V2(cuArray3DGetDescriptor); - CUDA_LIBRARY_FIND_V2(cuTexRefSetAddress); - CUDA_LIBRARY_FIND_V2(cuTexRefSetAddress2D); - CUDA_LIBRARY_FIND_V2(cuTexRefGetAddress); - CUDA_LIBRARY_FIND_V2(cuGraphicsResourceGetMappedPointer); - CUDA_LIBRARY_FIND_V2(cuGLCtxCreate); - - /* cuda 4.0 */ - CUDA_LIBRARY_FIND(cuCtxSetCurrent); - - if(cuHavePrecompiledKernels()) - result = true; -#ifndef _WIN32 - else if(cuCompilerPath() != "") - result = true; -#endif - - return result; -} - -bool 
cuHavePrecompiledKernels() -{ - string cubins_path = path_get("lib"); - - return path_exists(cubins_path); -} - -string cuCompilerPath() -{ -#ifdef _WIN32 - const char *defaultpaths[] = {"C:/CUDA/bin", NULL}; - const char *executable = "nvcc.exe"; -#else - const char *defaultpaths[] = { - "/Developer/NVIDIA/CUDA-5.0/bin", - "/usr/local/cuda-5.0/bin", - "/usr/local/cuda/bin", - "/Developer/NVIDIA/CUDA-6.0/bin", - "/usr/local/cuda-6.0/bin", - "/Developer/NVIDIA/CUDA-5.5/bin", - "/usr/local/cuda-5.5/bin", - NULL}; - const char *executable = "nvcc"; -#endif - - const char *binpath = getenv("CUDA_BIN_PATH"); - - string nvcc; - - if(binpath) { - nvcc = path_join(binpath, executable); - if(path_exists(nvcc)) - return nvcc; - } - - for(int i = 0; defaultpaths[i]; i++) { - nvcc = path_join(defaultpaths[i], executable); - if(path_exists(nvcc)) - return nvcc; - } - -#ifndef _WIN32 - { - FILE *handle = popen("which nvcc", "r"); - if(handle) { - char buffer[4096] = {0}; - int len = fread(buffer, 1, sizeof(buffer) - 1, handle); - buffer[len] = '\0'; - pclose(handle); - - if(buffer[0]) - return "nvcc"; - } - } -#endif - - return ""; -} - -int cuCompilerVersion() -{ - string path = cuCompilerPath(); - if(path == "") - return 0; - - /* get --version output */ - FILE *pipe = popen((path + " --version").c_str(), "r"); - if(!pipe) { - fprintf(stderr, "CUDA: failed to run compiler to retrieve version"); - return 0; - } - - char buf[128]; - string output = ""; - - while(!feof(pipe)) - if(fgets(buf, 128, pipe) != NULL) - output += buf; - - pclose(pipe); - - /* parse version number */ - string marker = "Cuda compilation tools, release "; - size_t offset = output.find(marker); - if(offset == string::npos) { - fprintf(stderr, "CUDA: failed to find version number in:\n\n%s\n", output.c_str()); - return 0; - } - - string versionstr = output.substr(offset + marker.size(), string::npos); - int major, minor; - - if(sscanf(versionstr.c_str(), "%d.%d", &major, &minor) < 2) { - fprintf(stderr, 
"CUDA: failed to parse version number from:\n\n%s\n", output.c_str()); - return 0; - } - - return 10*major + minor; -} - -CCL_NAMESPACE_END - diff --git a/intern/cycles/util/util_cuda.h b/intern/cycles/util/util_cuda.h deleted file mode 100644 index 0c80303df9b..00000000000 --- a/intern/cycles/util/util_cuda.h +++ /dev/null @@ -1,624 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License - */ - -#ifndef __UTIL_CUDA_H__ -#define __UTIL_CUDA_H__ - -#include <stdlib.h> -#include "util_opengl.h" -#include "util_string.h" - -CCL_NAMESPACE_BEGIN - -/* CUDA is linked in dynamically at runtime, so we can start the application - * without requiring a CUDA installation. Code adapted from the example - * matrixMulDynlinkJIT in the CUDA SDK. 
*/ - -bool cuLibraryInit(); -bool cuHavePrecompiledKernels(); -string cuCompilerPath(); -int cuCompilerVersion(); - -CCL_NAMESPACE_END - -/* defines, structs, enums */ - -#define CUDA_VERSION 3020 - -#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64) || defined(__LP64__) -typedef unsigned long long CUdeviceptr; -#else -typedef unsigned int CUdeviceptr; -#endif - -typedef int CUdevice; -typedef struct CUctx_st *CUcontext; -typedef struct CUmod_st *CUmodule; -typedef struct CUfunc_st *CUfunction; -typedef struct CUarray_st *CUarray; -typedef struct CUtexref_st *CUtexref; -typedef struct CUsurfref_st *CUsurfref; -typedef struct CUevent_st *CUevent; -typedef struct CUstream_st *CUstream; -typedef struct CUgraphicsResource_st *CUgraphicsResource; - -typedef struct CUuuid_st { - char bytes[16]; -} CUuuid; - -typedef enum CUctx_flags_enum { - CU_CTX_SCHED_AUTO = 0, - CU_CTX_SCHED_SPIN = 1, - CU_CTX_SCHED_YIELD = 2, - CU_CTX_SCHED_MASK = 0x3, - CU_CTX_BLOCKING_SYNC = 4, - CU_CTX_MAP_HOST = 8, - CU_CTX_LMEM_RESIZE_TO_MAX = 16, - CU_CTX_FLAGS_MASK = 0x1f -} CUctx_flags; - -typedef enum CUevent_flags_enum { - CU_EVENT_DEFAULT = 0, - CU_EVENT_BLOCKING_SYNC = 1, - CU_EVENT_DISABLE_TIMING = 2 -} CUevent_flags; - -typedef enum CUarray_format_enum { - CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, - CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, - CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, - CU_AD_FORMAT_SIGNED_INT8 = 0x08, - CU_AD_FORMAT_SIGNED_INT16 = 0x09, - CU_AD_FORMAT_SIGNED_INT32 = 0x0a, - CU_AD_FORMAT_HALF = 0x10, - CU_AD_FORMAT_FLOAT = 0x20 -} CUarray_format; - -typedef enum CUaddress_mode_enum { - CU_TR_ADDRESS_MODE_WRAP = 0, - CU_TR_ADDRESS_MODE_CLAMP = 1, - CU_TR_ADDRESS_MODE_MIRROR = 2, - CU_TR_ADDRESS_MODE_BORDER = 3 -} CUaddress_mode; - -typedef enum CUfilter_mode_enum { - CU_TR_FILTER_MODE_POINT = 0, - CU_TR_FILTER_MODE_LINEAR = 1 -} CUfilter_mode; - -typedef enum CUdevice_attribute_enum { - CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, - 
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, - CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, - CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, - CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, - CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, - CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, - CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, - CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, - CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, - CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, - CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, - CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, - CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, - CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, - CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, - CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, - CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, - CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, - CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, - CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, - CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35 -} CUdevice_attribute; - -typedef struct CUdevprop_st { - int maxThreadsPerBlock; - int maxThreadsDim[3]; - int maxGridSize[3]; - int sharedMemPerBlock; - int totalConstantMemory; - int SIMDWidth; - int memPitch; - int regsPerBlock; - int clockRate; - int textureAlign; -} CUdevprop; - -typedef enum CUfunction_attribute_enum { - 
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, - CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1, - CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2, - CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3, - CU_FUNC_ATTRIBUTE_NUM_REGS = 4, - CU_FUNC_ATTRIBUTE_PTX_VERSION = 5, - CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6, - CU_FUNC_ATTRIBUTE_MAX -} CUfunction_attribute; - -typedef enum CUfunc_cache_enum { - CU_FUNC_CACHE_PREFER_NONE = 0x00, - CU_FUNC_CACHE_PREFER_SHARED = 0x01, - CU_FUNC_CACHE_PREFER_L1 = 0x02 -} CUfunc_cache; - -typedef enum CUmemorytype_enum { - CU_MEMORYTYPE_HOST = 0x01, - CU_MEMORYTYPE_DEVICE = 0x02, - CU_MEMORYTYPE_ARRAY = 0x03 -} CUmemorytype; - -typedef enum CUcomputemode_enum { - CU_COMPUTEMODE_DEFAULT = 0, - CU_COMPUTEMODE_EXCLUSIVE = 1, - CU_COMPUTEMODE_PROHIBITED = 2 -} CUcomputemode; - -typedef enum CUjit_option_enum -{ - CU_JIT_MAX_REGISTERS = 0, - CU_JIT_THREADS_PER_BLOCK, - CU_JIT_WALL_TIME, - CU_JIT_INFO_LOG_BUFFER, - CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, - CU_JIT_ERROR_LOG_BUFFER, - CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, - CU_JIT_OPTIMIZATION_LEVEL, - CU_JIT_TARGET_FROM_CUCONTEXT, - CU_JIT_TARGET, - CU_JIT_FALLBACK_STRATEGY - -} CUjit_option; - -typedef enum CUjit_target_enum -{ - CU_TARGET_COMPUTE_10 = 0, - CU_TARGET_COMPUTE_11, - CU_TARGET_COMPUTE_12, - CU_TARGET_COMPUTE_13, - CU_TARGET_COMPUTE_20, - CU_TARGET_COMPUTE_21, - CU_TARGET_COMPUTE_30, - CU_TARGET_COMPUTE_35, - CU_TARGET_COMPUTE_50 -} CUjit_target; - -typedef enum CUjit_fallback_enum -{ - CU_PREFER_PTX = 0, - CU_PREFER_BINARY - -} CUjit_fallback; - -typedef enum CUgraphicsRegisterFlags_enum { - CU_GRAPHICS_REGISTER_FLAGS_NONE = 0x00 -} CUgraphicsRegisterFlags; - -typedef enum CUgraphicsMapResourceFlags_enum { - CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE = 0x00, - CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01, - CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02 -} CUgraphicsMapResourceFlags; - -typedef enum CUarray_cubemap_face_enum { - CU_CUBEMAP_FACE_POSITIVE_X = 0x00, - CU_CUBEMAP_FACE_NEGATIVE_X = 0x01, - 
CU_CUBEMAP_FACE_POSITIVE_Y = 0x02, - CU_CUBEMAP_FACE_NEGATIVE_Y = 0x03, - CU_CUBEMAP_FACE_POSITIVE_Z = 0x04, - CU_CUBEMAP_FACE_NEGATIVE_Z = 0x05 -} CUarray_cubemap_face; - -typedef enum CUlimit_enum { - CU_LIMIT_STACK_SIZE = 0x00, - CU_LIMIT_PRINTF_FIFO_SIZE = 0x01, - CU_LIMIT_MALLOC_HEAP_SIZE = 0x02 -} CUlimit; - -typedef enum cudaError_enum { - CUDA_SUCCESS = 0, - CUDA_ERROR_INVALID_VALUE = 1, - CUDA_ERROR_OUT_OF_MEMORY = 2, - CUDA_ERROR_NOT_INITIALIZED = 3, - CUDA_ERROR_DEINITIALIZED = 4, - CUDA_ERROR_NO_DEVICE = 100, - CUDA_ERROR_INVALID_DEVICE = 101, - CUDA_ERROR_INVALID_IMAGE = 200, - CUDA_ERROR_INVALID_CONTEXT = 201, - CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202, - CUDA_ERROR_MAP_FAILED = 205, - CUDA_ERROR_UNMAP_FAILED = 206, - CUDA_ERROR_ARRAY_IS_MAPPED = 207, - CUDA_ERROR_ALREADY_MAPPED = 208, - CUDA_ERROR_NO_BINARY_FOR_GPU = 209, - CUDA_ERROR_ALREADY_ACQUIRED = 210, - CUDA_ERROR_NOT_MAPPED = 211, - CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212, - CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213, - CUDA_ERROR_ECC_UNCORRECTABLE = 214, - CUDA_ERROR_UNSUPPORTED_LIMIT = 215, - CUDA_ERROR_INVALID_SOURCE = 300, - CUDA_ERROR_FILE_NOT_FOUND = 301, - CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302, - CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303, - CUDA_ERROR_OPERATING_SYSTEM = 304, - CUDA_ERROR_INVALID_HANDLE = 400, - CUDA_ERROR_NOT_FOUND = 500, - CUDA_ERROR_NOT_READY = 600, - CUDA_ERROR_LAUNCH_FAILED = 700, - CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701, - CUDA_ERROR_LAUNCH_TIMEOUT = 702, - CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703, - CUDA_ERROR_UNKNOWN = 999 -} CUresult; - -#define CU_MEMHOSTALLOC_PORTABLE 0x01 -#define CU_MEMHOSTALLOC_DEVICEMAP 0x02 -#define CU_MEMHOSTALLOC_WRITECOMBINED 0x04 - -typedef struct CUDA_MEMCPY2D_st { - size_t srcXInBytes; - size_t srcY; - - CUmemorytype srcMemoryType; - const void *srcHost; - CUdeviceptr srcDevice; - CUarray srcArray; - size_t srcPitch; - - size_t dstXInBytes; - size_t dstY; - - CUmemorytype dstMemoryType; - void *dstHost; - CUdeviceptr 
dstDevice; - CUarray dstArray; - size_t dstPitch; - - size_t WidthInBytes; - size_t Height; -} CUDA_MEMCPY2D; - -typedef struct CUDA_MEMCPY3D_st { - size_t srcXInBytes; - size_t srcY; - size_t srcZ; - size_t srcLOD; - CUmemorytype srcMemoryType; - const void *srcHost; - CUdeviceptr srcDevice; - CUarray srcArray; - void *reserved0; - size_t srcPitch; - size_t srcHeight; - - size_t dstXInBytes; - size_t dstY; - size_t dstZ; - size_t dstLOD; - CUmemorytype dstMemoryType; - void *dstHost; - CUdeviceptr dstDevice; - CUarray dstArray; - void *reserved1; - size_t dstPitch; - size_t dstHeight; - - size_t WidthInBytes; - size_t Height; - size_t Depth; -} CUDA_MEMCPY3D; - -typedef struct CUDA_ARRAY_DESCRIPTOR_st -{ - size_t Width; - size_t Height; - - CUarray_format Format; - unsigned int NumChannels; -} CUDA_ARRAY_DESCRIPTOR; - -typedef struct CUDA_ARRAY3D_DESCRIPTOR_st -{ - size_t Width; - size_t Height; - size_t Depth; - - CUarray_format Format; - unsigned int NumChannels; - unsigned int Flags; -} CUDA_ARRAY3D_DESCRIPTOR; - -#define CUDA_ARRAY3D_2DARRAY 0x01 -#define CUDA_ARRAY3D_SURFACE_LDST 0x02 -#define CU_TRSA_OVERRIDE_FORMAT 0x01 -#define CU_TRSF_READ_AS_INTEGER 0x01 -#define CU_TRSF_NORMALIZED_COORDINATES 0x02 -#define CU_TRSF_SRGB 0x10 -#define CU_PARAM_TR_DEFAULT -1 - -#ifdef _WIN32 -#define CUDAAPI __stdcall -#else -#define CUDAAPI -#endif - -/* function types */ - -typedef CUresult CUDAAPI tcuInit(unsigned int Flags); -typedef CUresult CUDAAPI tcuDriverGetVersion(int *driverVersion); -typedef CUresult CUDAAPI tcuDeviceGet(CUdevice *device, int ordinal); -typedef CUresult CUDAAPI tcuDeviceGetCount(int *count); -typedef CUresult CUDAAPI tcuDeviceGetName(char *name, int len, CUdevice dev); -typedef CUresult CUDAAPI tcuDeviceComputeCapability(int *major, int *minor, CUdevice dev); -typedef CUresult CUDAAPI tcuDeviceTotalMem(size_t *bytes, CUdevice dev); -typedef CUresult CUDAAPI tcuDeviceGetProperties(CUdevprop *prop, CUdevice dev); -typedef CUresult CUDAAPI 
tcuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev); -typedef CUresult CUDAAPI tcuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); -typedef CUresult CUDAAPI tcuCtxDestroy(CUcontext ctx); -typedef CUresult CUDAAPI tcuCtxAttach(CUcontext *pctx, unsigned int flags); -typedef CUresult CUDAAPI tcuCtxDetach(CUcontext ctx); -typedef CUresult CUDAAPI tcuCtxPushCurrent(CUcontext ctx ); -typedef CUresult CUDAAPI tcuCtxPopCurrent(CUcontext *pctx); -typedef CUresult CUDAAPI tcuCtxGetDevice(CUdevice *device); -typedef CUresult CUDAAPI tcuCtxSynchronize(void); -typedef CUresult CUDAAPI tcuCtxSetLimit(CUlimit limit, size_t value); -typedef CUresult CUDAAPI tcuCtxGetLimit(size_t *pvalue, CUlimit limit); -typedef CUresult CUDAAPI tcuCtxGetCacheConfig(CUfunc_cache *pconfig); -typedef CUresult CUDAAPI tcuCtxSetCacheConfig(CUfunc_cache config); -typedef CUresult CUDAAPI tcuCtxGetApiVersion(CUcontext ctx, unsigned int *version); -typedef CUresult CUDAAPI tcuModuleLoad(CUmodule *module, const char *fname); -typedef CUresult CUDAAPI tcuModuleLoadData(CUmodule *module, const void *image); -typedef CUresult CUDAAPI tcuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues); -typedef CUresult CUDAAPI tcuModuleLoadFatBinary(CUmodule *module, const void *fatCubin); -typedef CUresult CUDAAPI tcuModuleUnload(CUmodule hmod); -typedef CUresult CUDAAPI tcuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name); -typedef CUresult CUDAAPI tcuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name); -typedef CUresult CUDAAPI tcuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name); -typedef CUresult CUDAAPI tcuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name); -typedef CUresult CUDAAPI tcuMemGetInfo(size_t *free, size_t *total); -typedef CUresult CUDAAPI tcuMemAlloc(CUdeviceptr *dptr, size_t bytesize); -typedef CUresult CUDAAPI 
tcuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes); -typedef CUresult CUDAAPI tcuMemFree(CUdeviceptr dptr); -typedef CUresult CUDAAPI tcuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr); -typedef CUresult CUDAAPI tcuMemAllocHost(void **pp, size_t bytesize); -typedef CUresult CUDAAPI tcuMemFreeHost(void *p); -typedef CUresult CUDAAPI tcuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags); -typedef CUresult CUDAAPI tcuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags); -typedef CUresult CUDAAPI tcuMemHostGetFlags(unsigned int *pFlags, void *p); -typedef CUresult CUDAAPI tcuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount); -typedef CUresult CUDAAPI tcuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount); -typedef CUresult CUDAAPI tcuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); -typedef CUresult CUDAAPI tcuMemcpyDtoA(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount); -typedef CUresult CUDAAPI tcuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount); -typedef CUresult CUDAAPI tcuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount); -typedef CUresult CUDAAPI tcuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount); -typedef CUresult CUDAAPI tcuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount); -typedef CUresult CUDAAPI tcuMemcpy2D(const CUDA_MEMCPY2D *pCopy); -typedef CUresult CUDAAPI tcuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy); -typedef CUresult CUDAAPI tcuMemcpy3D(const CUDA_MEMCPY3D *pCopy); -typedef CUresult CUDAAPI tcuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream); -typedef CUresult CUDAAPI tcuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, size_t 
ByteCount, CUstream hStream); -typedef CUresult CUDAAPI tcuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); -typedef CUresult CUDAAPI tcuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream); -typedef CUresult CUDAAPI tcuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream); -typedef CUresult CUDAAPI tcuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream); -typedef CUresult CUDAAPI tcuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream); -typedef CUresult CUDAAPI tcuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N); -typedef CUresult CUDAAPI tcuMemsetD16(CUdeviceptr dstDevice, unsigned short us, size_t N); -typedef CUresult CUDAAPI tcuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N); -typedef CUresult CUDAAPI tcuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height); -typedef CUresult CUDAAPI tcuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height); -typedef CUresult CUDAAPI tcuMemsetD2D32(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height); -typedef CUresult CUDAAPI tcuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream); -typedef CUresult CUDAAPI tcuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream); -typedef CUresult CUDAAPI tcuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream); -typedef CUresult CUDAAPI tcuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream); -typedef CUresult CUDAAPI tcuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream); -typedef CUresult CUDAAPI tcuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, 
unsigned int ui, size_t Width, size_t Height, CUstream hStream); -typedef CUresult CUDAAPI tcuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray); -typedef CUresult CUDAAPI tcuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray); -typedef CUresult CUDAAPI tcuArrayDestroy(CUarray hArray); -typedef CUresult CUDAAPI tcuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray); -typedef CUresult CUDAAPI tcuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray); -typedef CUresult CUDAAPI tcuStreamCreate(CUstream *phStream, unsigned int Flags); -typedef CUresult CUDAAPI tcuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags); -typedef CUresult CUDAAPI tcuStreamQuery(CUstream hStream); -typedef CUresult CUDAAPI tcuStreamSynchronize(CUstream hStream); -typedef CUresult CUDAAPI tcuStreamDestroy(CUstream hStream); -typedef CUresult CUDAAPI tcuEventCreate(CUevent *phEvent, unsigned int Flags); -typedef CUresult CUDAAPI tcuEventRecord(CUevent hEvent, CUstream hStream); -typedef CUresult CUDAAPI tcuEventQuery(CUevent hEvent); -typedef CUresult CUDAAPI tcuEventSynchronize(CUevent hEvent); -typedef CUresult CUDAAPI tcuEventDestroy(CUevent hEvent); -typedef CUresult CUDAAPI tcuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd); -typedef CUresult CUDAAPI tcuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z); -typedef CUresult CUDAAPI tcuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes); -typedef CUresult CUDAAPI tcuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc); -typedef CUresult CUDAAPI tcuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config); -typedef CUresult CUDAAPI tcuParamSetSize(CUfunction hfunc, unsigned int numbytes); -typedef CUresult CUDAAPI tcuParamSeti(CUfunction hfunc, int offset, unsigned int value); -typedef CUresult CUDAAPI tcuParamSetf(CUfunction hfunc, int offset, float value); -typedef CUresult 
CUDAAPI tcuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes); -typedef CUresult CUDAAPI tcuLaunch(CUfunction f); -typedef CUresult CUDAAPI tcuLaunchGrid(CUfunction f, int grid_width, int grid_height); -typedef CUresult CUDAAPI tcuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream); -typedef CUresult CUDAAPI tcuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef); -typedef CUresult CUDAAPI tcuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags); -typedef CUresult CUDAAPI tcuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes); -typedef CUresult CUDAAPI tcuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch); -typedef CUresult CUDAAPI tcuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents); -typedef CUresult CUDAAPI tcuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am); -typedef CUresult CUDAAPI tcuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm); -typedef CUresult CUDAAPI tcuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags); -typedef CUresult CUDAAPI tcuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef); -typedef CUresult CUDAAPI tcuTexRefGetArray(CUarray *phArray, CUtexref hTexRef); -typedef CUresult CUDAAPI tcuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim); -typedef CUresult CUDAAPI tcuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef); -typedef CUresult CUDAAPI tcuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef); -typedef CUresult CUDAAPI tcuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef); -typedef CUresult CUDAAPI tcuTexRefCreate(CUtexref *pTexRef); -typedef CUresult CUDAAPI tcuTexRefDestroy(CUtexref hTexRef); -typedef CUresult CUDAAPI tcuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags); -typedef CUresult CUDAAPI tcuSurfRefGetArray(CUarray *phArray, CUsurfref 
hSurfRef); -typedef CUresult CUDAAPI tcuGraphicsUnregisterResource(CUgraphicsResource resource); -typedef CUresult CUDAAPI tcuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel); -typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource); -typedef CUresult CUDAAPI tcuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags); -typedef CUresult CUDAAPI tcuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); -typedef CUresult CUDAAPI tcuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); -typedef CUresult CUDAAPI tcuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId); -typedef CUresult CUDAAPI tcuGLCtxCreate(CUcontext *pCtx, unsigned int Flags, CUdevice device ); -typedef CUresult CUDAAPI tcuGraphicsGLRegisterBuffer(CUgraphicsResource *pCudaResource, GLuint buffer, unsigned int Flags); -typedef CUresult CUDAAPI tcuGraphicsGLRegisterImage(CUgraphicsResource *pCudaResource, GLuint image, GLenum target, unsigned int Flags); -typedef CUresult CUDAAPI tcuCtxSetCurrent(CUcontext ctx); - -/* function declarations */ - -extern tcuInit *cuInit; -extern tcuDriverGetVersion *cuDriverGetVersion; -extern tcuDeviceGet *cuDeviceGet; -extern tcuDeviceGetCount *cuDeviceGetCount; -extern tcuDeviceGetName *cuDeviceGetName; -extern tcuDeviceComputeCapability *cuDeviceComputeCapability; -extern tcuDeviceTotalMem *cuDeviceTotalMem; -extern tcuDeviceGetProperties *cuDeviceGetProperties; -extern tcuDeviceGetAttribute *cuDeviceGetAttribute; -extern tcuCtxCreate *cuCtxCreate; -extern tcuCtxDestroy *cuCtxDestroy; -extern tcuCtxAttach *cuCtxAttach; -extern tcuCtxDetach *cuCtxDetach; -extern tcuCtxPushCurrent *cuCtxPushCurrent; -extern tcuCtxPopCurrent *cuCtxPopCurrent; -extern tcuCtxGetDevice *cuCtxGetDevice; -extern tcuCtxSynchronize 
*cuCtxSynchronize; -extern tcuModuleLoad *cuModuleLoad; -extern tcuModuleLoadData *cuModuleLoadData; -extern tcuModuleLoadDataEx *cuModuleLoadDataEx; -extern tcuModuleLoadFatBinary *cuModuleLoadFatBinary; -extern tcuModuleUnload *cuModuleUnload; -extern tcuModuleGetFunction *cuModuleGetFunction; -extern tcuModuleGetGlobal *cuModuleGetGlobal; -extern tcuModuleGetTexRef *cuModuleGetTexRef; -extern tcuModuleGetSurfRef *cuModuleGetSurfRef; -extern tcuMemGetInfo *cuMemGetInfo; -extern tcuMemAlloc *cuMemAlloc; -extern tcuMemAllocPitch *cuMemAllocPitch; -extern tcuMemFree *cuMemFree; -extern tcuMemGetAddressRange *cuMemGetAddressRange; -extern tcuMemAllocHost *cuMemAllocHost; -extern tcuMemFreeHost *cuMemFreeHost; -extern tcuMemHostAlloc *cuMemHostAlloc; -extern tcuMemHostGetDevicePointer *cuMemHostGetDevicePointer; -extern tcuMemHostGetFlags *cuMemHostGetFlags; -extern tcuMemcpyHtoD *cuMemcpyHtoD; -extern tcuMemcpyDtoH *cuMemcpyDtoH; -extern tcuMemcpyDtoD *cuMemcpyDtoD; -extern tcuMemcpyDtoA *cuMemcpyDtoA; -extern tcuMemcpyAtoD *cuMemcpyAtoD; -extern tcuMemcpyHtoA *cuMemcpyHtoA; -extern tcuMemcpyAtoH *cuMemcpyAtoH; -extern tcuMemcpyAtoA *cuMemcpyAtoA; -extern tcuMemcpy2D *cuMemcpy2D; -extern tcuMemcpy2DUnaligned *cuMemcpy2DUnaligned; -extern tcuMemcpy3D *cuMemcpy3D; -extern tcuMemcpyHtoDAsync *cuMemcpyHtoDAsync; -extern tcuMemcpyDtoHAsync *cuMemcpyDtoHAsync; -extern tcuMemcpyDtoDAsync *cuMemcpyDtoDAsync; -extern tcuMemcpyHtoAAsync *cuMemcpyHtoAAsync; -extern tcuMemcpyAtoHAsync *cuMemcpyAtoHAsync; -extern tcuMemcpy2DAsync *cuMemcpy2DAsync; -extern tcuMemcpy3DAsync *cuMemcpy3DAsync; -extern tcuMemsetD8 *cuMemsetD8; -extern tcuMemsetD16 *cuMemsetD16; -extern tcuMemsetD32 *cuMemsetD32; -extern tcuMemsetD2D8 *cuMemsetD2D8; -extern tcuMemsetD2D16 *cuMemsetD2D16; -extern tcuMemsetD2D32 *cuMemsetD2D32; -extern tcuFuncSetBlockShape *cuFuncSetBlockShape; -extern tcuFuncSetSharedSize *cuFuncSetSharedSize; -extern tcuFuncGetAttribute *cuFuncGetAttribute; -extern 
tcuFuncSetCacheConfig *cuFuncSetCacheConfig; -extern tcuArrayCreate *cuArrayCreate; -extern tcuArrayGetDescriptor *cuArrayGetDescriptor; -extern tcuArrayDestroy *cuArrayDestroy; -extern tcuArray3DCreate *cuArray3DCreate; -extern tcuArray3DGetDescriptor *cuArray3DGetDescriptor; -extern tcuTexRefCreate *cuTexRefCreate; -extern tcuTexRefDestroy *cuTexRefDestroy; -extern tcuTexRefSetArray *cuTexRefSetArray; -extern tcuTexRefSetAddress *cuTexRefSetAddress; -extern tcuTexRefSetAddress2D *cuTexRefSetAddress2D; -extern tcuTexRefSetFormat *cuTexRefSetFormat; -extern tcuTexRefSetAddressMode *cuTexRefSetAddressMode; -extern tcuTexRefSetFilterMode *cuTexRefSetFilterMode; -extern tcuTexRefSetFlags *cuTexRefSetFlags; -extern tcuTexRefGetAddress *cuTexRefGetAddress; -extern tcuTexRefGetArray *cuTexRefGetArray; -extern tcuTexRefGetAddressMode *cuTexRefGetAddressMode; -extern tcuTexRefGetFilterMode *cuTexRefGetFilterMode; -extern tcuTexRefGetFormat *cuTexRefGetFormat; -extern tcuTexRefGetFlags *cuTexRefGetFlags; -extern tcuSurfRefSetArray *cuSurfRefSetArray; -extern tcuSurfRefGetArray *cuSurfRefGetArray; -extern tcuParamSetSize *cuParamSetSize; -extern tcuParamSeti *cuParamSeti; -extern tcuParamSetf *cuParamSetf; -extern tcuParamSetv *cuParamSetv; -extern tcuParamSetTexRef *cuParamSetTexRef; -extern tcuLaunch *cuLaunch; -extern tcuLaunchGrid *cuLaunchGrid; -extern tcuLaunchGridAsync *cuLaunchGridAsync; -extern tcuEventCreate *cuEventCreate; -extern tcuEventRecord *cuEventRecord; -extern tcuEventQuery *cuEventQuery; -extern tcuEventSynchronize *cuEventSynchronize; -extern tcuEventDestroy *cuEventDestroy; -extern tcuEventElapsedTime *cuEventElapsedTime; -extern tcuStreamCreate *cuStreamCreate; -extern tcuStreamQuery *cuStreamQuery; -extern tcuStreamSynchronize *cuStreamSynchronize; -extern tcuStreamDestroy *cuStreamDestroy; -extern tcuGraphicsUnregisterResource *cuGraphicsUnregisterResource; -extern tcuGraphicsSubResourceGetMappedArray *cuGraphicsSubResourceGetMappedArray; -extern 
tcuGraphicsResourceGetMappedPointer *cuGraphicsResourceGetMappedPointer; -extern tcuGraphicsResourceSetMapFlags *cuGraphicsResourceSetMapFlags; -extern tcuGraphicsMapResources *cuGraphicsMapResources; -extern tcuGraphicsUnmapResources *cuGraphicsUnmapResources; -extern tcuGetExportTable *cuGetExportTable; -extern tcuCtxSetLimit *cuCtxSetLimit; -extern tcuCtxGetLimit *cuCtxGetLimit; -extern tcuGLCtxCreate *cuGLCtxCreate; -extern tcuGraphicsGLRegisterBuffer *cuGraphicsGLRegisterBuffer; -extern tcuGraphicsGLRegisterImage *cuGraphicsGLRegisterImage; -extern tcuCtxSetCurrent *cuCtxSetCurrent; - -#endif /* __UTIL_CUDA_H__ */ - diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h index da6fae79bb9..397133618be 100644 --- a/intern/cycles/util/util_half.h +++ b/intern/cycles/util/util_half.h @@ -68,18 +68,18 @@ ccl_device_inline void float4_store_half(half *h, float4 f, float scale) } #else /* same as above with SSE */ - const __m128 mm_scale = _mm_set_ps1(scale); - const __m128i mm_38800000 = _mm_set1_epi32(0x38800000); - const __m128i mm_7FFF = _mm_set1_epi32(0x7FFF); - const __m128i mm_7FFFFFFF = _mm_set1_epi32(0x7FFFFFFF); - const __m128i mm_C8000000 = _mm_set1_epi32(0xC8000000); - - __m128 mm_fscale = _mm_mul_ps(load_m128(f), mm_scale); - __m128i x = _mm_castps_si128(_mm_min_ps(_mm_max_ps(mm_fscale, _mm_set_ps1(0.0f)), _mm_set_ps1(65500.0f))); - __m128i absolute = _mm_and_si128(x, mm_7FFFFFFF); - __m128i Z = _mm_add_epi32(absolute, mm_C8000000); - __m128i result = _mm_andnot_si128(_mm_cmplt_epi32(absolute, mm_38800000), Z); - __m128i rh = _mm_and_si128(_mm_srai_epi32(result, 13), mm_7FFF); + const ssef mm_scale = ssef(scale); + const ssei mm_38800000 = ssei(0x38800000); + const ssei mm_7FFF = ssei(0x7FFF); + const ssei mm_7FFFFFFF = ssei(0x7FFFFFFF); + const ssei mm_C8000000 = ssei(0xC8000000); + + ssef mm_fscale = load4f(f) * mm_scale; + ssei x = cast(min(max(mm_fscale, ssef(0.0f)), ssef(65500.0f))); + ssei absolute = x & mm_7FFFFFFF; + ssei Z 
= absolute + mm_C8000000; + ssei result = andnot(absolute < mm_38800000, Z); + ssei rh = (result >> 13) & mm_7FFF; _mm_storel_pi((__m64*)h, _mm_castsi128_ps(_mm_packs_epi32(rh, rh))); #endif diff --git a/intern/cycles/util/util_logging.cpp b/intern/cycles/util/util_logging.cpp new file mode 100644 index 00000000000..0722f16cf45 --- /dev/null +++ b/intern/cycles/util/util_logging.cpp @@ -0,0 +1,33 @@ +/* + * Copyright 2011-2014 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License + */ + +#include <util_logging.h> + +#include "util_math.h" + +CCL_NAMESPACE_BEGIN + +std::ostream& operator <<(std::ostream &os, + const float3 &value) +{ + os << "(" << value.x + << ", " << value.y + << ", " << value.z + << ")"; + return os; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_logging.h b/intern/cycles/util/util_logging.h new file mode 100644 index 00000000000..991789e7460 --- /dev/null +++ b/intern/cycles/util/util_logging.h @@ -0,0 +1,53 @@ +/* + * Copyright 2011-2014 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License + */ + +#ifndef __UTIL_LOGGING_H__ +#define __UTIL_LOGGING_H__ + +#if defined(WITH_CYCLES_LOGGING) && !defined(__KERNEL_GPU__) +# include <glog/logging.h> +#else +# include <iostream> +#endif + +CCL_NAMESPACE_BEGIN + +#if !defined(WITH_CYCLES_LOGGING) || defined(__KERNEL_GPU__) +class StubStream : public std::ostream { + public: + StubStream() : std::ostream(NULL) { } +}; + +class LogMessageVoidify { +public: + LogMessageVoidify() { } + void operator&(::std::ostream&) { } +}; + +# define LOG_SUPPRESS() (true) ? (void) 0 : LogMessageVoidify() & StubStream() +# define LOG(severity) LOG_SUPPRESS() +# define VLOG(severity) LOG_SUPPRESS() + +#endif + +class float3; + +std::ostream& operator <<(std::ostream &os, + const float3 &value); + +CCL_NAMESPACE_END + +#endif /* __UTIL_LOGGING_H__ */ diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h index ded75762cd2..c332e1709db 100644 --- a/intern/cycles/util/util_math.h +++ b/intern/cycles/util/util_math.h @@ -76,17 +76,6 @@ CCL_NAMESPACE_BEGIN #ifdef _WIN32 -#ifndef __KERNEL_GPU__ - -#if defined(_MSC_VER) && (_MSC_VER < 1800) -# define copysignf(x, y) ((float)_copysign(x, y)) -# define hypotf(x, y) _hypotf(x, y) -# define isnan(x) _isnan(x) -# define isfinite(x) _finite(x) -#endif - -#endif - #ifndef __KERNEL_OPENCL__ ccl_device_inline float fmaxf(float a, float b) @@ -622,11 +611,7 @@ ccl_device_inline bool is_zero(const float3 a) ccl_device_inline float reduce_add(const float3 a) { -#ifdef __KERNEL_SSE__ return (a.x + a.y + a.z); -#else - return (a.x + a.y + a.z); -#endif } ccl_device_inline float average(const float3 a) @@ -857,7 +842,6 @@ ccl_device_inline float4 max(float4 a, float4 b) ccl_device_inline float4 select(const int4& mask, const float4& a, const float4& b) { #ifdef __KERNEL_SSE__ - /* blendv is sse4, and apparently broken on vs2008 */ return 
_mm_or_ps(_mm_and_ps(_mm_cvtepi32_ps(mask), a), _mm_andnot_ps(_mm_cvtepi32_ps(mask), b)); /* todo: avoid cvt */ #else return make_float4((mask.x)? a.x: b.x, (mask.y)? a.y: b.y, (mask.z)? a.z: b.z, (mask.w)? a.w: b.w); @@ -1429,6 +1413,27 @@ ccl_device bool ray_quad_intersect( return false; } +/* projections */ +ccl_device bool map_to_sphere(float *r_u, float *r_v, + const float x, const float y, const float z) +{ + float len = sqrtf(x * x + y * y + z * z); + if(len > 0.0f) { + if(UNLIKELY(x == 0.0f && y == 0.0f)) { + *r_u = 0.0f; /* othwise domain error */ + } + else { + *r_u = (1.0f - atan2f(x, y) / M_PI_F) / 2.0f; + } + *r_v = 1.0f - safe_acosf(z / len) / M_PI_F; + return true; + } + else { + *r_v = *r_u = 0.0f; /* to avoid un-initialized variables */ + return false; + } +} + CCL_NAMESPACE_END #endif /* __UTIL_MATH_H__ */ diff --git a/intern/cycles/util/util_opencl.cpp b/intern/cycles/util/util_opencl.cpp deleted file mode 100644 index c2d6bc66dc1..00000000000 --- a/intern/cycles/util/util_opencl.cpp +++ /dev/null @@ -1,337 +0,0 @@ -////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2009 Organic Vectory B.V. -// Written by George van Venrooij -// -// Distributed under the Boost Software License, Version 1.0. 
-// (See accompanying file doc/license/Boost.txt) -// Extracted from the CLCC project - http://clcc.sourceforge.net/ -////////////////////////////////////////////////////////////////////////// - -#include <stdlib.h> - -#include "util_opencl.h" - -#ifndef CLCC_GENERATE_DOCUMENTATION -#ifdef _WIN32 -# define WIN32_LEAN_AND_MEAN -# define VC_EXTRALEAN -# include <windows.h> - - typedef HMODULE CLCC_DYNLIB_HANDLE; - -# define CLCC_DYNLIB_OPEN LoadLibrary -# define CLCC_DYNLIB_CLOSE FreeLibrary -# define CLCC_DYNLIB_IMPORT GetProcAddress -#else -# include <dlfcn.h> - - typedef void* CLCC_DYNLIB_HANDLE; - -# define CLCC_DYNLIB_OPEN(path) dlopen(path, RTLD_NOW | RTLD_GLOBAL) -# define CLCC_DYNLIB_CLOSE dlclose -# define CLCC_DYNLIB_IMPORT dlsym -#endif -#else -// typedef implementation_defined CLCC_DYNLIB_HANDLE; -//# define CLCC_DYNLIB_OPEN(path) implementation_defined -//# define CLCC_DYNLIB_CLOSE implementation_defined -//# define CLCC_DYNLIB_IMPORT implementation_defined -#endif - -CCL_NAMESPACE_BEGIN - -//! 
\brief module handle -static CLCC_DYNLIB_HANDLE module = NULL; - -// Variables holding function entry points -#ifndef CLCC_GENERATE_DOCUMENTATION -PFNCLGETPLATFORMIDS __clewGetPlatformIDs = NULL; -PFNCLGETPLATFORMINFO __clewGetPlatformInfo = NULL; -PFNCLGETDEVICEIDS __clewGetDeviceIDs = NULL; -PFNCLGETDEVICEINFO __clewGetDeviceInfo = NULL; -PFNCLCREATECONTEXT __clewCreateContext = NULL; -PFNCLCREATECONTEXTFROMTYPE __clewCreateContextFromType = NULL; -PFNCLRETAINCONTEXT __clewRetainContext = NULL; -PFNCLRELEASECONTEXT __clewReleaseContext = NULL; -PFNCLGETCONTEXTINFO __clewGetContextInfo = NULL; -PFNCLCREATECOMMANDQUEUE __clewCreateCommandQueue = NULL; -PFNCLRETAINCOMMANDQUEUE __clewRetainCommandQueue = NULL; -PFNCLRELEASECOMMANDQUEUE __clewReleaseCommandQueue = NULL; -PFNCLGETCOMMANDQUEUEINFO __clewGetCommandQueueInfo = NULL; -PFNCLSETCOMMANDQUEUEPROPERTY __clewSetCommandQueueProperty = NULL; -PFNCLCREATEBUFFER __clewCreateBuffer = NULL; -PFNCLCREATEIMAGE2D __clewCreateImage2D = NULL; -PFNCLCREATEIMAGE3D __clewCreateImage3D = NULL; -PFNCLRETAINMEMOBJECT __clewRetainMemObject = NULL; -PFNCLRELEASEMEMOBJECT __clewReleaseMemObject = NULL; -PFNCLGETSUPPORTEDIMAGEFORMATS __clewGetSupportedImageFormats = NULL; -PFNCLGETMEMOBJECTINFO __clewGetMemObjectInfo = NULL; -PFNCLGETIMAGEINFO __clewGetImageInfo = NULL; -PFNCLCREATESAMPLER __clewCreateSampler = NULL; -PFNCLRETAINSAMPLER __clewRetainSampler = NULL; -PFNCLRELEASESAMPLER __clewReleaseSampler = NULL; -PFNCLGETSAMPLERINFO __clewGetSamplerInfo = NULL; -PFNCLCREATEPROGRAMWITHSOURCE __clewCreateProgramWithSource = NULL; -PFNCLCREATEPROGRAMWITHBINARY __clewCreateProgramWithBinary = NULL; -PFNCLRETAINPROGRAM __clewRetainProgram = NULL; -PFNCLRELEASEPROGRAM __clewReleaseProgram = NULL; -PFNCLBUILDPROGRAM __clewBuildProgram = NULL; -PFNCLUNLOADCOMPILER __clewUnloadCompiler = NULL; -PFNCLGETPROGRAMINFO __clewGetProgramInfo = NULL; -PFNCLGETPROGRAMBUILDINFO __clewGetProgramBuildInfo = NULL; -PFNCLCREATEKERNEL __clewCreateKernel = 
NULL; -PFNCLCREATEKERNELSINPROGRAM __clewCreateKernelsInProgram = NULL; -PFNCLRETAINKERNEL __clewRetainKernel = NULL; -PFNCLRELEASEKERNEL __clewReleaseKernel = NULL; -PFNCLSETKERNELARG __clewSetKernelArg = NULL; -PFNCLGETKERNELINFO __clewGetKernelInfo = NULL; -PFNCLGETKERNELWORKGROUPINFO __clewGetKernelWorkGroupInfo = NULL; -PFNCLWAITFOREVENTS __clewWaitForEvents = NULL; -PFNCLGETEVENTINFO __clewGetEventInfo = NULL; -PFNCLRETAINEVENT __clewRetainEvent = NULL; -PFNCLRELEASEEVENT __clewReleaseEvent = NULL; -PFNCLGETEVENTPROFILINGINFO __clewGetEventProfilingInfo = NULL; -PFNCLFLUSH __clewFlush = NULL; -PFNCLFINISH __clewFinish = NULL; -PFNCLENQUEUEREADBUFFER __clewEnqueueReadBuffer = NULL; -PFNCLENQUEUEWRITEBUFFER __clewEnqueueWriteBuffer = NULL; -PFNCLENQUEUECOPYBUFFER __clewEnqueueCopyBuffer = NULL; -PFNCLENQUEUEREADIMAGE __clewEnqueueReadImage = NULL; -PFNCLENQUEUEWRITEIMAGE __clewEnqueueWriteImage = NULL; -PFNCLENQUEUECOPYIMAGE __clewEnqueueCopyImage = NULL; -PFNCLENQUEUECOPYIMAGETOBUFFER __clewEnqueueCopyImageToBuffer = NULL; -PFNCLENQUEUECOPYBUFFERTOIMAGE __clewEnqueueCopyBufferToImage = NULL; -PFNCLENQUEUEMAPBUFFER __clewEnqueueMapBuffer = NULL; -PFNCLENQUEUEMAPIMAGE __clewEnqueueMapImage = NULL; -PFNCLENQUEUEUNMAPMEMOBJECT __clewEnqueueUnmapMemObject = NULL; -PFNCLENQUEUENDRANGEKERNEL __clewEnqueueNDRangeKernel = NULL; -PFNCLENQUEUETASK __clewEnqueueTask = NULL; -PFNCLENQUEUENATIVEKERNEL __clewEnqueueNativeKernel = NULL; -PFNCLENQUEUEMARKER __clewEnqueueMarker = NULL; -PFNCLENQUEUEWAITFOREVENTS __clewEnqueueWaitForEvents = NULL; -PFNCLENQUEUEBARRIER __clewEnqueueBarrier = NULL; -PFNCLGETEXTENSIONFUNCTIONADDRESS __clewGetExtensionFunctionAddress = NULL; -#endif // CLCC_GENERATE_DOCUMENTATION - - -#if 0 -//! \brief Unloads OpenCL dynamic library, should not be called directly -static void clewExit(void) -{ - if (module != NULL) - { - // Ignore errors - CLCC_DYNLIB_CLOSE(module); - module = NULL; - } -} -#endif - -//! 
\param path path to dynamic library to load -//! \return CLEW_ERROR_OPEN_FAILED if the library could not be opened -//! CLEW_ERROR_ATEXIT_FAILED if atexit(clewExit) failed -//! CLEW_SUCCESS when the library was succesfully loaded -int clLibraryInit() -{ -#ifdef _WIN32 - const char *path = "OpenCL.dll"; -#elif defined(__APPLE__) - const char *path = "/Library/Frameworks/OpenCL.framework/OpenCL"; -#else - const char *path = "libOpenCL.so"; -#endif - - // OpenCL disabled for now, only works with this environment variable set - if(!getenv("CYCLES_OPENCL_TEST")) - return 0; - - // Check if already initialized - if (module != NULL) - { - return 1; - } - - // Load library - module = CLCC_DYNLIB_OPEN(path); - - // Check for errors - if (module == NULL) - { - return 0; - } - - // Disabled because we retain OpenCL context and it's difficult to ensure - // this will exit after releasing the context -#if 0 - // Set unloading - int error = atexit(clewExit); - - if (error) - { - // Failure queing atexit, shutdown with error - CLCC_DYNLIB_CLOSE(module); - module = NULL; - - return 0; - } -#endif - - // Determine function entry-points - __clewGetPlatformIDs = (PFNCLGETPLATFORMIDS )CLCC_DYNLIB_IMPORT(module, "clGetPlatformIDs"); - __clewGetPlatformInfo = (PFNCLGETPLATFORMINFO )CLCC_DYNLIB_IMPORT(module, "clGetPlatformInfo"); - __clewGetDeviceIDs = (PFNCLGETDEVICEIDS )CLCC_DYNLIB_IMPORT(module, "clGetDeviceIDs"); - __clewGetDeviceInfo = (PFNCLGETDEVICEINFO )CLCC_DYNLIB_IMPORT(module, "clGetDeviceInfo"); - __clewCreateContext = (PFNCLCREATECONTEXT )CLCC_DYNLIB_IMPORT(module, "clCreateContext"); - __clewCreateContextFromType = (PFNCLCREATECONTEXTFROMTYPE )CLCC_DYNLIB_IMPORT(module, "clCreateContextFromType"); - __clewRetainContext = (PFNCLRETAINCONTEXT )CLCC_DYNLIB_IMPORT(module, "clRetainContext"); - __clewReleaseContext = (PFNCLRELEASECONTEXT )CLCC_DYNLIB_IMPORT(module, "clReleaseContext"); - __clewGetContextInfo = (PFNCLGETCONTEXTINFO )CLCC_DYNLIB_IMPORT(module, 
"clGetContextInfo"); - __clewCreateCommandQueue = (PFNCLCREATECOMMANDQUEUE )CLCC_DYNLIB_IMPORT(module, "clCreateCommandQueue"); - __clewRetainCommandQueue = (PFNCLRETAINCOMMANDQUEUE )CLCC_DYNLIB_IMPORT(module, "clRetainCommandQueue"); - __clewReleaseCommandQueue = (PFNCLRELEASECOMMANDQUEUE )CLCC_DYNLIB_IMPORT(module, "clReleaseCommandQueue"); - __clewGetCommandQueueInfo = (PFNCLGETCOMMANDQUEUEINFO )CLCC_DYNLIB_IMPORT(module, "clGetCommandQueueInfo"); - __clewSetCommandQueueProperty = (PFNCLSETCOMMANDQUEUEPROPERTY )CLCC_DYNLIB_IMPORT(module, "clSetCommandQueueProperty"); - __clewCreateBuffer = (PFNCLCREATEBUFFER )CLCC_DYNLIB_IMPORT(module, "clCreateBuffer"); - __clewCreateImage2D = (PFNCLCREATEIMAGE2D )CLCC_DYNLIB_IMPORT(module, "clCreateImage2D"); - __clewCreateImage3D = (PFNCLCREATEIMAGE3D )CLCC_DYNLIB_IMPORT(module, "clCreateImage3D"); - __clewRetainMemObject = (PFNCLRETAINMEMOBJECT )CLCC_DYNLIB_IMPORT(module, "clRetainMemObject"); - __clewReleaseMemObject = (PFNCLRELEASEMEMOBJECT )CLCC_DYNLIB_IMPORT(module, "clReleaseMemObject"); - __clewGetSupportedImageFormats = (PFNCLGETSUPPORTEDIMAGEFORMATS )CLCC_DYNLIB_IMPORT(module, "clGetSupportedImageFormats"); - __clewGetMemObjectInfo = (PFNCLGETMEMOBJECTINFO )CLCC_DYNLIB_IMPORT(module, "clGetMemObjectInfo"); - __clewGetImageInfo = (PFNCLGETIMAGEINFO )CLCC_DYNLIB_IMPORT(module, "clGetImageInfo"); - __clewCreateSampler = (PFNCLCREATESAMPLER )CLCC_DYNLIB_IMPORT(module, "clCreateSampler"); - __clewRetainSampler = (PFNCLRETAINSAMPLER )CLCC_DYNLIB_IMPORT(module, "clRetainSampler"); - __clewReleaseSampler = (PFNCLRELEASESAMPLER )CLCC_DYNLIB_IMPORT(module, "clReleaseSampler"); - __clewGetSamplerInfo = (PFNCLGETSAMPLERINFO )CLCC_DYNLIB_IMPORT(module, "clGetSamplerInfo"); - __clewCreateProgramWithSource = (PFNCLCREATEPROGRAMWITHSOURCE )CLCC_DYNLIB_IMPORT(module, "clCreateProgramWithSource"); - __clewCreateProgramWithBinary = (PFNCLCREATEPROGRAMWITHBINARY )CLCC_DYNLIB_IMPORT(module, "clCreateProgramWithBinary"); - 
__clewRetainProgram = (PFNCLRETAINPROGRAM )CLCC_DYNLIB_IMPORT(module, "clRetainProgram"); - __clewReleaseProgram = (PFNCLRELEASEPROGRAM )CLCC_DYNLIB_IMPORT(module, "clReleaseProgram"); - __clewBuildProgram = (PFNCLBUILDPROGRAM )CLCC_DYNLIB_IMPORT(module, "clBuildProgram"); - __clewUnloadCompiler = (PFNCLUNLOADCOMPILER )CLCC_DYNLIB_IMPORT(module, "clUnloadCompiler"); - __clewGetProgramInfo = (PFNCLGETPROGRAMINFO )CLCC_DYNLIB_IMPORT(module, "clGetProgramInfo"); - __clewGetProgramBuildInfo = (PFNCLGETPROGRAMBUILDINFO )CLCC_DYNLIB_IMPORT(module, "clGetProgramBuildInfo"); - __clewCreateKernel = (PFNCLCREATEKERNEL )CLCC_DYNLIB_IMPORT(module, "clCreateKernel"); - __clewCreateKernelsInProgram = (PFNCLCREATEKERNELSINPROGRAM )CLCC_DYNLIB_IMPORT(module, "clCreateKernelsInProgram"); - __clewRetainKernel = (PFNCLRETAINKERNEL )CLCC_DYNLIB_IMPORT(module, "clRetainKernel"); - __clewReleaseKernel = (PFNCLRELEASEKERNEL )CLCC_DYNLIB_IMPORT(module, "clReleaseKernel"); - __clewSetKernelArg = (PFNCLSETKERNELARG )CLCC_DYNLIB_IMPORT(module, "clSetKernelArg"); - __clewGetKernelInfo = (PFNCLGETKERNELINFO )CLCC_DYNLIB_IMPORT(module, "clGetKernelInfo"); - __clewGetKernelWorkGroupInfo = (PFNCLGETKERNELWORKGROUPINFO )CLCC_DYNLIB_IMPORT(module, "clGetKernelWorkGroupInfo"); - __clewWaitForEvents = (PFNCLWAITFOREVENTS )CLCC_DYNLIB_IMPORT(module, "clWaitForEvents"); - __clewGetEventInfo = (PFNCLGETEVENTINFO )CLCC_DYNLIB_IMPORT(module, "clGetEventInfo"); - __clewRetainEvent = (PFNCLRETAINEVENT )CLCC_DYNLIB_IMPORT(module, "clRetainEvent"); - __clewReleaseEvent = (PFNCLRELEASEEVENT )CLCC_DYNLIB_IMPORT(module, "clReleaseEvent"); - __clewGetEventProfilingInfo = (PFNCLGETEVENTPROFILINGINFO )CLCC_DYNLIB_IMPORT(module, "clGetEventProfilingInfo"); - __clewFlush = (PFNCLFLUSH )CLCC_DYNLIB_IMPORT(module, "clFlush"); - __clewFinish = (PFNCLFINISH )CLCC_DYNLIB_IMPORT(module, "clFinish"); - __clewEnqueueReadBuffer = (PFNCLENQUEUEREADBUFFER )CLCC_DYNLIB_IMPORT(module, "clEnqueueReadBuffer"); - 
__clewEnqueueWriteBuffer = (PFNCLENQUEUEWRITEBUFFER )CLCC_DYNLIB_IMPORT(module, "clEnqueueWriteBuffer"); - __clewEnqueueCopyBuffer = (PFNCLENQUEUECOPYBUFFER )CLCC_DYNLIB_IMPORT(module, "clEnqueueCopyBuffer"); - __clewEnqueueReadImage = (PFNCLENQUEUEREADIMAGE )CLCC_DYNLIB_IMPORT(module, "clEnqueueReadImage"); - __clewEnqueueWriteImage = (PFNCLENQUEUEWRITEIMAGE )CLCC_DYNLIB_IMPORT(module, "clEnqueueWriteImage"); - __clewEnqueueCopyImage = (PFNCLENQUEUECOPYIMAGE )CLCC_DYNLIB_IMPORT(module, "clEnqueueCopyImage"); - __clewEnqueueCopyImageToBuffer = (PFNCLENQUEUECOPYIMAGETOBUFFER )CLCC_DYNLIB_IMPORT(module, "clEnqueueCopyImageToBuffer"); - __clewEnqueueCopyBufferToImage = (PFNCLENQUEUECOPYBUFFERTOIMAGE )CLCC_DYNLIB_IMPORT(module, "clEnqueueCopyBufferToImage"); - __clewEnqueueMapBuffer = (PFNCLENQUEUEMAPBUFFER )CLCC_DYNLIB_IMPORT(module, "clEnqueueMapBuffer"); - __clewEnqueueMapImage = (PFNCLENQUEUEMAPIMAGE )CLCC_DYNLIB_IMPORT(module, "clEnqueueMapImage"); - __clewEnqueueUnmapMemObject = (PFNCLENQUEUEUNMAPMEMOBJECT )CLCC_DYNLIB_IMPORT(module, "clEnqueueUnmapMemObject"); - __clewEnqueueNDRangeKernel = (PFNCLENQUEUENDRANGEKERNEL )CLCC_DYNLIB_IMPORT(module, "clEnqueueNDRangeKernel"); - __clewEnqueueTask = (PFNCLENQUEUETASK )CLCC_DYNLIB_IMPORT(module, "clEnqueueTask"); - __clewEnqueueNativeKernel = (PFNCLENQUEUENATIVEKERNEL )CLCC_DYNLIB_IMPORT(module, "clEnqueueNativeKernel"); - __clewEnqueueMarker = (PFNCLENQUEUEMARKER )CLCC_DYNLIB_IMPORT(module, "clEnqueueMarker"); - __clewEnqueueWaitForEvents = (PFNCLENQUEUEWAITFOREVENTS )CLCC_DYNLIB_IMPORT(module, "clEnqueueWaitForEvents"); - __clewEnqueueBarrier = (PFNCLENQUEUEBARRIER )CLCC_DYNLIB_IMPORT(module, "clEnqueueBarrier"); - __clewGetExtensionFunctionAddress = (PFNCLGETEXTENSIONFUNCTIONADDRESS )CLCC_DYNLIB_IMPORT(module, "clGetExtensionFunctionAddress"); - - if(__clewGetPlatformIDs == NULL) return 0; - if(__clewGetPlatformInfo == NULL) return 0; - if(__clewGetDeviceIDs == NULL) return 0; - if(__clewGetDeviceInfo == NULL) return 
0; - - return 1; -} - -//! \param error CL error code -//! \return a string representation of the error code -const char *clErrorString(cl_int error) -{ - static const char* strings[] = - { - // Error Codes - "CL_SUCCESS" // 0 - , "CL_DEVICE_NOT_FOUND" // -1 - , "CL_DEVICE_NOT_AVAILABLE" // -2 - , "CL_COMPILER_NOT_AVAILABLE" // -3 - , "CL_MEM_OBJECT_ALLOCATION_FAILURE" // -4 - , "CL_OUT_OF_RESOURCES" // -5 - , "CL_OUT_OF_HOST_MEMORY" // -6 - , "CL_PROFILING_INFO_NOT_AVAILABLE" // -7 - , "CL_MEM_COPY_OVERLAP" // -8 - , "CL_IMAGE_FORMAT_MISMATCH" // -9 - , "CL_IMAGE_FORMAT_NOT_SUPPORTED" // -10 - , "CL_BUILD_PROGRAM_FAILURE" // -11 - , "CL_MAP_FAILURE" // -12 - - , "" // -13 - , "" // -14 - , "" // -15 - , "" // -16 - , "" // -17 - , "" // -18 - , "" // -19 - - , "" // -20 - , "" // -21 - , "" // -22 - , "" // -23 - , "" // -24 - , "" // -25 - , "" // -26 - , "" // -27 - , "" // -28 - , "" // -29 - - , "CL_INVALID_VALUE" // -30 - , "CL_INVALID_DEVICE_TYPE" // -31 - , "CL_INVALID_PLATFORM" // -32 - , "CL_INVALID_DEVICE" // -33 - , "CL_INVALID_CONTEXT" // -34 - , "CL_INVALID_QUEUE_PROPERTIES" // -35 - , "CL_INVALID_COMMAND_QUEUE" // -36 - , "CL_INVALID_HOST_PTR" // -37 - , "CL_INVALID_MEM_OBJECT" // -38 - , "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR" // -39 - , "CL_INVALID_IMAGE_SIZE" // -40 - , "CL_INVALID_SAMPLER" // -41 - , "CL_INVALID_BINARY" // -42 - , "CL_INVALID_BUILD_OPTIONS" // -43 - , "CL_INVALID_PROGRAM" // -44 - , "CL_INVALID_PROGRAM_EXECUTABLE" // -45 - , "CL_INVALID_KERNEL_NAME" // -46 - , "CL_INVALID_KERNEL_DEFINITION" // -47 - , "CL_INVALID_KERNEL" // -48 - , "CL_INVALID_ARG_INDEX" // -49 - , "CL_INVALID_ARG_VALUE" // -50 - , "CL_INVALID_ARG_SIZE" // -51 - , "CL_INVALID_KERNEL_ARGS" // -52 - , "CL_INVALID_WORK_DIMENSION" // -53 - , "CL_INVALID_WORK_GROUP_SIZE" // -54 - , "CL_INVALID_WORK_ITEM_SIZE" // -55 - , "CL_INVALID_GLOBAL_OFFSET" // -56 - , "CL_INVALID_EVENT_WAIT_LIST" // -57 - , "CL_INVALID_EVENT" // -58 - , "CL_INVALID_OPERATION" // -59 - , 
"CL_INVALID_GL_OBJECT" // -60 - , "CL_INVALID_BUFFER_SIZE" // -61 - , "CL_INVALID_MIP_LEVEL" // -62 - , "CL_INVALID_GLOBAL_WORK_SIZE" // -63 - }; - - return strings[-error]; -} - -CCL_NAMESPACE_END - -#ifdef CLCC_DYNLIB_CLOSE -#endif diff --git a/intern/cycles/util/util_opencl.h b/intern/cycles/util/util_opencl.h deleted file mode 100644 index 141c5e38273..00000000000 --- a/intern/cycles/util/util_opencl.h +++ /dev/null @@ -1,1313 +0,0 @@ -////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2009 Organic Vectory B.V. -// Written by George van Venrooij -// -// Distributed under the Boost Software License, Version 1.0. -// (See accompanying file doc/license/Boost.txt) -// Extracted from the CLCC project - http://clcc.sourceforge.net/ -////////////////////////////////////////////////////////////////////////// - -#ifndef __UTIL_OPENCL_H__ -#define __UTIL_OPENCL_H__ - -CCL_NAMESPACE_BEGIN - -//! This file contains a copy of the contents of CL.H and CL_PLATFORM.H from the -//! official OpenCL spec. The purpose of this code is to load the OpenCL dynamic -//! library at run-time and thus allow the executable to function on many -//! platforms regardless of the vendor of the OpenCL driver actually installed. -//! Some of the techniques used here were inspired by work done in the GLEW -//! library (http://glew.sourceforge.net/) - -// Run-time dynamic linking functionality based on concepts used in GLEW -#ifdef __OPENCL_CL_H -#error cl.h included before clew.h -#endif - -#ifdef __OPENCL_CL_PLATFORM_H -#error cl_platform.h included before clew.h -#endif - -#ifndef CLCC_GENERATE_DOCUMENTATION -// Prevent cl.h inclusion -#define __OPENCL_CL_H -// Prevent cl_platform.h inclusion -#define __CL_PLATFORM_H -#endif // CLCC_GENERATE_DOCUMENTATION - -/******************************************************************************* - * Copyright (c) 2008-2009 The Khronos Group Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and/or associated documentation files (the - * "Materials"), to deal in the Materials without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Materials, and to - * permit persons to whom the Materials are furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Materials. - * - * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. - ******************************************************************************/ - -#ifndef CLCC_GENERATE_DOCUMENTATION - -#if defined(_WIN32) -#define CL_API_ENTRY -#define CL_API_CALL __stdcall -#else -#define CL_API_ENTRY -#define CL_API_CALL -#endif - -#define CL_API_SUFFIX__VERSION_1_0 - -#if defined(_WIN32) && defined(_MSC_VER) - -/* scalar types */ -typedef signed __int8 cl_char; -typedef unsigned __int8 cl_uchar; -typedef signed __int16 cl_short; -typedef unsigned __int16 cl_ushort; -typedef signed __int32 cl_int; -typedef unsigned __int32 cl_uint; -typedef signed __int64 cl_long; -typedef unsigned __int64 cl_ulong; - -typedef unsigned __int16 cl_half; -typedef float cl_float; -typedef double cl_double; - - -/* - * Vector types - * - * Note: OpenCL requires that all types be naturally aligned. - * This means that vector types must be naturally aligned. 
- * For example, a vector of four floats must be aligned to - * a 16 byte boundary (calculated as 4 * the natural 4-byte - * alignment of the float). The alignment qualifiers here - * will only function properly if your compiler supports them - * and if you don't actively work to defeat them. For example, - * in order for a cl_float4 to be 16 byte aligned in a struct, - * the start of the struct must itself be 16-byte aligned. - * - * Maintaining proper alignment is the user's responsibility. - */ -typedef signed __int8 cl_char2[2]; -typedef signed __int8 cl_char4[4]; -typedef signed __int8 cl_char8[8]; -typedef signed __int8 cl_char16[16]; -typedef unsigned __int8 cl_uchar2[2]; -typedef unsigned __int8 cl_uchar4[4]; -typedef unsigned __int8 cl_uchar8[8]; -typedef unsigned __int8 cl_uchar16[16]; - -typedef signed __int16 cl_short2[2]; -typedef signed __int16 cl_short4[4]; -typedef signed __int16 cl_short8[8]; -typedef signed __int16 cl_short16[16]; -typedef unsigned __int16 cl_ushort2[2]; -typedef unsigned __int16 cl_ushort4[4]; -typedef unsigned __int16 cl_ushort8[8]; -typedef unsigned __int16 cl_ushort16[16]; - -typedef signed __int32 cl_int2[2]; -typedef signed __int32 cl_int4[4]; -typedef signed __int32 cl_int8[8]; -typedef signed __int32 cl_int16[16]; -typedef unsigned __int32 cl_uint2[2]; -typedef unsigned __int32 cl_uint4[4]; -typedef unsigned __int32 cl_uint8[8]; -typedef unsigned __int32 cl_uint16[16]; - -typedef signed __int64 cl_long2[2]; -typedef signed __int64 cl_long4[4]; -typedef signed __int64 cl_long8[8]; -typedef signed __int64 cl_long16[16]; -typedef unsigned __int64 cl_ulong2[2]; -typedef unsigned __int64 cl_ulong4[4]; -typedef unsigned __int64 cl_ulong8[8]; -typedef unsigned __int64 cl_ulong16[16]; - -typedef float cl_float2[2]; -typedef float cl_float4[4]; -typedef float cl_float8[8]; -typedef float cl_float16[16]; - -typedef double cl_double2[2]; -typedef double cl_double4[4]; -typedef double cl_double8[8]; -typedef double cl_double16[16]; 
-/* There are no vector types for half */ - -#else - -#include <stdint.h> - -/* scalar types */ -typedef int8_t cl_char; -typedef uint8_t cl_uchar; -typedef int16_t cl_short __attribute__((aligned(2))); -typedef uint16_t cl_ushort __attribute__((aligned(2))); -typedef int32_t cl_int __attribute__((aligned(4))); -typedef uint32_t cl_uint __attribute__((aligned(4))); -typedef int64_t cl_long __attribute__((aligned(8))); -typedef uint64_t cl_ulong __attribute__((aligned(8))); - -typedef uint16_t cl_half __attribute__((aligned(2))); -typedef float cl_float __attribute__((aligned(4))); -typedef double cl_double __attribute__((aligned(8))); - -/* - * Vector types - * - * Note: OpenCL requires that all types be naturally aligned. - * This means that vector types must be naturally aligned. - * For example, a vector of four floats must be aligned to - * a 16 byte boundary (calculated as 4 * the natural 4-byte - * alignment of the float). The alignment qualifiers here - * will only function properly if your compiler supports them - * and if you don't actively work to defeat them. For example, - * in order for a cl_float4 to be 16 byte aligned in a struct, - * the start of the struct must itself be 16-byte aligned. - * - * Maintaining proper alignment is the user's responsibility. 
- */ -typedef int8_t cl_char2[2] __attribute__((aligned(2))); -typedef int8_t cl_char4[4] __attribute__((aligned(4))); -typedef int8_t cl_char8[8] __attribute__((aligned(8))); -typedef int8_t cl_char16[16] __attribute__((aligned(16))); -typedef uint8_t cl_uchar2[2] __attribute__((aligned(2))); -typedef uint8_t cl_uchar4[4] __attribute__((aligned(4))); -typedef uint8_t cl_uchar8[8] __attribute__((aligned(8))); -typedef uint8_t cl_uchar16[16] __attribute__((aligned(16))); - -typedef int16_t cl_short2[2] __attribute__((aligned(4))); -typedef int16_t cl_short4[4] __attribute__((aligned(8))); -typedef int16_t cl_short8[8] __attribute__((aligned(16))); -typedef int16_t cl_short16[16] __attribute__((aligned(32))); -typedef uint16_t cl_ushort2[2] __attribute__((aligned(4))); -typedef uint16_t cl_ushort4[4] __attribute__((aligned(8))); -typedef uint16_t cl_ushort8[8] __attribute__((aligned(16))); -typedef uint16_t cl_ushort16[16] __attribute__((aligned(32))); - -typedef int32_t cl_int2[2] __attribute__((aligned(8))); -typedef int32_t cl_int4[4] __attribute__((aligned(16))); -typedef int32_t cl_int8[8] __attribute__((aligned(32))); -typedef int32_t cl_int16[16] __attribute__((aligned(64))); -typedef uint32_t cl_uint2[2] __attribute__((aligned(8))); -typedef uint32_t cl_uint4[4] __attribute__((aligned(16))); -typedef uint32_t cl_uint8[8] __attribute__((aligned(32))); -typedef uint32_t cl_uint16[16] __attribute__((aligned(64))); - -typedef int64_t cl_long2[2] __attribute__((aligned(16))); -typedef int64_t cl_long4[4] __attribute__((aligned(32))); -typedef int64_t cl_long8[8] __attribute__((aligned(64))); -typedef int64_t cl_long16[16] __attribute__((aligned(128))); -typedef uint64_t cl_ulong2[2] __attribute__((aligned(16))); -typedef uint64_t cl_ulong4[4] __attribute__((aligned(32))); -typedef uint64_t cl_ulong8[8] __attribute__((aligned(64))); -typedef uint64_t cl_ulong16[16] __attribute__((aligned(128))); - -typedef float cl_float2[2] __attribute__((aligned(8))); -typedef 
float cl_float4[4] __attribute__((aligned(16))); -typedef float cl_float8[8] __attribute__((aligned(32))); -typedef float cl_float16[16] __attribute__((aligned(64))); - -typedef double cl_double2[2] __attribute__((aligned(16))); -typedef double cl_double4[4] __attribute__((aligned(32))); -typedef double cl_double8[8] __attribute__((aligned(64))); -typedef double cl_double16[16] __attribute__((aligned(128))); - -/* There are no vector types for half */ - -#endif - -/******************************************************************************/ - -// Macro names and corresponding values defined by OpenCL - -#define CL_CHAR_BIT 8 -#define CL_SCHAR_MAX 127 -#define CL_SCHAR_MIN (-127-1) -#define CL_CHAR_MAX CL_SCHAR_MAX -#define CL_CHAR_MIN CL_SCHAR_MIN -#define CL_UCHAR_MAX 255 -#define CL_SHRT_MAX 32767 -#define CL_SHRT_MIN (-32767-1) -#define CL_USHRT_MAX 65535 -#define CL_INT_MAX 2147483647 -#define CL_INT_MIN (-2147483647-1) -#define CL_UINT_MAX 0xffffffffU -#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) -#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) -#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) - -#define CL_FLT_DIG 6 -#define CL_FLT_MANT_DIG 24 -#define CL_FLT_MAX_10_EXP +38 -#define CL_FLT_MAX_EXP +128 -#define CL_FLT_MIN_10_EXP -37 -#define CL_FLT_MIN_EXP -125 -#define CL_FLT_RADIX 2 -#if defined(_MSC_VER) -// MSVC doesn't understand hex floats -#define CL_FLT_MAX 3.402823466e+38F -#define CL_FLT_MIN 1.175494351e-38F -#define CL_FLT_EPSILON 1.192092896e-07F -#else -#define CL_FLT_MAX 0x1.fffffep127f -#define CL_FLT_MIN 0x1.0p-126f -#define CL_FLT_EPSILON 0x1.0p-23f -#endif - -#define CL_DBL_DIG 15 -#define CL_DBL_MANT_DIG 53 -#define CL_DBL_MAX_10_EXP +308 -#define CL_DBL_MAX_EXP +1024 -#define CL_DBL_MIN_10_EXP -307 -#define CL_DBL_MIN_EXP -1021 -#define CL_DBL_RADIX 2 -#if defined(_MSC_VER) -// MSVC doesn't understand hex floats -#define CL_DBL_MAX 1.7976931348623158e+308 -#define CL_DBL_MIN 2.2250738585072014e-308 -#define 
CL_DBL_EPSILON 2.2204460492503131e-016 -#else -#define CL_DBL_MAX 0x1.fffffffffffffp1023 -#define CL_DBL_MIN 0x1.0p-1022 -#define CL_DBL_EPSILON 0x1.0p-52 -#endif - -#include <stddef.h> - - -// CL.h contents -/******************************************************************************/ - -typedef struct _cl_platform_id * cl_platform_id; -typedef struct _cl_device_id * cl_device_id; -typedef struct _cl_context * cl_context; -typedef struct _cl_command_queue * cl_command_queue; -typedef struct _cl_mem * cl_mem; -typedef struct _cl_program * cl_program; -typedef struct _cl_kernel * cl_kernel; -typedef struct _cl_event * cl_event; -typedef struct _cl_sampler * cl_sampler; - -/* WARNING! Unlike cl_ types in cl_platform.h, - * cl_bool is not guaranteed to be the same size as the bool in kernels. */ -typedef cl_uint cl_bool; -typedef cl_ulong cl_bitfield; -typedef cl_bitfield cl_device_type; -typedef cl_uint cl_platform_info; -typedef cl_uint cl_device_info; -typedef cl_bitfield cl_device_address_info; -typedef cl_bitfield cl_device_fp_config; -typedef cl_uint cl_device_mem_cache_type; -typedef cl_uint cl_device_local_mem_type; -typedef cl_bitfield cl_device_exec_capabilities; -typedef cl_bitfield cl_command_queue_properties; - -typedef intptr_t cl_context_properties; -typedef cl_uint cl_context_info; -typedef cl_uint cl_command_queue_info; -typedef cl_uint cl_channel_order; -typedef cl_uint cl_channel_type; -typedef cl_bitfield cl_mem_flags; -typedef cl_uint cl_mem_object_type; -typedef cl_uint cl_mem_info; -typedef cl_uint cl_image_info; -typedef cl_uint cl_addressing_mode; -typedef cl_uint cl_filter_mode; -typedef cl_uint cl_sampler_info; -typedef cl_bitfield cl_map_flags; -typedef cl_uint cl_program_info; -typedef cl_uint cl_program_build_info; -typedef cl_int cl_build_status; -typedef cl_uint cl_kernel_info; -typedef cl_uint cl_kernel_work_group_info; -typedef cl_uint cl_event_info; -typedef cl_uint cl_command_type; -typedef cl_uint cl_profiling_info; - -typedef 
struct _cl_image_format { - cl_channel_order image_channel_order; - cl_channel_type image_channel_data_type; -} cl_image_format; - - - -/******************************************************************************/ - -// Error Codes -#define CL_SUCCESS 0 -#define CL_DEVICE_NOT_FOUND -1 -#define CL_DEVICE_NOT_AVAILABLE -2 -#define CL_COMPILER_NOT_AVAILABLE -3 -#define CL_MEM_OBJECT_ALLOCATION_FAILURE -4 -#define CL_OUT_OF_RESOURCES -5 -#define CL_OUT_OF_HOST_MEMORY -6 -#define CL_PROFILING_INFO_NOT_AVAILABLE -7 -#define CL_MEM_COPY_OVERLAP -8 -#define CL_IMAGE_FORMAT_MISMATCH -9 -#define CL_IMAGE_FORMAT_NOT_SUPPORTED -10 -#define CL_BUILD_PROGRAM_FAILURE -11 -#define CL_MAP_FAILURE -12 - -#define CL_INVALID_VALUE -30 -#define CL_INVALID_DEVICE_TYPE -31 -#define CL_INVALID_PLATFORM -32 -#define CL_INVALID_DEVICE -33 -#define CL_INVALID_CONTEXT -34 -#define CL_INVALID_QUEUE_PROPERTIES -35 -#define CL_INVALID_COMMAND_QUEUE -36 -#define CL_INVALID_HOST_PTR -37 -#define CL_INVALID_MEM_OBJECT -38 -#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39 -#define CL_INVALID_IMAGE_SIZE -40 -#define CL_INVALID_SAMPLER -41 -#define CL_INVALID_BINARY -42 -#define CL_INVALID_BUILD_OPTIONS -43 -#define CL_INVALID_PROGRAM -44 -#define CL_INVALID_PROGRAM_EXECUTABLE -45 -#define CL_INVALID_KERNEL_NAME -46 -#define CL_INVALID_KERNEL_DEFINITION -47 -#define CL_INVALID_KERNEL -48 -#define CL_INVALID_ARG_INDEX -49 -#define CL_INVALID_ARG_VALUE -50 -#define CL_INVALID_ARG_SIZE -51 -#define CL_INVALID_KERNEL_ARGS -52 -#define CL_INVALID_WORK_DIMENSION -53 -#define CL_INVALID_WORK_GROUP_SIZE -54 -#define CL_INVALID_WORK_ITEM_SIZE -55 -#define CL_INVALID_GLOBAL_OFFSET -56 -#define CL_INVALID_EVENT_WAIT_LIST -57 -#define CL_INVALID_EVENT -58 -#define CL_INVALID_OPERATION -59 -#define CL_INVALID_GL_OBJECT -60 -#define CL_INVALID_BUFFER_SIZE -61 -#define CL_INVALID_MIP_LEVEL -62 -#define CL_INVALID_GLOBAL_WORK_SIZE -63 - -// OpenCL Version -#define CL_VERSION_1_0 1 - -// cl_bool -#define CL_FALSE 0 
-#define CL_TRUE 1 - -// cl_platform_info -#define CL_PLATFORM_PROFILE 0x0900 -#define CL_PLATFORM_VERSION 0x0901 -#define CL_PLATFORM_NAME 0x0902 -#define CL_PLATFORM_VENDOR 0x0903 -#define CL_PLATFORM_EXTENSIONS 0x0904 - -// cl_device_type - bitfield -#define CL_DEVICE_TYPE_DEFAULT (1 << 0) -#define CL_DEVICE_TYPE_CPU (1 << 1) -#define CL_DEVICE_TYPE_GPU (1 << 2) -#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3) -#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF - -// cl_device_info -#define CL_DEVICE_TYPE 0x1000 -#define CL_DEVICE_VENDOR_ID 0x1001 -#define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002 -#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003 -#define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004 -#define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005 -#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006 -#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007 -#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008 -#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009 -#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A -#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B -#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C -#define CL_DEVICE_ADDRESS_BITS 0x100D -#define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E -#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F -#define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010 -#define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011 -#define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012 -#define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013 -#define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014 -#define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015 -#define CL_DEVICE_IMAGE_SUPPORT 0x1016 -#define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017 -#define CL_DEVICE_MAX_SAMPLERS 0x1018 -#define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019 -#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A -#define CL_DEVICE_SINGLE_FP_CONFIG 0x101B -#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C -#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D -#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E -#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F -#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 
0x1020 -#define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021 -#define CL_DEVICE_LOCAL_MEM_TYPE 0x1022 -#define CL_DEVICE_LOCAL_MEM_SIZE 0x1023 -#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024 -#define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025 -#define CL_DEVICE_ENDIAN_LITTLE 0x1026 -#define CL_DEVICE_AVAILABLE 0x1027 -#define CL_DEVICE_COMPILER_AVAILABLE 0x1028 -#define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029 -#define CL_DEVICE_QUEUE_PROPERTIES 0x102A -#define CL_DEVICE_NAME 0x102B -#define CL_DEVICE_VENDOR 0x102C -#define CL_DRIVER_VERSION 0x102D -#define CL_DEVICE_PROFILE 0x102E -#define CL_DEVICE_VERSION 0x102F -#define CL_DEVICE_EXTENSIONS 0x1030 -#define CL_DEVICE_PLATFORM 0x1031 -/* 0x1032 reserved for CL_DEVICE_DOUBLE_FP_CONFIG */ -/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG */ -#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034 -#define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035 -#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036 -#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037 -#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038 -#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039 -#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A -#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B -#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C -#define CL_DEVICE_OPENCL_C_VERSION 0x103D - -// cl_device_fp_config - bitfield -#define CL_FP_DENORM (1 << 0) -#define CL_FP_INF_NAN (1 << 1) -#define CL_FP_ROUND_TO_NEAREST (1 << 2) -#define CL_FP_ROUND_TO_ZERO (1 << 3) -#define CL_FP_ROUND_TO_INF (1 << 4) -#define CL_FP_FMA (1 << 5) - -// cl_device_mem_cache_type -#define CL_NONE 0x0 -#define CL_READ_ONLY_CACHE 0x1 -#define CL_READ_WRITE_CACHE 0x2 - -// cl_device_local_mem_type -#define CL_LOCAL 0x1 -#define CL_GLOBAL 0x2 - -// cl_device_exec_capabilities - bitfield -#define CL_EXEC_KERNEL (1 << 0) -#define CL_EXEC_NATIVE_KERNEL (1 << 1) - -// cl_command_queue_properties - bitfield -#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0) -#define CL_QUEUE_PROFILING_ENABLE (1 << 1) - 
-// cl_context_info -#define CL_CONTEXT_REFERENCE_COUNT 0x1080 -#define CL_CONTEXT_DEVICES 0x1081 -#define CL_CONTEXT_PROPERTIES 0x1082 - -// cl_context_properties -#define CL_CONTEXT_PLATFORM 0x1084 - -// cl_command_queue_info -#define CL_QUEUE_CONTEXT 0x1090 -#define CL_QUEUE_DEVICE 0x1091 -#define CL_QUEUE_REFERENCE_COUNT 0x1092 -#define CL_QUEUE_PROPERTIES 0x1093 - -// cl_mem_flags - bitfield -#define CL_MEM_READ_WRITE (1 << 0) -#define CL_MEM_WRITE_ONLY (1 << 1) -#define CL_MEM_READ_ONLY (1 << 2) -#define CL_MEM_USE_HOST_PTR (1 << 3) -#define CL_MEM_ALLOC_HOST_PTR (1 << 4) -#define CL_MEM_COPY_HOST_PTR (1 << 5) - -// cl_channel_order -#define CL_R 0x10B0 -#define CL_A 0x10B1 -#define CL_RG 0x10B2 -#define CL_RA 0x10B3 -#define CL_RGB 0x10B4 -#define CL_RGBA 0x10B5 -#define CL_BGRA 0x10B6 -#define CL_ARGB 0x10B7 -#define CL_INTENSITY 0x10B8 -#define CL_LUMINANCE 0x10B9 - -// cl_channel_type -#define CL_SNORM_INT8 0x10D0 -#define CL_SNORM_INT16 0x10D1 -#define CL_UNORM_INT8 0x10D2 -#define CL_UNORM_INT16 0x10D3 -#define CL_UNORM_SHORT_565 0x10D4 -#define CL_UNORM_SHORT_555 0x10D5 -#define CL_UNORM_INT_101010 0x10D6 -#define CL_SIGNED_INT8 0x10D7 -#define CL_SIGNED_INT16 0x10D8 -#define CL_SIGNED_INT32 0x10D9 -#define CL_UNSIGNED_INT8 0x10DA -#define CL_UNSIGNED_INT16 0x10DB -#define CL_UNSIGNED_INT32 0x10DC -#define CL_HALF_FLOAT 0x10DD -#define CL_FLOAT 0x10DE - -// cl_mem_object_type -#define CL_MEM_OBJECT_BUFFER 0x10F0 -#define CL_MEM_OBJECT_IMAGE2D 0x10F1 -#define CL_MEM_OBJECT_IMAGE3D 0x10F2 - -// cl_mem_info -#define CL_MEM_TYPE 0x1100 -#define CL_MEM_FLAGS 0x1101 -#define CL_MEM_SIZE 0x1102 -#define CL_MEM_HOST_PTR 0x1103 -#define CL_MEM_MAP_COUNT 0x1104 -#define CL_MEM_REFERENCE_COUNT 0x1105 -#define CL_MEM_CONTEXT 0x1106 - -// cl_image_info -#define CL_IMAGE_FORMAT 0x1110 -#define CL_IMAGE_ELEMENT_SIZE 0x1111 -#define CL_IMAGE_ROW_PITCH 0x1112 -#define CL_IMAGE_SLICE_PITCH 0x1113 -#define CL_IMAGE_WIDTH 0x1114 -#define CL_IMAGE_HEIGHT 0x1115 -#define 
CL_IMAGE_DEPTH 0x1116 - -// cl_addressing_mode -#define CL_ADDRESS_NONE 0x1130 -#define CL_ADDRESS_CLAMP_TO_EDGE 0x1131 -#define CL_ADDRESS_CLAMP 0x1132 -#define CL_ADDRESS_REPEAT 0x1133 - -// cl_filter_mode -#define CL_FILTER_NEAREST 0x1140 -#define CL_FILTER_LINEAR 0x1141 - -// cl_sampler_info -#define CL_SAMPLER_REFERENCE_COUNT 0x1150 -#define CL_SAMPLER_CONTEXT 0x1151 -#define CL_SAMPLER_NORMALIZED_COORDS 0x1152 -#define CL_SAMPLER_ADDRESSING_MODE 0x1153 -#define CL_SAMPLER_FILTER_MODE 0x1154 - -// cl_map_flags - bitfield -#define CL_MAP_READ (1 << 0) -#define CL_MAP_WRITE (1 << 1) - -// cl_program_info -#define CL_PROGRAM_REFERENCE_COUNT 0x1160 -#define CL_PROGRAM_CONTEXT 0x1161 -#define CL_PROGRAM_NUM_DEVICES 0x1162 -#define CL_PROGRAM_DEVICES 0x1163 -#define CL_PROGRAM_SOURCE 0x1164 -#define CL_PROGRAM_BINARY_SIZES 0x1165 -#define CL_PROGRAM_BINARIES 0x1166 - -// cl_program_build_info -#define CL_PROGRAM_BUILD_STATUS 0x1181 -#define CL_PROGRAM_BUILD_OPTIONS 0x1182 -#define CL_PROGRAM_BUILD_LOG 0x1183 - -// cl_build_status -#define CL_BUILD_SUCCESS 0 -#define CL_BUILD_NONE -1 -#define CL_BUILD_ERROR -2 -#define CL_BUILD_IN_PROGRESS -3 - -// cl_kernel_info -#define CL_KERNEL_FUNCTION_NAME 0x1190 -#define CL_KERNEL_NUM_ARGS 0x1191 -#define CL_KERNEL_REFERENCE_COUNT 0x1192 -#define CL_KERNEL_CONTEXT 0x1193 -#define CL_KERNEL_PROGRAM 0x1194 - -// cl_kernel_work_group_info -#define CL_KERNEL_WORK_GROUP_SIZE 0x11B0 -#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1 -#define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2 - -// cl_event_info -#define CL_EVENT_COMMAND_QUEUE 0x11D0 -#define CL_EVENT_COMMAND_TYPE 0x11D1 -#define CL_EVENT_REFERENCE_COUNT 0x11D2 -#define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3 - -// cl_command_type -#define CL_COMMAND_NDRANGE_KERNEL 0x11F0 -#define CL_COMMAND_TASK 0x11F1 -#define CL_COMMAND_NATIVE_KERNEL 0x11F2 -#define CL_COMMAND_READ_BUFFER 0x11F3 -#define CL_COMMAND_WRITE_BUFFER 0x11F4 -#define CL_COMMAND_COPY_BUFFER 0x11F5 -#define 
CL_COMMAND_READ_IMAGE 0x11F6 -#define CL_COMMAND_WRITE_IMAGE 0x11F7 -#define CL_COMMAND_COPY_IMAGE 0x11F8 -#define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9 -#define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA -#define CL_COMMAND_MAP_BUFFER 0x11FB -#define CL_COMMAND_MAP_IMAGE 0x11FC -#define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD -#define CL_COMMAND_MARKER 0x11FE -#define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF -#define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200 - -// command execution status -#define CL_COMPLETE 0x0 -#define CL_RUNNING 0x1 -#define CL_SUBMITTED 0x2 -#define CL_QUEUED 0x3 - -// cl_profiling_info -#define CL_PROFILING_COMMAND_QUEUED 0x1280 -#define CL_PROFILING_COMMAND_SUBMIT 0x1281 -#define CL_PROFILING_COMMAND_START 0x1282 -#define CL_PROFILING_COMMAND_END 0x1283 - -/********************************************************************************************************/ - -/********************************************************************************************************/ - -// Function signature typedef's - -// Platform API -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLGETPLATFORMIDS)(cl_uint /* num_entries */, - cl_platform_id * /* platforms */, - cl_uint * /* num_platforms */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLGETPLATFORMINFO)(cl_platform_id /* platform */, - cl_platform_info /* param_name */, - size_t /* param_value_size */, - void * /* param_value */, - size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; - -// Device APIs -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLGETDEVICEIDS)(cl_platform_id /* platform */, - cl_device_type /* device_type */, - cl_uint /* num_entries */, - cl_device_id * /* devices */, - cl_uint * /* num_devices */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLGETDEVICEINFO)(cl_device_id /* device */, - cl_device_info /* param_name */, - size_t /* param_value_size */, - void * /* param_value */, - size_t * /* param_value_size_ret */) 
CL_API_SUFFIX__VERSION_1_0; - -// Context APIs -typedef CL_API_ENTRY cl_context (CL_API_CALL * -PFNCLCREATECONTEXT)(const cl_context_properties * /* properties */, - cl_uint /* num_devices */, - const cl_device_id * /* devices */, - void (*pfn_notify)(const char *, const void *, size_t, void *) /* pfn_notify */, - void * /* user_data */, - cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_context (CL_API_CALL * -PFNCLCREATECONTEXTFROMTYPE)(const cl_context_properties * /* properties */, - cl_device_type /* device_type */, - void (*pfn_notify)(const char *, const void *, size_t, void *) /* pfn_notify */, - void * /* user_data */, - cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLRETAINCONTEXT)(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLRELEASECONTEXT)(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLGETCONTEXTINFO)(cl_context /* context */, - cl_context_info /* param_name */, - size_t /* param_value_size */, - void * /* param_value */, - size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; - -// Command Queue APIs -typedef CL_API_ENTRY cl_command_queue (CL_API_CALL * -PFNCLCREATECOMMANDQUEUE)(cl_context /* context */, - cl_device_id /* device */, - cl_command_queue_properties /* properties */, - cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLRETAINCOMMANDQUEUE)(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLRELEASECOMMANDQUEUE)(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLGETCOMMANDQUEUEINFO)(cl_command_queue /* command_queue */, - cl_command_queue_info /* param_name */, - size_t /* param_value_size */, - void * /* param_value */, - size_t * /* 
param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLSETCOMMANDQUEUEPROPERTY)(cl_command_queue /* command_queue */, - cl_command_queue_properties /* properties */, - cl_bool /* enable */, - cl_command_queue_properties * /* old_properties */) CL_API_SUFFIX__VERSION_1_0; - -// Memory Object APIs -typedef CL_API_ENTRY cl_mem (CL_API_CALL * -PFNCLCREATEBUFFER)(cl_context /* context */, - cl_mem_flags /* flags */, - size_t /* size */, - void * /* host_ptr */, - cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_mem (CL_API_CALL * -PFNCLCREATEIMAGE2D)(cl_context /* context */, - cl_mem_flags /* flags */, - const cl_image_format * /* image_format */, - size_t /* image_width */, - size_t /* image_height */, - size_t /* image_row_pitch */, - void * /* host_ptr */, - cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_mem (CL_API_CALL * -PFNCLCREATEIMAGE3D)(cl_context /* context */, - cl_mem_flags /* flags */, - const cl_image_format * /* image_format */, - size_t /* image_width */, - size_t /* image_height */, - size_t /* image_depth */, - size_t /* image_row_pitch */, - size_t /* image_slice_pitch */, - void * /* host_ptr */, - cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLRETAINMEMOBJECT)(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLRELEASEMEMOBJECT)(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLGETSUPPORTEDIMAGEFORMATS)(cl_context /* context */, - cl_mem_flags /* flags */, - cl_mem_object_type /* image_type */, - cl_uint /* num_entries */, - cl_image_format * /* image_formats */, - cl_uint * /* num_image_formats */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLGETMEMOBJECTINFO)(cl_mem /* memobj */, - cl_mem_info /* param_name */, - size_t /* 
param_value_size */, - void * /* param_value */, - size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLGETIMAGEINFO)(cl_mem /* image */, - cl_image_info /* param_name */, - size_t /* param_value_size */, - void * /* param_value */, - size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; - -// Sampler APIs -typedef CL_API_ENTRY cl_sampler (CL_API_CALL * -PFNCLCREATESAMPLER)(cl_context /* context */, - cl_bool /* normalized_coords */, - cl_addressing_mode /* addressing_mode */, - cl_filter_mode /* filter_mode */, - cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLRETAINSAMPLER)(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLRELEASESAMPLER)(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLGETSAMPLERINFO)(cl_sampler /* sampler */, - cl_sampler_info /* param_name */, - size_t /* param_value_size */, - void * /* param_value */, - size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; - -// Program Object APIs -typedef CL_API_ENTRY cl_program (CL_API_CALL * -PFNCLCREATEPROGRAMWITHSOURCE)(cl_context /* context */, - cl_uint /* count */, - const char ** /* strings */, - const size_t * /* lengths */, - cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_program (CL_API_CALL * -PFNCLCREATEPROGRAMWITHBINARY)(cl_context /* context */, - cl_uint /* num_devices */, - const cl_device_id * /* device_list */, - const size_t * /* lengths */, - const unsigned char ** /* binaries */, - cl_int * /* binary_status */, - cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLRETAINPROGRAM)(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLRELEASEPROGRAM)(cl_program /* program */) 
CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLBUILDPROGRAM)(cl_program /* program */, - cl_uint /* num_devices */, - const cl_device_id * /* device_list */, - const char * /* options */, - void (*pfn_notify)(cl_program /* program */, void * /* user_data */), - void * /* user_data */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLUNLOADCOMPILER)(void) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLGETPROGRAMINFO)(cl_program /* program */, - cl_program_info /* param_name */, - size_t /* param_value_size */, - void * /* param_value */, - size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLGETPROGRAMBUILDINFO)(cl_program /* program */, - cl_device_id /* device */, - cl_program_build_info /* param_name */, - size_t /* param_value_size */, - void * /* param_value */, - size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; - -// Kernel Object APIs -typedef CL_API_ENTRY cl_kernel (CL_API_CALL * -PFNCLCREATEKERNEL)(cl_program /* program */, - const char * /* kernel_name */, - cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLCREATEKERNELSINPROGRAM)(cl_program /* program */, - cl_uint /* num_kernels */, - cl_kernel * /* kernels */, - cl_uint * /* num_kernels_ret */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLRETAINKERNEL)(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLRELEASEKERNEL)(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLSETKERNELARG)(cl_kernel /* kernel */, - cl_uint /* arg_index */, - size_t /* arg_size */, - const void * /* arg_value */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLGETKERNELINFO)(cl_kernel /* kernel */, - cl_kernel_info /* param_name 
*/, - size_t /* param_value_size */, - void * /* param_value */, - size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLGETKERNELWORKGROUPINFO)(cl_kernel /* kernel */, - cl_device_id /* device */, - cl_kernel_work_group_info /* param_name */, - size_t /* param_value_size */, - void * /* param_value */, - size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; - -// Event Object APIs -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLWAITFOREVENTS)(cl_uint /* num_events */, - const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLGETEVENTINFO)(cl_event /* event */, - cl_event_info /* param_name */, - size_t /* param_value_size */, - void * /* param_value */, - size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLRETAINEVENT)(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLRELEASEEVENT)(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0; - -// Profiling APIs -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLGETEVENTPROFILINGINFO)(cl_event /* event */, - cl_profiling_info /* param_name */, - size_t /* param_value_size */, - void * /* param_value */, - size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; - -// Flush and Finish APIs -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLFLUSH)(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLFINISH)(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; - -// Enqueued Commands APIs -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLENQUEUEREADBUFFER)(cl_command_queue /* command_queue */, - cl_mem /* buffer */, - cl_bool /* blocking_read */, - size_t /* offset */, - size_t /* cb */, - void * /* ptr */, - cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, - 
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLENQUEUEWRITEBUFFER)(cl_command_queue /* command_queue */, - cl_mem /* buffer */, - cl_bool /* blocking_write */, - size_t /* offset */, - size_t /* cb */, - const void * /* ptr */, - cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLENQUEUECOPYBUFFER)(cl_command_queue /* command_queue */, - cl_mem /* src_buffer */, - cl_mem /* dst_buffer */, - size_t /* src_offset */, - size_t /* dst_offset */, - size_t /* cb */, - cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLENQUEUEREADIMAGE)(cl_command_queue /* command_queue */, - cl_mem /* image */, - cl_bool /* blocking_read */, - const size_t * /* origin[3] */, - const size_t * /* region[3] */, - size_t /* row_pitch */, - size_t /* slice_pitch */, - void * /* ptr */, - cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLENQUEUEWRITEIMAGE)(cl_command_queue /* command_queue */, - cl_mem /* image */, - cl_bool /* blocking_write */, - const size_t * /* origin[3] */, - const size_t * /* region[3] */, - size_t /* input_row_pitch */, - size_t /* input_slice_pitch */, - const void * /* ptr */, - cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLENQUEUECOPYIMAGE)(cl_command_queue /* command_queue */, - cl_mem /* src_image */, - cl_mem /* dst_image */, - const size_t * /* src_origin[3] */, - const size_t * /* dst_origin[3] */, - const size_t * /* region[3] */, - cl_uint /* 
num_events_in_wait_list */, - const cl_event * /* event_wait_list */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLENQUEUECOPYIMAGETOBUFFER)(cl_command_queue /* command_queue */, - cl_mem /* src_image */, - cl_mem /* dst_buffer */, - const size_t * /* src_origin[3] */, - const size_t * /* region[3] */, - size_t /* dst_offset */, - cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLENQUEUECOPYBUFFERTOIMAGE)(cl_command_queue /* command_queue */, - cl_mem /* src_buffer */, - cl_mem /* dst_image */, - size_t /* src_offset */, - const size_t * /* dst_origin[3] */, - const size_t * /* region[3] */, - cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY void * (CL_API_CALL * -PFNCLENQUEUEMAPBUFFER)(cl_command_queue /* command_queue */, - cl_mem /* buffer */, - cl_bool /* blocking_map */, - cl_map_flags /* map_flags */, - size_t /* offset */, - size_t /* cb */, - cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, - cl_event * /* event */, - cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY void * (CL_API_CALL * -PFNCLENQUEUEMAPIMAGE)(cl_command_queue /* command_queue */, - cl_mem /* image */, - cl_bool /* blocking_map */, - cl_map_flags /* map_flags */, - const size_t * /* origin[3] */, - const size_t * /* region[3] */, - size_t * /* image_row_pitch */, - size_t * /* image_slice_pitch */, - cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, - cl_event * /* event */, - cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLENQUEUEUNMAPMEMOBJECT)(cl_command_queue /* command_queue */, - cl_mem /* memobj */, - void * /* mapped_ptr */, - 
cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLENQUEUENDRANGEKERNEL)(cl_command_queue /* command_queue */, - cl_kernel /* kernel */, - cl_uint /* work_dim */, - const size_t * /* global_work_offset */, - const size_t * /* global_work_size */, - const size_t * /* local_work_size */, - cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLENQUEUETASK)(cl_command_queue /* command_queue */, - cl_kernel /* kernel */, - cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLENQUEUENATIVEKERNEL)(cl_command_queue /* command_queue */, - void (*user_func)(void *), - void * /* args */, - size_t /* cb_args */, - cl_uint /* num_mem_objects */, - const cl_mem * /* mem_list */, - const void ** /* args_mem_loc */, - cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLENQUEUEMARKER)(cl_command_queue /* command_queue */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLENQUEUEWAITFOREVENTS)(cl_command_queue /* command_queue */, - cl_uint /* num_events */, - const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL * -PFNCLENQUEUEBARRIER)(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; - -// Extension function access -// -// Returns the extension function address for the given function name, -// or NULL if a valid function can not be found. 
The client must -// check to make sure the address is not NULL, before using or -// calling the returned function address. -// -typedef CL_API_ENTRY void * (CL_API_CALL * PFNCLGETEXTENSIONFUNCTIONADDRESS)(const char * /* func_name */) CL_API_SUFFIX__VERSION_1_0; - - -#define CLEW_STATIC - -#ifdef CLEW_STATIC -# define CLEWAPI extern -#else -# ifdef CLEW_BUILD -# define CLEWAPI extern __declspec(dllexport) -# else -# define CLEWAPI extern __declspec(dllimport) -# endif -#endif - -#if defined(_WIN32) -#define CLEW_FUN_EXPORT extern -#else -#define CLEW_FUN_EXPORT CLEWAPI -#endif - -#define CLEW_GET_FUN(x) x - - -// Variables holding function entry points -CLEW_FUN_EXPORT PFNCLGETPLATFORMIDS __clewGetPlatformIDs ; -CLEW_FUN_EXPORT PFNCLGETPLATFORMINFO __clewGetPlatformInfo ; -CLEW_FUN_EXPORT PFNCLGETDEVICEIDS __clewGetDeviceIDs ; -CLEW_FUN_EXPORT PFNCLGETDEVICEINFO __clewGetDeviceInfo ; -CLEW_FUN_EXPORT PFNCLCREATECONTEXT __clewCreateContext ; -CLEW_FUN_EXPORT PFNCLCREATECONTEXTFROMTYPE __clewCreateContextFromType ; -CLEW_FUN_EXPORT PFNCLRETAINCONTEXT __clewRetainContext ; -CLEW_FUN_EXPORT PFNCLRELEASECONTEXT __clewReleaseContext ; -CLEW_FUN_EXPORT PFNCLGETCONTEXTINFO __clewGetContextInfo ; -CLEW_FUN_EXPORT PFNCLCREATECOMMANDQUEUE __clewCreateCommandQueue ; -CLEW_FUN_EXPORT PFNCLRETAINCOMMANDQUEUE __clewRetainCommandQueue ; -CLEW_FUN_EXPORT PFNCLRELEASECOMMANDQUEUE __clewReleaseCommandQueue ; -CLEW_FUN_EXPORT PFNCLGETCOMMANDQUEUEINFO __clewGetCommandQueueInfo ; -CLEW_FUN_EXPORT PFNCLSETCOMMANDQUEUEPROPERTY __clewSetCommandQueueProperty ; -CLEW_FUN_EXPORT PFNCLCREATEBUFFER __clewCreateBuffer ; -CLEW_FUN_EXPORT PFNCLCREATEIMAGE2D __clewCreateImage2D ; -CLEW_FUN_EXPORT PFNCLCREATEIMAGE3D __clewCreateImage3D ; -CLEW_FUN_EXPORT PFNCLRETAINMEMOBJECT __clewRetainMemObject ; -CLEW_FUN_EXPORT PFNCLRELEASEMEMOBJECT __clewReleaseMemObject ; -CLEW_FUN_EXPORT PFNCLGETSUPPORTEDIMAGEFORMATS __clewGetSupportedImageFormats ; -CLEW_FUN_EXPORT PFNCLGETMEMOBJECTINFO 
__clewGetMemObjectInfo ; -CLEW_FUN_EXPORT PFNCLGETIMAGEINFO __clewGetImageInfo ; -CLEW_FUN_EXPORT PFNCLCREATESAMPLER __clewCreateSampler ; -CLEW_FUN_EXPORT PFNCLRETAINSAMPLER __clewRetainSampler ; -CLEW_FUN_EXPORT PFNCLRELEASESAMPLER __clewReleaseSampler ; -CLEW_FUN_EXPORT PFNCLGETSAMPLERINFO __clewGetSamplerInfo ; -CLEW_FUN_EXPORT PFNCLCREATEPROGRAMWITHSOURCE __clewCreateProgramWithSource ; -CLEW_FUN_EXPORT PFNCLCREATEPROGRAMWITHBINARY __clewCreateProgramWithBinary ; -CLEW_FUN_EXPORT PFNCLRETAINPROGRAM __clewRetainProgram ; -CLEW_FUN_EXPORT PFNCLRELEASEPROGRAM __clewReleaseProgram ; -CLEW_FUN_EXPORT PFNCLBUILDPROGRAM __clewBuildProgram ; -CLEW_FUN_EXPORT PFNCLUNLOADCOMPILER __clewUnloadCompiler ; -CLEW_FUN_EXPORT PFNCLGETPROGRAMINFO __clewGetProgramInfo ; -CLEW_FUN_EXPORT PFNCLGETPROGRAMBUILDINFO __clewGetProgramBuildInfo ; -CLEW_FUN_EXPORT PFNCLCREATEKERNEL __clewCreateKernel ; -CLEW_FUN_EXPORT PFNCLCREATEKERNELSINPROGRAM __clewCreateKernelsInProgram ; -CLEW_FUN_EXPORT PFNCLRETAINKERNEL __clewRetainKernel ; -CLEW_FUN_EXPORT PFNCLRELEASEKERNEL __clewReleaseKernel ; -CLEW_FUN_EXPORT PFNCLSETKERNELARG __clewSetKernelArg ; -CLEW_FUN_EXPORT PFNCLGETKERNELINFO __clewGetKernelInfo ; -CLEW_FUN_EXPORT PFNCLGETKERNELWORKGROUPINFO __clewGetKernelWorkGroupInfo ; -CLEW_FUN_EXPORT PFNCLWAITFOREVENTS __clewWaitForEvents ; -CLEW_FUN_EXPORT PFNCLGETEVENTINFO __clewGetEventInfo ; -CLEW_FUN_EXPORT PFNCLRETAINEVENT __clewRetainEvent ; -CLEW_FUN_EXPORT PFNCLRELEASEEVENT __clewReleaseEvent ; -CLEW_FUN_EXPORT PFNCLGETEVENTPROFILINGINFO __clewGetEventProfilingInfo ; -CLEW_FUN_EXPORT PFNCLFLUSH __clewFlush ; -CLEW_FUN_EXPORT PFNCLFINISH __clewFinish ; -CLEW_FUN_EXPORT PFNCLENQUEUEREADBUFFER __clewEnqueueReadBuffer ; -CLEW_FUN_EXPORT PFNCLENQUEUEWRITEBUFFER __clewEnqueueWriteBuffer ; -CLEW_FUN_EXPORT PFNCLENQUEUECOPYBUFFER __clewEnqueueCopyBuffer ; -CLEW_FUN_EXPORT PFNCLENQUEUEREADIMAGE __clewEnqueueReadImage ; -CLEW_FUN_EXPORT PFNCLENQUEUEWRITEIMAGE __clewEnqueueWriteImage ; 
-CLEW_FUN_EXPORT PFNCLENQUEUECOPYIMAGE __clewEnqueueCopyImage ; -CLEW_FUN_EXPORT PFNCLENQUEUECOPYIMAGETOBUFFER __clewEnqueueCopyImageToBuffer ; -CLEW_FUN_EXPORT PFNCLENQUEUECOPYBUFFERTOIMAGE __clewEnqueueCopyBufferToImage ; -CLEW_FUN_EXPORT PFNCLENQUEUEMAPBUFFER __clewEnqueueMapBuffer ; -CLEW_FUN_EXPORT PFNCLENQUEUEMAPIMAGE __clewEnqueueMapImage ; -CLEW_FUN_EXPORT PFNCLENQUEUEUNMAPMEMOBJECT __clewEnqueueUnmapMemObject ; -CLEW_FUN_EXPORT PFNCLENQUEUENDRANGEKERNEL __clewEnqueueNDRangeKernel ; -CLEW_FUN_EXPORT PFNCLENQUEUETASK __clewEnqueueTask ; -CLEW_FUN_EXPORT PFNCLENQUEUENATIVEKERNEL __clewEnqueueNativeKernel ; -CLEW_FUN_EXPORT PFNCLENQUEUEMARKER __clewEnqueueMarker ; -CLEW_FUN_EXPORT PFNCLENQUEUEWAITFOREVENTS __clewEnqueueWaitForEvents ; -CLEW_FUN_EXPORT PFNCLENQUEUEBARRIER __clewEnqueueBarrier ; -CLEW_FUN_EXPORT PFNCLGETEXTENSIONFUNCTIONADDRESS __clewGetExtensionFunctionAddress ; - - -#define clGetPlatformIDs CLEW_GET_FUN(__clewGetPlatformIDs ) -#define clGetPlatformInfo CLEW_GET_FUN(__clewGetPlatformInfo ) -#define clGetDeviceIDs CLEW_GET_FUN(__clewGetDeviceIDs ) -#define clGetDeviceInfo CLEW_GET_FUN(__clewGetDeviceInfo ) -#define clCreateContext CLEW_GET_FUN(__clewCreateContext ) -#define clCreateContextFromType CLEW_GET_FUN(__clewCreateContextFromType ) -#define clRetainContext CLEW_GET_FUN(__clewRetainContext ) -#define clReleaseContext CLEW_GET_FUN(__clewReleaseContext ) -#define clGetContextInfo CLEW_GET_FUN(__clewGetContextInfo ) -#define clCreateCommandQueue CLEW_GET_FUN(__clewCreateCommandQueue ) -#define clRetainCommandQueue CLEW_GET_FUN(__clewRetainCommandQueue ) -#define clReleaseCommandQueue CLEW_GET_FUN(__clewReleaseCommandQueue ) -#define clGetCommandQueueInfo CLEW_GET_FUN(__clewGetCommandQueueInfo ) -#define clSetCommandQueueProperty CLEW_GET_FUN(__clewSetCommandQueueProperty ) -#define clCreateBuffer CLEW_GET_FUN(__clewCreateBuffer ) -#define clCreateImage2D CLEW_GET_FUN(__clewCreateImage2D ) -#define clCreateImage3D 
CLEW_GET_FUN(__clewCreateImage3D ) -#define clRetainMemObject CLEW_GET_FUN(__clewRetainMemObject ) -#define clReleaseMemObject CLEW_GET_FUN(__clewReleaseMemObject ) -#define clGetSupportedImageFormats CLEW_GET_FUN(__clewGetSupportedImageFormats ) -#define clGetMemObjectInfo CLEW_GET_FUN(__clewGetMemObjectInfo ) -#define clGetImageInfo CLEW_GET_FUN(__clewGetImageInfo ) -#define clCreateSampler CLEW_GET_FUN(__clewCreateSampler ) -#define clRetainSampler CLEW_GET_FUN(__clewRetainSampler ) -#define clReleaseSampler CLEW_GET_FUN(__clewReleaseSampler ) -#define clGetSamplerInfo CLEW_GET_FUN(__clewGetSamplerInfo ) -#define clCreateProgramWithSource CLEW_GET_FUN(__clewCreateProgramWithSource ) -#define clCreateProgramWithBinary CLEW_GET_FUN(__clewCreateProgramWithBinary ) -#define clRetainProgram CLEW_GET_FUN(__clewRetainProgram ) -#define clReleaseProgram CLEW_GET_FUN(__clewReleaseProgram ) -#define clBuildProgram CLEW_GET_FUN(__clewBuildProgram ) -#define clUnloadCompiler CLEW_GET_FUN(__clewUnloadCompiler ) -#define clGetProgramInfo CLEW_GET_FUN(__clewGetProgramInfo ) -#define clGetProgramBuildInfo CLEW_GET_FUN(__clewGetProgramBuildInfo ) -#define clCreateKernel CLEW_GET_FUN(__clewCreateKernel ) -#define clCreateKernelsInProgram CLEW_GET_FUN(__clewCreateKernelsInProgram ) -#define clRetainKernel CLEW_GET_FUN(__clewRetainKernel ) -#define clReleaseKernel CLEW_GET_FUN(__clewReleaseKernel ) -#define clSetKernelArg CLEW_GET_FUN(__clewSetKernelArg ) -#define clGetKernelInfo CLEW_GET_FUN(__clewGetKernelInfo ) -#define clGetKernelWorkGroupInfo CLEW_GET_FUN(__clewGetKernelWorkGroupInfo ) -#define clWaitForEvents CLEW_GET_FUN(__clewWaitForEvents ) -#define clGetEventInfo CLEW_GET_FUN(__clewGetEventInfo ) -#define clRetainEvent CLEW_GET_FUN(__clewRetainEvent ) -#define clReleaseEvent CLEW_GET_FUN(__clewReleaseEvent ) -#define clGetEventProfilingInfo CLEW_GET_FUN(__clewGetEventProfilingInfo ) -#define clFlush CLEW_GET_FUN(__clewFlush ) -#define clFinish CLEW_GET_FUN(__clewFinish ) 
-#define clEnqueueReadBuffer CLEW_GET_FUN(__clewEnqueueReadBuffer ) -#define clEnqueueWriteBuffer CLEW_GET_FUN(__clewEnqueueWriteBuffer ) -#define clEnqueueCopyBuffer CLEW_GET_FUN(__clewEnqueueCopyBuffer ) -#define clEnqueueReadImage CLEW_GET_FUN(__clewEnqueueReadImage ) -#define clEnqueueWriteImage CLEW_GET_FUN(__clewEnqueueWriteImage ) -#define clEnqueueCopyImage CLEW_GET_FUN(__clewEnqueueCopyImage ) -#define clEnqueueCopyImageToBuffer CLEW_GET_FUN(__clewEnqueueCopyImageToBuffer ) -#define clEnqueueCopyBufferToImage CLEW_GET_FUN(__clewEnqueueCopyBufferToImage ) -#define clEnqueueMapBuffer CLEW_GET_FUN(__clewEnqueueMapBuffer ) -#define clEnqueueMapImage CLEW_GET_FUN(__clewEnqueueMapImage ) -#define clEnqueueUnmapMemObject CLEW_GET_FUN(__clewEnqueueUnmapMemObject ) -#define clEnqueueNDRangeKernel CLEW_GET_FUN(__clewEnqueueNDRangeKernel ) -#define clEnqueueTask CLEW_GET_FUN(__clewEnqueueTask ) -#define clEnqueueNativeKernel CLEW_GET_FUN(__clewEnqueueNativeKernel ) -#define clEnqueueMarker CLEW_GET_FUN(__clewEnqueueMarker ) -#define clEnqueueWaitForEvents CLEW_GET_FUN(__clewEnqueueWaitForEvents ) -#define clEnqueueBarrier CLEW_GET_FUN(__clewEnqueueBarrier ) -#define clGetExtensionFunctionAddress CLEW_GET_FUN(__clewGetExtensionFunctionAddress ) - -#endif // CLCC_GENERATE_DOCUMENTATION - -#define CLEW_SUCCESS 0 //!< Success error code -#define CLEW_ERROR_OPEN_FAILED -1 //!< Error code for failing to open the dynamic library -#define CLEW_ERROR_ATEXIT_FAILED -2 //!< Error code for failing to queue the closing of the dynamic library to atexit() - -int clLibraryInit(void); -const char *clErrorString(cl_int error); - -CCL_NAMESPACE_END - -#endif /* __UTIL_OPENCL_H__ */ - diff --git a/intern/cycles/util/util_opengl.h b/intern/cycles/util/util_opengl.h index 04a3e039c9d..667a5db653d 100644 --- a/intern/cycles/util/util_opengl.h +++ b/intern/cycles/util/util_opengl.h @@ -20,7 +20,6 @@ /* OpenGL header includes, used everywhere we use OpenGL, to deal with * platform 
differences in one central place. */ -#include <GL/glew.h> +#include "glew-mx.h" #endif /* __UTIL_OPENGL_H__ */ - diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h index f901513ec4b..2feb3d6ab7e 100644 --- a/intern/cycles/util/util_optimization.h +++ b/intern/cycles/util/util_optimization.h @@ -65,10 +65,8 @@ #define WITH_CYCLES_OPTIMIZED_KERNEL_AVX #endif -/* MSVC 2008, no SSE41 (broken blendv intrinsic) and no AVX support */ -#if defined(_MSC_VER) && (_MSC_VER < 1700) -#undef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 -#undef WITH_CYCLES_OPTIMIZED_KERNEL_AVX +#ifdef WITH_KERNEL_AVX2 +#define WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 #endif #endif @@ -101,6 +99,10 @@ /* SSE intrinsics headers */ #ifndef FREE_WINDOWS64 +#ifdef _MSC_VER +#include <intrin.h> +#else + #ifdef __KERNEL_SSE2__ #include <xmmintrin.h> /* SSE 1 */ #include <emmintrin.h> /* SSE 2 */ @@ -118,6 +120,12 @@ #include <smmintrin.h> /* SSE 4.1 */ #endif +#ifdef __KERNEL_AVX__ +#include <immintrin.h> /* AVX */ +#endif + +#endif + #else /* MinGW64 has conflicting declarations for these SSE headers in <windows.h>. 
diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp index 85d19b6a325..aa424045ece 100644 --- a/intern/cycles/util/util_path.cpp +++ b/intern/cycles/util/util_path.cpp @@ -41,21 +41,12 @@ static string cached_user_path = ""; static boost::filesystem::path to_boost(const string& path) { -#ifdef _MSC_VER - std::wstring path_utf16 = Strutil::utf8_to_utf16(path.c_str()); - return boost::filesystem::path(path_utf16.c_str()); -#else return boost::filesystem::path(path.c_str()); -#endif } static string from_boost(const boost::filesystem::path& path) { -#ifdef _MSC_VER - return Strutil::utf16_to_utf8(path.wstring().c_str()); -#else return path.string().c_str(); -#endif } void path_init(const string& path, const string& user_path) @@ -259,14 +250,7 @@ string path_source_replace_includes(const string& source_, const string& path) FILE *path_fopen(const string& path, const string& mode) { -#ifdef _WIN32 - std::wstring path_utf16 = Strutil::utf8_to_utf16(path); - std::wstring mode_utf16 = Strutil::utf8_to_utf16(mode); - - return _wfopen(path_utf16.c_str(), mode_utf16.c_str()); -#else return fopen(path.c_str(), mode.c_str()); -#endif } void path_cache_clear_except(const string& name, const set<string>& except) diff --git a/intern/cycles/util/util_progress.h b/intern/cycles/util/util_progress.h index 5d1219bfef3..e721a3f5047 100644 --- a/intern/cycles/util/util_progress.h +++ b/intern/cycles/util/util_progress.h @@ -149,6 +149,12 @@ public: sample++; } + void increment_sample_update() + { + increment_sample(); + set_update(); + } + int get_sample() { return sample; diff --git a/intern/cycles/util/util_simd.cpp b/intern/cycles/util/util_simd.cpp new file mode 100644 index 00000000000..0436823e62a --- /dev/null +++ b/intern/cycles/util/util_simd.cpp @@ -0,0 +1,47 @@ +/* + * Copyright 2011-2013 Intel Corporation + * Modifications Copyright 2014, Blender Foundation. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License + */ + +#ifdef WITH_KERNEL_SSE2 + +#define __KERNEL_SSE2__ +#include "util_simd.h" + +CCL_NAMESPACE_BEGIN + +const __m128 _mm_lookupmask_ps[16] = { + _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0, 0)), + _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0,-1)), + _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1, 0)), + _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1,-1)), + _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0, 0)), + _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0,-1)), + _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1, 0)), + _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1,-1)), + _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)), + _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0,-1)), + _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1, 0)), + _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1,-1)), + _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0, 0)), + _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0,-1)), + _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1, 0)), + _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1,-1)) +}; + + +CCL_NAMESPACE_END + +#endif // WITH_KERNEL_SSE2 diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h index f0f37fa57aa..39506a6359b 100644 --- a/intern/cycles/util/util_simd.h +++ b/intern/cycles/util/util_simd.h @@ -1,7 +1,8 @@ /* - * Copyright 2011-2013 Blender Foundation + * Copyright 2011-2013 Intel Corporation + * Modifications Copyright 2014, Blender Foundation. 
* - * Licensed under the Apache License, Version 2.0 (the "License"); + * Licensed under the Apache License, Version 2.0(the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * @@ -14,263 +15,425 @@ * limitations under the License */ -#ifndef __UTIL_SIMD_H__ -#define __UTIL_SIMD_H__ +#ifndef __UTIL_SIMD_TYPES_H__ +#define __UTIL_SIMD_TYPES_H__ + +#include <limits> + +#include "util_debug.h" +#include "util_types.h" CCL_NAMESPACE_BEGIN #ifdef __KERNEL_SSE2__ -/* SSE shuffle utility functions */ +struct sseb; +struct ssei; +struct ssef; + +extern const __m128 _mm_lookupmask_ps[16]; + +/* Special Types */ -#ifdef __KERNEL_SSSE3__ +static struct TrueTy { +__forceinline operator bool( ) const { return true; } +} True ccl_maybe_unused; -/* faster version for SSSE3 */ -typedef __m128i shuffle_swap_t; +static struct FalseTy { +__forceinline operator bool( ) const { return false; } +} False ccl_maybe_unused; -ccl_device_inline const shuffle_swap_t shuffle_swap_identity(void) +static struct NegInfTy { - return _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); -} +__forceinline operator float ( ) const { return -std::numeric_limits<float>::infinity(); } +__forceinline operator int ( ) const { return std::numeric_limits<int>::min(); } +} neg_inf ccl_maybe_unused; -ccl_device_inline const shuffle_swap_t shuffle_swap_swap(void) +static struct PosInfTy { - return _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); +__forceinline operator float ( ) const { return std::numeric_limits<float>::infinity(); } +__forceinline operator int ( ) const { return std::numeric_limits<int>::max(); } +} inf ccl_maybe_unused, pos_inf ccl_maybe_unused; + +/* Intrinsics Functions */ + +#if defined(__BMI__) && defined(__GNUC__) +#define _tzcnt_u32 __tzcnt_u32 +#define _tzcnt_u64 __tzcnt_u64 +#endif + +#if defined(__LZCNT__) +#define _lzcnt_u32 __lzcnt32 +#define _lzcnt_u64 __lzcnt64 +#endif + +#if 
defined(_WIN32) && !defined(__MINGW32__) + +__forceinline int __popcnt(int in) { + return _mm_popcnt_u32(in); } -ccl_device_inline const __m128 shuffle_swap(const __m128& a, const shuffle_swap_t& shuf) -{ - return _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(a), shuf)); +#if !defined(_MSC_VER) +__forceinline unsigned int __popcnt(unsigned int in) { + return _mm_popcnt_u32(in); +} +#endif + +#if defined(__KERNEL_64_BIT__) +__forceinline long long __popcnt(long long in) { + return _mm_popcnt_u64(in); +} +__forceinline size_t __popcnt(size_t in) { + return _mm_popcnt_u64(in); +} +#endif + +__forceinline int __bsf(int v) { +#if defined(__KERNEL_AVX2__) + return _tzcnt_u32(v); +#else + unsigned long r = 0; _BitScanForward(&r,v); return r; +#endif } +__forceinline unsigned int __bsf(unsigned int v) { +#if defined(__KERNEL_AVX2__) + return _tzcnt_u32(v); #else + unsigned long r = 0; _BitScanForward(&r,v); return r; +#endif +} -/* somewhat slower version for SSE2 */ -typedef int shuffle_swap_t; +__forceinline int __bsr(int v) { + unsigned long r = 0; _BitScanReverse(&r,v); return r; +} -ccl_device_inline const shuffle_swap_t shuffle_swap_identity(void) -{ - return 0; +__forceinline int __btc(int v, int i) { + long r = v; _bittestandcomplement(&r,i); return r; } -ccl_device_inline const shuffle_swap_t shuffle_swap_swap(void) -{ - return 1; +__forceinline int __bts(int v, int i) { + long r = v; _bittestandset(&r,i); return r; } -ccl_device_inline const __m128 shuffle_swap(const __m128& a, shuffle_swap_t shuf) -{ - /* shuffle value must be a constant, so we need to branch */ - if(shuf) - return _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2)); - else - return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 1, 0)); +__forceinline int __btr(int v, int i) { + long r = v; _bittestandreset(&r,i); return r; } +__forceinline int bitscan(int v) { +#if defined(__KERNEL_AVX2__) + return _tzcnt_u32(v); +#else + return __bsf(v); #endif +} -#ifdef __KERNEL_SSE41__ -ccl_device_inline void 
gen_idirsplat_swap(const __m128 &pn, const shuffle_swap_t &shuf_identity, const shuffle_swap_t &shuf_swap, - const float3& idir, __m128 idirsplat[3], shuffle_swap_t shufflexyz[3]) +__forceinline int clz(const int x) { - const __m128 idirsplat_raw[] = { _mm_set_ps1(idir.x), _mm_set_ps1(idir.y), _mm_set_ps1(idir.z) }; - idirsplat[0] = _mm_xor_ps(idirsplat_raw[0], pn); - idirsplat[1] = _mm_xor_ps(idirsplat_raw[1], pn); - idirsplat[2] = _mm_xor_ps(idirsplat_raw[2], pn); - - const __m128 signmask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); - const __m128 shuf_identity_f = _mm_castsi128_ps(shuf_identity); - const __m128 shuf_swap_f = _mm_castsi128_ps(shuf_swap); - shufflexyz[0] = _mm_castps_si128(_mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[0], signmask))); - shufflexyz[1] = _mm_castps_si128(_mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[1], signmask))); - shufflexyz[2] = _mm_castps_si128(_mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[2], signmask))); -} +#if defined(__KERNEL_AVX2__) + return _lzcnt_u32(x); #else -ccl_device_inline void gen_idirsplat_swap(const __m128 &pn, const shuffle_swap_t &shuf_identity, const shuffle_swap_t &shuf_swap, - const float3& idir, __m128 idirsplat[3], shuffle_swap_t shufflexyz[3]) -{ - idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), pn); - idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), pn); - idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), pn); - - shufflexyz[0] = (idir.x >= 0)? shuf_identity: shuf_swap; - shufflexyz[1] = (idir.y >= 0)? shuf_identity: shuf_swap; - shufflexyz[2] = (idir.z >= 0)? 
shuf_identity: shuf_swap; -} + if (UNLIKELY(x == 0)) return 32; + return 31 - __bsr(x); #endif +} -template<size_t i0, size_t i1, size_t i2, size_t i3> ccl_device_inline const __m128 shuffle(const __m128& a, const __m128& b) +__forceinline int __bscf(int& v) { - return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); + int i = __bsf(v); + v &= v-1; + return i; } -template<size_t i0, size_t i1, size_t i2, size_t i3> ccl_device_inline const __m128 shuffle(const __m128& a) +__forceinline unsigned int __bscf(unsigned int& v) { - return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(i3, i2, i1, i0))); + unsigned int i = __bsf(v); + v &= v-1; + return i; } -template<> __forceinline const __m128 shuffle<0, 1, 0, 1>(const __m128& a) -{ - return _mm_movelh_ps(a, a); +#if defined(__KERNEL_64_BIT__) + +__forceinline size_t __bsf(size_t v) { +#if defined(__KERNEL_AVX2__) + return _tzcnt_u64(v); +#else + unsigned long r = 0; _BitScanForward64(&r,v); return r; +#endif } -template<> __forceinline const __m128 shuffle<2, 3, 2, 3>(const __m128& a) -{ - return _mm_movehl_ps(a, a); +__forceinline size_t __bsr(size_t v) { + unsigned long r = 0; _BitScanReverse64(&r,v); return r; } -template<size_t i0, size_t i1, size_t i2, size_t i3> ccl_device_inline const __m128i shuffle(const __m128i& a) -{ - return _mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0)); +__forceinline size_t __btc(size_t v, size_t i) { + size_t r = v; _bittestandcomplement64((__int64*)&r,i); return r; } -template<size_t i0, size_t i1, size_t i2, size_t i3> ccl_device_inline const __m128i shuffle(const __m128i& a, const __m128i& b) -{ - return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); +__forceinline size_t __bts(size_t v, size_t i) { + __int64 r = v; _bittestandset64(&r,i); return r; } -/* Blend 2 vectors based on mask: (a[i] & mask[i]) | (b[i] & ~mask[i]) */ -#ifdef __KERNEL_SSE41__ -ccl_device_inline const __m128 blend(const __m128& 
mask, const __m128& a, const __m128& b) -{ - return _mm_blendv_ps(b, a, mask); +__forceinline size_t __btr(size_t v, size_t i) { + __int64 r = v; _bittestandreset64(&r,i); return r; } + +__forceinline size_t bitscan(size_t v) { +#if defined(__KERNEL_AVX2__) +#if defined(__KERNEL_64_BIT__) + return _tzcnt_u64(v); #else -ccl_device_inline const __m128 blend(const __m128& mask, const __m128& a, const __m128& b) -{ - return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b)); -} + return _tzcnt_u32(v); #endif +#else + return __bsf(v); +#endif +} -/* calculate a*b+c (replacement for fused multiply-add on SSE CPUs) */ -ccl_device_inline const __m128 fma(const __m128& a, const __m128& b, const __m128& c) +__forceinline size_t __bscf(size_t& v) { - return _mm_add_ps(_mm_mul_ps(a, b), c); + size_t i = __bsf(v); + v &= v-1; + return i; } -/* calculate a*b-c (replacement for fused multiply-subtract on SSE CPUs) */ -ccl_device_inline const __m128 fms(const __m128& a, const __m128& b, const __m128& c) -{ - return _mm_sub_ps(_mm_mul_ps(a, b), c); +#endif /* __KERNEL_64_BIT__ */ + +#else /* _WIN32 */ + +__forceinline unsigned int __popcnt(unsigned int in) { + int r = 0; asm ("popcnt %1,%0" : "=r"(r) : "r"(in)); return r; } -/* calculate -a*b+c (replacement for fused negated-multiply-subtract on SSE CPUs) */ -ccl_device_inline const __m128 fnma(const __m128& a, const __m128& b, const __m128& c) -{ - return _mm_sub_ps(c, _mm_mul_ps(a, b)); +__forceinline int __bsf(int v) { + int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r; } -template<size_t N> ccl_device_inline const __m128 broadcast(const __m128& a) -{ - return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(N, N, N, N))); +__forceinline int __bsr(int v) { + int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r; } -template<size_t N> ccl_device_inline const __m128i broadcast(const __m128i& a) -{ - return _mm_shuffle_epi32(a, _MM_SHUFFLE(N, N, N, N)); +__forceinline int __btc(int v, int i) { + 
int r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r; } -ccl_device_inline const __m128 uint32_to_float(const __m128i &in) -{ - __m128i a = _mm_srli_epi32(in, 16); - __m128i b = _mm_and_si128(in, _mm_set1_epi32(0x0000ffff)); - __m128i c = _mm_or_si128(a, _mm_set1_epi32(0x53000000)); - __m128 d = _mm_cvtepi32_ps(b); - __m128 e = _mm_sub_ps(_mm_castsi128_ps(c), _mm_castsi128_ps(_mm_set1_epi32(0x53000000))); - return _mm_add_ps(e, d); +__forceinline int __bts(int v, int i) { + int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; } -template<size_t S1, size_t S2, size_t S3, size_t S4> -ccl_device_inline const __m128 set_sign_bit(const __m128 &a) -{ - return _mm_xor_ps(a, _mm_castsi128_ps(_mm_setr_epi32(S1 << 31, S2 << 31, S3 << 31, S4 << 31))); +__forceinline int __btr(int v, int i) { + int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; } -#ifdef __KERNEL_WITH_SSE_ALIGN__ -ccl_device_inline const __m128 load_m128(const float4 &vec) -{ - return _mm_load_ps(&vec.x); +#if defined(__KERNEL_64_BIT__) || defined(__APPLE__) +__forceinline size_t __bsf(size_t v) { + size_t r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r; } +#endif -ccl_device_inline const __m128 load_m128(const float3 &vec) -{ - return _mm_load_ps(&vec.x); +__forceinline unsigned int __bsf(unsigned int v) { + unsigned int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r; } -#else +__forceinline size_t __bsr(size_t v) { + size_t r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r; +} -ccl_device_inline const __m128 load_m128(const float4 &vec) -{ - return _mm_loadu_ps(&vec.x); +__forceinline size_t __btc(size_t v, size_t i) { + size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r; } -ccl_device_inline const __m128 load_m128(const float3 &vec) -{ - return _mm_loadu_ps(&vec.x); +__forceinline size_t __bts(size_t v, size_t i) { + size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); 
return r; } -#endif /* __KERNEL_WITH_SSE_ALIGN__ */ -ccl_device_inline const __m128 dot3_splat(const __m128& a, const __m128& b) -{ -#ifdef __KERNEL_SSE41__ - return _mm_dp_ps(a, b, 0x7f); +__forceinline size_t __btr(size_t v, size_t i) { + size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; +} + +__forceinline int bitscan(int v) { +#if defined(__KERNEL_AVX2__) + return _tzcnt_u32(v); #else - __m128 t = _mm_mul_ps(a, b); - return _mm_set1_ps(((float*)&t)[0] + ((float*)&t)[1] + ((float*)&t)[2]); + return __bsf(v); #endif } -/* squared length taking only specified axes into account */ -template<size_t X, size_t Y, size_t Z, size_t W> -ccl_device_inline float len_squared(const __m128& a) -{ -#ifndef __KERNEL_SSE41__ - float4& t = (float4 &)a; - return (X ? t.x * t.x : 0.0f) + (Y ? t.y * t.y : 0.0f) + (Z ? t.z * t.z : 0.0f) + (W ? t.w * t.w : 0.0f); +__forceinline unsigned int bitscan(unsigned int v) { +#if defined(__KERNEL_AVX2__) + return _tzcnt_u32(v); #else - return _mm_cvtss_f32(_mm_dp_ps(a, a, (X << 4) | (Y << 5) | (Z << 6) | (W << 7) | 0xf)); + return __bsf(v); #endif } -ccl_device_inline float dot3(const __m128& a, const __m128& b) -{ -#ifdef __KERNEL_SSE41__ - return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7f)); +#if defined(__KERNEL_64_BIT__) || defined(__APPLE__) +__forceinline size_t bitscan(size_t v) { +#if defined(__KERNEL_AVX2__) +#if defined(__KERNEL_64_BIT__) + return _tzcnt_u64(v); +#else + return _tzcnt_u32(v); +#endif #else - __m128 t = _mm_mul_ps(a, b); - return ((float*)&t)[0] + ((float*)&t)[1] + ((float*)&t)[2]; + return __bsf(v); #endif } +#endif -ccl_device_inline const __m128 len3_squared_splat(const __m128& a) +__forceinline int clz(const int x) { - return dot3_splat(a, a); +#if defined(__KERNEL_AVX2__) + return _lzcnt_u32(x); +#else + if (UNLIKELY(x == 0)) return 32; + return 31 - __bsr(x); +#endif } -ccl_device_inline float len3_squared(const __m128& a) +__forceinline int __bscf(int& v) { - return dot3(a, a); + int i = 
bitscan(v); +#if defined(__KERNEL_AVX2__) + v &= v-1; +#else + v = __btc(v,i); +#endif + return i; } -ccl_device_inline float len3(const __m128& a) +__forceinline unsigned int __bscf(unsigned int& v) { - return _mm_cvtss_f32(_mm_sqrt_ss(dot3_splat(a, a))); + unsigned int i = bitscan(v); + v &= v-1; + return i; } -/* calculate shuffled cross product, useful when order of components does not matter */ -ccl_device_inline const __m128 cross_zxy(const __m128& a, const __m128& b) +#if defined(__KERNEL_64_BIT__) || defined(__APPLE__) +__forceinline size_t __bscf(size_t& v) { - return fms(a, shuffle<1, 2, 0, 3>(b), _mm_mul_ps(b, shuffle<1, 2, 0, 3>(a))); + size_t i = bitscan(v); +#if defined(__KERNEL_AVX2__) + v &= v-1; +#else + v = __btc(v,i); +#endif + return i; +} +#endif + +#endif /* _WIN32 */ + +static const unsigned int BITSCAN_NO_BIT_SET_32 = 32; +static const size_t BITSCAN_NO_BIT_SET_64 = 64; + +/* Emulation of SSE4 functions with SSE3 */ + +#if defined(__KERNEL_SSE3) && !defined(__KERNEL_SSE4__) + +#define _MM_FROUND_TO_NEAREST_INT 0x00 +#define _MM_FROUND_TO_NEG_INF 0x01 +#define _MM_FROUND_TO_POS_INF 0x02 +#define _MM_FROUND_TO_ZERO 0x03 +#define _MM_FROUND_CUR_DIRECTION 0x04 + +#define _mm_blendv_ps __emu_mm_blendv_ps +__forceinline __m128 _mm_blendv_ps( __m128 value, __m128 input, __m128 mask ) { + return _mm_or_ps(_mm_and_ps(mask, input), _mm_andnot_ps(mask, value)); +} + +#define _mm_blend_ps __emu_mm_blend_ps +__forceinline __m128 _mm_blend_ps( __m128 value, __m128 input, const int mask ) { + assert(mask < 0x10); return _mm_blendv_ps(value, input, _mm_lookupmask_ps[mask]); +} + +#define _mm_blendv_epi8 __emu_mm_blendv_epi8 +__forceinline __m128i _mm_blendv_epi8( __m128i value, __m128i input, __m128i mask ) { + return _mm_or_si128(_mm_and_si128(mask, input), _mm_andnot_si128(mask, value)); +} + +#define _mm_mullo_epi32 __emu_mm_mullo_epi32 +__forceinline __m128i _mm_mullo_epi32( __m128i value, __m128i input ) { + __m128i rvalue; + char* _r = (char*)(&rvalue 
+ 1); + char* _v = (char*)(& value + 1); + char* _i = (char*)(& input + 1); + for ( ssize_t i = -16 ; i != 0 ; i += 4 ) *((int32*)(_r + i)) = *((int32*)(_v + i))* *((int32*)(_i + i)); + return rvalue; +} + + +#define _mm_min_epi32 __emu_mm_min_epi32 +__forceinline __m128i _mm_min_epi32( __m128i value, __m128i input ) { + return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(value, input)); +} + +#define _mm_max_epi32 __emu_mm_max_epi32 +__forceinline __m128i _mm_max_epi32( __m128i value, __m128i input ) { + return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input)); +} + +#define _mm_extract_epi32 __emu_mm_extract_epi32 +__forceinline int _mm_extract_epi32( __m128i input, const int index ) { + switch ( index ) { + case 0: return _mm_cvtsi128_si32(input); + case 1: return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(1, 1, 1, 1))); + case 2: return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(2, 2, 2, 2))); + case 3: return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(3, 3, 3, 3))); + default: assert(false); return 0; + } +} + +#define _mm_insert_epi32 __emu_mm_insert_epi32 +__forceinline __m128i _mm_insert_epi32( __m128i value, int input, const int index ) { + assert(index >= 0 && index < 4); ((int*)&value)[index] = input; return value; } -ccl_device_inline const __m128 cross(const __m128& a, const __m128& b) +#define _mm_extract_ps __emu_mm_extract_ps +__forceinline int _mm_extract_ps( __m128 input, const int index ) { + int32* ptr = (int32*)&input; return ptr[index]; +} + +#define _mm_insert_ps __emu_mm_insert_ps +__forceinline __m128 _mm_insert_ps( __m128 value, __m128 input, const int index ) +{ assert(index < 0x100); ((float*)&value)[(index >> 4)&0x3] = ((float*)&input)[index >> 6]; return _mm_andnot_ps(_mm_lookupmask_ps[index&0xf], value); } + +#define _mm_round_ps __emu_mm_round_ps +__forceinline __m128 _mm_round_ps( __m128 value, const int flags ) { - return shuffle<1, 2, 0, 3>(cross_zxy(a, b)); + switch ( flags ) + { + 
case _MM_FROUND_TO_NEAREST_INT: return _mm_cvtepi32_ps(_mm_cvtps_epi32(value)); + case _MM_FROUND_TO_NEG_INF : return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps(-0.5f)))); + case _MM_FROUND_TO_POS_INF : return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps( 0.5f)))); + case _MM_FROUND_TO_ZERO : return _mm_cvtepi32_ps(_mm_cvttps_epi32(value)); + } + return value; +} + +#ifdef _M_X64 +#define _mm_insert_epi64 __emu_mm_insert_epi64 +__forceinline __m128i _mm_insert_epi64( __m128i value, __int64 input, const int index ) { + assert(size_t(index) < 4); ((__int64*)&value)[index] = input; return value; +} + +#define _mm_extract_epi64 __emu_mm_extract_epi64 +__forceinline __int64 _mm_extract_epi64( __m128i input, const int index ) { + assert(size_t(index) < 2); + return index == 0 ? _mm_cvtsi128_si64x(input) : _mm_cvtsi128_si64x(_mm_unpackhi_epi64(input, input)); } +#endif + +#endif #endif /* __KERNEL_SSE2__ */ CCL_NAMESPACE_END -#endif /* __UTIL_SIMD_H__ */ +#include "util_math.h" +#include "util_sseb.h" +#include "util_ssei.h" +#include "util_ssef.h" + +#endif /* __UTIL_SIMD_TYPES_H__ */ diff --git a/intern/cycles/util/util_sseb.h b/intern/cycles/util/util_sseb.h new file mode 100644 index 00000000000..be510256dd3 --- /dev/null +++ b/intern/cycles/util/util_sseb.h @@ -0,0 +1,161 @@ +/* + * Copyright 2011-2013 Intel Corporation + * Modifications Copyright 2014, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0(the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License + */ + +#ifndef __UTIL_SSEB_H__ +#define __UTIL_SSEB_H__ + +CCL_NAMESPACE_BEGIN + +#ifdef __KERNEL_SSE2__ + +/*! 4-wide SSE bool type. */ +struct sseb +{ + typedef sseb Mask; // mask type + typedef ssei Int; // int type + typedef ssef Float; // float type + + enum { size = 4 }; // number of SIMD elements + union { __m128 m128; int32_t v[4]; }; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline sseb ( ) {} + __forceinline sseb ( const sseb& other ) { m128 = other.m128; } + __forceinline sseb& operator=( const sseb& other ) { m128 = other.m128; return *this; } + + __forceinline sseb( const __m128 input ) : m128(input) {} + __forceinline operator const __m128&( void ) const { return m128; } + __forceinline operator const __m128i( void ) const { return _mm_castps_si128(m128); } + __forceinline operator const __m128d( void ) const { return _mm_castps_pd(m128); } + + __forceinline sseb ( bool a ) + : m128(_mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {} + __forceinline sseb ( bool a, bool b) + : m128(_mm_lookupmask_ps[(size_t(b) << 3) | (size_t(a) << 2) | (size_t(b) << 1) | size_t(a)]) {} + __forceinline sseb ( bool a, bool b, bool c, bool d) + : m128(_mm_lookupmask_ps[(size_t(d) << 3) | (size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {} + __forceinline sseb(int mask) { + assert(mask >= 0 && mask < 16); + m128 = _mm_lookupmask_ps[mask]; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline sseb( FalseTy ) : m128(_mm_setzero_ps()) {} + __forceinline sseb( TrueTy ) : 
m128(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator []( const size_t i ) const { assert(i < 4); return (_mm_movemask_ps(m128) >> i) & 1; } + __forceinline int32_t& operator []( const size_t i ) { assert(i < 4); return v[i]; } +}; + +//////////////////////////////////////////////////////////////////////////////// +/// Unary Operators +//////////////////////////////////////////////////////////////////////////////// + +__forceinline const sseb operator !( const sseb& a ) { return _mm_xor_ps(a, sseb(True)); } + +//////////////////////////////////////////////////////////////////////////////// +/// Binary Operators +//////////////////////////////////////////////////////////////////////////////// + +__forceinline const sseb operator &( const sseb& a, const sseb& b ) { return _mm_and_ps(a, b); } +__forceinline const sseb operator |( const sseb& a, const sseb& b ) { return _mm_or_ps (a, b); } +__forceinline const sseb operator ^( const sseb& a, const sseb& b ) { return _mm_xor_ps(a, b); } + +//////////////////////////////////////////////////////////////////////////////// +/// Assignment Operators +//////////////////////////////////////////////////////////////////////////////// + +__forceinline const sseb operator &=( sseb& a, const sseb& b ) { return a = a & b; } +__forceinline const sseb operator |=( sseb& a, const sseb& b ) { return a = a | b; } +__forceinline const sseb operator ^=( sseb& a, const sseb& b ) { return a = a ^ b; } + +//////////////////////////////////////////////////////////////////////////////// +/// Comparison Operators + Select +//////////////////////////////////////////////////////////////////////////////// + +__forceinline const sseb operator !=( const sseb& a, const sseb& b ) { return _mm_xor_ps(a, b); 
} +__forceinline const sseb operator ==( const sseb& a, const sseb& b ) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); } + +__forceinline const sseb select( const sseb& m, const sseb& t, const sseb& f ) { +#if defined(__KERNEL_SSE41__) + return _mm_blendv_ps(f, t, m); +#else + return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); +#endif +} + +//////////////////////////////////////////////////////////////////////////////// +/// Movement/Shifting/Shuffling Functions +//////////////////////////////////////////////////////////////////////////////// + +__forceinline const sseb unpacklo( const sseb& a, const sseb& b ) { return _mm_unpacklo_ps(a, b); } +__forceinline const sseb unpackhi( const sseb& a, const sseb& b ) { return _mm_unpackhi_ps(a, b); } + +template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const sseb shuffle( const sseb& a ) { + return _mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0)); +} + +template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const sseb shuffle( const sseb& a, const sseb& b ) { + return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); +} + +#if defined(__KERNEL_SSE3__) +template<> __forceinline const sseb shuffle<0, 0, 2, 2>( const sseb& a ) { return _mm_moveldup_ps(a); } +template<> __forceinline const sseb shuffle<1, 1, 3, 3>( const sseb& a ) { return _mm_movehdup_ps(a); } +template<> __forceinline const sseb shuffle<0, 1, 0, 1>( const sseb& a ) { return _mm_castpd_ps(_mm_movedup_pd (a)); } +#endif + +#if defined(__KERNEL_SSE41__) +template<size_t dst, size_t src, size_t clr> __forceinline const sseb insert( const sseb& a, const sseb& b ) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); } +template<size_t dst, size_t src> __forceinline const sseb insert( const sseb& a, const sseb& b ) { return insert<dst, src, 0>(a, b); } +template<size_t dst> __forceinline const sseb insert( const sseb& a, const bool b ) { return insert<dst,0>(a, sseb(b)); } +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +/// Reduction Operations +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__KERNEL_SSE41__) +__forceinline size_t popcnt( const sseb& a ) { return __popcnt(_mm_movemask_ps(a)); } +#else +__forceinline size_t popcnt( const sseb& a ) { return bool(a[0])+bool(a[1])+bool(a[2])+bool(a[3]); } +#endif + +__forceinline bool reduce_and( const sseb& a ) { return _mm_movemask_ps(a) == 0xf; } +__forceinline bool reduce_or ( const sseb& a ) { return _mm_movemask_ps(a) != 0x0; } +__forceinline bool all ( const sseb& b ) { return _mm_movemask_ps(b) == 0xf; } +__forceinline bool any ( const sseb& b ) { return _mm_movemask_ps(b) != 0x0; } +__forceinline bool none ( const sseb& b ) { return _mm_movemask_ps(b) == 0x0; } + +__forceinline size_t movemask( const sseb& a ) { return _mm_movemask_ps(a); } + +#endif + +CCL_NAMESPACE_END + +#endif + diff --git a/intern/cycles/util/util_ssef.h b/intern/cycles/util/util_ssef.h new file mode 100644 index 00000000000..f4236cc616e --- /dev/null +++ b/intern/cycles/util/util_ssef.h @@ -0,0 +1,588 @@ +/* + * Copyright 2011-2013 Intel Corporation + * Modifications Copyright 2014, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0(the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License + */ + +#ifndef __UTIL_SSEF_H__ +#define __UTIL_SSEF_H__ + +CCL_NAMESPACE_BEGIN + +#ifdef __KERNEL_SSE2__ + +/*! 4-wide SSE float type. 
*/ +struct ssef +{ + typedef sseb Mask; // mask type + typedef ssei Int; // int type + typedef ssef Float; // float type + + enum { size = 4 }; // number of SIMD elements + union { __m128 m128; float f[4]; int i[4]; }; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline ssef () {} + __forceinline ssef (const ssef& other) { m128 = other.m128; } + __forceinline ssef& operator=(const ssef& other) { m128 = other.m128; return *this; } + + __forceinline ssef(const __m128 a) : m128(a) {} + __forceinline operator const __m128&(void) const { return m128; } + __forceinline operator __m128&(void) { return m128; } + + __forceinline ssef (float a) : m128(_mm_set1_ps(a)) {} + __forceinline ssef (float a, float b, float c, float d) : m128(_mm_setr_ps(a, b, c, d)) {} + + __forceinline explicit ssef(const __m128i a) : m128(_mm_cvtepi32_ps(a)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__KERNEL_AVX__) + static __forceinline ssef broadcast(const void* const a) { return _mm_broadcast_ss((float*)a); } +#else + static __forceinline ssef broadcast(const void* const a) { return _mm_set1_ps(*(float*)a); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const float& operator [](const size_t i) const { assert(i < 4); return f[i]; } + __forceinline float& operator [](const size_t i) { assert(i < 4); return f[i]; } +}; + + +//////////////////////////////////////////////////////////////////////////////// +/// Unary Operators 
+//////////////////////////////////////////////////////////////////////////////// + +__forceinline const ssef cast (const __m128i& a) { return _mm_castsi128_ps(a); } +__forceinline const ssef operator +(const ssef& a) { return a; } +__forceinline const ssef operator -(const ssef& a) { return _mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } +__forceinline const ssef abs (const ssef& a) { return _mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); } +#if defined(__KERNEL_SSE41__) +__forceinline const ssef sign (const ssef& a) { return _mm_blendv_ps(ssef(1.0f), -ssef(1.0f), _mm_cmplt_ps(a,ssef(0.0f))); } +#endif +__forceinline const ssef signmsk (const ssef& a) { return _mm_and_ps(a.m128,_mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } + +__forceinline const ssef rcp (const ssef& a) { + const ssef r = _mm_rcp_ps(a.m128); + return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); +} +__forceinline const ssef sqr (const ssef& a) { return _mm_mul_ps(a,a); } +__forceinline const ssef mm_sqrt(const ssef& a) { return _mm_sqrt_ps(a.m128); } +__forceinline const ssef rsqrt(const ssef& a) { + const ssef r = _mm_rsqrt_ps(a.m128); + return _mm_add_ps(_mm_mul_ps(_mm_set_ps(1.5f, 1.5f, 1.5f, 1.5f), r), + _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set_ps(-0.5f, -0.5f, -0.5f, -0.5f)), r), _mm_mul_ps(r, r))); +} + +//////////////////////////////////////////////////////////////////////////////// +/// Binary Operators +//////////////////////////////////////////////////////////////////////////////// + +__forceinline const ssef operator +(const ssef& a, const ssef& b) { return _mm_add_ps(a.m128, b.m128); } +__forceinline const ssef operator +(const ssef& a, const float& b) { return a + ssef(b); } +__forceinline const ssef operator +(const float& a, const ssef& b) { return ssef(a) + b; } + +__forceinline const ssef operator -(const ssef& a, const ssef& b) { return _mm_sub_ps(a.m128, b.m128); } +__forceinline const ssef operator -(const ssef& a, 
const float& b) { return a - ssef(b); } +__forceinline const ssef operator -(const float& a, const ssef& b) { return ssef(a) - b; } + +__forceinline const ssef operator *(const ssef& a, const ssef& b) { return _mm_mul_ps(a.m128, b.m128); } +__forceinline const ssef operator *(const ssef& a, const float& b) { return a * ssef(b); } +__forceinline const ssef operator *(const float& a, const ssef& b) { return ssef(a) * b; } + +__forceinline const ssef operator /(const ssef& a, const ssef& b) { return _mm_div_ps(a.m128,b.m128); } +__forceinline const ssef operator /(const ssef& a, const float& b) { return a/ssef(b); } +__forceinline const ssef operator /(const float& a, const ssef& b) { return ssef(a)/b; } + +__forceinline const ssef operator^(const ssef& a, const ssef& b) { return _mm_xor_ps(a.m128,b.m128); } +__forceinline const ssef operator^(const ssef& a, const ssei& b) { return _mm_xor_ps(a.m128,_mm_castsi128_ps(b.m128)); } + +__forceinline const ssef operator&(const ssef& a, const ssef& b) { return _mm_and_ps(a.m128,b.m128); } +__forceinline const ssef operator&(const ssef& a, const ssei& b) { return _mm_and_ps(a.m128,_mm_castsi128_ps(b.m128)); } + +__forceinline const ssef andnot(const ssef& a, const ssef& b) { return _mm_andnot_ps(a.m128,b.m128); } + +__forceinline const ssef min(const ssef& a, const ssef& b) { return _mm_min_ps(a.m128,b.m128); } +__forceinline const ssef min(const ssef& a, const float& b) { return _mm_min_ps(a.m128,ssef(b)); } +__forceinline const ssef min(const float& a, const ssef& b) { return _mm_min_ps(ssef(a),b.m128); } + +__forceinline const ssef max(const ssef& a, const ssef& b) { return _mm_max_ps(a.m128,b.m128); } +__forceinline const ssef max(const ssef& a, const float& b) { return _mm_max_ps(a.m128,ssef(b)); } +__forceinline const ssef max(const float& a, const ssef& b) { return _mm_max_ps(ssef(a),b.m128); } + +#if defined(__KERNEL_SSE41__) +__forceinline ssef mini(const ssef& a, const ssef& b) { + const ssei ai = 
_mm_castps_si128(a); + const ssei bi = _mm_castps_si128(b); + const ssei ci = _mm_min_epi32(ai,bi); + return _mm_castsi128_ps(ci); +} +#endif + +#if defined(__KERNEL_SSE41__) +__forceinline ssef maxi(const ssef& a, const ssef& b) { + const ssei ai = _mm_castps_si128(a); + const ssei bi = _mm_castps_si128(b); + const ssei ci = _mm_max_epi32(ai,bi); + return _mm_castsi128_ps(ci); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Ternary Operators +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__KERNEL_AVX2__) +__forceinline const ssef madd (const ssef& a, const ssef& b, const ssef& c) { return _mm_fmadd_ps(a,b,c); } +__forceinline const ssef msub (const ssef& a, const ssef& b, const ssef& c) { return _mm_fmsub_ps(a,b,c); } +__forceinline const ssef nmadd(const ssef& a, const ssef& b, const ssef& c) { return _mm_fnmadd_ps(a,b,c); } +__forceinline const ssef nmsub(const ssef& a, const ssef& b, const ssef& c) { return _mm_fnmsub_ps(a,b,c); } +#else +__forceinline const ssef madd (const ssef& a, const ssef& b, const ssef& c) { return a*b+c; } +__forceinline const ssef msub (const ssef& a, const ssef& b, const ssef& c) { return a*b-c; } +__forceinline const ssef nmadd(const ssef& a, const ssef& b, const ssef& c) { return -a*b-c;} +__forceinline const ssef nmsub(const ssef& a, const ssef& b, const ssef& c) { return c-a*b; } +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Assignment Operators +//////////////////////////////////////////////////////////////////////////////// + +__forceinline ssef& operator +=(ssef& a, const ssef& b) { return a = a + b; } +__forceinline ssef& operator +=(ssef& a, const float& b) { return a = a + b; } + +__forceinline ssef& operator -=(ssef& a, const ssef& b) { return a = a - b; } +__forceinline ssef& operator -=(ssef& a, const float& b) { return a = a - b; } + +__forceinline ssef& operator 
*=(ssef& a, const ssef& b) { return a = a * b; } +__forceinline ssef& operator *=(ssef& a, const float& b) { return a = a * b; } + +__forceinline ssef& operator /=(ssef& a, const ssef& b) { return a = a / b; } +__forceinline ssef& operator /=(ssef& a, const float& b) { return a = a / b; } + +//////////////////////////////////////////////////////////////////////////////// +/// Comparison Operators + Select +//////////////////////////////////////////////////////////////////////////////// + +__forceinline const sseb operator ==(const ssef& a, const ssef& b) { return _mm_cmpeq_ps(a.m128, b.m128); } +__forceinline const sseb operator ==(const ssef& a, const float& b) { return a == ssef(b); } +__forceinline const sseb operator ==(const float& a, const ssef& b) { return ssef(a) == b; } + +__forceinline const sseb operator !=(const ssef& a, const ssef& b) { return _mm_cmpneq_ps(a.m128, b.m128); } +__forceinline const sseb operator !=(const ssef& a, const float& b) { return a != ssef(b); } +__forceinline const sseb operator !=(const float& a, const ssef& b) { return ssef(a) != b; } + +__forceinline const sseb operator <(const ssef& a, const ssef& b) { return _mm_cmplt_ps(a.m128, b.m128); } +__forceinline const sseb operator <(const ssef& a, const float& b) { return a < ssef(b); } +__forceinline const sseb operator <(const float& a, const ssef& b) { return ssef(a) < b; } + +__forceinline const sseb operator >=(const ssef& a, const ssef& b) { return _mm_cmpnlt_ps(a.m128, b.m128); } +__forceinline const sseb operator >=(const ssef& a, const float& b) { return a >= ssef(b); } +__forceinline const sseb operator >=(const float& a, const ssef& b) { return ssef(a) >= b; } + +__forceinline const sseb operator >(const ssef& a, const ssef& b) { return _mm_cmpnle_ps(a.m128, b.m128); } +__forceinline const sseb operator >(const ssef& a, const float& b) { return a > ssef(b); } +__forceinline const sseb operator >(const float& a, const ssef& b) { return ssef(a) > b; } + +__forceinline 
const sseb operator <=(const ssef& a, const ssef& b) { return _mm_cmple_ps(a.m128, b.m128); } +__forceinline const sseb operator <=(const ssef& a, const float& b) { return a <= ssef(b); } +__forceinline const sseb operator <=(const float& a, const ssef& b) { return ssef(a) <= b; } + +__forceinline const ssef select(const sseb& m, const ssef& t, const ssef& f) { +#ifdef __KERNEL_SSE41__ + return _mm_blendv_ps(f, t, m); +#else + return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); +#endif +} + +__forceinline const ssef select(const ssef& m, const ssef& t, const ssef& f) { +#ifdef __KERNEL_SSE41__ + return _mm_blendv_ps(f, t, m); +#else + return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); +#endif +} + +__forceinline const ssef select(const int mask, const ssef& t, const ssef& f) { +#if defined(__KERNEL_SSE41__) && ((!defined(__clang__) && !defined(_MSC_VER)) || defined(__INTEL_COMPILER)) + return _mm_blend_ps(f, t, mask); +#else + return select(sseb(mask),t,f); +#endif +} + +//////////////////////////////////////////////////////////////////////////////// +/// Rounding Functions +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__KERNEL_SSE41__) +__forceinline const ssef round_even(const ssef& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } +__forceinline const ssef round_down(const ssef& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); } +__forceinline const ssef round_up (const ssef& a) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); } +__forceinline const ssef round_zero(const ssef& a) { return _mm_round_ps(a, _MM_FROUND_TO_ZERO ); } +__forceinline const ssef floor (const ssef& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); } +__forceinline const ssef ceil (const ssef& a) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); } +#endif + +__forceinline ssei truncatei(const ssef& a) { + return _mm_cvttps_epi32(a.m128); +} + +__forceinline ssei floori(const ssef& a) { +#if 
defined(__KERNEL_SSE41__) + return ssei(floor(a)); +#else + return ssei(a-ssef(0.5f)); +#endif +} + +//////////////////////////////////////////////////////////////////////////////// +/// Movement/Shifting/Shuffling Functions +//////////////////////////////////////////////////////////////////////////////// + +__forceinline ssef unpacklo(const ssef& a, const ssef& b) { return _mm_unpacklo_ps(a.m128, b.m128); } +__forceinline ssef unpackhi(const ssef& a, const ssef& b) { return _mm_unpackhi_ps(a.m128, b.m128); } + +template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const ssef shuffle(const ssef& b) { + return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0))); +} + +template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const ssef shuffle(const ssef& a, const ssef& b) { + return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); +} + +#if defined(__KERNEL_SSSE3__) +__forceinline const ssef shuffle8(const ssef& a, const ssei& shuf) { + return _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(a), shuf)); +} +#endif + +#if defined(__KERNEL_SSE3__) +template<> __forceinline const ssef shuffle<0, 0, 2, 2>(const ssef& b) { return _mm_moveldup_ps(b); } +template<> __forceinline const ssef shuffle<1, 1, 3, 3>(const ssef& b) { return _mm_movehdup_ps(b); } +template<> __forceinline const ssef shuffle<0, 1, 0, 1>(const ssef& b) { return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b))); } +#endif + +template<size_t i0> __forceinline const ssef shuffle(const ssef& b) { + return shuffle<i0,i0,i0,i0>(b); +} + +#if defined(__KERNEL_SSE41__) && !defined(__GNUC__) +template<size_t i> __forceinline float extract (const ssef& a) { return _mm_cvtss_f32(_mm_extract_ps(a,i)); } +#else +template<size_t i> __forceinline float extract (const ssef& a) { return _mm_cvtss_f32(shuffle<i,i,i,i>(a)); } +#endif +template<> __forceinline float extract<0>(const ssef& a) { return _mm_cvtss_f32(a); } + +#if defined(__KERNEL_SSE41__) 
+template<size_t dst, size_t src, size_t clr> __forceinline const ssef insert(const ssef& a, const ssef& b) { return _mm_insert_ps(a, b,(dst << 4) |(src << 6) | clr); } +template<size_t dst, size_t src> __forceinline const ssef insert(const ssef& a, const ssef& b) { return insert<dst, src, 0>(a, b); } +template<size_t dst> __forceinline const ssef insert(const ssef& a, const float b) { return insert<dst, 0>(a, _mm_set_ss(b)); } +#else +template<size_t dst> __forceinline const ssef insert(const ssef& a, const float b) { ssef c = a; c[dst] = b; return c; } +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Transpose +//////////////////////////////////////////////////////////////////////////////// + +__forceinline void transpose(const ssef& r0, const ssef& r1, const ssef& r2, const ssef& r3, ssef& c0, ssef& c1, ssef& c2, ssef& c3) +{ + ssef l02 = unpacklo(r0,r2); + ssef h02 = unpackhi(r0,r2); + ssef l13 = unpacklo(r1,r3); + ssef h13 = unpackhi(r1,r3); + c0 = unpacklo(l02,l13); + c1 = unpackhi(l02,l13); + c2 = unpacklo(h02,h13); + c3 = unpackhi(h02,h13); +} + +__forceinline void transpose(const ssef& r0, const ssef& r1, const ssef& r2, const ssef& r3, ssef& c0, ssef& c1, ssef& c2) +{ + ssef l02 = unpacklo(r0,r2); + ssef h02 = unpackhi(r0,r2); + ssef l13 = unpacklo(r1,r3); + ssef h13 = unpackhi(r1,r3); + c0 = unpacklo(l02,l13); + c1 = unpackhi(l02,l13); + c2 = unpacklo(h02,h13); +} + +//////////////////////////////////////////////////////////////////////////////// +/// Reductions +//////////////////////////////////////////////////////////////////////////////// + +__forceinline const ssef vreduce_min(const ssef& v) { ssef h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); } +__forceinline const ssef vreduce_max(const ssef& v) { ssef h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); } +__forceinline const ssef vreduce_add(const ssef& v) { ssef h = shuffle<1,0,3,2>(v) + v ; return 
shuffle<2,3,0,1>(h) + h ; } + +__forceinline float reduce_min(const ssef& v) { return _mm_cvtss_f32(vreduce_min(v)); } +__forceinline float reduce_max(const ssef& v) { return _mm_cvtss_f32(vreduce_max(v)); } +__forceinline float reduce_add(const ssef& v) { return _mm_cvtss_f32(vreduce_add(v)); } + +__forceinline size_t select_min(const ssef& v) { return __bsf(movemask(v == vreduce_min(v))); } +__forceinline size_t select_max(const ssef& v) { return __bsf(movemask(v == vreduce_max(v))); } + +__forceinline size_t select_min(const sseb& valid, const ssef& v) { const ssef a = select(valid,v,ssef(pos_inf)); return __bsf(movemask(valid &(a == vreduce_min(a)))); } +__forceinline size_t select_max(const sseb& valid, const ssef& v) { const ssef a = select(valid,v,ssef(neg_inf)); return __bsf(movemask(valid &(a == vreduce_max(a)))); } + +//////////////////////////////////////////////////////////////////////////////// +/// Memory load and store operations +//////////////////////////////////////////////////////////////////////////////// + +__forceinline ssef load4f(const float4& a) { +#ifdef __KERNEL_WITH_SSE_ALIGN__ + return _mm_load_ps(&a.x); +#else + return _mm_loadu_ps(&a.x); +#endif +} + +__forceinline ssef load4f(const float3& a) { +#ifdef __KERNEL_WITH_SSE_ALIGN__ + return _mm_load_ps(&a.x); +#else + return _mm_loadu_ps(&a.x); +#endif +} + +__forceinline ssef load4f(const void* const a) { + return _mm_load_ps((float*)a); +} + +__forceinline ssef load1f_first(const float a) { + return _mm_set_ss(a); +} + +__forceinline void store4f(void* ptr, const ssef& v) { + _mm_store_ps((float*)ptr,v); +} + +__forceinline ssef loadu4f(const void* const a) { + return _mm_loadu_ps((float*)a); +} + +__forceinline void storeu4f(void* ptr, const ssef& v) { + _mm_storeu_ps((float*)ptr,v); +} + +__forceinline void store4f(const sseb& mask, void* ptr, const ssef& f) { +#if defined(__KERNEL_AVX__) + _mm_maskstore_ps((float*)ptr,(__m128i)mask,f); +#else + *(ssef*)ptr = 
select(mask,f,*(ssef*)ptr); +#endif +} + +__forceinline ssef load4f_nt(void* ptr) { +#if defined(__KERNEL_SSE41__) + return _mm_castsi128_ps(_mm_stream_load_si128((__m128i*)ptr)); +#else + return _mm_load_ps((float*)ptr); +#endif +} + +__forceinline void store4f_nt(void* ptr, const ssef& v) { +#if defined(__KERNEL_SSE41__) + _mm_stream_ps((float*)ptr,v); +#else + _mm_store_ps((float*)ptr,v); +#endif +} + +//////////////////////////////////////////////////////////////////////////////// +/// Euclidian Space Operators +//////////////////////////////////////////////////////////////////////////////// + +__forceinline float dot(const ssef& a, const ssef& b) { + return reduce_add(a*b); +} + +/* calculate shuffled cross product, useful when order of components does not matter */ +__forceinline ssef cross_zxy(const ssef& a, const ssef& b) +{ + const ssef a0 = a; + const ssef b0 = shuffle<1,2,0,3>(b); + const ssef a1 = shuffle<1,2,0,3>(a); + const ssef b1 = b; + return msub(a0,b0,a1*b1); +} + +__forceinline ssef cross(const ssef& a, const ssef& b) +{ + return shuffle<1,2,0,3>(cross_zxy(a, b)); +} + +ccl_device_inline const ssef dot3_splat(const ssef& a, const ssef& b) +{ +#ifdef __KERNEL_SSE41__ + return _mm_dp_ps(a.m128, b.m128, 0x7f); +#else + ssef t = a * b; + return ssef(((float*)&t)[0] + ((float*)&t)[1] + ((float*)&t)[2]); +#endif +} + +/* squared length taking only specified axes into account */ +template<size_t X, size_t Y, size_t Z, size_t W> +ccl_device_inline float len_squared(const ssef& a) +{ +#ifndef __KERNEL_SSE41__ + float4& t = (float4 &)a; + return (X ? t.x * t.x : 0.0f) + (Y ? t.y * t.y : 0.0f) + (Z ? t.z * t.z : 0.0f) + (W ? 
t.w * t.w : 0.0f); +#else + return extract<0>(ssef(_mm_dp_ps(a.m128, a.m128, (X << 4) | (Y << 5) | (Z << 6) | (W << 7) | 0xf))); +#endif +} + +ccl_device_inline float dot3(const ssef& a, const ssef& b) +{ +#ifdef __KERNEL_SSE41__ + return extract<0>(ssef(_mm_dp_ps(a.m128, b.m128, 0x7f))); +#else + ssef t = a * b; + return ((float*)&t)[0] + ((float*)&t)[1] + ((float*)&t)[2]; +#endif +} + +ccl_device_inline const ssef len3_squared_splat(const ssef& a) +{ + return dot3_splat(a, a); +} + +ccl_device_inline float len3_squared(const ssef& a) +{ + return dot3(a, a); +} + +ccl_device_inline float len3(const ssef& a) +{ + return extract<0>(mm_sqrt(dot3_splat(a, a))); +} + +/* SSE shuffle utility functions */ + +#ifdef __KERNEL_SSSE3__ + +/* faster version for SSSE3 */ +typedef ssei shuffle_swap_t; + +ccl_device_inline const shuffle_swap_t shuffle_swap_identity(void) +{ + return _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); +} + +ccl_device_inline const shuffle_swap_t shuffle_swap_swap(void) +{ + return _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); +} + +ccl_device_inline const ssef shuffle_swap(const ssef& a, const shuffle_swap_t& shuf) +{ + return cast(_mm_shuffle_epi8(cast(a), shuf)); +} + +#else + +/* somewhat slower version for SSE2 */ +typedef int shuffle_swap_t; + +ccl_device_inline const shuffle_swap_t shuffle_swap_identity(void) +{ + return 0; +} + +ccl_device_inline const shuffle_swap_t shuffle_swap_swap(void) +{ + return 1; +} + +ccl_device_inline const ssef shuffle_swap(const ssef& a, shuffle_swap_t shuf) +{ + /* shuffle value must be a constant, so we need to branch */ + if(shuf) + return ssef(_mm_shuffle_ps(a.m128, a.m128, _MM_SHUFFLE(1, 0, 3, 2))); + else + return ssef(_mm_shuffle_ps(a.m128, a.m128, _MM_SHUFFLE(3, 2, 1, 0))); +} + +#endif + +#ifdef __KERNEL_SSE41__ + +ccl_device_inline void gen_idirsplat_swap(const ssef &pn, const shuffle_swap_t &shuf_identity, const shuffle_swap_t &shuf_swap, + const float3& idir, 
ssef idirsplat[3], shuffle_swap_t shufflexyz[3]) +{ + const __m128 idirsplat_raw[] = { _mm_set_ps1(idir.x), _mm_set_ps1(idir.y), _mm_set_ps1(idir.z) }; + idirsplat[0] = _mm_xor_ps(idirsplat_raw[0], pn); + idirsplat[1] = _mm_xor_ps(idirsplat_raw[1], pn); + idirsplat[2] = _mm_xor_ps(idirsplat_raw[2], pn); + + const ssef signmask = cast(ssei(0x80000000)); + const ssef shuf_identity_f = cast(shuf_identity); + const ssef shuf_swap_f = cast(shuf_swap); + + shufflexyz[0] = _mm_castps_si128(_mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[0], signmask))); + shufflexyz[1] = _mm_castps_si128(_mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[1], signmask))); + shufflexyz[2] = _mm_castps_si128(_mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[2], signmask))); +} + +#else + +ccl_device_inline void gen_idirsplat_swap(const ssef &pn, const shuffle_swap_t &shuf_identity, const shuffle_swap_t &shuf_swap, + const float3& idir, ssef idirsplat[3], shuffle_swap_t shufflexyz[3]) +{ + idirsplat[0] = ssef(idir.x) ^ pn; + idirsplat[1] = ssef(idir.y) ^ pn; + idirsplat[2] = ssef(idir.z) ^ pn; + + shufflexyz[0] = (idir.x >= 0)? shuf_identity: shuf_swap; + shufflexyz[1] = (idir.y >= 0)? shuf_identity: shuf_swap; + shufflexyz[2] = (idir.z >= 0)? 
shuf_identity: shuf_swap; +} + +#endif + +ccl_device_inline const ssef uint32_to_float(const ssei &in) +{ + ssei a = _mm_srli_epi32(in, 16); + ssei b = _mm_and_si128(in, _mm_set1_epi32(0x0000ffff)); + ssei c = _mm_or_si128(a, _mm_set1_epi32(0x53000000)); + ssef d = _mm_cvtepi32_ps(b); + ssef e = _mm_sub_ps(_mm_castsi128_ps(c), _mm_castsi128_ps(_mm_set1_epi32(0x53000000))); + return _mm_add_ps(e, d); +} + +template<size_t S1, size_t S2, size_t S3, size_t S4> +ccl_device_inline const ssef set_sign_bit(const ssef &a) +{ + return a ^ cast(ssei(S1 << 31, S2 << 31, S3 << 31, S4 << 31)); +} + +#endif + +CCL_NAMESPACE_END + +#endif + diff --git a/intern/cycles/util/util_ssei.h b/intern/cycles/util/util_ssei.h new file mode 100644 index 00000000000..5f5a8686e35 --- /dev/null +++ b/intern/cycles/util/util_ssei.h @@ -0,0 +1,294 @@ +/* + * Copyright 2011-2013 Intel Corporation + * Modifications Copyright 2014, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0(the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License + */ + +#ifndef __UTIL_SSEI_H__ +#define __UTIL_SSEI_H__ + +CCL_NAMESPACE_BEGIN + +#ifdef __KERNEL_SSE2__ + +/*! 4-wide SSE integer type. 
*/ +struct ssei +{ + typedef sseb Mask; // mask type + typedef ssei Int; // int type + typedef ssef Float; // float type + + enum { size = 4 }; // number of SIMD elements + union { __m128i m128; int32_t i[4]; }; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline ssei ( ) {} + __forceinline ssei ( const ssei& a ) { m128 = a.m128; } + __forceinline ssei& operator=( const ssei& a ) { m128 = a.m128; return *this; } + + __forceinline ssei( const __m128i a ) : m128(a) {} + __forceinline operator const __m128i&( void ) const { return m128; } + __forceinline operator __m128i&( void ) { return m128; } + + __forceinline ssei ( const int a ) : m128(_mm_set1_epi32(a)) {} + __forceinline ssei ( int a, int b, int c, int d ) : m128(_mm_setr_epi32(a, b, c, d)) {} + + __forceinline explicit ssei( const __m128 a ) : m128(_mm_cvtps_epi32(a)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const int32_t& operator []( const size_t index ) const { assert(index < 4); return i[index]; } + __forceinline int32_t& operator []( const size_t index ) { assert(index < 4); return i[index]; } +}; + +//////////////////////////////////////////////////////////////////////////////// +/// Unary Operators +//////////////////////////////////////////////////////////////////////////////// + +__forceinline const ssei cast ( const __m128& a ) { return _mm_castps_si128(a); } +__forceinline const ssei operator +( const ssei& a ) { return a; } +__forceinline const ssei operator -( const ssei& a ) { return _mm_sub_epi32(_mm_setzero_si128(), a.m128); } +#if defined(__KERNEL_SSSE3__) +__forceinline const ssei abs ( const ssei& a ) { return _mm_abs_epi32(a.m128); } 
+#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Binary Operators +//////////////////////////////////////////////////////////////////////////////// + +__forceinline const ssei operator +( const ssei& a, const ssei& b ) { return _mm_add_epi32(a.m128, b.m128); } +__forceinline const ssei operator +( const ssei& a, const int32_t& b ) { return a + ssei(b); } +__forceinline const ssei operator +( const int32_t& a, const ssei& b ) { return ssei(a) + b; } + +__forceinline const ssei operator -( const ssei& a, const ssei& b ) { return _mm_sub_epi32(a.m128, b.m128); } +__forceinline const ssei operator -( const ssei& a, const int32_t& b ) { return a - ssei(b); } +__forceinline const ssei operator -( const int32_t& a, const ssei& b ) { return ssei(a) - b; } + +#if defined(__KERNEL_SSE41__) +__forceinline const ssei operator *( const ssei& a, const ssei& b ) { return _mm_mullo_epi32(a.m128, b.m128); } +__forceinline const ssei operator *( const ssei& a, const int32_t& b ) { return a * ssei(b); } +__forceinline const ssei operator *( const int32_t& a, const ssei& b ) { return ssei(a) * b; } +#endif + +__forceinline const ssei operator &( const ssei& a, const ssei& b ) { return _mm_and_si128(a.m128, b.m128); } +__forceinline const ssei operator &( const ssei& a, const int32_t& b ) { return a & ssei(b); } +__forceinline const ssei operator &( const int32_t& a, const ssei& b ) { return ssei(a) & b; } + +__forceinline const ssei operator |( const ssei& a, const ssei& b ) { return _mm_or_si128(a.m128, b.m128); } +__forceinline const ssei operator |( const ssei& a, const int32_t& b ) { return a | ssei(b); } +__forceinline const ssei operator |( const int32_t& a, const ssei& b ) { return ssei(a) | b; } + +__forceinline const ssei operator ^( const ssei& a, const ssei& b ) { return _mm_xor_si128(a.m128, b.m128); } +__forceinline const ssei operator ^( const ssei& a, const int32_t& b ) { return a ^ ssei(b); } +__forceinline const ssei 
operator ^( const int32_t& a, const ssei& b ) { return ssei(a) ^ b; } + +__forceinline const ssei operator <<( const ssei& a, const int32_t& n ) { return _mm_slli_epi32(a.m128, n); } +__forceinline const ssei operator >>( const ssei& a, const int32_t& n ) { return _mm_srai_epi32(a.m128, n); } + +__forceinline const ssei andnot(const ssei& a, const ssei& b) { return _mm_andnot_si128(a.m128,b.m128); } +__forceinline const ssei andnot(const sseb& a, const ssei& b) { return _mm_andnot_si128(cast(a.m128),b.m128); } +__forceinline const ssei andnot(const ssei& a, const sseb& b) { return _mm_andnot_si128(a.m128,cast(b.m128)); } + +__forceinline const ssei sra ( const ssei& a, const int32_t& b ) { return _mm_srai_epi32(a.m128, b); } +__forceinline const ssei srl ( const ssei& a, const int32_t& b ) { return _mm_srli_epi32(a.m128, b); } + +#if defined(__KERNEL_SSE41__) +__forceinline const ssei min( const ssei& a, const ssei& b ) { return _mm_min_epi32(a.m128, b.m128); } +__forceinline const ssei min( const ssei& a, const int32_t& b ) { return min(a,ssei(b)); } +__forceinline const ssei min( const int32_t& a, const ssei& b ) { return min(ssei(a),b); } + +__forceinline const ssei max( const ssei& a, const ssei& b ) { return _mm_max_epi32(a.m128, b.m128); } +__forceinline const ssei max( const ssei& a, const int32_t& b ) { return max(a,ssei(b)); } +__forceinline const ssei max( const int32_t& a, const ssei& b ) { return max(ssei(a),b); } +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Assignment Operators +//////////////////////////////////////////////////////////////////////////////// + +__forceinline ssei& operator +=( ssei& a, const ssei& b ) { return a = a + b; } +__forceinline ssei& operator +=( ssei& a, const int32_t& b ) { return a = a + b; } + +__forceinline ssei& operator -=( ssei& a, const ssei& b ) { return a = a - b; } +__forceinline ssei& operator -=( ssei& a, const int32_t& b ) { return a = a - b; } + +#if 
defined(__KERNEL_SSE41__) +__forceinline ssei& operator *=( ssei& a, const ssei& b ) { return a = a * b; } +__forceinline ssei& operator *=( ssei& a, const int32_t& b ) { return a = a * b; } +#endif + +__forceinline ssei& operator &=( ssei& a, const ssei& b ) { return a = a & b; } +__forceinline ssei& operator &=( ssei& a, const int32_t& b ) { return a = a & b; } + +__forceinline ssei& operator |=( ssei& a, const ssei& b ) { return a = a | b; } +__forceinline ssei& operator |=( ssei& a, const int32_t& b ) { return a = a | b; } + +__forceinline ssei& operator <<=( ssei& a, const int32_t& b ) { return a = a << b; } +__forceinline ssei& operator >>=( ssei& a, const int32_t& b ) { return a = a >> b; } + +//////////////////////////////////////////////////////////////////////////////// +/// Comparison Operators + Select +//////////////////////////////////////////////////////////////////////////////// + +__forceinline const sseb operator ==( const ssei& a, const ssei& b ) { return _mm_castsi128_ps(_mm_cmpeq_epi32 (a.m128, b.m128)); } +__forceinline const sseb operator ==( const ssei& a, const int32_t& b ) { return a == ssei(b); } +__forceinline const sseb operator ==( const int32_t& a, const ssei& b ) { return ssei(a) == b; } + +__forceinline const sseb operator !=( const ssei& a, const ssei& b ) { return !(a == b); } +__forceinline const sseb operator !=( const ssei& a, const int32_t& b ) { return a != ssei(b); } +__forceinline const sseb operator !=( const int32_t& a, const ssei& b ) { return ssei(a) != b; } + +__forceinline const sseb operator < ( const ssei& a, const ssei& b ) { return _mm_castsi128_ps(_mm_cmplt_epi32 (a.m128, b.m128)); } +__forceinline const sseb operator < ( const ssei& a, const int32_t& b ) { return a < ssei(b); } +__forceinline const sseb operator < ( const int32_t& a, const ssei& b ) { return ssei(a) < b; } + +__forceinline const sseb operator >=( const ssei& a, const ssei& b ) { return !(a < b); } +__forceinline const sseb operator >=( const 
ssei& a, const int32_t& b ) { return a >= ssei(b); } +__forceinline const sseb operator >=( const int32_t& a, const ssei& b ) { return ssei(a) >= b; } + +__forceinline const sseb operator > ( const ssei& a, const ssei& b ) { return _mm_castsi128_ps(_mm_cmpgt_epi32 (a.m128, b.m128)); } +__forceinline const sseb operator > ( const ssei& a, const int32_t& b ) { return a > ssei(b); } +__forceinline const sseb operator > ( const int32_t& a, const ssei& b ) { return ssei(a) > b; } + +__forceinline const sseb operator <=( const ssei& a, const ssei& b ) { return !(a > b); } +__forceinline const sseb operator <=( const ssei& a, const int32_t& b ) { return a <= ssei(b); } +__forceinline const sseb operator <=( const int32_t& a, const ssei& b ) { return ssei(a) <= b; } + +__forceinline const ssei select( const sseb& m, const ssei& t, const ssei& f ) { +#ifdef __KERNEL_SSE41__ + return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); +#else + return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f)); +#endif +} + +__forceinline const ssei select( const int mask, const ssei& t, const ssei& f ) { +#if defined(__KERNEL_SSE41__) && ((!defined(__clang__) && !defined(_MSC_VER)) || defined(__INTEL_COMPILER)) + return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask)); +#else + return select(sseb(mask),t,f); +#endif +} + +//////////////////////////////////////////////////////////////////////////////// +// Movement/Shifting/Shuffling Functions +//////////////////////////////////////////////////////////////////////////////// + +__forceinline ssei unpacklo( const ssei& a, const ssei& b ) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a.m128), _mm_castsi128_ps(b.m128))); } +__forceinline ssei unpackhi( const ssei& a, const ssei& b ) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a.m128), _mm_castsi128_ps(b.m128))); } + +template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const ssei 
shuffle( const ssei& a ) { + return _mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0)); +} + +template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const ssei shuffle( const ssei& a, const ssei& b ) { + return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); +} + +#if defined(__KERNEL_SSE3__) +template<> __forceinline const ssei shuffle<0, 0, 2, 2>( const ssei& a ) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(a))); } +template<> __forceinline const ssei shuffle<1, 1, 3, 3>( const ssei& a ) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(a))); } +template<> __forceinline const ssei shuffle<0, 1, 0, 1>( const ssei& a ) { return _mm_castpd_si128(_mm_movedup_pd (_mm_castsi128_pd(a))); } +#endif + +template<size_t i0> __forceinline const ssei shuffle( const ssei& b ) { + return shuffle<i0,i0,i0,i0>(b); +} + +#if defined(__KERNEL_SSE41__) +template<size_t src> __forceinline int extract( const ssei& b ) { return _mm_extract_epi32(b, src); } +template<size_t dst> __forceinline const ssei insert( const ssei& a, const int32_t b ) { return _mm_insert_epi32(a, b, dst); } +#else +template<size_t src> __forceinline int extract( const ssei& b ) { return b[src]; } +template<size_t dst> __forceinline const ssei insert( const ssei& a, const int32_t b ) { ssei c = a; c[dst] = b; return c; } +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Reductions +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__KERNEL_SSE41__) +__forceinline const ssei vreduce_min(const ssei& v) { ssei h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); } +__forceinline const ssei vreduce_max(const ssei& v) { ssei h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); } +__forceinline const ssei vreduce_add(const ssei& v) { ssei h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; } + 
+__forceinline int reduce_min(const ssei& v) { return extract<0>(vreduce_min(v)); } +__forceinline int reduce_max(const ssei& v) { return extract<0>(vreduce_max(v)); } +__forceinline int reduce_add(const ssei& v) { return extract<0>(vreduce_add(v)); } + +__forceinline size_t select_min(const ssei& v) { return __bsf(movemask(v == vreduce_min(v))); } +__forceinline size_t select_max(const ssei& v) { return __bsf(movemask(v == vreduce_max(v))); } + +__forceinline size_t select_min(const sseb& valid, const ssei& v) { const ssei a = select(valid,v,ssei((int)pos_inf)); return __bsf(movemask(valid & (a == vreduce_min(a)))); } +__forceinline size_t select_max(const sseb& valid, const ssei& v) { const ssei a = select(valid,v,ssei((int)neg_inf)); return __bsf(movemask(valid & (a == vreduce_max(a)))); } + +#else + +__forceinline int reduce_min(const ssei& v) { return min(min(v[0],v[1]),min(v[2],v[3])); } +__forceinline int reduce_max(const ssei& v) { return max(max(v[0],v[1]),max(v[2],v[3])); } +__forceinline int reduce_add(const ssei& v) { return v[0]+v[1]+v[2]+v[3]; } + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Memory load and store operations +//////////////////////////////////////////////////////////////////////////////// + +__forceinline ssei load4i( const void* const a ) { + return _mm_load_si128((__m128i*)a); +} + +__forceinline void store4i(void* ptr, const ssei& v) { + _mm_store_si128((__m128i*)ptr,v); +} + +__forceinline void storeu4i(void* ptr, const ssei& v) { + _mm_storeu_si128((__m128i*)ptr,v); +} + +__forceinline void store4i( const sseb& mask, void* ptr, const ssei& i ) { +#if defined (__KERNEL_AVX__) + _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); +#else + *(ssei*)ptr = select(mask,i,*(ssei*)ptr); +#endif +} + +__forceinline ssei load4i_nt (void* ptr) { +#if defined(__KERNEL_SSE41__) + return _mm_stream_load_si128((__m128i*)ptr); +#else + return _mm_load_si128((__m128i*)ptr); +#endif +} 
+ +__forceinline void store4i_nt(void* ptr, const ssei& v) { +#if defined(__KERNEL_SSE41__) + _mm_stream_ps((float*)ptr,_mm_castsi128_ps(v)); +#else + _mm_store_si128((__m128i*)ptr,v); +#endif +} + +#endif + +CCL_NAMESPACE_END + +#endif + diff --git a/intern/cycles/util/util_stats.h b/intern/cycles/util/util_stats.h index 62b1f1760d7..8758b823084 100644 --- a/intern/cycles/util/util_stats.h +++ b/intern/cycles/util/util_stats.h @@ -30,6 +30,7 @@ public: } void mem_free(size_t size) { + assert(mem_used >= size); mem_used -= size; } diff --git a/intern/cycles/util/util_system.cpp b/intern/cycles/util/util_system.cpp index 0764f7d9345..7c0445577e2 100644 --- a/intern/cycles/util/util_system.cpp +++ b/intern/cycles/util/util_system.cpp @@ -127,9 +127,12 @@ struct CPUCapabilities { bool sse42; bool sse4a; bool avx; + bool avx2; bool xop; bool fma3; bool fma4; + bool bmi1; + bool bmi2; }; static CPUCapabilities& system_cpu_capabilities() @@ -180,6 +183,11 @@ static CPUCapabilities& system_cpu_capabilities() #endif caps.avx = (xcr_feature_mask & 0x6) == 0x6; } + + __cpuid(result, 0x00000007); + caps.bmi1 = (result[1] & ((int)1 << 3)) != 0; + caps.bmi2 = (result[1] & ((int)1 << 8)) != 0; + caps.avx2 = (result[1] & ((int)1 << 5)) != 0; } #if 0 @@ -221,6 +229,11 @@ bool system_cpu_support_avx() CPUCapabilities& caps = system_cpu_capabilities(); return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx; } +bool system_cpu_support_avx2() +{ + CPUCapabilities& caps = system_cpu_capabilities(); + return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx && caps.avx2 && caps.fma3 && caps.bmi1 && caps.bmi2; +} #else bool system_cpu_support_sse2() @@ -242,6 +255,10 @@ bool system_cpu_support_avx() { return false; } +bool system_cpu_support_avx2() +{ + return false; +} #endif diff --git a/intern/cycles/util/util_system.h b/intern/cycles/util/util_system.h index 4409ea752cd..0e8868c7dfc 100644 --- a/intern/cycles/util/util_system.h +++ 
b/intern/cycles/util/util_system.h @@ -28,6 +28,7 @@ bool system_cpu_support_sse2(); bool system_cpu_support_sse3(); bool system_cpu_support_sse41(); bool system_cpu_support_avx(); +bool system_cpu_support_avx2(); CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h index bfaab3dba3b..2a199e591bf 100644 --- a/intern/cycles/util/util_types.h +++ b/intern/cycles/util/util_types.h @@ -33,14 +33,17 @@ #ifndef __KERNEL_GPU__ -#define ccl_device static inline +# ifdef NDEBUG +# define ccl_device static inline +# else +# define ccl_device static +# endif #define ccl_device_noinline static #define ccl_global #define ccl_constant #define __KERNEL_WITH_SSE_ALIGN__ #if defined(_WIN32) && !defined(FREE_WINDOWS) - #define ccl_device_inline static __forceinline #define ccl_align(...) __declspec(align(__VA_ARGS__)) #ifdef __KERNEL_64_BIT__ @@ -50,7 +53,12 @@ #define ccl_try_align(...) /* not support for function arguments (error C2719) */ #endif #define ccl_may_alias -#define ccl_always_inline __forceinline +# ifdef NDEBUG +# define ccl_always_inline __forceinline +# else +# define ccl_always_inline +# endif +#define ccl_maybe_unused #else @@ -62,6 +70,7 @@ #define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__))) #define ccl_may_alias __attribute__((__may_alias__)) #define ccl_always_inline __attribute__((always_inline)) +#define ccl_maybe_unused __attribute__((used)) #endif @@ -456,7 +465,6 @@ enum InterpolationType { INTERPOLATION_SMART = 3, }; - /* macros */ /* hints for branch prediction, only use in code that runs a _lot_ */ @@ -473,14 +481,14 @@ enum InterpolationType { * ... 
the compiler optimizes away the temp var */ #ifdef __GNUC__ #define CHECK_TYPE(var, type) { \ - __typeof(var) *__tmp; \ + typeof(var) *__tmp; \ __tmp = (type *)NULL; \ (void)__tmp; \ } (void)0 #define CHECK_TYPE_PAIR(var_a, var_b) { \ - __typeof(var_a) *__tmp; \ - __tmp = (__typeof(var_b) *)NULL; \ + typeof(var_a) *__tmp; \ + __tmp = (typeof(var_b) *)NULL; \ (void)__tmp; \ } (void)0 #else diff --git a/intern/cycles/util/util_vector.h b/intern/cycles/util/util_vector.h index 2085177eefa..cc6e8a371ed 100644 --- a/intern/cycles/util/util_vector.h +++ b/intern/cycles/util/util_vector.h @@ -127,8 +127,10 @@ public: } else if(newsize != datasize) { T *newdata = (T*)malloc_aligned(sizeof(T)*newsize, alignment); - memcpy(newdata, data, ((datasize < newsize)? datasize: newsize)*sizeof(T)); - free_aligned(data); + if(data) { + memcpy(newdata, data, ((datasize < newsize)? datasize: newsize)*sizeof(T)); + free_aligned(data); + } data = newdata; datasize = newsize; diff --git a/intern/cycles/util/util_view.cpp b/intern/cycles/util/util_view.cpp index 6bf9c9ed8c0..fe08389fe3f 100644 --- a/intern/cycles/util/util_view.cpp +++ b/intern/cycles/util/util_view.cpp @@ -248,7 +248,7 @@ void view_main_loop(const char *title, int width, int height, glutInitDisplayMode(GLUT_RGB|GLUT_DOUBLE|GLUT_DEPTH); glutCreateWindow(title); - glewInit(); + mxMakeCurrentContext(mxCreateContext()); view_reshape(width, height); |