diff options
author | Campbell Barton <ideasman42@gmail.com> | 2017-03-25 05:39:47 +0300 |
---|---|---|
committer | Campbell Barton <ideasman42@gmail.com> | 2017-03-25 05:39:47 +0300 |
commit | 0c93bc2b637a6a3fe01b174dad2fb61895d004d2 (patch) | |
tree | b877a434d2a7af2dfdf727b9794fa46359ba670a /intern | |
parent | 5d6e9f237be3379b63169f86b8f4117752d095b3 (diff) | |
parent | a7f16c17c260f311e136758497e5490b226ebc03 (diff) |
Merge branch 'master' into blender2.8
Diffstat (limited to 'intern')
36 files changed, 855 insertions, 679 deletions
diff --git a/intern/cycles/blender/blender_mesh.cpp b/intern/cycles/blender/blender_mesh.cpp index 78f73d8e062..f949b530f90 100644 --- a/intern/cycles/blender/blender_mesh.cpp +++ b/intern/cycles/blender/blender_mesh.cpp @@ -819,7 +819,7 @@ static void create_mesh(Scene *scene, int shader = clamp(p->material_index(), 0, used_shaders.size()-1); bool smooth = p->use_smooth() || use_loop_normals; - vi.reserve(n); + vi.resize(n); for(int i = 0; i < n; i++) { /* NOTE: Autosmooth is already taken care about. */ vi[i] = b_mesh.loops[p->loop_start() + i].vertex_index(); diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp index c5f5ffe9928..d00b4b67a58 100644 --- a/intern/cycles/blender/blender_session.cpp +++ b/intern/cycles/blender/blender_session.cpp @@ -70,6 +70,7 @@ BlenderSession::BlenderSession(BL::RenderEngine& b_engine, background = true; last_redraw_time = 0.0; start_resize_time = 0.0; + last_status_time = 0.0; } BlenderSession::BlenderSession(BL::RenderEngine& b_engine, @@ -95,6 +96,7 @@ BlenderSession::BlenderSession(BL::RenderEngine& b_engine, background = false; last_redraw_time = 0.0; start_resize_time = 0.0; + last_status_time = 0.0; } BlenderSession::~BlenderSession() @@ -991,10 +993,14 @@ void BlenderSession::update_status_progress() if(substatus.size() > 0) status += " | " + substatus; - if(status != last_status) { + double current_time = time_dt(); + /* When rendering in a window, redraw the status at least once per second to keep the elapsed and remaining time up-to-date. + * For headless rendering, only report when something significant changes to keep the console output readable. */ + if(status != last_status || (!headless && (current_time - last_status_time) > 1.0)) { b_engine.update_stats("", (timestatus + scene + status).c_str()); b_engine.update_memory_stats(mem_used, mem_peak); last_status = status; + last_status_time = current_time; } if(progress != last_progress) { b_engine.update_progress(progress); diff --git a/intern/cycles/blender/blender_session.h b/intern/cycles/blender/blender_session.h index 700b8acec1b..33b88b9ab5e 100644 --- a/intern/cycles/blender/blender_session.h +++ b/intern/cycles/blender/blender_session.h @@ -113,6 +113,7 @@ public: string last_status; string last_error; float last_progress; + double last_status_time; int width, height; double start_resize_time; diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 58471ba67c2..08dfa181385 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -1500,7 +1500,7 @@ uint64_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory cuda_assert(cuLaunchKernel(state_buffer_size, 1, 1, 1, 1, 1, 1, - 0, 0, &args, 0)); + 0, 0, (void**)&args, 0)); device->cuda_pop_context(); diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h index be7e9fa5e30..59e61aad25c 100644 --- a/intern/cycles/device/opencl/opencl.h +++ b/intern/cycles/device/opencl/opencl.h @@ -91,6 +91,53 @@ public: static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices, bool force_all = false); static bool use_single_program(); + + /* ** Some handy shortcuts to low level cl*GetInfo() functions. ** */ + + /* Platform information. */ + static bool get_num_platforms(cl_uint *num_platforms, cl_int *error = NULL); + static cl_uint get_num_platforms(); + + static bool get_platforms(vector<cl_platform_id> *platform_ids, + cl_int *error = NULL); + static vector<cl_platform_id> get_platforms(); + + static bool get_platform_name(cl_platform_id platform_id, + string *platform_name); + static string get_platform_name(cl_platform_id platform_id); + + static bool get_num_platform_devices(cl_platform_id platform_id, + cl_device_type device_type, + cl_uint *num_devices, + cl_int *error = NULL); + static cl_uint get_num_platform_devices(cl_platform_id platform_id, + cl_device_type device_type); + + static bool get_platform_devices(cl_platform_id platform_id, + cl_device_type device_type, + vector<cl_device_id> *device_ids, + cl_int* error = NULL); + static vector<cl_device_id> get_platform_devices(cl_platform_id platform_id, + cl_device_type device_type); + + /* Device information. */ + static bool get_device_name(cl_device_id device_id, + string *device_name, + cl_int* error = NULL); + + static string get_device_name(cl_device_id device_id); + + static bool get_device_type(cl_device_id device_id, + cl_device_type *device_type, + cl_int* error = NULL); + static cl_device_type get_device_type(cl_device_id device_id); + + /* Get somewhat more readable device name. + * Main difference is AMD OpenCL here which only gives code name + * for the regular device name. This will give more sane device + * name using some extensions. + */ + static string get_readable_device_name(cl_device_id device_id); }; /* Thread safe cache for contexts and programs. diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp index 51ff39f0ad3..0328dfed689 100644 --- a/intern/cycles/device/opencl/opencl_base.cpp +++ b/intern/cycles/device/opencl/opencl_base.cpp @@ -152,10 +152,8 @@ OpenCLDeviceBase::~OpenCLDeviceBase() void CL_CALLBACK OpenCLDeviceBase::context_notify_callback(const char *err_info, const void * /*private_info*/, size_t /*cb*/, void *user_data) { - char name[256]; - clGetDeviceInfo((cl_device_id)user_data, CL_DEVICE_NAME, sizeof(name), &name, NULL); - - fprintf(stderr, "OpenCL error (%s): %s\n", name, err_info); + string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data); + fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info); } bool OpenCLDeviceBase::opencl_version_check() diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp index 3faae4039e3..d50ae585062 100644 --- a/intern/cycles/device/opencl/opencl_split.cpp +++ b/intern/cycles/device/opencl/opencl_split.cpp @@ -41,11 +41,7 @@ static string get_build_options(OpenCLDeviceBase *device, const DeviceRequestedF /* Set compute device build option. */ cl_device_type device_type; - device->ciErr = clGetDeviceInfo(device->cdDevice, - CL_DEVICE_TYPE, - sizeof(cl_device_type), - &device_type, - NULL); + OpenCLInfo::get_device_type(device->cdDevice, &device_type, &device->ciErr); assert(device->ciErr == CL_SUCCESS); if(device_type == CL_DEVICE_TYPE_GPU) { build_options += " -D__COMPUTE_DEVICE_GPU__"; @@ -346,9 +342,7 @@ public: virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask */*task*/) { - cl_device_type type; - clGetDeviceInfo(device->cdDevice, CL_DEVICE_TYPE, sizeof(cl_device_type), &type, NULL); - + cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice); /* Use small global size on CPU devices as it seems to be much faster. */ if(type == CL_DEVICE_TYPE_CPU) { VLOG(1) << "Global size: (64, 64)."; diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp index a689c7eae26..1f5b9ee0896 100644 --- a/intern/cycles/device/opencl/opencl_util.cpp +++ b/intern/cycles/device/opencl/opencl_util.cpp @@ -600,11 +600,20 @@ bool OpenCLInfo::device_supported(const string& platform_name, const cl_device_id device_id) { cl_device_type device_type; - clGetDeviceInfo(device_id, - CL_DEVICE_TYPE, - sizeof(cl_device_type), - &device_type, - NULL); + if(!get_device_type(device_id, &device_type)) { + return false; + } + string device_name; + if(!get_device_name(device_id, &device_name)) { + return false; + } + /* It is possible tyo have Iris GPU on AMD/Apple OpenCL framework + * (aka, it will not be on Intel framework). This isn't supported + * and needs an explicit blacklist. + */ + if(strstr(device_name.c_str(), "Iris")) { + return false; + } if(platform_name == "AMD Accelerated Parallel Processing" && device_type == CL_DEVICE_TYPE_GPU) { @@ -718,39 +727,30 @@ void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices return; } + cl_int error; vector<cl_device_id> device_ids; - cl_uint num_devices = 0; vector<cl_platform_id> platform_ids; - cl_uint num_platforms = 0; - /* Get devices. */ - if(clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS || - num_platforms == 0) - { - FIRST_VLOG(2) << "No OpenCL platforms were found."; + /* Get platforms. */ + if(!get_platforms(&platform_ids, &error)) { + FIRST_VLOG(2) << "Error fetching platforms:" + << string(clewErrorString(error)); first_time = false; return; } - platform_ids.resize(num_platforms); - if(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL) != CL_SUCCESS) { - FIRST_VLOG(2) << "Failed to fetch platform IDs from the driver.."; + if(platform_ids.size() == 0) { + FIRST_VLOG(2) << "No OpenCL platforms were found."; first_time = false; return; } /* Devices are numbered consecutively across platforms. */ - for(int platform = 0; platform < num_platforms; platform++) { + for(int platform = 0; platform < platform_ids.size(); platform++) { cl_platform_id platform_id = platform_ids[platform]; - char pname[256]; - if(clGetPlatformInfo(platform_id, - CL_PLATFORM_NAME, - sizeof(pname), - &pname, - NULL) != CL_SUCCESS) - { + string platform_name; + if(!get_platform_name(platform_id, &platform_name)) { FIRST_VLOG(2) << "Failed to get platform name, ignoring."; continue; } - string platform_name = pname; FIRST_VLOG(2) << "Enumerating devices for platform " << platform_name << "."; if(!platform_version_check(platform_id)) { @@ -758,39 +758,28 @@ void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices << " due to too old compiler version."; continue; } - num_devices = 0; - cl_int ciErr; - if((ciErr = clGetDeviceIDs(platform_id, - device_type, - 0, - NULL, - &num_devices)) != CL_SUCCESS || num_devices == 0) + if(!get_platform_devices(platform_id, + device_type, + &device_ids, + &error)) { FIRST_VLOG(2) << "Ignoring platform " << platform_name - << ", failed to fetch number of devices: " << string(clewErrorString(ciErr)); + << ", failed to fetch of devices: " + << string(clewErrorString(error)); continue; } - device_ids.resize(num_devices); - if(clGetDeviceIDs(platform_id, - device_type, - num_devices, - &device_ids[0], - NULL) != CL_SUCCESS) - { + if(device_ids.size() == 0) { FIRST_VLOG(2) << "Ignoring platform " << platform_name - << ", failed to fetch devices list."; + << ", it has no devices."; continue; } - for(int num = 0; num < num_devices; num++) { - cl_device_id device_id = device_ids[num]; - char device_name[1024] = "\0"; - if(clGetDeviceInfo(device_id, - CL_DEVICE_NAME, - sizeof(device_name), - &device_name, - NULL) != CL_SUCCESS) - { - FIRST_VLOG(2) << "Failed to fetch device name, ignoring."; + for(int num = 0; num < device_ids.size(); num++) { + const cl_device_id device_id = device_ids[num]; + string device_name; + if(!get_device_name(device_id, &device_name, &error)) { + FIRST_VLOG(2) << "Failed to fetch device name: " + << string(clewErrorString(error)) + << ", ignoring."; continue; } if(!device_version_check(device_id)) { @@ -802,24 +791,28 @@ void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices device_supported(platform_name, device_id)) { cl_device_type device_type; - if(clGetDeviceInfo(device_id, - CL_DEVICE_TYPE, - sizeof(cl_device_type), - &device_type, - NULL) != CL_SUCCESS) - { + if(!get_device_type(device_id, &device_type, &error)) { FIRST_VLOG(2) << "Ignoring device " << device_name - << ", failed to fetch device type."; + << ", failed to fetch device type:" + << string(clewErrorString(error)); continue; } - FIRST_VLOG(2) << "Adding new device " << device_name << "."; + string readable_device_name = + get_readable_device_name(device_id); + if(readable_device_name != device_name) { + FIRST_VLOG(2) << "Using more readable device name: " + << readable_device_name; + } + FIRST_VLOG(2) << "Adding new device " + << readable_device_name << "."; string hardware_id = get_hardware_id(platform_name, device_id); - usable_devices->push_back(OpenCLPlatformDevice(platform_id, - platform_name, - device_id, - device_type, - device_name, - hardware_id)); + usable_devices->push_back(OpenCLPlatformDevice( + platform_id, + platform_name, + device_id, + device_type, + readable_device_name, + hardware_id)); } else { FIRST_VLOG(2) << "Ignoring device " << device_name @@ -830,6 +823,252 @@ void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices first_time = false; } +bool OpenCLInfo::get_platforms(vector<cl_platform_id> *platform_ids, + cl_int *error) +{ + /* Reset from possible previous state. */ + platform_ids->resize(0); + cl_uint num_platforms; + if(!get_num_platforms(&num_platforms, error)) { + return false; + } + /* Get actual platforms. */ + cl_int err; + platform_ids->resize(num_platforms); + if((err = clGetPlatformIDs(num_platforms, + &platform_ids->at(0), + NULL)) != CL_SUCCESS) { + if(error != NULL) { + *error = err; + } + return false; + } + if(error != NULL) { + *error = CL_SUCCESS; + } + return true; +} + +vector<cl_platform_id> OpenCLInfo::get_platforms() +{ + vector<cl_platform_id> platform_ids; + get_platforms(&platform_ids); + return platform_ids; +} + +bool OpenCLInfo::get_num_platforms(cl_uint *num_platforms, cl_int *error) +{ + cl_int err; + if((err = clGetPlatformIDs(0, NULL, num_platforms)) != CL_SUCCESS) { + if(error != NULL) { + *error = err; + } + *num_platforms = 0; + return false; + } + if(error != NULL) { + *error = CL_SUCCESS; + } + return true; +} + +cl_uint OpenCLInfo::get_num_platforms() +{ + cl_uint num_platforms; + if(!get_num_platforms(&num_platforms)) { + return 0; + } + return num_platforms; +} + +bool OpenCLInfo::get_platform_name(cl_platform_id platform_id, + string *platform_name) +{ + char buffer[256]; + if(clGetPlatformInfo(platform_id, + CL_PLATFORM_NAME, + sizeof(buffer), + &buffer, + NULL) != CL_SUCCESS) + { + *platform_name = ""; + return false; + } + *platform_name = buffer; + return true; +} + +string OpenCLInfo::get_platform_name(cl_platform_id platform_id) +{ + string platform_name; + if (!get_platform_name(platform_id, &platform_name)) { + return ""; + } + return platform_name; +} + +bool OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id, + cl_device_type device_type, + cl_uint *num_devices, + cl_int *error) +{ + cl_int err; + if((err = clGetDeviceIDs(platform_id, + device_type, + 0, + NULL, + num_devices)) != CL_SUCCESS) + { + if(error != NULL) { + *error = err; + } + *num_devices = 0; + return false; + } + if(error != NULL) { + *error = CL_SUCCESS; + } + return true; +} + +cl_uint OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id, + cl_device_type device_type) +{ + cl_uint num_devices; + if(!get_num_platform_devices(platform_id, + device_type, + &num_devices)) + { + return 0; + } + return num_devices; +} + +bool OpenCLInfo::get_platform_devices(cl_platform_id platform_id, + cl_device_type device_type, + vector<cl_device_id> *device_ids, + cl_int* error) +{ + /* Reset from possible previous state. */ + device_ids->resize(0); + /* Get number of devices to pre-allocate memory. */ + cl_uint num_devices; + if(!get_num_platform_devices(platform_id, + device_type, + &num_devices, + error)) + { + return false; + } + /* Get actual device list. */ + device_ids->resize(num_devices); + cl_int err; + if((err = clGetDeviceIDs(platform_id, + device_type, + num_devices, + &device_ids->at(0), + NULL)) != CL_SUCCESS) + { + if(error != NULL) { + *error = err; + } + return false; + } + if(error != NULL) { + *error = CL_SUCCESS; + } + return true; +} + +vector<cl_device_id> OpenCLInfo::get_platform_devices(cl_platform_id platform_id, + cl_device_type device_type) +{ + vector<cl_device_id> devices; + get_platform_devices(platform_id, device_type, &devices); + return devices; +} + +bool OpenCLInfo::get_device_name(cl_device_id device_id, + string *device_name, + cl_int* error) +{ + char buffer[1024]; + cl_int err; + if((err = clGetDeviceInfo(device_id, + CL_DEVICE_NAME, + sizeof(buffer), + &buffer, + NULL)) != CL_SUCCESS) + { + if(error != NULL) { + *error = err; + } + *device_name = ""; + return false; + } + if(error != NULL) { + *error = CL_SUCCESS; + } + *device_name = buffer; + return true; +} + +string OpenCLInfo::get_device_name(cl_device_id device_id) +{ + string device_name; + if(!get_device_name(device_id, &device_name)) { + return ""; + } + return device_name; +} + +bool OpenCLInfo::get_device_type(cl_device_id device_id, + cl_device_type *device_type, + cl_int* error) +{ + cl_int err; + if((err = clGetDeviceInfo(device_id, + CL_DEVICE_TYPE, + sizeof(cl_device_type), + device_type, + NULL)) != CL_SUCCESS) + { + if(error != NULL) { + *error = err; + } + *device_type = 0; + return false; + } + if(error != NULL) { + *error = CL_SUCCESS; + } + return true; +} + +cl_device_type OpenCLInfo::get_device_type(cl_device_id device_id) +{ + cl_device_type device_type; + if(!get_device_type(device_id, &device_type)) { + return 0; + } + return device_type; +} + +string OpenCLInfo::get_readable_device_name(cl_device_id device_id) +{ + char board_name[1024]; + if(clGetDeviceInfo(device_id, + CL_DEVICE_BOARD_NAME_AMD, + sizeof(board_name), + &board_name, + NULL) == CL_SUCCESS) + { + return board_name; + } + /* Fallback to standard device name API. */ + return get_device_name(device_id); +} + CCL_NAMESPACE_END #endif diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 53c872e829d..8a8c3968c02 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -196,6 +196,7 @@ set(SRC_UTIL_HEADERS ../util/util_hash.h ../util/util_math.h ../util/util_math_fast.h + ../util/util_math_intersect.h ../util/util_static_assert.h ../util/util_transform.h ../util/util_texture.h diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h index 321983c1abc..598e138dbea 100644 --- a/intern/cycles/kernel/bvh/bvh.h +++ b/intern/cycles/kernel/bvh/bvh.h @@ -202,8 +202,9 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg, } #ifdef __SUBSURFACE__ +/* Note: ray is passed by value to work around a possible CUDA compiler bug. */ ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg, - const Ray *ray, + const Ray ray, SubsurfaceIntersection *ss_isect, int subsurface_object, uint *lcg_state, @@ -212,7 +213,7 @@ ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg, #ifdef __OBJECT_MOTION__ if(kernel_data.bvh.have_motion) { return bvh_intersect_subsurface_motion(kg, - ray, + &ray, ss_isect, subsurface_object, lcg_state, @@ -220,7 +221,7 @@ ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg, } #endif /* __OBJECT_MOTION__ */ return bvh_intersect_subsurface(kg, - ray, + &ray, ss_isect, subsurface_object, lcg_state, diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h index b4f65bc8efd..8f7c005e961 100644 --- a/intern/cycles/kernel/bvh/bvh_shadow_all.h +++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h @@ -100,8 +100,8 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif /* __KERNEL_SSE2__ */ - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); + TriangleIsectPrecalc isect_precalc; + ray_triangle_intersect_precalc(dir, &isect_precalc); /* traversal loop */ do { @@ -209,9 +209,9 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_MOTION) case PRIMITIVE_MOTION_TRIANGLE: { hit = motion_triangle_intersect(kg, + &isect_precalc, isect_array, P, - dir, ray->time, PATH_RAY_SHADOW, object, @@ -314,7 +314,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); # endif - triangle_intersect_precalc(dir, &isect_precalc); + ray_triangle_intersect_precalc(dir, &isect_precalc); num_hits_in_instance = 0; isect_array->t = isect_t; @@ -354,7 +354,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); # endif - triangle_intersect_precalc(dir, &isect_precalc); + ray_triangle_intersect_precalc(dir, &isect_precalc); /* scale isect->t to adjust for instancing */ for(int i = 0; i < num_hits_in_instance; i++) { @@ -367,7 +367,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, # else bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); # endif - triangle_intersect_precalc(dir, &isect_precalc); + ray_triangle_intersect_precalc(dir, &isect_precalc); } isect_t = tmax; diff --git a/intern/cycles/kernel/bvh/bvh_subsurface.h b/intern/cycles/kernel/bvh/bvh_subsurface.h index 583f7f7c469..cb7a4e3bc31 100644 --- a/intern/cycles/kernel/bvh/bvh_subsurface.h +++ b/intern/cycles/kernel/bvh/bvh_subsurface.h @@ -109,8 +109,8 @@ void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); + TriangleIsectPrecalc isect_precalc; + ray_triangle_intersect_precalc(dir, &isect_precalc); /* traversal loop */ do { @@ -214,9 +214,9 @@ void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, for(; prim_addr < prim_addr2; prim_addr++) { kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); motion_triangle_intersect_subsurface(kg, + &isect_precalc, ss_isect, P, - dir, ray->time, object, prim_addr, diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h index 0eca0c8e38b..eac98a3165a 100644 --- a/intern/cycles/kernel/bvh/bvh_traversal.h +++ b/intern/cycles/kernel/bvh/bvh_traversal.h @@ -104,8 +104,8 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); + TriangleIsectPrecalc isect_precalc; + ray_triangle_intersect_precalc(dir, &isect_precalc); /* traversal loop */ do { @@ -267,9 +267,9 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, BVH_DEBUG_NEXT_INTERSECTION(); kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); if(motion_triangle_intersect(kg, + &isect_precalc, isect, P, - dir, ray->time, visibility, object, @@ -358,7 +358,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, # else isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t); # endif - triangle_intersect_precalc(dir, &isect_precalc); + ray_triangle_intersect_precalc(dir, &isect_precalc); # if defined(__KERNEL_SSE2__) Psplat[0] = ssef(P.x); @@ -395,7 +395,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, # else isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); # endif - triangle_intersect_precalc(dir, &isect_precalc); + ray_triangle_intersect_precalc(dir, &isect_precalc); # if defined(__KERNEL_SSE2__) Psplat[0] = ssef(P.x); diff --git a/intern/cycles/kernel/bvh/bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h index 136034aa484..da97dae0b99 100644 --- a/intern/cycles/kernel/bvh/bvh_volume.h +++ b/intern/cycles/kernel/bvh/bvh_volume.h @@ -97,8 +97,8 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); + TriangleIsectPrecalc isect_precalc; + ray_triangle_intersect_precalc(dir, &isect_precalc); /* traversal loop */ do { @@ -215,9 +215,9 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, continue; } motion_triangle_intersect(kg, + &isect_precalc, isect, P, - dir, ray->time, visibility, object, @@ -243,7 +243,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t); # endif - triangle_intersect_precalc(dir, &isect_precalc); + ray_triangle_intersect_precalc(dir, &isect_precalc); # if defined(__KERNEL_SSE2__) Psplat[0] = ssef(P.x); @@ -286,7 +286,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); # endif - triangle_intersect_precalc(dir, &isect_precalc); + ray_triangle_intersect_precalc(dir, &isect_precalc); # if defined(__KERNEL_SSE2__) Psplat[0] = ssef(P.x); diff --git a/intern/cycles/kernel/bvh/bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h index 6f3346e7634..6efb7e265d0 100644 --- a/intern/cycles/kernel/bvh/bvh_volume_all.h +++ b/intern/cycles/kernel/bvh/bvh_volume_all.h @@ -101,8 +101,8 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif /* __KERNEL_SSE2__ */ - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); + TriangleIsectPrecalc isect_precalc; + ray_triangle_intersect_precalc(dir, &isect_precalc); /* traversal loop */ do { @@ -243,9 +243,9 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, continue; } hit = motion_triangle_intersect(kg, + &isect_precalc, isect_array, P, - dir, ray->time, visibility, object, @@ -294,7 +294,7 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); # endif - triangle_intersect_precalc(dir, &isect_precalc); + ray_triangle_intersect_precalc(dir, &isect_precalc); num_hits_in_instance = 0; isect_array->t = isect_t; @@ -340,7 +340,7 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, # else bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); # endif - triangle_intersect_precalc(dir, &isect_precalc); + ray_triangle_intersect_precalc(dir, &isect_precalc); /* Scale isect->t to adjust for instancing. */ for(int i = 0; i < num_hits_in_instance; i++) { (isect_array-i-1)->t *= t_fac; @@ -352,7 +352,7 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, # else bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); # endif - triangle_intersect_precalc(dir, &isect_precalc); + ray_triangle_intersect_precalc(dir, &isect_precalc); } isect_t = tmax; diff --git a/intern/cycles/kernel/bvh/qbvh_shadow_all.h b/intern/cycles/kernel/bvh/qbvh_shadow_all.h index 2a4da3eea82..5d960787134 100644 --- a/intern/cycles/kernel/bvh/qbvh_shadow_all.h +++ b/intern/cycles/kernel/bvh/qbvh_shadow_all.h @@ -96,8 +96,8 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); + TriangleIsectPrecalc isect_precalc; + ray_triangle_intersect_precalc(dir, &isect_precalc); /* Traversal loop. */ do { @@ -290,9 +290,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_MOTION) case PRIMITIVE_MOTION_TRIANGLE: { hit = motion_triangle_intersect(kg, + &isect_precalc, isect_array, P, - dir, ray->time, PATH_RAY_SHADOW, object, @@ -414,7 +414,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - triangle_intersect_precalc(dir, &isect_precalc); + ray_triangle_intersect_precalc(dir, &isect_precalc); ++stack_ptr; kernel_assert(stack_ptr < BVH_QSTACK_SIZE); @@ -471,7 +471,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - triangle_intersect_precalc(dir, &isect_precalc); + ray_triangle_intersect_precalc(dir, &isect_precalc); object = OBJECT_NONE; node_addr = traversal_stack[stack_ptr].addr; diff --git a/intern/cycles/kernel/bvh/qbvh_subsurface.h b/intern/cycles/kernel/bvh/qbvh_subsurface.h index a6431a94e6e..d67a7826589 100644 --- a/intern/cycles/kernel/bvh/qbvh_subsurface.h +++ b/intern/cycles/kernel/bvh/qbvh_subsurface.h @@ -105,8 +105,8 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); + TriangleIsectPrecalc isect_precalc; + ray_triangle_intersect_precalc(dir, &isect_precalc); /* Traversal loop. */ do { @@ -270,9 +270,9 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, for(; prim_addr < prim_addr2; prim_addr++) { kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); motion_triangle_intersect_subsurface(kg, + &isect_precalc, ss_isect, P, - dir, ray->time, object, prim_addr, diff --git a/intern/cycles/kernel/bvh/qbvh_traversal.h b/intern/cycles/kernel/bvh/qbvh_traversal.h index c20a8f3703f..d88e0e07203 100644 --- a/intern/cycles/kernel/bvh/qbvh_traversal.h +++ b/intern/cycles/kernel/bvh/qbvh_traversal.h @@ -106,8 +106,8 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); + TriangleIsectPrecalc isect_precalc; + ray_triangle_intersect_precalc(dir, &isect_precalc); /* Traversal loop. */ do { @@ -354,9 +354,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, BVH_DEBUG_NEXT_INTERSECTION(); kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); if(motion_triangle_intersect(kg, + &isect_precalc, isect, P, - dir, ray->time, visibility, object, @@ -447,7 +447,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - triangle_intersect_precalc(dir, &isect_precalc); + ray_triangle_intersect_precalc(dir, &isect_precalc); ++stack_ptr; kernel_assert(stack_ptr < BVH_QSTACK_SIZE); @@ -489,7 +489,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - triangle_intersect_precalc(dir, &isect_precalc); + ray_triangle_intersect_precalc(dir, &isect_precalc); object = OBJECT_NONE; node_addr = traversal_stack[stack_ptr].addr; diff --git a/intern/cycles/kernel/bvh/qbvh_volume.h b/intern/cycles/kernel/bvh/qbvh_volume.h index 859c5da808b..7cfb2cf8c18 100644 --- a/intern/cycles/kernel/bvh/qbvh_volume.h +++ b/intern/cycles/kernel/bvh/qbvh_volume.h @@ -91,8 +91,8 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); + TriangleIsectPrecalc isect_precalc; + ray_triangle_intersect_precalc(dir, &isect_precalc); /* Traversal loop. */ do { @@ -281,7 +281,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, continue; } /* Intersect ray against primitive. */ - motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, prim_addr); + motion_triangle_intersect(kg, &isect_precalc, isect, P, ray->time, visibility, object, prim_addr); } break; } @@ -316,7 +316,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - triangle_intersect_precalc(dir, &isect_precalc); + ray_triangle_intersect_precalc(dir, &isect_precalc); ++stack_ptr; kernel_assert(stack_ptr < BVH_QSTACK_SIZE); @@ -362,7 +362,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - triangle_intersect_precalc(dir, &isect_precalc); + ray_triangle_intersect_precalc(dir, &isect_precalc); object = OBJECT_NONE; node_addr = traversal_stack[stack_ptr].addr; diff --git a/intern/cycles/kernel/bvh/qbvh_volume_all.h b/intern/cycles/kernel/bvh/qbvh_volume_all.h index bbe588c878f..3dbac8446d7 100644 --- a/intern/cycles/kernel/bvh/qbvh_volume_all.h +++ b/intern/cycles/kernel/bvh/qbvh_volume_all.h @@ -95,8 +95,8 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); + TriangleIsectPrecalc isect_precalc; + ray_triangle_intersect_precalc(dir, &isect_precalc); /* Traversal loop. */ do { @@ -309,7 +309,7 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, continue; } /* Intersect ray against primitive. */ - hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, visibility, object, prim_addr); + hit = motion_triangle_intersect(kg, &isect_precalc, isect_array, P, ray->time, visibility, object, prim_addr); if(hit) { /* Move on to next entry in intersections array. */ isect_array++; @@ -367,7 +367,7 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - triangle_intersect_precalc(dir, &isect_precalc); + ray_triangle_intersect_precalc(dir, &isect_precalc); num_hits_in_instance = 0; isect_array->t = isect_t; @@ -432,7 +432,7 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - triangle_intersect_precalc(dir, &isect_precalc); + ray_triangle_intersect_precalc(dir, &isect_precalc); object = OBJECT_NONE; node_addr = traversal_stack[stack_ptr].addr; diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h index aa4b91eac40..1dd24078037 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h @@ -83,7 +83,7 @@ ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha const float3 wi_11 = normalize(make_float3(alpha.x*wi.x, alpha.y*wi.y, wi.z)); const float2 slope_11 = mf_sampleP22_11(wi_11.z, randU); - const float2 cossin_phi = safe_normalize(make_float2(wi_11.x, wi_11.y)); + const float3 cossin_phi = safe_normalize(make_float3(wi_11.x, wi_11.y, 0.0f)); const float slope_x = alpha.x*(cossin_phi.x * slope_11.x - cossin_phi.y * slope_11.y); const float slope_y = alpha.y*(cossin_phi.y * slope_11.x + cossin_phi.x * slope_11.y); diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h index 7cc840ce78d..4f84b80a5a8 100644 --- a/intern/cycles/kernel/geom/geom_curve.h +++ b/intern/cycles/kernel/geom/geom_curve.h @@ -264,7 +264,7 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte int ka = max(k0 - 1, v00.x); int kb = min(k1 + 1, v00.x + v00.y - 1); -#ifdef __KERNEL_AVX2__ +#if defined(__KERNEL_AVX2__) && (!defined(_MSC_VER) || _MSC_VER > 1800) avxf P_curve_0_1, P_curve_2_3; if(is_curve_primitive) { P_curve_0_1 = _mm256_loadu2_m128(&kg->__curve_keys.data[k0].x, &kg->__curve_keys.data[ka].x); @@ -299,7 +299,7 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte ssef htfm1 = shuffle<1, 0, 1, 3>(load1f_first(extract<0>(d_ss)), vdir0); ssef htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0); -#ifdef __KERNEL_AVX2__ +#if defined(__KERNEL_AVX2__) && (!defined(_MSC_VER) || _MSC_VER > 1800) const avxf vPP = _mm256_broadcast_ps(&P.m128); const avxf htfm00 = avxf(htfm0.m128, htfm0.m128); const avxf htfm11 = avxf(htfm1.m128, htfm1.m128); diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h index 2500228281e..971a34308f1 100644 --- a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h +++ b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h @@ -166,14 +166,15 @@ float3 motion_triangle_refine_subsurface(KernelGlobals *kg, * time and do a ray intersection with the resulting triangle. */ -ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, - Intersection *isect, - float3 P, - float3 dir, - float time, - uint visibility, - int object, - int prim_addr) +ccl_device_inline bool motion_triangle_intersect( + KernelGlobals *kg, + const TriangleIsectPrecalc *isect_precalc, + Intersection *isect, + float3 P, + float time, + uint visibility, + int object, + int prim_addr) { /* Primitive index for vertex location lookup. */ int prim = kernel_tex_fetch(__prim_index, prim_addr); @@ -185,11 +186,15 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, motion_triangle_vertices(kg, fobject, prim, time, verts); /* Ray-triangle intersection, unoptimized. */ float t, u, v; - if(ray_triangle_intersect_uv(P, - dir, - isect->t, - verts[2], verts[0], verts[1], - &u, &v, &t)) + if(ray_triangle_intersect(isect_precalc, + P, + isect->t, +#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) + (ssef*)verts, +#else + verts[0], verts[1], verts[2], +#endif + &u, &v, &t)) { #ifdef __VISIBILITY_FLAG__ /* Visibility flag test. we do it here under the assumption @@ -217,9 +222,9 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, #ifdef __SUBSURFACE__ ccl_device_inline void motion_triangle_intersect_subsurface( KernelGlobals *kg, + const TriangleIsectPrecalc *isect_precalc, SubsurfaceIntersection *ss_isect, float3 P, - float3 dir, float time, int object, int prim_addr, @@ -237,11 +242,15 @@ ccl_device_inline void motion_triangle_intersect_subsurface( motion_triangle_vertices(kg, fobject, prim, time, verts); /* Ray-triangle intersection, unoptimized. */ float t, u, v; - if(ray_triangle_intersect_uv(P, - dir, - tmax, - verts[2], verts[0], verts[1], - &u, &v, &t)) + if(ray_triangle_intersect(isect_precalc, + P, + tmax, +#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) + (ssef*)verts, +#else + verts[0], verts[1], verts[2], +#endif + &u, &v, &t)) { for(int i = min(max_hits, ss_isect->num_hits) - 1; i >= 0; --i) { if(ss_isect->hits[i].t == t) { diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h index 4d234dd62bd..584d0b3508f 100644 --- a/intern/cycles/kernel/geom/geom_triangle_intersect.h +++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h @@ -22,16 +22,6 @@ CCL_NAMESPACE_BEGIN -/* Workaround stupidness of CUDA/OpenCL which doesn't allow to access indexed - * component of float3 value. - */ -#ifndef __KERNEL_CPU__ -# define IDX(vec, idx) \ - ((idx == 0) ? ((vec).x) : ( (idx == 1) ? ((vec).y) : ((vec).z) )) -#else -# define IDX(vec, idx) ((vec)[idx]) -#endif - /* Ray-Triangle intersection for BVH traversal * * Sven Woop @@ -40,214 +30,50 @@ CCL_NAMESPACE_BEGIN * http://jcgt.org/published/0002/01/05/paper.pdf */ -/* Precalculated data for the ray->tri intersection. */ -typedef struct IsectPrecalc { - /* Maximal dimension kz, and orthogonal dimensions. */ - int kx, ky, kz; - - /* Shear constants. */ - float Sx, Sy, Sz; -} IsectPrecalc; - -#if (defined(__KERNEL_OPENCL_APPLE__)) || \ - (defined(__KERNEL_CUDA__) && (defined(i386) || defined(_M_IX86))) -ccl_device_noinline -#else -ccl_device_inline -#endif -void triangle_intersect_precalc(float3 dir, - IsectPrecalc *isect_precalc) -{ - /* Calculate dimension where the ray direction is maximal. */ -#ifndef __KERNEL_SSE__ - int kz = util_max_axis(make_float3(fabsf(dir.x), - fabsf(dir.y), - fabsf(dir.z))); - int kx = kz + 1; if(kx == 3) kx = 0; - int ky = kx + 1; if(ky == 3) ky = 0; -#else - int kx, ky, kz; - /* Avoiding mispredicted branch on direction. */ - kz = util_max_axis(fabs(dir)); - static const char inc_xaxis[] = {1, 2, 0, 55}; - static const char inc_yaxis[] = {2, 0, 1, 55}; - kx = inc_xaxis[kz]; - ky = inc_yaxis[kz]; -#endif - - float dir_kz = IDX(dir, kz); - - /* Swap kx and ky dimensions to preserve winding direction of triangles. */ - if(dir_kz < 0.0f) { - int tmp = kx; - kx = ky; - ky = tmp; - } - - /* Calculate the shear constants. */ - float inv_dir_z = 1.0f / dir_kz; - isect_precalc->Sx = IDX(dir, kx) * inv_dir_z; - isect_precalc->Sy = IDX(dir, ky) * inv_dir_z; - isect_precalc->Sz = inv_dir_z; - - /* Store the dimensions. */ - isect_precalc->kx = kx; - isect_precalc->ky = ky; - isect_precalc->kz = kz; -} - -/* TODO(sergey): Make it general utility function. */ -ccl_device_inline float xor_signmask(float x, int y) -{ - return __int_as_float(__float_as_int(x) ^ y); -} - ccl_device_inline bool triangle_intersect(KernelGlobals *kg, - const IsectPrecalc *isect_precalc, + const TriangleIsectPrecalc *isect_precalc, Intersection *isect, float3 P, uint visibility, int object, int prim_addr) { - const int kx = isect_precalc->kx; - const int ky = isect_precalc->ky; - const int kz = isect_precalc->kz; - const float Sx = isect_precalc->Sx; - const float Sy = isect_precalc->Sy; - const float Sz = isect_precalc->Sz; - - /* Calculate vertices relative to ray origin. */ const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, prim_addr); #if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) - const avxf avxf_P(P.m128, P.m128); - - const avxf tri_ab = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 0); - const avxf tri_bc = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 1); - - const avxf AB = tri_ab - avxf_P; - const avxf BC = tri_bc - avxf_P; - - const __m256i permute_mask = _mm256_set_epi32(0x3, kz, ky, kx, 0x3, kz, ky, kx); - - const avxf AB_k = shuffle(AB, permute_mask); - const avxf BC_k = shuffle(BC, permute_mask); - - /* Akz, Akz, Bkz, Bkz, Bkz, Bkz, Ckz, Ckz */ - const avxf ABBC_kz = shuffle<2>(AB_k, BC_k); - - /* Akx, Aky, Bkx, Bky, Bkx,Bky, Ckx, Cky */ - const avxf ABBC_kxy = shuffle<0,1,0,1>(AB_k, BC_k); - - const avxf Sxy(Sy, Sx, Sy, Sx); - - /* Ax, Ay, Bx, By, Bx, By, Cx, Cy */ - const avxf ABBC_xy = nmadd(ABBC_kz, Sxy, ABBC_kxy); - - float ABBC_kz_array[8]; - _mm256_storeu_ps((float*)&ABBC_kz_array, ABBC_kz); - - const float A_kz = ABBC_kz_array[0]; - const float B_kz = ABBC_kz_array[2]; - const float C_kz = ABBC_kz_array[6]; - - /* By, Bx, Cy, Cx, By, Bx, Ay, Ax */ - const avxf BCBA_yx = permute<3,2,7,6,3,2,1,0>(ABBC_xy); - - const avxf neg_mask(0,0,0,0,0x80000000, 0x80000000, 0x80000000, 0x80000000); - - /* W U V - * (AxBy-AyBx) (BxCy-ByCx) XX XX (BxBy-ByBx) (CxAy-CyAx) XX XX - */ - const avxf WUxxxxVxx_neg = _mm256_hsub_ps(ABBC_xy * BCBA_yx, neg_mask /* Dont care */); - - const avxf WUVWnegWUVW = permute<0,1,5,0,0,1,5,0>(WUxxxxVxx_neg) ^ neg_mask; - - /* Calculate scaled barycentric coordinates. */ - float WUVW_array[4]; - _mm_storeu_ps((float*)&WUVW_array, _mm256_castps256_ps128 (WUVWnegWUVW)); - - const float W = WUVW_array[0]; - const float U = WUVW_array[1]; - const float V = WUVW_array[2]; - - const int WUVW_mask = 0x7 & _mm256_movemask_ps(WUVWnegWUVW); - const int WUVW_zero = 0x7 & _mm256_movemask_ps(_mm256_cmp_ps(WUVWnegWUVW, - _mm256_setzero_ps(), 0)); - - if(!((WUVW_mask == 7) || (WUVW_mask == 0)) && ((WUVW_mask | WUVW_zero) != 7)) { - return false; - } + const ssef *ssef_verts = (ssef*)&kg->__prim_tri_verts.data[tri_vindex]; #else const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0), tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1), tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2); - const float3 A = make_float3(tri_a.x - P.x, tri_a.y - P.y, tri_a.z - P.z); - const float3 B = make_float3(tri_b.x - P.x, tri_b.y - P.y, tri_b.z - P.z); - const float3 C = make_float3(tri_c.x - P.x, tri_c.y - P.y, tri_c.z - P.z); - - const float A_kx = IDX(A, kx), A_ky = IDX(A, ky), A_kz = IDX(A, kz); - const float B_kx = IDX(B, kx), B_ky = IDX(B, ky), B_kz = IDX(B, kz); - const float C_kx = IDX(C, kx), C_ky = IDX(C, ky), C_kz = IDX(C, kz); - - /* Perform shear and scale of vertices. */ - const float Ax = A_kx - Sx * A_kz; - const float Ay = A_ky - Sy * A_kz; - const float Bx = B_kx - Sx * B_kz; - const float By = B_ky - Sy * B_kz; - const float Cx = C_kx - Sx * C_kz; - const float Cy = C_ky - Sy * C_kz; - - /* Calculate scaled barycentric coordinates. */ - float U = Cx * By - Cy * Bx; - float V = Ax * Cy - Ay * Cx; - float W = Bx * Ay - By * Ax; - if((U < 0.0f || V < 0.0f || W < 0.0f) && - (U > 0.0f || V > 0.0f || W > 0.0f)) - { - return false; - } #endif - - /* Calculate determinant. */ - float det = U + V + W; - if(UNLIKELY(det == 0.0f)) { - return false; - } - - /* Calculate scaled z-coordinates of vertices and use them to calculate - * the hit distance. - */ - const float T = (U * A_kz + V * B_kz + W * C_kz) * Sz; - const int sign_det = (__float_as_int(det) & 0x80000000); - const float sign_T = xor_signmask(T, sign_det); - if((sign_T < 0.0f) || - (sign_T > isect->t * xor_signmask(det, sign_det))) + float t, u, v; + if(ray_triangle_intersect(isect_precalc, + P, isect->t, +#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) + ssef_verts, +#else + float4_to_float3(tri_a), + float4_to_float3(tri_b), + float4_to_float3(tri_c), +#endif + &u, &v, &t)) { - return false; - } - #ifdef __VISIBILITY_FLAG__ - /* visibility flag test. we do it here under the assumption - * that most triangles are culled by node flags */ - if(kernel_tex_fetch(__prim_visibility, prim_addr) & visibility) + /* Visibility flag test. we do it here under the assumption + * that most triangles are culled by node flags. + */ + if(kernel_tex_fetch(__prim_visibility, prim_addr) & visibility) #endif - { -#ifdef __KERNEL_CUDA__ - if(A == B && B == C) { - return false; + { + isect->prim = prim_addr; + isect->object = object; + isect->type = PRIMITIVE_TRIANGLE; + isect->u = u; + isect->v = v; + isect->t = t; + return true; } -#endif - /* Normalize U, V, W, and T. */ - const float inv_det = 1.0f / det; - isect->prim = prim_addr; - isect->object = object; - isect->type = PRIMITIVE_TRIANGLE; - isect->u = U * inv_det; - isect->v = V * inv_det; - isect->t = T * inv_det; - return true; } return false; } @@ -260,7 +86,7 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg, #ifdef __SUBSURFACE__ ccl_device_inline void triangle_intersect_subsurface( KernelGlobals *kg, - const IsectPrecalc *isect_precalc, + const TriangleIsectPrecalc *isect_precalc, SubsurfaceIntersection *ss_isect, float3 P, int object, @@ -269,129 +95,30 @@ ccl_device_inline void triangle_intersect_subsurface( uint *lcg_state, int max_hits) { - const int kx = isect_precalc->kx; - const int ky = isect_precalc->ky; - const int kz = isect_precalc->kz; - const float Sx = isect_precalc->Sx; - const float Sy = isect_precalc->Sy; - const float Sz = isect_precalc->Sz; - - /* Calculate vertices relative to ray origin. */ const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, prim_addr); - const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0), - tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1), - tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2); #if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) - const avxf avxf_P(P.m128, P.m128); - - const avxf tri_ab = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 0); - const avxf tri_bc = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 1); - - const avxf AB = tri_ab - avxf_P; - const avxf BC = tri_bc - avxf_P; - - const __m256i permuteMask = _mm256_set_epi32(0x3, kz, ky, kx, 0x3, kz, ky, kx); - - const avxf AB_k = shuffle(AB, permuteMask); - const avxf BC_k = shuffle(BC, permuteMask); - - /* Akz, Akz, Bkz, Bkz, Bkz, Bkz, Ckz, Ckz */ - const avxf ABBC_kz = shuffle<2>(AB_k, BC_k); - - /* Akx, Aky, Bkx, Bky, Bkx,Bky, Ckx, Cky */ - const avxf ABBC_kxy = shuffle<0,1,0,1>(AB_k, BC_k); - - const avxf Sxy(Sy, Sx, Sy, Sx); - - /* Ax, Ay, Bx, By, Bx, By, Cx, Cy */ - const avxf ABBC_xy = nmadd(ABBC_kz, Sxy, ABBC_kxy); - - float ABBC_kz_array[8]; - _mm256_storeu_ps((float*)&ABBC_kz_array, ABBC_kz); - - const float A_kz = ABBC_kz_array[0]; - const float B_kz = ABBC_kz_array[2]; - const float C_kz = ABBC_kz_array[6]; - - /* By, Bx, Cy, Cx, By, Bx, Ay, Ax */ - const avxf BCBA_yx = permute<3,2,7,6,3,2,1,0>(ABBC_xy); - - const avxf negMask(0,0,0,0,0x80000000, 0x80000000, 0x80000000, 0x80000000); - - /* W U V - * (AxBy-AyBx) (BxCy-ByCx) XX XX (BxBy-ByBx) (CxAy-CyAx) XX XX - */ - const avxf WUxxxxVxx_neg = _mm256_hsub_ps(ABBC_xy * BCBA_yx, negMask /* Dont care */); - - const avxf WUVWnegWUVW = permute<0,1,5,0,0,1,5,0>(WUxxxxVxx_neg) ^ negMask; - - /* Calculate scaled barycentric coordinates. */ - float WUVW_array[4]; - _mm_storeu_ps((float*)&WUVW_array, _mm256_castps256_ps128 (WUVWnegWUVW)); - - const float W = WUVW_array[0]; - const float U = WUVW_array[1]; - const float V = WUVW_array[2]; - - const int WUVW_mask = 0x7 & _mm256_movemask_ps(WUVWnegWUVW); - const int WUVW_zero = 0x7 & _mm256_movemask_ps(_mm256_cmp_ps(WUVWnegWUVW, - _mm256_setzero_ps(), 0)); - - if(!((WUVW_mask == 7) || (WUVW_mask == 0)) && ((WUVW_mask | WUVW_zero) != 7)) { - return; - } + const ssef *ssef_verts = (ssef*)&kg->__prim_tri_verts.data[tri_vindex]; #else - const float3 A = make_float3(tri_a.x - P.x, tri_a.y - P.y, tri_a.z - P.z); - const float3 B = make_float3(tri_b.x - P.x, tri_b.y - P.y, tri_b.z - P.z); - const float3 C = make_float3(tri_c.x - P.x, tri_c.y - P.y, tri_c.z - P.z); - - const float A_kx = IDX(A, kx), A_ky = IDX(A, ky), A_kz = IDX(A, kz); - const float B_kx = IDX(B, kx), B_ky = IDX(B, ky), B_kz = IDX(B, kz); - const float C_kx = IDX(C, kx), C_ky = IDX(C, ky), C_kz = IDX(C, kz); - - /* Perform shear and scale of vertices. */ - const float Ax = A_kx - Sx * A_kz; - const float Ay = A_ky - Sy * A_kz; - const float Bx = B_kx - Sx * B_kz; - const float By = B_ky - Sy * B_kz; - const float Cx = C_kx - Sx * C_kz; - const float Cy = C_ky - Sy * C_kz; - - /* Calculate scaled barycentric coordinates. */ - float U = Cx * By - Cy * Bx; - float V = Ax * Cy - Ay * Cx; - float W = Bx * Ay - By * Ax; - - if((U < 0.0f || V < 0.0f || W < 0.0f) && - (U > 0.0f || V > 0.0f || W > 0.0f)) - { - return; - } + const float3 tri_a = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+0)), + tri_b = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+1)), + tri_c = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+2)); #endif - - /* Calculate determinant. */ - float det = U + V + W; - if(UNLIKELY(det == 0.0f)) { - return; - } - - /* Calculate scaled z−coordinates of vertices and use them to calculate - * the hit distance. - */ - const int sign_det = (__float_as_int(det) & 0x80000000); - const float T = (U * A_kz + V * B_kz + W * C_kz) * Sz; - const float sign_T = xor_signmask(T, sign_det); - if((sign_T < 0.0f) || - (sign_T > tmax * xor_signmask(det, sign_det))) + float t, u, v; + if(!ray_triangle_intersect(isect_precalc, + P, tmax, +#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) + ssef_verts, +#else + tri_a, + tri_b, + tri_c, +#endif + &u, &v, &t)) { return; } - /* Normalize U, V, W, and T. */ - const float inv_det = 1.0f / det; - - const float t = T * inv_det; for(int i = min(max_hits, ss_isect->num_hits) - 1; i >= 0; --i) { if(ss_isect->hits[i].t == t) { return; @@ -418,16 +145,18 @@ ccl_device_inline void triangle_intersect_subsurface( isect->prim = prim_addr; isect->object = object; isect->type = PRIMITIVE_TRIANGLE; - isect->u = U * inv_det; - isect->v = V * inv_det; + isect->u = u; + isect->v = v; isect->t = t; /* Record geometric normal. */ - /* TODO(sergey): Use float4_to_float3() on just an edges. */ - const float3 v0 = float4_to_float3(tri_a); - const float3 v1 = float4_to_float3(tri_b); - const float3 v2 = float4_to_float3(tri_c); - ss_isect->Ng[hit] = normalize(cross(v1 - v0, v2 - v0)); + /* TODO(sergey): Check whether it's faster to re-use ssef verts. */ +#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) + const float3 tri_a = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+0)), + tri_b = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+1)), + tri_c = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+2)); +#endif + ss_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a)); } #endif @@ -570,6 +299,4 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg, return P; } -#undef IDX - CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h index 6c3ee6b8098..c589c112cc2 100644 --- a/intern/cycles/kernel/kernel_accumulate.h +++ b/intern/cycles/kernel/kernel_accumulate.h @@ -399,7 +399,7 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z); /* Reject invalid value */ - if(!isfinite(sum)) { + if(!isfinite_safe(sum)) { kernel_assert(!"Non-finite sum in path_radiance_clamp_and_sum!"); L_sum = make_float3(0.0f, 0.0f, 0.0f); @@ -468,7 +468,7 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi /* Reject invalid value */ float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z); - if(!isfinite(sum)) { + if(!isfinite_safe(sum)) { kernel_assert(!"Non-finite final sum in path_radiance_clamp_and_sum!"); L_sum = make_float3(0.0f, 0.0f, 0.0f); } diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h index e347a1eca18..96276f313f8 100644 --- a/intern/cycles/kernel/kernel_compat_cpu.h +++ b/intern/cycles/kernel/kernel_compat_cpu.h @@ -87,9 +87,9 @@ template<typename T> struct texture { ccl_always_inline avxf fetch_avxf(const int index) { kernel_assert(index >= 0 && (index+1) < width); - ssef *ssefData = (ssef*)data; - ssef *ssefNodeData = &ssefData[index]; - return _mm256_loadu_ps((float *)ssefNodeData); + ssef *ssef_data = (ssef*)data; + ssef *ssef_node_data = &ssef_data[index]; + return _mm256_loadu_ps((float *)ssef_node_data); } #endif diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h index bc2d9604122..cf14a159e47 100644 --- a/intern/cycles/kernel/kernel_emission.h +++ b/intern/cycles/kernel/kernel_emission.h @@ -157,7 +157,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, return false; if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) { - float probability = max3(bsdf_eval_sum(eval)) * kernel_data.integrator.light_inv_rr_threshold; + float probability = max3(fabs(bsdf_eval_sum(eval))) * kernel_data.integrator.light_inv_rr_threshold; if(probability < 1.0f) { if(rand_terminate >= probability) { return false; diff --git a/intern/cycles/kernel/kernel_math.h b/intern/cycles/kernel/kernel_math.h index 9bee5603474..7653fa53247 100644 --- a/intern/cycles/kernel/kernel_math.h +++ b/intern/cycles/kernel/kernel_math.h @@ -20,6 +20,7 @@ #include "util_color.h" #include "util_math.h" #include "util_math_fast.h" +#include "util_math_intersect.h" #include "util_texture.h" #include "util_transform.h" diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h index fe88ba4ff05..64d240d779d 100644 --- a/intern/cycles/kernel/kernel_subsurface.h +++ b/intern/cycles/kernel/kernel_subsurface.h @@ -293,7 +293,7 @@ ccl_device_inline int subsurface_scatter_multi_intersect( /* intersect with the same object. if multiple intersections are found it * will use at most BSSRDF_MAX_HITS hits, a random subset of all hits */ scene_intersect_subsurface(kg, - ray, + *ray, ss_isect, sd->object, lcg_state, @@ -448,7 +448,7 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathS /* intersect with the same object. if multiple intersections are * found it will randomly pick one of them */ SubsurfaceIntersection ss_isect; - scene_intersect_subsurface(kg, &ray, &ss_isect, sd->object, lcg_state, 1); + scene_intersect_subsurface(kg, ray, &ss_isect, sd->object, lcg_state, 1); /* evaluate bssrdf */ if(ss_isect.num_hits > 0) { diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h index d159c801810..dfc44dbbf4a 100644 --- a/intern/cycles/render/nodes.h +++ b/intern/cycles/render/nodes.h @@ -324,7 +324,7 @@ private: class BsdfNode : public ShaderNode { public: explicit BsdfNode(const NodeType *node_type); - SHADER_NODE_BASE_CLASS(BsdfNode); + SHADER_NODE_BASE_CLASS(BsdfNode) bool has_spatial_varying() { return true; } void compile(SVMCompiler& compiler, ShaderInput *param1, ShaderInput *param2, ShaderInput *param3 = NULL, ShaderInput *param4 = NULL); @@ -641,7 +641,7 @@ public: class MixClosureWeightNode : public ShaderNode { public: - SHADER_NODE_CLASS(MixClosureWeightNode); + SHADER_NODE_CLASS(MixClosureWeightNode) float weight; float fac; @@ -887,7 +887,7 @@ public: class CurvesNode : public ShaderNode { public: explicit CurvesNode(const NodeType *node_type); - SHADER_NODE_BASE_CLASS(CurvesNode); + SHADER_NODE_BASE_CLASS(CurvesNode) virtual int get_group() { return NODE_GROUP_LEVEL_3; } diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp index 0c7bd271371..e7050f9ef37 100644 --- a/intern/cycles/render/session.cpp +++ b/intern/cycles/render/session.cpp @@ -833,7 +833,7 @@ void Session::update_status_time(bool show_pause, bool show_done) int progressive_sample = tile_manager.state.sample; int num_samples = tile_manager.get_num_effective_samples(); - int tile = tile_manager.state.num_rendered_tiles; + int tile = progress.get_finished_tiles(); int num_tiles = tile_manager.state.num_tiles; /* update status */ @@ -841,7 +841,7 @@ void Session::update_status_time(bool show_pause, bool show_done) if(!params.progressive) { const bool is_cpu = params.device.type == DEVICE_CPU; - const bool is_last_tile = (progress.get_finished_tiles() + 1) == num_tiles; + const bool is_last_tile = (tile + 1) == num_tiles; substatus = string_printf("Path Tracing Tile %d/%d", tile, num_tiles); diff --git a/intern/cycles/render/tile.cpp b/intern/cycles/render/tile.cpp index a493c3fa1cd..76d0b9a57c2 100644 --- a/intern/cycles/render/tile.cpp +++ b/intern/cycles/render/tile.cpp @@ -131,7 +131,6 @@ void TileManager::reset(BufferParams& params_, int num_samples_) state.buffer = BufferParams(); state.sample = range_start_sample - 1; state.num_tiles = 0; - state.num_rendered_tiles = 0; state.num_samples = 0; state.resolution_divider = get_divider(params.width, params.height, start_resolution); state.tiles.clear(); @@ -343,7 +342,6 @@ bool TileManager::next_tile(Tile& tile, int device) tile = Tile(state.tiles[logical_device].front()); state.tiles[logical_device].pop_front(); - state.num_rendered_tiles++; return true; } diff --git a/intern/cycles/render/tile.h b/intern/cycles/render/tile.h index 5d92ebac355..85fc89bc397 100644 --- a/intern/cycles/render/tile.h +++ b/intern/cycles/render/tile.h @@ -63,7 +63,6 @@ public: int num_samples; int resolution_divider; int num_tiles; - int num_rendered_tiles; /* Total samples over all pixels: Generally num_samples*num_pixels, * but can be higher due to the initial resolution division for previews. */ diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt index d8abf671bd6..6bd47120482 100644 --- a/intern/cycles/util/CMakeLists.txt +++ b/intern/cycles/util/CMakeLists.txt @@ -52,6 +52,7 @@ set(SRC_HEADERS util_math.h util_math_cdf.h util_math_fast.h + util_math_intersect.h util_md5.h util_opengl.h util_optimization.h diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h index 27d4ae510c7..2af0e56325f 100644 --- a/intern/cycles/util/util_math.h +++ b/intern/cycles/util/util_math.h @@ -1241,19 +1241,6 @@ ccl_device_inline float __uint_as_float(uint i) return u.f; } -/* Versions of functions which are safe for fast math. */ -ccl_device_inline bool isnan_safe(float f) -{ - unsigned int x = __float_as_uint(f); - return (x << 1) > 0xff000000u; -} - -ccl_device_inline bool isfinite_safe(float f) -{ - /* By IEEE 754 rule, 2*Inf equals Inf */ - unsigned int x = __float_as_uint(f); - return (f == f) && (x == 0 || (f != 2.0f*f)); -} /* Interpolation */ @@ -1271,6 +1258,20 @@ ccl_device_inline float triangle_area(const float3& v1, const float3& v2, const #endif +/* Versions of functions which are safe for fast math. */ +ccl_device_inline bool isnan_safe(float f) +{ + unsigned int x = __float_as_uint(f); + return (x << 1) > 0xff000000u; +} + +ccl_device_inline bool isfinite_safe(float f) +{ + /* By IEEE 754 rule, 2*Inf equals Inf */ + unsigned int x = __float_as_uint(f); + return (f == f) && (x == 0 || (f != 2.0f*f)) && !((x << 1) > 0xff000000u); +} + /* Orthonormal vectors */ ccl_device_inline void make_orthonormals(const float3 N, float3 *a, float3 *b) @@ -1451,181 +1452,9 @@ ccl_device_inline float beta(float x, float y) #endif } -/* Ray Intersection */ - -ccl_device bool ray_sphere_intersect( - float3 ray_P, float3 ray_D, float ray_t, - float3 sphere_P, float sphere_radius, - float3 *isect_P, float *isect_t) +ccl_device_inline float xor_signmask(float x, int y) { - float3 d = sphere_P - ray_P; - float radiussq = sphere_radius*sphere_radius; - float tsq = dot(d, d); - - if(tsq > radiussq) { /* ray origin outside sphere */ - float tp = dot(d, ray_D); - - if(tp < 0.0f) /* dir points away from sphere */ - return false; - - float dsq = tsq - tp*tp; /* pythagoras */ - - if(dsq > radiussq) /* closest point on ray outside sphere */ - return false; - - float t = tp - sqrtf(radiussq - dsq); /* pythagoras */ - - if(t < ray_t) { - *isect_t = t; - *isect_P = ray_P + ray_D*t; - return true; - } - } - - return false; -} - -ccl_device bool ray_aligned_disk_intersect( - float3 ray_P, float3 ray_D, float ray_t, - float3 disk_P, float disk_radius, - float3 *isect_P, float *isect_t) -{ - /* aligned disk normal */ - float disk_t; - float3 disk_N = normalize_len(ray_P - disk_P, &disk_t); - float div = dot(ray_D, disk_N); - - if(UNLIKELY(div == 0.0f)) - return false; - - /* compute t to intersection point */ - float t = -disk_t/div; - if(t < 0.0f || t > ray_t) - return false; - - /* test if within radius */ - float3 P = ray_P + ray_D*t; - if(len_squared(P - disk_P) > disk_radius*disk_radius) - return false; - - *isect_P = P; - *isect_t = t; - - return true; -} - -ccl_device bool ray_triangle_intersect( - float3 ray_P, float3 ray_D, float ray_t, - float3 v0, float3 v1, float3 v2, - float3 *isect_P, float *isect_t) -{ - /* Calculate intersection */ - float3 e1 = v1 - v0; - float3 e2 = v2 - v0; - float3 s1 = cross(ray_D, e2); - - const float divisor = dot(s1, e1); - if(UNLIKELY(divisor == 0.0f)) - return false; - - const float invdivisor = 1.0f/divisor; - - /* compute first barycentric coordinate */ - const float3 d = ray_P - v0; - const float u = dot(d, s1)*invdivisor; - if(u < 0.0f) - return false; - - /* Compute second barycentric coordinate */ - const float3 s2 = cross(d, e1); - const float v = dot(ray_D, s2)*invdivisor; - if(v < 0.0f) - return false; - - const float b0 = 1.0f - u - v; - if(b0 < 0.0f) - return false; - - /* compute t to intersection point */ - const float t = dot(e2, s2)*invdivisor; - if(t < 0.0f || t > ray_t) - return false; - - *isect_t = t; - *isect_P = ray_P + ray_D*t; - - return true; -} - -ccl_device_inline bool ray_triangle_intersect_uv( - float3 ray_P, float3 ray_D, float ray_t, - float3 v0, float3 v1, float3 v2, - float *isect_u, float *isect_v, float *isect_t) -{ - /* Calculate intersection */ - float3 e1 = v1 - v0; - float3 e2 = v2 - v0; - float3 s1 = cross(ray_D, e2); - - const float divisor = dot(s1, e1); - if(UNLIKELY(divisor == 0.0f)) - return false; - - const float invdivisor = 1.0f/divisor; - - /* compute first barycentric coordinate */ - const float3 d = ray_P - v0; - const float u = dot(d, s1)*invdivisor; - if(u < 0.0f) - return false; - - /* Compute second barycentric coordinate */ - const float3 s2 = cross(d, e1); - const float v = dot(ray_D, s2)*invdivisor; - if(v < 0.0f) - return false; - - const float b0 = 1.0f - u - v; - if(b0 < 0.0f) - return false; - - /* compute t to intersection point */ - const float t = dot(e2, s2)*invdivisor; - if(t < 0.0f || t > ray_t) - return false; - - *isect_u = u; - *isect_v = v; - *isect_t = t; - - return true; -} - -ccl_device bool ray_quad_intersect(float3 ray_P, float3 ray_D, float ray_mint, float ray_maxt, - float3 quad_P, float3 quad_u, float3 quad_v, float3 quad_n, - float3 *isect_P, float *isect_t, float *isect_u, float *isect_v) -{ - float t = -(dot(ray_P, quad_n) - dot(quad_P, quad_n)) / dot(ray_D, quad_n); - if(t < ray_mint || t > ray_maxt) - return false; - - float3 hit = ray_P + t*ray_D; - float3 inplane = hit - quad_P; - - float u = dot(inplane, quad_u) / dot(quad_u, quad_u) + 0.5f; - if(u < 0.0f || u > 1.0f) - return false; - - float v = dot(inplane, quad_v) / dot(quad_v, quad_v) + 0.5f; - if(v < 0.0f || v > 1.0f) - return false; - - if(isect_P) *isect_P = hit; - if(isect_t) *isect_t = t; - if(isect_u) *isect_u = u; - if(isect_v) *isect_v = v; - - return true; + return __int_as_float(__float_as_int(x) ^ y); } /* projections */ @@ -1690,4 +1519,3 @@ ccl_device_inline int util_max_axis(float3 vec) CCL_NAMESPACE_END #endif /* __UTIL_MATH_H__ */ - diff --git a/intern/cycles/util/util_math_intersect.h b/intern/cycles/util/util_math_intersect.h new file mode 100644 index 00000000000..9e0587e1afb --- /dev/null +++ b/intern/cycles/util/util_math_intersect.h @@ -0,0 +1,326 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_MATH_INTERSECT_H__ +#define __UTIL_MATH_INTERSECT_H__ + +CCL_NAMESPACE_BEGIN + +/* Ray Intersection */ + +ccl_device bool ray_sphere_intersect( + float3 ray_P, float3 ray_D, float ray_t, + float3 sphere_P, float sphere_radius, + float3 *isect_P, float *isect_t) +{ + const float3 d = sphere_P - ray_P; + const float radiussq = sphere_radius*sphere_radius; + const float tsq = dot(d, d); + + if(tsq > radiussq) { + /* Ray origin outside sphere. */ + const float tp = dot(d, ray_D); + if(tp < 0.0f) { + /* Ray points away from sphere. */ + return false; + } + const float dsq = tsq - tp*tp; /* pythagoras */ + if(dsq > radiussq) { + /* Closest point on ray outside sphere. */ + return false; + } + const float t = tp - sqrtf(radiussq - dsq); /* pythagoras */ + if(t < ray_t) { + *isect_t = t; + *isect_P = ray_P + ray_D*t; + return true; + } + } + return false; +} + +ccl_device bool ray_aligned_disk_intersect( + float3 ray_P, float3 ray_D, float ray_t, + float3 disk_P, float disk_radius, + float3 *isect_P, float *isect_t) +{ + /* Aligned disk normal. */ + float disk_t; + const float3 disk_N = normalize_len(ray_P - disk_P, &disk_t); + const float div = dot(ray_D, disk_N); + if(UNLIKELY(div == 0.0f)) { + return false; + } + /* Compute t to intersection point. */ + const float t = -disk_t/div; + if(t < 0.0f || t > ray_t) { + return false; + } + /* Test if within radius. */ + float3 P = ray_P + ray_D*t; + if(len_squared(P - disk_P) > disk_radius*disk_radius) { + return false; + } + *isect_P = P; + *isect_t = t; + return true; +} + +/* Optimized watertight ray-triangle intersection. + * + * Sven Woop + * Watertight Ray/Triangle Intersection + * + * http://jcgt.org/published/0002/01/05/paper.pdf + */ + +/* Precalculated data for the ray->tri intersection. */ +typedef struct TriangleIsectPrecalc { + /* Maximal dimension kz, and orthogonal dimensions. */ + int kx, ky, kz; + + /* Shear constants. */ + float Sx, Sy, Sz; +} TriangleIsectPrecalc; + +/* Workaround stupidness of CUDA/OpenCL which doesn't allow to access indexed + * component of float3 value. + */ +#ifdef __KERNEL_GPU__ +# define IDX(vec, idx) \ + ((idx == 0) ? ((vec).x) : ( (idx == 1) ? ((vec).y) : ((vec).z) )) +#else +# define IDX(vec, idx) ((vec)[idx]) +#endif + +#if (defined(__KERNEL_OPENCL_APPLE__)) || \ + (defined(__KERNEL_CUDA__) && (defined(i386) || defined(_M_IX86))) +ccl_device_noinline +#else +ccl_device_inline +#endif +void ray_triangle_intersect_precalc(float3 dir, + TriangleIsectPrecalc *isect_precalc) +{ + /* Calculate dimension where the ray direction is maximal. */ +#ifndef __KERNEL_SSE__ + int kz = util_max_axis(make_float3(fabsf(dir.x), + fabsf(dir.y), + fabsf(dir.z))); + int kx = kz + 1; if(kx == 3) kx = 0; + int ky = kx + 1; if(ky == 3) ky = 0; +#else + int kx, ky, kz; + /* Avoiding mispredicted branch on direction. */ + kz = util_max_axis(fabs(dir)); + static const char inc_xaxis[] = {1, 2, 0, 55}; + static const char inc_yaxis[] = {2, 0, 1, 55}; + kx = inc_xaxis[kz]; + ky = inc_yaxis[kz]; +#endif + + float dir_kz = IDX(dir, kz); + + /* Swap kx and ky dimensions to preserve winding direction of triangles. */ + if(dir_kz < 0.0f) { + int tmp = kx; + kx = ky; + ky = tmp; + } + + /* Calculate the shear constants. */ + float inv_dir_z = 1.0f / dir_kz; + isect_precalc->Sx = IDX(dir, kx) * inv_dir_z; + isect_precalc->Sy = IDX(dir, ky) * inv_dir_z; + isect_precalc->Sz = inv_dir_z; + + /* Store the dimensions. */ + isect_precalc->kx = kx; + isect_precalc->ky = ky; + isect_precalc->kz = kz; +} + +ccl_device_forceinline bool ray_triangle_intersect( + const TriangleIsectPrecalc *isect_precalc, + float3 ray_P, float ray_t, +#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) + const ssef *ssef_verts, +#else + const float3 tri_a, const float3 tri_b, const float3 tri_c, +#endif + float *isect_u, float *isect_v, float *isect_t) +{ + const int kx = isect_precalc->kx; + const int ky = isect_precalc->ky; + const int kz = isect_precalc->kz; + const float Sx = isect_precalc->Sx; + const float Sy = isect_precalc->Sy; + const float Sz = isect_precalc->Sz; + +#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) + const avxf avxf_P(ray_P.m128, ray_P.m128); + const avxf tri_ab(_mm256_loadu_ps((float *)(ssef_verts))); + const avxf tri_bc(_mm256_loadu_ps((float *)(ssef_verts + 1))); + + const avxf AB = tri_ab - avxf_P; + const avxf BC = tri_bc - avxf_P; + + const __m256i permute_mask = _mm256_set_epi32(0x3, kz, ky, kx, 0x3, kz, ky, kx); + + const avxf AB_k = shuffle(AB, permute_mask); + const avxf BC_k = shuffle(BC, permute_mask); + + /* Akz, Akz, Bkz, Bkz, Bkz, Bkz, Ckz, Ckz */ + const avxf ABBC_kz = shuffle<2>(AB_k, BC_k); + + /* Akx, Aky, Bkx, Bky, Bkx,Bky, Ckx, Cky */ + const avxf ABBC_kxy = shuffle<0,1,0,1>(AB_k, BC_k); + + const avxf Sxy(Sy, Sx, Sy, Sx); + + /* Ax, Ay, Bx, By, Bx, By, Cx, Cy */ + const avxf ABBC_xy = nmadd(ABBC_kz, Sxy, ABBC_kxy); + + float ABBC_kz_array[8]; + _mm256_storeu_ps((float*)&ABBC_kz_array, ABBC_kz); + + const float A_kz = ABBC_kz_array[0]; + const float B_kz = ABBC_kz_array[2]; + const float C_kz = ABBC_kz_array[6]; + + /* By, Bx, Cy, Cx, By, Bx, Ay, Ax */ + const avxf BCBA_yx = permute<3,2,7,6,3,2,1,0>(ABBC_xy); + + const avxf neg_mask(0,0,0,0,0x80000000, 0x80000000, 0x80000000, 0x80000000); + + /* W U V + * (AxBy-AyBx) (BxCy-ByCx) XX XX (BxBy-ByBx) (CxAy-CyAx) XX XX + */ + const avxf WUxxxxVxx_neg = _mm256_hsub_ps(ABBC_xy * BCBA_yx, neg_mask /* Dont care */); + + const avxf WUVWnegWUVW = permute<0,1,5,0,0,1,5,0>(WUxxxxVxx_neg) ^ neg_mask; + + /* Calculate scaled barycentric coordinates. */ + float WUVW_array[4]; + _mm_storeu_ps((float*)&WUVW_array, _mm256_castps256_ps128 (WUVWnegWUVW)); + + const float W = WUVW_array[0]; + const float U = WUVW_array[1]; + const float V = WUVW_array[2]; + + const int WUVW_mask = 0x7 & _mm256_movemask_ps(WUVWnegWUVW); + const int WUVW_zero = 0x7 & _mm256_movemask_ps(_mm256_cmp_ps(WUVWnegWUVW, + _mm256_setzero_ps(), 0)); + + if(!((WUVW_mask == 7) || (WUVW_mask == 0)) && ((WUVW_mask | WUVW_zero) != 7)) { + return false; + } +#else + /* Calculate vertices relative to ray origin. */ + const float3 A = make_float3(tri_a.x - ray_P.x, tri_a.y - ray_P.y, tri_a.z - ray_P.z); + const float3 B = make_float3(tri_b.x - ray_P.x, tri_b.y - ray_P.y, tri_b.z - ray_P.z); + const float3 C = make_float3(tri_c.x - ray_P.x, tri_c.y - ray_P.y, tri_c.z - ray_P.z); + + const float A_kx = IDX(A, kx), A_ky = IDX(A, ky), A_kz = IDX(A, kz); + const float B_kx = IDX(B, kx), B_ky = IDX(B, ky), B_kz = IDX(B, kz); + const float C_kx = IDX(C, kx), C_ky = IDX(C, ky), C_kz = IDX(C, kz); + + /* Perform shear and scale of vertices. */ + const float Ax = A_kx - Sx * A_kz; + const float Ay = A_ky - Sy * A_kz; + const float Bx = B_kx - Sx * B_kz; + const float By = B_ky - Sy * B_kz; + const float Cx = C_kx - Sx * C_kz; + const float Cy = C_ky - Sy * C_kz; + + /* Calculate scaled barycentric coordinates. */ + float U = Cx * By - Cy * Bx; + float V = Ax * Cy - Ay * Cx; + float W = Bx * Ay - By * Ax; + if((U < 0.0f || V < 0.0f || W < 0.0f) && + (U > 0.0f || V > 0.0f || W > 0.0f)) + { + return false; + } +#endif + + /* Calculate determinant. */ + float det = U + V + W; + if(UNLIKELY(det == 0.0f)) { + return false; + } + + /* Calculate scaled z-coordinates of vertices and use them to calculate + * the hit distance. + */ + const float T = (U * A_kz + V * B_kz + W * C_kz) * Sz; + const int sign_det = (__float_as_int(det) & 0x80000000); + const float sign_T = xor_signmask(T, sign_det); + if((sign_T < 0.0f) || + (sign_T > ray_t * xor_signmask(det, sign_det))) + { + return false; + } + + /* Workaround precision error on CUDA. */ +#ifdef __KERNEL_CUDA__ + if(A == B && B == C) { + return false; + } +#endif + const float inv_det = 1.0f / det; + *isect_u = U * inv_det; + *isect_v = V * inv_det; + *isect_t = T * inv_det; + return true; +} + +#undef IDX + +ccl_device bool ray_quad_intersect(float3 ray_P, float3 ray_D, + float ray_mint, float ray_maxt, + float3 quad_P, + float3 quad_u, float3 quad_v, float3 quad_n, + float3 *isect_P, float *isect_t, + float *isect_u, float *isect_v) +{ + /* Perform intersection test. */ + float t = -(dot(ray_P, quad_n) - dot(quad_P, quad_n)) / dot(ray_D, quad_n); + if(t < ray_mint || t > ray_maxt) { + return false; + } + const float3 hit = ray_P + t*ray_D; + const float3 inplane = hit - quad_P; + const float u = dot(inplane, quad_u) / dot(quad_u, quad_u) + 0.5f; + if(u < 0.0f || u > 1.0f) { + return false; + } + const float v = dot(inplane, quad_v) / dot(quad_v, quad_v) + 0.5f; + if(v < 0.0f || v > 1.0f) { + return false; + } + /* Store the result. */ + /* TODO(sergey): Check whether we can avoid some checks here. */ + if(isect_P != NULL) *isect_P = hit; + if(isect_t != NULL) *isect_t = t; + if(isect_u != NULL) *isect_u = u; + if(isect_v != NULL) *isect_v = v; + return true; +} + +CCL_NAMESPACE_END + +#endif /* __UTIL_MATH_INTERSECT_H__ */ |