From 23098cda9936d785988b689ee69e58e900f17cb2 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Fri, 6 Oct 2017 21:47:41 +0200 Subject: Code refactor: make texture code more consistent between devices. * Use common TextureInfo struct for all devices, except CUDA fermi. * Move image sampling code to kernels/*/kernel_*_image.h files. * Use arrays for data textures on Fermi too, so device_vector works. --- intern/cycles/device/device.h | 1 + intern/cycles/device/device_cpu.cpp | 68 ++- intern/cycles/device/device_cuda.cpp | 142 +++--- intern/cycles/device/opencl/opencl.h | 11 +- intern/cycles/device/opencl/opencl_base.cpp | 53 +-- intern/cycles/device/opencl/opencl_split.cpp | 10 +- intern/cycles/kernel/CMakeLists.txt | 4 +- intern/cycles/kernel/geom/geom_volume.h | 42 +- intern/cycles/kernel/kernel_compat_cpu.h | 449 +------------------ intern/cycles/kernel/kernel_compat_cuda.h | 40 +- intern/cycles/kernel/kernel_compat_opencl.h | 2 +- intern/cycles/kernel/kernel_globals.h | 31 +- intern/cycles/kernel/kernel_image_opencl.h | 252 ----------- intern/cycles/kernel/kernel_textures.h | 83 ++-- intern/cycles/kernel/kernels/cpu/kernel.cpp | 122 +----- .../cycles/kernel/kernels/cpu/kernel_cpu_image.h | 488 +++++++++++++++++++-- intern/cycles/kernel/kernels/cuda/kernel.cu | 1 + .../cycles/kernel/kernels/cuda/kernel_cuda_image.h | 175 ++++++++ intern/cycles/kernel/kernels/opencl/kernel.cl | 2 +- .../kernel/kernels/opencl/kernel_opencl_image.h | 229 ++++++++++ intern/cycles/kernel/osl/osl_services.cpp | 4 +- intern/cycles/kernel/split/kernel_split_common.h | 5 +- intern/cycles/kernel/svm/svm_image.h | 128 ------ intern/cycles/kernel/svm/svm_voxel.h | 25 +- intern/cycles/util/util_texture.h | 56 ++- intern/cycles/util/util_types.h | 46 -- 26 files changed, 1146 insertions(+), 1323 deletions(-) delete mode 100644 intern/cycles/kernel/kernel_image_opencl.h create mode 100644 intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h create mode 100644 intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h (limited to 'intern/cycles') diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index 26d6d380a10..0e0a0079209 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -26,6 +26,7 @@ #include "util/util_stats.h" #include "util/util_string.h" #include "util/util_thread.h" +#include "util/util_texture.h" #include "util/util_types.h" #include "util/util_vector.h" diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 19e3c0a9075..ac6d3246d38 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -163,6 +163,9 @@ public: TaskPool task_pool; KernelGlobals kernel_globals; + device_vector texture_info; + bool need_texture_info; + #ifdef WITH_OSL OSLGlobals osl_globals; #endif @@ -235,6 +238,8 @@ public: VLOG(1) << "Will be using split kernel."; } + need_texture_info = false; + #define REGISTER_SPLIT_KERNEL(name) split_kernels[#name] = KernelFunctions(KERNEL_FUNCTIONS(name)) REGISTER_SPLIT_KERNEL(path_init); REGISTER_SPLIT_KERNEL(scene_intersect); @@ -261,6 +266,7 @@ public: ~CPUDevice() { task_pool.stop(); + tex_free(texture_info); } virtual bool show_samples() const @@ -268,6 +274,15 @@ public: return (TaskScheduler::num_threads() == 1); } + void load_texture_info() + { + if(need_texture_info) { + tex_free(texture_info); + tex_alloc("__texture_info", texture_info, INTERPOLATION_NONE, EXTENSION_REPEAT); + need_texture_info = false; + } + } + void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/) { if(name) { @@ -333,14 +348,47 @@ public: VLOG(1) << "Texture allocate: " << name << ", " << string_human_readable_number(mem.memory_size()) << " bytes. (" << string_human_readable_size(mem.memory_size()) << ")"; - kernel_tex_copy(&kernel_globals, - name, - mem.data_pointer, - mem.data_width, - mem.data_height, - mem.data_depth, - interpolation, - extension); + + if(interpolation == INTERPOLATION_NONE) { + /* Data texture. */ + kernel_tex_copy(&kernel_globals, + name, + mem.data_pointer, + mem.data_width, + mem.data_height, + mem.data_depth, + interpolation, + extension); + } + else { + /* Image Texture. */ + int flat_slot = 0; + if(string_startswith(name, "__tex_image")) { + int pos = string(name).rfind("_"); + flat_slot = atoi(name + pos + 1); + } + else { + assert(0); + } + + if(flat_slot >= texture_info.size()) { + /* Allocate some slots in advance, to reduce amount + * of re-allocations. */ + texture_info.resize(flat_slot + 128); + } + + TextureInfo& info = texture_info.get_data()[flat_slot]; + info.data = (uint64_t)mem.data_pointer; + info.cl_buffer = 0; + info.interpolation = interpolation; + info.extension = extension; + info.width = mem.data_width; + info.height = mem.data_height; + info.depth = mem.data_depth; + + need_texture_info = true; + } + mem.device_pointer = mem.data_pointer; mem.device_size = mem.memory_size(); stats.mem_alloc(mem.device_size); @@ -352,6 +400,7 @@ public: mem.device_pointer = 0; stats.mem_free(mem.device_size); mem.device_size = 0; + need_texture_info = true; } } @@ -784,6 +833,9 @@ public: void task_add(DeviceTask& task) { + /* Load texture info. */ + load_texture_info(); + /* split task into smaller ones */ list tasks; diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 734edcff503..dcbe6033bcc 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -129,7 +129,7 @@ public: CUcontext cuContext; CUmodule cuModule, cuFilterModule; map tex_interp_map; - map tex_bindless_map; + map tex_bindless_map; int cuDevId; int cuDevArchitecture; bool first_error; @@ -145,8 +145,8 @@ public: map pixel_mem_map; /* Bindless Textures */ - device_vector bindless_mapping; - bool need_bindless_mapping; + device_vector texture_info; + bool need_texture_info; CUdeviceptr cuda_device_ptr(device_ptr mem) { @@ -231,7 +231,7 @@ public: split_kernel = NULL; - need_bindless_mapping = false; + need_texture_info = false; /* intialize */ if(cuda_error(cuInit(0))) @@ -274,7 +274,7 @@ public: delete split_kernel; if(info.has_bindless_textures) { - tex_free(bindless_mapping); + tex_free(texture_info); } cuda_assert(cuCtxDestroy(cuContext)); @@ -544,12 +544,12 @@ public: return (result == CUDA_SUCCESS); } - void load_bindless_mapping() + void load_texture_info() { - if(info.has_bindless_textures && need_bindless_mapping) { - tex_free(bindless_mapping); - tex_alloc("__bindless_mapping", bindless_mapping, INTERPOLATION_NONE, EXTENSION_REPEAT); - need_bindless_mapping = false; + if(info.has_bindless_textures && need_texture_info) { + tex_free(texture_info); + tex_alloc("__texture_info", texture_info, INTERPOLATION_NONE, EXTENSION_REPEAT); + need_texture_info = false; } } @@ -646,8 +646,7 @@ public: << string_human_readable_number(mem.memory_size()) << " bytes. (" << string_human_readable_size(mem.memory_size()) << ")"; - /* Check if we are on sm_30 or above. - * We use arrays and bindles textures for storage there */ + /* Check if we are on sm_30 or above, for bindless textures. */ bool has_bindless_textures = info.has_bindless_textures; /* General variables for both architectures */ @@ -679,20 +678,10 @@ public: filter_mode = CU_TR_FILTER_MODE_LINEAR; } - CUarray_format_enum format; - switch(mem.data_type) { - case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break; - case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break; - case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break; - case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break; - case TYPE_HALF: format = CU_AD_FORMAT_HALF; break; - default: assert(0); return; - } - /* General variables for Fermi */ CUtexref texref = NULL; - if(!has_bindless_textures) { + if(!has_bindless_textures && interpolation != INTERPOLATION_NONE) { if(mem.data_depth > 1) { /* Kernel uses different bind names for 2d and 3d float textures, * so we have to adjust couple of things here. @@ -711,41 +700,41 @@ public: } } - /* Data Storage */ if(interpolation == INTERPOLATION_NONE) { - if(has_bindless_textures) { - mem_alloc(NULL, mem, MEM_READ_ONLY); - mem_copy_to(mem); + /* Data Storage */ + mem_alloc(NULL, mem, MEM_READ_ONLY); + mem_copy_to(mem); - CUdeviceptr cumem; - size_t cubytes; + CUdeviceptr cumem; + size_t cubytes; - cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str())); + cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str())); - if(cubytes == 8) { - /* 64 bit device pointer */ - uint64_t ptr = mem.device_pointer; - cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes)); - } - else { - /* 32 bit device pointer */ - uint32_t ptr = (uint32_t)mem.device_pointer; - cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes)); - } + if(cubytes == 8) { + /* 64 bit device pointer */ + uint64_t ptr = mem.device_pointer; + cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes)); } else { - mem_alloc(NULL, mem, MEM_READ_ONLY); - mem_copy_to(mem); - - cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size)); - cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT)); - cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER)); + /* 32 bit device pointer */ + uint32_t ptr = (uint32_t)mem.device_pointer; + cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes)); } } - /* Texture Storage */ else { + /* Texture Storage */ CUarray handle = NULL; + CUarray_format_enum format; + switch(mem.data_type) { + case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break; + case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break; + case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break; + case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break; + case TYPE_HALF: format = CU_AD_FORMAT_HALF; break; + default: assert(0); return; + } + if(mem.data_depth > 1) { CUDA_ARRAY3D_DESCRIPTOR desc; @@ -810,8 +799,8 @@ public: stats.mem_alloc(size); - /* Bindless Textures - Kepler */ if(has_bindless_textures) { + /* Bindless Textures - Kepler */ int flat_slot = 0; if(string_startswith(name, "__tex_image")) { int pos = string(name).rfind("_"); @@ -844,35 +833,39 @@ public: } /* Resize once */ - if(flat_slot >= bindless_mapping.size()) { + if(flat_slot >= texture_info.size()) { /* Allocate some slots in advance, to reduce amount - * of re-allocations. - */ - bindless_mapping.resize(flat_slot + 128); + * of re-allocations. */ + texture_info.resize(flat_slot + 128); } /* Set Mapping and tag that we need to (re-)upload to device */ - bindless_mapping.get_data()[flat_slot] = (uint)tex; - tex_bindless_map[mem.device_pointer] = (uint)tex; - need_bindless_mapping = true; + TextureInfo& info = texture_info.get_data()[flat_slot]; + info.data = (uint64_t)tex; + info.cl_buffer = 0; + info.interpolation = interpolation; + info.extension = extension; + info.width = mem.data_width; + info.height = mem.data_height; + info.depth = mem.data_depth; + + tex_bindless_map[mem.device_pointer] = tex; + need_texture_info = true; } - /* Regular Textures - Fermi */ else { + /* Regular Textures - Fermi */ cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT)); cuda_assert(cuTexRefSetFilterMode(texref, filter_mode)); cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES)); - } - } - /* Fermi, Data and Image Textures */ - if(!has_bindless_textures) { - cuda_assert(cuTexRefSetAddressMode(texref, 0, address_mode)); - cuda_assert(cuTexRefSetAddressMode(texref, 1, address_mode)); - if(mem.data_depth > 1) { - cuda_assert(cuTexRefSetAddressMode(texref, 2, address_mode)); - } + cuda_assert(cuTexRefSetAddressMode(texref, 0, address_mode)); + cuda_assert(cuTexRefSetAddressMode(texref, 1, address_mode)); + if(mem.data_depth > 1) { + cuda_assert(cuTexRefSetAddressMode(texref, 2, address_mode)); + } - cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements)); + cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements)); + } } /* Fermi and Kepler */ @@ -888,8 +881,8 @@ public: /* Free CUtexObject (Bindless Textures) */ if(info.has_bindless_textures && tex_bindless_map[mem.device_pointer]) { - uint flat_slot = tex_bindless_map[mem.device_pointer]; - cuTexObjectDestroy(flat_slot); + CUtexObject tex = tex_bindless_map[mem.device_pointer]; + cuTexObjectDestroy(tex); } tex_interp_map.erase(tex_interp_map.find(mem.device_pointer)); @@ -1716,9 +1709,6 @@ public: if(task->type == DeviceTask::RENDER) { RenderTile tile; - /* Upload Bindless Mapping */ - load_bindless_mapping(); - DeviceRequestedFeatures requested_features; if(use_split_kernel()) { if(!use_adaptive_compilation()) { @@ -1759,9 +1749,6 @@ public: } } else if(task->type == DeviceTask::SHADER) { - /* Upload Bindless Mapping */ - load_bindless_mapping(); - shader(*task); cuda_assert(cuCtxSynchronize()); @@ -1784,9 +1771,12 @@ public: void task_add(DeviceTask& task) { - if(task.type == DeviceTask::FILM_CONVERT) { - CUDAContextScope scope(this); + CUDAContextScope scope(this); + /* Load texture info. */ + load_texture_info(); + + if(task.type == DeviceTask::FILM_CONVERT) { /* must be done in main thread due to opengl access */ film_convert(task, task.buffer, task.rgba_byte, task.rgba_half); cuda_assert(cuCtxSynchronize()); diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h index 26bf4a9af5b..bd956e29083 100644 --- a/intern/cycles/device/opencl/opencl.h +++ b/intern/cycles/device/opencl/opencl.h @@ -545,15 +545,10 @@ private: MemoryManager memory_manager; friend class MemoryManager; - struct tex_info_t { - uint buffer, padding; - cl_ulong offset; - uint width, height, depth, options; - }; - static_assert_align(tex_info_t, 16); + static_assert_align(TextureInfo, 16); - vector texture_descriptors; - device_memory texture_descriptors_buffer; + vector texture_info; + device_memory texture_info_buffer; struct Texture { Texture() {} diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp index 3db3efd1103..486ef89d22e 100644 --- a/intern/cycles/device/opencl/opencl_base.cpp +++ b/intern/cycles/device/opencl/opencl_base.cpp @@ -136,11 +136,11 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou return; } - /* Allocate this right away so that texture_descriptors_buffer is placed at offset 0 in the device memory buffers */ - texture_descriptors.resize(1); - texture_descriptors_buffer.resize(1); - texture_descriptors_buffer.data_pointer = (device_ptr)&texture_descriptors[0]; - memory_manager.alloc("texture_descriptors", texture_descriptors_buffer); + /* Allocate this right away so that texture_info_buffer is placed at offset 0 in the device memory buffers */ + texture_info.resize(1); + texture_info_buffer.resize(1); + texture_info_buffer.data_pointer = (device_ptr)&texture_info[0]; + memory_manager.alloc("texture_info", texture_info_buffer); fprintf(stderr, "Device init success\n"); device_initialized = true; @@ -625,7 +625,7 @@ void OpenCLDeviceBase::flush_texture_buffers() vector texture_slots; -#define KERNEL_TEX(type, ttype, name) \ +#define KERNEL_TEX(type, name) \ if(textures.find(#name) != textures.end()) { \ texture_slots.push_back(texture_slot_t(#name, num_slots)); \ } \ @@ -647,55 +647,38 @@ void OpenCLDeviceBase::flush_texture_buffers() } /* Realloc texture descriptors buffer. */ - memory_manager.free(texture_descriptors_buffer); + memory_manager.free(texture_info_buffer); - texture_descriptors.resize(num_slots); - texture_descriptors_buffer.resize(num_slots * sizeof(tex_info_t)); - texture_descriptors_buffer.data_pointer = (device_ptr)&texture_descriptors[0]; + texture_info.resize(num_slots); + texture_info_buffer.resize(num_slots * sizeof(TextureInfo)); + texture_info_buffer.data_pointer = (device_ptr)&texture_info[0]; - memory_manager.alloc("texture_descriptors", texture_descriptors_buffer); + memory_manager.alloc("texture_info", texture_info_buffer); /* Fill in descriptors */ foreach(texture_slot_t& slot, texture_slots) { Texture& tex = textures[slot.name]; - tex_info_t& info = texture_descriptors[slot.slot]; + TextureInfo& info = texture_info[slot.slot]; MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name); - info.offset = desc.offset; - info.buffer = desc.device_buffer; + info.data = desc.offset; + info.cl_buffer = desc.device_buffer; if(string_startswith(slot.name, "__tex_image")) { info.width = tex.mem->data_width; info.height = tex.mem->data_height; info.depth = tex.mem->data_depth; - info.options = 0; - - if(tex.interpolation == INTERPOLATION_CLOSEST) { - info.options |= (1 << 0); - } - - switch(tex.extension) { - case EXTENSION_REPEAT: - info.options |= (1 << 1); - break; - case EXTENSION_EXTEND: - info.options |= (1 << 2); - break; - case EXTENSION_CLIP: - info.options |= (1 << 3); - break; - default: - break; - } + info.interpolation = tex.interpolation; + info.extension = tex.extension; } } /* Force write of descriptors. */ - memory_manager.free(texture_descriptors_buffer); - memory_manager.alloc("texture_descriptors", texture_descriptors_buffer); + memory_manager.free(texture_info_buffer); + memory_manager.alloc("texture_info", texture_info_buffer); } void OpenCLDeviceBase::film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half) diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp index 976cc9df46d..b4e9419ebbd 100644 --- a/intern/cycles/device/opencl/opencl_split.cpp +++ b/intern/cycles/device/opencl/opencl_split.cpp @@ -117,14 +117,8 @@ public: ccl_constant KernelData *data; ccl_global char *buffers[8]; - typedef struct _tex_info_t { - uint buffer, padding; - uint64_t offset; - uint width, height, depth, options; - } _tex_info_t; - -#define KERNEL_TEX(type, ttype, name) \ - _tex_info_t name; +#define KERNEL_TEX(type, name) \ + TextureInfo name; #include "kernel/kernel_textures.h" #undef KERNEL_TEX diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index b10dd05cb9b..1b7a657214a 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -83,7 +83,6 @@ set(SRC_HEADERS kernel_emission.h kernel_film.h kernel_globals.h - kernel_image_opencl.h kernel_jitter.h kernel_light.h kernel_math.h @@ -119,10 +118,12 @@ set(SRC_KERNELS_CPU_HEADERS set(SRC_KERNELS_CUDA_HEADERS kernels/cuda/kernel_config.h + kernels/cuda/kernel_cuda_image.h ) set(SRC_KERNELS_OPENCL_HEADERS kernels/opencl/kernel_split_function.h + kernels/opencl/kernel_opencl_image.h ) set(SRC_CLOSURE_HEADERS @@ -507,6 +508,7 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteratio delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_subsurface.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_buffer_update.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_split_function.h" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_opencl_image.h" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/filter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h index 698cd6b03fd..b19c488ef8a 100644 --- a/intern/cycles/kernel/geom/geom_volume.h +++ b/intern/cycles/kernel/geom/geom_volume.h @@ -29,21 +29,6 @@ CCL_NAMESPACE_BEGIN /* Return position normalized to 0..1 in mesh bounds */ -#if defined(__KERNEL_CUDA__) && __CUDA_ARCH__ < 300 -ccl_device float4 volume_image_texture_3d(int id, float x, float y, float z) -{ - float4 r; - switch(id) { - case 0: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_000, x, y, z); break; - case 8: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_008, x, y, z); break; - case 16: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_016, x, y, z); break; - case 24: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_024, x, y, z); break; - case 32: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_032, x, y, z); break; - } - return r; -} -#endif /* __KERNEL_CUDA__ */ - ccl_device_inline float3 volume_normalized_position(KernelGlobals *kg, const ShaderData *sd, float3 P) @@ -65,22 +50,14 @@ ccl_device_inline float3 volume_normalized_position(KernelGlobals *kg, ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy) { float3 P = volume_normalized_position(kg, sd, sd->P); -#ifdef __KERNEL_CUDA__ -# if __CUDA_ARCH__ >= 300 - CUtexObject tex = kernel_tex_fetch(__bindless_mapping, desc.offset); - float f = kernel_tex_image_interp_3d_float(tex, P.x, P.y, P.z); - float4 r = make_float4(f, f, f, 1.0f); -# else - float4 r = volume_image_texture_3d(desc.offset, P.x, P.y, P.z); -# endif -#elif defined(__KERNEL_OPENCL__) +#ifdef __KERNEL_GPU__ float4 r = kernel_tex_image_interp_3d(kg, desc.offset, P.x, P.y, P.z); #else float4 r; if(sd->flag & SD_VOLUME_CUBIC) - r = kernel_tex_image_interp_3d_ex(desc.offset, P.x, P.y, P.z, INTERPOLATION_CUBIC); + r = kernel_tex_image_interp_3d_ex(kg, desc.offset, P.x, P.y, P.z, INTERPOLATION_CUBIC); else - r = kernel_tex_image_interp_3d(desc.offset, P.x, P.y, P.z); + r = kernel_tex_image_interp_3d(kg, desc.offset, P.x, P.y, P.z); #endif if(dx) *dx = 0.0f; @@ -92,21 +69,14 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd, ccl_device float3 volume_attribute_float3(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float3 *dx, float3 *dy) { float3 P = volume_normalized_position(kg, sd, sd->P); -#ifdef __KERNEL_CUDA__ -# if __CUDA_ARCH__ >= 300 - CUtexObject tex = kernel_tex_fetch(__bindless_mapping, desc.offset); - float4 r = kernel_tex_image_interp_3d_float4(tex, P.x, P.y, P.z); -# else - float4 r = volume_image_texture_3d(desc.offset, P.x, P.y, P.z); -# endif -#elif defined(__KERNEL_OPENCL__) +#ifdef __KERNEL_GPU__ float4 r = kernel_tex_image_interp_3d(kg, desc.offset, P.x, P.y, P.z); #else float4 r; if(sd->flag & SD_VOLUME_CUBIC) - r = kernel_tex_image_interp_3d_ex(desc.offset, P.x, P.y, P.z, INTERPOLATION_CUBIC); + r = kernel_tex_image_interp_3d_ex(kg, desc.offset, P.x, P.y, P.z, INTERPOLATION_CUBIC); else - r = kernel_tex_image_interp_3d(desc.offset, P.x, P.y, P.z); + r = kernel_tex_image_interp_3d(kg, desc.offset, P.x, P.y, P.z); #endif if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f); diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h index 93934ee6b38..6f63c8f77a2 100644 --- a/intern/cycles/kernel/kernel_compat_cpu.h +++ b/intern/cycles/kernel/kernel_compat_cpu.h @@ -74,7 +74,7 @@ CCL_NAMESPACE_BEGIN * pointer lookup. */ template struct texture { - ccl_always_inline T fetch(int index) + ccl_always_inline const T& fetch(int index) { kernel_assert(index >= 0 && index < width); return data[index]; @@ -112,449 +112,6 @@ template struct texture { int width; }; -template struct texture_image { -#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \ - { \ - u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \ - u[1] = (( 0.5f * t - 1.0f) * t ) * t + (2.0f/3.0f); \ - u[2] = (( -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \ - u[3] = (1.0f / 6.0f) * t * t * t; \ - } (void)0 - - ccl_always_inline float4 read(float4 r) - { - return r; - } - - ccl_always_inline float4 read(uchar4 r) - { - float f = 1.0f/255.0f; - return make_float4(r.x*f, r.y*f, r.z*f, r.w*f); - } - - ccl_always_inline float4 read(uchar r) - { - float f = r*(1.0f/255.0f); - return make_float4(f, f, f, 1.0f); - } - - ccl_always_inline float4 read(float r) - { - /* TODO(dingto): Optimize this, so interpolation - * happens on float instead of float4 */ - return make_float4(r, r, r, 1.0f); - } - - ccl_always_inline float4 read(half4 r) - { - return half4_to_float4(r); - } - - ccl_always_inline float4 read(half r) - { - float f = half_to_float(r); - return make_float4(f, f, f, 1.0f); - } - - ccl_always_inline int wrap_periodic(int x, int width) - { - x %= width; - if(x < 0) - x += width; - return x; - } - - ccl_always_inline int wrap_clamp(int x, int width) - { - return clamp(x, 0, width-1); - } - - ccl_always_inline float frac(float x, int *ix) - { - int i = float_to_int(x) - ((x < 0.0f)? 1: 0); - *ix = i; - return x - (float)i; - } - - ccl_always_inline float4 interp(float x, float y) - { - if(UNLIKELY(!data)) - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - - int ix, iy, nix, niy; - - if(interpolation == INTERPOLATION_CLOSEST) { - frac(x*(float)width, &ix); - frac(y*(float)height, &iy); - switch(extension) { - case EXTENSION_REPEAT: - ix = wrap_periodic(ix, width); - iy = wrap_periodic(iy, height); - break; - case EXTENSION_CLIP: - if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - ATTR_FALLTHROUGH; - case EXTENSION_EXTEND: - ix = wrap_clamp(ix, width); - iy = wrap_clamp(iy, height); - break; - default: - kernel_assert(0); - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - return read(data[ix + iy*width]); - } - else if(interpolation == INTERPOLATION_LINEAR) { - float tx = frac(x*(float)width - 0.5f, &ix); - float ty = frac(y*(float)height - 0.5f, &iy); - - switch(extension) { - case EXTENSION_REPEAT: - ix = wrap_periodic(ix, width); - iy = wrap_periodic(iy, height); - - nix = wrap_periodic(ix+1, width); - niy = wrap_periodic(iy+1, height); - break; - case EXTENSION_CLIP: - if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - ATTR_FALLTHROUGH; - case EXTENSION_EXTEND: - nix = wrap_clamp(ix+1, width); - niy = wrap_clamp(iy+1, height); - - ix = wrap_clamp(ix, width); - iy = wrap_clamp(iy, height); - break; - default: - kernel_assert(0); - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - - float4 r = (1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width]); - r += (1.0f - ty)*tx*read(data[nix + iy*width]); - r += ty*(1.0f - tx)*read(data[ix + niy*width]); - r += ty*tx*read(data[nix + niy*width]); - - return r; - } - else { - /* Bicubic b-spline interpolation. */ - float tx = frac(x*(float)width - 0.5f, &ix); - float ty = frac(y*(float)height - 0.5f, &iy); - int pix, piy, nnix, nniy; - switch(extension) { - case EXTENSION_REPEAT: - ix = wrap_periodic(ix, width); - iy = wrap_periodic(iy, height); - - pix = wrap_periodic(ix-1, width); - piy = wrap_periodic(iy-1, height); - - nix = wrap_periodic(ix+1, width); - niy = wrap_periodic(iy+1, height); - - nnix = wrap_periodic(ix+2, width); - nniy = wrap_periodic(iy+2, height); - break; - case EXTENSION_CLIP: - if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - ATTR_FALLTHROUGH; - case EXTENSION_EXTEND: - pix = wrap_clamp(ix-1, width); - piy = wrap_clamp(iy-1, height); - - nix = wrap_clamp(ix+1, width); - niy = wrap_clamp(iy+1, height); - - nnix = wrap_clamp(ix+2, width); - nniy = wrap_clamp(iy+2, height); - - ix = wrap_clamp(ix, width); - iy = wrap_clamp(iy, height); - break; - default: - kernel_assert(0); - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - - const int xc[4] = {pix, ix, nix, nnix}; - const int yc[4] = {width * piy, - width * iy, - width * niy, - width * nniy}; - float u[4], v[4]; - /* Some helper macro to keep code reasonable size, - * let compiler to inline all the matrix multiplications. - */ -#define DATA(x, y) (read(data[xc[x] + yc[y]])) -#define TERM(col) \ - (v[col] * (u[0] * DATA(0, col) + \ - u[1] * DATA(1, col) + \ - u[2] * DATA(2, col) + \ - u[3] * DATA(3, col))) - - SET_CUBIC_SPLINE_WEIGHTS(u, tx); - SET_CUBIC_SPLINE_WEIGHTS(v, ty); - - /* Actual interpolation. */ - return TERM(0) + TERM(1) + TERM(2) + TERM(3); - -#undef TERM -#undef DATA - } - } - - ccl_always_inline float4 interp_3d(float x, float y, float z) - { - return interp_3d_ex(x, y, z, interpolation); - } - - ccl_always_inline float4 interp_3d_ex_closest(float x, float y, float z) - { - int ix, iy, iz; - frac(x*(float)width, &ix); - frac(y*(float)height, &iy); - frac(z*(float)depth, &iz); - - switch(extension) { - case EXTENSION_REPEAT: - ix = wrap_periodic(ix, width); - iy = wrap_periodic(iy, height); - iz = wrap_periodic(iz, depth); - break; - case EXTENSION_CLIP: - if(x < 0.0f || y < 0.0f || z < 0.0f || - x > 1.0f || y > 1.0f || z > 1.0f) - { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - ATTR_FALLTHROUGH; - case EXTENSION_EXTEND: - ix = wrap_clamp(ix, width); - iy = wrap_clamp(iy, height); - iz = wrap_clamp(iz, depth); - break; - default: - kernel_assert(0); - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - - return read(data[ix + iy*width + iz*width*height]); - } - - ccl_always_inline float4 interp_3d_ex_linear(float x, float y, float z) - { - int ix, iy, iz; - int nix, niy, niz; - - float tx = frac(x*(float)width - 0.5f, &ix); - float ty = frac(y*(float)height - 0.5f, &iy); - float tz = frac(z*(float)depth - 0.5f, &iz); - - switch(extension) { - case EXTENSION_REPEAT: - ix = wrap_periodic(ix, width); - iy = wrap_periodic(iy, height); - iz = wrap_periodic(iz, depth); - - nix = wrap_periodic(ix+1, width); - niy = wrap_periodic(iy+1, height); - niz = wrap_periodic(iz+1, depth); - break; - case EXTENSION_CLIP: - if(x < 0.0f || y < 0.0f || z < 0.0f || - x > 1.0f || y > 1.0f || z > 1.0f) - { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - ATTR_FALLTHROUGH; - case EXTENSION_EXTEND: - nix = wrap_clamp(ix+1, width); - niy = wrap_clamp(iy+1, height); - niz = wrap_clamp(iz+1, depth); - - ix = wrap_clamp(ix, width); - iy = wrap_clamp(iy, height); - iz = wrap_clamp(iz, depth); - break; - default: - kernel_assert(0); - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - - float4 r; - - r = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + iz*width*height]); - r += (1.0f - tz)*(1.0f - ty)*tx*read(data[nix + iy*width + iz*width*height]); - r += (1.0f - tz)*ty*(1.0f - tx)*read(data[ix + niy*width + iz*width*height]); - r += (1.0f - tz)*ty*tx*read(data[nix + niy*width + iz*width*height]); - - r += tz*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + niz*width*height]); - r += tz*(1.0f - ty)*tx*read(data[nix + iy*width + niz*width*height]); - r += tz*ty*(1.0f - tx)*read(data[ix + niy*width + niz*width*height]); - r += tz*ty*tx*read(data[nix + niy*width + niz*width*height]); - - return r; - } - - /* TODO(sergey): For some unspeakable reason both GCC-6 and Clang-3.9 are - * causing stack overflow issue in this function unless it is inlined. - * - * Only happens for AVX2 kernel and global __KERNEL_SSE__ vectorization - * enabled. - */ -#ifdef __GNUC__ - ccl_always_inline -#else - ccl_never_inline -#endif - float4 interp_3d_ex_tricubic(float x, float y, float z) - { - int ix, iy, iz; - int nix, niy, niz; - /* Tricubic b-spline interpolation. */ - const float tx = frac(x*(float)width - 0.5f, &ix); - const float ty = frac(y*(float)height - 0.5f, &iy); - const float tz = frac(z*(float)depth - 0.5f, &iz); - int pix, piy, piz, nnix, nniy, nniz; - - switch(extension) { - case EXTENSION_REPEAT: - ix = wrap_periodic(ix, width); - iy = wrap_periodic(iy, height); - iz = wrap_periodic(iz, depth); - - pix = wrap_periodic(ix-1, width); - piy = wrap_periodic(iy-1, height); - piz = wrap_periodic(iz-1, depth); - - nix = wrap_periodic(ix+1, width); - niy = wrap_periodic(iy+1, height); - niz = wrap_periodic(iz+1, depth); - - nnix = wrap_periodic(ix+2, width); - nniy = wrap_periodic(iy+2, height); - nniz = wrap_periodic(iz+2, depth); - break; - case EXTENSION_CLIP: - if(x < 0.0f || y < 0.0f || z < 0.0f || - x > 1.0f || y > 1.0f || z > 1.0f) - { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - ATTR_FALLTHROUGH; - case EXTENSION_EXTEND: - pix = wrap_clamp(ix-1, width); - piy = wrap_clamp(iy-1, height); - piz = wrap_clamp(iz-1, depth); - - nix = wrap_clamp(ix+1, width); - niy = wrap_clamp(iy+1, height); - niz = wrap_clamp(iz+1, depth); - - nnix = wrap_clamp(ix+2, width); - nniy = wrap_clamp(iy+2, height); - nniz = wrap_clamp(iz+2, depth); - - ix = wrap_clamp(ix, width); - iy = wrap_clamp(iy, height); - iz = wrap_clamp(iz, depth); - break; - default: - kernel_assert(0); - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - - const int xc[4] = {pix, ix, nix, nnix}; - const int yc[4] = {width * piy, - width * iy, - width * niy, - width * nniy}; - const int zc[4] = {width * height * piz, - width * height * iz, - width * height * niz, - width * height * nniz}; - float u[4], v[4], w[4]; - - /* Some helper macro to keep code reasonable size, - * let compiler to inline all the matrix multiplications. - */ -#define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]])) -#define COL_TERM(col, row) \ - (v[col] * (u[0] * DATA(0, col, row) + \ - u[1] * DATA(1, col, row) + \ - u[2] * DATA(2, col, row) + \ - u[3] * DATA(3, col, row))) -#define ROW_TERM(row) \ - (w[row] * (COL_TERM(0, row) + \ - COL_TERM(1, row) + \ - COL_TERM(2, row) + \ - COL_TERM(3, row))) - - SET_CUBIC_SPLINE_WEIGHTS(u, tx); - SET_CUBIC_SPLINE_WEIGHTS(v, ty); - SET_CUBIC_SPLINE_WEIGHTS(w, tz); - - /* Actual interpolation. */ - return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3); - -#undef COL_TERM -#undef ROW_TERM -#undef DATA - } - - ccl_always_inline float4 interp_3d_ex(float x, float y, float z, - int interpolation = INTERPOLATION_LINEAR) - { - if(UNLIKELY(!data)) - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - - switch(interpolation) { - case INTERPOLATION_CLOSEST: - return interp_3d_ex_closest(x, y, z); - case INTERPOLATION_LINEAR: - return interp_3d_ex_linear(x, y, z); - default: - return interp_3d_ex_tricubic(x, y, z); - } - } - - ccl_always_inline void dimensions_set(int width_, int height_, int depth_) - { - width = width_; - height = height_; - depth = depth_; - } - - T *data; - int interpolation; - ExtensionType extension; - int width, height, depth; -#undef SET_CUBIC_SPLINE_WEIGHTS -}; - -typedef texture texture_float4; -typedef texture texture_float2; -typedef texture texture_float; -typedef texture texture_uint; -typedef texture texture_int; -typedef texture texture_uint4; -typedef texture texture_uchar4; -typedef texture texture_uchar; -typedef texture_image texture_image_float; -typedef texture_image texture_image_uchar; -typedef texture_image texture_image_half; -typedef texture_image texture_image_float4; -typedef texture_image texture_image_uchar4; -typedef texture_image texture_image_half4; - /* Macros to handle different memory storage on different devices */ #define kernel_tex_fetch(tex, index) (kg->tex.fetch(index)) @@ -563,10 +120,6 @@ typedef texture_image texture_image_half4; #define kernel_tex_fetch_ssei(tex, index) (kg->tex.fetch_ssei(index)) #define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size)) -#define kernel_tex_image_interp(tex,x,y) kernel_tex_image_interp_impl(kg,tex,x,y) -#define kernel_tex_image_interp_3d(tex, x, y, z) kernel_tex_image_interp_3d_impl(kg,tex,x,y,z) -#define kernel_tex_image_interp_3d_ex(tex, x, y, z, interpolation) kernel_tex_image_interp_3d_ex_impl(kg,tex, x, y, z, interpolation) - #define kernel_data (kg->__data) #ifdef __KERNEL_SSE2__ diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index 2e8ca48c413..fa512f80e41 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -126,42 +126,16 @@ ccl_device_inline uint ccl_num_groups(uint d) /* Textures */ -typedef texture texture_float4; -typedef texture texture_float2; -typedef texture texture_float; -typedef texture texture_uint; -typedef texture texture_int; -typedef texture texture_uint4; -typedef texture texture_uchar; -typedef texture texture_uchar4; +/* Use arrays for regular data. This is a little slower than textures on Fermi, + * but allows for cleaner code and we will stop supporting Fermi soon. */ +#define kernel_tex_fetch(t, index) t[(index)] + +/* On Kepler (6xx) and above, we use Bindless Textures for images. + * On Fermi cards (4xx and 5xx), we have to use regular textures. */ +#if __CUDA_ARCH__ < 300 typedef texture texture_image_float4; typedef texture texture_image3d_float4; typedef texture texture_image_uchar4; - -/* Macros to handle different memory storage on different devices */ - -/* On Fermi cards (4xx and 5xx), we use regular textures for both data and images. - * On Kepler (6xx) and above, we use Bindless Textures for images and arrays for data. - * - * Arrays are necessary in order to use the full VRAM on newer cards, and it's slightly faster. - * Using Arrays on Fermi turned out to be slower.*/ - -/* Fermi */ -#if __CUDA_ARCH__ < 300 -# define __KERNEL_CUDA_TEX_STORAGE__ -# define kernel_tex_fetch(t, index) tex1Dfetch(t, index) - -# define kernel_tex_image_interp(t, x, y) tex2D(t, x, y) -# define kernel_tex_image_interp_3d(t, x, y, z) tex3D(t, x, y, z) - -/* Kepler */ -#else -# define kernel_tex_fetch(t, index) t[(index)] - -# define kernel_tex_image_interp_float4(t, x, y) tex2D(t, x, y) -# define kernel_tex_image_interp_float(t, x, y) tex2D(t, x, y) -# define kernel_tex_image_interp_3d_float4(t, x, y, z) tex3D(t, x, y, z) -# define kernel_tex_image_interp_3d_float(t, x, y, z) tex3D(t, x, y, z) #endif #define kernel_data __data diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h index 7f81523791b..b02e3bc576d 100644 --- a/intern/cycles/kernel/kernel_compat_opencl.h +++ b/intern/cycles/kernel/kernel_compat_opencl.h @@ -144,7 +144,7 @@ /* data lookup defines */ #define kernel_data (*kg->data) -#define kernel_tex_fetch(tex, index) ((ccl_global tex##_t*)(kg->buffers[kg->tex.buffer] + kg->tex.offset))[(index)] +#define kernel_tex_fetch(tex, index) ((const ccl_global tex##_t*)(kg->buffers[kg->tex.cl_buffer] + kg->tex.data))[(index)] /* define NULL */ #define NULL 0 diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h index 9d55183d94b..97d4726407b 100644 --- a/intern/cycles/kernel/kernel_globals.h +++ b/intern/cycles/kernel/kernel_globals.h @@ -46,14 +46,7 @@ struct Intersection; struct VolumeStep; typedef struct KernelGlobals { - vector texture_float4_images; - vector texture_byte4_images; - vector texture_half4_images; - vector texture_float_images; - vector texture_byte_images; - vector texture_half_images; - -# define KERNEL_TEX(type, ttype, name) ttype name; +# define KERNEL_TEX(type, name) texture name; # define KERNEL_IMAGE_TEX(type, ttype, name) # include "kernel/kernel_textures.h" @@ -99,11 +92,7 @@ typedef struct KernelGlobals { Intersection hits_stack[64]; } KernelGlobals; -# ifdef __KERNEL_CUDA_TEX_STORAGE__ -# define KERNEL_TEX(type, ttype, name) ttype name; -# else -# define KERNEL_TEX(type, ttype, name) const __constant__ __device__ type *name; -# endif +# define KERNEL_TEX(type, name) const __constant__ __device__ type *name; # define KERNEL_IMAGE_TEX(type, ttype, name) ttype name; # include "kernel/kernel_textures.h" @@ -113,22 +102,16 @@ typedef struct KernelGlobals { #ifdef __KERNEL_OPENCL__ -# define KERNEL_TEX(type, ttype, name) \ +# define KERNEL_TEX(type, name) \ typedef type name##_t; # include "kernel/kernel_textures.h" -typedef struct tex_info_t { - uint buffer, padding; - uint64_t offset; - uint width, height, depth, options; -} tex_info_t; - typedef ccl_addr_space struct KernelGlobals { ccl_constant KernelData *data; ccl_global char *buffers[8]; -# define KERNEL_TEX(type, ttype, name) \ - tex_info_t name; +# define KERNEL_TEX(type, name) \ + TextureInfo name; # include "kernel/kernel_textures.h" # ifdef __SPLIT_KERNEL__ @@ -176,9 +159,9 @@ ccl_device_inline void kernel_set_buffer_info(KernelGlobals *kg) if(ccl_local_id(0) + ccl_local_id(1) == 0) # endif { - ccl_global tex_info_t *info = (ccl_global tex_info_t*)kg->buffers[0]; + ccl_global TextureInfo *info = (ccl_global TextureInfo*)kg->buffers[0]; -# define KERNEL_TEX(type, ttype, name) \ +# define KERNEL_TEX(type, name) \ kg->name = *(info++); # include "kernel/kernel_textures.h" } diff --git a/intern/cycles/kernel/kernel_image_opencl.h b/intern/cycles/kernel/kernel_image_opencl.h deleted file mode 100644 index 9e3373432ec..00000000000 --- a/intern/cycles/kernel/kernel_image_opencl.h +++ /dev/null @@ -1,252 +0,0 @@ -/* - * Copyright 2016 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/* For OpenCL we do manual lookup and interpolation. */ - -ccl_device_inline ccl_global tex_info_t* kernel_tex_info(KernelGlobals *kg, uint id) { - const uint tex_offset = id -#define KERNEL_TEX(type, ttype, name) + 1 -#include "kernel/kernel_textures.h" - ; - - return &((ccl_global tex_info_t*)kg->buffers[0])[tex_offset]; -} - -#define tex_fetch(type, info, index) ((ccl_global type*)(kg->buffers[info->buffer] + info->offset))[(index)] - -ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int id, int offset) -{ - const ccl_global tex_info_t *info = kernel_tex_info(kg, id); - const int texture_type = kernel_tex_type(id); - - /* Float4 */ - if(texture_type == IMAGE_DATA_TYPE_FLOAT4) { - return tex_fetch(float4, info, offset); - } - /* Byte4 */ - else if(texture_type == IMAGE_DATA_TYPE_BYTE4) { - uchar4 r = tex_fetch(uchar4, info, offset); - float f = 1.0f/255.0f; - return make_float4(r.x*f, r.y*f, r.z*f, r.w*f); - } - /* Float */ - else if(texture_type == IMAGE_DATA_TYPE_FLOAT) { - float f = tex_fetch(float, info, offset); - return make_float4(f, f, f, 1.0f); - } - /* Byte */ - else { - uchar r = tex_fetch(uchar, info, offset); - float f = r * (1.0f/255.0f); - return make_float4(f, f, f, 1.0f); - } -} - -ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width) -{ - x %= width; - if(x < 0) - x += width; - return x; -} - -ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width) -{ - return clamp(x, 0, width-1); -} - -ccl_device_inline float svm_image_texture_frac(float x, int *ix) -{ - int i = float_to_int(x) - ((x < 0.0f)? 1: 0); - *ix = i; - return x - (float)i; -} - -ccl_device_inline uint kernel_decode_image_interpolation(uint info) -{ - return (info & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR; -} - -ccl_device_inline uint kernel_decode_image_extension(uint info) -{ - if(info & (1 << 1)) { - return EXTENSION_REPEAT; - } - else if(info & (1 << 2)) { - return EXTENSION_EXTEND; - } - else { - return EXTENSION_CLIP; - } -} - -ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y) -{ - const ccl_global tex_info_t *info = kernel_tex_info(kg, id); - - uint width = info->width; - uint height = info->height; - uint offset = 0; - - /* Decode image options. */ - uint interpolation = kernel_decode_image_interpolation(info->options); - uint extension = kernel_decode_image_extension(info->options); - - /* Actual sampling. */ - float4 r; - int ix, iy, nix, niy; - if(interpolation == INTERPOLATION_CLOSEST) { - svm_image_texture_frac(x*width, &ix); - svm_image_texture_frac(y*height, &iy); - - if(extension == EXTENSION_REPEAT) { - ix = svm_image_texture_wrap_periodic(ix, width); - iy = svm_image_texture_wrap_periodic(iy, height); - } - else { - if(extension == EXTENSION_CLIP) { - if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - } - /* Fall through. */ - /* EXTENSION_EXTEND */ - ix = svm_image_texture_wrap_clamp(ix, width); - iy = svm_image_texture_wrap_clamp(iy, height); - } - - r = svm_image_texture_read(kg, id, offset + ix + iy*width); - } - else { /* INTERPOLATION_LINEAR */ - float tx = svm_image_texture_frac(x*width - 0.5f, &ix); - float ty = svm_image_texture_frac(y*height - 0.5f, &iy); - - if(extension == EXTENSION_REPEAT) { - ix = svm_image_texture_wrap_periodic(ix, width); - iy = svm_image_texture_wrap_periodic(iy, height); - - nix = svm_image_texture_wrap_periodic(ix+1, width); - niy = svm_image_texture_wrap_periodic(iy+1, height); - } - else { - if(extension == EXTENSION_CLIP) { - if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - } - nix = svm_image_texture_wrap_clamp(ix+1, width); - niy = svm_image_texture_wrap_clamp(iy+1, height); - ix = svm_image_texture_wrap_clamp(ix, width); - iy = svm_image_texture_wrap_clamp(iy, height); - } - - r = (1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + iy*width); - r += (1.0f - ty)*tx*svm_image_texture_read(kg, id, offset + nix + iy*width); - r += ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width); - r += ty*tx*svm_image_texture_read(kg, id, offset + nix + niy*width); - } - return r; -} - - -ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z) -{ - const ccl_global tex_info_t *info = kernel_tex_info(kg, id); - - uint width = info->width; - uint height = info->height; - uint offset = 0; - uint depth = info->depth; - - /* Decode image options. */ - uint interpolation = kernel_decode_image_interpolation(info->options); - uint extension = kernel_decode_image_extension(info->options); - - /* Actual sampling. */ - float4 r; - int ix, iy, iz, nix, niy, niz; - if(interpolation == INTERPOLATION_CLOSEST) { - svm_image_texture_frac(x*width, &ix); - svm_image_texture_frac(y*height, &iy); - svm_image_texture_frac(z*depth, &iz); - - if(extension == EXTENSION_REPEAT) { - ix = svm_image_texture_wrap_periodic(ix, width); - iy = svm_image_texture_wrap_periodic(iy, height); - iz = svm_image_texture_wrap_periodic(iz, depth); - } - else { - if(extension == EXTENSION_CLIP) { - if(x < 0.0f || y < 0.0f || z < 0.0f || - x > 1.0f || y > 1.0f || z > 1.0f) - { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - } - /* Fall through. */ - /* EXTENSION_EXTEND */ - ix = svm_image_texture_wrap_clamp(ix, width); - iy = svm_image_texture_wrap_clamp(iy, height); - iz = svm_image_texture_wrap_clamp(iz, depth); - } - r = svm_image_texture_read(kg, id, offset + ix + iy*width + iz*width*height); - } - else { /* INTERPOLATION_LINEAR */ - float tx = svm_image_texture_frac(x*(float)width - 0.5f, &ix); - float ty = svm_image_texture_frac(y*(float)height - 0.5f, &iy); - float tz = svm_image_texture_frac(z*(float)depth - 0.5f, &iz); - - if(extension == EXTENSION_REPEAT) { - ix = svm_image_texture_wrap_periodic(ix, width); - iy = svm_image_texture_wrap_periodic(iy, height); - iz = svm_image_texture_wrap_periodic(iz, depth); - - nix = svm_image_texture_wrap_periodic(ix+1, width); - niy = svm_image_texture_wrap_periodic(iy+1, height); - niz = svm_image_texture_wrap_periodic(iz+1, depth); - } - else { - if(extension == EXTENSION_CLIP) { - if(x < 0.0f || y < 0.0f || z < 0.0f || - x > 1.0f || y > 1.0f || z > 1.0f) - { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - } - /* Fall through. */ - /* EXTENSION_EXTEND */ - nix = svm_image_texture_wrap_clamp(ix+1, width); - niy = svm_image_texture_wrap_clamp(iy+1, height); - niz = svm_image_texture_wrap_clamp(iz+1, depth); - - ix = svm_image_texture_wrap_clamp(ix, width); - iy = svm_image_texture_wrap_clamp(iy, height); - iz = svm_image_texture_wrap_clamp(iz, depth); - } - - r = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + iy*width + iz*width*height); - r += (1.0f - tz)*(1.0f - ty)*tx*svm_image_texture_read(kg, id, offset + nix + iy*width + iz*width*height); - r += (1.0f - tz)*ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width + iz*width*height); - r += (1.0f - tz)*ty*tx*svm_image_texture_read(kg, id, offset + nix + niy*width + iz*width*height); - - r += tz*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + iy*width + niz*width*height); - r += tz*(1.0f - ty)*tx*svm_image_texture_read(kg, id, offset + nix + iy*width + niz*width*height); - r += tz*ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width + niz*width*height); - r += tz*ty*tx*svm_image_texture_read(kg, id, offset + nix + niy*width + niz*width*height); - } - return r; -} diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h index 5eab28a2953..344b2223573 100644 --- a/intern/cycles/kernel/kernel_textures.h +++ b/intern/cycles/kernel/kernel_textures.h @@ -15,7 +15,7 @@ */ #ifndef KERNEL_TEX -# define KERNEL_TEX(type, ttype, name) +# define KERNEL_TEX(type, name) #endif #ifndef KERNEL_IMAGE_TEX @@ -23,63 +23,65 @@ #endif /* bvh */ -KERNEL_TEX(float4, texture_float4, __bvh_nodes) -KERNEL_TEX(float4, texture_float4, __bvh_leaf_nodes) -KERNEL_TEX(float4, texture_float4, __prim_tri_verts) -KERNEL_TEX(uint, texture_uint, __prim_tri_index) -KERNEL_TEX(uint, texture_uint, __prim_type) -KERNEL_TEX(uint, texture_uint, __prim_visibility) -KERNEL_TEX(uint, texture_uint, __prim_index) -KERNEL_TEX(uint, texture_uint, __prim_object) -KERNEL_TEX(uint, texture_uint, __object_node) -KERNEL_TEX(float2, texture_float2, __prim_time) +KERNEL_TEX(float4, __bvh_nodes) +KERNEL_TEX(float4, __bvh_leaf_nodes) +KERNEL_TEX(float4, __prim_tri_verts) +KERNEL_TEX(uint, __prim_tri_index) +KERNEL_TEX(uint, __prim_type) +KERNEL_TEX(uint, __prim_visibility) +KERNEL_TEX(uint, __prim_index) +KERNEL_TEX(uint, __prim_object) +KERNEL_TEX(uint, __object_node) +KERNEL_TEX(float2, __prim_time) /* objects */ -KERNEL_TEX(float4, texture_float4, __objects) -KERNEL_TEX(float4, texture_float4, __objects_vector) +KERNEL_TEX(float4, __objects) +KERNEL_TEX(float4, __objects_vector) /* triangles */ -KERNEL_TEX(uint, texture_uint, __tri_shader) -KERNEL_TEX(float4, texture_float4, __tri_vnormal) -KERNEL_TEX(uint4, texture_uint4, __tri_vindex) -KERNEL_TEX(uint, texture_uint, __tri_patch) -KERNEL_TEX(float2, texture_float2, __tri_patch_uv) +KERNEL_TEX(uint, __tri_shader) +KERNEL_TEX(float4, __tri_vnormal) +KERNEL_TEX(uint4, __tri_vindex) +KERNEL_TEX(uint, __tri_patch) +KERNEL_TEX(float2, __tri_patch_uv) /* curves */ -KERNEL_TEX(float4, texture_float4, __curves) -KERNEL_TEX(float4, texture_float4, __curve_keys) +KERNEL_TEX(float4, __curves) +KERNEL_TEX(float4, __curve_keys) /* patches */ -KERNEL_TEX(uint, texture_uint, __patches) +KERNEL_TEX(uint, __patches) /* attributes */ -KERNEL_TEX(uint4, texture_uint4, __attributes_map) -KERNEL_TEX(float, texture_float, __attributes_float) -KERNEL_TEX(float4, texture_float4, __attributes_float3) -KERNEL_TEX(uchar4, texture_uchar4, __attributes_uchar4) +KERNEL_TEX(uint4, __attributes_map) +KERNEL_TEX(float, __attributes_float) +KERNEL_TEX(float4, __attributes_float3) +KERNEL_TEX(uchar4, __attributes_uchar4) /* lights */ -KERNEL_TEX(float4, texture_float4, __light_distribution) -KERNEL_TEX(float4, texture_float4, __light_data) -KERNEL_TEX(float2, texture_float2, __light_background_marginal_cdf) -KERNEL_TEX(float2, texture_float2, __light_background_conditional_cdf) +KERNEL_TEX(float4, __light_distribution) +KERNEL_TEX(float4, __light_data) +KERNEL_TEX(float2, __light_background_marginal_cdf) +KERNEL_TEX(float2, __light_background_conditional_cdf) /* particles */ -KERNEL_TEX(float4, texture_float4, __particles) +KERNEL_TEX(float4, __particles) /* shaders */ -KERNEL_TEX(uint4, texture_uint4, __svm_nodes) -KERNEL_TEX(uint, texture_uint, __shader_flag) -KERNEL_TEX(uint, texture_uint, __object_flag) +KERNEL_TEX(uint4, __svm_nodes) +KERNEL_TEX(uint, __shader_flag) +KERNEL_TEX(uint, __object_flag) /* lookup tables */ -KERNEL_TEX(float, texture_float, __lookup_table) +KERNEL_TEX(float, __lookup_table) /* sobol */ -KERNEL_TEX(uint, texture_uint, __sobol_directions) +KERNEL_TEX(uint, __sobol_directions) -#ifdef __KERNEL_CUDA__ -# if __CUDA_ARCH__ < 300 +#if !defined(__KERNEL_CUDA__) || __CUDA_ARCH__ >= 300 +/* image textures */ +KERNEL_TEX(TextureInfo, __texture_info) +#else /* full-float image */ KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_000) KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_008) @@ -180,12 +182,7 @@ KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_641) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_649) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_657) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_665) - -# else -/* bindless textures */ -KERNEL_TEX(uint, texture_uint, __bindless_mapping) -# endif /* __CUDA_ARCH__ */ -#endif /* __KERNEL_CUDA__ */ +#endif /* defined(__KERNEL_CUDA__) && __CUDA_ARCH__ < 300 */ #undef KERNEL_TEX #undef KERNEL_IMAGE_TEX diff --git a/intern/cycles/kernel/kernels/cpu/kernel.cpp b/intern/cycles/kernel/kernels/cpu/kernel.cpp index 998619ac897..7679ab4f111 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel.cpp @@ -84,130 +84,16 @@ void kernel_tex_copy(KernelGlobals *kg, if(0) { } -#define KERNEL_TEX(type, ttype, tname) \ +#define KERNEL_TEX(type, tname) \ else if(strcmp(name, #tname) == 0) { \ kg->tname.data = (type*)mem; \ kg->tname.width = width; \ } -#define KERNEL_IMAGE_TEX(type, ttype, tname) +#define KERNEL_IMAGE_TEX(type, tname) #include "kernel/kernel_textures.h" - - else if(strstr(name, "__tex_image_float4")) { - texture_image_float4 *tex = NULL; - int id = atoi(name + strlen("__tex_image_float4_")); - int array_index = kernel_tex_index(id); - - if(array_index >= 0) { - if(array_index >= kg->texture_float4_images.size()) { - kg->texture_float4_images.resize(array_index+1); - } - tex = &kg->texture_float4_images[array_index]; - } - - if(tex) { - tex->data = (float4*)mem; - tex->dimensions_set(width, height, depth); - tex->interpolation = interpolation; - tex->extension = extension; - } - } - else if(strstr(name, "__tex_image_float")) { - texture_image_float *tex = NULL; - int id = atoi(name + strlen("__tex_image_float_")); - int array_index = kernel_tex_index(id); - - if(array_index >= 0) { - if(array_index >= kg->texture_float_images.size()) { - kg->texture_float_images.resize(array_index+1); - } - tex = &kg->texture_float_images[array_index]; - } - - if(tex) { - tex->data = (float*)mem; - tex->dimensions_set(width, height, depth); - tex->interpolation = interpolation; - tex->extension = extension; - } - } - else if(strstr(name, "__tex_image_byte4")) { - texture_image_uchar4 *tex = NULL; - int id = atoi(name + strlen("__tex_image_byte4_")); - int array_index = kernel_tex_index(id); - - if(array_index >= 0) { - if(array_index >= kg->texture_byte4_images.size()) { - kg->texture_byte4_images.resize(array_index+1); - } - tex = &kg->texture_byte4_images[array_index]; - } - - if(tex) { - tex->data = (uchar4*)mem; - tex->dimensions_set(width, height, depth); - tex->interpolation = interpolation; - tex->extension = extension; - } - } - else if(strstr(name, "__tex_image_byte")) { - texture_image_uchar *tex = NULL; - int id = atoi(name + strlen("__tex_image_byte_")); - int array_index = kernel_tex_index(id); - - if(array_index >= 0) { - if(array_index >= kg->texture_byte_images.size()) { - kg->texture_byte_images.resize(array_index+1); - } - tex = &kg->texture_byte_images[array_index]; - } - - if(tex) { - tex->data = (uchar*)mem; - tex->dimensions_set(width, height, depth); - tex->interpolation = interpolation; - tex->extension = extension; - } - } - else if(strstr(name, "__tex_image_half4")) { - texture_image_half4 *tex = NULL; - int id = atoi(name + strlen("__tex_image_half4_")); - int array_index = kernel_tex_index(id); - - if(array_index >= 0) { - if(array_index >= kg->texture_half4_images.size()) { - kg->texture_half4_images.resize(array_index+1); - } - tex = &kg->texture_half4_images[array_index]; - } - - if(tex) { - tex->data = (half4*)mem; - tex->dimensions_set(width, height, depth); - tex->interpolation = interpolation; - tex->extension = extension; - } - } - else if(strstr(name, "__tex_image_half")) { - texture_image_half *tex = NULL; - int id = atoi(name + strlen("__tex_image_half_")); - int array_index = kernel_tex_index(id); - - if(array_index >= 0) { - if(array_index >= kg->texture_half_images.size()) { - kg->texture_half_images.resize(array_index+1); - } - tex = &kg->texture_half_images[array_index]; - } - - if(tex) { - tex->data = (half*)mem; - tex->dimensions_set(width, height, depth); - tex->interpolation = interpolation; - tex->extension = extension; - } - } - else + else { assert(0); + } } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h index f6bb4c25012..b2ad60f08c1 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h @@ -17,70 +17,500 @@ #ifndef __KERNEL_CPU_IMAGE_H__ #define __KERNEL_CPU_IMAGE_H__ -#ifdef __KERNEL_CPU__ - CCL_NAMESPACE_BEGIN -ccl_device float4 kernel_tex_image_interp_impl(KernelGlobals *kg, int tex, float x, float y) +template struct TextureInterpolator { +#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \ + { \ + u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \ + u[1] = (( 0.5f * t - 1.0f) * t ) * t + (2.0f/3.0f); \ + u[2] = (( -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \ + u[3] = (1.0f / 6.0f) * t * t * t; \ + } (void)0 + + static ccl_always_inline float4 read(float4 r) + { + return r; + } + + static ccl_always_inline float4 read(uchar4 r) + { + float f = 1.0f/255.0f; + return make_float4(r.x*f, r.y*f, r.z*f, r.w*f); + } + + static ccl_always_inline float4 read(uchar r) + { + float f = r*(1.0f/255.0f); + return make_float4(f, f, f, 1.0f); + } + + static ccl_always_inline float4 read(float r) + { + /* TODO(dingto): Optimize this, so interpolation + * happens on float instead of float4 */ + return make_float4(r, r, r, 1.0f); + } + + static ccl_always_inline float4 read(half4 r) + { + return half4_to_float4(r); + } + + static ccl_always_inline float4 read(half r) + { + float f = half_to_float(r); + return make_float4(f, f, f, 1.0f); + } + + static ccl_always_inline int wrap_periodic(int x, int width) + { + x %= width; + if(x < 0) + x += width; + return x; + } + + static ccl_always_inline int wrap_clamp(int x, int width) + { + return clamp(x, 0, width-1); + } + + static ccl_always_inline float frac(float x, int *ix) + { + int i = float_to_int(x) - ((x < 0.0f)? 1: 0); + *ix = i; + return x - (float)i; + } + + static ccl_always_inline float4 interp(const TextureInfo& info, float x, float y) + { + if(UNLIKELY(!info.data)) + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + + const T *data = (const T*)info.data; + int width = info.width; + int height = info.height; + int ix, iy, nix, niy; + + if(info.interpolation == INTERPOLATION_CLOSEST) { + frac(x*(float)width, &ix); + frac(y*(float)height, &iy); + switch(info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + break; + case EXTENSION_CLIP: + if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + ATTR_FALLTHROUGH; + case EXTENSION_EXTEND: + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + return read(data[ix + iy*width]); + } + else if(info.interpolation == INTERPOLATION_LINEAR) { + float tx = frac(x*(float)width - 0.5f, &ix); + float ty = frac(y*(float)height - 0.5f, &iy); + + switch(info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + + nix = wrap_periodic(ix+1, width); + niy = wrap_periodic(iy+1, height); + break; + case EXTENSION_CLIP: + if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + ATTR_FALLTHROUGH; + case EXTENSION_EXTEND: + nix = wrap_clamp(ix+1, width); + niy = wrap_clamp(iy+1, height); + + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + + float4 r = (1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width]); + r += (1.0f - ty)*tx*read(data[nix + iy*width]); + r += ty*(1.0f - tx)*read(data[ix + niy*width]); + r += ty*tx*read(data[nix + niy*width]); + + return r; + } + else { + /* Bicubic b-spline interpolation. */ + float tx = frac(x*(float)width - 0.5f, &ix); + float ty = frac(y*(float)height - 0.5f, &iy); + int pix, piy, nnix, nniy; + switch(info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + + pix = wrap_periodic(ix-1, width); + piy = wrap_periodic(iy-1, height); + + nix = wrap_periodic(ix+1, width); + niy = wrap_periodic(iy+1, height); + + nnix = wrap_periodic(ix+2, width); + nniy = wrap_periodic(iy+2, height); + break; + case EXTENSION_CLIP: + if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + ATTR_FALLTHROUGH; + case EXTENSION_EXTEND: + pix = wrap_clamp(ix-1, width); + piy = wrap_clamp(iy-1, height); + + nix = wrap_clamp(ix+1, width); + niy = wrap_clamp(iy+1, height); + + nnix = wrap_clamp(ix+2, width); + nniy = wrap_clamp(iy+2, height); + + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + + const int xc[4] = {pix, ix, nix, nnix}; + const int yc[4] = {width * piy, + width * iy, + width * niy, + width * nniy}; + float u[4], v[4]; + /* Some helper macro to keep code reasonable size, + * let compiler to inline all the matrix multiplications. + */ +#define DATA(x, y) (read(data[xc[x] + yc[y]])) +#define TERM(col) \ + (v[col] * (u[0] * DATA(0, col) + \ + u[1] * DATA(1, col) + \ + u[2] * DATA(2, col) + \ + u[3] * DATA(3, col))) + + SET_CUBIC_SPLINE_WEIGHTS(u, tx); + SET_CUBIC_SPLINE_WEIGHTS(v, ty); + + /* Actual interpolation. */ + return TERM(0) + TERM(1) + TERM(2) + TERM(3); + +#undef TERM +#undef DATA + } + } + + static ccl_always_inline float4 interp_3d_closest(const TextureInfo& info, float x, float y, float z) + { + int width = info.width; + int height = info.height; + int depth = info.depth; + int ix, iy, iz; + + frac(x*(float)width, &ix); + frac(y*(float)height, &iy); + frac(z*(float)depth, &iz); + + switch(info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + iz = wrap_periodic(iz, depth); + break; + case EXTENSION_CLIP: + if(x < 0.0f || y < 0.0f || z < 0.0f || + x > 1.0f || y > 1.0f || z > 1.0f) + { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + ATTR_FALLTHROUGH; + case EXTENSION_EXTEND: + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + iz = wrap_clamp(iz, depth); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + + const T *data = (const T*)info.data; + return read(data[ix + iy*width + iz*width*height]); + } + + static ccl_always_inline float4 interp_3d_linear(const TextureInfo& info, float x, float y, float z) + { + int width = info.width; + int height = info.height; + int depth = info.depth; + int ix, iy, iz; + int nix, niy, niz; + + float tx = frac(x*(float)width - 0.5f, &ix); + float ty = frac(y*(float)height - 0.5f, &iy); + float tz = frac(z*(float)depth - 0.5f, &iz); + + switch(info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + iz = wrap_periodic(iz, depth); + + nix = wrap_periodic(ix+1, width); + niy = wrap_periodic(iy+1, height); + niz = wrap_periodic(iz+1, depth); + break; + case EXTENSION_CLIP: + if(x < 0.0f || y < 0.0f || z < 0.0f || + x > 1.0f || y > 1.0f || z > 1.0f) + { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + ATTR_FALLTHROUGH; + case EXTENSION_EXTEND: + nix = wrap_clamp(ix+1, width); + niy = wrap_clamp(iy+1, height); + niz = wrap_clamp(iz+1, depth); + + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + iz = wrap_clamp(iz, depth); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + + const T *data = (const T*)info.data; + float4 r; + + r = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + iz*width*height]); + r += (1.0f - tz)*(1.0f - ty)*tx*read(data[nix + iy*width + iz*width*height]); + r += (1.0f - tz)*ty*(1.0f - tx)*read(data[ix + niy*width + iz*width*height]); + r += (1.0f - tz)*ty*tx*read(data[nix + niy*width + iz*width*height]); + + r += tz*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + niz*width*height]); + r += tz*(1.0f - ty)*tx*read(data[nix + iy*width + niz*width*height]); + r += tz*ty*(1.0f - tx)*read(data[ix + niy*width + niz*width*height]); + r += tz*ty*tx*read(data[nix + niy*width + niz*width*height]); + + return r; + } + + /* TODO(sergey): For some unspeakable reason both GCC-6 and Clang-3.9 are + * causing stack overflow issue in this function unless it is inlined. + * + * Only happens for AVX2 kernel and global __KERNEL_SSE__ vectorization + * enabled. + */ +#ifdef __GNUC__ + static ccl_always_inline +#else + static ccl_never_inline +#endif + float4 interp_3d_tricubic(const TextureInfo& info, float x, float y, float z) + { + int width = info.width; + int height = info.height; + int depth = info.depth; + int ix, iy, iz; + int nix, niy, niz; + /* Tricubic b-spline interpolation. */ + const float tx = frac(x*(float)width - 0.5f, &ix); + const float ty = frac(y*(float)height - 0.5f, &iy); + const float tz = frac(z*(float)depth - 0.5f, &iz); + int pix, piy, piz, nnix, nniy, nniz; + + switch(info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + iz = wrap_periodic(iz, depth); + + pix = wrap_periodic(ix-1, width); + piy = wrap_periodic(iy-1, height); + piz = wrap_periodic(iz-1, depth); + + nix = wrap_periodic(ix+1, width); + niy = wrap_periodic(iy+1, height); + niz = wrap_periodic(iz+1, depth); + + nnix = wrap_periodic(ix+2, width); + nniy = wrap_periodic(iy+2, height); + nniz = wrap_periodic(iz+2, depth); + break; + case EXTENSION_CLIP: + if(x < 0.0f || y < 0.0f || z < 0.0f || + x > 1.0f || y > 1.0f || z > 1.0f) + { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + ATTR_FALLTHROUGH; + case EXTENSION_EXTEND: + pix = wrap_clamp(ix-1, width); + piy = wrap_clamp(iy-1, height); + piz = wrap_clamp(iz-1, depth); + + nix = wrap_clamp(ix+1, width); + niy = wrap_clamp(iy+1, height); + niz = wrap_clamp(iz+1, depth); + + nnix = wrap_clamp(ix+2, width); + nniy = wrap_clamp(iy+2, height); + nniz = wrap_clamp(iz+2, depth); + + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + iz = wrap_clamp(iz, depth); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + + const int xc[4] = {pix, ix, nix, nnix}; + const int yc[4] = {width * piy, + width * iy, + width * niy, + width * nniy}; + const int zc[4] = {width * height * piz, + width * height * iz, + width * height * niz, + width * height * nniz}; + float u[4], v[4], w[4]; + + /* Some helper macro to keep code reasonable size, + * let compiler to inline all the matrix multiplications. + */ +#define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]])) +#define COL_TERM(col, row) \ + (v[col] * (u[0] * DATA(0, col, row) + \ + u[1] * DATA(1, col, row) + \ + u[2] * DATA(2, col, row) + \ + u[3] * DATA(3, col, row))) +#define ROW_TERM(row) \ + (w[row] * (COL_TERM(0, row) + \ + COL_TERM(1, row) + \ + COL_TERM(2, row) + \ + COL_TERM(3, row))) + + SET_CUBIC_SPLINE_WEIGHTS(u, tx); + SET_CUBIC_SPLINE_WEIGHTS(v, ty); + SET_CUBIC_SPLINE_WEIGHTS(w, tz); + + /* Actual interpolation. */ + const T *data = (const T*)info.data; + return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3); + +#undef COL_TERM +#undef ROW_TERM +#undef DATA + } + + static ccl_always_inline float4 interp_3d(const TextureInfo& info, + float x, float y, float z, + int interpolation = INTERPOLATION_LINEAR) + { + if(UNLIKELY(!info.data)) + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + + switch(interpolation) { + case INTERPOLATION_CLOSEST: + return interp_3d_closest(info, x, y, z); + case INTERPOLATION_LINEAR: + return interp_3d_linear(info, x, y, z); + default: + return interp_3d_tricubic(info, x, y, z); + } + } +#undef SET_CUBIC_SPLINE_WEIGHTS +}; + +ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y) { - switch(kernel_tex_type(tex)) { + const TextureInfo& info = kernel_tex_fetch(__texture_info, id); + + switch(kernel_tex_type(id)) { case IMAGE_DATA_TYPE_HALF: - return kg->texture_half_images[kernel_tex_index(tex)].interp(x, y); + return TextureInterpolator::interp(info, x, y); case IMAGE_DATA_TYPE_BYTE: - return kg->texture_byte_images[kernel_tex_index(tex)].interp(x, y); + return TextureInterpolator::interp(info, x, y); case IMAGE_DATA_TYPE_FLOAT: - return kg->texture_float_images[kernel_tex_index(tex)].interp(x, y); + return TextureInterpolator::interp(info, x, y); case IMAGE_DATA_TYPE_HALF4: - return kg->texture_half4_images[kernel_tex_index(tex)].interp(x, y); + return TextureInterpolator::interp(info, x, y); case IMAGE_DATA_TYPE_BYTE4: - return kg->texture_byte4_images[kernel_tex_index(tex)].interp(x, y); + return TextureInterpolator::interp(info, x, y); case IMAGE_DATA_TYPE_FLOAT4: default: - return kg->texture_float4_images[kernel_tex_index(tex)].interp(x, y); + return TextureInterpolator::interp(info, x, y); } } -ccl_device float4 kernel_tex_image_interp_3d_impl(KernelGlobals *kg, int tex, float x, float y, float z) +ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z) { - switch(kernel_tex_type(tex)) { + const TextureInfo& info = kernel_tex_fetch(__texture_info, id); + InterpolationType interp = (InterpolationType)info.interpolation; + + switch(kernel_tex_type(id)) { case IMAGE_DATA_TYPE_HALF: - return kg->texture_half_images[kernel_tex_index(tex)].interp_3d(x, y, z); + return TextureInterpolator::interp_3d(info, x, y, z, interp); case IMAGE_DATA_TYPE_BYTE: - return kg->texture_byte_images[kernel_tex_index(tex)].interp_3d(x, y, z); + return TextureInterpolator::interp_3d(info, x, y, z, interp); case IMAGE_DATA_TYPE_FLOAT: - return kg->texture_float_images[kernel_tex_index(tex)].interp_3d(x, y, z); + return TextureInterpolator::interp_3d(info, x, y, z, interp); case IMAGE_DATA_TYPE_HALF4: - return kg->texture_half4_images[kernel_tex_index(tex)].interp_3d(x, y, z); + return TextureInterpolator::interp_3d(info, x, y, z, interp); case IMAGE_DATA_TYPE_BYTE4: - return kg->texture_byte4_images[kernel_tex_index(tex)].interp_3d(x, y, z); + return TextureInterpolator::interp_3d(info, x, y, z, interp); case IMAGE_DATA_TYPE_FLOAT4: default: - return kg->texture_float4_images[kernel_tex_index(tex)].interp_3d(x, y, z); + return TextureInterpolator::interp_3d(info, x, y, z, interp); } } -ccl_device float4 kernel_tex_image_interp_3d_ex_impl(KernelGlobals *kg, int tex, float x, float y, float z, int interpolation) +ccl_device float4 kernel_tex_image_interp_3d_ex(KernelGlobals *kg, int id, float x, float y, float z, int interp) { - switch(kernel_tex_type(tex)) { + const TextureInfo& info = kernel_tex_fetch(__texture_info, id); + + switch(kernel_tex_type(id)) { case IMAGE_DATA_TYPE_HALF: - return kg->texture_half_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation); + return TextureInterpolator::interp_3d(info, x, y, z, interp); case IMAGE_DATA_TYPE_BYTE: - return kg->texture_byte_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation); + return TextureInterpolator::interp_3d(info, x, y, z, interp); case IMAGE_DATA_TYPE_FLOAT: - return kg->texture_float_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation); + return TextureInterpolator::interp_3d(info, x, y, z, interp); case IMAGE_DATA_TYPE_HALF4: - return kg->texture_half4_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation); + return TextureInterpolator::interp_3d(info, x, y, z, interp); case IMAGE_DATA_TYPE_BYTE4: - return kg->texture_byte4_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation); + return TextureInterpolator::interp_3d(info, x, y, z, interp); case IMAGE_DATA_TYPE_FLOAT4: default: - return kg->texture_float4_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation); + return TextureInterpolator::interp_3d(info, x, y, z, interp); } } CCL_NAMESPACE_END -#endif // __KERNEL_CPU__ - - #endif // __KERNEL_CPU_IMAGE_H__ diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu index 1ac6afd167a..3c93e00ccf1 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel.cu +++ b/intern/cycles/kernel/kernels/cuda/kernel.cu @@ -26,6 +26,7 @@ #include "kernel/kernel_math.h" #include "kernel/kernel_types.h" #include "kernel/kernel_globals.h" +#include "kernel/kernels/cuda/kernel_cuda_image.h" #include "kernel/kernel_film.h" #include "kernel/kernel_path.h" #include "kernel/kernel_path_branched.h" diff --git a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h b/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h new file mode 100644 index 00000000000..00f6954003d --- /dev/null +++ b/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h @@ -0,0 +1,175 @@ +/* + * Copyright 2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#if __CUDA_ARCH__ >= 300 + +/* Kepler */ + +ccl_device float4 kernel_tex_image_interp(void *kg, int id, float x, float y) +{ + const TextureInfo& info = kernel_tex_fetch(__texture_info, id); + CUtexObject tex = (CUtexObject)info.data; + + /* float4, byte4 and half4 */ + const int texture_type = kernel_tex_type(id); + if(texture_type == IMAGE_DATA_TYPE_FLOAT4 || + texture_type == IMAGE_DATA_TYPE_BYTE4 || + texture_type == IMAGE_DATA_TYPE_HALF4) + { + return tex2D(tex, x, y); + } + /* float, byte and half */ + else { + float f = tex2D(tex, x, y); + return make_float4(f, f, f, 1.0f); + } +} + +ccl_device float4 kernel_tex_image_interp_3d(void *kg, int id, float x, float y, float z) +{ + const TextureInfo& info = kernel_tex_fetch(__texture_info, id); + CUtexObject tex = (CUtexObject)info.data; + + const int texture_type = kernel_tex_type(id); + if(texture_type == IMAGE_DATA_TYPE_FLOAT4 || + texture_type == IMAGE_DATA_TYPE_BYTE4 || + texture_type == IMAGE_DATA_TYPE_HALF4) + { + return tex3D(tex, x, y, z); + } + else { + float f = tex3D(tex, x, y, z); + return make_float4(f, f, f, 1.0f); + } +} + +#else + +/* Fermi */ + +ccl_device float4 kernel_tex_image_interp(void *kg, int id, float x, float y) +{ + float4 r; + switch(id) { + case 0: r = tex2D(__tex_image_float4_000, x, y); break; + case 8: r = tex2D(__tex_image_float4_008, x, y); break; + case 16: r = tex2D(__tex_image_float4_016, x, y); break; + case 24: r = tex2D(__tex_image_float4_024, x, y); break; + case 32: r = tex2D(__tex_image_float4_032, x, y); break; + case 1: r = tex2D(__tex_image_byte4_001, x, y); break; + case 9: r = tex2D(__tex_image_byte4_009, x, y); break; + case 17: r = tex2D(__tex_image_byte4_017, x, y); break; + case 25: r = tex2D(__tex_image_byte4_025, x, y); break; + case 33: r = tex2D(__tex_image_byte4_033, x, y); break; + case 41: r = tex2D(__tex_image_byte4_041, x, y); break; + case 49: r = tex2D(__tex_image_byte4_049, x, y); break; + case 57: r = tex2D(__tex_image_byte4_057, x, y); break; + case 65: r = tex2D(__tex_image_byte4_065, x, y); break; + case 73: r = tex2D(__tex_image_byte4_073, x, y); break; + case 81: r = tex2D(__tex_image_byte4_081, x, y); break; + case 89: r = tex2D(__tex_image_byte4_089, x, y); break; + case 97: r = tex2D(__tex_image_byte4_097, x, y); break; + case 105: r = tex2D(__tex_image_byte4_105, x, y); break; + case 113: r = tex2D(__tex_image_byte4_113, x, y); break; + case 121: r = tex2D(__tex_image_byte4_121, x, y); break; + case 129: r = tex2D(__tex_image_byte4_129, x, y); break; + case 137: r = tex2D(__tex_image_byte4_137, x, y); break; + case 145: r = tex2D(__tex_image_byte4_145, x, y); break; + case 153: r = tex2D(__tex_image_byte4_153, x, y); break; + case 161: r = tex2D(__tex_image_byte4_161, x, y); break; + case 169: r = tex2D(__tex_image_byte4_169, x, y); break; + case 177: r = tex2D(__tex_image_byte4_177, x, y); break; + case 185: r = tex2D(__tex_image_byte4_185, x, y); break; + case 193: r = tex2D(__tex_image_byte4_193, x, y); break; + case 201: r = tex2D(__tex_image_byte4_201, x, y); break; + case 209: r = tex2D(__tex_image_byte4_209, x, y); break; + case 217: r = tex2D(__tex_image_byte4_217, x, y); break; + case 225: r = tex2D(__tex_image_byte4_225, x, y); break; + case 233: r = tex2D(__tex_image_byte4_233, x, y); break; + case 241: r = tex2D(__tex_image_byte4_241, x, y); break; + case 249: r = tex2D(__tex_image_byte4_249, x, y); break; + case 257: r = tex2D(__tex_image_byte4_257, x, y); break; + case 265: r = tex2D(__tex_image_byte4_265, x, y); break; + case 273: r = tex2D(__tex_image_byte4_273, x, y); break; + case 281: r = tex2D(__tex_image_byte4_281, x, y); break; + case 289: r = tex2D(__tex_image_byte4_289, x, y); break; + case 297: r = tex2D(__tex_image_byte4_297, x, y); break; + case 305: r = tex2D(__tex_image_byte4_305, x, y); break; + case 313: r = tex2D(__tex_image_byte4_313, x, y); break; + case 321: r = tex2D(__tex_image_byte4_321, x, y); break; + case 329: r = tex2D(__tex_image_byte4_329, x, y); break; + case 337: r = tex2D(__tex_image_byte4_337, x, y); break; + case 345: r = tex2D(__tex_image_byte4_345, x, y); break; + case 353: r = tex2D(__tex_image_byte4_353, x, y); break; + case 361: r = tex2D(__tex_image_byte4_361, x, y); break; + case 369: r = tex2D(__tex_image_byte4_369, x, y); break; + case 377: r = tex2D(__tex_image_byte4_377, x, y); break; + case 385: r = tex2D(__tex_image_byte4_385, x, y); break; + case 393: r = tex2D(__tex_image_byte4_393, x, y); break; + case 401: r = tex2D(__tex_image_byte4_401, x, y); break; + case 409: r = tex2D(__tex_image_byte4_409, x, y); break; + case 417: r = tex2D(__tex_image_byte4_417, x, y); break; + case 425: r = tex2D(__tex_image_byte4_425, x, y); break; + case 433: r = tex2D(__tex_image_byte4_433, x, y); break; + case 441: r = tex2D(__tex_image_byte4_441, x, y); break; + case 449: r = tex2D(__tex_image_byte4_449, x, y); break; + case 457: r = tex2D(__tex_image_byte4_457, x, y); break; + case 465: r = tex2D(__tex_image_byte4_465, x, y); break; + case 473: r = tex2D(__tex_image_byte4_473, x, y); break; + case 481: r = tex2D(__tex_image_byte4_481, x, y); break; + case 489: r = tex2D(__tex_image_byte4_489, x, y); break; + case 497: r = tex2D(__tex_image_byte4_497, x, y); break; + case 505: r = tex2D(__tex_image_byte4_505, x, y); break; + case 513: r = tex2D(__tex_image_byte4_513, x, y); break; + case 521: r = tex2D(__tex_image_byte4_521, x, y); break; + case 529: r = tex2D(__tex_image_byte4_529, x, y); break; + case 537: r = tex2D(__tex_image_byte4_537, x, y); break; + case 545: r = tex2D(__tex_image_byte4_545, x, y); break; + case 553: r = tex2D(__tex_image_byte4_553, x, y); break; + case 561: r = tex2D(__tex_image_byte4_561, x, y); break; + case 569: r = tex2D(__tex_image_byte4_569, x, y); break; + case 577: r = tex2D(__tex_image_byte4_577, x, y); break; + case 585: r = tex2D(__tex_image_byte4_585, x, y); break; + case 593: r = tex2D(__tex_image_byte4_593, x, y); break; + case 601: r = tex2D(__tex_image_byte4_601, x, y); break; + case 609: r = tex2D(__tex_image_byte4_609, x, y); break; + case 617: r = tex2D(__tex_image_byte4_617, x, y); break; + case 625: r = tex2D(__tex_image_byte4_625, x, y); break; + case 633: r = tex2D(__tex_image_byte4_633, x, y); break; + case 641: r = tex2D(__tex_image_byte4_641, x, y); break; + case 649: r = tex2D(__tex_image_byte4_649, x, y); break; + case 657: r = tex2D(__tex_image_byte4_657, x, y); break; + case 665: r = tex2D(__tex_image_byte4_665, x, y); break; + default: r = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + return r; +} + +ccl_device float4 kernel_tex_image_interp_3d(void *kg, int id, float x, float y, float z) +{ + float4 r; + switch(id) { + case 0: r = tex3D(__tex_image_float4_3d_000, x, y, z); break; + case 8: r = tex3D(__tex_image_float4_3d_008, x, y, z); break; + case 16: r = tex3D(__tex_image_float4_3d_016, x, y, z); break; + case 24: r = tex3D(__tex_image_float4_3d_024, x, y, z); break; + case 32: r = tex3D(__tex_image_float4_3d_032, x, y, z); break; + } + return r; +} + +#endif + diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl index 66b6e19de84..9d5d784e140 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel.cl @@ -20,7 +20,7 @@ #include "kernel/kernel_math.h" #include "kernel/kernel_types.h" #include "kernel/kernel_globals.h" -#include "kernel/kernel_image_opencl.h" +#include "kernel/kernels/opencl/kernel_opencl_image.h" #include "kernel/kernel_film.h" diff --git a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h new file mode 100644 index 00000000000..514980e731e --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h @@ -0,0 +1,229 @@ +/* + * Copyright 2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* For OpenCL we do manual lookup and interpolation. */ + +ccl_device_inline ccl_global TextureInfo* kernel_tex_info(KernelGlobals *kg, uint id) { + const uint tex_offset = id +#define KERNEL_TEX(type, name) + 1 +#include "kernel/kernel_textures.h" + ; + + return &((ccl_global TextureInfo*)kg->buffers[0])[tex_offset]; +} + +#define tex_fetch(type, info, index) ((ccl_global type*)(kg->buffers[info->cl_buffer] + info->data))[(index)] + +ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int id, int offset) +{ + const ccl_global TextureInfo *info = kernel_tex_info(kg, id); + const int texture_type = kernel_tex_type(id); + + /* Float4 */ + if(texture_type == IMAGE_DATA_TYPE_FLOAT4) { + return tex_fetch(float4, info, offset); + } + /* Byte4 */ + else if(texture_type == IMAGE_DATA_TYPE_BYTE4) { + uchar4 r = tex_fetch(uchar4, info, offset); + float f = 1.0f/255.0f; + return make_float4(r.x*f, r.y*f, r.z*f, r.w*f); + } + /* Float */ + else if(texture_type == IMAGE_DATA_TYPE_FLOAT) { + float f = tex_fetch(float, info, offset); + return make_float4(f, f, f, 1.0f); + } + /* Byte */ + else { + uchar r = tex_fetch(uchar, info, offset); + float f = r * (1.0f/255.0f); + return make_float4(f, f, f, 1.0f); + } +} + +ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width) +{ + x %= width; + if(x < 0) + x += width; + return x; +} + +ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width) +{ + return clamp(x, 0, width-1); +} + +ccl_device_inline float svm_image_texture_frac(float x, int *ix) +{ + int i = float_to_int(x) - ((x < 0.0f)? 1: 0); + *ix = i; + return x - (float)i; +} + +ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y) +{ + const ccl_global TextureInfo *info = kernel_tex_info(kg, id); + + uint width = info->width; + uint height = info->height; + uint offset = 0; + uint interpolation = info->interpolation; + uint extension = info->extension; + + /* Actual sampling. */ + float4 r; + int ix, iy, nix, niy; + if(interpolation == INTERPOLATION_CLOSEST) { + svm_image_texture_frac(x*width, &ix); + svm_image_texture_frac(y*height, &iy); + + if(extension == EXTENSION_REPEAT) { + ix = svm_image_texture_wrap_periodic(ix, width); + iy = svm_image_texture_wrap_periodic(iy, height); + } + else { + if(extension == EXTENSION_CLIP) { + if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + } + /* Fall through. */ + /* EXTENSION_EXTEND */ + ix = svm_image_texture_wrap_clamp(ix, width); + iy = svm_image_texture_wrap_clamp(iy, height); + } + + r = svm_image_texture_read(kg, id, offset + ix + iy*width); + } + else { /* INTERPOLATION_LINEAR */ + float tx = svm_image_texture_frac(x*width - 0.5f, &ix); + float ty = svm_image_texture_frac(y*height - 0.5f, &iy); + + if(extension == EXTENSION_REPEAT) { + ix = svm_image_texture_wrap_periodic(ix, width); + iy = svm_image_texture_wrap_periodic(iy, height); + + nix = svm_image_texture_wrap_periodic(ix+1, width); + niy = svm_image_texture_wrap_periodic(iy+1, height); + } + else { + if(extension == EXTENSION_CLIP) { + if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + } + nix = svm_image_texture_wrap_clamp(ix+1, width); + niy = svm_image_texture_wrap_clamp(iy+1, height); + ix = svm_image_texture_wrap_clamp(ix, width); + iy = svm_image_texture_wrap_clamp(iy, height); + } + + r = (1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + iy*width); + r += (1.0f - ty)*tx*svm_image_texture_read(kg, id, offset + nix + iy*width); + r += ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width); + r += ty*tx*svm_image_texture_read(kg, id, offset + nix + niy*width); + } + return r; +} + + +ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z) +{ + const ccl_global TextureInfo *info = kernel_tex_info(kg, id); + + uint width = info->width; + uint height = info->height; + uint offset = 0; + uint depth = info->depth; + uint interpolation = info->interpolation; + uint extension = info->extension; + + /* Actual sampling. */ + float4 r; + int ix, iy, iz, nix, niy, niz; + if(interpolation == INTERPOLATION_CLOSEST) { + svm_image_texture_frac(x*width, &ix); + svm_image_texture_frac(y*height, &iy); + svm_image_texture_frac(z*depth, &iz); + + if(extension == EXTENSION_REPEAT) { + ix = svm_image_texture_wrap_periodic(ix, width); + iy = svm_image_texture_wrap_periodic(iy, height); + iz = svm_image_texture_wrap_periodic(iz, depth); + } + else { + if(extension == EXTENSION_CLIP) { + if(x < 0.0f || y < 0.0f || z < 0.0f || + x > 1.0f || y > 1.0f || z > 1.0f) + { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + } + /* Fall through. */ + /* EXTENSION_EXTEND */ + ix = svm_image_texture_wrap_clamp(ix, width); + iy = svm_image_texture_wrap_clamp(iy, height); + iz = svm_image_texture_wrap_clamp(iz, depth); + } + r = svm_image_texture_read(kg, id, offset + ix + iy*width + iz*width*height); + } + else { /* INTERPOLATION_LINEAR */ + float tx = svm_image_texture_frac(x*(float)width - 0.5f, &ix); + float ty = svm_image_texture_frac(y*(float)height - 0.5f, &iy); + float tz = svm_image_texture_frac(z*(float)depth - 0.5f, &iz); + + if(extension == EXTENSION_REPEAT) { + ix = svm_image_texture_wrap_periodic(ix, width); + iy = svm_image_texture_wrap_periodic(iy, height); + iz = svm_image_texture_wrap_periodic(iz, depth); + + nix = svm_image_texture_wrap_periodic(ix+1, width); + niy = svm_image_texture_wrap_periodic(iy+1, height); + niz = svm_image_texture_wrap_periodic(iz+1, depth); + } + else { + if(extension == EXTENSION_CLIP) { + if(x < 0.0f || y < 0.0f || z < 0.0f || + x > 1.0f || y > 1.0f || z > 1.0f) + { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + } + /* Fall through. */ + /* EXTENSION_EXTEND */ + nix = svm_image_texture_wrap_clamp(ix+1, width); + niy = svm_image_texture_wrap_clamp(iy+1, height); + niz = svm_image_texture_wrap_clamp(iz+1, depth); + + ix = svm_image_texture_wrap_clamp(ix, width); + iy = svm_image_texture_wrap_clamp(iy, height); + iz = svm_image_texture_wrap_clamp(iz, depth); + } + + r = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + iy*width + iz*width*height); + r += (1.0f - tz)*(1.0f - ty)*tx*svm_image_texture_read(kg, id, offset + nix + iy*width + iz*width*height); + r += (1.0f - tz)*ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width + iz*width*height); + r += (1.0f - tz)*ty*tx*svm_image_texture_read(kg, id, offset + nix + niy*width + iz*width*height); + + r += tz*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + iy*width + niz*width*height); + r += tz*(1.0f - ty)*tx*svm_image_texture_read(kg, id, offset + nix + iy*width + niz*width*height); + r += tz*ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width + niz*width*height); + r += tz*ty*tx*svm_image_texture_read(kg, id, offset + nix + niy*width + niz*width*height); + } + return r; +} diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp index 8ad2e12b067..5b991bf065c 100644 --- a/intern/cycles/kernel/osl/osl_services.cpp +++ b/intern/cycles/kernel/osl/osl_services.cpp @@ -962,7 +962,7 @@ bool OSLRenderServices::texture(ustring filename, if(filename.length() && filename[0] == '@') { int slot = atoi(filename.c_str() + 1); - float4 rgba = kernel_tex_image_interp(slot, s, 1.0f - t); + float4 rgba = kernel_tex_image_interp(kg, slot, s, 1.0f - t); result[0] = rgba[0]; if(nchannels > 1) @@ -1043,7 +1043,7 @@ bool OSLRenderServices::texture3d(ustring filename, bool status; if(filename.length() && filename[0] == '@') { int slot = atoi(filename.c_str() + 1); - float4 rgba = kernel_tex_image_interp_3d(slot, P.x, P.y, P.z); + float4 rgba = kernel_tex_image_interp_3d(kg, slot, P.x, P.y, P.z); result[0] = rgba[0]; if(nchannels > 1) diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h index 558d327bc76..21886ee62ee 100644 --- a/intern/cycles/kernel/split/kernel_split_common.h +++ b/intern/cycles/kernel/split/kernel_split_common.h @@ -29,7 +29,10 @@ #endif #ifdef __KERNEL_OPENCL__ -# include "kernel/kernel_image_opencl.h" +# include "kernel/kernels/opencl/kernel_opencl_image.h" +#endif +#ifdef __KERNEL_CUDA__ +# include "kernel/kernels/cuda/kernel_cuda_image.h" #endif #ifdef __KERNEL_CPU__ # include "kernel/kernels/cpu/kernel_cpu_image.h" diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h index 6d6e92e73f6..4226e7adfe0 100644 --- a/intern/cycles/kernel/svm/svm_image.h +++ b/intern/cycles/kernel/svm/svm_image.h @@ -18,135 +18,7 @@ CCL_NAMESPACE_BEGIN ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint srgb, uint use_alpha) { -#ifdef __KERNEL_CPU__ - float4 r = kernel_tex_image_interp(id, x, y); -#elif defined(__KERNEL_OPENCL__) float4 r = kernel_tex_image_interp(kg, id, x, y); -#else - float4 r; - -# if __CUDA_ARCH__ < 300 - /* not particularly proud of this massive switch, what are the - * alternatives? - * - use a single big 1D texture, and do our own lookup/filtering - * - group by size and use a 3d texture, performance impact - * - group into larger texture with some padding for correct lerp - * - * also note that cuda has a textures limit (128 for Fermi, 256 for Kepler), - * and we cannot use all since we still need some for other storage */ - - switch(id) { - case 0: r = kernel_tex_image_interp(__tex_image_float4_000, x, y); break; - case 8: r = kernel_tex_image_interp(__tex_image_float4_008, x, y); break; - case 16: r = kernel_tex_image_interp(__tex_image_float4_016, x, y); break; - case 24: r = kernel_tex_image_interp(__tex_image_float4_024, x, y); break; - case 32: r = kernel_tex_image_interp(__tex_image_float4_032, x, y); break; - case 1: r = kernel_tex_image_interp(__tex_image_byte4_001, x, y); break; - case 9: r = kernel_tex_image_interp(__tex_image_byte4_009, x, y); break; - case 17: r = kernel_tex_image_interp(__tex_image_byte4_017, x, y); break; - case 25: r = kernel_tex_image_interp(__tex_image_byte4_025, x, y); break; - case 33: r = kernel_tex_image_interp(__tex_image_byte4_033, x, y); break; - case 41: r = kernel_tex_image_interp(__tex_image_byte4_041, x, y); break; - case 49: r = kernel_tex_image_interp(__tex_image_byte4_049, x, y); break; - case 57: r = kernel_tex_image_interp(__tex_image_byte4_057, x, y); break; - case 65: r = kernel_tex_image_interp(__tex_image_byte4_065, x, y); break; - case 73: r = kernel_tex_image_interp(__tex_image_byte4_073, x, y); break; - case 81: r = kernel_tex_image_interp(__tex_image_byte4_081, x, y); break; - case 89: r = kernel_tex_image_interp(__tex_image_byte4_089, x, y); break; - case 97: r = kernel_tex_image_interp(__tex_image_byte4_097, x, y); break; - case 105: r = kernel_tex_image_interp(__tex_image_byte4_105, x, y); break; - case 113: r = kernel_tex_image_interp(__tex_image_byte4_113, x, y); break; - case 121: r = kernel_tex_image_interp(__tex_image_byte4_121, x, y); break; - case 129: r = kernel_tex_image_interp(__tex_image_byte4_129, x, y); break; - case 137: r = kernel_tex_image_interp(__tex_image_byte4_137, x, y); break; - case 145: r = kernel_tex_image_interp(__tex_image_byte4_145, x, y); break; - case 153: r = kernel_tex_image_interp(__tex_image_byte4_153, x, y); break; - case 161: r = kernel_tex_image_interp(__tex_image_byte4_161, x, y); break; - case 169: r = kernel_tex_image_interp(__tex_image_byte4_169, x, y); break; - case 177: r = kernel_tex_image_interp(__tex_image_byte4_177, x, y); break; - case 185: r = kernel_tex_image_interp(__tex_image_byte4_185, x, y); break; - case 193: r = kernel_tex_image_interp(__tex_image_byte4_193, x, y); break; - case 201: r = kernel_tex_image_interp(__tex_image_byte4_201, x, y); break; - case 209: r = kernel_tex_image_interp(__tex_image_byte4_209, x, y); break; - case 217: r = kernel_tex_image_interp(__tex_image_byte4_217, x, y); break; - case 225: r = kernel_tex_image_interp(__tex_image_byte4_225, x, y); break; - case 233: r = kernel_tex_image_interp(__tex_image_byte4_233, x, y); break; - case 241: r = kernel_tex_image_interp(__tex_image_byte4_241, x, y); break; - case 249: r = kernel_tex_image_interp(__tex_image_byte4_249, x, y); break; - case 257: r = kernel_tex_image_interp(__tex_image_byte4_257, x, y); break; - case 265: r = kernel_tex_image_interp(__tex_image_byte4_265, x, y); break; - case 273: r = kernel_tex_image_interp(__tex_image_byte4_273, x, y); break; - case 281: r = kernel_tex_image_interp(__tex_image_byte4_281, x, y); break; - case 289: r = kernel_tex_image_interp(__tex_image_byte4_289, x, y); break; - case 297: r = kernel_tex_image_interp(__tex_image_byte4_297, x, y); break; - case 305: r = kernel_tex_image_interp(__tex_image_byte4_305, x, y); break; - case 313: r = kernel_tex_image_interp(__tex_image_byte4_313, x, y); break; - case 321: r = kernel_tex_image_interp(__tex_image_byte4_321, x, y); break; - case 329: r = kernel_tex_image_interp(__tex_image_byte4_329, x, y); break; - case 337: r = kernel_tex_image_interp(__tex_image_byte4_337, x, y); break; - case 345: r = kernel_tex_image_interp(__tex_image_byte4_345, x, y); break; - case 353: r = kernel_tex_image_interp(__tex_image_byte4_353, x, y); break; - case 361: r = kernel_tex_image_interp(__tex_image_byte4_361, x, y); break; - case 369: r = kernel_tex_image_interp(__tex_image_byte4_369, x, y); break; - case 377: r = kernel_tex_image_interp(__tex_image_byte4_377, x, y); break; - case 385: r = kernel_tex_image_interp(__tex_image_byte4_385, x, y); break; - case 393: r = kernel_tex_image_interp(__tex_image_byte4_393, x, y); break; - case 401: r = kernel_tex_image_interp(__tex_image_byte4_401, x, y); break; - case 409: r = kernel_tex_image_interp(__tex_image_byte4_409, x, y); break; - case 417: r = kernel_tex_image_interp(__tex_image_byte4_417, x, y); break; - case 425: r = kernel_tex_image_interp(__tex_image_byte4_425, x, y); break; - case 433: r = kernel_tex_image_interp(__tex_image_byte4_433, x, y); break; - case 441: r = kernel_tex_image_interp(__tex_image_byte4_441, x, y); break; - case 449: r = kernel_tex_image_interp(__tex_image_byte4_449, x, y); break; - case 457: r = kernel_tex_image_interp(__tex_image_byte4_457, x, y); break; - case 465: r = kernel_tex_image_interp(__tex_image_byte4_465, x, y); break; - case 473: r = kernel_tex_image_interp(__tex_image_byte4_473, x, y); break; - case 481: r = kernel_tex_image_interp(__tex_image_byte4_481, x, y); break; - case 489: r = kernel_tex_image_interp(__tex_image_byte4_489, x, y); break; - case 497: r = kernel_tex_image_interp(__tex_image_byte4_497, x, y); break; - case 505: r = kernel_tex_image_interp(__tex_image_byte4_505, x, y); break; - case 513: r = kernel_tex_image_interp(__tex_image_byte4_513, x, y); break; - case 521: r = kernel_tex_image_interp(__tex_image_byte4_521, x, y); break; - case 529: r = kernel_tex_image_interp(__tex_image_byte4_529, x, y); break; - case 537: r = kernel_tex_image_interp(__tex_image_byte4_537, x, y); break; - case 545: r = kernel_tex_image_interp(__tex_image_byte4_545, x, y); break; - case 553: r = kernel_tex_image_interp(__tex_image_byte4_553, x, y); break; - case 561: r = kernel_tex_image_interp(__tex_image_byte4_561, x, y); break; - case 569: r = kernel_tex_image_interp(__tex_image_byte4_569, x, y); break; - case 577: r = kernel_tex_image_interp(__tex_image_byte4_577, x, y); break; - case 585: r = kernel_tex_image_interp(__tex_image_byte4_585, x, y); break; - case 593: r = kernel_tex_image_interp(__tex_image_byte4_593, x, y); break; - case 601: r = kernel_tex_image_interp(__tex_image_byte4_601, x, y); break; - case 609: r = kernel_tex_image_interp(__tex_image_byte4_609, x, y); break; - case 617: r = kernel_tex_image_interp(__tex_image_byte4_617, x, y); break; - case 625: r = kernel_tex_image_interp(__tex_image_byte4_625, x, y); break; - case 633: r = kernel_tex_image_interp(__tex_image_byte4_633, x, y); break; - case 641: r = kernel_tex_image_interp(__tex_image_byte4_641, x, y); break; - case 649: r = kernel_tex_image_interp(__tex_image_byte4_649, x, y); break; - case 657: r = kernel_tex_image_interp(__tex_image_byte4_657, x, y); break; - case 665: r = kernel_tex_image_interp(__tex_image_byte4_665, x, y); break; - default: - kernel_assert(0); - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } -# else - CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id); - /* float4, byte4 and half4 */ - const int texture_type = kernel_tex_type(id); - if(texture_type == IMAGE_DATA_TYPE_FLOAT4 || - texture_type == IMAGE_DATA_TYPE_BYTE4 || - texture_type == IMAGE_DATA_TYPE_HALF4) - { - r = kernel_tex_image_interp_float4(tex, x, y); - } - /* float, byte and half */ - else { - float f = kernel_tex_image_interp_float(tex, x, y); - r = make_float4(f, f, f, 1.0f); - } -# endif -#endif - const float alpha = r.w; if(use_alpha && alpha != 1.0f && alpha != 0.0f) { diff --git a/intern/cycles/kernel/svm/svm_voxel.h b/intern/cycles/kernel/svm/svm_voxel.h index f4a5b2b2994..466480d21b6 100644 --- a/intern/cycles/kernel/svm/svm_voxel.h +++ b/intern/cycles/kernel/svm/svm_voxel.h @@ -42,29 +42,8 @@ ccl_device void svm_node_tex_voxel(KernelGlobals *kg, tfm.w = read_node_float(kg, offset); co = transform_point(&tfm, co); } - float4 r; -# if defined(__KERNEL_CUDA__) -# if __CUDA_ARCH__ >= 300 - CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id); - const int texture_type = kernel_tex_type(id); - if(texture_type == IMAGE_DATA_TYPE_FLOAT4 || - texture_type == IMAGE_DATA_TYPE_BYTE4 || - texture_type == IMAGE_DATA_TYPE_HALF4) - { - r = kernel_tex_image_interp_3d_float4(tex, co.x, co.y, co.z); - } - else { - float f = kernel_tex_image_interp_3d_float(tex, co.x, co.y, co.z); - r = make_float4(f, f, f, 1.0f); - } -# else /* __CUDA_ARCH__ >= 300 */ - r = volume_image_texture_3d(id, co.x, co.y, co.z); -# endif -# elif defined(__KERNEL_OPENCL__) - r = kernel_tex_image_interp_3d(kg, id, co.x, co.y, co.z); -# else - r = kernel_tex_image_interp_3d(id, co.x, co.y, co.z); -# endif /* __KERNEL_CUDA__ */ + + float4 r = kernel_tex_image_interp_3d(kg, id, co.x, co.y, co.z); #else float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f); #endif diff --git a/intern/cycles/util/util_texture.h b/intern/cycles/util/util_texture.h index df255f43059..f22948d9bcd 100644 --- a/intern/cycles/util/util_texture.h +++ b/intern/cycles/util/util_texture.h @@ -46,12 +46,64 @@ CCL_NAMESPACE_BEGIN #if defined (__KERNEL_CUDA__) && (__CUDA_ARCH__ < 300) # define kernel_tex_type(tex) (tex < TEX_START_BYTE4_CUDA ? IMAGE_DATA_TYPE_FLOAT4 : IMAGE_DATA_TYPE_BYTE4) -# define kernel_tex_index(tex) (tex) #else # define kernel_tex_type(tex) (tex & IMAGE_DATA_TYPE_MASK) -# define kernel_tex_index(tex) (tex >> IMAGE_DATA_TYPE_SHIFT) #endif +/* Interpolation types for textures + * cuda also use texture space to store other objects */ +enum InterpolationType { + INTERPOLATION_NONE = -1, + INTERPOLATION_LINEAR = 0, + INTERPOLATION_CLOSEST = 1, + INTERPOLATION_CUBIC = 2, + INTERPOLATION_SMART = 3, + + INTERPOLATION_NUM_TYPES, +}; + +/* Texture types + * Since we store the type in the lower bits of a flat index, + * the shift and bit mask constant below need to be kept in sync. */ +enum ImageDataType { + IMAGE_DATA_TYPE_FLOAT4 = 0, + IMAGE_DATA_TYPE_BYTE4 = 1, + IMAGE_DATA_TYPE_HALF4 = 2, + IMAGE_DATA_TYPE_FLOAT = 3, + IMAGE_DATA_TYPE_BYTE = 4, + IMAGE_DATA_TYPE_HALF = 5, + + IMAGE_DATA_NUM_TYPES +}; + +#define IMAGE_DATA_TYPE_SHIFT 3 +#define IMAGE_DATA_TYPE_MASK 0x7 + +/* Extension types for textures. + * + * Defines how the image is extrapolated past its original bounds. */ +enum ExtensionType { + /* Cause the image to repeat horizontally and vertically. */ + EXTENSION_REPEAT = 0, + /* Extend by repeating edge pixels of the image. */ + EXTENSION_EXTEND = 1, + /* Clip to image size and set exterior pixels as transparent. */ + EXTENSION_CLIP = 2, + + EXTENSION_NUM_TYPES, +}; + +typedef struct TextureInfo { + /* Pointer, offset or texture depending on device. */ + uint64_t data; + /* Buffer number for OpenCL. */ + uint cl_buffer; + /* Interpolation and extension type. */ + uint interpolation, extension; + /* Dimensions. */ + uint width, height, depth; +} TextureInfo; + CCL_NAMESPACE_END #endif /* __UTIL_TEXTURE_H__ */ diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h index aabca6c81fc..84206a7ba5a 100644 --- a/intern/cycles/util/util_types.h +++ b/intern/cycles/util/util_types.h @@ -101,52 +101,6 @@ ccl_device_inline size_t round_down(size_t x, size_t multiple) return (x / multiple) * multiple; } -/* Interpolation types for textures - * cuda also use texture space to store other objects */ -enum InterpolationType { - INTERPOLATION_NONE = -1, - INTERPOLATION_LINEAR = 0, - INTERPOLATION_CLOSEST = 1, - INTERPOLATION_CUBIC = 2, - INTERPOLATION_SMART = 3, - - INTERPOLATION_NUM_TYPES, -}; - -/* Texture types - * Since we store the type in the lower bits of a flat index, - * the shift and bit mask constant below need to be kept in sync. - */ - -enum ImageDataType { - IMAGE_DATA_TYPE_FLOAT4 = 0, - IMAGE_DATA_TYPE_BYTE4 = 1, - IMAGE_DATA_TYPE_HALF4 = 2, - IMAGE_DATA_TYPE_FLOAT = 3, - IMAGE_DATA_TYPE_BYTE = 4, - IMAGE_DATA_TYPE_HALF = 5, - - IMAGE_DATA_NUM_TYPES -}; - -#define IMAGE_DATA_TYPE_SHIFT 3 -#define IMAGE_DATA_TYPE_MASK 0x7 - -/* Extension types for textures. - * - * Defines how the image is extrapolated past its original bounds. - */ -enum ExtensionType { - /* Cause the image to repeat horizontally and vertically. */ - EXTENSION_REPEAT = 0, - /* Extend by repeating edge pixels of the image. */ - EXTENSION_EXTEND = 1, - /* Clip to image size and set exterior pixels as transparent. */ - EXTENSION_CLIP = 2, - - EXTENSION_NUM_TYPES, -}; - CCL_NAMESPACE_END /* Vectorized types declaration. */ -- cgit v1.2.3