diff options
Diffstat (limited to 'intern')
-rw-r--r-- | intern/cycles/device/device_cuda.cpp | 358 |
1 files changed, 200 insertions, 158 deletions
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index c951364b53a..278fff02ae1 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -553,7 +553,7 @@ public: } } - void generic_alloc(device_memory& mem) + void generic_alloc(device_memory& mem, size_t padding = 0) { CUDAContextScope scope(this); @@ -565,7 +565,7 @@ public: CUdeviceptr device_pointer; size_t size = mem.memory_size(); - cuda_assert(cuMemAlloc(&device_pointer, size)); + cuda_assert(cuMemAlloc(&device_pointer, size + padding)); mem.device_pointer = (device_ptr)device_pointer; mem.device_size = size; stats.mem_alloc(size); @@ -732,30 +732,8 @@ public: filter_mode = CU_TR_FILTER_MODE_LINEAR; } - /* General variables for Fermi */ - CUtexref texref = NULL; - - if(has_fermi_limits && mem.interpolation != INTERPOLATION_NONE) { - if(mem.data_depth > 1) { - /* Kernel uses different bind names for 2d and 3d float textures, - * so we have to adjust couple of things here. - */ - vector<string> tokens; - string_split(tokens, mem.name, "_"); - bind_name = string_printf("__tex_image_%s_3d_%s", - tokens[2].c_str(), - tokens[3].c_str()); - } - - cuda_assert(cuModuleGetTexRef(&texref, cuModule, bind_name.c_str())); - - if(!texref) { - return; - } - } - + /* Data Storage */ if(mem.interpolation == INTERPOLATION_NONE) { - /* Data Storage */ generic_alloc(mem); generic_copy_to(mem); @@ -774,179 +752,243 @@ public: uint32_t ptr = (uint32_t)mem.device_pointer; cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes)); } + + tex_interp_map[mem.device_pointer] = false; + return; } - else { - /* Texture Storage */ - CUarray handle = NULL; - - CUarray_format_enum format; - switch(mem.data_type) { - case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break; - case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break; - case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break; - case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break; - case TYPE_HALF: format = CU_AD_FORMAT_HALF; break; - default: assert(0); return; - } + /* Image Texture Storage */ + CUtexref texref = NULL; + + if(has_fermi_limits) { if(mem.data_depth > 1) { - CUDA_ARRAY3D_DESCRIPTOR desc; + /* Kernel uses different bind names for 2d and 3d float textures, + * so we have to adjust couple of things here. + */ + vector<string> tokens; + string_split(tokens, mem.name, "_"); + bind_name = string_printf("__tex_image_%s_3d_%s", + tokens[2].c_str(), + tokens[3].c_str()); + } - desc.Width = mem.data_width; - desc.Height = mem.data_height; - desc.Depth = mem.data_depth; - desc.Format = format; - desc.NumChannels = mem.data_elements; - desc.Flags = 0; + cuda_assert(cuModuleGetTexRef(&texref, cuModule, bind_name.c_str())); - cuda_assert(cuArray3DCreate(&handle, &desc)); + if(!texref) { + return; } - else { - CUDA_ARRAY_DESCRIPTOR desc; + } - desc.Width = mem.data_width; - desc.Height = mem.data_height; - desc.Format = format; - desc.NumChannels = mem.data_elements; + CUarray_format_enum format; + switch(mem.data_type) { + case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break; + case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break; + case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break; + case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break; + case TYPE_HALF: format = CU_AD_FORMAT_HALF; break; + default: assert(0); return; + } - cuda_assert(cuArrayCreate(&handle, &desc)); - } - if(!handle) { + CUarray array_3d = NULL; + size_t src_pitch = mem.data_width * dsize * mem.data_elements; + size_t dst_pitch = src_pitch; + + if(mem.data_depth > 1) { + /* 3D texture using array, there is no API for linear memory. */ + CUDA_ARRAY3D_DESCRIPTOR desc; + + desc.Width = mem.data_width; + desc.Height = mem.data_height; + desc.Depth = mem.data_depth; + desc.Format = format; + desc.NumChannels = mem.data_elements; + desc.Flags = 0; + + cuda_assert(cuArray3DCreate(&array_3d, &desc)); + + if(!array_3d) { return; } - /* Allocate 3D, 2D or 1D memory */ - if(mem.data_depth > 1) { - CUDA_MEMCPY3D param; - memset(¶m, 0, sizeof(param)); - param.dstMemoryType = CU_MEMORYTYPE_ARRAY; - param.dstArray = handle; - param.srcMemoryType = CU_MEMORYTYPE_HOST; - param.srcHost = (void*)mem.data_pointer; - param.srcPitch = mem.data_width*dsize*mem.data_elements; - param.WidthInBytes = param.srcPitch; - param.Height = mem.data_height; - param.Depth = mem.data_depth; - - cuda_assert(cuMemcpy3D(¶m)); - } - else if(mem.data_height > 1) { - CUDA_MEMCPY2D param; - memset(¶m, 0, sizeof(param)); - param.dstMemoryType = CU_MEMORYTYPE_ARRAY; - param.dstArray = handle; - param.srcMemoryType = CU_MEMORYTYPE_HOST; - param.srcHost = (void*)mem.data_pointer; - param.srcPitch = mem.data_width*dsize*mem.data_elements; - param.WidthInBytes = param.srcPitch; - param.Height = mem.data_height; - - cuda_assert(cuMemcpy2D(¶m)); - } - else - cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size)); + CUDA_MEMCPY3D param; + memset(¶m, 0, sizeof(param)); + param.dstMemoryType = CU_MEMORYTYPE_ARRAY; + param.dstArray = array_3d; + param.srcMemoryType = CU_MEMORYTYPE_HOST; + param.srcHost = (void*)mem.data_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + param.Depth = mem.data_depth; - /* Fermi and Kepler */ - mem.device_pointer = (device_ptr)handle; - mem.device_size = size; + cuda_assert(cuMemcpy3D(¶m)); + mem.device_pointer = (device_ptr)array_3d; + mem.device_size = size; stats.mem_alloc(size); + } + else if(mem.data_height > 1) { + /* 2D texture, using pitch aligned linear memory. */ + int alignment = 0; + cuda_assert(cuDeviceGetAttribute(&alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice)); + dst_pitch = align_up(src_pitch, alignment); + size_t dst_size = dst_pitch * mem.data_height; - if(!has_fermi_limits) { - /* Bindless Textures - Kepler */ - int flat_slot = 0; - if(string_startswith(mem.name, "__tex_image")) { - int pos = string(mem.name).rfind("_"); - flat_slot = atoi(mem.name + pos + 1); - } - else { - assert(0); - } + generic_alloc(mem, dst_size - mem.memory_size()); + + CUDA_MEMCPY2D param; + memset(¶m, 0, sizeof(param)); + param.dstMemoryType = CU_MEMORYTYPE_DEVICE; + param.dstDevice = mem.device_pointer; + param.dstPitch = dst_pitch; + param.srcMemoryType = CU_MEMORYTYPE_HOST; + param.srcHost = (void*)mem.data_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; - CUDA_RESOURCE_DESC resDesc; - memset(&resDesc, 0, sizeof(resDesc)); + cuda_assert(cuMemcpy2DUnaligned(¶m)); + } + else { + /* 1D texture, using linear memory. */ + generic_alloc(mem); + cuda_assert(cuMemcpyHtoD(mem.device_pointer, (void*)mem.data_pointer, size)); + } + + if(!has_fermi_limits) { + /* Kepler+, bindless textures. */ + int flat_slot = 0; + if(string_startswith(mem.name, "__tex_image")) { + int pos = string(mem.name).rfind("_"); + flat_slot = atoi(mem.name + pos + 1); + } + else { + assert(0); + } + + CUDA_RESOURCE_DESC resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + + if(mem.data_depth > 1) { resDesc.resType = CU_RESOURCE_TYPE_ARRAY; - resDesc.res.array.hArray = handle; + resDesc.res.array.hArray = array_3d; resDesc.flags = 0; + } + else if(mem.data_height > 1) { + resDesc.resType = CU_RESOURCE_TYPE_PITCH2D; + resDesc.res.pitch2D.devPtr = mem.device_pointer; + resDesc.res.pitch2D.format = format; + resDesc.res.pitch2D.numChannels = mem.data_elements; + resDesc.res.pitch2D.height = mem.data_height; + resDesc.res.pitch2D.width = mem.data_width; + resDesc.res.pitch2D.pitchInBytes = dst_pitch; + } + else { + resDesc.resType = CU_RESOURCE_TYPE_LINEAR; + resDesc.res.linear.devPtr = mem.device_pointer; + resDesc.res.linear.format = format; + resDesc.res.linear.numChannels = mem.data_elements; + resDesc.res.linear.sizeInBytes = mem.device_size; + } - CUDA_TEXTURE_DESC texDesc; - memset(&texDesc, 0, sizeof(texDesc)); - texDesc.addressMode[0] = address_mode; - texDesc.addressMode[1] = address_mode; - texDesc.addressMode[2] = address_mode; - texDesc.filterMode = filter_mode; - texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; + CUDA_TEXTURE_DESC texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.addressMode[0] = address_mode; + texDesc.addressMode[1] = address_mode; + texDesc.addressMode[2] = address_mode; + texDesc.filterMode = filter_mode; + texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; - CUtexObject tex = 0; - cuda_assert(cuTexObjectCreate(&tex, &resDesc, &texDesc, NULL)); + CUtexObject tex = 0; + cuda_assert(cuTexObjectCreate(&tex, &resDesc, &texDesc, NULL)); - /* Safety check */ - if((uint)tex > UINT_MAX) { - assert(0); - } + /* Safety check */ + if((uint)tex > UINT_MAX) { + assert(0); + } - /* Resize once */ - if(flat_slot >= texture_info.size()) { - /* Allocate some slots in advance, to reduce amount - * of re-allocations. */ - texture_info.resize(flat_slot + 128); - } + /* Resize once */ + if(flat_slot >= texture_info.size()) { + /* Allocate some slots in advance, to reduce amount + * of re-allocations. */ + texture_info.resize(flat_slot + 128); + } - /* Set Mapping and tag that we need to (re-)upload to device */ - TextureInfo& info = texture_info[flat_slot]; - info.data = (uint64_t)tex; - info.cl_buffer = 0; - info.interpolation = mem.interpolation; - info.extension = mem.extension; - info.width = mem.data_width; - info.height = mem.data_height; - info.depth = mem.data_depth; - - tex_bindless_map[mem.device_pointer] = tex; - need_texture_info = true; + /* Set Mapping and tag that we need to (re-)upload to device */ + TextureInfo& info = texture_info[flat_slot]; + info.data = (uint64_t)tex; + info.cl_buffer = 0; + info.interpolation = mem.interpolation; + info.extension = mem.extension; + info.width = mem.data_width; + info.height = mem.data_height; + info.depth = mem.data_depth; + + tex_bindless_map[mem.device_pointer] = tex; + need_texture_info = true; + } + else { + /* Fermi, fixed texture slots. */ + if(mem.data_depth > 1) { + cuda_assert(cuTexRefSetArray(texref, array_3d, CU_TRSA_OVERRIDE_FORMAT)); + } + else if(mem.data_height > 1) { + CUDA_ARRAY_DESCRIPTOR array_desc; + array_desc.Format = format; + array_desc.Height = mem.data_height; + array_desc.Width = mem.data_width; + array_desc.NumChannels = mem.data_elements; + cuda_assert(cuTexRefSetAddress2D_v3(texref, &array_desc, mem.device_pointer, dst_pitch)); } else { - /* Regular Textures - Fermi */ - cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT)); - cuda_assert(cuTexRefSetFilterMode(texref, filter_mode)); - cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES)); - - cuda_assert(cuTexRefSetAddressMode(texref, 0, address_mode)); - cuda_assert(cuTexRefSetAddressMode(texref, 1, address_mode)); - if(mem.data_depth > 1) { - cuda_assert(cuTexRefSetAddressMode(texref, 2, address_mode)); - } + cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size)); + } - cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements)); + /* Attach to texture reference. */ + cuda_assert(cuTexRefSetFilterMode(texref, filter_mode)); + cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES)); + cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements)); + cuda_assert(cuTexRefSetAddressMode(texref, 0, address_mode)); + cuda_assert(cuTexRefSetAddressMode(texref, 1, address_mode)); + if(mem.data_depth > 1) { + cuda_assert(cuTexRefSetAddressMode(texref, 2, address_mode)); } } /* Fermi and Kepler */ - tex_interp_map[mem.device_pointer] = (mem.interpolation != INTERPOLATION_NONE); + tex_interp_map[mem.device_pointer] = true; } void tex_free(device_memory& mem) { if(mem.device_pointer) { - if(tex_interp_map[mem.device_pointer]) { + bool interp = tex_interp_map[mem.device_pointer]; + tex_interp_map.erase(tex_interp_map.find(mem.device_pointer)); + + if(interp) { CUDAContextScope scope(this); - cuArrayDestroy((CUarray)mem.device_pointer); - /* Free CUtexObject (Bindless Textures) */ - if(!info.has_fermi_limits && tex_bindless_map[mem.device_pointer]) { - CUtexObject tex = tex_bindless_map[mem.device_pointer]; - cuTexObjectDestroy(tex); + if(!info.has_fermi_limits) { + /* Free bindless texture. */ + if(tex_bindless_map[mem.device_pointer]) { + CUtexObject tex = tex_bindless_map[mem.device_pointer]; + cuTexObjectDestroy(tex); + } } - tex_interp_map.erase(tex_interp_map.find(mem.device_pointer)); - mem.device_pointer = 0; - - stats.mem_free(mem.device_size); - mem.device_size = 0; + if(mem.data_depth > 1) { + /* Free array. */ + cuArrayDestroy((CUarray)mem.device_pointer); + stats.mem_free(mem.device_size); + mem.device_pointer = 0; + mem.device_size = 0; + } + else { + generic_free(mem); + } } else { - tex_interp_map.erase(tex_interp_map.find(mem.device_pointer)); generic_free(mem); } } @@ -2174,8 +2216,8 @@ void device_cuda_info(vector<DeviceInfo>& devices) info.num = num; info.advanced_shading = (major >= 2); - info.has_fermi_limits = (major < 3); - info.has_half_images = true; + info.has_fermi_limits = !(major >= 3); + info.has_half_images = (major >= 3); info.has_volume_decoupled = false; info.has_qbvh = false; |