diff options
author | Brecht Van Lommel <brechtvanlommel@gmail.com> | 2017-10-24 03:04:58 +0300 |
---|---|---|
committer | Brecht Van Lommel <brechtvanlommel@gmail.com> | 2017-10-24 03:05:41 +0300 |
commit | f5456df095291c6cb2d0223a179746c8e514cd15 (patch) | |
tree | 76384b0c91806c95d55b007850ee225889a3250e /intern/cycles/device/device_cuda.cpp | |
parent | 254daf8f8c276a4e5292e5a12fcfa88296131878 (diff) | |
parent | 070a668d04844610059aaedc80c49e9038fd1779 (diff) |
Merge branch 'master' into blender2.8
Diffstat (limited to 'intern/cycles/device/device_cuda.cpp')
-rw-r--r-- | intern/cycles/device/device_cuda.cpp | 305 |
1 files changed, 171 insertions, 134 deletions
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 0f057e9966f..c742e91c561 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -217,7 +217,8 @@ public: } CUDADevice(DeviceInfo& info, Stats &stats, bool background_) - : Device(info, stats, background_) + : Device(info, stats, background_), + texture_info(this, "__texture_info", MEM_TEXTURE) { first_error = true; background = background_; @@ -274,7 +275,7 @@ public: delete split_kernel; if(info.has_bindless_textures) { - tex_free(texture_info); + texture_info.free(); } cuda_assert(cuCtxDestroy(cuContext)); @@ -547,20 +548,19 @@ public: void load_texture_info() { if(info.has_bindless_textures && need_texture_info) { - tex_free(texture_info); - tex_alloc("__texture_info", texture_info, INTERPOLATION_NONE, EXTENSION_REPEAT); + texture_info.copy_to_device(); need_texture_info = false; } } - void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/) + void generic_alloc(device_memory& mem) { CUDAContextScope scope(this); - if(name) { - VLOG(1) << "Buffer allocate: " << name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; + if(mem.name) { + VLOG(1) << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; } CUdeviceptr device_pointer; @@ -571,31 +571,88 @@ public: stats.mem_alloc(size); } + void generic_copy_to(device_memory& mem) + { + if(mem.device_pointer) { + CUDAContextScope scope(this); + cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), (void*)mem.data_pointer, mem.memory_size())); + } + } + + void generic_free(device_memory& mem) + { + if(mem.device_pointer) { + CUDAContextScope scope(this); + + cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer))); + + mem.device_pointer = 0; + + stats.mem_free(mem.device_size); + mem.device_size = 0; + } + } + + void mem_alloc(device_memory& mem) + { + if(mem.type == MEM_PIXELS && !background) { + pixels_alloc(mem); + } + else if(mem.type == MEM_TEXTURE) { + assert(!"mem_alloc not supported for textures."); + } + else { + generic_alloc(mem); + } + } + void mem_copy_to(device_memory& mem) { - CUDAContextScope scope(this); + if(mem.type == MEM_PIXELS) { + assert(!"mem_copy_to not supported for pixels."); + } + else if(mem.type == MEM_TEXTURE) { + tex_free(mem); + tex_alloc(mem); + } + else { + if(!mem.device_pointer) { + generic_alloc(mem); + } - if(mem.device_pointer) - cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), (void*)mem.data_pointer, mem.memory_size())); + generic_copy_to(mem); + } } void mem_copy_from(device_memory& mem, int y, int w, int h, int elem) { - CUDAContextScope scope(this); - size_t offset = elem*y*w; - size_t size = elem*w*h; - - if(mem.device_pointer) { - cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset, - (CUdeviceptr)(mem.device_pointer + offset), size)); + if(mem.type == MEM_PIXELS && !background) { + pixels_copy_from(mem, y, w, h); + } + else if(mem.type == MEM_TEXTURE) { + assert(!"mem_copy_from not supported for textures."); } else { - memset((char*)mem.data_pointer + offset, 0, size); + CUDAContextScope scope(this); + size_t offset = elem*y*w; + size_t size = elem*w*h; + + if(mem.device_pointer) { + cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset, + (CUdeviceptr)(mem.device_pointer + offset), size)); + } + else { + memset((char*)mem.data_pointer + offset, 0, size); + } } } void mem_zero(device_memory& mem) { + if(!mem.device_pointer) { + mem_alloc(mem); + } + if(mem.data_pointer) { memset((void*)mem.data_pointer, 0, mem.memory_size()); } @@ -608,18 +665,18 @@ public: void mem_free(device_memory& mem) { - if(mem.device_pointer) { - CUDAContextScope scope(this); - cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer))); - - mem.device_pointer = 0; - - stats.mem_free(mem.device_size); - mem.device_size = 0; + if(mem.type == MEM_PIXELS && !background) { + pixels_free(mem); + } + else if(mem.type == MEM_TEXTURE) { + tex_free(mem); + } + else { + generic_free(mem); } } - virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/, MemoryType /*type*/) + virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/) { return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset)); } @@ -635,14 +692,11 @@ public: cuda_assert(cuMemcpyHtoD(mem, host, size)); } - void tex_alloc(const char *name, - device_memory& mem, - InterpolationType interpolation, - ExtensionType extension) + void tex_alloc(device_memory& mem) { CUDAContextScope scope(this); - VLOG(1) << "Texture allocate: " << name << ", " + VLOG(1) << "Texture allocate: " << mem.name << ", " << string_human_readable_number(mem.memory_size()) << " bytes. (" << string_human_readable_size(mem.memory_size()) << ")"; @@ -650,12 +704,12 @@ public: bool has_bindless_textures = info.has_bindless_textures; /* General variables for both architectures */ - string bind_name = name; + string bind_name = mem.name; size_t dsize = datatype_size(mem.data_type); size_t size = mem.memory_size(); CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP; - switch(extension) { + switch(mem.extension) { case EXTENSION_REPEAT: address_mode = CU_TR_ADDRESS_MODE_WRAP; break; @@ -671,7 +725,7 @@ public: } CUfilter_mode filter_mode; - if(interpolation == INTERPOLATION_CLOSEST) { + if(mem.interpolation == INTERPOLATION_CLOSEST) { filter_mode = CU_TR_FILTER_MODE_POINT; } else { @@ -681,13 +735,13 @@ public: /* General variables for Fermi */ CUtexref texref = NULL; - if(!has_bindless_textures && interpolation != INTERPOLATION_NONE) { + if(!has_bindless_textures && mem.interpolation != INTERPOLATION_NONE) { if(mem.data_depth > 1) { /* Kernel uses different bind names for 2d and 3d float textures, * so we have to adjust couple of things here. */ vector<string> tokens; - string_split(tokens, name, "_"); + string_split(tokens, mem.name, "_"); bind_name = string_printf("__tex_image_%s_3d_%s", tokens[2].c_str(), tokens[3].c_str()); @@ -700,10 +754,10 @@ public: } } - if(interpolation == INTERPOLATION_NONE) { + if(mem.interpolation == INTERPOLATION_NONE) { /* Data Storage */ - mem_alloc(NULL, mem, MEM_READ_ONLY); - mem_copy_to(mem); + generic_alloc(mem); + generic_copy_to(mem); CUdeviceptr cumem; size_t cubytes; @@ -802,9 +856,9 @@ public: if(has_bindless_textures) { /* Bindless Textures - Kepler */ int flat_slot = 0; - if(string_startswith(name, "__tex_image")) { - int pos = string(name).rfind("_"); - flat_slot = atoi(name + pos + 1); + if(string_startswith(mem.name, "__tex_image")) { + int pos = string(mem.name).rfind("_"); + flat_slot = atoi(mem.name + pos + 1); } else { assert(0); @@ -843,8 +897,8 @@ public: TextureInfo& info = texture_info[flat_slot]; info.data = (uint64_t)tex; info.cl_buffer = 0; - info.interpolation = interpolation; - info.extension = extension; + info.interpolation = mem.interpolation; + info.extension = mem.extension; info.width = mem.data_width; info.height = mem.data_height; info.depth = mem.data_depth; @@ -869,7 +923,7 @@ public: } /* Fermi and Kepler */ - tex_interp_map[mem.device_pointer] = (interpolation != INTERPOLATION_NONE); + tex_interp_map[mem.device_pointer] = (mem.interpolation != INTERPOLATION_NONE); } void tex_free(device_memory& mem) @@ -893,21 +947,19 @@ public: } else { tex_interp_map.erase(tex_interp_map.find(mem.device_pointer)); - mem_free(mem); + generic_free(mem); } } } bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task) { - mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_ONLY); - TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer; for(int i = 0; i < 9; i++) { tiles->buffers[i] = buffers[i]; } - mem_copy_to(task->tiles_mem); + task->tiles_mem.copy_to_device(); return !have_error(); } @@ -1274,7 +1326,7 @@ public: task.unmap_neighbor_tiles(rtiles, this); } - void path_trace(DeviceTask& task, RenderTile& rtile) + void path_trace(DeviceTask& task, RenderTile& rtile, device_vector<WorkTile>& work_tiles) { if(have_error()) return; @@ -1297,8 +1349,7 @@ public: cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)); /* Allocate work tile. */ - device_vector<WorkTile> work_tiles; - work_tiles.resize(1); + work_tiles.alloc(1); WorkTile *wtile = work_tiles.get_data(); wtile->x = rtile.x; @@ -1308,9 +1359,6 @@ public: wtile->offset = rtile.offset; wtile->stride = rtile.stride; wtile->buffer = (float*)cuda_device_ptr(rtile.buffer); - mem_alloc("work_tiles", work_tiles, MEM_READ_ONLY); - - CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer); /* Prepare work size. More step samples render faster, but for now we * remain conservative for GPUs connected to a display to avoid driver @@ -1331,8 +1379,9 @@ public: /* Setup and copy work tile to device. */ wtile->start_sample = sample; wtile->num_samples = min(step_samples, end_sample - sample);; - mem_copy_to(work_tiles); + work_tiles.copy_to_device(); + CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer); uint total_work_size = wtile->w * wtile->h * wtile->num_samples; uint num_blocks = divide_up(total_work_size, num_threads_per_block); @@ -1356,8 +1405,6 @@ public: break; } } - - mem_free(work_tiles); } void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half) @@ -1510,98 +1557,82 @@ public: void pixels_alloc(device_memory& mem) { - if(!background) { - PixelMem pmem; + PixelMem pmem; - pmem.w = mem.data_width; - pmem.h = mem.data_height; + pmem.w = mem.data_width; + pmem.h = mem.data_height; - CUDAContextScope scope(this); + CUDAContextScope scope(this); - glGenBuffers(1, &pmem.cuPBO); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); - if(mem.data_type == TYPE_HALF) - glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(GLhalf)*4, NULL, GL_DYNAMIC_DRAW); - else - glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(uint8_t)*4, NULL, GL_DYNAMIC_DRAW); + glGenBuffers(1, &pmem.cuPBO); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); + if(mem.data_type == TYPE_HALF) + glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(GLhalf)*4, NULL, GL_DYNAMIC_DRAW); + else + glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(uint8_t)*4, NULL, GL_DYNAMIC_DRAW); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - glGenTextures(1, &pmem.cuTexId); - glBindTexture(GL_TEXTURE_2D, pmem.cuTexId); - if(mem.data_type == TYPE_HALF) - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F_ARB, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL); - else - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); - glBindTexture(GL_TEXTURE_2D, 0); + glGenTextures(1, &pmem.cuTexId); + glBindTexture(GL_TEXTURE_2D, pmem.cuTexId); + if(mem.data_type == TYPE_HALF) + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F_ARB, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL); + else + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + glBindTexture(GL_TEXTURE_2D, 0); - CUresult result = cuGraphicsGLRegisterBuffer(&pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE); + CUresult result = cuGraphicsGLRegisterBuffer(&pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE); - if(result == CUDA_SUCCESS) { - mem.device_pointer = pmem.cuTexId; - pixel_mem_map[mem.device_pointer] = pmem; + if(result == CUDA_SUCCESS) { + mem.device_pointer = pmem.cuTexId; + pixel_mem_map[mem.device_pointer] = pmem; - mem.device_size = mem.memory_size(); - stats.mem_alloc(mem.device_size); + mem.device_size = mem.memory_size(); + stats.mem_alloc(mem.device_size); - return; - } - else { - /* failed to register buffer, fallback to no interop */ - glDeleteBuffers(1, &pmem.cuPBO); - glDeleteTextures(1, &pmem.cuTexId); - - background = true; - } + return; } + else { + /* failed to register buffer, fallback to no interop */ + glDeleteBuffers(1, &pmem.cuPBO); + glDeleteTextures(1, &pmem.cuTexId); - Device::pixels_alloc(mem); + background = true; + } } void pixels_copy_from(device_memory& mem, int y, int w, int h) { - if(!background) { - PixelMem pmem = pixel_mem_map[mem.device_pointer]; - - CUDAContextScope scope(this); - - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); - uchar *pixels = (uchar*)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY); - size_t offset = sizeof(uchar)*4*y*w; - memcpy((uchar*)mem.data_pointer + offset, pixels + offset, sizeof(uchar)*4*w*h); - glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + PixelMem pmem = pixel_mem_map[mem.device_pointer]; - return; - } + CUDAContextScope scope(this); - Device::pixels_copy_from(mem, y, w, h); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); + uchar *pixels = (uchar*)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY); + size_t offset = sizeof(uchar)*4*y*w; + memcpy((uchar*)mem.data_pointer + offset, pixels + offset, sizeof(uchar)*4*w*h); + glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); } void pixels_free(device_memory& mem) { if(mem.device_pointer) { - if(!background) { - PixelMem pmem = pixel_mem_map[mem.device_pointer]; - - CUDAContextScope scope(this); - - cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource)); - glDeleteBuffers(1, &pmem.cuPBO); - glDeleteTextures(1, &pmem.cuTexId); + PixelMem pmem = pixel_mem_map[mem.device_pointer]; - pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer)); - mem.device_pointer = 0; + CUDAContextScope scope(this); - stats.mem_free(mem.device_size); - mem.device_size = 0; + cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource)); + glDeleteBuffers(1, &pmem.cuPBO); + glDeleteTextures(1, &pmem.cuTexId); - return; - } + pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer)); + mem.device_pointer = 0; - Device::pixels_free(mem); + stats.mem_free(mem.device_size); + mem.device_size = 0; } } @@ -1611,6 +1642,8 @@ public: int dx, int dy, int dw, int dh, bool transparent, const DeviceDrawParams &draw_params) { + assert(mem.type == MEM_PIXELS); + if(!background) { const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL); PixelMem pmem = pixel_mem_map[mem.device_pointer]; @@ -1747,15 +1780,17 @@ public: } } + device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY); + /* keep rendering tiles until done */ while(task->acquire_tile(this, tile)) { if(tile.task == RenderTile::PATH_TRACE) { if(use_split_kernel()) { - device_memory void_buffer; + device_only_memory<uchar> void_buffer(this, "void_buffer"); split_kernel->path_trace(task, tile, void_buffer, void_buffer); } else { - path_trace(*task, tile); + path_trace(*task, tile, work_tiles); } } else if(tile.task == RenderTile::DENOISE) { @@ -1773,6 +1808,8 @@ public: break; } } + + work_tiles.free(); } else if(task->type == DeviceTask::SHADER) { shader(*task); @@ -1906,9 +1943,9 @@ uint64_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory { CUDAContextScope scope(device); - device_vector<uint64_t> size_buffer; - size_buffer.resize(1); - device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE); + device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE); + size_buffer.alloc(1); + size_buffer.zero_to_device(); uint threads = num_threads; CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer); @@ -1931,9 +1968,9 @@ uint64_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory 1, 1, 1, 0, 0, (void**)&args, 0)); - device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint64_t)); + size_buffer.copy_from_device(0, 1, 1); size_t size = size_buffer[0]; - device->mem_free(size_buffer); + size_buffer.free(); return size; } |