Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Ton Roosendaal <ton@blender.org> 2011-04-27 15:58:34 +0400
committer: Ton Roosendaal <ton@blender.org> 2011-04-27 15:58:34 +0400
commit: da376e0237517543aa21740ee2363234ee1c20ae (patch)
tree: 014a513ed8d0eccc5e54fef42347781e85bae56a /intern/cycles/device/device_cuda.cpp
parent: 693780074388111e7b9ef1c3825e462f398dc6c4 (diff)
Cycles render engine, initial commit. This is the engine itself, blender modifications and build instructions will follow later.
Cycles uses code from some great open source projects, many thanks to them: * BVH building and traversal code from NVidia's "Understanding the Efficiency of Ray Traversal on GPUs": http://code.google.com/p/understanding-the-efficiency-of-ray-traversal-on-gpus/ * Open Shading Language for a large part of the shading system: http://code.google.com/p/openshadinglanguage/ * Blender for procedural textures and a few other nodes. * Approximate Catmull Clark subdivision from NVidia Mesh tools: http://code.google.com/p/nvidia-mesh-tools/ * Sobol direction vectors from: http://web.maths.unsw.edu.au/~fkuo/sobol/ * Film response functions from: http://www.cs.columbia.edu/CAVE/software/softlib/dorf.php
Diffstat (limited to 'intern/cycles/device/device_cuda.cpp')
-rw-r--r-- intern/cycles/device/device_cuda.cpp 682
1 files changed, 682 insertions, 0 deletions
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
new file mode 100644
index 00000000000..76692ba8657
--- /dev/null
+++ b/intern/cycles/device/device_cuda.cpp
@@ -0,0 +1,682 @@
+/*
+ * Copyright 2011, Blender Foundation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "device.h"
+#include "device_intern.h"
+
+#include "util_cuda.h"
+#include "util_debug.h"
+#include "util_map.h"
+#include "util_opengl.h"
+#include "util_path.h"
+#include "util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+class CUDADevice : public Device
+{
+public:
+ /* CUDA driver API handles for this device; the constructor fills them in
+  * through cuDeviceGet(), cuCtxCreate()/cuGLCtxCreate() and cuModuleLoad(). */
+ CUdevice cuDevice;
+ CUcontext cuContext;
+ CUmodule cuModule;
+ /* For every texture allocation, keyed by its device_pointer: true when
+  * tex_alloc() stored it in a CUarray, false when it used linear memory.
+  * tex_free() reads this to choose the matching release call. */
+ map<device_ptr, bool> tex_interp_map;
+ /* Device ordinal handed to cuDeviceGet(); the constructor sets it to 0. */
+ int cuDevId;
+
+ /* OpenGL / CUDA interop state for one display pixel buffer. Instances are
+  * created by pixels_alloc() when the device is not in background mode. */
+ struct PixelMem {
+ /* OpenGL pixel buffer object created with glGenBuffers() */
+ GLuint cuPBO;
+ /* CUDA graphics resource obtained by registering cuPBO */
+ CUgraphicsResource cuPBOresource;
+ /* OpenGL texture that draw_pixels() fills from the PBO */
+ GLuint cuTexId;
+ /* dimensions copied from device_memory::data_width / data_height */
+ int w, h;
+ };
+
+ /* PixelMem records keyed by device_memory::device_pointer, which
+  * pixels_alloc() sets to the OpenGL texture id. */
+ map<device_ptr, PixelMem> pixel_mem_map;
+
+ /* Reinterpret a generic device_ptr as a CUDA driver API device address. */
+ CUdeviceptr cuda_device_ptr(device_ptr mem)
+ {
+     CUdeviceptr address = (CUdeviceptr)mem;
+
+     return address;
+ }
+
+ /* Translate a driver API result code into a short English description.
+  * Codes that are not listed map to "Unknown CUDA error value". The returned
+  * pointer refers to a string literal and must not be freed. */
+ const char *cuda_error_string(CUresult result)
+ {
+     static const struct {
+         CUresult code;
+         const char *text;
+     } messages[] = {
+         {CUDA_SUCCESS, "No errors"},
+         {CUDA_ERROR_INVALID_VALUE, "Invalid value"},
+         {CUDA_ERROR_OUT_OF_MEMORY, "Out of memory"},
+         {CUDA_ERROR_NOT_INITIALIZED, "Driver not initialized"},
+         {CUDA_ERROR_DEINITIALIZED, "Driver deinitialized"},
+
+         {CUDA_ERROR_NO_DEVICE, "No CUDA-capable device available"},
+         {CUDA_ERROR_INVALID_DEVICE, "Invalid device"},
+
+         {CUDA_ERROR_INVALID_IMAGE, "Invalid kernel image"},
+         {CUDA_ERROR_INVALID_CONTEXT, "Invalid context"},
+         {CUDA_ERROR_CONTEXT_ALREADY_CURRENT, "Context already current"},
+         {CUDA_ERROR_MAP_FAILED, "Map failed"},
+         {CUDA_ERROR_UNMAP_FAILED, "Unmap failed"},
+         {CUDA_ERROR_ARRAY_IS_MAPPED, "Array is mapped"},
+         {CUDA_ERROR_ALREADY_MAPPED, "Already mapped"},
+         {CUDA_ERROR_NO_BINARY_FOR_GPU, "No binary for GPU"},
+         {CUDA_ERROR_ALREADY_ACQUIRED, "Already acquired"},
+         {CUDA_ERROR_NOT_MAPPED, "Not mapped"},
+         {CUDA_ERROR_NOT_MAPPED_AS_ARRAY, "Mapped resource not available for access as an array"},
+         {CUDA_ERROR_NOT_MAPPED_AS_POINTER, "Mapped resource not available for access as a pointer"},
+         {CUDA_ERROR_ECC_UNCORRECTABLE, "Uncorrectable ECC error detected"},
+         {CUDA_ERROR_UNSUPPORTED_LIMIT, "CUlimit not supported by device"},
+
+         {CUDA_ERROR_INVALID_SOURCE, "Invalid source"},
+         {CUDA_ERROR_FILE_NOT_FOUND, "File not found"},
+         {CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, "Link to a shared object failed to resolve"},
+         {CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, "Shared object initialization failed"},
+
+         {CUDA_ERROR_INVALID_HANDLE, "Invalid handle"},
+
+         {CUDA_ERROR_NOT_FOUND, "Not found"},
+
+         {CUDA_ERROR_NOT_READY, "CUDA not ready"},
+
+         {CUDA_ERROR_LAUNCH_FAILED, "Launch failed"},
+         {CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, "Launch exceeded resources"},
+         {CUDA_ERROR_LAUNCH_TIMEOUT, "Launch exceeded timeout"},
+         {CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, "Launch with incompatible texturing"},
+
+         {CUDA_ERROR_UNKNOWN, "Unknown error"}
+     };
+     const size_t num_messages = sizeof(messages)/sizeof(messages[0]);
+
+     /* linear search; the table is small and this only runs on error paths */
+     for(size_t i = 0; i < num_messages; i++)
+         if(messages[i].code == result)
+             return messages[i].text;
+
+     return "Unknown CUDA error value";
+ }
+
+ /* Round `offset` up to the next multiple of `alignment` and return it.
+  * `alignment` must be a power of two, because the rounding is done with a
+  * bit mask. The argument is taken by value: it is never written to, so
+  * callers assign the result themselves. */
+ static int cuda_align_up(int offset, int alignment)
+ {
+     return (offset + alignment - 1) & ~(alignment - 1);
+ }
+
+ /* cuda_abort(): calls abort() unless NDEBUG is defined, in which case it
+  * expands to nothing and execution continues after an error. */
+#ifdef NDEBUG
+#define cuda_abort()
+#else
+#define cuda_abort() abort()
+#endif
+
+ /* cuda_assert(stmt): evaluate a driver API call once; if the result is not
+  * CUDA_SUCCESS, print the error description and the source text of the call
+  * to stderr, then invoke cuda_abort().
+  *
+  * The expansion is a plain brace block that declares a local `result`, so it
+  * is used as a statement with or without a trailing semicolon. Inside an
+  * unbraced if/else it must be written WITHOUT a semicolon, otherwise the
+  * `else` no longer attaches to the `if`. Because the call is stringified,
+  * editing the argument text also changes the printed message. */
+#define cuda_assert(stmt) \
+ { \
+ CUresult result = stmt; \
+ \
+ if(result != CUDA_SUCCESS) { \
+ fprintf(stderr, "CUDA error: %s in %s\n", cuda_error_string(result), #stmt); \
+ cuda_abort(); \
+ } \
+ }
+
+ /* Make this device's context current on the calling thread. Despite the
+  * name there is no stack: it calls cuCtxSetCurrent(), so the call does not
+  * nest with cuda_pop_context(). */
+ void cuda_push_context()
+ {
+ cuda_assert(cuCtxSetCurrent(cuContext))
+ }
+
+ /* Leave the calling thread without a current context by setting it to
+  * NULL. This does not restore whichever context was current before
+  * cuda_push_context(). */
+ void cuda_pop_context()
+ {
+ cuda_assert(cuCtxSetCurrent(NULL));
+ }
+
+ /* Set up the device: initialize the driver API, open device ordinal 0,
+  * create a context (through cuGLCtxCreate unless `background_` is true) and
+  * load the kernel module that matches the device's compute capability.
+  * The context is made non-current again before returning. */
+ CUDADevice(bool background_)
+ {
+     /* zeroed so that a failed capability query in an NDEBUG build still
+      * produces a deterministic module file name */
+     int major = 0, minor = 0;
+     background = background_;
+
+     cuDevId = 0;
+
+     /* initialize */
+     cuda_assert(cuInit(0))
+
+     /* setup device and context */
+     cuda_assert(cuDeviceGet(&cuDevice, cuDevId))
+
+     /* no semicolons here: cuda_assert expands to a brace block */
+     if(background)
+         cuda_assert(cuCtxCreate(&cuContext, 0, cuDevice))
+     else
+         cuda_assert(cuGLCtxCreate(&cuContext, 0, cuDevice))
+
+     /* open module: query the capability through the device handle and
+      * check the result like every other driver call */
+     cuda_assert(cuDeviceComputeCapability(&major, &minor, cuDevice))
+     string cubin = string_printf("lib/kernel_sm_%d%d.cubin", major, minor);
+     cuda_assert(cuModuleLoad(&cuModule, path_get(cubin).c_str()))
+
+     cuda_pop_context();
+ }
+
+ /* Make the context current and release it with cuCtxDetach(). */
+ ~CUDADevice()
+ {
+ cuda_push_context();
+ cuda_assert(cuCtxDetach(cuContext))
+ }
+
+ /* Return a display name for this device: "CUDA " followed by the name the
+  * driver reports for it. */
+ string description()
+ {
+     /* zero-filled so the result is still a valid (empty) C string when the
+      * query fails in a build where cuda_assert does not abort */
+     char deviceName[256] = "";
+
+     cuda_push_context();
+     /* the length argument must be the real size of the array, since the
+      * driver may write up to that many bytes */
+     cuda_assert(cuDeviceGetName(deviceName, (int)sizeof(deviceName), cuDevice))
+     cuda_pop_context();
+
+     return string("CUDA ") + deviceName;
+ }
+
+ /* Allocate mem.memory_size() bytes of device memory and store the resulting
+  * address in mem.device_pointer. `type` is not used by this implementation. */
+ void mem_alloc(device_memory& mem, MemoryType type)
+ {
+     CUdeviceptr device_pointer;
+
+     cuda_push_context();
+     cuda_assert(cuMemAlloc(&device_pointer, mem.memory_size()));
+     cuda_pop_context();
+
+     mem.device_pointer = (device_ptr)device_pointer;
+ }
+
+ /* Copy the whole host buffer (mem.data_pointer, mem.memory_size() bytes)
+  * to the device allocation at mem.device_pointer. */
+ void mem_copy_to(device_memory& mem)
+ {
+ cuda_push_context();
+ cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), (void*)mem.data_pointer, mem.memory_size()))
+ cuda_pop_context();
+ }
+
+ /* Copy `size` bytes from the device allocation back to the host buffer.
+  * `offset` is a byte offset applied to both the device source and the host
+  * destination. */
+ void mem_copy_from(device_memory& mem, size_t offset, size_t size)
+ {
+     cuda_push_context();
+     /* offset the device address as an integer instead of round-tripping it
+      * through a host pointer type */
+     cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset,
+         cuda_device_ptr(mem.device_pointer) + offset, size))
+     cuda_pop_context();
+ }
+
+ /* Zero both copies of the buffer: the host memory with memset() and the
+  * device allocation with cuMemsetD8(), each for mem.memory_size() bytes. */
+ void mem_zero(device_memory& mem)
+ {
+ memset((void*)mem.data_pointer, 0, mem.memory_size());
+
+ cuda_push_context();
+ cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()))
+ cuda_pop_context();
+ }
+
+ /* Free the device allocation behind `mem`, if there is one, and reset
+  * mem.device_pointer to 0. A null device pointer makes this a no-op. */
+ void mem_free(device_memory& mem)
+ {
+     if(!mem.device_pointer)
+         return;
+
+     cuda_push_context();
+     cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer)));
+     cuda_pop_context();
+
+     mem.device_pointer = 0;
+ }
+
+ /* Copy `size` bytes from `host` into the module global called `name`.
+  * The global is looked up with cuModuleGetGlobal(); an assert checks that
+  * its size equals `size`. */
+ void const_copy_to(const char *name, void *host, size_t size)
+ {
+ CUdeviceptr mem;
+ size_t bytes;
+
+ cuda_push_context();
+ cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name))
+ assert(bytes == size);
+ cuda_assert(cuMemcpyHtoD(mem, host, size))
+ cuda_pop_context();
+ }
+
+ /* Upload `mem` and bind it to the module texture reference called `name`.
+  *
+  * With `interpolation` the data goes into a CUarray and the reference is set
+  * to linear filtering with normalized coordinates; otherwise it goes into
+  * linear device memory and the reference is set to point filtering with the
+  * read-as-integer flag. `periodic` selects wrap instead of clamp addressing
+  * for dimensions 0 and 1. mem.device_pointer receives the CUarray handle or
+  * the linear address, and the choice is recorded in tex_interp_map so that
+  * tex_free() can release it correctly. */
+ void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
+ {
+ /* determine format */
+ CUarray_format_enum format;
+ size_t dsize = datatype_size(mem.data_type);
+ size_t size = mem.memory_size();
+
+ /* unsupported data types assert and return before any CUDA call is made */
+ switch(mem.data_type) {
+ case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
+ case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
+ case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
+ case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
+ default: assert(0); return;
+ }
+
+ CUtexref texref;
+
+ cuda_push_context();
+ cuda_assert(cuModuleGetTexRef(&texref, cuModule, name))
+
+ if(interpolation) {
+ CUarray handle;
+ CUDA_ARRAY_DESCRIPTOR desc;
+
+ desc.Width = mem.data_width;
+ desc.Height = mem.data_height;
+ desc.Format = format;
+ desc.NumChannels = mem.data_elements;
+
+ cuda_assert(cuArrayCreate(&handle, &desc))
+
+ /* more than one row: 2D copy with an explicit source pitch; a single
+  * row is copied with cuMemcpyHtoA instead */
+ if(mem.data_height > 1) {
+ CUDA_MEMCPY2D param;
+ memset(&param, 0, sizeof(param));
+ param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+ param.dstArray = handle;
+ param.srcMemoryType = CU_MEMORYTYPE_HOST;
+ param.srcHost = (void*)mem.data_pointer;
+ param.srcPitch = mem.data_width*dsize*mem.data_elements;
+ param.WidthInBytes = param.srcPitch;
+ param.Height = mem.data_height;
+
+ cuda_assert(cuMemcpy2D(&param))
+ }
+ else
+ cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size))
+
+ cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT))
+
+ cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR))
+ cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES))
+
+ mem.device_pointer = (device_ptr)handle;
+ }
+ else {
+ /* mem_alloc() and mem_copy_to() set and clear the current context
+  * themselves, so release it around them and re-acquire it afterwards */
+ cuda_pop_context();
+
+ mem_alloc(mem, MEM_READ_ONLY);
+ mem_copy_to(mem);
+
+ cuda_push_context();
+
+ cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size))
+ cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT))
+ cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER))
+ }
+
+ if(periodic) {
+ cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_WRAP))
+ cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_WRAP))
+ }
+ else {
+ cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP))
+ cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP))
+ }
+ cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements))
+
+ cuda_pop_context();
+
+ /* keyed by whichever value was stored in mem.device_pointer above */
+ tex_interp_map[mem.device_pointer] = interpolation;
+ }
+
+ /* Release a texture allocated by tex_alloc() and drop its tex_interp_map
+  * entry. CUarray-backed textures are destroyed with cuArrayDestroy();
+  * everything else is released through mem_free(). mem.device_pointer is 0
+  * afterwards; a null device pointer makes this a no-op. */
+ void tex_free(device_memory& mem)
+ {
+     if(!mem.device_pointer)
+         return;
+
+     /* single lookup; a pointer that was never registered is treated as
+      * linear memory, which is what a default-constructed entry meant */
+     bool interpolation = false;
+     map<device_ptr, bool>::iterator it = tex_interp_map.find(mem.device_pointer);
+
+     if(it != tex_interp_map.end()) {
+         interpolation = it->second;
+         tex_interp_map.erase(it);
+     }
+
+     if(interpolation) {
+         cuda_push_context();
+         /* checked like every other driver call in this file */
+         cuda_assert(cuArrayDestroy((CUarray)mem.device_pointer))
+         cuda_pop_context();
+
+         mem.device_pointer = 0;
+     }
+     else
+         mem_free(mem);
+ }
+
+ /* Queue a launch of the module function "kernel_cuda_path_trace" for `task`.
+  *
+  * Kernel parameters are written at increasing byte offsets in this order:
+  * buffer pointer, rng_state pointer, pass, x, y, w, h. The grid is sized so
+  * that blocks of xthreads x ythreads threads cover task.w x task.h.
+  * This function does not synchronize; task_wait() calls cuCtxSynchronize(). */
+ void path_trace(DeviceTask& task)
+ {
+ cuda_push_context();
+
+ CUfunction cuPathTrace;
+ CUdeviceptr d_buffer = cuda_device_ptr(task.buffer);
+ CUdeviceptr d_rng_state = cuda_device_ptr(task.rng_state);
+
+ /* get kernel function */
+ cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"))
+
+ /* pass in parameters */
+ int offset = 0;
+
+ cuda_assert(cuParamSetv(cuPathTrace, offset, &d_buffer, sizeof(d_buffer)))
+ offset += sizeof(d_buffer);
+
+ cuda_assert(cuParamSetv(cuPathTrace, offset, &d_rng_state, sizeof(d_rng_state)))
+ offset += sizeof(d_rng_state);
+
+ /* align the offset for the first of the task fields that follow the pointers */
+ offset = cuda_align_up(offset, __alignof(task.pass));
+
+ cuda_assert(cuParamSeti(cuPathTrace, offset, task.pass))
+ offset += sizeof(task.pass);
+
+ cuda_assert(cuParamSeti(cuPathTrace, offset, task.x))
+ offset += sizeof(task.x);
+
+ cuda_assert(cuParamSeti(cuPathTrace, offset, task.y))
+ offset += sizeof(task.y);
+
+ cuda_assert(cuParamSeti(cuPathTrace, offset, task.w))
+ offset += sizeof(task.w);
+
+ cuda_assert(cuParamSeti(cuPathTrace, offset, task.h))
+ offset += sizeof(task.h);
+
+ /* total size of the parameter block */
+ cuda_assert(cuParamSetSize(cuPathTrace, offset))
+
+ /* launch kernel: todo find optimal size, cache config for fermi */
+#ifndef __APPLE__
+ int xthreads = 16;
+ int ythreads = 16;
+#else
+ int xthreads = 8;
+ int ythreads = 8;
+#endif
+ /* round up so partially filled blocks at the right/bottom edge are included */
+ int xblocks = (task.w + xthreads - 1)/xthreads;
+ int yblocks = (task.h + ythreads - 1)/ythreads;
+
+ cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1))
+ cuda_assert(cuFuncSetBlockShape(cuPathTrace, xthreads, ythreads, 1))
+ cuda_assert(cuLaunchGrid(cuPathTrace, xblocks, yblocks))
+
+ cuda_pop_context();
+ }
+
+ /* Queue a launch of the module function "kernel_cuda_tonemap" for `task`.
+  *
+  * The rgba destination is obtained through map_pixels() and released with
+  * unmap_pixels() right after the launch has been issued. Kernel parameters
+  * are written at increasing byte offsets in this order: rgba pointer,
+  * buffer pointer, pass, resolution, x, y, w, h. */
+ void tonemap(DeviceTask& task)
+ {
+ cuda_push_context();
+
+ CUfunction cuFilmConvert;
+ /* map_pixels() relies on the context made current above */
+ CUdeviceptr d_rgba = map_pixels(task.rgba);
+ CUdeviceptr d_buffer = cuda_device_ptr(task.buffer);
+
+ /* get kernel function */
+ cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_tonemap"))
+
+ /* pass in parameters */
+ int offset = 0;
+
+ cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_rgba, sizeof(d_rgba)))
+ offset += sizeof(d_rgba);
+
+ cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_buffer, sizeof(d_buffer)))
+ offset += sizeof(d_buffer);
+
+ /* align the offset for the first of the task fields that follow the pointers */
+ offset = cuda_align_up(offset, __alignof(task.pass));
+
+ cuda_assert(cuParamSeti(cuFilmConvert, offset, task.pass))
+ offset += sizeof(task.pass);
+
+ cuda_assert(cuParamSeti(cuFilmConvert, offset, task.resolution))
+ offset += sizeof(task.resolution);
+
+ cuda_assert(cuParamSeti(cuFilmConvert, offset, task.x))
+ offset += sizeof(task.x);
+
+ cuda_assert(cuParamSeti(cuFilmConvert, offset, task.y))
+ offset += sizeof(task.y);
+
+ cuda_assert(cuParamSeti(cuFilmConvert, offset, task.w))
+ offset += sizeof(task.w);
+
+ cuda_assert(cuParamSeti(cuFilmConvert, offset, task.h))
+ offset += sizeof(task.h);
+
+ /* total size of the parameter block */
+ cuda_assert(cuParamSetSize(cuFilmConvert, offset))
+
+ /* launch kernel: todo find optimal size, cache config for fermi */
+#ifndef __APPLE__
+ int xthreads = 16;
+ int ythreads = 16;
+#else
+ int xthreads = 8;
+ int ythreads = 8;
+#endif
+ /* round up so partially filled blocks at the right/bottom edge are included */
+ int xblocks = (task.w + xthreads - 1)/xthreads;
+ int yblocks = (task.h + ythreads - 1)/ythreads;
+
+ cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1))
+ cuda_assert(cuFuncSetBlockShape(cuFilmConvert, xthreads, ythreads, 1))
+ cuda_assert(cuLaunchGrid(cuFilmConvert, xblocks, yblocks))
+
+ unmap_pixels(task.rgba);
+
+ cuda_pop_context();
+ }
+
+ /* Queue a launch of the module function "kernel_cuda_displace" for `task`.
+  *
+  * Kernel parameters are written at increasing byte offsets in this order:
+  * displace_input pointer, displace_offset pointer, displace_x. The launch is
+  * one-dimensional, with enough blocks of xthreads threads to cover
+  * task.displace_w. */
+ void displace(DeviceTask& task)
+ {
+ cuda_push_context();
+
+ CUfunction cuDisplace;
+ CUdeviceptr d_input = cuda_device_ptr(task.displace_input);
+ CUdeviceptr d_offset = cuda_device_ptr(task.displace_offset);
+
+ /* get kernel function */
+ cuda_assert(cuModuleGetFunction(&cuDisplace, cuModule, "kernel_cuda_displace"))
+
+ /* pass in parameters */
+ int offset = 0;
+
+ cuda_assert(cuParamSetv(cuDisplace, offset, &d_input, sizeof(d_input)))
+ offset += sizeof(d_input);
+
+ cuda_assert(cuParamSetv(cuDisplace, offset, &d_offset, sizeof(d_offset)))
+ offset += sizeof(d_offset);
+
+ /* align the offset for the task field that follows the pointers */
+ offset = cuda_align_up(offset, __alignof(task.displace_x));
+
+ cuda_assert(cuParamSeti(cuDisplace, offset, task.displace_x))
+ offset += sizeof(task.displace_x);
+
+ /* total size of the parameter block */
+ cuda_assert(cuParamSetSize(cuDisplace, offset))
+
+ /* launch kernel: todo find optimal size, cache config for fermi */
+#ifndef __APPLE__
+ int xthreads = 16;
+#else
+ int xthreads = 8;
+#endif
+ /* round up so a partially filled last block is included */
+ int xblocks = (task.displace_w + xthreads - 1)/xthreads;
+
+ cuda_assert(cuFuncSetCacheConfig(cuDisplace, CU_FUNC_CACHE_PREFER_L1))
+ cuda_assert(cuFuncSetBlockShape(cuDisplace, xthreads, 1, 1))
+ cuda_assert(cuLaunchGrid(cuDisplace, xblocks, 1))
+
+ cuda_pop_context();
+ }
+
+ /* Return a device address through which kernels can write the pixel buffer
+  * identified by `mem`. In background mode `mem` already is that address.
+  * Otherwise the registered pixel buffer object is mapped and its mapped
+  * pointer returned; pair the call with unmap_pixels(). This function does
+  * not make the context current itself. */
+ CUdeviceptr map_pixels(device_ptr mem)
+ {
+     if(background)
+         return cuda_device_ptr(mem);
+
+     CUdeviceptr buffer;
+     size_t bytes;
+     PixelMem pmem = pixel_mem_map[mem];
+
+     cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0));
+     cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource));
+
+     return buffer;
+ }
+
+ /* Undo map_pixels(): unmap the graphics resource registered for `mem`.
+  * Nothing is mapped in background mode, so nothing is done there. */
+ void unmap_pixels(device_ptr mem)
+ {
+     if(background)
+         return;
+
+     PixelMem pmem = pixel_mem_map[mem];
+
+     cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0));
+ }
+
+ /* Allocate the display pixel buffer for `mem`.
+  *
+  * In background mode this defers to Device::pixels_alloc(). Otherwise it
+  * creates an OpenGL pixel buffer object and an RGBA texture of
+  * data_width x data_height, registers the buffer with CUDA, stores the
+  * texture id in mem.device_pointer and records everything in
+  * pixel_mem_map under that key. */
+ void pixels_alloc(device_memory& mem)
+ {
+     if(background) {
+         Device::pixels_alloc(mem);
+         return;
+     }
+
+     PixelMem pmem;
+
+     pmem.w = mem.data_width;
+     pmem.h = mem.data_height;
+
+     cuda_push_context();
+
+     /* the buffer holds 4 unsigned bytes per pixel, matching the RGBA /
+      * GL_UNSIGNED_BYTE layout that pixels_copy_from() and draw_pixels()
+      * read from it; the size is computed in size_t to avoid int overflow */
+     size_t pbo_size = (size_t)pmem.w*(size_t)pmem.h*sizeof(GLubyte)*4;
+
+     glGenBuffers(1, &pmem.cuPBO);
+     glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
+     glBufferData(GL_PIXEL_UNPACK_BUFFER, pbo_size, NULL, GL_DYNAMIC_DRAW);
+
+     glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+
+     glGenTextures(1, &pmem.cuTexId);
+     glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
+     glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
+     glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+     glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+     glBindTexture(GL_TEXTURE_2D, 0);
+
+     cuda_assert(cuGraphicsGLRegisterBuffer(&pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE))
+
+     cuda_pop_context();
+
+     /* the texture id doubles as the handle callers pass back in */
+     mem.device_pointer = pmem.cuTexId;
+     pixel_mem_map[mem.device_pointer] = pmem;
+ }
+
+ /* Copy `h` rows of `w` RGBA byte pixels, starting at row `y`, from the
+  * display pixel buffer into the host buffer of `mem`.
+  *
+  * In background mode this defers to Device::pixels_copy_from(). Otherwise
+  * the pixel buffer object is mapped for reading and the rows are copied
+  * with memcpy(). If the mapping fails, an error is printed to stderr and
+  * the host buffer is left untouched. */
+ void pixels_copy_from(device_memory& mem, int y, int w, int h)
+ {
+     if(background) {
+         Device::pixels_copy_from(mem, y, w, h);
+         return;
+     }
+
+     PixelMem pmem = pixel_mem_map[mem.device_pointer];
+
+     cuda_push_context();
+
+     glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
+     uchar *pixels = (uchar*)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
+
+     /* glMapBuffer returns NULL on failure; never hand that to memcpy */
+     if(pixels) {
+         /* same byte offset on both sides: 4 bytes per pixel, y rows of w pixels */
+         size_t offset = sizeof(uchar)*4*y*w;
+         memcpy((uchar*)mem.data_pointer + offset, pixels + offset, sizeof(uchar)*4*w*h);
+         glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
+     }
+     else
+         fprintf(stderr, "OpenGL error: failed to map pixel buffer for reading\n");
+
+     glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+
+     cuda_pop_context();
+ }
+
+ /* Release the display pixel buffer of `mem`; a null device pointer makes
+  * this a no-op.
+  *
+  * In background mode this defers to Device::pixels_free(). Otherwise the
+  * CUDA registration, the pixel buffer object and the texture are destroyed,
+  * the pixel_mem_map entry is removed and mem.device_pointer is reset to 0. */
+ void pixels_free(device_memory& mem)
+ {
+     if(!mem.device_pointer)
+         return;
+
+     if(background) {
+         Device::pixels_free(mem);
+         return;
+     }
+
+     PixelMem pmem = pixel_mem_map[mem.device_pointer];
+
+     cuda_push_context();
+
+     cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
+     glDeleteBuffers(1, &pmem.cuPBO);
+     glDeleteTextures(1, &pmem.cuTexId);
+
+     cuda_pop_context();
+
+     pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
+     mem.device_pointer = 0;
+ }
+
+ /* Draw the display pixel buffer of `mem` with OpenGL.
+  *
+  * In background mode this defers to Device::draw_pixels(). Otherwise the
+  * first w x h pixels of the pixel buffer object are uploaded into the
+  * texture, and a width x height quad, translated by `y`, is drawn using
+  * the w/width by h/height portion of that texture. */
+ void draw_pixels(device_memory& mem, int y, int w, int h, int width, int height)
+ {
+     if(background) {
+         Device::draw_pixels(mem, y, w, h, width, height);
+         return;
+     }
+
+     PixelMem pmem = pixel_mem_map[mem.device_pointer];
+
+     cuda_push_context();
+
+     /* with a PBO bound, the final 0 is an offset into the buffer rather
+      * than a host pointer */
+     glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pmem.cuPBO);
+     glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
+     glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, 0);
+     glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
+
+     glEnable(GL_TEXTURE_2D);
+
+     glColor3f(1.0f, 1.0f, 1.0f);
+
+     glPushMatrix();
+     glTranslatef(0, y, 0.0f);
+
+     /* texture coordinates of the far corner of the uploaded region */
+     const float u_max = (float)w/(float)width;
+     const float v_max = (float)h/(float)height;
+
+     glBegin(GL_QUADS);
+
+     glTexCoord2f(0, 0);
+     glVertex2f(0, 0);
+     glTexCoord2f(u_max, 0);
+     glVertex2f(width, 0);
+     glTexCoord2f(u_max, v_max);
+     glVertex2f(width, height);
+     glTexCoord2f(0, v_max);
+     glVertex2f(0, height);
+
+     glEnd();
+
+     glPopMatrix();
+
+     glBindTexture(GL_TEXTURE_2D, 0);
+     glDisable(GL_TEXTURE_2D);
+
+     cuda_pop_context();
+ }
+
+ /* Dispatch `task` to the method that handles its type. The work is issued
+  * immediately from the calling thread; other task types are ignored. */
+ void task_add(DeviceTask& task)
+ {
+     switch(task.type) {
+         case DeviceTask::TONEMAP:
+             tonemap(task);
+             break;
+         case DeviceTask::PATH_TRACE:
+             path_trace(task);
+             break;
+         case DeviceTask::DISPLACE:
+             displace(task);
+             break;
+         default:
+             break;
+     }
+ }
+
+ /* Wait for the work issued on this device's context to finish, by making
+  * the context current and calling cuCtxSynchronize(). */
+ void task_wait()
+ {
+ cuda_push_context();
+
+ cuda_assert(cuCtxSynchronize())
+
+ cuda_pop_context();
+ }
+
+ /* Cancellation is not implemented for this device: this does nothing. */
+ void task_cancel()
+ {
+ }
+};
+
+/* Factory for the CUDA device: returns a newly allocated CUDADevice. The
+ * `background` flag is forwarded to the CUDADevice constructor. */
+Device *device_cuda_create(bool background)
+{
+ Device *device = new CUDADevice(background);
+
+ return device;
+}
+
+CCL_NAMESPACE_END
+