From 247b869967812891f6b77585184a3b09f3f16a18 Mon Sep 17 00:00:00 2001 From: Sergey Sharybin Date: Mon, 6 Oct 2014 14:59:26 +0200 Subject: Compositor: implement OpenCL backend for gaussian blur Pretty much straightforward change which gives around 30% speedup on my laptop and around 2x speedup on desktop in the BI (which uses gts580). Tested with huge blurs (like 10% of blur) which was rather common during Caminandes. For now OpenCL is only limited for blur size more than 100 pixels. This is a bit experimental still, feedback is welcome. Reviewers: jbakker, lukastoenne Subscribers: ton Differential Revision: https://developer.blender.org/D576 --- source/blender/compositor/nodes/COM_BlurNode.cpp | 2 + .../operations/COM_GaussianXBlurOperation.cpp | 27 ++++++++++ .../operations/COM_GaussianXBlurOperation.h | 11 +++- .../operations/COM_GaussianYBlurOperation.cpp | 27 ++++++++++ .../operations/COM_GaussianYBlurOperation.h | 11 +++- .../compositor/operations/COM_OpenCLKernels.cl | 63 ++++++++++++++++++++++ 6 files changed, 139 insertions(+), 2 deletions(-) diff --git a/source/blender/compositor/nodes/COM_BlurNode.cpp b/source/blender/compositor/nodes/COM_BlurNode.cpp index 76e52c14685..f3d0c33d3b3 100644 --- a/source/blender/compositor/nodes/COM_BlurNode.cpp +++ b/source/blender/compositor/nodes/COM_BlurNode.cpp @@ -105,6 +105,7 @@ void BlurNode::convertToOperations(NodeConverter &converter, const CompositorCon GaussianXBlurOperation *operationx = new GaussianXBlurOperation(); operationx->setData(data); operationx->setQuality(quality); + operationx->checkOpenCL(); converter.addOperation(operationx); converter.mapInputSocket(getInputSocket(1), operationx->getInputSocket(1)); @@ -112,6 +113,7 @@ void BlurNode::convertToOperations(NodeConverter &converter, const CompositorCon GaussianYBlurOperation *operationy = new GaussianYBlurOperation(); operationy->setData(data); operationy->setQuality(quality); + operationy->checkOpenCL(); converter.addOperation(operationy); converter.mapInputSocket(getInputSocket(1), operationy->getInputSocket(1)); diff --git a/source/blender/compositor/operations/COM_GaussianXBlurOperation.cpp b/source/blender/compositor/operations/COM_GaussianXBlurOperation.cpp index 0aefba3bb7c..0838d281de7 100644 --- a/source/blender/compositor/operations/COM_GaussianXBlurOperation.cpp +++ b/source/blender/compositor/operations/COM_GaussianXBlurOperation.cpp @@ -21,6 +21,7 @@ */ #include "COM_GaussianXBlurOperation.h" +#include "COM_OpenCLDevice.h" #include "BLI_math.h" #include "MEM_guardedalloc.h" @@ -124,6 +125,32 @@ void GaussianXBlurOperation::executePixel(float output[4], int x, int y, void *d mul_v4_v4fl(output, color_accum, 1.0f / multiplier_accum); } +void GaussianXBlurOperation::executeOpenCL(OpenCLDevice *device, + MemoryBuffer *outputMemoryBuffer, cl_mem clOutputBuffer, + MemoryBuffer **inputMemoryBuffers, list *clMemToCleanUp, + list *clKernelsToCleanUp) +{ + cl_kernel gaussianXBlurOperationKernel = device->COM_clCreateKernel("gaussianXBlurOperationKernel", NULL); + cl_int filter_size = this->m_filtersize; + + cl_mem gausstab = clCreateBuffer(device->getContext(), + CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, + sizeof(float) * (this->m_filtersize * 2 + 1), + this->m_gausstab, + NULL); + + device->COM_clAttachMemoryBufferToKernelParameter(gaussianXBlurOperationKernel, 0, 1, clMemToCleanUp, inputMemoryBuffers, this->m_inputProgram); + device->COM_clAttachOutputMemoryBufferToKernelParameter(gaussianXBlurOperationKernel, 2, clOutputBuffer); + device->COM_clAttachMemoryBufferOffsetToKernelParameter(gaussianXBlurOperationKernel, 3, outputMemoryBuffer); + clSetKernelArg(gaussianXBlurOperationKernel, 4, sizeof(cl_int), &filter_size); + device->COM_clAttachSizeToKernelParameter(gaussianXBlurOperationKernel, 5, this); + clSetKernelArg(gaussianXBlurOperationKernel, 6, sizeof(cl_mem), &gausstab); + + device->COM_clEnqueueRange(gaussianXBlurOperationKernel, outputMemoryBuffer, 7, this); + + clReleaseMemObject(gausstab); +} + void GaussianXBlurOperation::deinitExecution() { BlurBaseOperation::deinitExecution(); diff --git a/source/blender/compositor/operations/COM_GaussianXBlurOperation.h b/source/blender/compositor/operations/COM_GaussianXBlurOperation.h index e391320a007..d7ae8b1e3dc 100644 --- a/source/blender/compositor/operations/COM_GaussianXBlurOperation.h +++ b/source/blender/compositor/operations/COM_GaussianXBlurOperation.h @@ -40,7 +40,12 @@ public: * @brief the inner loop of this program */ void executePixel(float output[4], int x, int y, void *data); - + + void executeOpenCL(OpenCLDevice *device, + MemoryBuffer *outputMemoryBuffer, cl_mem clOutputBuffer, + MemoryBuffer **inputMemoryBuffers, list *clMemToCleanUp, + list *clKernelsToCleanUp); + /** * @brief initialize the execution */ @@ -53,5 +58,9 @@ public: void *initializeTileData(rcti *rect); bool determineDependingAreaOfInterest(rcti *input, ReadBufferOperation *readOperation, rcti *output); + + void checkOpenCL() { + this->setOpenCL(m_data.sizex >= 128); + } }; #endif diff --git a/source/blender/compositor/operations/COM_GaussianYBlurOperation.cpp b/source/blender/compositor/operations/COM_GaussianYBlurOperation.cpp index a05a1ab6a23..6172f954087 100644 --- a/source/blender/compositor/operations/COM_GaussianYBlurOperation.cpp +++ b/source/blender/compositor/operations/COM_GaussianYBlurOperation.cpp @@ -21,6 +21,7 @@ */ #include "COM_GaussianYBlurOperation.h" +#include "COM_OpenCLDevice.h" #include "BLI_math.h" #include "MEM_guardedalloc.h" @@ -126,6 +127,32 @@ void GaussianYBlurOperation::executePixel(float output[4], int x, int y, void *d mul_v4_v4fl(output, color_accum, 1.0f / multiplier_accum); } +void GaussianYBlurOperation::executeOpenCL(OpenCLDevice *device, + MemoryBuffer *outputMemoryBuffer, cl_mem clOutputBuffer, + MemoryBuffer **inputMemoryBuffers, list *clMemToCleanUp, + list *clKernelsToCleanUp) +{ + cl_kernel gaussianYBlurOperationKernel = device->COM_clCreateKernel("gaussianYBlurOperationKernel", NULL); + cl_int filter_size = this->m_filtersize; + + cl_mem gausstab = clCreateBuffer(device->getContext(), + CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, + sizeof(float) * (this->m_filtersize * 2 + 1), + this->m_gausstab, + NULL); + + device->COM_clAttachMemoryBufferToKernelParameter(gaussianYBlurOperationKernel, 0, 1, clMemToCleanUp, inputMemoryBuffers, this->m_inputProgram); + device->COM_clAttachOutputMemoryBufferToKernelParameter(gaussianYBlurOperationKernel, 2, clOutputBuffer); + device->COM_clAttachMemoryBufferOffsetToKernelParameter(gaussianYBlurOperationKernel, 3, outputMemoryBuffer); + clSetKernelArg(gaussianYBlurOperationKernel, 4, sizeof(cl_int), &filter_size); + device->COM_clAttachSizeToKernelParameter(gaussianYBlurOperationKernel, 5, this); + clSetKernelArg(gaussianYBlurOperationKernel, 6, sizeof(cl_mem), &gausstab); + + device->COM_clEnqueueRange(gaussianYBlurOperationKernel, outputMemoryBuffer, 7, this); + + clReleaseMemObject(gausstab); +} + void GaussianYBlurOperation::deinitExecution() { BlurBaseOperation::deinitExecution(); diff --git a/source/blender/compositor/operations/COM_GaussianYBlurOperation.h b/source/blender/compositor/operations/COM_GaussianYBlurOperation.h index 22b6562077d..4b5751c0968 100644 --- a/source/blender/compositor/operations/COM_GaussianYBlurOperation.h +++ b/source/blender/compositor/operations/COM_GaussianYBlurOperation.h @@ -40,7 +40,12 @@ public: * the inner loop of this program */ void executePixel(float output[4], int x, int y, void *data); - + + void executeOpenCL(OpenCLDevice *device, + MemoryBuffer *outputMemoryBuffer, cl_mem clOutputBuffer, + MemoryBuffer **inputMemoryBuffers, list *clMemToCleanUp, + list *clKernelsToCleanUp); + /** * @brief initialize the execution */ @@ -53,5 +58,9 @@ public: void *initializeTileData(rcti *rect); bool determineDependingAreaOfInterest(rcti *input, ReadBufferOperation *readOperation, rcti *output); + + void checkOpenCL() { + this->setOpenCL(m_data.sizex >= 128); + } }; #endif diff --git a/source/blender/compositor/operations/COM_OpenCLKernels.cl b/source/blender/compositor/operations/COM_OpenCLKernels.cl index 00b3825d8b3..1b965eb8659 100644 --- a/source/blender/compositor/operations/COM_OpenCLKernels.cl +++ b/source/blender/compositor/operations/COM_OpenCLKernels.cl @@ -250,3 +250,66 @@ __kernel void directionalBlurKernel(__read_only image2d_t inputImage, __write_o write_imagef(output, coords, col); } + +// KERNEL --- GAUSSIAN BLUR --- +__kernel void gaussianXBlurOperationKernel(__read_only image2d_t inputImage, + int2 offsetInput, + __write_only image2d_t output, + int2 offsetOutput, + int filter_size, + int2 dimension, + __global float *gausstab, + int2 offset) +{ + float4 color = {0.0f, 0.0f, 0.0f, 0.0f}; + int2 coords = {get_global_id(0), get_global_id(1)}; + coords += offset; + const int2 realCoordinate = coords + offsetOutput; + int2 inputCoordinate = realCoordinate - offsetInput; + float weight = 0.0f; + + int xmin = max(realCoordinate.x - filter_size, 0) - offsetInput.x; + int xmax = min(realCoordinate.x + filter_size + 1, dimension.x) - offsetInput.x; + + for (int nx = xmin, i = max(filter_size - realCoordinate.x, 0); nx < xmax; ++nx, ++i) { + float w = gausstab[i]; + inputCoordinate.x = nx; + color += read_imagef(inputImage, SAMPLER_NEAREST, inputCoordinate) * w; + weight += w; + } + + color *= (1.0f / weight); + + write_imagef(output, coords, color); +} + +__kernel void gaussianYBlurOperationKernel(__read_only image2d_t inputImage, + int2 offsetInput, + __write_only image2d_t output, + int2 offsetOutput, + int filter_size, + int2 dimension, + __global float *gausstab, + int2 offset) +{ + float4 color = {0.0f, 0.0f, 0.0f, 0.0f}; + int2 coords = {get_global_id(0), get_global_id(1)}; + coords += offset; + const int2 realCoordinate = coords + offsetOutput; + int2 inputCoordinate = realCoordinate - offsetInput; + float weight = 0.0f; + + int ymin = max(realCoordinate.y - filter_size, 0) - offsetInput.y; + int ymax = min(realCoordinate.y + filter_size + 1, dimension.y) - offsetInput.y; + + for (int ny = ymin, i = max(filter_size - realCoordinate.y, 0); ny < ymax; ++ny, ++i) { + float w = gausstab[i]; + inputCoordinate.y = ny; + color += read_imagef(inputImage, SAMPLER_NEAREST, inputCoordinate) * w; + weight += w; + } + + color *= (1.0f / weight); + + write_imagef(output, coords, color); +} -- cgit v1.2.3