14 files changed, 211 insertions, 158 deletions
diff --git a/source/blender/compositor/intern/COM_ChunkOrderHotspot.cpp b/source/blender/compositor/intern/COM_ChunkOrderHotspot.cpp
index 94110f0bcfe..0ab08ec5810 100644
--- a/source/blender/compositor/intern/COM_ChunkOrderHotspot.cpp
+++ b/source/blender/compositor/intern/COM_ChunkOrderHotspot.cpp
@@ -35,6 +35,6 @@ double ChunkOrderHotspot::determineDistance(int x, int y)
 	int dx = x-this->x;
 	int dy = y-this->y;
 	double result = sqrt((double)(dx*dx+dy*dy));
-	result += this->addition;
+	result += (double)this->addition;
 	return result;
 }
diff --git a/source/blender/compositor/intern/COM_Converter.cpp b/source/blender/compositor/intern/COM_Converter.cpp
index 3cb297801ca..dc6409e7b86 100644
--- a/source/blender/compositor/intern/COM_Converter.cpp
+++ b/source/blender/compositor/intern/COM_Converter.cpp
@@ -111,6 +111,7 @@
 #include "COM_DefocusNode.h"
 #include "COM_DoubleEdgeMaskNode.h"
 #include "COM_CropNode.h"
+#include "COM_MaskNode.h"
 
 Node *Converter::convert(bNode *bNode)
 {
@@ -347,6 +348,9 @@ case CMP_NODE_OUTPUT_FILE:
 	case CMP_NODE_CROP:
 		node = new CropNode(bNode);
 		break;
+	case CMP_NODE_MASK:
+		node = new MaskNode(bNode);
+		break;
 	/* not inplemented yet */
 	default:
 		node = new MuteNode(bNode);
diff --git a/source/blender/compositor/intern/COM_ExecutionGroup.cpp b/source/blender/compositor/intern/COM_ExecutionGroup.cpp
index 0ebf9af207b..481b83c81a3 100644
--- a/source/blender/compositor/intern/COM_ExecutionGroup.cpp
+++ b/source/blender/compositor/intern/COM_ExecutionGroup.cpp
@@ -57,7 +57,7 @@ ExecutionGroup::ExecutionGroup()
 	this->chunksFinished = 0;
 }
 
-int ExecutionGroup::getRenderPriotrity()
+CompositorPriority ExecutionGroup::getRenderPriotrity()
 {
 	return this->getOutputNodeOperation()->getRenderPriority();
 }
@@ -351,7 +351,8 @@ void ExecutionGroup::execute(ExecutionSystem *graph)
 				startIndex = index+1;
 			}
 		}
-		PIL_sleep_ms(10);
+
+		WorkScheduler::finish();
 
 		if (bTree->test_break && bTree->test_break(bTree->tbh)) {
 			breaked = true;
@@ -401,47 +402,11 @@ MemoryBuffer** ExecutionGroup::getInputBuffersOpenCL(int chunkNumber)
 	return memoryBuffers;
 }
 
-// @todo: for opencl the memory buffers size needs to be same as the needed size
-// currently this method is not called, but will be when opencl nodes are created
 MemoryBuffer *ExecutionGroup::constructConsolidatedMemoryBuffer(MemoryProxy *memoryProxy, rcti *rect)
 {
-	// find all chunks inside the rect
-	// determine minxchunk, minychunk, maxxchunk, maxychunk where x and y are chunknumbers
-	float chunkSizef = this->chunkSize;
-
-	int indexx, indexy;
-
-	const int minxchunk = floor(rect->xmin/chunkSizef);
-	const int maxxchunk = ceil((rect->xmax-1)/chunkSizef);
-	const int minychunk = floor(rect->ymin/chunkSizef);
-	const int maxychunk = ceil((rect->ymax-1)/chunkSizef);
-
-	if (maxxchunk== minxchunk+1 && maxychunk == minychunk+1) {
-		MemoryBuffer *result =memoryProxy->getBuffer();
-		return result;
-	}
-
-	rcti chunkRect;
-	chunkRect.xmin = minxchunk*this->chunkSize;
-	chunkRect.xmax = maxxchunk*this->chunkSize;
-	chunkRect.ymin = minychunk*this->chunkSize;
-	chunkRect.ymax = maxychunk*this->chunkSize;
-
-	CLAMP(chunkRect.xmin, 0, (int)this->width);
-	CLAMP(chunkRect.xmax, 0, (int)this->width);
-	CLAMP(chunkRect.ymin, 0, (int)this->height);
-	CLAMP(chunkRect.ymax, 0, (int)this->height);
-
-	MemoryBuffer *result = new MemoryBuffer(memoryProxy, &chunkRect);
-
-	for (indexx = max(minxchunk, 0); indexx<min((int)this->numberOfXChunks, maxxchunk) ; indexx++) {
-		for (indexy = max(minychunk, 0); indexy<min((int)this->numberOfYChunks, maxychunk) ; indexy++) {
-			/* int chunkNumber = indexx+indexy*this->numberOfXChunks; */ /* UNUSED */
-			MemoryBuffer *chunkBuffer = memoryProxy->getBuffer();
-			result->copyContentFrom(chunkBuffer);
-		}
-	}
-
+	MemoryBuffer* imageBuffer = memoryProxy->getBuffer();
+	MemoryBuffer* result = new MemoryBuffer(memoryProxy, rect);
+	result->copyContentFrom(imageBuffer);
 	return result;
 }
 
@@ -487,14 +452,14 @@ void ExecutionGroup::determineChunkRect(rcti *rect, const unsigned int chunkNumb
 
 MemoryBuffer *ExecutionGroup::allocateOutputBuffer(int chunkNumber, rcti *rect)
 {
-	MemoryBuffer *outputBuffer = NULL;
-	// output allocation is only valid when our outputoperation is a memorywriter
+	// we assume that this method is only called from complex execution groups.
 	NodeOperation * operation = this->getOutputNodeOperation();
 	if (operation->isWriteBufferOperation()) {
-/*		WriteBufferOperation *writeOperation = (WriteBufferOperation*)operation; */ /* UNUSED */
-// @todo		outputBuffer = MemoryManager::allocateMemoryBuffer(writeOperation->getMemoryProxy(), chunkNumber, rect);
+		WriteBufferOperation *writeOperation = (WriteBufferOperation*)operation;
+		MemoryBuffer *buffer = new MemoryBuffer(writeOperation->getMemoryProxy(), rect);
+		return buffer;
 	}
-	return outputBuffer;
+	return NULL;
 }
 
 
@@ -600,11 +565,6 @@ void ExecutionGroup::determineDependingMemoryProxies(vector<MemoryProxy*> *memor
 	}
 }
 
-bool ExecutionGroup::operator ==(const ExecutionGroup & executionGroup) const
-{
-	return this->getOutputNodeOperation() == executionGroup.getOutputNodeOperation();
-}
-
 bool ExecutionGroup::isOpenCL()
 {
 	return this->openCL;
diff --git a/source/blender/compositor/intern/COM_ExecutionGroup.h b/source/blender/compositor/intern/COM_ExecutionGroup.h
index cbdc9bb1787..416a78eb8b8 100644
--- a/source/blender/compositor/intern/COM_ExecutionGroup.h
+++ b/source/blender/compositor/intern/COM_ExecutionGroup.h
@@ -167,13 +167,7 @@ private:
 	  * @param operation the operation to be added
 	  */
 	bool canContainOperation(NodeOperation *operation);
-	
-	/**
-	  * @brief get the Render priority of this ExecutionGroup
-	  * @see ExecutionSystem.execute
-	  */
-	int getRenderPriotrity();
-	
+		
 	/**
 	  * @brief calculate the actual chunk size of this execution group.
 	  * @note A chunk size is an unsigned int that is both the height and width of a chunk.
@@ -396,17 +390,21 @@ public:
 	  * @see determineChunkSize()
 	  */
 	void determineChunkRect(rcti *rect, const unsigned int chunkNumber) const;
-	
-	
-		bool operator ==(const ExecutionGroup &executionGroup) const;
-	
-		/**
+
+	/**
 	  * @brief can this ExecutionGroup be scheduled on an OpenCLDevice
 	  * @see WorkScheduler.schedule
 	  */
 	bool isOpenCL();
-	
+
 	void setChunksize(int chunksize) {this->chunkSize = chunksize;}
+
+	/**
+	  * @brief get the Render priority of this ExecutionGroup
+	  * @see ExecutionSystem.execute
+	  */
+	CompositorPriority getRenderPriotrity();
+	
 };
 
 #endif
diff --git a/source/blender/compositor/intern/COM_ExecutionSystem.cpp b/source/blender/compositor/intern/COM_ExecutionSystem.cpp
index 96d2a6f4434..8c0b37a0685 100644
--- a/source/blender/compositor/intern/COM_ExecutionSystem.cpp
+++ b/source/blender/compositor/intern/COM_ExecutionSystem.cpp
@@ -127,20 +127,9 @@ void ExecutionSystem::execute()
 
 	WorkScheduler::start(this->context);
 
-
-	vector<ExecutionGroup*> executionGroups;
-	this->findOutputExecutionGroup(&executionGroups);
-
-	/* start execution of the ExecutionGroups based on priority of their output node */
-	for (int priority = 9 ; priority>=0 ; priority--) {
-		for (index = 0 ; index < executionGroups.size(); index ++) {
-			ExecutionGroup *group = executionGroups[index];
-			NodeOperation *output = group->getOutputNodeOperation();
-			if (output->getRenderPriority() == priority) {
-				group->execute(this);
-			}
-		}
-	}
+	executeGroups(COM_PRIORITY_HIGH);
+	executeGroups(COM_PRIORITY_MEDIUM);
+	executeGroups(COM_PRIORITY_LOW);
 
 	WorkScheduler::finish();
 	WorkScheduler::stop();
@@ -155,6 +144,18 @@ void ExecutionSystem::execute()
 	}
 }
 
+void ExecutionSystem::executeGroups(CompositorPriority priority) // execute every output group that has the given priority
+{
+	unsigned int index; // unsigned to match vector::size() in the loop condition below
+	vector<ExecutionGroup*> executionGroups;
+	this->findOutputExecutionGroup(&executionGroups, priority); // fills executionGroups with the matching groups
+
+	for (index = 0 ; index < executionGroups.size(); index ++) {
+		ExecutionGroup *group = executionGroups[index];
+		group->execute(this);
+	}
+}
+
 void ExecutionSystem::addOperation(NodeOperation *operation)
 {
 	ExecutionSystemHelper::addOperation(this->operations, operation);
@@ -304,6 +305,17 @@ void ExecutionSystem::determineActualSocketDataTypes(vector<NodeBase*> &nodes)
 	}
 }
 
+void ExecutionSystem::findOutputExecutionGroup(vector<ExecutionGroup*> *result, CompositorPriority priority) const // append matching groups to *result
+{
+	unsigned int index;
+	for (index = 0 ; index < this->groups.size() ; index ++) {
+		ExecutionGroup *group = this->groups[index];
+		if (group->isOutputExecutionGroup() && group->getRenderPriotrity() == priority) { // output groups of the requested priority only
+			result->push_back(group);
+		}
+	}
+}
+
 void ExecutionSystem::findOutputExecutionGroup(vector<ExecutionGroup*> *result) const
 {
 	unsigned int index;
diff --git a/source/blender/compositor/intern/COM_ExecutionSystem.h b/source/blender/compositor/intern/COM_ExecutionSystem.h
index 85fec8b6145..510e58ba1bb 100644
--- a/source/blender/compositor/intern/COM_ExecutionSystem.h
+++ b/source/blender/compositor/intern/COM_ExecutionSystem.h
@@ -141,6 +141,11 @@ private: //methods
 	/**
 	  * find all execution group with output nodes
 	  */
+	void findOutputExecutionGroup(vector<ExecutionGroup*> *result, CompositorPriority priority) const;
+	
+	/**
+	  * find all execution group with output nodes
+	  */
 	void findOutputExecutionGroup(vector<ExecutionGroup*> *result) const;
 
 public:
@@ -224,6 +229,8 @@ private:
 	  * @param nodes list of nodes or operations to do the data type determination
 	  */
 	void determineActualSocketDataTypes(vector<NodeBase*> &nodes);
+	
+	void executeGroups(CompositorPriority priority);
 
 };
 #endif
diff --git a/source/blender/compositor/intern/COM_MemoryBuffer.cpp b/source/blender/compositor/intern/COM_MemoryBuffer.cpp
index 37d79607d12..3ebf8398c02 100644
--- a/source/blender/compositor/intern/COM_MemoryBuffer.cpp
+++ b/source/blender/compositor/intern/COM_MemoryBuffer.cpp
@@ -150,8 +150,8 @@ void MemoryBuffer::readCubic(float result[4], float x, float y)
 	
 	float valuex = x - x1;
 	float valuey = y - y1;
-	float mvaluex = 1.0 - valuex;
-	float mvaluey = 1.0 - valuey;
+	float mvaluex = 1.0f - valuex;
+	float mvaluey = 1.0f - valuey;
 	
 	float color1[4];
 	float color2[4];
diff --git a/source/blender/compositor/intern/COM_Node.cpp b/source/blender/compositor/intern/COM_Node.cpp
index ba5e21d53ae..264725b4b2c 100644
--- a/source/blender/compositor/intern/COM_Node.cpp
+++ b/source/blender/compositor/intern/COM_Node.cpp
@@ -83,23 +83,20 @@ void Node::addSetValueOperation(ExecutionSystem *graph, InputSocket *inputsocket
 	graph->addOperation(operation);
 }
 
-void Node::addPreviewOperation(ExecutionSystem *system, OutputSocket *outputSocket, int priority)
+void Node::addPreviewOperation(ExecutionSystem *system, OutputSocket *outputSocket)
 {
-#ifdef COM_PREVIEW_ENABLED
 	PreviewOperation *operation = new PreviewOperation();
 	system->addOperation(operation);
 	operation->setbNode(this->getbNode());
 	operation->setbNodeTree(system->getContext().getbNodeTree());
-	operation->setPriority(priority);
 	this->addLink(system, outputSocket, operation->getInputSocket(0));
-#endif
 }
 
-void Node::addPreviewOperation(ExecutionSystem *system, InputSocket *inputSocket, int priority)
+void Node::addPreviewOperation(ExecutionSystem *system, InputSocket *inputSocket)
 {
 	if (inputSocket->isConnected()) {
 		OutputSocket *outputsocket = inputSocket->getConnection()->getFromSocket();
-		this->addPreviewOperation(system, outputsocket, priority);
+		this->addPreviewOperation(system, outputsocket);
 	}
 }
 
diff --git a/source/blender/compositor/intern/COM_Node.h b/source/blender/compositor/intern/COM_Node.h
index 2666d0a6980..23744adf642 100644
--- a/source/blender/compositor/intern/COM_Node.h
+++ b/source/blender/compositor/intern/COM_Node.h
@@ -120,8 +120,8 @@ protected:
 	
 	Node();
 	
-	void addPreviewOperation(ExecutionSystem *system, InputSocket *inputSocket, int priority);
-	void addPreviewOperation(ExecutionSystem *system, OutputSocket *inputSocket, int priority);
+	void addPreviewOperation(ExecutionSystem *system, InputSocket *inputSocket);
+	void addPreviewOperation(ExecutionSystem *system, OutputSocket *outputSocket);
 	
 	bNodeSocket *getEditorInputSocket(int editorNodeInputSocketIndex);
 	bNodeSocket *getEditorOutputSocket(int editorNodeOutputSocketIndex);
diff --git a/source/blender/compositor/intern/COM_NodeOperation.cpp b/source/blender/compositor/intern/COM_NodeOperation.cpp
index fae652e39d7..650e4af5ae0 100644
--- a/source/blender/compositor/intern/COM_NodeOperation.cpp
+++ b/source/blender/compositor/intern/COM_NodeOperation.cpp
@@ -124,3 +124,111 @@ bool NodeOperation::determineDependingAreaOfInterest(rcti * input, ReadBufferOpe
 		return false;
 	}
 }
+
+cl_mem NodeOperation::COM_clAttachMemoryBufferToKernelParameter(cl_context context, cl_kernel kernel, int parameterIndex, int offsetIndex, list<cl_mem> *cleanup, MemoryBuffer **inputMemoryBuffers, SocketReader* reader) // wrap the reader's tile data in a CL image and bind it as kernel argument parameterIndex
+{
+	cl_int error;
+	MemoryBuffer* result = (MemoryBuffer*)reader->initializeTileData(NULL, inputMemoryBuffers);
+
+	const cl_image_format imageFormat = {
+		CL_RGBA,
+		CL_FLOAT
+	};
+
+	cl_mem clBuffer = clCreateImage2D(context, CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR, &imageFormat, result->getWidth(), 
+	                                  result->getHeight(), 0, result->getBuffer(), &error); // read-only RGBA float image created over the buffer's host pointer
+	
+	if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error));	}
+	if (error == CL_SUCCESS) cleanup->push_back(clBuffer); // only successfully created images are recorded in the cleanup list
+
+	error = clSetKernelArg(kernel, parameterIndex, sizeof(cl_mem), &clBuffer); // NOTE(review): still executed when image creation failed above
+	if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error));	}
+	
+	COM_clAttachMemoryBufferOffsetToKernelParameter(kernel, offsetIndex, result); // binds the buffer rect origin unless offsetIndex is -1
+	return clBuffer;
+}
+	
+void NodeOperation::COM_clAttachMemoryBufferOffsetToKernelParameter(cl_kernel kernel, int offsetIndex, MemoryBuffer *memoryBuffer) // bind the buffer rect's (xmin, ymin) as a cl_int2 kernel argument
+{
+	if (offsetIndex != -1) { // -1 means there is no offset argument to set
+		cl_int error;
+		rcti* rect = memoryBuffer->getRect();
+		cl_int2 offset = {rect->xmin, rect->ymin};
+
+		error = clSetKernelArg(kernel, offsetIndex, sizeof(cl_int2), &offset);
+		if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error));	}
+	}
+}
+
+void NodeOperation::COM_clAttachSizeToKernelParameter(cl_kernel kernel, int offsetIndex) // bind this operation's (width, height) as a cl_int2 kernel argument
+{
+	if (offsetIndex != -1) { // -1 means there is no size argument to set
+		cl_int error;
+		cl_int2 offset = {this->getWidth(), this->getHeight()}; // holds the size, despite the variable name
+
+		error = clSetKernelArg(kernel, offsetIndex, sizeof(cl_int2), &offset);
+		if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error));	}
+	}
+}
+
+void NodeOperation::COM_clAttachOutputMemoryBufferToKernelParameter(cl_kernel kernel, int parameterIndex, cl_mem clOutputMemoryBuffer) // bind an existing cl_mem as kernel argument parameterIndex
+{
+	cl_int error;
+	error = clSetKernelArg(kernel, parameterIndex, sizeof(cl_mem), &clOutputMemoryBuffer);
+	if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); } // failure is only logged, not reported to the caller
+}
+
+void NodeOperation::COM_clEnqueueRange(cl_command_queue queue, cl_kernel kernel, MemoryBuffer *outputMemoryBuffer) { // enqueue one 2D range covering the whole output buffer
+	cl_int error;
+	const size_t size[] = {outputMemoryBuffer->getWidth(),outputMemoryBuffer->getHeight()}; // global work size: width x height
+	
+	error = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, size, 0, 0, 0, NULL); // no global offset, no explicit local size, no events
+	if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error));	}
+}
+
+void NodeOperation::COM_clEnqueueRange(cl_command_queue queue, cl_kernel kernel, MemoryBuffer *outputMemoryBuffer, int offsetIndex) { // enqueue the kernel tile by tile, passing each tile's origin as kernel argument offsetIndex
+	cl_int error;
+	const int width = outputMemoryBuffer->getWidth();
+	const int height = outputMemoryBuffer->getHeight();
+	int offsetx;
+	int offsety;
+	const int localSize = 32; // maximum edge length of one enqueued tile
+	size_t size[2];
+	cl_int2 offset;
+	
+	for (offsety = 0 ; offsety < height; offsety+=localSize) {
+		offset[1] = offsety;
+		if (offsety+localSize < height) {
+			size[1] = localSize;
+		} else { // last row of tiles: take the remaining rows (at most localSize)
+			size[1] = height - offsety;
+		}
+		for (offsetx = 0 ; offsetx < width ; offsetx+=localSize) {
+			if (offsetx+localSize < width) {
+				size[0] = localSize;
+			} else { // last tile in this row: take the remaining columns (at most localSize)
+				size[0] = width - offsetx;
+			}
+			offset[0] = offsetx;
+
+			error = clSetKernelArg(kernel, offsetIndex, sizeof(cl_int2), &offset); // tile origin is re-bound before every enqueue
+			if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); }
+			error = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, size, 0, 0, 0, NULL);
+			if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error));	}
+			clFlush(queue); // flush the queue after each tile
+		}
+	}
+}
+
+cl_kernel NodeOperation::COM_clCreateKernel(cl_program program, const char *kernelname, list<cl_kernel> *clKernelsToCleanUp) // create the named kernel from program; errors are only logged
+{
+	cl_int error;
+	cl_kernel kernel = clCreateKernel(program, kernelname, &error)	;
+	if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error));	
+	}
+	else {
+		if (clKernelsToCleanUp) clKernelsToCleanUp->push_back(kernel); // a NULL list skips cleanup tracking
+	}
+	return kernel; // returned even when creation failed
+	
+}
diff --git a/source/blender/compositor/intern/COM_NodeOperation.h b/source/blender/compositor/intern/COM_NodeOperation.h
index 73ba5b472d7..2219907b0c8 100644
--- a/source/blender/compositor/intern/COM_NodeOperation.h
+++ b/source/blender/compositor/intern/COM_NodeOperation.h
@@ -139,8 +139,10 @@ public:
 	  * @param rect the rectangle of the chunk (location and size)
 	  * @param chunkNumber the chunkNumber to be calculated
 	  * @param memoryBuffers all input MemoryBuffer's needed
+	  * @param outputBuffer the outputbuffer to write to
 	  */
-	virtual void executeOpenCLRegion(cl_context context, cl_program program, cl_command_queue queue, rcti *rect, unsigned int chunkNumber, MemoryBuffer** memoryBuffers) {}
+	virtual void executeOpenCLRegion(cl_context context, cl_program program, cl_command_queue queue, rcti *rect, 
+	                                 unsigned int chunkNumber, MemoryBuffer** memoryBuffers, MemoryBuffer* outputBuffer) {}
 
 	/**
 	  * @brief custom handle to add new tasks to the OpenCL command queue in order to execute a chunk on an GPUDevice
@@ -207,9 +209,9 @@ public:
 	/**
 	  * @brief get the render priority of this node.
 	  * @note only applicable for output operations like ViewerOperation
-	  * @return [0:9] 9 is highest priority
+	  * @return CompositorPriority
 	  */
-	virtual const int getRenderPriority() const {return 0;}
+	virtual const CompositorPriority getRenderPriority() const {return COM_PRIORITY_LOW;}
 
 	/**
 	  * @brief can this NodeOperation be scheduled on an OpenCLDevice
@@ -242,6 +244,13 @@ protected:
 	  */
 	void setOpenCL(bool openCL) {this->openCL = openCL;}
 
+	static cl_mem COM_clAttachMemoryBufferToKernelParameter(cl_context context, cl_kernel kernel, int parameterIndex, int offsetIndex, list<cl_mem> *cleanup, MemoryBuffer **inputMemoryBuffers, SocketReader* reader);
+	static void COM_clAttachMemoryBufferOffsetToKernelParameter(cl_kernel kernel, int offsetIndex, MemoryBuffer *memoryBuffers);
+	static void COM_clAttachOutputMemoryBufferToKernelParameter(cl_kernel kernel, int parameterIndex, cl_mem clOutputMemoryBuffer);
+	void COM_clAttachSizeToKernelParameter(cl_kernel kernel, int offsetIndex);
+	static void COM_clEnqueueRange(cl_command_queue queue, cl_kernel kernel, MemoryBuffer* outputMemoryBuffer);
+	static void COM_clEnqueueRange(cl_command_queue queue, cl_kernel kernel, MemoryBuffer *outputMemoryBuffer, int offsetIndex);
+	cl_kernel COM_clCreateKernel(cl_program program, const char* kernelname, list<cl_kernel> *clKernelsToCleanUp);
 
 };
 
diff --git a/source/blender/compositor/intern/COM_OpenCLDevice.cpp b/source/blender/compositor/intern/COM_OpenCLDevice.cpp
index 692b96f40b3..e6d3789b06d 100644
--- a/source/blender/compositor/intern/COM_OpenCLDevice.cpp
+++ b/source/blender/compositor/intern/COM_OpenCLDevice.cpp
@@ -56,10 +56,10 @@ void OpenCLDevice::execute(WorkPackage *work)
 	MemoryBuffer ** inputBuffers = executionGroup->getInputBuffersOpenCL(chunkNumber);
 	MemoryBuffer * outputBuffer = executionGroup->allocateOutputBuffer(chunkNumber, &rect);
 
-	executionGroup->getOutputNodeOperation()->executeOpenCLRegion(this->context, this->program, this->queue, &rect, chunkNumber, inputBuffers);
+	executionGroup->getOutputNodeOperation()->executeOpenCLRegion(this->context, this->program, this->queue, &rect, 
+	                                                              chunkNumber, inputBuffers, outputBuffer);
+
+	delete outputBuffer;
 	
 	executionGroup->finalizeChunkExecution(chunkNumber, inputBuffers);
-	if (outputBuffer != NULL) {
-		outputBuffer->setCreatedState();
-	}
 }
diff --git a/source/blender/compositor/intern/COM_WorkScheduler.cpp b/source/blender/compositor/intern/COM_WorkScheduler.cpp
index 80b91b2364c..ba8bfe55310 100644
--- a/source/blender/compositor/intern/COM_WorkScheduler.cpp
+++ b/source/blender/compositor/intern/COM_WorkScheduler.cpp
@@ -28,7 +28,7 @@
 #include "COM_OpenCLDevice.h"
 #include "OCL_opencl.h"
 #include "stdio.h"
-#include "COM_OpenCLKernels.cl.cpp"
+#include "COM_OpenCLKernels.cl.h"
 #include "BKE_global.h"
 
 #if COM_CURRENT_THREADING_MODEL == COM_TM_NOTHREAD
@@ -39,8 +39,6 @@
 #endif
 
 
-/// @brief global state of the WorkScheduler.
-static WorkSchedulerState state;
 /// @brief list of all CPUDevices. for every hardware thread an instance of CPUDevice is created
 static vector<CPUDevice*> cpudevices;
 
@@ -68,43 +66,29 @@ static bool openclActive = false;
 #if COM_CURRENT_THREADING_MODEL == COM_TM_QUEUE
 void *WorkScheduler::thread_execute_cpu(void *data)
 {
-	bool continueLoop = true;
 	Device *device = (Device*)data;
-	while (continueLoop) {
-		WorkPackage *work = (WorkPackage*)BLI_thread_queue_pop(cpuqueue);
-		if (work) {
-			device->execute(work);
-			delete work;
-		}
-		PIL_sleep_ms(10);
-
-		if (WorkScheduler::isStopping()) {
-			continueLoop = false;
-		}
+	WorkPackage *work;
+	
+	while ((work = (WorkPackage*)BLI_thread_queue_pop(cpuqueue))) {
+		device->execute(work);
+		delete work;
 	}
+	
 	return NULL;
 }
 
 void *WorkScheduler::thread_execute_gpu(void *data)
 {
-	bool continueLoop = true;
 	Device *device = (Device*)data;
-	while (continueLoop) {
-		WorkPackage *work = (WorkPackage*)BLI_thread_queue_pop(gpuqueue);
-		if (work) {
-			device->execute(work);
-			delete work;
-		}
-		PIL_sleep_ms(10);
-
-		if (WorkScheduler::isStopping()) {
-			continueLoop = false;
-		}
+	WorkPackage *work;
+	
+	while ((work = (WorkPackage*)BLI_thread_queue_pop(gpuqueue))) {
+		device->execute(work);
+		delete work;
 	}
+	
 	return NULL;
 }
-
-bool WorkScheduler::isStopping() {return state == COM_WSS_STOPPING;}
 #endif
 
 
@@ -135,7 +119,6 @@ void WorkScheduler::start(CompositorContext &context)
 #if COM_CURRENT_THREADING_MODEL == COM_TM_QUEUE
 	unsigned int index;
 	cpuqueue = BLI_thread_queue_init();
-	BLI_thread_queue_nowait(cpuqueue);
 	BLI_init_threads(&cputhreads, thread_execute_cpu, cpudevices.size());
 	for (index = 0 ; index < cpudevices.size() ; index ++) {
 		Device *device = cpudevices[index];
@@ -144,7 +127,6 @@ void WorkScheduler::start(CompositorContext &context)
 #ifdef COM_OPENCL_ENABLED
 	if (context.getHasActiveOpenCLDevices()) {
 		gpuqueue = BLI_thread_queue_init();
-		BLI_thread_queue_nowait(gpuqueue);
 		BLI_init_threads(&gputhreads, thread_execute_gpu, gpudevices.size());
 		for (index = 0 ; index < gpudevices.size() ; index ++) {
 			Device *device = gpudevices[index];
@@ -157,45 +139,39 @@ void WorkScheduler::start(CompositorContext &context)
 	}
 #endif
 #endif
-	state = COM_WSS_STARTED;
 }
 void WorkScheduler::finish()
 {
 #if COM_CURRENT_THREADING_MODEL == COM_TM_QUEUE
 #ifdef COM_OPENCL_ENABLED
 	if (openclActive) {
-		while (BLI_thread_queue_size(gpuqueue) + BLI_thread_queue_size(cpuqueue) > 0) {
-			PIL_sleep_ms(10);
-		}
+		BLI_thread_queue_wait_finish(gpuqueue);
+		BLI_thread_queue_wait_finish(cpuqueue);
 	}
 	else {
-		while (BLI_thread_queue_size(cpuqueue) > 0) {
-			PIL_sleep_ms(10);
-		}
+		BLI_thread_queue_wait_finish(cpuqueue);
 	}
 #else
-	while (BLI_thread_queue_size(cpuqueue) > 0) {
-		PIL_sleep_ms(10);
-	}
+	BLI_thread_queue_wait_finish(cpuqueue);
 #endif
 #endif
 }
 void WorkScheduler::stop()
 {
-	state = COM_WSS_STOPPING;
 #if COM_CURRENT_THREADING_MODEL == COM_TM_QUEUE
+	BLI_thread_queue_nowait(cpuqueue);
 	BLI_end_threads(&cputhreads);
 	BLI_thread_queue_free(cpuqueue);
 	cpuqueue = NULL;
 #ifdef COM_OPENCL_ENABLED
 	if (openclActive) {
+		BLI_thread_queue_nowait(gpuqueue);
 		BLI_end_threads(&gputhreads);
 		BLI_thread_queue_free(gpuqueue);
 		gpuqueue = NULL;
 	}
 #endif
 #endif
-	state = COM_WSS_STOPPED;
 }
 
 bool WorkScheduler::hasGPUDevices()
@@ -218,8 +194,6 @@ extern void clContextError(const char *errinfo, const void *private_info, size_t
 
 void WorkScheduler::initialize()
 {
-	state = COM_WSS_UNKNOWN;
-
 #if COM_CURRENT_THREADING_MODEL == COM_TM_QUEUE
 	int numberOfCPUThreads = BLI_system_thread_count();
 
@@ -260,7 +234,7 @@ void WorkScheduler::initialize()
 		if (totalNumberOfDevices > 0) {
 			context = clCreateContext(NULL, totalNumberOfDevices, cldevices, clContextError, NULL, &error);
 			if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error));	}
-			program = clCreateProgramWithSource(context, 1, &sourcecode, 0, &error);
+			program = clCreateProgramWithSource(context, 1, &clkernelstoh_COM_OpenCLKernels_cl, 0, &error);
 			error = clBuildProgram(program, totalNumberOfDevices, cldevices, 0, 0, 0);
 			if (error != CL_SUCCESS) { 
 				cl_int error2;
@@ -298,8 +272,6 @@ void WorkScheduler::initialize()
 	}
 #endif
 #endif
-
-	state = COM_WSS_INITIALIZED;
 }
 
 void WorkScheduler::deinitialize()
@@ -329,5 +301,4 @@ void WorkScheduler::deinitialize()
 	}
 #endif
 #endif
-	state = COM_WSS_DEINITIALIZED;
 }
diff --git a/source/blender/compositor/intern/COM_WorkScheduler.h b/source/blender/compositor/intern/COM_WorkScheduler.h
index 0de1763749e..b03b514d139 100644
--- a/source/blender/compositor/intern/COM_WorkScheduler.h
+++ b/source/blender/compositor/intern/COM_WorkScheduler.h
@@ -31,19 +31,6 @@ extern "C" {
 #include "COM_defines.h"
 #include "COM_Device.h"
 
-// STATES
-/** @brief states of the WorkScheduler
-  * @ingroup execution
-  */
-typedef enum WorkSchedulerState {
-	COM_WSS_UNKNOWN = -1,
-	COM_WSS_INITIALIZED = 0,
-	COM_WSS_STARTED = 1,
-	COM_WSS_STOPPING = 2,
-	COM_WSS_STOPPED = 3,
-	COM_WSS_DEINITIALIZED = 4
-} WorkSchedulerState;
-
 /** @brief the workscheduler
   * @ingroup execution
   */