From 29f6616d609fbd92cf313b0fdec555c2fcb4ede0 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Fri, 30 Aug 2013 23:49:38 +0000 Subject: Cycles: viewport render now takes scene color management settings into account, except for curves, that's still missing from the OpenColorIO GLSL shader. The pixels are stored in a half float texture, converterd from full float with native GPU instructions and SIMD on the CPU, so it should be pretty quick. Using a GLSL shader is useful for GPU render because it avoids a copy through CPU memory. --- intern/cycles/blender/blender_session.cpp | 10 ++++- intern/cycles/blender/blender_sync.cpp | 3 ++ intern/cycles/device/device.cpp | 45 ++++++++++++++++----- intern/cycles/device/device_cpu.cpp | 67 ++++++++++++++++++++++--------- intern/cycles/device/device_cuda.cpp | 45 ++++++++++++++------- intern/cycles/device/device_memory.h | 9 ++++- intern/cycles/device/device_multi.cpp | 4 +- intern/cycles/device/device_network.cpp | 5 +-- intern/cycles/device/device_network.h | 8 ++-- intern/cycles/device/device_opencl.cpp | 32 ++++++++++----- intern/cycles/device/device_task.cpp | 2 +- intern/cycles/device/device_task.h | 5 ++- intern/cycles/kernel/kernel.cl | 33 +++++++++++++-- intern/cycles/kernel/kernel.cpp | 11 +++-- intern/cycles/kernel/kernel.cu | 13 +++++- intern/cycles/kernel/kernel.h | 18 ++++++--- intern/cycles/kernel/kernel_compat_cuda.h | 6 ++- intern/cycles/kernel/kernel_film.h | 23 ++++++++--- intern/cycles/kernel/kernel_sse2.cpp | 11 +++-- intern/cycles/kernel/kernel_sse3.cpp | 11 +++-- intern/cycles/render/buffers.cpp | 37 +++++++++++++---- intern/cycles/render/buffers.h | 16 +++++--- intern/cycles/render/session.cpp | 9 ++--- intern/cycles/render/session.h | 5 +++ intern/cycles/util/util_types.h | 64 +++++++++++++++++++++++++++++ 25 files changed, 378 insertions(+), 114 deletions(-) (limited to 'intern') diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp index 939eed2c77b..23390060118 100644 --- a/intern/cycles/blender/blender_session.cpp +++ b/intern/cycles/blender/blender_session.cpp @@ -588,7 +588,15 @@ bool BlenderSession::draw(int w, int h) /* draw */ BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_scene, b_v3d, b_rv3d, scene->camera, width, height); - return !session->draw(buffer_params); + if(session->params.display_buffer_linear) + b_engine.bind_display_space_shader(b_scene); + + bool draw_ok = !session->draw(buffer_params); + + if(session->params.display_buffer_linear) + b_engine.unbind_display_space_shader(); + + return draw_ok; } void BlenderSession::get_status(string& status, string& substatus) diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp index 4a686487462..58ce08665ef 100644 --- a/intern/cycles/blender/blender_sync.cpp +++ b/intern/cycles/blender/blender_sync.cpp @@ -492,6 +492,9 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine b_engine, BL::Use params.shadingsystem = SessionParams::SVM; else if(shadingsystem == 1) params.shadingsystem = SessionParams::OSL; + + /* color managagement */ + params.display_buffer_linear = b_engine.support_display_space_shader(b_scene); return params; } diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index e42f83be6ce..10d4112b57d 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -41,7 +41,10 @@ void Device::pixels_alloc(device_memory& mem) void Device::pixels_copy_from(device_memory& mem, int y, int w, int h) { - mem_copy_from(mem, y, w, h, sizeof(uint8_t)*4); + if(mem.data_type == TYPE_HALF) + mem_copy_from(mem, y, w, h, sizeof(half4)); + else + mem_copy_from(mem, y, w, h, sizeof(uchar4)); } void Device::pixels_free(device_memory& mem) @@ -53,27 +56,49 @@ void Device::draw_pixels(device_memory& rgba, int y, int w, int h, int dy, int w { pixels_copy_from(rgba, y, w, h); + GLuint texid; + glGenTextures(1, &texid); + glBindTexture(GL_TEXTURE_2D, texid); + if(rgba.data_type == TYPE_HALF) + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F_ARB, w, h, 0, GL_RGBA, GL_HALF_FLOAT, (void*)rgba.data_pointer); + else + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, w, h, 0, GL_RGBA, GL_UNSIGNED_BYTE, (void*)rgba.data_pointer); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + + glEnable(GL_TEXTURE_2D); + if(transparent) { glEnable(GL_BLEND); glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA); } - glPixelZoom((float)width/(float)w, (float)height/(float)h); - glRasterPos2f(0, dy); + glColor3f(1.0f, 1.0f, 1.0f); - uint8_t *pixels = (uint8_t*)rgba.data_pointer; + glPushMatrix(); + glTranslatef(0.0f, (float)dy, 0.0f); - /* for multi devices, this assumes the ineffecient method that we allocate - * all pixels on the device even though we only render to a subset */ - pixels += 4*y*w; + glBegin(GL_QUADS); + + glTexCoord2f(0.0f, 0.0f); + glVertex2f(0.0f, 0.0f); + glTexCoord2f(1.0f, 0.0f); + glVertex2f((float)width, 0.0f); + glTexCoord2f(1.0f, 1.0f); + glVertex2f((float)width, (float)height); + glTexCoord2f(0.0f, 1.0f); + glVertex2f(0.0f, (float)height); - glDrawPixels(w, h, GL_RGBA, GL_UNSIGNED_BYTE, pixels); + glEnd(); - glRasterPos2f(0.0f, 0.0f); - glPixelZoom(1.0f, 1.0f); + glPopMatrix(); if(transparent) glDisable(GL_BLEND); + + glBindTexture(GL_TEXTURE_2D, 0); + glDisable(GL_TEXTURE_2D); + glDeleteTextures(1, &texid); } Device *Device::create(DeviceInfo& info, Stats &stats, bool background) diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index d9c08dadbb0..b1dbdec9d36 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -127,8 +127,8 @@ public: { if(task->type == DeviceTask::PATH_TRACE) thread_path_trace(*task); - else if(task->type == DeviceTask::TONEMAP) - thread_tonemap(*task); + else if(task->type == DeviceTask::FILM_CONVERT) + thread_film_convert(*task); else if(task->type == DeviceTask::SHADER) thread_shader(*task); } @@ -237,28 +237,55 @@ public: #endif } - void thread_tonemap(DeviceTask& task) + void thread_film_convert(DeviceTask& task) { + float sample_scale = 1.0f/(task.sample + 1); + + if(task.rgba_half) { #ifdef WITH_OPTIMIZED_KERNEL - if(system_cpu_support_sse3()) { - for(int y = task.y; y < task.y + task.h; y++) - for(int x = task.x; x < task.x + task.w; x++) - kernel_cpu_sse3_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer, - task.sample, x, y, task.offset, task.stride); - } - else if(system_cpu_support_sse2()) { - for(int y = task.y; y < task.y + task.h; y++) - for(int x = task.x; x < task.x + task.w; x++) - kernel_cpu_sse2_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer, - task.sample, x, y, task.offset, task.stride); + if(system_cpu_support_sse3()) { + for(int y = task.y; y < task.y + task.h; y++) + for(int x = task.x; x < task.x + task.w; x++) + kernel_cpu_sse3_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); + } + else if(system_cpu_support_sse2()) { + for(int y = task.y; y < task.y + task.h; y++) + for(int x = task.x; x < task.x + task.w; x++) + kernel_cpu_sse2_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); + } + else +#endif + { + for(int y = task.y; y < task.y + task.h; y++) + for(int x = task.x; x < task.x + task.w; x++) + kernel_cpu_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); + } } - else + else { +#ifdef WITH_OPTIMIZED_KERNEL + if(system_cpu_support_sse3()) { + for(int y = task.y; y < task.y + task.h; y++) + for(int x = task.x; x < task.x + task.w; x++) + kernel_cpu_sse3_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); + } + else if(system_cpu_support_sse2()) { + for(int y = task.y; y < task.y + task.h; y++) + for(int x = task.x; x < task.x + task.w; x++) + kernel_cpu_sse2_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); + } + else #endif - { - for(int y = task.y; y < task.y + task.h; y++) - for(int x = task.x; x < task.x + task.w; x++) - kernel_cpu_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer, - task.sample, x, y, task.offset, task.stride); + { + for(int y = task.y; y < task.y + task.h; y++) + for(int x = task.x; x < task.x + task.w; x++) + kernel_cpu_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); + } } } diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index c1b5a8bfcea..b5eaa69bf0e 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -625,7 +625,7 @@ public: cuda_pop_context(); } - void tonemap(DeviceTask& task, device_ptr buffer, device_ptr rgba) + void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half) { if(have_error()) return; @@ -633,11 +633,14 @@ public: cuda_push_context(); CUfunction cuFilmConvert; - CUdeviceptr d_rgba = map_pixels(rgba); + CUdeviceptr d_rgba = map_pixels((rgba_byte)? rgba_byte: rgba_half); CUdeviceptr d_buffer = cuda_device_ptr(buffer); /* get kernel function */ - cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_tonemap")) + if(rgba_half) + cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float")) + else + cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte")) /* pass in parameters */ int offset = 0; @@ -648,11 +651,11 @@ public: cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_buffer, sizeof(d_buffer))) offset += sizeof(d_buffer); - int sample = task.sample; - offset = align_up(offset, __alignof(sample)); + float sample_scale = 1.0f/(task.sample + 1); + offset = align_up(offset, __alignof(sample_scale)); - cuda_assert(cuParamSeti(cuFilmConvert, offset, task.sample)) - offset += sizeof(task.sample); + cuda_assert(cuParamSetf(cuFilmConvert, offset, sample_scale)) + offset += sizeof(sample_scale); cuda_assert(cuParamSeti(cuFilmConvert, offset, task.x)) offset += sizeof(task.x); @@ -684,7 +687,7 @@ public: cuda_assert(cuFuncSetBlockShape(cuFilmConvert, xthreads, ythreads, 1)) cuda_assert(cuLaunchGrid(cuFilmConvert, xblocks, yblocks)) - unmap_pixels(task.rgba); + unmap_pixels((rgba_byte)? rgba_byte: rgba_half); cuda_pop_context(); } @@ -771,13 +774,19 @@ public: glGenBuffers(1, &pmem.cuPBO); glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); - glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(GLfloat)*3, NULL, GL_DYNAMIC_DRAW); + if(mem.data_type == TYPE_HALF) + glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(GLhalf)*4, NULL, GL_DYNAMIC_DRAW); + else + glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(uint8_t)*4, NULL, GL_DYNAMIC_DRAW); glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); glGenTextures(1, &pmem.cuTexId); glBindTexture(GL_TEXTURE_2D, pmem.cuTexId); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); + if(mem.data_type == TYPE_HALF) + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F_ARB, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL); + else + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); glBindTexture(GL_TEXTURE_2D, 0); @@ -865,11 +874,19 @@ public: /* for multi devices, this assumes the ineffecient method that we allocate * all pixels on the device even though we only render to a subset */ - size_t offset = sizeof(uint8_t)*4*y*w; + size_t offset = 4*y*w; + + if(mem.data_type == TYPE_HALF) + offset *= sizeof(GLhalf); + else + offset *= sizeof(uint8_t); glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pmem.cuPBO); glBindTexture(GL_TEXTURE_2D, pmem.cuTexId); - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void*)offset); + if(mem.data_type == TYPE_HALF) + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void*)offset); + else + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void*)offset); glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0); glEnable(GL_TEXTURE_2D); @@ -961,9 +978,9 @@ public: void task_add(DeviceTask& task) { - if(task.type == DeviceTask::TONEMAP) { + if(task.type == DeviceTask::FILM_CONVERT) { /* must be done in main thread due to opengl access */ - tonemap(task, task.buffer, task.rgba); + film_convert(task, task.buffer, task.rgba_byte, task.rgba_half); cuda_push_context(); cuda_assert(cuCtxSynchronize()) diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h index d27dd19cc96..18e6242d23d 100644 --- a/intern/cycles/device/device_memory.h +++ b/intern/cycles/device/device_memory.h @@ -46,7 +46,8 @@ enum DataType { TYPE_UCHAR, TYPE_UINT, TYPE_INT, - TYPE_FLOAT + TYPE_FLOAT, + TYPE_HALF }; static inline size_t datatype_size(DataType datatype) @@ -56,6 +57,7 @@ static inline size_t datatype_size(DataType datatype) case TYPE_FLOAT: return sizeof(float); case TYPE_UINT: return sizeof(uint); case TYPE_INT: return sizeof(int); + case TYPE_HALF: return sizeof(half); default: return 0; } } @@ -147,6 +149,11 @@ template<> struct device_type_traits { static const int num_elements = 4; }; +template<> struct device_type_traits { + static const DataType data_type = TYPE_HALF; + static const int num_elements = 4; +}; + /* Device Memory */ class device_memory diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp index af6ca9e1fbd..4df0fdbd4c7 100644 --- a/intern/cycles/device/device_multi.cpp +++ b/intern/cycles/device/device_multi.cpp @@ -261,7 +261,6 @@ public: if(sub.device == sub_device) { if(tile.buffer) tile.buffer = sub.ptr_map[tile.buffer]; if(tile.rng_state) tile.rng_state = sub.ptr_map[tile.rng_state]; - if(tile.rgba) tile.rgba = sub.ptr_map[tile.rgba]; } } } @@ -290,7 +289,8 @@ public: tasks.pop_front(); if(task.buffer) subtask.buffer = sub.ptr_map[task.buffer]; - if(task.rgba) subtask.rgba = sub.ptr_map[task.rgba]; + if(task.rgba_byte) subtask.rgba_byte = sub.ptr_map[task.rgba_byte]; + if(task.rgba_half) subtask.rgba_half = sub.ptr_map[task.rgba_half]; if(task.shader_input) subtask.shader_input = sub.ptr_map[task.shader_input]; if(task.shader_output) subtask.shader_output = sub.ptr_map[task.shader_output]; diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp index 521739b8ef1..23c1a10fa0a 100644 --- a/intern/cycles/device/device_network.cpp +++ b/intern/cycles/device/device_network.cpp @@ -408,7 +408,8 @@ protected: rcv.read(task); if(task.buffer) task.buffer = ptr_map[task.buffer]; - if(task.rgba) task.rgba = ptr_map[task.rgba]; + if(task.rgba_byte) task.rgba_byte = ptr_map[task.rgba_byte]; + if(task.rgba_half) task.rgba_half = ptr_map[task.rgba_half]; if(task.shader_input) task.shader_input = ptr_map[task.shader_input]; if(task.shader_output) task.shader_output = ptr_map[task.shader_output]; @@ -448,7 +449,6 @@ protected: if(tile.buffer) tile.buffer = ptr_map[tile.buffer]; if(tile.rng_state) tile.rng_state = ptr_map[tile.rng_state]; - if(tile.rgba) tile.rgba = ptr_map[tile.rgba]; result = true; break; @@ -478,7 +478,6 @@ protected: if(tile.buffer) tile.buffer = ptr_imap[tile.buffer]; if(tile.rng_state) tile.rng_state = ptr_imap[tile.rng_state]; - if(tile.rgba) tile.rgba = ptr_imap[tile.rgba]; RPCSend snd(socket, "release_tile"); snd.add(tile); diff --git a/intern/cycles/device/device_network.h b/intern/cycles/device/device_network.h index 5fe574fd4d4..db399cf4240 100644 --- a/intern/cycles/device/device_network.h +++ b/intern/cycles/device/device_network.h @@ -94,7 +94,7 @@ public: int type = (int)task.type; archive & type & task.x & task.y & task.w & task.h; - archive & task.rgba & task.buffer & task.sample & task.num_samples; + archive & task.rgba_byte & task.rgba_half & task.buffer & task.sample & task.num_samples; archive & task.offset & task.stride; archive & task.shader_input & task.shader_output & task.shader_eval_type; archive & task.shader_x & task.shader_w; @@ -105,7 +105,7 @@ public: archive & tile.x & tile.y & tile.w & tile.h; archive & tile.start_sample & tile.num_samples & tile.sample; archive & tile.offset & tile.stride; - archive & tile.buffer & tile.rng_state & tile.rgba; + archive & tile.buffer & tile.rng_state; } void write() @@ -234,7 +234,7 @@ public: int type; *archive & type & task.x & task.y & task.w & task.h; - *archive & task.rgba & task.buffer & task.sample & task.num_samples; + *archive & task.rgba_byte & task.rgba_half & task.buffer & task.sample & task.num_samples; *archive & task.resolution & task.offset & task.stride; *archive & task.shader_input & task.shader_output & task.shader_eval_type; *archive & task.shader_x & task.shader_w; @@ -247,7 +247,7 @@ public: *archive & tile.x & tile.y & tile.w & tile.h; *archive & tile.start_sample & tile.num_samples & tile.sample; *archive & tile.resolution & tile.offset & tile.stride; - *archive & tile.buffer & tile.rng_state & tile.rgba; + *archive & tile.buffer & tile.rng_state & tile.rgba_byte & tile.rgba_half; tile.buffers = NULL; } diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp index e800b3f6442..d723df70c89 100644 --- a/intern/cycles/device/device_opencl.cpp +++ b/intern/cycles/device/device_opencl.cpp @@ -321,7 +321,8 @@ public: cl_device_id cdDevice; cl_program cpProgram; cl_kernel ckPathTraceKernel; - cl_kernel ckFilmConvertKernel; + cl_kernel ckFilmConvertByteKernel; + cl_kernel ckFilmConvertHalfFloatKernel; cl_kernel ckShaderKernel; cl_int ciErr; @@ -431,7 +432,8 @@ public: cqCommandQueue = NULL; cpProgram = NULL; ckPathTraceKernel = NULL; - ckFilmConvertKernel = NULL; + ckFilmConvertByteKernel = NULL; + ckFilmConvertHalfFloatKernel = NULL; ckShaderKernel = NULL; null_mem = 0; device_initialized = false; @@ -762,7 +764,11 @@ public: if(opencl_error(ciErr)) return false; - ckFilmConvertKernel = clCreateKernel(cpProgram, "kernel_ocl_tonemap", &ciErr); + ckFilmConvertByteKernel = clCreateKernel(cpProgram, "kernel_ocl_convert_to_byte", &ciErr); + if(opencl_error(ciErr)) + return false; + + ckFilmConvertHalfFloatKernel = clCreateKernel(cpProgram, "kernel_ocl_convert_to_half_float", &ciErr); if(opencl_error(ciErr)) return false; @@ -788,8 +794,10 @@ public: if(ckPathTraceKernel) clReleaseKernel(ckPathTraceKernel); - if(ckFilmConvertKernel) - clReleaseKernel(ckFilmConvertKernel); + if(ckFilmConvertByteKernel) + clReleaseKernel(ckFilmConvertByteKernel); + if(ckFilmConvertHalfFloatKernel) + clReleaseKernel(ckFilmConvertHalfFloatKernel); if(cpProgram) clReleaseProgram(cpProgram); if(cqCommandQueue) @@ -980,17 +988,17 @@ public: return err; } - void tonemap(DeviceTask& task, device_ptr buffer, device_ptr rgba) + void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half) { /* cast arguments to cl types */ cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); - cl_mem d_rgba = CL_MEM_PTR(rgba); + cl_mem d_rgba = (rgba_byte)? CL_MEM_PTR(rgba_byte): CL_MEM_PTR(rgba_half); cl_mem d_buffer = CL_MEM_PTR(buffer); cl_int d_x = task.x; cl_int d_y = task.y; cl_int d_w = task.w; cl_int d_h = task.h; - cl_int d_sample = task.sample; + cl_float d_sample_scale = 1.0f/(task.sample + 1); cl_int d_offset = task.offset; cl_int d_stride = task.stride; @@ -998,6 +1006,8 @@ public: cl_uint narg = 0; ciErr = 0; + cl_kernel ckFilmConvertKernel = (rgba_byte)? ckFilmConvertByteKernel: ckFilmConvertHalfFloatKernel; + ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_data), (void*)&d_data); ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_rgba), (void*)&d_rgba); ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_buffer), (void*)&d_buffer); @@ -1006,7 +1016,7 @@ public: ciErr |= set_kernel_arg_mem(ckFilmConvertKernel, &narg, #name); #include "kernel_textures.h" - ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_sample), (void*)&d_sample); + ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_sample_scale), (void*)&d_sample_scale); ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_x), (void*)&d_x); ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_y), (void*)&d_y); ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_w), (void*)&d_w); @@ -1052,8 +1062,8 @@ public: void thread_run(DeviceTask *task) { - if(task->type == DeviceTask::TONEMAP) { - tonemap(*task, task->buffer, task->rgba); + if(task->type == DeviceTask::FILM_CONVERT) { + film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half); } else if(task->type == DeviceTask::SHADER) { shader(*task); diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp index 8c1e2920635..7d0eeab780d 100644 --- a/intern/cycles/device/device_task.cpp +++ b/intern/cycles/device/device_task.cpp @@ -27,7 +27,7 @@ CCL_NAMESPACE_BEGIN /* Device Task */ DeviceTask::DeviceTask(Type type_) -: type(type_), x(0), y(0), w(0), h(0), rgba(0), buffer(0), +: type(type_), x(0), y(0), w(0), h(0), rgba_byte(0), rgba_half(0), buffer(0), sample(0), num_samples(1), shader_input(0), shader_output(0), shader_eval_type(0), shader_x(0), shader_w(0) diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h index e232e128827..c1bd39b70ca 100644 --- a/intern/cycles/device/device_task.h +++ b/intern/cycles/device/device_task.h @@ -34,11 +34,12 @@ class Tile; class DeviceTask : public Task { public: - typedef enum { PATH_TRACE, TONEMAP, SHADER } Type; + typedef enum { PATH_TRACE, FILM_CONVERT, SHADER } Type; Type type; int x, y, w, h; - device_ptr rgba; + device_ptr rgba_byte; + device_ptr rgba_half; device_ptr buffer; int sample; int num_samples; diff --git a/intern/cycles/kernel/kernel.cl b/intern/cycles/kernel/kernel.cl index dd8ffdd2b33..28e72d78731 100644 --- a/intern/cycles/kernel/kernel.cl +++ b/intern/cycles/kernel/kernel.cl @@ -52,7 +52,7 @@ __kernel void kernel_ocl_path_trace( kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); } -__kernel void kernel_ocl_tonemap( +__kernel void kernel_ocl_convert_to_byte( __constant KernelData *data, __global uchar4 *rgba, __global float *buffer, @@ -61,7 +61,34 @@ __kernel void kernel_ocl_tonemap( __global type *name, #include "kernel_textures.h" - int sample, + float sample_scale, + int sx, int sy, int sw, int sh, int offset, int stride) +{ + KernelGlobals kglobals, *kg = &kglobals; + + kg->data = data; + +#define KERNEL_TEX(type, ttype, name) \ + kg->name = name; +#include "kernel_textures.h" + + int x = sx + get_global_id(0); + int y = sy + get_global_id(1); + + if(x < sx + sw && y < sy + sh) + kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride); +} + +__kernel void kernel_ocl_convert_to_half_float( + __constant KernelData *data, + __global uchar4 *rgba, + __global float *buffer, + +#define KERNEL_TEX(type, ttype, name) \ + __global type *name, +#include "kernel_textures.h" + + float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) { KernelGlobals kglobals, *kg = &kglobals; @@ -76,7 +103,7 @@ __kernel void kernel_ocl_tonemap( int y = sy + get_global_id(1); if(x < sx + sw && y < sy + sh) - kernel_film_tonemap(kg, rgba, buffer, sample, x, y, offset, stride); + kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride); } __kernel void kernel_ocl_shader( diff --git a/intern/cycles/kernel/kernel.cpp b/intern/cycles/kernel/kernel.cpp index 3f357763a8f..3e2727fde9a 100644 --- a/intern/cycles/kernel/kernel.cpp +++ b/intern/cycles/kernel/kernel.cpp @@ -96,11 +96,16 @@ void kernel_cpu_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_s kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); } -/* Tonemapping */ +/* Film */ -void kernel_cpu_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer, int sample, int x, int y, int offset, int stride) +void kernel_cpu_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride) { - kernel_film_tonemap(kg, rgba, buffer, sample, x, y, offset, stride); + kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride); +} + +void kernel_cpu_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride) +{ + kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride); } /* Shader Evaluation */ diff --git a/intern/cycles/kernel/kernel.cu b/intern/cycles/kernel/kernel.cu index c4da1d440b7..5e6748c66fc 100644 --- a/intern/cycles/kernel/kernel.cu +++ b/intern/cycles/kernel/kernel.cu @@ -44,13 +44,22 @@ extern "C" __global__ void kernel_cuda_branched_path_trace(float *buffer, uint * } #endif -extern "C" __global__ void kernel_cuda_tonemap(uchar4 *rgba, float *buffer, int sample, int sx, int sy, int sw, int sh, int offset, int stride) +extern "C" __global__ void kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) { int x = sx + blockDim.x*blockIdx.x + threadIdx.x; int y = sy + blockDim.y*blockIdx.y + threadIdx.y; if(x < sx + sw && y < sy + sh) - kernel_film_tonemap(NULL, rgba, buffer, sample, x, y, offset, stride); + kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride); +} + +extern "C" __global__ void kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) +{ + int x = sx + blockDim.x*blockIdx.x + threadIdx.x; + int y = sy + blockDim.y*blockIdx.y + threadIdx.y; + + if(x < sx + sw && y < sy + sh) + kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride); } extern "C" __global__ void kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx) diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h index 6efc28ed2af..361f5b0856d 100644 --- a/intern/cycles/kernel/kernel.h +++ b/intern/cycles/kernel/kernel.h @@ -36,23 +36,29 @@ void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t void kernel_cpu_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride); -void kernel_cpu_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer, - int sample, int x, int y, int offset, int stride); +void kernel_cpu_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, + float sample_scale, int x, int y, int offset, int stride); +void kernel_cpu_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, + float sample_scale, int x, int y, int offset, int stride); void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i); #ifdef WITH_OPTIMIZED_KERNEL void kernel_cpu_sse2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride); -void kernel_cpu_sse2_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer, - int sample, int x, int y, int offset, int stride); +void kernel_cpu_sse2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, + float sample_scale, int x, int y, int offset, int stride); +void kernel_cpu_sse2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, + float sample_scale, int x, int y, int offset, int stride); void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i); void kernel_cpu_sse3_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride); -void kernel_cpu_sse3_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer, - int sample, int x, int y, int offset, int stride); +void kernel_cpu_sse3_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, + float sample_scale, int x, int y, int offset, int stride); +void kernel_cpu_sse3_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, + float sample_scale, int x, int y, int offset, int stride); void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i); #endif diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index a6bf4a96975..cb86ce8c4ae 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -25,8 +25,6 @@ #include #include -#include "util_types.h" - /* Qualifier wrappers for different names on different devices */ #define __device __device__ __inline__ @@ -41,6 +39,10 @@ #define kernel_assert(cond) +/* Types */ + +#include "util_types.h" + /* Textures */ typedef texture texture_float4; diff --git a/intern/cycles/kernel/kernel_film.h b/intern/cycles/kernel/kernel_film.h index ba2149cc709..3ef33a2703b 100644 --- a/intern/cycles/kernel/kernel_film.h +++ b/intern/cycles/kernel/kernel_film.h @@ -16,9 +16,8 @@ CCL_NAMESPACE_BEGIN -__device float4 film_map(KernelGlobals *kg, float4 irradiance, int sample) +__device float4 film_map(KernelGlobals *kg, float4 irradiance, float scale) { - float scale = 1.0f/(float)(sample+1); float exposure = kernel_data.film.exposure; float4 result = irradiance*scale; @@ -46,9 +45,9 @@ __device uchar4 film_float_to_byte(float4 color) return result; } -__device void kernel_film_tonemap(KernelGlobals *kg, +__device void kernel_film_convert_to_byte(KernelGlobals *kg, __global uchar4 *rgba, __global float *buffer, - int sample, int x, int y, int offset, int stride) + float sample_scale, int x, int y, int offset, int stride) { /* buffer offset */ int index = offset + x + y*stride; @@ -58,11 +57,25 @@ __device void kernel_film_tonemap(KernelGlobals *kg, /* map colors */ float4 irradiance = *((__global float4*)buffer); - float4 float_result = film_map(kg, irradiance, sample); + float4 float_result = film_map(kg, irradiance, sample_scale); uchar4 byte_result = film_float_to_byte(float_result); *rgba = byte_result; } +__device void kernel_film_convert_to_half_float(KernelGlobals *kg, + __global uchar4 *rgba, __global float *buffer, + float sample_scale, int x, int y, int offset, int stride) +{ + /* buffer offset */ + int index = offset + x + y*stride; + + float4 *in = (__global float4*)(buffer + index*kernel_data.film.pass_stride); + half *out = (half*)rgba + index*4; + float scale = kernel_data.film.exposure*sample_scale; + + float4_store_half(out, in, scale); +} + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_sse2.cpp b/intern/cycles/kernel/kernel_sse2.cpp index e0413ddf445..862626d6899 100644 --- a/intern/cycles/kernel/kernel_sse2.cpp +++ b/intern/cycles/kernel/kernel_sse2.cpp @@ -45,11 +45,16 @@ void kernel_cpu_sse2_path_trace(KernelGlobals *kg, float *buffer, unsigned int * kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); } -/* Tonemapping */ +/* Film */ -void kernel_cpu_sse2_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer, int sample, int x, int y, int offset, int stride) +void kernel_cpu_sse2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride) { - kernel_film_tonemap(kg, rgba, buffer, sample, x, y, offset, stride); + kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride); +} + +void kernel_cpu_sse2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride) +{ + kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride); } /* Shader Evaluate */ diff --git a/intern/cycles/kernel/kernel_sse3.cpp b/intern/cycles/kernel/kernel_sse3.cpp index 86f4705ca18..c44098606a5 100644 --- a/intern/cycles/kernel/kernel_sse3.cpp +++ b/intern/cycles/kernel/kernel_sse3.cpp @@ -47,11 +47,16 @@ void kernel_cpu_sse3_path_trace(KernelGlobals *kg, float *buffer, unsigned int * kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); } -/* Tonemapping */ +/* Film */ -void kernel_cpu_sse3_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer, int sample, int x, int y, int offset, int stride) +void kernel_cpu_sse3_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride) { - kernel_film_tonemap(kg, rgba, buffer, sample, x, y, offset, stride); + kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride); +} + +void kernel_cpu_sse3_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride) +{ + kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride); } /* Shader Evaluate */ diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp index e0bc3f40c4e..5fb648cec5f 100644 --- a/intern/cycles/render/buffers.cpp +++ b/intern/cycles/render/buffers.cpp @@ -91,7 +91,6 @@ RenderTile::RenderTile() buffer = 0; rng_state = 0; - rgba = 0; buffers = NULL; } @@ -298,12 +297,13 @@ bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int /* Display Buffer */ -DisplayBuffer::DisplayBuffer(Device *device_) +DisplayBuffer::DisplayBuffer(Device *device_, bool linear) { device = device_; draw_width = 0; draw_height = 0; transparent = true; /* todo: determine from background */ + half_float = linear; } DisplayBuffer::~DisplayBuffer() @@ -313,9 +313,13 @@ DisplayBuffer::~DisplayBuffer() void DisplayBuffer::device_free() { - if(rgba.device_pointer) { - device->pixels_free(rgba); - rgba.clear(); + if(rgba_byte.device_pointer) { + device->pixels_free(rgba_byte); + rgba_byte.clear(); + } + if(rgba_half.device_pointer) { + device->pixels_free(rgba_half); + rgba_half.clear(); } } @@ -330,8 +334,14 @@ void DisplayBuffer::reset(Device *device, BufferParams& params_) device_free(); /* allocate display pixels */ - rgba.resize(params.width, params.height); - device->pixels_alloc(rgba); + if(half_float) { + rgba_half.resize(params.width, params.height); + device->pixels_alloc(rgba_half); + } + else { + rgba_byte.resize(params.width, params.height); + device->pixels_alloc(rgba_byte); + } } void DisplayBuffer::draw_set(int width, int height) @@ -347,6 +357,7 @@ void DisplayBuffer::draw(Device *device) if(draw_width != 0 && draw_height != 0) { glPushMatrix(); glTranslatef(params.full_x, params.full_y, 0.0f); + device_memory& rgba = rgba_data(); device->draw_pixels(rgba, 0, draw_width, draw_height, 0, params.width, params.height, transparent); @@ -366,8 +377,12 @@ void DisplayBuffer::write(Device *device, const string& filename) if(w == 0 || h == 0) return; + + if(half_float) + return; /* read buffer from device */ + device_memory& rgba = rgba_data(); device->pixels_copy_from(rgba, 0, w, h); /* write image */ @@ -389,5 +404,13 @@ void DisplayBuffer::write(Device *device, const string& filename) delete out; } +device_memory& DisplayBuffer::rgba_data() +{ + if(half_float) + return rgba_half; + else + return rgba_byte; +} + CCL_NAMESPACE_END diff --git a/intern/cycles/render/buffers.h b/intern/cycles/render/buffers.h index 2936a224d43..0b1f9010e75 100644 --- a/intern/cycles/render/buffers.h +++ b/intern/cycles/render/buffers.h @@ -87,8 +87,8 @@ protected: /* Display Buffer * - * The buffer used for drawing during render, filled by tonemapping the render - * buffers and converting to uchar4 storage. */ + * The buffer used for drawing during render, filled by converting the render + * buffers to byte of half float storage */ class DisplayBuffer { public: @@ -100,10 +100,13 @@ public: int draw_width, draw_height; /* draw alpha channel? */ bool transparent; - /* byte buffer for tonemapped result */ - device_vector rgba; + /* use half float? */ + bool half_float; + /* byte buffer for converted result */ + device_vector rgba_byte; + device_vector rgba_half; - DisplayBuffer(Device *device); + DisplayBuffer(Device *device, bool linear = false); ~DisplayBuffer(); void reset(Device *device, BufferParams& params); @@ -113,6 +116,8 @@ public: void draw(Device *device); bool draw_ready(); + device_memory& rgba_data(); + protected: void device_free(); @@ -134,7 +139,6 @@ public: device_ptr buffer; device_ptr rng_state; - device_ptr rgba; RenderBuffers *buffers; diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp index 01f8a950c8f..d18223d7ab3 100644 --- a/intern/cycles/render/session.cpp +++ b/intern/cycles/render/session.cpp @@ -56,7 +56,7 @@ Session::Session(const SessionParams& params_) } else { buffers = new RenderBuffers(device); - display = new DisplayBuffer(device); + display = new DisplayBuffer(device, params.display_buffer_linear); } session_thread = NULL; @@ -371,7 +371,6 @@ bool Session::acquire_tile(Device *tile_device, RenderTile& rtile) rtile.buffer = buffers->buffer.device_pointer; rtile.rng_state = buffers->rng_state.device_pointer; - rtile.rgba = display->rgba.device_pointer; rtile.buffers = buffers; device->map_tile(tile_device, rtile); @@ -415,7 +414,6 @@ bool Session::acquire_tile(Device *tile_device, RenderTile& rtile) rtile.buffer = tilebuffers->buffer.device_pointer; rtile.rng_state = tilebuffers->rng_state.device_pointer; - rtile.rgba = 0; rtile.buffers = tilebuffers; /* this will tag tile as IN PROGRESS in blender-side render pipeline, @@ -838,13 +836,14 @@ void Session::path_trace() void Session::tonemap() { /* add tonemap task */ - DeviceTask task(DeviceTask::TONEMAP); + DeviceTask task(DeviceTask::FILM_CONVERT); task.x = tile_manager.state.buffer.full_x; task.y = tile_manager.state.buffer.full_y; task.w = tile_manager.state.buffer.width; task.h = tile_manager.state.buffer.height; - task.rgba = display->rgba.device_pointer; + task.rgba_byte = display->rgba_byte.device_pointer; + task.rgba_half = display->rgba_half.device_pointer; task.buffer = buffers->buffer.device_pointer; task.sample = tile_manager.state.sample; tile_manager.state.buffer.get_offset_stride(task.offset, task.stride); diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h index 0874bfee780..8cff64def4e 100644 --- a/intern/cycles/render/session.h +++ b/intern/cycles/render/session.h @@ -53,6 +53,8 @@ public: int start_resolution; int threads; + bool display_buffer_linear; + double cancel_timeout; double reset_timeout; double text_timeout; @@ -72,6 +74,8 @@ public: start_resolution = INT_MAX; threads = 0; + display_buffer_linear = false; + cancel_timeout = 0.1; reset_timeout = 0.1; text_timeout = 1.0; @@ -91,6 +95,7 @@ public: && tile_size == params.tile_size && start_resolution == params.start_resolution && threads == params.threads + && display_buffer_linear == params.display_buffer_linear && cancel_timeout == params.cancel_timeout && reset_timeout == params.reset_timeout && text_timeout == params.text_timeout diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h index 758f39a76b2..f48fd1e124b 100644 --- a/intern/cycles/util/util_types.h +++ b/intern/cycles/util/util_types.h @@ -541,6 +541,70 @@ template __device_inline const __m12 } #endif +/* Half Floats */ + +#ifdef __KERNEL_OPENCL__ + +__device_inline void float4_store_half(half *h, const float4 *f, float scale) +{ + vstore_half4(*f * scale, 0, h); +} + +#else + +typedef unsigned short half; +struct half4 { half x, y, z, w; }; + +#ifdef __KERNEL_CUDA__ + +__device_inline void float4_store_half(half *h, const float4 *f, float scale) +{ + h[0] = __float2half_rn(f->x * scale); + h[1] = __float2half_rn(f->y * scale); + h[2] = __float2half_rn(f->z * scale); + h[3] = __float2half_rn(f->w * scale); +} + +#else + +__device_inline void float4_store_half(half *h, const float4 *f, float scale) +{ +#ifndef __KERNEL_SSE2__ + for(int i = 0; i < 4; i++) { + /* optimized float to half for pixels: + * assumes no negative, no nan, no inf, and sets denormal to 0 */ + union { uint i; float f; } in; + in.f = ((*f)[i] > 0.0f)? (*f)[i] * scale: 0.0f; + int x = in.i; + + int absolute = x & 0x7FFFFFFF; + int Z = absolute + 0xC8000000; + int result = (absolute < 0x38800000)? 0: Z; + + h[i] = ((result >> 13) & 0x7FFF); + } +#else + /* same as above with SSE */ + const __m128 mm_scale = _mm_set_ps1(scale); + const __m128i mm_38800000 = _mm_set1_epi32(0x38800000); + const __m128i mm_7FFF = _mm_set1_epi32(0x7FFF); + const __m128i mm_7FFFFFFF = _mm_set1_epi32(0x7FFFFFFF); + const __m128i mm_C8000000 = _mm_set1_epi32(0xC8000000); + + __m128i x = _mm_castps_si128(_mm_max_ps(_mm_mul_ps(*(__m128*)f, mm_scale), _mm_set_ps1(0.0f))); + __m128i absolute = _mm_and_si128(x, mm_7FFFFFFF); + __m128i Z = _mm_add_epi32(absolute, mm_C8000000); + __m128i result = _mm_andnot_si128(_mm_cmplt_epi32(absolute, mm_38800000), Z); + __m128i rh = _mm_and_si128(_mm_srai_epi32(result, 13), mm_7FFF); + + _mm_storel_pi((__m64*)h, _mm_castsi128_ps(_mm_packs_epi32(rh, rh))); +#endif +} + +#endif + +#endif + CCL_NAMESPACE_END #endif /* __UTIL_TYPES_H__ */ -- cgit v1.2.3