diff options
author | Timo Rothenpieler <timo@rothenpieler.org> | 2020-11-03 20:33:55 +0300 |
---|---|---|
committer | Timo Rothenpieler <timo@rothenpieler.org> | 2020-11-03 21:58:13 +0300 |
commit | 15c0e038ce90c3c1e13e80ea4fcf56c327b686f4 (patch) | |
tree | 1d3c607ba475e54ed7bb258c03903346cad101e6 /libavfilter/vf_scale_cuda.cu | |
parent | f1d0f83712470c0fef13b8215cccbdb77ba7f3bf (diff) |
avfilter/scale_cuda: code cleanup
Diffstat (limited to 'libavfilter/vf_scale_cuda.cu')
-rw-r--r-- | libavfilter/vf_scale_cuda.cu | 194 |
1 files changed, 32 insertions, 162 deletions
diff --git a/libavfilter/vf_scale_cuda.cu b/libavfilter/vf_scale_cuda.cu index 3f3f40546d..24b1151215 100644 --- a/libavfilter/vf_scale_cuda.cu +++ b/libavfilter/vf_scale_cuda.cu @@ -20,12 +20,14 @@ * DEALINGS IN THE SOFTWARE. */ -extern "C" { +#include "cuda/vector_helpers.cuh" -__global__ void Subsample_Bilinear_uchar(cudaTextureObject_t uchar_tex, - unsigned char *dst, - int dst_width, int dst_height, int dst_pitch, - int src_width, int src_height) +template<typename T> +__device__ inline void Subsample_Bilinear(cudaTextureObject_t tex, + T *dst, + int dst_width, int dst_height, int dst_pitch, + int src_width, int src_height, + int bit_depth) { int xo = blockIdx.x * blockDim.x + threadIdx.x; int yo = blockIdx.y * blockDim.y + threadIdx.y; @@ -42,170 +44,38 @@ __global__ void Subsample_Bilinear_uchar(cudaTextureObject_t uchar_tex, // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh} float dx = wh / (0.5f + wh); float dy = wv / (0.5f + wv); - int y0 = tex2D<unsigned char>(uchar_tex, xi-dx, yi-dy); - int y1 = tex2D<unsigned char>(uchar_tex, xi+dx, yi-dy); - int y2 = tex2D<unsigned char>(uchar_tex, xi-dx, yi+dy); - int y3 = tex2D<unsigned char>(uchar_tex, xi+dx, yi+dy); - dst[yo*dst_pitch+xo] = (unsigned char)((y0+y1+y2+y3+2) >> 2); - } -} - -__global__ void Subsample_Bilinear_uchar2(cudaTextureObject_t uchar2_tex, - uchar2 *dst, - int dst_width, int dst_height, int dst_pitch2, - int src_width, int src_height) -{ - int xo = blockIdx.x * blockDim.x + threadIdx.x; - int yo = blockIdx.y * blockDim.y + threadIdx.y; - - if (yo < dst_height && xo < dst_width) - { - float hscale = (float)src_width / (float)dst_width; - float vscale = (float)src_height / (float)dst_height; - float xi = (xo + 0.5f) * hscale; - float yi = (yo + 0.5f) * vscale; - // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv} - float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f); - float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f); - // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh} - float dx = wh / (0.5f + wh); - float dy = wv / (0.5f + wv); - uchar2 c0 = tex2D<uchar2>(uchar2_tex, xi-dx, yi-dy); - uchar2 c1 = tex2D<uchar2>(uchar2_tex, xi+dx, yi-dy); - uchar2 c2 = tex2D<uchar2>(uchar2_tex, xi-dx, yi+dy); - uchar2 c3 = tex2D<uchar2>(uchar2_tex, xi+dx, yi+dy); - int2 uv; - uv.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2; - uv.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2; - dst[yo*dst_pitch2+xo] = make_uchar2((unsigned char)uv.x, (unsigned char)uv.y); - } -} - -__global__ void Subsample_Bilinear_uchar4(cudaTextureObject_t uchar4_tex, - uchar4 *dst, - int dst_width, int dst_height, int dst_pitch, - int src_width, int src_height) -{ - int xo = blockIdx.x * blockDim.x + threadIdx.x; - int yo = blockIdx.y * blockDim.y + threadIdx.y; - if (yo < dst_height && xo < dst_width) - { - float hscale = (float)src_width / (float)dst_width; - float vscale = (float)src_height / (float)dst_height; - float xi = (xo + 0.5f) * hscale; - float yi = (yo + 0.5f) * vscale; - // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv} - float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f); - float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f); - // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh} - float dx = wh / (0.5f + wh); - float dy = wv / (0.5f + wv); - uchar4 c0 = tex2D<uchar4>(uchar4_tex, xi-dx, yi-dy); - uchar4 c1 = tex2D<uchar4>(uchar4_tex, xi+dx, yi-dy); - uchar4 c2 = tex2D<uchar4>(uchar4_tex, xi-dx, yi+dy); - uchar4 c3 = tex2D<uchar4>(uchar4_tex, xi+dx, yi+dy); - int4 res; - res.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2; - res.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2; - res.z = ((int)c0.z+(int)c1.z+(int)c2.z+(int)c3.z+2) >> 2; - res.w = ((int)c0.w+(int)c1.w+(int)c2.w+(int)c3.w+2) >> 2; - dst[yo*dst_pitch+xo] = make_uchar4( - (unsigned char)res.x, (unsigned char)res.y, (unsigned char)res.z, (unsigned char)res.w); + intT r = { 0 }; + vec_set_scalar(r, 2); + r += tex2D<T>(tex, xi - dx, yi - dy); + r += tex2D<T>(tex, xi + dx, yi - dy); + r += tex2D<T>(tex, xi - dx, yi + dy); + r += tex2D<T>(tex, xi + dx, yi + dy); + vec_set(dst[yo*dst_pitch+xo], r >> 2); } } -__global__ void Subsample_Bilinear_ushort(cudaTextureObject_t ushort_tex, - unsigned short *dst, - int dst_width, int dst_height, int dst_pitch, - int src_width, int src_height) -{ - int xo = blockIdx.x * blockDim.x + threadIdx.x; - int yo = blockIdx.y * blockDim.y + threadIdx.y; - - if (yo < dst_height && xo < dst_width) - { - float hscale = (float)src_width / (float)dst_width; - float vscale = (float)src_height / (float)dst_height; - float xi = (xo + 0.5f) * hscale; - float yi = (yo + 0.5f) * vscale; - // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv} - float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f); - float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f); - // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh} - float dx = wh / (0.5f + wh); - float dy = wv / (0.5f + wv); - int y0 = tex2D<unsigned short>(ushort_tex, xi-dx, yi-dy); - int y1 = tex2D<unsigned short>(ushort_tex, xi+dx, yi-dy); - int y2 = tex2D<unsigned short>(ushort_tex, xi-dx, yi+dy); - int y3 = tex2D<unsigned short>(ushort_tex, xi+dx, yi+dy); - dst[yo*dst_pitch+xo] = (unsigned short)((y0+y1+y2+y3+2) >> 2); - } -} - -__global__ void Subsample_Bilinear_ushort2(cudaTextureObject_t ushort2_tex, - ushort2 *dst, - int dst_width, int dst_height, int dst_pitch2, - int src_width, int src_height) -{ - int xo = blockIdx.x * blockDim.x + threadIdx.x; - int yo = blockIdx.y * blockDim.y + threadIdx.y; +extern "C" { - if (yo < dst_height && xo < dst_width) - { - float hscale = (float)src_width / (float)dst_width; - float vscale = (float)src_height / (float)dst_height; - float xi = (xo + 0.5f) * hscale; - float yi = (yo + 0.5f) * vscale; - // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv} - float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f); - float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f); - // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh} - float dx = wh / (0.5f + wh); - float dy = wv / (0.5f + wv); - ushort2 c0 = tex2D<ushort2>(ushort2_tex, xi-dx, yi-dy); - ushort2 c1 = tex2D<ushort2>(ushort2_tex, xi+dx, yi-dy); - ushort2 c2 = tex2D<ushort2>(ushort2_tex, xi-dx, yi+dy); - ushort2 c3 = tex2D<ushort2>(ushort2_tex, xi+dx, yi+dy); - int2 uv; - uv.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2; - uv.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2; - dst[yo*dst_pitch2+xo] = make_ushort2((unsigned short)uv.x, (unsigned short)uv.y); +#define BILINEAR_KERNEL(T) \ + __global__ void Subsample_Bilinear_ ## T(cudaTextureObject_t src_tex, \ + T *dst, \ + int dst_width, int dst_height, int dst_pitch, \ + int src_width, int src_height, \ + int bit_depth) \ + { \ + Subsample_Bilinear<T>(src_tex, dst, \ + dst_width, dst_height, dst_pitch, \ + src_width, src_height, \ + bit_depth); \ } -} -__global__ void Subsample_Bilinear_ushort4(cudaTextureObject_t ushort4_tex, - ushort4 *dst, - int dst_width, int dst_height, int dst_pitch, - int src_width, int src_height) -{ - int xo = blockIdx.x * blockDim.x + threadIdx.x; - int yo = blockIdx.y * blockDim.y + threadIdx.y; +BILINEAR_KERNEL(uchar) +BILINEAR_KERNEL(uchar2) +BILINEAR_KERNEL(uchar4) - if (yo < dst_height && xo < dst_width) - { - float hscale = (float)src_width / (float)dst_width; - float vscale = (float)src_height / (float)dst_height; - float xi = (xo + 0.5f) * hscale; - float yi = (yo + 0.5f) * vscale; - // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv} - float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f); - float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f); - // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh} - float dx = wh / (0.5f + wh); - float dy = wv / (0.5f + wv); - ushort4 c0 = tex2D<ushort4>(ushort4_tex, xi-dx, yi-dy); - ushort4 c1 = tex2D<ushort4>(ushort4_tex, xi+dx, yi-dy); - ushort4 c2 = tex2D<ushort4>(ushort4_tex, xi-dx, yi+dy); - ushort4 c3 = tex2D<ushort4>(ushort4_tex, xi+dx, yi+dy); - int4 res; - res.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2; - res.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2; - res.z = ((int)c0.z+(int)c1.z+(int)c2.z+(int)c3.z+2) >> 2; - res.w = ((int)c0.w+(int)c1.w+(int)c2.w+(int)c3.w+2) >> 2; - dst[yo*dst_pitch+xo] = make_ushort4( - (unsigned short)res.x, (unsigned short)res.y, (unsigned short)res.z, (unsigned short)res.w); - } -} +BILINEAR_KERNEL(ushort) +BILINEAR_KERNEL(ushort2) +BILINEAR_KERNEL(ushort4) } |