Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/FFmpeg/FFmpeg.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTimo Rothenpieler <timo@rothenpieler.org>2020-11-03 20:33:55 +0300
committerTimo Rothenpieler <timo@rothenpieler.org>2020-11-03 21:58:13 +0300
commit15c0e038ce90c3c1e13e80ea4fcf56c327b686f4 (patch)
tree1d3c607ba475e54ed7bb258c03903346cad101e6 /libavfilter/vf_scale_cuda.cu
parentf1d0f83712470c0fef13b8215cccbdb77ba7f3bf (diff)
avfilter/scale_cuda: code cleanup
Diffstat (limited to 'libavfilter/vf_scale_cuda.cu')
-rw-r--r--libavfilter/vf_scale_cuda.cu194
1 files changed, 32 insertions, 162 deletions
diff --git a/libavfilter/vf_scale_cuda.cu b/libavfilter/vf_scale_cuda.cu
index 3f3f40546d..24b1151215 100644
--- a/libavfilter/vf_scale_cuda.cu
+++ b/libavfilter/vf_scale_cuda.cu
@@ -20,12 +20,14 @@
* DEALINGS IN THE SOFTWARE.
*/
-extern "C" {
+#include "cuda/vector_helpers.cuh"
-__global__ void Subsample_Bilinear_uchar(cudaTextureObject_t uchar_tex,
- unsigned char *dst,
- int dst_width, int dst_height, int dst_pitch,
- int src_width, int src_height)
+template<typename T>
+__device__ inline void Subsample_Bilinear(cudaTextureObject_t tex,
+ T *dst,
+ int dst_width, int dst_height, int dst_pitch,
+ int src_width, int src_height,
+ int bit_depth)
{
int xo = blockIdx.x * blockDim.x + threadIdx.x;
int yo = blockIdx.y * blockDim.y + threadIdx.y;
@@ -42,170 +44,38 @@ __global__ void Subsample_Bilinear_uchar(cudaTextureObject_t uchar_tex,
// Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh}
float dx = wh / (0.5f + wh);
float dy = wv / (0.5f + wv);
- int y0 = tex2D<unsigned char>(uchar_tex, xi-dx, yi-dy);
- int y1 = tex2D<unsigned char>(uchar_tex, xi+dx, yi-dy);
- int y2 = tex2D<unsigned char>(uchar_tex, xi-dx, yi+dy);
- int y3 = tex2D<unsigned char>(uchar_tex, xi+dx, yi+dy);
- dst[yo*dst_pitch+xo] = (unsigned char)((y0+y1+y2+y3+2) >> 2);
- }
-}
-
-__global__ void Subsample_Bilinear_uchar2(cudaTextureObject_t uchar2_tex,
- uchar2 *dst,
- int dst_width, int dst_height, int dst_pitch2,
- int src_width, int src_height)
-{
- int xo = blockIdx.x * blockDim.x + threadIdx.x;
- int yo = blockIdx.y * blockDim.y + threadIdx.y;
-
- if (yo < dst_height && xo < dst_width)
- {
- float hscale = (float)src_width / (float)dst_width;
- float vscale = (float)src_height / (float)dst_height;
- float xi = (xo + 0.5f) * hscale;
- float yi = (yo + 0.5f) * vscale;
- // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv}
- float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
- float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);
- // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh}
- float dx = wh / (0.5f + wh);
- float dy = wv / (0.5f + wv);
- uchar2 c0 = tex2D<uchar2>(uchar2_tex, xi-dx, yi-dy);
- uchar2 c1 = tex2D<uchar2>(uchar2_tex, xi+dx, yi-dy);
- uchar2 c2 = tex2D<uchar2>(uchar2_tex, xi-dx, yi+dy);
- uchar2 c3 = tex2D<uchar2>(uchar2_tex, xi+dx, yi+dy);
- int2 uv;
- uv.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2;
- uv.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2;
- dst[yo*dst_pitch2+xo] = make_uchar2((unsigned char)uv.x, (unsigned char)uv.y);
- }
-}
-
-__global__ void Subsample_Bilinear_uchar4(cudaTextureObject_t uchar4_tex,
- uchar4 *dst,
- int dst_width, int dst_height, int dst_pitch,
- int src_width, int src_height)
-{
- int xo = blockIdx.x * blockDim.x + threadIdx.x;
- int yo = blockIdx.y * blockDim.y + threadIdx.y;
- if (yo < dst_height && xo < dst_width)
- {
- float hscale = (float)src_width / (float)dst_width;
- float vscale = (float)src_height / (float)dst_height;
- float xi = (xo + 0.5f) * hscale;
- float yi = (yo + 0.5f) * vscale;
- // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv}
- float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
- float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);
- // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh}
- float dx = wh / (0.5f + wh);
- float dy = wv / (0.5f + wv);
- uchar4 c0 = tex2D<uchar4>(uchar4_tex, xi-dx, yi-dy);
- uchar4 c1 = tex2D<uchar4>(uchar4_tex, xi+dx, yi-dy);
- uchar4 c2 = tex2D<uchar4>(uchar4_tex, xi-dx, yi+dy);
- uchar4 c3 = tex2D<uchar4>(uchar4_tex, xi+dx, yi+dy);
- int4 res;
- res.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2;
- res.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2;
- res.z = ((int)c0.z+(int)c1.z+(int)c2.z+(int)c3.z+2) >> 2;
- res.w = ((int)c0.w+(int)c1.w+(int)c2.w+(int)c3.w+2) >> 2;
- dst[yo*dst_pitch+xo] = make_uchar4(
- (unsigned char)res.x, (unsigned char)res.y, (unsigned char)res.z, (unsigned char)res.w);
+ intT r = { 0 };
+ vec_set_scalar(r, 2);
+ r += tex2D<T>(tex, xi - dx, yi - dy);
+ r += tex2D<T>(tex, xi + dx, yi - dy);
+ r += tex2D<T>(tex, xi - dx, yi + dy);
+ r += tex2D<T>(tex, xi + dx, yi + dy);
+ vec_set(dst[yo*dst_pitch+xo], r >> 2);
}
}
-__global__ void Subsample_Bilinear_ushort(cudaTextureObject_t ushort_tex,
- unsigned short *dst,
- int dst_width, int dst_height, int dst_pitch,
- int src_width, int src_height)
-{
- int xo = blockIdx.x * blockDim.x + threadIdx.x;
- int yo = blockIdx.y * blockDim.y + threadIdx.y;
-
- if (yo < dst_height && xo < dst_width)
- {
- float hscale = (float)src_width / (float)dst_width;
- float vscale = (float)src_height / (float)dst_height;
- float xi = (xo + 0.5f) * hscale;
- float yi = (yo + 0.5f) * vscale;
- // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv}
- float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
- float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);
- // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh}
- float dx = wh / (0.5f + wh);
- float dy = wv / (0.5f + wv);
- int y0 = tex2D<unsigned short>(ushort_tex, xi-dx, yi-dy);
- int y1 = tex2D<unsigned short>(ushort_tex, xi+dx, yi-dy);
- int y2 = tex2D<unsigned short>(ushort_tex, xi-dx, yi+dy);
- int y3 = tex2D<unsigned short>(ushort_tex, xi+dx, yi+dy);
- dst[yo*dst_pitch+xo] = (unsigned short)((y0+y1+y2+y3+2) >> 2);
- }
-}
-
-__global__ void Subsample_Bilinear_ushort2(cudaTextureObject_t ushort2_tex,
- ushort2 *dst,
- int dst_width, int dst_height, int dst_pitch2,
- int src_width, int src_height)
-{
- int xo = blockIdx.x * blockDim.x + threadIdx.x;
- int yo = blockIdx.y * blockDim.y + threadIdx.y;
+extern "C" {
- if (yo < dst_height && xo < dst_width)
- {
- float hscale = (float)src_width / (float)dst_width;
- float vscale = (float)src_height / (float)dst_height;
- float xi = (xo + 0.5f) * hscale;
- float yi = (yo + 0.5f) * vscale;
- // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv}
- float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
- float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);
- // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh}
- float dx = wh / (0.5f + wh);
- float dy = wv / (0.5f + wv);
- ushort2 c0 = tex2D<ushort2>(ushort2_tex, xi-dx, yi-dy);
- ushort2 c1 = tex2D<ushort2>(ushort2_tex, xi+dx, yi-dy);
- ushort2 c2 = tex2D<ushort2>(ushort2_tex, xi-dx, yi+dy);
- ushort2 c3 = tex2D<ushort2>(ushort2_tex, xi+dx, yi+dy);
- int2 uv;
- uv.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2;
- uv.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2;
- dst[yo*dst_pitch2+xo] = make_ushort2((unsigned short)uv.x, (unsigned short)uv.y);
+#define BILINEAR_KERNEL(T) \
+ __global__ void Subsample_Bilinear_ ## T(cudaTextureObject_t src_tex, \
+ T *dst, \
+ int dst_width, int dst_height, int dst_pitch, \
+ int src_width, int src_height, \
+ int bit_depth) \
+ { \
+ Subsample_Bilinear<T>(src_tex, dst, \
+ dst_width, dst_height, dst_pitch, \
+ src_width, src_height, \
+ bit_depth); \
}
-}
-__global__ void Subsample_Bilinear_ushort4(cudaTextureObject_t ushort4_tex,
- ushort4 *dst,
- int dst_width, int dst_height, int dst_pitch,
- int src_width, int src_height)
-{
- int xo = blockIdx.x * blockDim.x + threadIdx.x;
- int yo = blockIdx.y * blockDim.y + threadIdx.y;
+BILINEAR_KERNEL(uchar)
+BILINEAR_KERNEL(uchar2)
+BILINEAR_KERNEL(uchar4)
- if (yo < dst_height && xo < dst_width)
- {
- float hscale = (float)src_width / (float)dst_width;
- float vscale = (float)src_height / (float)dst_height;
- float xi = (xo + 0.5f) * hscale;
- float yi = (yo + 0.5f) * vscale;
- // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv}
- float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
- float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);
- // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh}
- float dx = wh / (0.5f + wh);
- float dy = wv / (0.5f + wv);
- ushort4 c0 = tex2D<ushort4>(ushort4_tex, xi-dx, yi-dy);
- ushort4 c1 = tex2D<ushort4>(ushort4_tex, xi+dx, yi-dy);
- ushort4 c2 = tex2D<ushort4>(ushort4_tex, xi-dx, yi+dy);
- ushort4 c3 = tex2D<ushort4>(ushort4_tex, xi+dx, yi+dy);
- int4 res;
- res.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2;
- res.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2;
- res.z = ((int)c0.z+(int)c1.z+(int)c2.z+(int)c3.z+2) >> 2;
- res.w = ((int)c0.w+(int)c1.w+(int)c2.w+(int)c3.w+2) >> 2;
- dst[yo*dst_pitch+xo] = make_ushort4(
- (unsigned short)res.x, (unsigned short)res.y, (unsigned short)res.z, (unsigned short)res.w);
- }
-}
+BILINEAR_KERNEL(ushort)
+BILINEAR_KERNEL(ushort2)
+BILINEAR_KERNEL(ushort4)
}