diff options
author | Hendrik Leppkes <h.leppkes@gmail.com> | 2015-01-29 16:29:41 +0300 |
---|---|---|
committer | Hendrik Leppkes <h.leppkes@gmail.com> | 2015-01-31 17:48:15 +0300 |
commit | 4e576410b52577ff7d3274a94ae955fbb1c7e443 (patch) | |
tree | f30d4d6056ee64b815bcd9ec99b2122a933cf463 /decoder/LAVVideo/pixconv | |
parent | f2f72474d685df1c293996550d3ec2bbb7f38e2f (diff) |
dxva2cb: implement direct output mode
This reduces the performance overhead and CPU usage by up to 50%
Diffstat (limited to 'decoder/LAVVideo/pixconv')
-rw-r--r-- | decoder/LAVVideo/pixconv/convert_direct.cpp | 90 | ||||
-rw-r--r-- | decoder/LAVVideo/pixconv/pixconv_sse2_templates.h | 6 |
2 files changed, 96 insertions, 0 deletions
```diff
diff --git a/decoder/LAVVideo/pixconv/convert_direct.cpp b/decoder/LAVVideo/pixconv/convert_direct.cpp
new file mode 100644
index 00000000..f61dc38a
--- /dev/null
+++ b/decoder/LAVVideo/pixconv/convert_direct.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (C) 2010-2015 Hendrik Leppkes
+ * http://www.1f0.de
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "stdafx.h"
+
+#include <emmintrin.h>
+
+#include "pixconv_internal.h"
+#include "pixconv_sse2_templates.h"
+
+// This function is only designed for NV12-like pixel formats, like NV12, P010, P016, ...
+DECLARE_CONV_FUNC_IMPL(plane_copy_direct_sse4)
+{
+  const ptrdiff_t inStride = srcStride[0];
+  const ptrdiff_t outStride = dstStride[0];
+  const ptrdiff_t chromaHeight = (height >> 1);
+
+  const ptrdiff_t byteWidth = (outputFormat == LAVOutPixFmt_P010 || outputFormat == LAVOutPixFmt_P016) ? width << 1 : width;
+  const ptrdiff_t stride = min(FFALIGN(byteWidth, 64), min(inStride, outStride));
+
+  __m128i xmm0,xmm1,xmm2,xmm3;
+
+  _mm_sfence();
+
+  ptrdiff_t line, i;
+
+  for (line = 0; line < height; line++) {
+    const uint8_t *y = (src[0] + line * inStride);
+    uint8_t *dy = (dst[0] + line * outStride);
+    for (i = 0; i < (stride - 63); i += 64) {
+      PIXCONV_STREAM_LOAD(xmm0, y + i + 0);
+      PIXCONV_STREAM_LOAD(xmm1, y + i + 16);
+      PIXCONV_STREAM_LOAD(xmm2, y + i + 32);
+      PIXCONV_STREAM_LOAD(xmm3, y + i + 48);
+
+      _ReadWriteBarrier();
+
+      PIXCONV_PUT_STREAM(dy + i + 0, xmm0);
+      PIXCONV_PUT_STREAM(dy + i + 16, xmm1);
+      PIXCONV_PUT_STREAM(dy + i + 32, xmm2);
+      PIXCONV_PUT_STREAM(dy + i + 48, xmm3);
+    }
+
+    for (; i < byteWidth; i += 16) {
+      PIXCONV_LOAD_ALIGNED(xmm0, y + i);
+      PIXCONV_PUT_STREAM(dy + i, xmm0);
+    }
+  }
+
+  for (line = 0; line < chromaHeight; line++) {
+    const uint8_t *uv = (src[1] + line * inStride);
+    uint8_t *duv = (dst[1] + line * outStride);
+    for (i = 0; i < (stride - 63); i += 64) {
+      PIXCONV_STREAM_LOAD(xmm0, uv + i + 0);
+      PIXCONV_STREAM_LOAD(xmm1, uv + i + 16);
+      PIXCONV_STREAM_LOAD(xmm2, uv + i + 32);
+      PIXCONV_STREAM_LOAD(xmm3, uv + i + 48);
+
+      _ReadWriteBarrier();
+
+      PIXCONV_PUT_STREAM(duv + i + 0, xmm0);
+      PIXCONV_PUT_STREAM(duv + i + 16, xmm1);
+      PIXCONV_PUT_STREAM(duv + i + 32, xmm2);
+      PIXCONV_PUT_STREAM(duv + i + 48, xmm3);
+    }
+
+    for (; i < byteWidth; i += 16) {
+      PIXCONV_LOAD_ALIGNED(xmm0, uv + i);
+      PIXCONV_PUT_STREAM(duv + i, xmm0);
+    }
+  }
+
+  return S_OK;
+}
diff --git a/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h b/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h
index 6b913ad8..602b6905 100644
--- a/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h
+++ b/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h
@@ -97,6 +97,12 @@
 #define PIXCONV_LOAD_ALIGNED(reg,src) \
   reg = _mm_load_si128((const __m128i *)(src)); /* load (aligned) */
 
+// Load 128-bit into a register, using streaming memory access
+// reg - register to store pixels in
+// src - memory pointer of the source
+#define PIXCONV_STREAM_LOAD(reg,src) \
+  reg = _mm_stream_load_si128((__m128i *)(src)); /* load (streaming) */
+
 #define PIXCONV_LOAD_PIXEL8_ALIGNED PIXCONV_LOAD_ALIGNED
 
 // Put 128-bit into memory, using streaming write
```