diff options
author | Hendrik Leppkes <h.leppkes@gmail.com> | 2015-01-29 16:29:41 +0300 |
---|---|---|
committer | Hendrik Leppkes <h.leppkes@gmail.com> | 2015-01-31 17:48:15 +0300 |
commit | 4e576410b52577ff7d3274a94ae955fbb1c7e443 (patch) | |
tree | f30d4d6056ee64b815bcd9ec99b2122a933cf463 /decoder | |
parent | f2f72474d685df1c293996550d3ec2bbb7f38e2f (diff) |
dxva2cb: implement direct output mode
This reduces the performance overhead and CPU usage by up to 50%
Diffstat (limited to 'decoder')
-rw-r--r-- | decoder/LAVVideo/DecodeThread.h | 1 | ||||
-rw-r--r-- | decoder/LAVVideo/Filtering.cpp | 3 | ||||
-rw-r--r-- | decoder/LAVVideo/LAVPixFmtConverter.cpp | 55 | ||||
-rw-r--r-- | decoder/LAVVideo/LAVPixFmtConverter.h | 8 | ||||
-rw-r--r-- | decoder/LAVVideo/LAVVideo.cpp | 78 | ||||
-rw-r--r-- | decoder/LAVVideo/LAVVideo.h | 3 | ||||
-rw-r--r-- | decoder/LAVVideo/LAVVideo.vcxproj | 1 | ||||
-rw-r--r-- | decoder/LAVVideo/LAVVideo.vcxproj.filters | 3 | ||||
-rw-r--r-- | decoder/LAVVideo/decoders/DecBase.h | 2 | ||||
-rw-r--r-- | decoder/LAVVideo/decoders/ILAVDecoder.h | 14 | ||||
-rw-r--r-- | decoder/LAVVideo/decoders/dxva2dec.cpp | 63 | ||||
-rw-r--r-- | decoder/LAVVideo/decoders/dxva2dec.h | 7 | ||||
-rw-r--r-- | decoder/LAVVideo/pixconv/convert_direct.cpp | 90 | ||||
-rw-r--r-- | decoder/LAVVideo/pixconv/pixconv_sse2_templates.h | 6 |
14 files changed, 324 insertions, 10 deletions
diff --git a/decoder/LAVVideo/DecodeThread.h b/decoder/LAVVideo/DecodeThread.h index 0d80d752..580880bd 100644 --- a/decoder/LAVVideo/DecodeThread.h +++ b/decoder/LAVVideo/DecodeThread.h @@ -37,6 +37,7 @@ public: STDMETHODIMP GetPixelFormat(LAVPixelFormat *pPix, int *pBpp) { ASSERT(m_pDecoder); return m_pDecoder->GetPixelFormat(pPix, pBpp); } STDMETHODIMP_(REFERENCE_TIME) GetFrameDuration() { ASSERT(m_pDecoder); return m_pDecoder->GetFrameDuration(); } STDMETHODIMP HasThreadSafeBuffers() { return m_pDecoder ? m_pDecoder->HasThreadSafeBuffers() : S_FALSE; } + STDMETHODIMP SetDirectOutput(BOOL bDirect) { return m_pDecoder ? m_pDecoder->SetDirectOutput(bDirect) : S_FALSE; } STDMETHODIMP CreateDecoder(const CMediaType *pmt, AVCodecID codec); diff --git a/decoder/LAVVideo/Filtering.cpp b/decoder/LAVVideo/Filtering.cpp index c07ec49e..05187e23 100644 --- a/decoder/LAVVideo/Filtering.cpp +++ b/decoder/LAVVideo/Filtering.cpp @@ -128,6 +128,9 @@ HRESULT CLAVVideo::Filter(LAVFrame *pFrame) if (!m_pFilterGraph) goto deliver; + if (pFrame->direct) + DeDirectFrame(pFrame, true); + AVFrame *in_frame = nullptr; BOOL refcountedFrame = (m_Decoder.HasThreadSafeBuffers() == S_OK); // When flushing, we feed a NULL frame diff --git a/decoder/LAVVideo/LAVPixFmtConverter.cpp b/decoder/LAVVideo/LAVPixFmtConverter.cpp index d07de862..a1e18387 100644 --- a/decoder/LAVVideo/LAVPixFmtConverter.cpp +++ b/decoder/LAVVideo/LAVPixFmtConverter.cpp @@ -139,6 +139,7 @@ static LAV_INOUT_PIXFMT_MAP *lookupFormatMap(LAVPixelFormat informat, int bpp, B CLAVPixFmtConverter::CLAVPixFmtConverter() { convert = &CLAVPixFmtConverter::convert_generic; + convert_direct = nullptr; m_NumThreads = min(8, max(1, av_cpu_count() / 2)); @@ -398,6 +399,27 @@ void CLAVPixFmtConverter::SelectConvertFunction() if (convert == nullptr) { convert = &CLAVPixFmtConverter::convert_generic; } + + SelectConvertFunctionDirect(); +} + +void CLAVPixFmtConverter::SelectConvertFunctionDirect() +{ + convert_direct = nullptr; + m_bDirectMode = FALSE; + + int cpu = av_get_cpu_flags(); + if (m_InputPixFmt == LAVPixFmt_NV12 && m_OutputPixFmt == LAVOutPixFmt_NV12) { + if (cpu & AV_CPU_FLAG_SSE4) + convert_direct = &CLAVPixFmtConverter::plane_copy_direct_sse4; + else if (cpu & AV_CPU_FLAG_SSE2) + convert_direct = &CLAVPixFmtConverter::plane_copy_sse2; + else + convert_direct = &CLAVPixFmtConverter::plane_copy; + } + + if (convert_direct != nullptr) + m_bDirectMode = true; } HRESULT CLAVPixFmtConverter::Convert(LAVFrame *pFrame, uint8_t *dst, int width, int height, ptrdiff_t dstStride, int planeHeight) { @@ -439,6 +461,39 @@ HRESULT CLAVPixFmtConverter::Convert(LAVFrame *pFrame, uint8_t *dst, int width, return hr; } +BOOL CLAVPixFmtConverter::IsDirectModeSupported(uintptr_t dst, ptrdiff_t stride) { + if (FFALIGN(stride, 16) != stride || (dst % 16u)) + return false; + return m_bDirectMode; +} + +HRESULT CLAVPixFmtConverter::ConvertDirect(LAVFrame *pFrame, uint8_t *dst, int width, int height, ptrdiff_t dstStride, int planeHeight) +{ + HRESULT hr = S_OK; + planeHeight = max(height, planeHeight); + ASSERT(pFrame->direct && pFrame->direct_lock && pFrame->direct_unlock); + + LAVDirectBuffer buffer; + if (pFrame->direct_lock(pFrame, &buffer)) { + uint8_t *dstArray[4] = { 0 }; + ptrdiff_t dstStrideArray[4] = { 0 }; + ptrdiff_t byteStride = dstStride * lav_pixfmt_desc[m_OutputPixFmt].codedbytes; + + dstArray[0] = dst; + dstStrideArray[0] = byteStride; + + for (int i = 1; i < lav_pixfmt_desc[m_OutputPixFmt].planes; ++i) { + dstArray[i] = dstArray[i - 1] + dstStrideArray[i - 1] * (planeHeight / lav_pixfmt_desc[m_OutputPixFmt].planeHeight[i - 1]); + dstStrideArray[i] = byteStride / lav_pixfmt_desc[m_OutputPixFmt].planeWidth[i]; + } + + hr = (this->*convert_direct)(buffer.data, buffer.stride, dstArray, dstStrideArray, width, height, m_InputPixFmt, m_InBpp, m_OutputPixFmt); + pFrame->direct_unlock(pFrame); + } + + return hr; +} + void CLAVPixFmtConverter::ChangeStride(const uint8_t* src, ptrdiff_t srcStride, uint8_t *dst, ptrdiff_t dstStride, int width, int height, int planeHeight, LAVOutPixFmts format) { LAVOutPixFmtDesc desc = lav_pixfmt_desc[format]; diff --git a/decoder/LAVVideo/LAVPixFmtConverter.h b/decoder/LAVVideo/LAVPixFmtConverter.h index f50a8655..3ab7d9b0 100644 --- a/decoder/LAVVideo/LAVPixFmtConverter.h +++ b/decoder/LAVVideo/LAVPixFmtConverter.h @@ -76,8 +76,10 @@ public: BOOL IsAllowedSubtype(const GUID *guid); HRESULT Convert(LAVFrame *pFrame, uint8_t *dst, int width, int height, ptrdiff_t dstStride, int planeHeight); + HRESULT ConvertDirect(LAVFrame *pFrame, uint8_t *dst, int width, int height, ptrdiff_t dstStride, int planeHeight); BOOL IsRGBConverterActive() { return m_bRGBConverter; } + BOOL IsDirectModeSupported(uintptr_t dst, ptrdiff_t stride); DWORD GetImageSize(int width, int height, LAVOutPixFmts pixFmt = LAVOutPixFmt_None); @@ -90,6 +92,7 @@ private: LAVOutPixFmts GetFilteredFormat(int index); void SelectConvertFunction(); + void SelectConvertFunctionDirect(); // Helper functions for convert_generic HRESULT swscale_scale(enum AVPixelFormat srcPix, enum AVPixelFormat dstPix, const uint8_t* const src[4], const ptrdiff_t srcStride[4], uint8_t* dst[4], int width, int height, const ptrdiff_t dstStride[4], LAVOutPixFmtDesc pixFmtDesc, bool swapPlanes12 = false); @@ -110,6 +113,7 @@ private: // Conversion function pointer ConverterFn convert; + ConverterFn convert_direct; // Pixel Implementations DECLARE_CONV_FUNC(convert_generic); @@ -130,6 +134,8 @@ private: DECLARE_CONV_FUNC(convert_rgb48_rgb32_ssse3); template <int out32> DECLARE_CONV_FUNC(convert_rgb48_rgb); + DECLARE_CONV_FUNC(plane_copy_direct_sse4); + DECLARE_CONV_FUNC(convert_yuv_rgb); const RGBCoeffs* getRGBCoeffs(int width, int height); void InitRGBConvDispatcher(); @@ -141,6 +147,8 @@ private: LAVOutPixFmts m_OutputPixFmt = LAVOutPixFmt_YV12; int m_InBpp = 0; + BOOL m_bDirectMode = false; + int swsWidth = 0; int swsHeight = 0; int swsOutputRange = 0; diff --git a/decoder/LAVVideo/LAVVideo.cpp b/decoder/LAVVideo/LAVVideo.cpp index fb318cd8..31434e7c 100644 --- a/decoder/LAVVideo/LAVVideo.cpp +++ b/decoder/LAVVideo/LAVVideo.cpp @@ -675,6 +675,25 @@ done: return SUCCEEDED(hr) ? S_OK : VFW_E_TYPE_NOT_ACCEPTED; } +HRESULT CLAVVideo::CheckDirectMode() +{ + LAVPixelFormat pix; + int bpp; + m_Decoder.GetPixelFormat(&pix, &bpp); + + BOOL bDirect = (pix == LAVPixFmt_NV12); + if (m_Decoder.IsInterlaced() && m_settings.SWDeintMode == SWDeintMode_YADIF) + bDirect = FALSE; + else if (m_pOutput->CurrentMediaType().subtype != MEDIASUBTYPE_NV12) + bDirect = FALSE; + else if (m_SubtitleConsumer && m_SubtitleConsumer->HasProvider()) + bDirect = FALSE; + + m_Decoder.SetDirectOutput(bDirect); + + return S_OK; +} + HRESULT CLAVVideo::SetMediaType(PIN_DIRECTION dir, const CMediaType *pmt) { HRESULT hr = S_OK; @@ -799,6 +818,9 @@ HRESULT CLAVVideo::CompleteConnect(PIN_DIRECTION dir, IPin *pReceivePin) HRESULT hr = S_OK; if (dir == PINDIR_OUTPUT) { hr = m_Decoder.PostConnect(pReceivePin); + if (SUCCEEDED(hr)) { + CheckDirectMode(); + } } else if (dir == PINDIR_INPUT) { if (m_pInput->CurrentMediaType().subtype == MEDIASUBTYPE_MPEG2_VIDEO && !m_pSubtitleInput && (m_dwDecodeFlags & LAV_VIDEO_DEC_FLAG_DVD)) { m_pSubtitleInput = new CLAVVideoSubtitleInputPin(TEXT("CLAVVideoSubtitleInputPin"), this, &m_csFilter, &hr, L"Subtitle Input"); @@ -1277,6 +1299,46 @@ STDMETHODIMP CLAVVideo::ReleaseFrame(LAVFrame **ppFrame) return S_OK; } +HRESULT CLAVVideo::DeDirectFrame(LAVFrame *pFrame, bool bDisableDirectMode) +{ + if (!pFrame->direct) + return S_FALSE; + + ASSERT(pFrame->direct_lock && pFrame->direct_unlock); + + LAVPixFmtDesc desc = getPixelFormatDesc(pFrame->format); + + LAVFrame tmpFrame = *pFrame; + pFrame->destruct = nullptr; + pFrame->priv_data = nullptr; + pFrame->direct = false; + pFrame->direct_lock = nullptr; + pFrame->direct_unlock = nullptr; + memset(pFrame->data, 0, sizeof(pFrame->data)); + + LAVDirectBuffer buffer; + if (tmpFrame.direct_lock(&tmpFrame, &buffer)) { + AllocLAVFrameBuffers(pFrame, buffer.stride[0] / desc.codedbytes); + + // use slow copy, this should only be used extremely rarely + memcpy(pFrame->data[0], buffer.data[0], pFrame->height * buffer.stride[0]); + for (int i = 1; i < desc.planes; i++) + memcpy(pFrame->data[i], buffer.data[i], (pFrame->height / desc.planeHeight[i]) * buffer.stride[i]); + + tmpFrame.direct_unlock(&tmpFrame); + } else { + // fallack, alloc anyway so nothing blows up + AllocLAVFrameBuffers(pFrame); + } + + FreeLAVFrameBuffers(&tmpFrame); + + if (bDisableDirectMode) + m_Decoder.SetDirectOutput(false); + + return S_OK; +} + STDMETHODIMP_(LAVFrame*) CLAVVideo::GetFlushFrame() { LAVFrame *pFlushFrame = nullptr; @@ -1367,6 +1429,7 @@ HRESULT CLAVVideo::DeliverToRenderer(LAVFrame *pFrame) if (pFrame->format != LAVPixFmt_DXVA2) { ReleaseFrame(&m_pLastSequenceFrame); if ((pFrame->flags & LAV_FRAME_FLAG_END_OF_SEQUENCE || m_bInDVDMenu)) { + if (pFrame->direct) DeDirectFrame(pFrame, false); CopyLAVFrame(pFrame, &m_pLastSequenceFrame); } } else { @@ -1446,8 +1509,10 @@ HRESULT CLAVVideo::DeliverToRenderer(LAVFrame *pFrame) if (m_SubtitleConsumer && m_SubtitleConsumer->HasProvider()) { m_SubtitleConsumer->SetVideoSize(width, height); m_SubtitleConsumer->RequestFrame(pFrame->rtStart, pFrame->rtStop); - if (!bRGBOut) + if (!bRGBOut) { + if (pFrame->direct) DeDirectFrame(pFrame, true); m_SubtitleConsumer->ProcessFrame(pFrame); + } } // Grab a media sample, and start assembling the data for it. @@ -1494,7 +1559,16 @@ HRESULT CLAVVideo::DeliverToRenderer(LAVFrame *pFrame) QueryPerformanceFrequency(&frequency); QueryPerformanceCounter(&start); #endif - m_PixFmtConverter.Convert(pFrame, pDataOut, width, height, pBIH->biWidth, abs(pBIH->biHeight)); + + if (pFrame->direct && !m_PixFmtConverter.IsDirectModeSupported((uintptr_t)pDataOut, pBIH->biWidth)) { + DeDirectFrame(pFrame, true); + } + + if (pFrame->direct) + m_PixFmtConverter.ConvertDirect(pFrame, pDataOut, width, height, pBIH->biWidth, abs(pBIH->biHeight)); + else + m_PixFmtConverter.Convert(pFrame, pDataOut, width, height, pBIH->biWidth, abs(pBIH->biHeight)); + #if defined(DEBUG) && DEBUG_PIXELCONV_TIMINGS QueryPerformanceCounter(&end); double diff = (end.QuadPart - start.QuadPart) * 1000.0 / frequency.QuadPart; diff --git a/decoder/LAVVideo/LAVVideo.h b/decoder/LAVVideo/LAVVideo.h index 83579bd8..7fd834c9 100644 --- a/decoder/LAVVideo/LAVVideo.h +++ b/decoder/LAVVideo/LAVVideo.h @@ -192,6 +192,9 @@ private: HRESULT NegotiatePixelFormat(CMediaType &mt, int width, int height); BOOL IsInterlaced(); + HRESULT CheckDirectMode(); + HRESULT DeDirectFrame(LAVFrame *pFrame, bool bDisableDirectMode = true); + HRESULT Filter(LAVFrame *pFrame); HRESULT DeliverToRenderer(LAVFrame *pFrame); diff --git a/decoder/LAVVideo/LAVVideo.vcxproj b/decoder/LAVVideo/LAVVideo.vcxproj index 6c2ed16c..f0b98e4c 100644 --- a/decoder/LAVVideo/LAVVideo.vcxproj +++ b/decoder/LAVVideo/LAVVideo.vcxproj @@ -114,6 +114,7 @@ <ClCompile Include="parsers\HEVCSequenceParser.cpp" /> <ClCompile Include="parsers\MPEG2HeaderParser.cpp" /> <ClCompile Include="parsers\VC1HeaderParser.cpp" /> + <ClCompile Include="pixconv\convert_direct.cpp" /> <ClCompile Include="pixconv\convert_generic.cpp" /> <ClCompile Include="pixconv\interleave.cpp" /> <ClCompile Include="pixconv\pixconv.cpp" /> diff --git a/decoder/LAVVideo/LAVVideo.vcxproj.filters b/decoder/LAVVideo/LAVVideo.vcxproj.filters index aef02e38..4b128c93 100644 --- a/decoder/LAVVideo/LAVVideo.vcxproj.filters +++ b/decoder/LAVVideo/LAVVideo.vcxproj.filters @@ -159,6 +159,9 @@ <ClCompile Include="parsers\HEVCSequenceParser.cpp"> <Filter>Source Files\parsers</Filter> </ClCompile> + <ClCompile Include="pixconv\convert_direct.cpp"> + <Filter>Source Files\pixconv</Filter> + </ClCompile> </ItemGroup> <ItemGroup> <ClInclude Include="stdafx.h"> diff --git a/decoder/LAVVideo/decoders/DecBase.h b/decoder/LAVVideo/decoders/DecBase.h index f6604998..752b66da 100644 --- a/decoder/LAVVideo/decoders/DecBase.h +++ b/decoder/LAVVideo/decoders/DecBase.h @@ -43,6 +43,8 @@ public: STDMETHODIMP SyncToProcessThread() { return HasThreadSafeBuffers() == S_OK ? S_FALSE : S_OK; } STDMETHODIMP HasThreadSafeBuffers() { return S_FALSE; } + STDMETHODIMP SetDirectOutput(BOOL bDirect) { return S_FALSE; } + STDMETHODIMP Decode(IMediaSample *pSample) { HRESULT hr; diff --git a/decoder/LAVVideo/decoders/ILAVDecoder.h b/decoder/LAVVideo/decoders/ILAVDecoder.h index 87a6d7c1..93660cc3 100644 --- a/decoder/LAVVideo/decoders/ILAVDecoder.h +++ b/decoder/LAVVideo/decoders/ILAVDecoder.h @@ -77,6 +77,11 @@ LAVPixFmtDesc getPixelFormatDesc(LAVPixelFormat pixFmt); */ AVPixelFormat getFFPixelFormatFromLAV(LAVPixelFormat pixFmt, int bpp); +typedef struct LAVDirectBuffer { + BYTE *data[4]; ///< pointer to the picture planes + ptrdiff_t stride[4]; ///< stride of the planes (in bytes) +} LAVDirectBuffer; + /** * A Video Frame * @@ -122,6 +127,10 @@ typedef struct LAVFrame { /* destruct function to free any buffers being held by this frame (may be null) */ void (*destruct)(struct LAVFrame *); void *priv_data; ///< private data from the decoder (mostly for destruct) + + bool direct; + bool (*direct_lock)(struct LAVFrame *, struct LAVDirectBuffer *); + void (*direct_unlock)(struct LAVFrame *); } LAVFrame; /** @@ -371,6 +380,11 @@ interface ILAVDecoder * Get whether the decoder should sync to the main thread */ STDMETHOD(SyncToProcessThread)() PURE; + + /** + * Toggle direct frame output mode for hardware decoders + */ + STDMETHOD(SetDirectOutput)(BOOL bDirect) PURE; }; /** diff --git a/decoder/LAVVideo/decoders/dxva2dec.cpp b/decoder/LAVVideo/decoders/dxva2dec.cpp index 01982df2..7443d787 100644 --- a/decoder/LAVVideo/decoders/dxva2dec.cpp +++ b/decoder/LAVVideo/decoders/dxva2dec.cpp @@ -1180,8 +1180,7 @@ HRESULT CDecDXVA2::ReInitDXVA2Decoder(AVCodecContext *c) } hr = m_pDXVA2Allocator->Commit(); } else if (!m_bNative) { - if (SyncToProcessThread() == S_FALSE) - FlushDisplayQueue(TRUE); + FlushDisplayQueue(TRUE); hr = CreateDXVA2Decoder(); } } @@ -1414,10 +1413,14 @@ HRESULT CDecDXVA2::DeliverDXVA2Frame(LAVFrame *pFrame) pFrame->format = LAVPixFmt_DXVA2; Deliver(pFrame); } else { - if (CopyFrame(pFrame)) - Deliver(pFrame); - else - ReleaseFrame(&pFrame); + if (m_bDirect) { + DeliverDirect(pFrame); + } else { + if (CopyFrame(pFrame)) + Deliver(pFrame); + else + ReleaseFrame(&pFrame); + } } return S_OK; @@ -1470,3 +1473,51 @@ __forceinline bool CDecDXVA2::CopyFrame(LAVFrame *pFrame) return true; } + + +static bool direct_lock(LAVFrame * pFrame, LAVDirectBuffer *pBuffer) +{ + ASSERT(pFrame && pBuffer); + + HRESULT hr; + LPDIRECT3DSURFACE9 pSurface = (LPDIRECT3DSURFACE9)pFrame->data[3]; + + D3DSURFACE_DESC surfaceDesc; + pSurface->GetDesc(&surfaceDesc); + + D3DLOCKED_RECT LockedRect; + hr = pSurface->LockRect(&LockedRect, nullptr, D3DLOCK_READONLY); + if (FAILED(hr)) { + DbgLog((LOG_TRACE, 10, L"pSurface->LockRect failed (hr: %X)", hr)); + return false; + } + + memset(pBuffer, 0, sizeof(*pBuffer)); + + pBuffer->data[0] = (BYTE *)LockedRect.pBits; + pBuffer->data[1] = pBuffer->data[0] + surfaceDesc.Height * LockedRect.Pitch; + + pBuffer->stride[0] = LockedRect.Pitch; + pBuffer->stride[1] = LockedRect.Pitch; + return true; +} + +static void direct_unlock(LAVFrame * pFrame) +{ + ASSERT(pFrame); + LPDIRECT3DSURFACE9 pSurface = (LPDIRECT3DSURFACE9)pFrame->data[3]; + pSurface->UnlockRect(); +} + + +bool CDecDXVA2::DeliverDirect(LAVFrame *pFrame) +{ + pFrame->format = LAVPixFmt_NV12; + pFrame->direct = true; + pFrame->direct_lock = direct_lock; + pFrame->direct_unlock = direct_unlock; + + Deliver(pFrame); + + return true; +} diff --git a/decoder/LAVVideo/decoders/dxva2dec.h b/decoder/LAVVideo/decoders/dxva2dec.h index 7f259728..1bc77e8e 100644 --- a/decoder/LAVVideo/decoders/dxva2dec.h +++ b/decoder/LAVVideo/decoders/dxva2dec.h @@ -52,9 +52,10 @@ public: STDMETHODIMP InitAllocator(IMemAllocator **ppAlloc); STDMETHODIMP PostConnect(IPin *pPin); STDMETHODIMP_(long) GetBufferCount(); - STDMETHODIMP_(const WCHAR*) GetDecoderName() { return m_bNative ? L"dxva2n" : L"dxva2cb"; } + STDMETHODIMP_(const WCHAR*) GetDecoderName() { return m_bNative ? L"dxva2n" : (m_bDirect ? L"dxva2cb direct" : L"dxva2cb"); } STDMETHODIMP HasThreadSafeBuffers() { return m_bNative ? S_FALSE : S_OK; } - STDMETHODIMP SyncToProcessThread() { return HasThreadSafeBuffers() == S_OK ? S_FALSE : S_OK; } + STDMETHODIMP SyncToProcessThread() { return S_OK; } + STDMETHODIMP SetDirectOutput(BOOL bDirect) { m_bDirect = bDirect; return S_OK; } // CDecBase STDMETHODIMP Init(); @@ -68,6 +69,7 @@ protected: HRESULT DeliverDXVA2Frame(LAVFrame *pFrame); bool CopyFrame(LAVFrame *pFrame); + bool DeliverDirect(LAVFrame *pFrame); private: HRESULT InitD3D(); @@ -101,6 +103,7 @@ private: private: friend class CDXVA2SurfaceAllocator; BOOL m_bNative = FALSE; + BOOL m_bDirect = FALSE; CDXVA2SurfaceAllocator *m_pDXVA2Allocator = nullptr; struct { diff --git a/decoder/LAVVideo/pixconv/convert_direct.cpp b/decoder/LAVVideo/pixconv/convert_direct.cpp new file mode 100644 index 00000000..f61dc38a --- /dev/null +++ b/decoder/LAVVideo/pixconv/convert_direct.cpp @@ -0,0 +1,90 @@ +/* + * Copyright (C) 2010-2015 Hendrik Leppkes + * http://www.1f0.de + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include "stdafx.h" + +#include <emmintrin.h> + +#include "pixconv_internal.h" +#include "pixconv_sse2_templates.h" + +// This function is only designed for NV12-like pixel formats, like NV12, P010, P016, ... +DECLARE_CONV_FUNC_IMPL(plane_copy_direct_sse4) +{ + const ptrdiff_t inStride = srcStride[0]; + const ptrdiff_t outStride = dstStride[0]; + const ptrdiff_t chromaHeight = (height >> 1); + + const ptrdiff_t byteWidth = (outputFormat == LAVOutPixFmt_P010 || outputFormat == LAVOutPixFmt_P016) ? width << 1 : width; + const ptrdiff_t stride = min(FFALIGN(byteWidth, 64), min(inStride, outStride)); + + __m128i xmm0,xmm1,xmm2,xmm3; + + _mm_sfence(); + + ptrdiff_t line, i; + + for (line = 0; line < height; line++) { + const uint8_t *y = (src[0] + line * inStride); + uint8_t *dy = (dst[0] + line * outStride); + for (i = 0; i < (stride - 63); i += 64) { + PIXCONV_STREAM_LOAD(xmm0, y + i + 0); + PIXCONV_STREAM_LOAD(xmm1, y + i + 16); + PIXCONV_STREAM_LOAD(xmm2, y + i + 32); + PIXCONV_STREAM_LOAD(xmm3, y + i + 48); + + _ReadWriteBarrier(); + + PIXCONV_PUT_STREAM(dy + i + 0, xmm0); + PIXCONV_PUT_STREAM(dy + i + 16, xmm1); + PIXCONV_PUT_STREAM(dy + i + 32, xmm2); + PIXCONV_PUT_STREAM(dy + i + 48, xmm3); + } + + for (; i < byteWidth; i += 16) { + PIXCONV_LOAD_ALIGNED(xmm0, y + i); + PIXCONV_PUT_STREAM(dy + i, xmm0); + } + } + + for (line = 0; line < chromaHeight; line++) { + const uint8_t *uv = (src[1] + line * inStride); + uint8_t *duv = (dst[1] + line * outStride); + for (i = 0; i < (stride - 63); i += 64) { + PIXCONV_STREAM_LOAD(xmm0, uv + i + 0); + PIXCONV_STREAM_LOAD(xmm1, uv + i + 16); + PIXCONV_STREAM_LOAD(xmm2, uv + i + 32); + PIXCONV_STREAM_LOAD(xmm3, uv + i + 48); + + _ReadWriteBarrier(); + + PIXCONV_PUT_STREAM(duv + i + 0, xmm0); + PIXCONV_PUT_STREAM(duv + i + 16, xmm1); + PIXCONV_PUT_STREAM(duv + i + 32, xmm2); + PIXCONV_PUT_STREAM(duv + i + 48, xmm3); + } + + for (; i < byteWidth; i += 16) { + PIXCONV_LOAD_ALIGNED(xmm0, uv + i); + PIXCONV_PUT_STREAM(duv + i, xmm0); + } + } + + return S_OK; +} diff --git a/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h b/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h index 6b913ad8..602b6905 100644 --- a/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h +++ b/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h @@ -97,6 +97,12 @@ #define PIXCONV_LOAD_ALIGNED(reg,src) \ reg = _mm_load_si128((const __m128i *)(src)); /* load (aligned) */ +// Load 128-bit into a register, using streaming memory access +// reg - register to store pixels in +// src - memory pointer of the source +#define PIXCONV_STREAM_LOAD(reg,src) \ + reg = _mm_stream_load_si128((__m128i *)(src)); /* load (streaming) */ + #define PIXCONV_LOAD_PIXEL8_ALIGNED PIXCONV_LOAD_ALIGNED // Put 128-bit into memory, using streaming write |