Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mpc-hc/LAVFilters.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Hendrik Leppkes <h.leppkes@gmail.com> 2015-01-29 16:29:41 +0300
committer: Hendrik Leppkes <h.leppkes@gmail.com> 2015-01-31 17:48:15 +0300
commit: 4e576410b52577ff7d3274a94ae955fbb1c7e443 (patch)
tree: f30d4d6056ee64b815bcd9ec99b2122a933cf463 /decoder
parent: f2f72474d685df1c293996550d3ec2bbb7f38e2f (diff)
dxva2cb: implement direct output mode
This reduces the performance overhead and CPU usage by up to 50%
Diffstat (limited to 'decoder')
-rw-r--r-- decoder/LAVVideo/DecodeThread.h | 1
-rw-r--r-- decoder/LAVVideo/Filtering.cpp | 3
-rw-r--r-- decoder/LAVVideo/LAVPixFmtConverter.cpp | 55
-rw-r--r-- decoder/LAVVideo/LAVPixFmtConverter.h | 8
-rw-r--r-- decoder/LAVVideo/LAVVideo.cpp | 78
-rw-r--r-- decoder/LAVVideo/LAVVideo.h | 3
-rw-r--r-- decoder/LAVVideo/LAVVideo.vcxproj | 1
-rw-r--r-- decoder/LAVVideo/LAVVideo.vcxproj.filters | 3
-rw-r--r-- decoder/LAVVideo/decoders/DecBase.h | 2
-rw-r--r-- decoder/LAVVideo/decoders/ILAVDecoder.h | 14
-rw-r--r-- decoder/LAVVideo/decoders/dxva2dec.cpp | 63
-rw-r--r-- decoder/LAVVideo/decoders/dxva2dec.h | 7
-rw-r--r-- decoder/LAVVideo/pixconv/convert_direct.cpp | 90
-rw-r--r-- decoder/LAVVideo/pixconv/pixconv_sse2_templates.h | 6
14 files changed, 324 insertions, 10 deletions
diff --git a/decoder/LAVVideo/DecodeThread.h b/decoder/LAVVideo/DecodeThread.h
index 0d80d752..580880bd 100644
--- a/decoder/LAVVideo/DecodeThread.h
+++ b/decoder/LAVVideo/DecodeThread.h
@@ -37,6 +37,7 @@ public:
STDMETHODIMP GetPixelFormat(LAVPixelFormat *pPix, int *pBpp) { ASSERT(m_pDecoder); return m_pDecoder->GetPixelFormat(pPix, pBpp); }
STDMETHODIMP_(REFERENCE_TIME) GetFrameDuration() { ASSERT(m_pDecoder); return m_pDecoder->GetFrameDuration(); }
STDMETHODIMP HasThreadSafeBuffers() { return m_pDecoder ? m_pDecoder->HasThreadSafeBuffers() : S_FALSE; }
+ STDMETHODIMP SetDirectOutput(BOOL bDirect) { return m_pDecoder ? m_pDecoder->SetDirectOutput(bDirect) : S_FALSE; } // Forwards the direct-output toggle to the wrapped decoder; returns S_FALSE when there is no decoder.
STDMETHODIMP CreateDecoder(const CMediaType *pmt, AVCodecID codec);
diff --git a/decoder/LAVVideo/Filtering.cpp b/decoder/LAVVideo/Filtering.cpp
index c07ec49e..05187e23 100644
--- a/decoder/LAVVideo/Filtering.cpp
+++ b/decoder/LAVVideo/Filtering.cpp
@@ -128,6 +128,9 @@ HRESULT CLAVVideo::Filter(LAVFrame *pFrame)
if (!m_pFilterGraph)
goto deliver;
+ if (pFrame->direct)
+ DeDirectFrame(pFrame, true);
+
AVFrame *in_frame = nullptr;
BOOL refcountedFrame = (m_Decoder.HasThreadSafeBuffers() == S_OK);
// When flushing, we feed a NULL frame
diff --git a/decoder/LAVVideo/LAVPixFmtConverter.cpp b/decoder/LAVVideo/LAVPixFmtConverter.cpp
index d07de862..a1e18387 100644
--- a/decoder/LAVVideo/LAVPixFmtConverter.cpp
+++ b/decoder/LAVVideo/LAVPixFmtConverter.cpp
@@ -139,6 +139,7 @@ static LAV_INOUT_PIXFMT_MAP *lookupFormatMap(LAVPixelFormat informat, int bpp, B
CLAVPixFmtConverter::CLAVPixFmtConverter()
{
convert = &CLAVPixFmtConverter::convert_generic;
+ convert_direct = nullptr;
m_NumThreads = min(8, max(1, av_cpu_count() / 2));
@@ -398,6 +399,27 @@ void CLAVPixFmtConverter::SelectConvertFunction()
if (convert == nullptr) {
convert = &CLAVPixFmtConverter::convert_generic;
}
+
+ SelectConvertFunctionDirect();
+}
+
+void CLAVPixFmtConverter::SelectConvertFunctionDirect() // Chooses the copy routine used for direct-mode frames and records in m_bDirectMode whether one is available.
+{
+ convert_direct = nullptr; // start from "no direct converter" so a format change cannot leave a stale pointer behind
+ m_bDirectMode = FALSE;
+
+ int cpu = av_get_cpu_flags();
+ if (m_InputPixFmt == LAVPixFmt_NV12 && m_OutputPixFmt == LAVOutPixFmt_NV12) { // only the NV12 -> NV12 combination gets a direct converter
+ if (cpu & AV_CPU_FLAG_SSE4)
+ convert_direct = &CLAVPixFmtConverter::plane_copy_direct_sse4; // copy routine built on streaming loads and stores
+ else if (cpu & AV_CPU_FLAG_SSE2)
+ convert_direct = &CLAVPixFmtConverter::plane_copy_sse2;
+ else
+ convert_direct = &CLAVPixFmtConverter::plane_copy;
+ }
+
+ if (convert_direct != nullptr) // direct mode is available exactly when a converter was picked above
+ m_bDirectMode = true;
}
HRESULT CLAVPixFmtConverter::Convert(LAVFrame *pFrame, uint8_t *dst, int width, int height, ptrdiff_t dstStride, int planeHeight) {
@@ -439,6 +461,39 @@ HRESULT CLAVPixFmtConverter::Convert(LAVFrame *pFrame, uint8_t *dst, int width,
return hr;
}
+BOOL CLAVPixFmtConverter::IsDirectModeSupported(uintptr_t dst, ptrdiff_t stride) { // Reports whether a direct-mode copy to address `dst` with the given stride can be used.
+ if (FFALIGN(stride, 16) != stride || (dst % 16u)) // refuse when the stride or the destination address is not a multiple of 16
+ return false;
+ return m_bDirectMode; // otherwise the answer is the flag maintained by SelectConvertFunctionDirect()
+}
+
+HRESULT CLAVPixFmtConverter::ConvertDirect(LAVFrame *pFrame, uint8_t *dst, int width, int height, ptrdiff_t dstStride, int planeHeight) // Locks a direct-mode frame, runs convert_direct from the locked buffer into dst, then unlocks; returns the converter's result, or E_FAIL when the lock fails.
+{
+ HRESULT hr = E_FAIL; // stays a failure unless the frame could be locked and the converter actually ran (previously a failed lock was reported as S_OK)
+ planeHeight = max(height, planeHeight); // never lay out the destination planes with fewer rows than the image has
+ ASSERT(pFrame->direct && pFrame->direct_lock && pFrame->direct_unlock);
+
+ LAVDirectBuffer buffer;
+ if (pFrame->direct_lock(pFrame, &buffer)) {
+ uint8_t *dstArray[4] = { 0 };
+ ptrdiff_t dstStrideArray[4] = { 0 };
+ ptrdiff_t byteStride = dstStride * lav_pixfmt_desc[m_OutputPixFmt].codedbytes; // dstStride is scaled by codedbytes to obtain a stride in bytes
+
+ dstArray[0] = dst;
+ dstStrideArray[0] = byteStride;
+
+ for (int i = 1; i < lav_pixfmt_desc[m_OutputPixFmt].planes; ++i) { // every further plane starts directly behind the previous plane inside dst
+ dstArray[i] = dstArray[i - 1] + dstStrideArray[i - 1] * (planeHeight / lav_pixfmt_desc[m_OutputPixFmt].planeHeight[i - 1]);
+ dstStrideArray[i] = byteStride / lav_pixfmt_desc[m_OutputPixFmt].planeWidth[i];
+ }
+
+ hr = (this->*convert_direct)(buffer.data, buffer.stride, dstArray, dstStrideArray, width, height, m_InputPixFmt, m_InBpp, m_OutputPixFmt);
+ pFrame->direct_unlock(pFrame); // always paired with the successful direct_lock above
+ }
+
+ return hr;
+}
+
void CLAVPixFmtConverter::ChangeStride(const uint8_t* src, ptrdiff_t srcStride, uint8_t *dst, ptrdiff_t dstStride, int width, int height, int planeHeight, LAVOutPixFmts format)
{
LAVOutPixFmtDesc desc = lav_pixfmt_desc[format];
diff --git a/decoder/LAVVideo/LAVPixFmtConverter.h b/decoder/LAVVideo/LAVPixFmtConverter.h
index f50a8655..3ab7d9b0 100644
--- a/decoder/LAVVideo/LAVPixFmtConverter.h
+++ b/decoder/LAVVideo/LAVPixFmtConverter.h
@@ -76,8 +76,10 @@ public:
BOOL IsAllowedSubtype(const GUID *guid);
HRESULT Convert(LAVFrame *pFrame, uint8_t *dst, int width, int height, ptrdiff_t dstStride, int planeHeight);
+ HRESULT ConvertDirect(LAVFrame *pFrame, uint8_t *dst, int width, int height, ptrdiff_t dstStride, int planeHeight);
BOOL IsRGBConverterActive() { return m_bRGBConverter; }
+ BOOL IsDirectModeSupported(uintptr_t dst, ptrdiff_t stride);
DWORD GetImageSize(int width, int height, LAVOutPixFmts pixFmt = LAVOutPixFmt_None);
@@ -90,6 +92,7 @@ private:
LAVOutPixFmts GetFilteredFormat(int index);
void SelectConvertFunction();
+ void SelectConvertFunctionDirect();
// Helper functions for convert_generic
HRESULT swscale_scale(enum AVPixelFormat srcPix, enum AVPixelFormat dstPix, const uint8_t* const src[4], const ptrdiff_t srcStride[4], uint8_t* dst[4], int width, int height, const ptrdiff_t dstStride[4], LAVOutPixFmtDesc pixFmtDesc, bool swapPlanes12 = false);
@@ -110,6 +113,7 @@ private:
// Conversion function pointer
ConverterFn convert;
+ ConverterFn convert_direct;
// Pixel Implementations
DECLARE_CONV_FUNC(convert_generic);
@@ -130,6 +134,8 @@ private:
DECLARE_CONV_FUNC(convert_rgb48_rgb32_ssse3);
template <int out32> DECLARE_CONV_FUNC(convert_rgb48_rgb);
+ DECLARE_CONV_FUNC(plane_copy_direct_sse4);
+
DECLARE_CONV_FUNC(convert_yuv_rgb);
const RGBCoeffs* getRGBCoeffs(int width, int height);
void InitRGBConvDispatcher();
@@ -141,6 +147,8 @@ private:
LAVOutPixFmts m_OutputPixFmt = LAVOutPixFmt_YV12;
int m_InBpp = 0;
+ BOOL m_bDirectMode = false;
+
int swsWidth = 0;
int swsHeight = 0;
int swsOutputRange = 0;
diff --git a/decoder/LAVVideo/LAVVideo.cpp b/decoder/LAVVideo/LAVVideo.cpp
index fb318cd8..31434e7c 100644
--- a/decoder/LAVVideo/LAVVideo.cpp
+++ b/decoder/LAVVideo/LAVVideo.cpp
@@ -675,6 +675,25 @@ done:
return SUCCEEDED(hr) ? S_OK : VFW_E_TYPE_NOT_ACCEPTED;
}
+HRESULT CLAVVideo::CheckDirectMode() // Decides whether the decoder may deliver direct frames and passes the decision to SetDirectOutput(); always returns S_OK.
+{
+ LAVPixelFormat pix;
+ int bpp;
+ m_Decoder.GetPixelFormat(&pix, &bpp);
+
+ BOOL bDirect = (pix == LAVPixFmt_NV12); // direct output is only considered when the decoder reports NV12
+ if (m_Decoder.IsInterlaced() && m_settings.SWDeintMode == SWDeintMode_YADIF) // not for interlaced content with YADIF software deinterlacing selected
+ bDirect = FALSE;
+ else if (m_pOutput->CurrentMediaType().subtype != MEDIASUBTYPE_NV12) // not when the output pin is connected with a subtype other than NV12
+ bDirect = FALSE;
+ else if (m_SubtitleConsumer && m_SubtitleConsumer->HasProvider()) // not while a subtitle provider is attached
+ bDirect = FALSE;
+
+ m_Decoder.SetDirectOutput(bDirect);
+
+ return S_OK;
+}
+
HRESULT CLAVVideo::SetMediaType(PIN_DIRECTION dir, const CMediaType *pmt)
{
HRESULT hr = S_OK;
@@ -799,6 +818,9 @@ HRESULT CLAVVideo::CompleteConnect(PIN_DIRECTION dir, IPin *pReceivePin)
HRESULT hr = S_OK;
if (dir == PINDIR_OUTPUT) {
hr = m_Decoder.PostConnect(pReceivePin);
+ if (SUCCEEDED(hr)) {
+ CheckDirectMode();
+ }
} else if (dir == PINDIR_INPUT) {
if (m_pInput->CurrentMediaType().subtype == MEDIASUBTYPE_MPEG2_VIDEO && !m_pSubtitleInput && (m_dwDecodeFlags & LAV_VIDEO_DEC_FLAG_DVD)) {
m_pSubtitleInput = new CLAVVideoSubtitleInputPin(TEXT("CLAVVideoSubtitleInputPin"), this, &m_csFilter, &hr, L"Subtitle Input");
@@ -1277,6 +1299,46 @@ STDMETHODIMP CLAVVideo::ReleaseFrame(LAVFrame **ppFrame)
return S_OK;
}
+HRESULT CLAVVideo::DeDirectFrame(LAVFrame *pFrame, bool bDisableDirectMode) // Turns a direct frame into a regular frame by copying its planes into newly allocated buffers; returns S_FALSE if the frame is not direct, S_OK otherwise.
+{
+ if (!pFrame->direct)
+ return S_FALSE;
+
+ ASSERT(pFrame->direct_lock && pFrame->direct_unlock);
+
+ LAVPixFmtDesc desc = getPixelFormatDesc(pFrame->format);
+
+ LAVFrame tmpFrame = *pFrame; // the copy keeps the original data pointers, destruct/priv_data and lock callbacks while pFrame is reset below
+ pFrame->destruct = nullptr;
+ pFrame->priv_data = nullptr;
+ pFrame->direct = false;
+ pFrame->direct_lock = nullptr;
+ pFrame->direct_unlock = nullptr;
+ memset(pFrame->data, 0, sizeof(pFrame->data));
+
+ LAVDirectBuffer buffer;
+ if (tmpFrame.direct_lock(&tmpFrame, &buffer)) {
+ AllocLAVFrameBuffers(pFrame, buffer.stride[0] / desc.codedbytes); // NOTE(review): the allocation result is not checked before the copies below
+
+ // use slow copy, this should only be used extremely rarely
+ memcpy(pFrame->data[0], buffer.data[0], pFrame->height * buffer.stride[0]); // NOTE(review): assumes the new planes use the same stride as the locked buffer - confirm against AllocLAVFrameBuffers
+ for (int i = 1; i < desc.planes; i++)
+ memcpy(pFrame->data[i], buffer.data[i], (pFrame->height / desc.planeHeight[i]) * buffer.stride[i]);
+
+ tmpFrame.direct_unlock(&tmpFrame);
+ } else {
+ // fallback, alloc anyway so nothing blows up
+ AllocLAVFrameBuffers(pFrame); // the buffers are allocated here but nothing is copied into them
+ }
+
+ FreeLAVFrameBuffers(&tmpFrame); // presumably releases the original direct buffers through the callbacks saved in tmpFrame - TODO confirm
+
+ if (bDisableDirectMode) // optionally ask the decoder to stop delivering direct frames from now on
+ m_Decoder.SetDirectOutput(false);
+
+ return S_OK;
+}
+
STDMETHODIMP_(LAVFrame*) CLAVVideo::GetFlushFrame()
{
LAVFrame *pFlushFrame = nullptr;
@@ -1367,6 +1429,7 @@ HRESULT CLAVVideo::DeliverToRenderer(LAVFrame *pFrame)
if (pFrame->format != LAVPixFmt_DXVA2) {
ReleaseFrame(&m_pLastSequenceFrame);
if ((pFrame->flags & LAV_FRAME_FLAG_END_OF_SEQUENCE || m_bInDVDMenu)) {
+ if (pFrame->direct) DeDirectFrame(pFrame, false);
CopyLAVFrame(pFrame, &m_pLastSequenceFrame);
}
} else {
@@ -1446,8 +1509,10 @@ HRESULT CLAVVideo::DeliverToRenderer(LAVFrame *pFrame)
if (m_SubtitleConsumer && m_SubtitleConsumer->HasProvider()) {
m_SubtitleConsumer->SetVideoSize(width, height);
m_SubtitleConsumer->RequestFrame(pFrame->rtStart, pFrame->rtStop);
- if (!bRGBOut)
+ if (!bRGBOut) {
+ if (pFrame->direct) DeDirectFrame(pFrame, true);
m_SubtitleConsumer->ProcessFrame(pFrame);
+ }
}
// Grab a media sample, and start assembling the data for it.
@@ -1494,7 +1559,16 @@ HRESULT CLAVVideo::DeliverToRenderer(LAVFrame *pFrame)
QueryPerformanceFrequency(&frequency);
QueryPerformanceCounter(&start);
#endif
- m_PixFmtConverter.Convert(pFrame, pDataOut, width, height, pBIH->biWidth, abs(pBIH->biHeight));
+
+ if (pFrame->direct && !m_PixFmtConverter.IsDirectModeSupported((uintptr_t)pDataOut, pBIH->biWidth)) {
+ DeDirectFrame(pFrame, true);
+ }
+
+ if (pFrame->direct)
+ m_PixFmtConverter.ConvertDirect(pFrame, pDataOut, width, height, pBIH->biWidth, abs(pBIH->biHeight));
+ else
+ m_PixFmtConverter.Convert(pFrame, pDataOut, width, height, pBIH->biWidth, abs(pBIH->biHeight));
+
#if defined(DEBUG) && DEBUG_PIXELCONV_TIMINGS
QueryPerformanceCounter(&end);
double diff = (end.QuadPart - start.QuadPart) * 1000.0 / frequency.QuadPart;
diff --git a/decoder/LAVVideo/LAVVideo.h b/decoder/LAVVideo/LAVVideo.h
index 83579bd8..7fd834c9 100644
--- a/decoder/LAVVideo/LAVVideo.h
+++ b/decoder/LAVVideo/LAVVideo.h
@@ -192,6 +192,9 @@ private:
HRESULT NegotiatePixelFormat(CMediaType &mt, int width, int height);
BOOL IsInterlaced();
+ HRESULT CheckDirectMode();
+ HRESULT DeDirectFrame(LAVFrame *pFrame, bool bDisableDirectMode = true);
+
HRESULT Filter(LAVFrame *pFrame);
HRESULT DeliverToRenderer(LAVFrame *pFrame);
diff --git a/decoder/LAVVideo/LAVVideo.vcxproj b/decoder/LAVVideo/LAVVideo.vcxproj
index 6c2ed16c..f0b98e4c 100644
--- a/decoder/LAVVideo/LAVVideo.vcxproj
+++ b/decoder/LAVVideo/LAVVideo.vcxproj
@@ -114,6 +114,7 @@
<ClCompile Include="parsers\HEVCSequenceParser.cpp" />
<ClCompile Include="parsers\MPEG2HeaderParser.cpp" />
<ClCompile Include="parsers\VC1HeaderParser.cpp" />
+ <ClCompile Include="pixconv\convert_direct.cpp" />
<ClCompile Include="pixconv\convert_generic.cpp" />
<ClCompile Include="pixconv\interleave.cpp" />
<ClCompile Include="pixconv\pixconv.cpp" />
diff --git a/decoder/LAVVideo/LAVVideo.vcxproj.filters b/decoder/LAVVideo/LAVVideo.vcxproj.filters
index aef02e38..4b128c93 100644
--- a/decoder/LAVVideo/LAVVideo.vcxproj.filters
+++ b/decoder/LAVVideo/LAVVideo.vcxproj.filters
@@ -159,6 +159,9 @@
<ClCompile Include="parsers\HEVCSequenceParser.cpp">
<Filter>Source Files\parsers</Filter>
</ClCompile>
+ <ClCompile Include="pixconv\convert_direct.cpp">
+ <Filter>Source Files\pixconv</Filter>
+ </ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="stdafx.h">
diff --git a/decoder/LAVVideo/decoders/DecBase.h b/decoder/LAVVideo/decoders/DecBase.h
index f6604998..752b66da 100644
--- a/decoder/LAVVideo/decoders/DecBase.h
+++ b/decoder/LAVVideo/decoders/DecBase.h
@@ -43,6 +43,8 @@ public:
STDMETHODIMP SyncToProcessThread() { return HasThreadSafeBuffers() == S_OK ? S_FALSE : S_OK; }
STDMETHODIMP HasThreadSafeBuffers() { return S_FALSE; }
+ STDMETHODIMP SetDirectOutput(BOOL bDirect) { return S_FALSE; } // Default implementation: ignores bDirect and returns S_FALSE.
+
STDMETHODIMP Decode(IMediaSample *pSample) {
HRESULT hr;
diff --git a/decoder/LAVVideo/decoders/ILAVDecoder.h b/decoder/LAVVideo/decoders/ILAVDecoder.h
index 87a6d7c1..93660cc3 100644
--- a/decoder/LAVVideo/decoders/ILAVDecoder.h
+++ b/decoder/LAVVideo/decoders/ILAVDecoder.h
@@ -77,6 +77,11 @@ LAVPixFmtDesc getPixelFormatDesc(LAVPixelFormat pixFmt);
*/
AVPixelFormat getFFPixelFormatFromLAV(LAVPixelFormat pixFmt, int bpp);
+typedef struct LAVDirectBuffer { // Plane pointers and strides that a LAVFrame direct_lock callback fills in for its caller.
+ BYTE *data[4]; ///< pointer to the picture planes
+ ptrdiff_t stride[4]; ///< stride of the planes (in bytes)
+} LAVDirectBuffer;
+
/**
* A Video Frame
*
@@ -122,6 +127,10 @@ typedef struct LAVFrame {
/* destruct function to free any buffers being held by this frame (may be null) */
void (*destruct)(struct LAVFrame *);
void *priv_data; ///< private data from the decoder (mostly for destruct)
+
+ bool direct;
+ bool (*direct_lock)(struct LAVFrame *, struct LAVDirectBuffer *);
+ void (*direct_unlock)(struct LAVFrame *);
} LAVFrame;
/**
@@ -371,6 +380,11 @@ interface ILAVDecoder
* Get whether the decoder should sync to the main thread
*/
STDMETHOD(SyncToProcessThread)() PURE;
+
+ /**
+ * Toggle direct frame output mode for hardware decoders
+ */
+ STDMETHOD(SetDirectOutput)(BOOL bDirect) PURE;
};
/**
diff --git a/decoder/LAVVideo/decoders/dxva2dec.cpp b/decoder/LAVVideo/decoders/dxva2dec.cpp
index 01982df2..7443d787 100644
--- a/decoder/LAVVideo/decoders/dxva2dec.cpp
+++ b/decoder/LAVVideo/decoders/dxva2dec.cpp
@@ -1180,8 +1180,7 @@ HRESULT CDecDXVA2::ReInitDXVA2Decoder(AVCodecContext *c)
}
hr = m_pDXVA2Allocator->Commit();
} else if (!m_bNative) {
- if (SyncToProcessThread() == S_FALSE)
- FlushDisplayQueue(TRUE);
+ FlushDisplayQueue(TRUE);
hr = CreateDXVA2Decoder();
}
}
@@ -1414,10 +1413,14 @@ HRESULT CDecDXVA2::DeliverDXVA2Frame(LAVFrame *pFrame)
pFrame->format = LAVPixFmt_DXVA2;
Deliver(pFrame);
} else {
- if (CopyFrame(pFrame))
- Deliver(pFrame);
- else
- ReleaseFrame(&pFrame);
+ if (m_bDirect) {
+ DeliverDirect(pFrame);
+ } else {
+ if (CopyFrame(pFrame))
+ Deliver(pFrame);
+ else
+ ReleaseFrame(&pFrame);
+ }
}
return S_OK;
@@ -1470,3 +1473,51 @@ __forceinline bool CDecDXVA2::CopyFrame(LAVFrame *pFrame)
return true;
}
+
+
+static bool direct_lock(LAVFrame * pFrame, LAVDirectBuffer *pBuffer) // Locks the Direct3D surface stored in pFrame->data[3] read-only and describes two planes of it in pBuffer; returns false if LockRect fails.
+{
+ ASSERT(pFrame && pBuffer);
+
+ HRESULT hr;
+ LPDIRECT3DSURFACE9 pSurface = (LPDIRECT3DSURFACE9)pFrame->data[3]; // the surface pointer is carried in the frame's fourth data slot
+
+ D3DSURFACE_DESC surfaceDesc;
+ pSurface->GetDesc(&surfaceDesc); // NOTE(review): the result of GetDesc is not checked; surfaceDesc.Height is used below
+
+ D3DLOCKED_RECT LockedRect;
+ hr = pSurface->LockRect(&LockedRect, nullptr, D3DLOCK_READONLY);
+ if (FAILED(hr)) {
+ DbgLog((LOG_TRACE, 10, L"pSurface->LockRect failed (hr: %X)", hr));
+ return false;
+ }
+
+ memset(pBuffer, 0, sizeof(*pBuffer)); // entries 2 and 3 of data/stride stay zero
+
+ pBuffer->data[0] = (BYTE *)LockedRect.pBits;
+ pBuffer->data[1] = pBuffer->data[0] + surfaceDesc.Height * LockedRect.Pitch; // the second plane begins surfaceDesc.Height rows after the first
+
+ pBuffer->stride[0] = LockedRect.Pitch; // both planes are reported with the pitch of the locked surface
+ pBuffer->stride[1] = LockedRect.Pitch;
+ return true;
+}
+
+static void direct_unlock(LAVFrame * pFrame) // Unlocks the Direct3D surface stored in pFrame->data[3]; counterpart of direct_lock.
+{
+ ASSERT(pFrame);
+ LPDIRECT3DSURFACE9 pSurface = (LPDIRECT3DSURFACE9)pFrame->data[3];
+ pSurface->UnlockRect();
+}
+
+
+bool CDecDXVA2::DeliverDirect(LAVFrame *pFrame) // Marks the frame as a direct NV12 frame, attaches the lock/unlock callbacks and delivers it; always returns true.
+{
+ pFrame->format = LAVPixFmt_NV12;
+ pFrame->direct = true;
+ pFrame->direct_lock = direct_lock; // the consumer calls these to access the pixel data
+ pFrame->direct_unlock = direct_unlock;
+
+ Deliver(pFrame);
+
+ return true;
+}
diff --git a/decoder/LAVVideo/decoders/dxva2dec.h b/decoder/LAVVideo/decoders/dxva2dec.h
index 7f259728..1bc77e8e 100644
--- a/decoder/LAVVideo/decoders/dxva2dec.h
+++ b/decoder/LAVVideo/decoders/dxva2dec.h
@@ -52,9 +52,10 @@ public:
STDMETHODIMP InitAllocator(IMemAllocator **ppAlloc);
STDMETHODIMP PostConnect(IPin *pPin);
STDMETHODIMP_(long) GetBufferCount();
- STDMETHODIMP_(const WCHAR*) GetDecoderName() { return m_bNative ? L"dxva2n" : L"dxva2cb"; }
+ STDMETHODIMP_(const WCHAR*) GetDecoderName() { return m_bNative ? L"dxva2n" : (m_bDirect ? L"dxva2cb direct" : L"dxva2cb"); } // Name reflects the active mode: native, copy-back with direct output, or plain copy-back.
STDMETHODIMP HasThreadSafeBuffers() { return m_bNative ? S_FALSE : S_OK; }
- STDMETHODIMP SyncToProcessThread() { return HasThreadSafeBuffers() == S_OK ? S_FALSE : S_OK; }
+ STDMETHODIMP SyncToProcessThread() { return S_OK; } // Now unconditionally S_OK; it no longer depends on HasThreadSafeBuffers().
+ STDMETHODIMP SetDirectOutput(BOOL bDirect) { m_bDirect = bDirect; return S_OK; } // Stores the direct-output flag in m_bDirect and returns S_OK.
// CDecBase
STDMETHODIMP Init();
@@ -68,6 +69,7 @@ protected:
HRESULT DeliverDXVA2Frame(LAVFrame *pFrame);
bool CopyFrame(LAVFrame *pFrame);
+ bool DeliverDirect(LAVFrame *pFrame);
private:
HRESULT InitD3D();
@@ -101,6 +103,7 @@ private:
private:
friend class CDXVA2SurfaceAllocator;
BOOL m_bNative = FALSE;
+ BOOL m_bDirect = FALSE;
CDXVA2SurfaceAllocator *m_pDXVA2Allocator = nullptr;
struct {
diff --git a/decoder/LAVVideo/pixconv/convert_direct.cpp b/decoder/LAVVideo/pixconv/convert_direct.cpp
new file mode 100644
index 00000000..f61dc38a
--- /dev/null
+++ b/decoder/LAVVideo/pixconv/convert_direct.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (C) 2010-2015 Hendrik Leppkes
+ * http://www.1f0.de
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "stdafx.h"
+
+#include <emmintrin.h>
+
+#include "pixconv_internal.h"
+#include "pixconv_sse2_templates.h"
+
+// This function is only designed for NV12-like pixel formats, like NV12, P010, P016, ...
+DECLARE_CONV_FUNC_IMPL(plane_copy_direct_sse4) // Copies plane 0 (height rows) and plane 1 (height/2 rows) from src to dst with streaming loads and stores; always returns S_OK.
+{
+ const ptrdiff_t inStride = srcStride[0]; // the stride of plane 0 is used for both planes
+ const ptrdiff_t outStride = dstStride[0];
+ const ptrdiff_t chromaHeight = (height >> 1); // the second plane has half as many rows
+
+ const ptrdiff_t byteWidth = (outputFormat == LAVOutPixFmt_P010 || outputFormat == LAVOutPixFmt_P016) ? width << 1 : width; // row width in bytes, doubled for P010/P016
+ const ptrdiff_t stride = min(FFALIGN(byteWidth, 64), min(inStride, outStride)); // limit of the 64-byte loop: the row rounded up to 64 bytes, capped by both strides
+
+ __m128i xmm0,xmm1,xmm2,xmm3;
+
+ _mm_sfence();
+
+ ptrdiff_t line, i;
+
+ for (line = 0; line < height; line++) { // first plane
+ const uint8_t *y = (src[0] + line * inStride);
+ uint8_t *dy = (dst[0] + line * outStride);
+ for (i = 0; i < (stride - 63); i += 64) { // main loop: 64 bytes per iteration
+ PIXCONV_STREAM_LOAD(xmm0, y + i + 0);
+ PIXCONV_STREAM_LOAD(xmm1, y + i + 16);
+ PIXCONV_STREAM_LOAD(xmm2, y + i + 32);
+ PIXCONV_STREAM_LOAD(xmm3, y + i + 48);
+
+ _ReadWriteBarrier(); // separates the four loads from the four stores
+
+ PIXCONV_PUT_STREAM(dy + i + 0, xmm0);
+ PIXCONV_PUT_STREAM(dy + i + 16, xmm1);
+ PIXCONV_PUT_STREAM(dy + i + 32, xmm2);
+ PIXCONV_PUT_STREAM(dy + i + 48, xmm3);
+ }
+
+ for (; i < byteWidth; i += 16) { // tail: rest of the row, 16 bytes at a time. NOTE(review): touches up to 15 bytes past byteWidth when it is not a multiple of 16
+ PIXCONV_LOAD_ALIGNED(xmm0, y + i);
+ PIXCONV_PUT_STREAM(dy + i, xmm0);
+ }
+ }
+
+ for (line = 0; line < chromaHeight; line++) { // second plane, same copy scheme as above
+ const uint8_t *uv = (src[1] + line * inStride);
+ uint8_t *duv = (dst[1] + line * outStride);
+ for (i = 0; i < (stride - 63); i += 64) {
+ PIXCONV_STREAM_LOAD(xmm0, uv + i + 0);
+ PIXCONV_STREAM_LOAD(xmm1, uv + i + 16);
+ PIXCONV_STREAM_LOAD(xmm2, uv + i + 32);
+ PIXCONV_STREAM_LOAD(xmm3, uv + i + 48);
+
+ _ReadWriteBarrier();
+
+ PIXCONV_PUT_STREAM(duv + i + 0, xmm0);
+ PIXCONV_PUT_STREAM(duv + i + 16, xmm1);
+ PIXCONV_PUT_STREAM(duv + i + 32, xmm2);
+ PIXCONV_PUT_STREAM(duv + i + 48, xmm3);
+ }
+
+ for (; i < byteWidth; i += 16) {
+ PIXCONV_LOAD_ALIGNED(xmm0, uv + i);
+ PIXCONV_PUT_STREAM(duv + i, xmm0);
+ }
+ }
+
+ return S_OK;
+}
diff --git a/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h b/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h
index 6b913ad8..602b6905 100644
--- a/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h
+++ b/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h
@@ -97,6 +97,12 @@
#define PIXCONV_LOAD_ALIGNED(reg,src) \
reg = _mm_load_si128((const __m128i *)(src)); /* load (aligned) */
+// Load 128-bit into a register, using streaming memory access
+// reg - register to store pixels in
+// src - memory pointer of the source
+#define PIXCONV_STREAM_LOAD(reg,src) \
+ reg = _mm_stream_load_si128((__m128i *)(src)); /* load (streaming) */
+
#define PIXCONV_LOAD_PIXEL8_ALIGNED PIXCONV_LOAD_ALIGNED
// Put 128-bit into memory, using streaming write