From 37f62abd654047d060c86d6c76cd2f6862f89b94 Mon Sep 17 00:00:00 2001 From: kinddragon Date: Fri, 21 May 2010 00:53:52 +0000 Subject: DSUtil now use new VirtualDub libraries (SSE2 deinterlacing for MPEG2 decoder) AudioSwitcher rare memory corruption fixed git-svn-id: https://mpc-hc.svn.sourceforge.net/svnroot/mpc-hc/trunk@1907 10f7b99b-c216-0410-bff0-8a66a9350fd8 --- src/DSUtil/deinterlace.cpp | 526 ++++++++++++++++++++++++ src/DSUtil/dsutil.vcproj | 132 +----- src/DSUtil/vd.cpp | 970 +++++++++++---------------------------------- src/DSUtil/vd.h | 18 +- src/DSUtil/vd_asm.cpp | 290 +------------- src/DSUtil/vd_asm.h | 11 +- 6 files changed, 777 insertions(+), 1170 deletions(-) create mode 100644 src/DSUtil/deinterlace.cpp (limited to 'src/DSUtil') diff --git a/src/DSUtil/deinterlace.cpp b/src/DSUtil/deinterlace.cpp new file mode 100644 index 000000000..a66915dfd --- /dev/null +++ b/src/DSUtil/deinterlace.cpp @@ -0,0 +1,526 @@ +// VirtualDub - Video processing and capture application +// Copyright (C) 1998-2001 Avery Lee +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +#include "stdafx.h" +#include +#include + +#define uint8 unsigned char +#define uint32 unsigned int +#define uint64 unsigned __int64 + +#ifdef _M_IX86 +#define VD_CPU_X86 +#endif + +#ifdef _M_X64 +#define VD_CPU_AMD64 +#endif + +/////////////////////////////////////////////////////////////////////////// + +#pragma warning(disable: 4799) // warning C4799: function has no EMMS instruction + +/////////////////////////////////////////////////////////////////////////// + +#ifdef _M_IX86 +static void __declspec(naked) asm_blend_row_clipped(void *dst, const void *src, uint32 w, ptrdiff_t srcpitch) { + __asm { + push ebp + push edi + push esi + push ebx + + mov edi,[esp+20] + mov esi,[esp+24] + sub edi,esi + mov ebp,[esp+28] + mov edx,[esp+32] + +xloop: + mov ecx,[esi] + mov eax,0fefefefeh + + mov ebx,[esi+edx] + and eax,ecx + + shr eax,1 + and ebx,0fefefefeh + + shr ebx,1 + add esi,4 + + add eax,ebx + dec ebp + + mov [edi+esi-4],eax + jnz xloop + + pop ebx + pop esi + pop edi + pop ebp + ret + }; +} + +static void __declspec(naked) asm_blend_row(void *dst, const void *src, uint32 w, ptrdiff_t srcpitch) { + __asm { + push ebp + push edi + push esi + push ebx + + mov edi,[esp+20] + mov esi,[esp+24] + sub edi,esi + mov ebp,[esp+28] + mov edx,[esp+32] + +xloop: + mov ecx,[esi] + mov eax,0fcfcfcfch + + mov ebx,[esi+edx] + and eax,ecx + + shr ebx,1 + mov ecx,[esi+edx*2] + + shr ecx,2 + and ebx,07f7f7f7fh + + shr eax,2 + and ecx,03f3f3f3fh + + add eax,ebx + add esi,4 + + add eax,ecx + dec ebp + + mov [edi+esi-4],eax + jnz xloop + + pop ebx + pop esi + pop edi + pop ebp + ret + }; +} + +static void __declspec(naked) asm_blend_row_MMX(void *dst, const void *src, uint32 w, ptrdiff_t srcpitch) { + static const __declspec(align(8)) __int64 mask0 = 0xfcfcfcfcfcfcfcfci64; + static const __declspec(align(8)) __int64 mask1 = 0x7f7f7f7f7f7f7f7fi64; + static const __declspec(align(8)) __int64 mask2 = 0x3f3f3f3f3f3f3f3fi64; + __asm { + push ebp + push edi + push esi + push ebx + + mov edi,[esp+20] + mov esi,[esp+24] + sub edi,esi + mov ebp,[esp+28] + mov edx,[esp+32] + + movq mm5,mask0 + movq mm6,mask1 + movq mm7,mask2 + inc ebp + shr ebp,1 +xloop: + movq mm2,[esi] + movq mm0,mm5 + + movq mm1,[esi+edx] + pand mm0,mm2 + + psrlq mm1,1 + movq mm2,[esi+edx*2] + + psrlq mm2,2 + pand mm1,mm6 + + psrlq mm0,2 + pand mm2,mm7 + + paddb mm0,mm1 + add esi,8 + + paddb mm0,mm2 + dec ebp + + movq [edi+esi-8],mm0 + jne xloop + + pop ebx + pop esi + pop edi + pop ebp + ret + }; +} + +static void __declspec(naked) asm_blend_row_ISSE(void *dst, const void *src, uint32 w, ptrdiff_t srcpitch) { + __asm { + push ebp + push edi + push esi + push ebx + + mov edi,[esp+20] + mov esi,[esp+24] + sub edi,esi + mov ebp,[esp+28] + mov edx,[esp+32] + + inc ebp + shr ebp,1 + pcmpeqb mm7, mm7 + + align 16 +xloop: + movq mm0, [esi] + movq mm2, mm7 + pxor mm0, mm7 + + pxor mm2, [esi+edx*2] + pavgb mm0, mm2 + pxor mm0, mm7 + + pavgb mm0, [esi+edx] + add esi,8 + + movq [edi+esi-8],mm0 + dec ebp + jne xloop + + pop ebx + pop esi + pop edi + pop ebp + ret + }; +} +#else +static void asm_blend_row_clipped(void *dst0, const void *src0, uint32 w, ptrdiff_t srcpitch) { + uint32 *dst = (uint32 *)dst0; + const uint32 *src = (const uint32 *)src0; + const uint32 *src2 = (const uint32 *)((const char *)src + srcpitch); + + do { + const uint32 x = *src++; + const uint32 y = *src2++; + + *dst++ = (x|y) - (((x^y)&0xfefefefe)>>1); + } while(--w); +} + +static void asm_blend_row(void *dst0, const void *src0, uint32 w, ptrdiff_t srcpitch) { + uint32 *dst = (uint32 *)dst0; + const uint32 *src = (const uint32 *)src0; + const uint32 *src2 = (const uint32 *)((const char *)src + srcpitch); + const uint32 *src3 = (const uint32 *)((const char *)src2 + srcpitch); + + do { + const uint32 a = *src++; + const uint32 b = *src2++; + const uint32 c = *src3++; + const uint32 hi = (a & 0xfcfcfc) + 2*(b & 0xfcfcfc) + (c & 0xfcfcfc); + const uint32 lo = (a & 0x030303) + 2*(b & 0x030303) + (c & 0x030303) + 0x020202; + + *dst++ = (hi + (lo & 0x0c0c0c))>>2; + } while(--w); +} +#endif + +#if defined(VD_CPU_X86) || defined(VD_CPU_AMD64) + static void asm_blend_row_SSE2(void *dst, const void *src, uint32 w, ptrdiff_t srcpitch) { + __m128i zero = _mm_setzero_si128(); + __m128i inv = _mm_cmpeq_epi8(zero, zero); + + w = (w + 3) >> 2; + + const __m128i *src1 = (const __m128i *)src; + const __m128i *src2 = (const __m128i *)((const char *)src + srcpitch); + const __m128i *src3 = (const __m128i *)((const char *)src + srcpitch*2); + __m128i *dstrow = (__m128i *)dst; + do { + __m128i a = *src1++; + __m128i b = *src2++; + __m128i c = *src3++; + + *dstrow++ = _mm_avg_epu8(_mm_xor_si128(_mm_avg_epu8(_mm_xor_si128(a, inv), _mm_xor_si128(c, inv)), inv), b); + } while(--w); + } + +#endif + +namespace { + + void Average_scalar(void *dst, ptrdiff_t dstPitch, const void *src1, const void *src2, ptrdiff_t srcPitch, uint32 w16, uint32 h) { + uint32 w4 = w16 << 2; + do { + uint32 *dstv = (uint32 *)dst; + uint32 *src1v = (uint32 *)src1; + uint32 *src2v = (uint32 *)src2; + + for(uint32 i=0; i> 1); + } + + dst = (char *)dst + dstPitch; + src1 = (char *)src1 + srcPitch; + src2 = (char *)src2 + srcPitch; + } while(--h); + } + +#if defined(VD_CPU_X86) + void __declspec(naked) __cdecl Average_MMX(void *dst, ptrdiff_t dstPitch, const void *src1, const void *src2, ptrdiff_t srcPitch, uint32 w16, uint32 h) { + static const __declspec(align(8)) uint64 x7fb = 0x7f7f7f7f7f7f7f7f; + static const __declspec(align(8)) uint64 xfeb = 0xfefefefefefefefe; + + __asm { + push ebp + push edi + push esi + push ebx + + mov esi, [esp+24+16] + mov eax, [esp+4+16] + shl esi, 4 + mov ecx, [esp+12+16] + mov edx, [esp+16+16] + mov ebp, [esp+20+16] + mov edi, [esp+8+16] + sub edi, esi + sub ebp, esi + + movq mm6, x7fb + movq mm7, xfeb + + mov esi, [esp+28+16] +yloop: + mov ebx, [esp+24+16] +mainRowLoop: + movq mm0, [ecx] + movq mm3, [ecx + 8] + movq mm1, mm0 + movq mm2, [edx] + movq mm4, mm3 + movq mm5, [edx + 8] + por mm1, mm2 + pxor mm0, mm2 + por mm4, mm5 + pxor mm3, mm5 + psrlq mm0, 1 + pand mm3, mm7 + pand mm0, mm6 + psrlq mm3, 1 + psubb mm1, mm0 + psubb mm4, mm3 + add ecx, 16 + movq [eax], mm1 + movq [eax+8], mm4 + add edx, 16 + add eax, 16 + dec ebx + jne mainRowLoop + + add eax, edi + add ecx, ebp + add edx, ebp + dec esi + jne yloop + + emms + pop ebx + pop esi + pop edi + pop ebp + ret + } + } + + void __declspec(naked) __cdecl Average_ISSE(void *dst, ptrdiff_t dstPitch, const void *src1, const void *src2, ptrdiff_t srcPitch, uint32 w16, uint32 h) { + static const __declspec(align(8)) uint64 x7fb = 0x7f7f7f7f7f7f7f7f; + static const __declspec(align(8)) uint64 xfeb = 0xfefefefefefefefe; + + __asm { + push ebp + push edi + push esi + push ebx + + mov esi, [esp+24+16] + mov eax, [esp+4+16] + shl esi, 4 + mov ecx, [esp+12+16] + mov edx, [esp+16+16] + mov ebp, [esp+20+16] + mov edi, [esp+8+16] + sub edi, esi + sub ebp, esi + + movq mm6, x7fb + movq mm7, xfeb + + mov esi, [esp+28+16] +yloop: + mov ebx, [esp+24+16] +mainRowLoop: + movq mm0, [ecx] + movq mm1, [ecx + 8] + movq mm2, [edx] + movq mm3, [edx + 8] + pavgb mm0, mm2 + pavgb mm1, mm3 + movq [eax], mm0 + add ecx, 16 + add edx, 16 + movq [eax+8], mm1 + add eax, 16 + dec ebx + jne mainRowLoop + + add eax, edi + add ecx, ebp + add edx, ebp + dec esi + jne yloop + + emms + pop ebx + pop esi + pop edi + pop ebp + ret + } + } +#endif + +#if defined(VD_CPU_X86) || defined(VD_CPU_AMD64) + void Average_SSE2(void *dst, ptrdiff_t dstPitch, const void *src1, const void *src2, ptrdiff_t srcPitch, uint32 w16, uint32 h) { + do { + __m128i *dstv = (__m128i *)dst; + __m128i *src1v = (__m128i *)src1; + __m128i *src2v = (__m128i *)src2; + + for(uint32 i=0; i> 2; + + int y0 = interpField2 ? 1 : 2; + + if (!interpField2) + memcpy(dst, src, w * 4); + + if (h > y0) { + ASSERT(((UINT_PTR)dst & 0xF) == 0); + ASSERT((dstpitch & 0xF) == 0); + ASSERT(((UINT_PTR)src & 0xF) == 0); + ASSERT((srcpitch*(y0 - 1) & 0xF) == 0); + blend_func((char *)dst + dstpitch*y0, + dstpitch*2, + (const char *)src + srcpitch*(y0 - 1), + (const char *)src + srcpitch*(y0 + 1), + srcpitch*2, + (w + 3) >> 2, + (h - y0) >> 1); + } + + if (interpField2) + memcpy((char *)dst + dstpitch*(h - 1), (const char *)src + srcpitch*(h - 1), w*4); + +#ifdef _M_IX86 + if (MMX_enabled) + __asm emms +#endif + } + + void BlendPlane(void *dst, ptrdiff_t dstpitch, const void *src, ptrdiff_t srcpitch, uint32 w, uint32 h) { + void (*blend_func)(void *, const void *, uint32, ptrdiff_t); +#if defined(VD_CPU_X86) + if (SSE2_enabled) + blend_func = asm_blend_row_SSE2; + else + blend_func = ISSE_enabled ? asm_blend_row_ISSE : MMX_enabled ? asm_blend_row_MMX : asm_blend_row; +#else + blend_func = asm_blend_row_SSE2; +#endif + + w = (w + 3) >> 2; + + asm_blend_row_clipped(dst, src, w, srcpitch); + if (h-=2) + do { + dst = ((char *)dst + dstpitch); + + blend_func(dst, src, w, srcpitch); + + src = ((char *)src + srcpitch); + } while(--h); + + asm_blend_row_clipped((char *)dst + dstpitch, src, w, srcpitch); + +#ifdef _M_IX86 + if (MMX_enabled) + __asm emms +#endif + } +} + +void DeinterlaceBlend(BYTE* dst, BYTE* src, DWORD w, DWORD h, DWORD dstpitch, DWORD srcpitch) +{ + BlendPlane(dst, dstpitch, src, srcpitch, w, h); +} + +void DeinterlaceBob(BYTE* dst, BYTE* src, DWORD w, DWORD h, DWORD dstpitch, DWORD srcpitch, bool topfield) +{ + topfield = !topfield; + + InterpPlane_Bob(dst, dstpitch, src, srcpitch, w, h, topfield); +} diff --git a/src/DSUtil/dsutil.vcproj b/src/DSUtil/dsutil.vcproj index 49162423b..69a625c18 100644 --- a/src/DSUtil/dsutil.vcproj +++ b/src/DSUtil/dsutil.vcproj @@ -44,7 +44,7 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +#include +#include + +#include +#include +#include + #pragma warning(disable : 4799) // no emms... blahblahblah +void VDCPUTest() { + SYSTEM_INFO si; + + long lEnableFlags = CPUCheckForExtensions(); + + GetSystemInfo(&si); + + if (si.wProcessorArchitecture == PROCESSOR_ARCHITECTURE_INTEL) + if (si.wProcessorLevel < 4) + lEnableFlags &= ~CPUF_SUPPORTS_FPU; // Not strictly true, but very slow anyway + + // Enable FPU support... + + CPUEnableExtensions(lEnableFlags); + + VDFastMemcpyAutodetect(); +} + CCpuID g_cpuid; CCpuID::CCpuID() -{ - int CPUInfo[4] = {-1}; - __cpuid(CPUInfo, 1); - int t = CPUInfo[3]; - - int mflags = 0; - mflags |= ((t&0x00800000)!=0) ? mmx : 0; // STD MMX - mflags |= ((t&0x02000000)!=0) ? ssemmx+ssefpu : 0; // STD SSE - mflags |= ((t&0x04000000)!=0) ? sse2 : 0; // SSE2 +{ + VDCPUTest(); - t = CPUInfo[2]; - mflags |= ((t&0x00000001)!=0) ? sse3 : 0; // SSE3 + long lEnableFlags = CPUGetEnabledExtensions(); - // 3dnow - __cpuid(CPUInfo, 0x80000001); - t = CPUInfo[3]; - mflags |= ((t&0x80000000)!=0) ? _3dnow : 0; // 3D NOW - mflags |= ((t&0x00400000)!=0) ? ssemmx : 0; // SSE MMX + int flags = 0; + flags |= !!(lEnableFlags & CPUF_SUPPORTS_MMX) ? mmx : 0; // STD MMX + flags |= !!(lEnableFlags & CPUF_SUPPORTS_INTEGER_SSE) ? ssemmx : 0; // SSE MMX + flags |= !!(lEnableFlags & CPUF_SUPPORTS_SSE) ? ssefpu : 0; // STD SSE + flags |= !!(lEnableFlags & CPUF_SUPPORTS_SSE2) ? sse2 : 0; // SSE2 + flags |= !!(lEnableFlags & CPUF_SUPPORTS_3DNOW) ? _3dnow : 0; // 3DNow // result - m_flags = (flag_t)mflags; + m_flags = (flag_t)flags; } -void memcpy_accel(void* dst, const void* src, size_t len) +bool BitBltFromI420ToI420(int w, int h, BYTE* dsty, BYTE* dstu, BYTE* dstv, int dstpitch, BYTE* srcy, BYTE* srcu, BYTE* srcv, int srcpitch) { -#ifndef _WIN64 - if((g_cpuid.m_flags & CCpuID::ssefpu) && len >= 128 - && !((DWORD)src&15) && !((DWORD)dst&15)) - { - __asm - { - mov esi, dword ptr [src] - mov edi, dword ptr [dst] - mov ecx, len - shr ecx, 7 - memcpy_accel_sse_loop: - prefetchnta [esi+16*8] - movaps xmm0, [esi] - movaps xmm1, [esi+16*1] - movaps xmm2, [esi+16*2] - movaps xmm3, [esi+16*3] - movaps xmm4, [esi+16*4] - movaps xmm5, [esi+16*5] - movaps xmm6, [esi+16*6] - movaps xmm7, [esi+16*7] - movntps [edi], xmm0 - movntps [edi+16*1], xmm1 - movntps [edi+16*2], xmm2 - movntps [edi+16*3], xmm3 - movntps [edi+16*4], xmm4 - movntps [edi+16*5], xmm5 - movntps [edi+16*6], xmm6 - movntps [edi+16*7], xmm7 - add esi, 128 - add edi, 128 - dec ecx - jne memcpy_accel_sse_loop - mov ecx, len - and ecx, 127 - cmp ecx, 0 - je memcpy_accel_sse_end - memcpy_accel_sse_loop2: - mov dl, byte ptr[esi] - mov byte ptr[edi], dl - inc esi - inc edi - dec ecx - jne memcpy_accel_sse_loop2 - memcpy_accel_sse_end: - emms - sfence - } - } - else if((g_cpuid.m_flags & CCpuID::mmx) && len >= 64 - && !((DWORD)src&7) && !((DWORD)dst&7)) - { - __asm - { - mov esi, dword ptr [src] - mov edi, dword ptr [dst] - mov ecx, len - shr ecx, 6 - memcpy_accel_mmx_loop: - movq mm0, qword ptr [esi] - movq mm1, qword ptr [esi+8*1] - movq mm2, qword ptr [esi+8*2] - movq mm3, qword ptr [esi+8*3] - movq mm4, qword ptr [esi+8*4] - movq mm5, qword ptr [esi+8*5] - movq mm6, qword ptr [esi+8*6] - movq mm7, qword ptr [esi+8*7] - movq qword ptr [edi], mm0 - movq qword ptr [edi+8*1], mm1 - movq qword ptr [edi+8*2], mm2 - movq qword ptr [edi+8*3], mm3 - movq qword ptr [edi+8*4], mm4 - movq qword ptr [edi+8*5], mm5 - movq qword ptr [edi+8*6], mm6 - movq qword ptr [edi+8*7], mm7 - add esi, 64 - add edi, 64 - loop memcpy_accel_mmx_loop - mov ecx, len - and ecx, 63 - cmp ecx, 0 - je memcpy_accel_mmx_end - memcpy_accel_mmx_loop2: - mov dl, byte ptr [esi] - mov byte ptr [edi], dl - inc esi - inc edi - dec ecx - jne memcpy_accel_mmx_loop2 - memcpy_accel_mmx_end: - emms - } - } - else -#endif - { - memcpy(dst, src, len); - } + VDPixmap srcbm = {0}; + + srcbm.data = srcy; + srcbm.pitch = srcpitch; + srcbm.w = w; + srcbm.h = h; + srcbm.format = nsVDPixmap::kPixFormat_YUV420_Planar; + srcbm.data2 = srcu; + srcbm.pitch2 = srcpitch / 2; + srcbm.data3 = srcv; + srcbm.pitch3 = srcpitch / 2; + + VDPixmap dstpxm = {0}; + + dstpxm.data = dsty; + dstpxm.pitch = dstpitch; + dstpxm.w = w; + dstpxm.h = h; + dstpxm.format = nsVDPixmap::kPixFormat_YUV420_Planar; + dstpxm.data2 = dstu; + dstpxm.pitch2 = dstpitch / 2; + dstpxm.data3 = dstv; + dstpxm.pitch3 = dstpitch / 2; + + return VDPixmapBlt(dstpxm, srcbm); } -static void yuvtoyuy2row_c(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width) +bool BitBltFromYUY2ToYUY2(int w, int h, BYTE* dst, int dstpitch, BYTE* src, int srcpitch) { - WORD* dstw = (WORD*)dst; - for(; width > 1; width -= 2) - { - *dstw++ = (*srcu++<<8)|*srcy++; - *dstw++ = (*srcv++<<8)|*srcy++; - } + VDPixmap srcbm = {0}; + + srcbm.data = src; + srcbm.pitch = srcpitch; + srcbm.w = w; + srcbm.h = h; + srcbm.format = nsVDPixmap::kPixFormat_YUV422_YUYV; + + VDPixmap dstpxm = { + dst, + NULL, + w, + h, + dstpitch + }; + + dstpxm.format = nsVDPixmap::kPixFormat_YUV422_YUYV; + + return VDPixmapBlt(dstpxm, srcbm); } -static void yuvtoyuy2row_avg_c(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width, DWORD pitchuv) +bool BitBltFromI420ToRGB(int w, int h, BYTE* dst, int dstpitch, int dbpp, BYTE* srcy, BYTE* srcu, BYTE* srcv, int srcpitch) { - WORD* dstw = (WORD*)dst; - for(; width > 1; width -= 2, srcu++, srcv++) - { - *dstw++ = (((srcu[0]+srcu[pitchuv])>>1)<<8)|*srcy++; - *dstw++ = (((srcv[0]+srcv[pitchuv])>>1)<<8)|*srcy++; - } + VDPixmap srcbm = {0}; + + srcbm.data = srcy; + srcbm.pitch = srcpitch; + srcbm.w = w; + srcbm.h = h; + srcbm.format = nsVDPixmap::kPixFormat_YUV420_Planar; + srcbm.data2 = srcu; + srcbm.pitch2 = srcpitch/2; + srcbm.data3 = srcv; + srcbm.pitch3 = srcpitch/2; + + VDPixmap dstpxm = { + (char *)dst + dstpitch * (h - 1), + NULL, + w, + h, + -dstpitch + }; + + switch(dbpp) { + case 16: dstpxm.format = nsVDPixmap::kPixFormat_XRGB1555; break; + case 24: dstpxm.format = nsVDPixmap::kPixFormat_RGB888; break; + case 32: dstpxm.format = nsVDPixmap::kPixFormat_XRGB8888; break; + default: + VDASSERT(false); + } + + // TODO: check correct conversion work (555->565) when dpp == 16 + + return VDPixmapBlt(dstpxm, srcbm); } -static void asm_blend_row_clipped_c(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch) +bool BitBltFromI420ToYUY2(int w, int h, BYTE* dst, int dstpitch, BYTE* srcy, BYTE* srcu, BYTE* srcv, int srcpitch) { - BYTE* src2 = src + srcpitch; - do + if(srcpitch == 0) srcpitch = w; + +#ifndef _WIN64 + if((g_cpuid.m_flags & CCpuID::sse2) + && !((DWORD_PTR)srcy&15) && !((DWORD_PTR)srcu&15) && !((DWORD_PTR)srcv&15) && !(srcpitch&31) + && !((DWORD_PTR)dst&15) && !(dstpitch&15)) { - *dst++ = (*src++ + *src2++ + 1) >> 1; - } while(w--); + if(w<=0 || h<=0 || (w&1) || (h&1)) + return(false); + + yv12_yuy2_sse2(srcy, srcu, srcv, srcpitch/2, w/2, h, dst, dstpitch); + return(true); + } +#endif + + VDPixmap srcbm = {0}; + + srcbm.data = srcy; + srcbm.pitch = srcpitch; + srcbm.w = w; + srcbm.h = h; + srcbm.format = nsVDPixmap::kPixFormat_YUV420_Planar; + srcbm.data2 = srcu; + srcbm.pitch2 = srcpitch/2; + srcbm.data3 = srcv; + srcbm.pitch3 = srcpitch/2; + + VDPixmap dstpxm = { + dst, + NULL, + w, + h, + dstpitch + }; + + dstpxm.format = nsVDPixmap::kPixFormat_YUV422_YUYV; + + return VDPixmapBlt(dstpxm, srcbm); } -static void asm_blend_row_c(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch) +bool BitBltFromRGBToRGB(int w, int h, BYTE* dst, int dstpitch, int dbpp, BYTE* src, int srcpitch, int sbpp) { - BYTE* src2 = src + srcpitch; - BYTE* src3 = src2 + srcpitch; - do - { - *dst++ = (*src++ + (*src2++ << 1) + *src3++ + 2) >> 2; - } while(w--); + VDPixmap srcbm = { + (char *)src + srcpitch * (h - 1), + NULL, + w, + h, + -srcpitch + }; + + switch(dbpp) { + case 8: srcbm.format = nsVDPixmap::kPixFormat_Pal8; break; + case 16: srcbm.format = nsVDPixmap::kPixFormat_XRGB1555; break; + case 24: srcbm.format = nsVDPixmap::kPixFormat_RGB888; break; + case 32: srcbm.format = nsVDPixmap::kPixFormat_XRGB8888; break; + default: + VDASSERT(false); + } + + VDPixmap dstpxm = { + (char *)dst + dstpitch * (h - 1), + NULL, + w, + h, + -dstpitch + }; + + switch(dbpp) { + case 8: dstpxm.format = nsVDPixmap::kPixFormat_Pal8; break; + case 16: dstpxm.format = nsVDPixmap::kPixFormat_XRGB1555; break; + case 24: dstpxm.format = nsVDPixmap::kPixFormat_RGB888; break; + case 32: dstpxm.format = nsVDPixmap::kPixFormat_XRGB8888; break; + default: + VDASSERT(false); + } + + return VDPixmapBlt(dstpxm, srcbm); } -bool BitBltFromI420ToI420(int w, int h, BYTE* dsty, BYTE* dstu, BYTE* dstv, int dstpitch, BYTE* srcy, BYTE* srcu, BYTE* srcv, int srcpitch) +bool BitBltFromYUY2ToRGB(int w, int h, BYTE* dst, int dstpitch, int dbpp, BYTE* src, int srcpitch) { - if((w&1)) - return(false); - - if(w > 0 && w == srcpitch && w == dstpitch) - { - memcpy_accel(dsty, srcy, h*srcpitch); - memcpy_accel(dstu, srcu, h/2*srcpitch/2); - memcpy_accel(dstv, srcv, h/2*srcpitch/2); - } - else - { - int pitch = min(abs(srcpitch), abs(dstpitch)); - - for(ptrdiff_t y = 0; y < h; y++, srcy += srcpitch, dsty += dstpitch) - memcpy_accel(dsty, srcy, pitch); + if(srcpitch == 0) srcpitch = w; - srcpitch >>= 1; - dstpitch >>= 1; + VDPixmap srcbm = {0}; - pitch = min(abs(srcpitch), abs(dstpitch)); + srcbm.data = src; + srcbm.pitch = srcpitch; + srcbm.w = w; + srcbm.h = h; + srcbm.format = nsVDPixmap::kPixFormat_YUV422_YUYV; - for(ptrdiff_t y = 0; y < h; y+=2, srcu += srcpitch, dstu += dstpitch) - memcpy_accel(dstu, srcu, pitch); + VDPixmap dstpxm = { + (char *)dst + dstpitch * (h - 1), + NULL, + w, + h, + -dstpitch + }; - for(ptrdiff_t y = 0; y < h; y+=2, srcv += srcpitch, dstv += dstpitch) - memcpy_accel(dstv, srcv, pitch); + switch(dbpp) { + case 16: dstpxm.format = nsVDPixmap::kPixFormat_XRGB1555; break; + case 24: dstpxm.format = nsVDPixmap::kPixFormat_RGB888; break; + case 32: dstpxm.format = nsVDPixmap::kPixFormat_XRGB8888; break; + default: + VDASSERT(false); } - return(true); + return VDPixmapBlt(dstpxm, srcbm); } -bool BitBltFromYUY2ToYUY2(int w, int h, BYTE* dst, int dstpitch, BYTE* src, int srcpitch) +static void yuvtoyuy2row_c(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width) { - if(w > 0 && w == srcpitch && w == dstpitch) - { - memcpy_accel(dst, src, h*srcpitch); - } - else + WORD* dstw = (WORD*)dst; + for(; width > 1; width -= 2) { - int pitch = min(abs(srcpitch), abs(dstpitch)); - - for(ptrdiff_t y = 0; y < h; y++, src += srcpitch, dst += dstpitch) - memcpy_accel(dst, src, pitch); + *dstw++ = (*srcu++<<8)|*srcy++; + *dstw++ = (*srcv++<<8)|*srcy++; } - - return(true); } -#ifndef _WIN64 -extern "C" void asm_YUVtoRGB32_row(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width); -extern "C" void asm_YUVtoRGB24_row(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width); -extern "C" void asm_YUVtoRGB16_row(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width); -extern "C" void asm_YUVtoRGB32_row_MMX(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width); -extern "C" void asm_YUVtoRGB24_row_MMX(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width); -extern "C" void asm_YUVtoRGB16_row_MMX(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width); -extern "C" void asm_YUVtoRGB32_row_ISSE(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width); -extern "C" void asm_YUVtoRGB24_row_ISSE(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width); -extern "C" void asm_YUVtoRGB16_row_ISSE(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width); -#endif - -bool BitBltFromI420ToRGB(int w, int h, BYTE* dst, int dstpitch, int dbpp, BYTE* srcy, BYTE* srcu, BYTE* srcv, int srcpitch) +static void yuvtoyuy2row_avg_c(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width, DWORD pitchuv) { - if(w<=0 || h<=0 || (w&1) || (h&1)) - return(false); - -#ifndef _WIN64 - void (*asm_YUVtoRGB_row)(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width) = NULL;; - - if((g_cpuid.m_flags & CCpuID::ssefpu) && !(w&7)) - { - switch(dbpp) - { - case 16: asm_YUVtoRGB_row = asm_YUVtoRGB16_row/*_ISSE*/; break; // TODO: fix _ISSE (555->565) - case 24: asm_YUVtoRGB_row = asm_YUVtoRGB24_row_ISSE; break; - case 32: asm_YUVtoRGB_row = asm_YUVtoRGB32_row_ISSE; break; - } - } - else if((g_cpuid.m_flags & CCpuID::mmx) && !(w&7)) - { - switch(dbpp) - { - case 16: asm_YUVtoRGB_row = asm_YUVtoRGB16_row/*_MMX*/; break; // TODO: fix _MMX (555->565) - case 24: asm_YUVtoRGB_row = asm_YUVtoRGB24_row_MMX; break; - case 32: asm_YUVtoRGB_row = asm_YUVtoRGB32_row_MMX; break; - } - } - else - { - switch(dbpp) - { - case 16: asm_YUVtoRGB_row = asm_YUVtoRGB16_row; break; - case 24: asm_YUVtoRGB_row = asm_YUVtoRGB24_row; break; - case 32: asm_YUVtoRGB_row = asm_YUVtoRGB32_row; break; - } - } - - if(!asm_YUVtoRGB_row) - return(false); - - do + WORD* dstw = (WORD*)dst; + for(; width > 1; width -= 2, srcu++, srcv++) { - asm_YUVtoRGB_row(dst + dstpitch, dst, srcy + srcpitch, srcy, srcu, srcv, w/2); - - dst += 2*dstpitch; - srcy += srcpitch*2; - srcu += srcpitch/2; - srcv += srcpitch/2; + *dstw++ = (((srcu[0]+srcu[pitchuv])>>1)<<8)|*srcy++; + *dstw++ = (((srcv[0]+srcv[pitchuv])>>1)<<8)|*srcy++; } - while(h -= 2); - - if(g_cpuid.m_flags & CCpuID::mmx) - __asm emms - - if(g_cpuid.m_flags & CCpuID::ssefpu) - __asm sfence - - return(true); -#else - ASSERT(FALSE); - return(false); -#endif } -bool BitBltFromI420ToYUY2(int w, int h, BYTE* dst, int dstpitch, BYTE* srcy, BYTE* srcu, BYTE* srcv, int srcpitch, bool fInterlaced) +bool BitBltFromI420ToYUY2Interlaced(int w, int h, BYTE* dst, int dstpitch, BYTE* srcy, BYTE* srcu, BYTE* srcv, int srcpitch) { if(w<=0 || h<=0 || (w&1) || (h&1)) return(false); @@ -332,16 +304,15 @@ bool BitBltFromI420ToYUY2(int w, int h, BYTE* dst, int dstpitch, BYTE* srcy, BYT #ifndef _WIN64 if((g_cpuid.m_flags & CCpuID::sse2) - && !((DWORD_PTR)srcy&15) && !((DWORD_PTR)srcu&15) && !((DWORD_PTR)srcv&15) && !(srcpitch&31) - && !((DWORD_PTR)dst&15) && !(dstpitch&15)) + && !((DWORD_PTR)srcy&15) && !((DWORD_PTR)srcu&15) && !((DWORD_PTR)srcv&15) && !(srcpitch&31) + && !((DWORD_PTR)dst&15) && !(dstpitch&15)) { - if(!fInterlaced) yv12_yuy2_sse2(srcy, srcu, srcv, srcpitch/2, w/2, h, dst, dstpitch); - else yv12_yuy2_sse2_interlaced(srcy, srcu, srcv, srcpitch/2, w/2, h, dst, dstpitch); + yv12_yuy2_sse2_interlaced(srcy, srcu, srcv, srcpitch/2, w/2, h, dst, dstpitch); return(true); } else { - ASSERT(!fInterlaced); + ASSERT(FALSE); } if((g_cpuid.m_flags & CCpuID::mmx) && !(w&7)) @@ -359,15 +330,16 @@ bool BitBltFromI420ToYUY2(int w, int h, BYTE* dst, int dstpitch, BYTE* srcy, BYT if(!yuvtoyuy2row) return(false); + int halfsrcpitch = srcpitch/2; do { yuvtoyuy2row(dst, srcy, srcu, srcv, w); - yuvtoyuy2row_avg(dst + dstpitch, srcy + srcpitch, srcu, srcv, w, srcpitch/2); + yuvtoyuy2row_avg(dst + dstpitch, srcy + srcpitch, srcu, srcv, w, halfsrcpitch); dst += 2*dstpitch; - srcy += srcpitch*2; - srcu += srcpitch/2; - srcv += srcpitch/2; + srcy += halfsrcpitch; + srcu += halfsrcpitch; + srcv += halfsrcpitch; } while((h -= 2) > 2); @@ -381,481 +353,3 @@ bool BitBltFromI420ToYUY2(int w, int h, BYTE* dst, int dstpitch, BYTE* srcy, BYT return(true); } - -bool BitBltFromRGBToRGB(int w, int h, BYTE* dst, int dstpitch, int dbpp, BYTE* src, int srcpitch, int sbpp) -{ - if(dbpp == sbpp) - { - int rowbytes = w*dbpp>>3; - - if(rowbytes > 0 && rowbytes == srcpitch && rowbytes == dstpitch) - { - memcpy_accel(dst, src, h*rowbytes); - } - else - { - for(ptrdiff_t y = 0; y < h; y++, src += srcpitch, dst += dstpitch) - memcpy_accel(dst, src, rowbytes); - } - - return(true); - } - - if(sbpp != 16 && sbpp != 24 && sbpp != 32 - || dbpp != 16 && dbpp != 24 && dbpp != 32) - return(false); - - if(dbpp == 16) - { - for(ptrdiff_t y = 0; y < h; y++, src += srcpitch, dst += dstpitch) - { - if(sbpp == 24) - { - BYTE* s = (BYTE*)src; - WORD* d = (WORD*)dst; - for(ptrdiff_t x = 0; x < w; x++, s+=3, d++) - *d = (WORD)(((*((DWORD*)s)>>8)&0xf800)|((*((DWORD*)s)>>5)&0x07e0)|((*((DWORD*)s)>>3)&0x1f)); - } - else if(sbpp == 32) - { - DWORD* s = (DWORD*)src; - WORD* d = (WORD*)dst; - for(ptrdiff_t x = 0; x < w; x++, s++, d++) - *d = (WORD)(((*s>>8)&0xf800)|((*s>>5)&0x07e0)|((*s>>3)&0x1f)); - } - } - } - else if(dbpp == 24) - { - for(ptrdiff_t y = 0; y < h; y++, src += srcpitch, dst += dstpitch) - { - if(sbpp == 16) - { - WORD* s = (WORD*)src; - BYTE* d = (BYTE*)dst; - for(ptrdiff_t x = 0; x < w; x++, s++, d+=3) - { // not tested, r-g-b might be in reverse - d[0] = (*s&0x001f)<<3; - d[1] = (*s&0x07e0)<<5; - d[2] = (*s&0xf800)<<8; - } - } - else if(sbpp == 32) - { - BYTE* s = (BYTE*)src; - BYTE* d = (BYTE*)dst; - for(ptrdiff_t x = 0; x < w; x++, s+=4, d+=3) - {d[0] = s[0]; d[1] = s[1]; d[2] = s[2];} - } - } - } - else if(dbpp == 32) - { - for(ptrdiff_t y = 0; y < h; y++, src += srcpitch, dst += dstpitch) - { - if(sbpp == 16) - { - WORD* s = (WORD*)src; - DWORD* d = (DWORD*)dst; - for(ptrdiff_t x = 0; x < w; x++, s++, d++) - *d = ((*s&0xf800)<<8)|((*s&0x07e0)<<5)|((*s&0x001f)<<3); - } - else if(sbpp == 24) - { - BYTE* s = (BYTE*)src; - DWORD* d = (DWORD*)dst; - for(ptrdiff_t x = 0; x < w; x++, s+=3, d++) - *d = *((DWORD*)s)&0xffffff; - } - } - } - - return(true); -} - -void DeinterlaceBlend(BYTE* dst, BYTE* src, DWORD rowbytes, DWORD h, DWORD dstpitch, DWORD srcpitch) -{ - void (*blend_row_clipped)(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch) = NULL; - void (*blend_row)(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch) = NULL; - -#ifndef _WIN64 - if((g_cpuid.m_flags & CCpuID::sse2) && !((DWORD)src&0xf) && !((DWORD)dst&0xf) && !(srcpitch&0xf)) - { - blend_row_clipped = asm_blend_row_clipped_SSE2; - blend_row = asm_blend_row_SSE2; - } - else if(g_cpuid.m_flags & CCpuID::mmx) - { - blend_row_clipped = asm_blend_row_clipped_MMX; - blend_row = asm_blend_row_MMX; - } - else -#endif - { - blend_row_clipped = asm_blend_row_clipped_c; - blend_row = asm_blend_row_c; - } - - if(!blend_row_clipped) - return; - - blend_row_clipped(dst, src, rowbytes, srcpitch); - - if((h -= 2) > 0) do - { - dst += dstpitch; - blend_row(dst, src, rowbytes, srcpitch); - src += srcpitch; - } - while(--h); - - blend_row_clipped(dst + dstpitch, src, rowbytes, srcpitch); - -#ifndef _WIN64 - if(g_cpuid.m_flags & CCpuID::mmx) - __asm emms -#endif -} - -void DeinterlaceBob(BYTE* dst, BYTE* src, DWORD rowbytes, DWORD h, DWORD dstpitch, DWORD srcpitch, bool topfield) -{ - if(topfield) - { - BitBltFromRGBToRGB(rowbytes, h/2, dst, dstpitch*2, 8, src, srcpitch*2, 8); - AvgLines8(dst, h, dstpitch); - } - else - { - BitBltFromRGBToRGB(rowbytes, h/2, dst + dstpitch, dstpitch*2, 8, src + srcpitch, srcpitch*2, 8); - AvgLines8(dst + dstpitch, h-1, dstpitch); - } -} - -void AvgLines8(BYTE* dst, DWORD h, DWORD pitch) -{ - if(h <= 1) - return; - - BYTE* s = dst; - BYTE* d = dst + (h-2)*pitch; - - for(; s < d; s += pitch*2) - { - BYTE* tmp = s; - -#ifndef _WIN64 - if((g_cpuid.m_flags & CCpuID::sse2) && !((DWORD)tmp&0xf) && !((DWORD)pitch&0xf)) - { - __asm - { - mov esi, tmp - mov ebx, pitch - - mov ecx, ebx - shr ecx, 4 - -AvgLines8_sse2_loop: - movdqa xmm0, [esi] - pavgb xmm0, [esi+ebx*2] - movdqa [esi+ebx], xmm0 - add esi, 16 - - dec ecx - jnz AvgLines8_sse2_loop - - mov tmp, esi - } - - for(ptrdiff_t i = pitch&7; i--; tmp++) - { - tmp[pitch] = (tmp[0] + tmp[pitch<<1] + 1) >> 1; - } - } - else if(g_cpuid.m_flags & CCpuID::mmx) - { - __asm - { - mov esi, tmp - mov ebx, pitch - - mov ecx, ebx - shr ecx, 3 - - pxor mm7, mm7 -AvgLines8_mmx_loop: - movq mm0, [esi] - movq mm1, mm0 - - punpcklbw mm0, mm7 - punpckhbw mm1, mm7 - - movq mm2, [esi+ebx*2] - movq mm3, mm2 - - punpcklbw mm2, mm7 - punpckhbw mm3, mm7 - - paddw mm0, mm2 - psrlw mm0, 1 - - paddw mm1, mm3 - psrlw mm1, 1 - - packuswb mm0, mm1 - - movq [esi+ebx], mm0 - - lea esi, [esi+8] - - dec ecx - jnz AvgLines8_mmx_loop - - mov tmp, esi - } - - for(ptrdiff_t i = pitch&7; i--; tmp++) - { - tmp[pitch] = (tmp[0] + tmp[pitch<<1] + 1) >> 1; - } - } - else -#endif - { - for(ptrdiff_t i = pitch; i--; tmp++) - { - tmp[pitch] = (tmp[0] + tmp[pitch<<1] + 1) >> 1; - } - } - } - - if(!(h&1) && h >= 2) - { - dst += (h-2)*pitch; - memcpy_accel(dst + pitch, dst, pitch); - } - -#ifndef _WIN64 - __asm emms; -#endif -} - -void AvgLines555(BYTE* dst, DWORD h, DWORD pitch) -{ - if(h <= 1) - return; - - unsigned __int64 __0x03e003e003e003e0 = 0x03e003e003e003e0; - unsigned __int64 __0x001f001f001f001f = 0x001f001f001f001f; - - BYTE* s = dst; - BYTE* d = dst + (h-2)*pitch; - - for(; s < d; s += pitch*2) - { - BYTE* tmp = s; - -#ifndef _WIN64 - __asm - { - mov esi, tmp - mov ebx, pitch - - mov ecx, ebx - shr ecx, 3 - - movq mm6, __0x03e003e003e003e0 - movq mm7, __0x001f001f001f001f - -AvgLines555_loop: - movq mm0, [esi] - movq mm1, mm0 - movq mm2, mm0 - - psrlw mm0, 10 // red1 bits: mm0 = 001f001f001f001f - pand mm1, mm6 // green1 bits: mm1 = 03e003e003e003e0 - pand mm2, mm7 // blue1 bits: mm2 = 001f001f001f001f - - movq mm3, [esi+ebx*2] - movq mm4, mm3 - movq mm5, mm3 - - psrlw mm3, 10 // red2 bits: mm3 = 001f001f001f001f - pand mm4, mm6 // green2 bits: mm4 = 03e003e003e003e0 - pand mm5, mm7 // blue2 bits: mm5 = 001f001f001f001f - - paddw mm0, mm3 - psrlw mm0, 1 // (red1+red2)/2 - psllw mm0, 10 // red bits at 7c007c007c007c00 - - paddw mm1, mm4 - psrlw mm1, 1 // (green1+green2)/2 - pand mm1, mm6 // green bits at 03e003e003e003e0 - - paddw mm2, mm5 - psrlw mm2, 1 // (blue1+blue2)/2 - // blue bits at 001f001f001f001f (no need to pand, lower bits were discareded) - - por mm0, mm1 - por mm0, mm2 - - movq [esi+ebx], mm0 - - lea esi, [esi+8] - - dec ecx - jnz AvgLines555_loop - - mov tmp, esi - } -#endif - - for(ptrdiff_t i = (pitch&7)>>1; i--; tmp++) - { - tmp[pitch] = - ((((*tmp&0x7c00) + (tmp[pitch<<1]&0x7c00)) >> 1)&0x7c00)| - ((((*tmp&0x03e0) + (tmp[pitch<<1]&0x03e0)) >> 1)&0x03e0)| - ((((*tmp&0x001f) + (tmp[pitch<<1]&0x001f)) >> 1)&0x001f); - } - } - - if(!(h&1) && h >= 2) - { - dst += (h-2)*pitch; - memcpy_accel(dst + pitch, dst, pitch); - } - -#ifndef _WIN64 - __asm emms; -#endif -} - -void AvgLines565(BYTE* dst, DWORD h, DWORD pitch) -{ - if(h <= 1) - return; - - unsigned __int64 __0x07e007e007e007e0 = 0x07e007e007e007e0; - unsigned __int64 __0x001f001f001f001f = 0x001f001f001f001f; - - BYTE* s = dst; - BYTE* d = dst + (h-2)*pitch; - - for(; s < d; s += pitch*2) - { - WORD* tmp = (WORD*)s; - -#ifndef _WIN64 - __asm - { - mov esi, tmp - mov ebx, pitch - - mov ecx, ebx - shr ecx, 3 - - movq mm6, __0x07e007e007e007e0 - movq mm7, __0x001f001f001f001f - -AvgLines565_loop: - movq mm0, [esi] - movq mm1, mm0 - movq mm2, mm0 - - psrlw mm0, 11 // red1 bits: mm0 = 001f001f001f001f - pand mm1, mm6 // green1 bits: mm1 = 07e007e007e007e0 - pand mm2, mm7 // blue1 bits: mm2 = 001f001f001f001f - - movq mm3, [esi+ebx*2] - movq mm4, mm3 - movq mm5, mm3 - - psrlw mm3, 11 // red2 bits: mm3 = 001f001f001f001f - pand mm4, mm6 // green2 bits: mm4 = 07e007e007e007e0 - pand mm5, mm7 // blue2 bits: mm5 = 001f001f001f001f - - paddw mm0, mm3 - psrlw mm0, 1 // (red1+red2)/2 - psllw mm0, 11 // red bits at f800f800f800f800 - - paddw mm1, mm4 - psrlw mm1, 1 // (green1+green2)/2 - pand mm1, mm6 // green bits at 03e003e003e003e0 - - paddw mm2, mm5 - psrlw mm2, 1 // (blue1+blue2)/2 - // blue bits at 001f001f001f001f (no need to pand, lower bits were discareded) - - por mm0, mm1 - por mm0, mm2 - - movq [esi+ebx], mm0 - - lea esi, [esi+8] - - dec ecx - jnz AvgLines565_loop - - mov tmp, esi - } -#else - for(ptrdiff_t wd=(pitch>>3);wd--;tmp++) - { - tmp[0] = - ((((*tmp&0xf800) + (tmp[pitch<<1]&0xf800)) >> 1)&0xf800)| - ((((*tmp&0x07e0) + (tmp[pitch<<1]&0x07e0)) >> 1)&0x07e0)| - ((((*tmp&0x001f) + (tmp[pitch<<1]&0x001f)) >> 1)&0x001f); - } -#endif - - for(ptrdiff_t i = (pitch&7)>>1; i--; tmp++) - { - tmp[pitch] = - ((((*tmp&0xf800) + (tmp[pitch<<1]&0xf800)) >> 1)&0xf800)| - ((((*tmp&0x07e0) + (tmp[pitch<<1]&0x07e0)) >> 1)&0x07e0)| - ((((*tmp&0x001f) + (tmp[pitch<<1]&0x001f)) >> 1)&0x001f); - } - } - - if(!(h&1) && h >= 2) - { - dst += (h-2)*pitch; - memcpy_accel(dst + pitch, dst, pitch); - } - -#ifndef _WIN64 - __asm emms; -#endif -} - -#ifndef _WIN64 -extern "C" void mmx_YUY2toRGB24(const BYTE* src, BYTE* dst, const BYTE* src_end, int src_pitch, int row_size, bool rec709); -extern "C" void mmx_YUY2toRGB32(const BYTE* src, BYTE* dst, const BYTE* src_end, int src_pitch, int row_size, bool rec709); -#endif - -bool BitBltFromYUY2ToRGB(int w, int h, BYTE* dst, int dstpitch, int dbpp, BYTE* src, int srcpitch) -{ - void (* YUY2toRGB)(const BYTE* src, BYTE* dst, const BYTE* src_end, int src_pitch, int row_size, bool rec709) = NULL; - -#ifndef _WIN64 - if(g_cpuid.m_flags & CCpuID::mmx) - { - YUY2toRGB = - dbpp == 32 ? mmx_YUY2toRGB32 : - dbpp == 24 ? mmx_YUY2toRGB24 : - // dbpp == 16 ? mmx_YUY2toRGB16 : // TODO - NULL; - } - else -#endif - { - ASSERT(FALSE); - // TODO - } - - if(!YUY2toRGB) - return(false); - - YUY2toRGB(src, dst, src + h*srcpitch, srcpitch, w, false); - - return(true); -} diff --git a/src/DSUtil/vd.h b/src/DSUtil/vd.h index a69e406c0..0db586cec 100644 --- a/src/DSUtil/vd.h +++ b/src/DSUtil/vd.h @@ -1,5 +1,6 @@ // VirtualDub - Video processing and capture application -// Copyright (C) 1998-2001 Avery Lee +// Graphics support library +// Copyright (C) 1998-2007 Avery Lee // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by @@ -16,25 +17,22 @@ // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. // // Notes: -// - BitBltFromI420ToRGB is from VirtualDub -// - BitBltFromYUY2ToRGB is from AviSynth 2.52 +// - VDPixmapBlt is from VirtualDub +// - sse2 yv12 to yuy2 conversion by Haali // (- vd.cpp/h should be renamed to something more sensible already :) #pragma once -class CCpuID {public: CCpuID(); enum flag_t {mmx=1, ssemmx=2, ssefpu=4, sse2=8, _3dnow=16, sse3=32} m_flags;}; +class CCpuID {public: CCpuID(); enum flag_t {mmx=1, ssemmx=2, ssefpu=4, sse2=8, _3dnow=16} m_flags;}; extern CCpuID g_cpuid; extern bool BitBltFromI420ToI420(int w, int h, BYTE* dsty, BYTE* dstu, BYTE* dstv, int dstpitch, BYTE* srcy, BYTE* srcu, BYTE* srcv, int srcpitch); -extern bool BitBltFromI420ToYUY2(int w, int h, BYTE* dst, int dstpitch, BYTE* srcy, BYTE* srcu, BYTE* srcv, int srcpitch, bool fInterlaced = false); +extern bool BitBltFromI420ToYUY2(int w, int h, BYTE* dst, int dstpitch, BYTE* srcy, BYTE* srcu, BYTE* srcv, int srcpitch); +extern bool BitBltFromI420ToYUY2Interlaced(int w, int h, BYTE* dst, int dstpitch, BYTE* srcy, BYTE* srcu, BYTE* srcv, int srcpitch); extern bool BitBltFromI420ToRGB(int w, int h, BYTE* dst, int dstpitch, int dbpp, BYTE* srcy, BYTE* srcu, BYTE* srcv, int srcpitch /* TODO: , bool fInterlaced = false */); extern bool BitBltFromYUY2ToYUY2(int w, int h, BYTE* dst, int dstpitch, BYTE* src, int srcpitch); extern bool BitBltFromYUY2ToRGB(int w, int h, BYTE* dst, int dstpitch, int dbpp, BYTE* src, int srcpitch); extern bool BitBltFromRGBToRGB(int w, int h, BYTE* dst, int dstpitch, int dbpp, BYTE* src, int srcpitch, int sbpp); extern void DeinterlaceBlend(BYTE* dst, BYTE* src, DWORD rowbytes, DWORD h, DWORD dstpitch, DWORD srcpitch); -extern void DeinterlaceBob(BYTE* dst, BYTE* src, DWORD rowbytes, DWORD h, DWORD dstpitch, DWORD srcpitch, bool topfield); - -extern void AvgLines8(BYTE* dst, DWORD h, DWORD pitch); -extern void AvgLines555(BYTE* dst, DWORD h, DWORD pitch); -extern void AvgLines565(BYTE* dst, DWORD h, DWORD pitch); \ No newline at end of file +extern void DeinterlaceBob(BYTE* dst, BYTE* src, DWORD rowbytes, DWORD h, DWORD dstpitch, DWORD srcpitch, bool topfield); \ No newline at end of file diff --git a/src/DSUtil/vd_asm.cpp b/src/DSUtil/vd_asm.cpp index 851449089..3fc521844 100644 --- a/src/DSUtil/vd_asm.cpp +++ b/src/DSUtil/vd_asm.cpp @@ -1,5 +1,6 @@ // VirtualDub - Video processing and capture application -// Copyright (C) 1998-2001 Avery Lee +// Graphics support library +// Copyright (C) 1998-2007 Avery Lee // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by @@ -16,7 +17,7 @@ // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. // // Notes: -// - BitBltFromI420ToRGB is from VirtualDub +// - VDPixmapBlt is from VirtualDub // - sse2 yv12 to yuy2 conversion by Haali // (- vd.cpp/h should be renamed to something more sensible already :) @@ -428,289 +429,4 @@ last4: ret }; } - -void __declspec(naked) asm_blend_row_clipped_MMX(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch) -{ - static const __int64 _x0001000100010001 = 0x0001000100010001; - - __asm { - push ebp - push edi - push esi - push ebx - - mov edi,[esp+20] - mov esi,[esp+24] - sub edi,esi - mov ebp,[esp+28] - mov edx,[esp+32] - - shr ebp, 3 - - movq mm6, _x0001000100010001 - pxor mm7, mm7 - -xloop: - movq mm0, [esi] - movq mm3, mm0 - punpcklbw mm0, mm7 - punpckhbw mm3, mm7 - - movq mm1, [esi+edx] - movq mm4, mm1 - punpcklbw mm1, mm7 - punpckhbw mm4, mm7 - - paddw mm1, mm0 - paddw mm1, mm6 - psrlw mm1, 1 - - paddw mm4, mm3 - paddw mm4, mm6 - psrlw mm4, 1 - - add esi, 8 - packuswb mm1, mm4 - movq [edi+esi-8], mm1 - - dec ebp - jne xloop - - pop ebx - pop esi - pop edi - pop ebp - ret - }; -} - -void __declspec(naked) asm_blend_row_MMX(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch) -{ - static const __int64 mask0 = 0xfcfcfcfcfcfcfcfci64; - static const __int64 mask1 = 0x7f7f7f7f7f7f7f7fi64; - static const __int64 mask2 = 0x3f3f3f3f3f3f3f3fi64; - static const __int64 _x0002000200020002 = 0x0002000200020002; - - __asm { - push ebp - push edi - push esi - push ebx - - mov edi, [esp+20] - mov esi, [esp+24] - sub edi, esi - mov ebp, [esp+28] - mov edx, [esp+32] - - shr ebp, 3 - - movq mm6, _x0002000200020002 - pxor mm7, mm7 - -xloop: - movq mm0, [esi] - movq mm3, mm0 - punpcklbw mm0, mm7 - punpckhbw mm3, mm7 - - movq mm1, [esi+edx] - movq mm4, mm1 - punpcklbw mm1, mm7 - punpckhbw mm4, mm7 - - movq mm2, [esi+edx*2] - movq mm5, mm2 - punpcklbw mm2, mm7 - punpckhbw mm5, mm7 - - psllw mm1, 1 - paddw mm1, mm0 - paddw mm1, mm2 - paddw mm1, mm6 - psrlw mm1, 2 - - psllw mm4, 1 - paddw mm4, mm3 - paddw mm4, mm5 - paddw mm4, mm6 - psrlw mm4, 2 - - add esi, 8 - packuswb mm1, mm4 - movq [edi+esi-8], mm1 - - dec ebp - jne xloop - - // sadly the original code makes a lot of visible banding artifacts on yuv - // (it seems those shiftings without rounding introduce too much error) -/* - mov edi,[esp+20] - mov esi,[esp+24] - sub edi,esi - mov ebp,[esp+28] - mov edx,[esp+32] - - movq mm5,mask0 - movq mm6,mask1 - movq mm7,mask2 - shr ebp,1 - jz oddpart - -xloop: - movq mm2,[esi] - movq mm0,mm5 - - movq mm1,[esi+edx] - pand mm0,mm2 - - psrlq mm1,1 - movq mm2,[esi+edx*2] - - psrlq mm2,2 - pand mm1,mm6 - - psrlq mm0,2 - pand mm2,mm7 - - paddb mm0,mm1 - add esi,8 - - paddb mm0,mm2 - dec ebp - - movq [edi+esi-8],mm0 - jne xloop - -oddpart: - test byte ptr [esp+28],1 - jz nooddpart - - mov ecx,[esi] - mov eax,0fcfcfcfch - mov ebx,[esi+edx] - and eax,ecx - shr ebx,1 - mov ecx,[esi+edx*2] - shr ecx,2 - and ebx,07f7f7f7fh - shr eax,2 - and ecx,03f3f3f3fh - add eax,ebx - add eax,ecx - mov [edi+esi],eax - -nooddpart: -*/ - pop ebx - pop esi - pop edi - pop ebp - ret - }; -} - -__declspec(align(16)) static BYTE const_1_16_bytes[] = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}; - -void asm_blend_row_SSE2(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch) -{ - __asm - { - mov edx, srcpitch - mov esi, src - mov edi, dst - sub edi, esi - mov ecx, w - mov ebx, ecx - shr ecx, 4 - and ebx, 15 - - movdqa xmm7, [const_1_16_bytes] - -asm_blend_row_SSE2_loop: - movdqa xmm0, [esi] - movdqa xmm1, [esi+edx] - movdqa xmm2, [esi+edx*2] - pavgb xmm0, xmm1 - pavgb xmm2, xmm1 - psubusb xmm0, xmm7 - pavgb xmm0, xmm2 - movdqa [esi+edi], xmm0 - add esi, 16 - dec ecx - jnz asm_blend_row_SSE2_loop - - test ebx,15 - jz asm_blend_row_SSE2_end - - mov ecx, ebx - xor ax, ax - xor bx, bx - xor dx, dx -asm_blend_row_SSE2_loop2: - mov al, [esi] - mov bl, [esi+edx] - mov dl, [esi+edx*2] - add ax, bx - inc ax - shr ax, 1 - add dx, bx - inc dx - shr dx, 1 - add ax, dx - shr ax, 1 - mov [esi+edi], al - inc esi - dec ecx - jnz asm_blend_row_SSE2_loop2 - -asm_blend_row_SSE2_end: - } -} - -void asm_blend_row_clipped_SSE2(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch) -{ - __asm - { - mov edx, srcpitch - mov esi, src - mov edi, dst - sub edi, esi - mov ecx, w - mov ebx, ecx - shr ecx, 4 - and ebx, 15 - - movdqa xmm7, [const_1_16_bytes] - -asm_blend_row_clipped_SSE2_loop: - movdqa xmm0, [esi] - movdqa xmm1, [esi+edx] - pavgb xmm0, xmm1 - movdqa [esi+edi], xmm0 - add esi, 16 - dec ecx - jnz asm_blend_row_clipped_SSE2_loop - - test ebx,15 - jz asm_blend_row_clipped_SSE2_end - - mov ecx, ebx - xor ax, ax - xor bx, bx -asm_blend_row_clipped_SSE2_loop2: - mov al, [esi] - mov bl, [esi+edx] - add ax, bx - inc ax - shr ax, 1 - mov [esi+edi], al - inc esi - dec ecx - jnz asm_blend_row_clipped_SSE2_loop2 - -asm_blend_row_clipped_SSE2_end: - } -} #endif diff --git a/src/DSUtil/vd_asm.h b/src/DSUtil/vd_asm.h index c1c78f39b..7c1f2f134 100644 --- a/src/DSUtil/vd_asm.h +++ b/src/DSUtil/vd_asm.h @@ -1,5 +1,6 @@ // VirtualDub - Video processing and capture application -// Copyright (C) 1998-2001 Avery Lee +// Graphics support library +// Copyright (C) 1998-2007 Avery Lee // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by @@ -16,8 +17,7 @@ // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. // // Notes: -// - BitBltFromI420ToRGB is from VirtualDub -// - BitBltFromYUY2ToRGB is from AviSynth 2.52 +// - VDPixmapBlt is from VirtualDub // (- vd.cpp/h should be renamed to something more sensible already :) #pragma once @@ -31,9 +31,4 @@ void yv12_yuy2_row_sse2_linear(); void yv12_yuy2_row_sse2_linear_interlaced(); void yv12_yuy2_sse2(const BYTE *Y, const BYTE *U, const BYTE *V, int halfstride, unsigned halfwidth, unsigned height, BYTE *YUY2, int d_stride); void yv12_yuy2_sse2_interlaced(const BYTE *Y, const BYTE *U, const BYTE *V, int halfstride, unsigned halfwidth, unsigned height, BYTE *YUY2, int d_stride); - -void asm_blend_row_clipped_MMX(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch); -void asm_blend_row_MMX(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch); -void asm_blend_row_SSE2(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch); -void asm_blend_row_clipped_SSE2(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch); #endif -- cgit v1.2.3