Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mpc-hc/mpc-hc.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorkinddragon <kinddragon@users.sourceforge.net>2010-05-21 07:23:13 +0400
committerkinddragon <kinddragon@users.sourceforge.net>2010-05-21 07:23:13 +0400
commit2f3e47055017d851b693a9fbcfcca25d03648c81 (patch)
tree8a826c3f947c0132f948337283d19ab3f02fd0fa /src/DSUtil
parent37f62abd654047d060c86d6c76cd2f6862f89b94 (diff)
Added new ELA deinterlacing to MPEG2 codec
Fixed the corrupted first frame when deinterlacing with the MPEG2 codec. Fixed the MPEG2 settings check box. git-svn-id: https://mpc-hc.svn.sourceforge.net/svnroot/mpc-hc/trunk@1908 10f7b99b-c216-0410-bff0-8a66a9350fd8
Diffstat (limited to 'src/DSUtil')
-rw-r--r--src/DSUtil/deinterlace.cpp1476
-rw-r--r--src/DSUtil/vd.cpp4
-rw-r--r--src/DSUtil/vd.h4
3 files changed, 1462 insertions, 22 deletions
diff --git a/src/DSUtil/deinterlace.cpp b/src/DSUtil/deinterlace.cpp
index a66915dfd..1a7b47da0 100644
--- a/src/DSUtil/deinterlace.cpp
+++ b/src/DSUtil/deinterlace.cpp
@@ -1,5 +1,5 @@
// VirtualDub - Video processing and capture application
-// Copyright (C) 1998-2001 Avery Lee
+// Copyright (C) 1998-2007 Avery Lee
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
@@ -17,21 +17,9 @@
#include "stdafx.h"
#include <emmintrin.h>
+#include <vd2/system/memory.h>
#include <vd2/system/cpuaccel.h>
-
-#define uint8 unsigned char
-#define uint32 unsigned int
-#define uint64 unsigned __int64
-
-#ifdef _M_IX86
-#define VD_CPU_X86
-#endif
-
-#ifdef _M_X64
-#define VD_CPU_AMD64
-#endif
-
-///////////////////////////////////////////////////////////////////////////
+#include <vd2/system/vdstl.h>
#pragma warning(disable: 4799) // warning C4799: function has no EMMS instruction
@@ -271,9 +259,1449 @@ static void asm_blend_row(void *dst0, const void *src0, uint32 w, ptrdiff_t srcp
} while(--w);
}
+
+ void ela_L8_SSE2(__m128i *dst, const __m128i *srcat, const __m128i *srcab, int w16) { // For each of w16 16-byte groups, writes 10 vectors to dst: 5 absolute-difference scores followed by 5 averaged top/bottom candidates.
+ do { // do/while: the body runs once before w16 is tested
+ __m128i top0 = srcat[0];
+ __m128i top1 = srcat[1];
+ __m128i top2 = srcat[2];
+ __m128i bot0 = srcab[0];
+ __m128i bot1 = srcab[1];
+ __m128i bot2 = srcab[2];
+ ++srcat;
+ ++srcab;
+ // Byte-shifted views across adjacent vectors: l2/l1/c0 start at bytes 13/14/15 of top0, r1 is top1 itself, r2/r3 start at bytes 1/2 of top1.
+ __m128i topl2 = _mm_or_si128(_mm_srli_si128(top0, 16 - 3), _mm_slli_si128(top1, 3));
+ __m128i topl1 = _mm_or_si128(_mm_srli_si128(top0, 16 - 2), _mm_slli_si128(top1, 2));
+ __m128i topc0 = _mm_or_si128(_mm_srli_si128(top0, 16 - 1), _mm_slli_si128(top1, 1));
+ __m128i topr1 = top1;
+ __m128i topr2 = _mm_or_si128(_mm_srli_si128(top1, 1), _mm_slli_si128(top2, 16 - 1));
+ __m128i topr3 = _mm_or_si128(_mm_srli_si128(top1, 2), _mm_slli_si128(top2, 16 - 2));
+ // Same six views for the bottom row.
+ __m128i botl2 = _mm_or_si128(_mm_srli_si128(bot0, 16 - 3), _mm_slli_si128(bot1, 3));
+ __m128i botl1 = _mm_or_si128(_mm_srli_si128(bot0, 16 - 2), _mm_slli_si128(bot1, 2));
+ __m128i botc0 = _mm_or_si128(_mm_srli_si128(bot0, 16 - 1), _mm_slli_si128(bot1, 1));
+ __m128i botr1 = bot1;
+ __m128i botr2 = _mm_or_si128(_mm_srli_si128(bot1, 1), _mm_slli_si128(bot2, 16 - 1));
+ __m128i botr3 = _mm_or_si128(_mm_srli_si128(bot1, 2), _mm_slli_si128(bot2, 16 - 2));
+ // |a - b| per unsigned byte: saturating subtraction in both directions, OR'ed together (one side is always zero).
+ __m128i rawscorec0 = _mm_or_si128(_mm_subs_epu8(topc0, botc0), _mm_subs_epu8(botc0, topc0));
+ __m128i rawscorel1 = _mm_or_si128(_mm_subs_epu8(topl1, botr1), _mm_subs_epu8(botr1, topl1));
+ __m128i rawscorel2 = _mm_or_si128(_mm_subs_epu8(topl2, botr2), _mm_subs_epu8(botr2, topl2));
+ __m128i rawscorer1 = _mm_or_si128(_mm_subs_epu8(topr1, botl1), _mm_subs_epu8(botl1, topr1));
+ __m128i rawscorer2 = _mm_or_si128(_mm_subs_epu8(topr2, botl2), _mm_subs_epu8(botl2, topr2));
+ // Output layout per group: [0..4] scores c0,l1,l2,r1,r2; [5..9] rounded averages of the candidate top/bottom pairs.
+ dst[0] = rawscorec0;
+ dst[1] = rawscorel1;
+ dst[2] = rawscorel2;
+ dst[3] = rawscorer1;
+ dst[4] = rawscorer2;
+ dst[5] = _mm_avg_epu8(topr1, botr1);
+ dst[6] = _mm_avg_epu8(topc0, botr2);
+ dst[7] = _mm_avg_epu8(topl1, botr3);
+ dst[8] = _mm_avg_epu8(topr2, botc0);
+ dst[9] = _mm_avg_epu8(topr3, botl1);
+ dst += 10;
+ } while(--w16);
+ }
+
+ void nela_L8_SSE2(__m128i *dst, const __m128i *elabuf, int w16) { // For each 10-vector group in elabuf, selects per byte one of the candidate vectors [5..9] by comparing smoothed scores and stores one vector to dst.
+ __m128i zero = _mm_setzero_si128();
+ __m128i x80b = _mm_set1_epi8((char)0x80); // sign-bit bias so the signed byte compares below order the unsigned scores correctly
+
+ do {
+ __m128i x0, x1, x2, y;
+ // For each score vector k in 0..4: y is vector k of the following group (index k+10) and supplies the bytes shifted in at the top, so the last group reads past its own 10 vectors.
+ x0 = elabuf[0];
+ y = elabuf[10];
+ x1 = _mm_or_si128(_mm_srli_si128(x0, 1), _mm_slli_si128(y, 15));
+ x2 = _mm_or_si128(_mm_srli_si128(x0, 2), _mm_slli_si128(y, 14));
+ __m128i scorec0 = _mm_avg_epu8(_mm_avg_epu8(x0, x2), x1); // average of the +0 and +2 neighbours, then averaged with the +1 neighbour
+
+ x0 = elabuf[1];
+ y = elabuf[11];
+ x1 = _mm_or_si128(_mm_srli_si128(x0, 1), _mm_slli_si128(y, 15));
+ x2 = _mm_or_si128(_mm_srli_si128(x0, 2), _mm_slli_si128(y, 14));
+ __m128i scorel1 = _mm_avg_epu8(_mm_avg_epu8(x0, x2), x1);
+
+ x0 = elabuf[2];
+ y = elabuf[12];
+ x1 = _mm_or_si128(_mm_srli_si128(x0, 1), _mm_slli_si128(y, 15));
+ x2 = _mm_or_si128(_mm_srli_si128(x0, 2), _mm_slli_si128(y, 14));
+ __m128i scorel2 = _mm_avg_epu8(_mm_avg_epu8(x0, x2), x1);
+
+ x0 = elabuf[3];
+ y = elabuf[13];
+ x1 = _mm_or_si128(_mm_srli_si128(x0, 1), _mm_slli_si128(y, 15));
+ x2 = _mm_or_si128(_mm_srli_si128(x0, 2), _mm_slli_si128(y, 14));
+ __m128i scorer1 = _mm_avg_epu8(_mm_avg_epu8(x0, x2), x1);
+
+ x0 = elabuf[4];
+ y = elabuf[14];
+ x1 = _mm_or_si128(_mm_srli_si128(x0, 1), _mm_slli_si128(y, 15));
+ x2 = _mm_or_si128(_mm_srli_si128(x0, 2), _mm_slli_si128(y, 14));
+ __m128i scorer2 = _mm_avg_epu8(_mm_avg_epu8(x0, x2), x1);
+
+ scorec0 = _mm_xor_si128(scorec0, x80b);
+ scorel1 = _mm_xor_si128(scorel1, x80b);
+ scorel2 = _mm_xor_si128(scorel2, x80b);
+ scorer1 = _mm_xor_si128(scorer1, x80b);
+ scorer2 = _mm_xor_si128(scorer2, x80b);
+
+ // result = (scorel1 < scorec0) ? (scorel2 < scorel1 ? l2 : l1) : (scorer1 < scorec0) ? (scorer2 < scorer1 ? r2 : r1) : c0
+
+ __m128i cmplt_l1_c0 = _mm_cmplt_epi8(scorel1, scorec0);
+ __m128i cmplt_r1_c0 = _mm_cmplt_epi8(scorer1, scorec0);
+ __m128i cmplt_l1_r1 = _mm_cmplt_epi8(scorel1, scorer1);
+ // As coded, the left side is taken only where l1 beats both r1 and c0; the right side where l1 does not beat r1 and r1 beats c0.
+ __m128i is_l1 = _mm_and_si128(cmplt_l1_r1, cmplt_l1_c0);
+ __m128i is_r1 = _mm_andnot_si128(cmplt_l1_r1, cmplt_r1_c0);
+ __m128i is_c0_inv = _mm_or_si128(cmplt_l1_c0, cmplt_r1_c0);
+ __m128i is_c0 = _mm_andnot_si128(is_c0_inv, _mm_cmpeq_epi8(zero, zero)); // NOT is_c0_inv (cmpeq(zero, zero) yields all ones)
+
+ __m128i is_l2 = _mm_and_si128(is_l1, _mm_cmplt_epi8(scorel2, scorel1));
+ __m128i is_r2 = _mm_and_si128(is_r1, _mm_cmplt_epi8(scorer2, scorer1));
+ // Remove the l2/r2 bytes from l1/r1 so the five masks are mutually exclusive.
+ is_l1 = _mm_andnot_si128(is_l2, is_l1);
+ is_r1 = _mm_andnot_si128(is_r2, is_r1);
+
+ __m128i mask_c0 = is_c0;
+ __m128i mask_l1 = is_l1;
+ __m128i mask_l2 = is_l2;
+ __m128i mask_r1 = is_r1;
+ __m128i mask_r2 = is_r2;
+ // Candidate vectors [5..9] of the current group, each masked by its selection mask.
+ __m128i result_c0 = _mm_and_si128(elabuf[5], mask_c0);
+ __m128i result_l1 = _mm_and_si128(elabuf[6], mask_l1);
+ __m128i result_l2 = _mm_and_si128(elabuf[7], mask_l2);
+ __m128i result_r1 = _mm_and_si128(elabuf[8], mask_r1);
+ __m128i result_r2 = _mm_and_si128(elabuf[9], mask_r2);
+
+ elabuf += 10;
+
+ __m128i pred = _mm_or_si128(_mm_or_si128(_mm_or_si128(result_l1, result_l2), _mm_or_si128(result_r1, result_r2)), result_c0);
+
+ *dst++ = pred;
+ } while(--w16);
+ }
+#endif
+
+#if defined(VD_CPU_X86)
+ void __declspec(naked) __stdcall ela_L8_MMX(void *dst, const void *srcat, const void *srcab, int w16) { // MMX scoring pass: per 16 source bytes writes 160 bytes (5 abs-difference vectors + 5 average vectors) to dst.
+ static const __declspec(align(8)) uint64 xFEb = 0xfefefefefefefefe; // clears each byte's low bit before the shared right shift in the emulated average
+ __asm {
+ push ebx
+ mov ebx, [esp + 4 + 4] // ebx = dst (the extra +4 skips the ebx just pushed)
+ mov ecx, [esp + 8 + 4] // ecx = srcat
+ mov edx, [esp + 12 + 4] // edx = srcab
+ mov eax, [esp + 16 + 4] // eax = w16 (loop count)
+ movq mm7, xFEb
+
+xloop:
+ movq mm0, [ecx + 15]
+ movq mm1, [edx + 15]
+ movq mm4, mm0
+ movq mm2, [ecx + 15 + 8]
+ psubusb mm0, mm1
+ movq mm3, [edx + 15 + 8]
+ movq mm5, mm2
+ psubusb mm2, mm3
+ psubusb mm1, mm4
+ psubusb mm3, mm5
+ por mm0, mm1
+ movq [ebx], mm0
+ por mm2, mm3
+ movq [ebx + 8], mm2
+ // Above: |top[15..] - bottom[15..]| via two saturating subtractions OR'ed, stored at dst+0. The next four groups repeat this for offset pairs (14,16), (13,17), (16,14), (17,13).
+ movq mm0, [ecx + 14]
+ movq mm1, [edx + 16]
+ movq mm4, mm0
+ movq mm2, [ecx + 14 + 8]
+ psubusb mm0, mm1
+ movq mm3, [edx + 16 + 8]
+ movq mm5, mm2
+ psubusb mm2, mm3
+ psubusb mm1, mm4
+ psubusb mm3, mm5
+ por mm0, mm1
+ movq [ebx + 16], mm0
+ por mm2, mm3
+ movq [ebx + 24], mm2
+
+ movq mm0, [ecx + 13]
+ movq mm1, [edx + 17]
+ movq mm4, mm0
+ movq mm2, [ecx + 13 + 8]
+ psubusb mm0, mm1
+ movq mm3, [edx + 17 + 8]
+ movq mm5, mm2
+ psubusb mm2, mm3
+ psubusb mm1, mm4
+ psubusb mm3, mm5
+ por mm0, mm1
+ movq [ebx + 32], mm0
+ por mm2, mm3
+ movq [ebx + 40], mm2
+
+ movq mm0, [ecx + 16]
+ movq mm1, [edx + 14]
+ movq mm4, mm0
+ movq mm2, [ecx + 16 + 8]
+ psubusb mm0, mm1
+ movq mm3, [edx + 14 + 8]
+ movq mm5, mm2
+ psubusb mm2, mm3
+ psubusb mm1, mm4
+ psubusb mm3, mm5
+ por mm0, mm1
+ movq [ebx + 48], mm0
+ por mm2, mm3
+ movq [ebx + 56], mm2
+
+ movq mm0, [ecx + 17]
+ movq mm1, [edx + 13]
+ movq mm4, mm0
+ movq mm2, [ecx + 17 + 8]
+ psubusb mm0, mm1
+ movq mm3, [edx + 13 + 8]
+ movq mm5, mm2
+ psubusb mm2, mm3
+ psubusb mm1, mm4
+ psubusb mm3, mm5
+ por mm0, mm1
+ movq [ebx + 64], mm0
+ por mm2, mm3
+ movq [ebx + 72], mm2
+ // Averages follow, computed per byte as (a | b) - (((a ^ b) & 0xFE) >> 1). Offset pairs (top,bottom): (16,16), (15,17), (14,18), (17,15), (18,14), stored at dst+80..+159.
+ movq mm0, [ecx + 16]
+ movq mm1, [edx + 16]
+ movq mm2, [ecx + 16 + 8]
+ movq mm3, [edx + 16 + 8]
+ movq mm4, mm0
+ movq mm5, mm2
+ pxor mm0, mm1
+ pxor mm2, mm3
+ por mm1, mm4
+ por mm3, mm5
+ pand mm0, mm7
+ pand mm2, mm7
+ psrlw mm0, 1
+ psrlw mm2, 1
+ psubb mm1, mm0
+ psubb mm3, mm2
+ movq [ebx + 80], mm1
+ movq [ebx + 88], mm3
+
+ movq mm0, [ecx + 15]
+ movq mm1, [edx + 17]
+ movq mm2, [ecx + 15 + 8]
+ movq mm3, [edx + 17 + 8]
+ movq mm4, mm0
+ movq mm5, mm2
+ pxor mm0, mm1
+ pxor mm2, mm3
+ por mm1, mm4
+ por mm3, mm5
+ pand mm0, mm7
+ pand mm2, mm7
+ psrlw mm0, 1
+ psrlw mm2, 1
+ psubb mm1, mm0
+ psubb mm3, mm2
+ movq [ebx + 96], mm1
+ movq [ebx + 104], mm3
+
+ movq mm0, [ecx + 14]
+ movq mm1, [edx + 18]
+ movq mm2, [ecx + 14 + 8]
+ movq mm3, [edx + 18 + 8]
+ movq mm4, mm0
+ movq mm5, mm2
+ pxor mm0, mm1
+ pxor mm2, mm3
+ por mm1, mm4
+ por mm3, mm5
+ pand mm0, mm7
+ pand mm2, mm7
+ psrlw mm0, 1
+ psrlw mm2, 1
+ psubb mm1, mm0
+ psubb mm3, mm2
+ movq [ebx + 112], mm1
+ movq [ebx + 120], mm3
+
+ movq mm0, [ecx + 17]
+ movq mm1, [edx + 15]
+ movq mm2, [ecx + 17 + 8]
+ movq mm3, [edx + 15 + 8]
+ movq mm4, mm0
+ movq mm5, mm2
+ pxor mm0, mm1
+ pxor mm2, mm3
+ por mm1, mm4
+ por mm3, mm5
+ pand mm0, mm7
+ pand mm2, mm7
+ psrlw mm0, 1
+ psrlw mm2, 1
+ psubb mm1, mm0
+ psubb mm3, mm2
+ movq [ebx + 128], mm1
+ movq [ebx + 136], mm3
+
+ movq mm0, [ecx + 18]
+ movq mm1, [edx + 14]
+ movq mm2, [ecx + 18 + 8]
+ movq mm3, [edx + 14 + 8]
+ movq mm4, mm0
+ movq mm5, mm2
+ pxor mm0, mm1
+ pxor mm2, mm3
+ por mm1, mm4
+ por mm3, mm5
+ pand mm0, mm7
+ pand mm2, mm7
+ psrlw mm0, 1
+ psrlw mm2, 1
+ psubb mm1, mm0
+ psubb mm3, mm2
+ movq [ebx + 144], mm1
+ movq [ebx + 152], mm3
+ // Advance: 160 output bytes and 16 bytes of each source row per iteration.
+ add ebx, 160
+ add ecx, 16
+ add edx, 16
+ dec eax
+ jne xloop
+
+ emms
+ pop ebx
+ ret 16 // __stdcall: callee removes the four 4-byte arguments
+ }
+ }
+
+ void __declspec(naked) __stdcall ela_L8_ISSE(void *dst, const void *srcat, const void *srcab, int w16) { // Same output layout as ela_L8_MMX, but the five averages use pavgb instead of the emulated sequence.
+ static const __declspec(align(8)) uint64 xFEb = 0xfefefefefefefefe;
+ __asm {
+ push ebx
+ mov ebx, [esp + 4 + 4] // ebx = dst (the extra +4 skips the ebx just pushed)
+ mov ecx, [esp + 8 + 4] // ecx = srcat
+ mov edx, [esp + 12 + 4] // edx = srcab
+ mov eax, [esp + 16 + 4] // eax = w16 (loop count)
+ movq mm7, xFEb // NOTE(review): mm7 is loaded but no instruction in this loop reads it
+
+xloop:
+ movq mm0, [ecx + 15]
+ movq mm1, [edx + 15]
+ movq mm4, mm0
+ movq mm2, [ecx + 15 + 8]
+ psubusb mm0, mm1
+ movq mm3, [edx + 15 + 8]
+ movq mm5, mm2
+ psubusb mm2, mm3
+ psubusb mm1, mm4
+ psubusb mm3, mm5
+ por mm0, mm1
+ movq [ebx], mm0
+ por mm2, mm3
+ movq [ebx + 8], mm2
+ // Above: |top[15..] - bottom[15..]| stored at dst+0. The next four groups use offset pairs (14,16), (13,17), (16,14), (17,13).
+ movq mm0, [ecx + 14]
+ movq mm1, [edx + 16]
+ movq mm4, mm0
+ movq mm2, [ecx + 14 + 8]
+ psubusb mm0, mm1
+ movq mm3, [edx + 16 + 8]
+ movq mm5, mm2
+ psubusb mm2, mm3
+ psubusb mm1, mm4
+ psubusb mm3, mm5
+ por mm0, mm1
+ movq [ebx + 16], mm0
+ por mm2, mm3
+ movq [ebx + 24], mm2
+
+ movq mm0, [ecx + 13]
+ movq mm1, [edx + 17]
+ movq mm4, mm0
+ movq mm2, [ecx + 13 + 8]
+ psubusb mm0, mm1
+ movq mm3, [edx + 17 + 8]
+ movq mm5, mm2
+ psubusb mm2, mm3
+ psubusb mm1, mm4
+ psubusb mm3, mm5
+ por mm0, mm1
+ movq [ebx + 32], mm0
+ por mm2, mm3
+ movq [ebx + 40], mm2
+
+ movq mm0, [ecx + 16]
+ movq mm1, [edx + 14]
+ movq mm4, mm0
+ movq mm2, [ecx + 16 + 8]
+ psubusb mm0, mm1
+ movq mm3, [edx + 14 + 8]
+ movq mm5, mm2
+ psubusb mm2, mm3
+ psubusb mm1, mm4
+ psubusb mm3, mm5
+ por mm0, mm1
+ movq [ebx + 48], mm0
+ por mm2, mm3
+ movq [ebx + 56], mm2
+
+ movq mm0, [ecx + 17]
+ movq mm1, [edx + 13]
+ movq mm4, mm0
+ movq mm2, [ecx + 17 + 8]
+ psubusb mm0, mm1
+ movq mm3, [edx + 13 + 8]
+ movq mm5, mm2
+ psubusb mm2, mm3
+ psubusb mm1, mm4
+ psubusb mm3, mm5
+ por mm0, mm1
+ movq [ebx + 64], mm0
+ por mm2, mm3
+ movq [ebx + 72], mm2
+ // Averages via pavgb. Offset pairs (top,bottom): (16,16), (15,17), (14,18), (17,15), (18,14), stored at dst+80..+159.
+ movq mm0, [ecx + 16]
+ movq mm1, [edx + 16]
+ movq mm2, [ecx + 16 + 8]
+ movq mm3, [edx + 16 + 8]
+ pavgb mm1, mm0
+ pavgb mm3, mm2
+ movq [ebx + 80], mm1
+ movq [ebx + 88], mm3
+
+ movq mm0, [ecx + 15]
+ movq mm1, [edx + 17]
+ movq mm2, [ecx + 15 + 8]
+ movq mm3, [edx + 17 + 8]
+ pavgb mm1, mm0
+ pavgb mm3, mm2
+ movq [ebx + 96], mm1
+ movq [ebx + 104], mm3
+
+ movq mm0, [ecx + 14]
+ movq mm1, [edx + 18]
+ movq mm2, [ecx + 14 + 8]
+ movq mm3, [edx + 18 + 8]
+ pavgb mm1, mm0
+ pavgb mm3, mm2
+ movq [ebx + 112], mm1
+ movq [ebx + 120], mm3
+
+ movq mm0, [ecx + 17]
+ movq mm1, [edx + 15]
+ movq mm2, [ecx + 17 + 8]
+ movq mm3, [edx + 15 + 8]
+ pavgb mm1, mm0
+ pavgb mm3, mm2
+ movq [ebx + 128], mm1
+ movq [ebx + 136], mm3
+
+ movq mm0, [ecx + 18]
+ movq mm1, [edx + 14]
+ movq mm2, [ecx + 18 + 8]
+ movq mm3, [edx + 14 + 8]
+ pavgb mm1, mm0
+ pavgb mm3, mm2
+ movq [ebx + 144], mm1
+ movq [ebx + 152], mm3
+ // Advance: 160 output bytes and 16 bytes of each source row per iteration.
+ add ebx, 160
+ add ecx, 16
+ add edx, 16
+ dec eax
+ jne xloop
+
+ emms
+ pop ebx
+ ret 16 // __stdcall: callee removes the four 4-byte arguments
+ }
+ }
+
+ void __declspec(naked) __stdcall nela_L8_ISSE(void *dst, const void *elabuf, int w16) { // Selection pass: for each 160-byte group in elabuf writes 16 output bytes, each taken from one of the five candidate vectors at +50h..+90h.
+ static const __declspec(align(8)) uint64 x7fb = 0x7f7f7f7f7f7f7f7f; // XOR with 0x7f turns unsigned byte v into signed 127 - v
+ __asm {
+ mov edx, [esp+4] // edx = dst
+ mov ecx, [esp+8] // ecx = elabuf
+ mov eax, [esp+12] // eax = w16 (loop count)
+xloop:
+ movq mm0, [ecx+000h]
+ pavgb mm0, [ecx+002h]
+ pavgb mm0, [ecx+001h]
+ // Each score (vectors at +00h..+40h) is smoothed as avg(avg(s[+0], s[+2]), s[+1]); this first half covers the low 8 bytes of each 16-byte vector.
+ movq mm1, [ecx+010h]
+ pavgb mm1, [ecx+012h]
+ pavgb mm1, [ecx+011h]
+
+ movq mm2, [ecx+020h]
+ pavgb mm2, [ecx+022h]
+ pavgb mm2, [ecx+021h]
+
+ movq mm3, [ecx+030h]
+ pavgb mm3, [ecx+032h]
+ pavgb mm3, [ecx+031h]
+
+ movq mm4, [ecx+040h]
+ pavgb mm4, [ecx+042h]
+ pavgb mm4, [ecx+041h]
+
+ movq mm5, x7fb
+ pxor mm0, mm5
+ pxor mm1, mm5
+ pxor mm2, mm5
+ pxor mm3, mm5
+ pxor mm4, mm5
+ // After the XOR a larger raw score is a smaller signed byte, so each signed pcmpgtb below is true where the first-named raw score is the LOWER one.
+ ;mm0 = scorec0
+ ;mm1 = scorel1
+ ;mm2 = scorel2
+ ;mm3 = scorer1
+ ;mm4 = scorer2
+
+ movq mm5, mm3
+ pcmpgtb mm5, mm1 ;(scorer1 > scorel1) == (scorel1 < scorer1)
+
+ pcmpgtb mm4, mm3 ;scorer2 > scorer1
+ pcmpgtb mm2, mm1 ;scorel2 > scorel1
+
+ pcmpgtb mm1, mm0 ;scorel1 > scorec0
+ pcmpgtb mm3, mm0 ;scorer1 > scorec0
+
+ movq mm6, mm1
+ pcmpeqb mm0, mm0
+ por mm6, mm3 ;scorel1 > scorec0 || scorer1 > scorec0
+ pxor mm0, mm6 ;mask_c0
+
+ pand mm3, mm5 ;scorer1 > scorec0 && scorer1 > scorel1
+ pandn mm5, mm1 ;scorel1 > scorec0 && scorel1 >= scorer1
+
+ pand mm4, mm3 ;mask_r2
+ pand mm2, mm5 ;mask_l2
+
+ pxor mm3, mm4 ;mask_r1
+ pxor mm5, mm2 ;mask_l1
+ // Apply the five mutually exclusive masks to the candidate vectors and merge them.
+ pand mm0, [ecx+050h]
+ pand mm5, [ecx+060h]
+ pand mm2, [ecx+070h]
+ pand mm3, [ecx+080h]
+ pand mm4, [ecx+090h]
+ por mm0, mm5
+ por mm2, mm3
+ por mm0, mm4
+ por mm0, mm2
+ movq [edx], mm0
+ // Second half (upper 8 bytes of each vector): the +1/+2 byte neighbours are rebuilt with shifts, taking the spill-over bytes from the same vector of the next group (0a0h further on).
+ movq mm0, [ecx+008h]
+ movq mm5, [ecx+0a0h]
+ movq mm6, mm0
+ psrlq mm0, 16
+ movq mm7, mm5
+ psllq mm5, 48
+ por mm0, mm5
+ psrlq mm6, 8
+ psllq mm7, 56
+ por mm6, mm7
+ pavgb mm0, [ecx+008h]
+ pavgb mm0, mm6
+
+ movq mm1, [ecx+018h]
+ movq mm5, [ecx+0b0h]
+ movq mm6, mm1
+ psrlq mm1, 16
+ movq mm7, mm5
+ psllq mm5, 48
+ por mm1, mm5
+ psrlq mm6, 8
+ psllq mm7, 56
+ por mm6, mm7
+ pavgb mm1, [ecx+018h]
+ pavgb mm1, mm6
+
+ movq mm2, [ecx+028h]
+ movq mm5, [ecx+0c0h]
+ movq mm6, mm2
+ psrlq mm2, 16
+ movq mm7, mm5
+ psllq mm5, 48
+ por mm2, mm5
+ psrlq mm6, 8
+ psllq mm7, 56
+ por mm6, mm7
+ pavgb mm2, [ecx+028h]
+ pavgb mm2, mm6
+
+ movq mm3, [ecx+038h]
+ movq mm5, [ecx+0d0h]
+ movq mm6, mm3
+ psrlq mm3, 16
+ movq mm7, mm5
+ psllq mm5, 48
+ por mm3, mm5
+ psrlq mm6, 8
+ psllq mm7, 56
+ por mm6, mm7
+ pavgb mm3, [ecx+038h]
+ pavgb mm3, mm6
+
+ movq mm4, [ecx+048h]
+ movq mm5, [ecx+0e0h]
+ movq mm6, mm4
+ psrlq mm4, 16
+ movq mm7, mm5
+ psllq mm5, 48
+ por mm4, mm5
+ psrlq mm6, 8
+ psllq mm7, 56
+ por mm6, mm7
+ pavgb mm4, [ecx+048h]
+ pavgb mm4, mm6
+
+ movq mm5, x7fb
+ pxor mm0, mm5
+ pxor mm1, mm5
+ pxor mm2, mm5
+ pxor mm3, mm5
+ pxor mm4, mm5
+
+ ;mm0 = scorec0
+ ;mm1 = scorel1
+ ;mm2 = scorel2
+ ;mm3 = scorer1
+ ;mm4 = scorer2
+
+ movq mm5, mm3
+ pcmpgtb mm5, mm1 ;(scorer1 > scorel1) == (scorel1 < scorer1)
+
+ pcmpgtb mm4, mm3 ;scorer2 > scorer1
+ pcmpgtb mm2, mm1 ;scorel2 > scorel1
+
+ pcmpgtb mm1, mm0 ;scorel1 > scorec0
+ pcmpgtb mm3, mm0 ;scorer1 > scorec0
+
+ movq mm6, mm1
+ pcmpeqb mm0, mm0
+ por mm6, mm3 ;scorel1 > scorec0 || scorer1 > scorec0
+ pxor mm0, mm6 ;mask_c0
+
+ pand mm3, mm5 ;scorer1 > scorec0 && scorer1 > scorel1
+ pandn mm5, mm1 ;scorel1 > scorec0 && scorel1 >= scorer1
+
+ pand mm4, mm3 ;mask_r2
+ pand mm2, mm5 ;mask_l2
+
+ pxor mm3, mm4 ;mask_r1
+ pxor mm5, mm2 ;mask_l1
+
+ pand mm0, [ecx+058h]
+ pand mm5, [ecx+068h]
+ pand mm2, [ecx+078h]
+ pand mm3, [ecx+088h]
+ pand mm4, [ecx+098h]
+ por mm0, mm5
+ por mm2, mm3
+ por mm0, mm4
+ por mm0, mm2
+ movq [edx+8], mm0
+ // Advance one 160-byte group of input and 16 bytes of output.
+ add ecx, 0a0h
+ add edx, 16
+
+ dec eax
+ jnz xloop
+
+ emms
+ ret 12 // __stdcall: callee removes the three 4-byte arguments
+ }
+ }
+
+ void __declspec(naked) __stdcall nela_L8_MMX(void *dst, const void *elabuf, int w16) { // Plain-MMX variant of the selection pass: same structure as nela_L8_ISSE with every pavgb replaced by an emulated byte average.
+ static const __declspec(align(8)) uint64 x7fb = 0x7f7f7f7f7f7f7f7f; // used both as the post-shift byte mask in the emulated average and as the score XOR constant
+
+ __asm {
+ mov edx, [esp+4] // edx = dst
+ mov ecx, [esp+8] // ecx = elabuf
+ mov eax, [esp+12] // eax = w16 (loop count)
+xloop:
+ movq mm0, [ecx+000h]
+ movq mm5, [ecx+002h]
+ movq mm6, mm0
+ pxor mm0, mm5
+ por mm6, mm5
+ movq mm7, [ecx+001h]
+ psrlq mm0, 1
+ pand mm0, x7fb
+ psubb mm6, mm0
+ movq mm0, mm6
+ pxor mm6, mm7
+ por mm0, mm7
+ psrlq mm6, 1
+ pand mm6, x7fb
+ psubb mm0, mm6
+ // Above: avg(avg(s[+0], s[+2]), s[+1]) for score vector 0, each average formed as (a | b) - (((a ^ b) >> 1) & 0x7f). The next four groups do the same for the vectors at +10h..+40h.
+ movq mm1, [ecx+010h]
+ movq mm5, [ecx+012h]
+ movq mm6, mm1
+ pxor mm1, mm5
+ por mm6, mm5
+ movq mm7, [ecx+011h]
+ psrlq mm1, 1
+ pand mm1, x7fb
+ psubb mm6, mm1
+ movq mm1, mm6
+ pxor mm6, mm7
+ por mm1, mm7
+ psrlq mm6, 1
+ pand mm6, x7fb
+ psubb mm1, mm6
+
+ movq mm2, [ecx+020h]
+ movq mm5, [ecx+022h]
+ movq mm6, mm2
+ pxor mm2, mm5
+ por mm6, mm5
+ movq mm7, [ecx+021h]
+ psrlq mm2, 1
+ pand mm2, x7fb
+ psubb mm6, mm2
+ movq mm2, mm6
+ pxor mm6, mm7
+ por mm2, mm7
+ psrlq mm6, 1
+ pand mm6, x7fb
+ psubb mm2, mm6
+
+ movq mm3, [ecx+030h]
+ movq mm5, [ecx+032h]
+ movq mm6, mm3
+ pxor mm3, mm5
+ por mm6, mm5
+ movq mm7, [ecx+031h]
+ psrlq mm3, 1
+ pand mm3, x7fb
+ psubb mm6, mm3
+ movq mm3, mm6
+ pxor mm6, mm7
+ por mm3, mm7
+ psrlq mm6, 1
+ pand mm6, x7fb
+ psubb mm3, mm6
+
+ movq mm4, [ecx+040h]
+ movq mm5, [ecx+042h]
+ movq mm6, mm4
+ pxor mm4, mm5
+ por mm6, mm5
+ movq mm7, [ecx+041h]
+ psrlq mm4, 1
+ pand mm4, x7fb
+ psubb mm6, mm4
+ movq mm4, mm6
+ pxor mm6, mm7
+ por mm4, mm7
+ psrlq mm6, 1
+ pand mm6, x7fb
+ psubb mm4, mm6
+
+ // After the XOR below a larger raw score is a smaller signed byte, so each signed pcmpgtb is true where the first-named raw score is the LOWER one.
+ movq mm5, x7fb
+ pxor mm0, mm5
+ pxor mm1, mm5
+ pxor mm2, mm5
+ pxor mm3, mm5
+ pxor mm4, mm5
+
+ ;mm0 = scorec0
+ ;mm1 = scorel1
+ ;mm2 = scorel2
+ ;mm3 = scorer1
+ ;mm4 = scorer2
+
+ movq mm5, mm3
+ pcmpgtb mm5, mm1 ;(scorer1 > scorel1) == (scorel1 < scorer1)
+
+ pcmpgtb mm4, mm3 ;scorer2 > scorer1
+ pcmpgtb mm2, mm1 ;scorel2 > scorel1
+
+ pcmpgtb mm1, mm0 ;scorel1 > scorec0
+ pcmpgtb mm3, mm0 ;scorer1 > scorec0
+
+ movq mm6, mm1
+ pcmpeqb mm0, mm0
+ por mm6, mm3 ;scorel1 > scorec0 || scorer1 > scorec0
+ pxor mm0, mm6 ;mask_c0
+
+ pand mm3, mm5 ;scorer1 > scorec0 && scorer1 > scorel1
+ pandn mm5, mm1 ;scorel1 > scorec0 && scorel1 >= scorer1
+
+ pand mm4, mm3 ;mask_r2
+ pand mm2, mm5 ;mask_l2
+
+ pxor mm3, mm4 ;mask_r1
+ pxor mm5, mm2 ;mask_l1
+ // Apply the five mutually exclusive masks to the candidate vectors at +50h..+90h and merge them.
+ pand mm0, [ecx+050h]
+ pand mm5, [ecx+060h]
+ pand mm2, [ecx+070h]
+ pand mm3, [ecx+080h]
+ pand mm4, [ecx+090h]
+ por mm0, mm5
+ por mm2, mm3
+ por mm0, mm4
+ por mm0, mm2
+
+ movq [edx], mm0
+ // Second half (upper 8 bytes of each vector): neighbours are rebuilt with shifts, taking spill-over bytes from the same vector of the next group (0a0h further on).
+ movq mm0, [ecx+008h] ;mm0 = x0
+ movq mm5, [ecx+0a0h] ;mm5 = x1
+ psrlq mm0, 16 ;mm0 = (x0 >> 16)
+ movq mm7, [ecx+008h] ;mm7 = y0 = x0
+ psllq mm5, 48 ;mm5 = (x1 << 48)
+ movq mm6, mm7 ;mm6 = y0 = x0
+ por mm0, mm5 ;mm0 = y2 = (x0 >> 16) | (x1 << 48)
+ pxor mm6, mm0 ;mm6 = y0 ^ y2
+ por mm7, mm0 ;mm7 = y0 | y2
+ movq mm5, [ecx+008h] ;mm5 = x0
+ psrlq mm6, 1 ;mm6 = (y0 ^ y2) >> 1
+ movq mm0, [ecx+0a0h] ;mm0 = x1
+ psrlq mm5, 8 ;mm5 = x0 >> 8
+ pand mm6, x7fb ;mm6 = ((y0 ^ y2) >> 1) & 0x7f7f7f7f7f7f7f7f
+ psllq mm0, 56 ;mm0 = x1 << 56
+ psubb mm7, mm6 ;mm7 = t = (y0 | y2) - (((y0 ^ y2) >> 1) & 0x7f7f7f7f7f7f7f7f) = avgb(y0, y2)
+ por mm0, mm5 ;mm0 = y1 = (x0 >> 8) | (x1 << 56)
+ movq mm6, mm7 ;mm6 = t
+ pxor mm7, mm0 ;mm7 = t ^ y1
+ por mm0, mm6 ;mm0 = t | y1
+ psrlq mm7, 1 ;mm7 = (t ^ y1) >> 1
+ pand mm7, x7fb ;mm7 = ((t ^ y1) >> 1) & 0x7f7f7f7f7f7f7f7f
+ psubb mm0, mm7 ;mm0 = (t | y1) - (((t ^ y1) >> 1) & 0x7f7f7f7f7f7f7f7f) = avgb(y1, avgb(y0, y2))
+ // The four groups below repeat the annotated sequence above for the vectors at +18h, +28h, +38h and +48h.
+ movq mm1, [ecx+018h]
+ movq mm5, [ecx+0b0h]
+ psrlq mm1, 16
+ movq mm7, [ecx+018h]
+ psllq mm5, 48
+ movq mm6, mm7
+ por mm1, mm5
+ pxor mm6, mm1
+ por mm7, mm1
+ movq mm5, [ecx+018h]
+ psrlq mm6, 1
+ movq mm1, [ecx+0b0h]
+ psrlq mm5, 8
+ pand mm6, x7fb
+ psllq mm1, 56
+ psubb mm7, mm6
+ por mm1, mm5
+ movq mm6, mm7
+ pxor mm7, mm1
+ por mm1, mm6
+ psrlq mm7, 1
+ pand mm7, x7fb
+ psubb mm1, mm7
+
+ movq mm2, [ecx+028h]
+ movq mm5, [ecx+0c0h]
+ psrlq mm2, 16
+ movq mm7, [ecx+028h]
+ psllq mm5, 48
+ movq mm6, mm7
+ por mm2, mm5
+ pxor mm6, mm2
+ por mm7, mm2
+ movq mm5, [ecx+028h]
+ psrlq mm6, 1
+ movq mm2, [ecx+0c0h]
+ psrlq mm5, 8
+ pand mm6, x7fb
+ psllq mm2, 56
+ psubb mm7, mm6
+ por mm2, mm5
+ movq mm6, mm7
+ pxor mm7, mm2
+ por mm2, mm6
+ psrlq mm7, 1
+ pand mm7, x7fb
+ psubb mm2, mm7
+
+ movq mm3, [ecx+038h]
+ movq mm5, [ecx+0d0h]
+ psrlq mm3, 16
+ movq mm7, [ecx+038h]
+ psllq mm5, 48
+ movq mm6, mm7
+ por mm3, mm5
+ pxor mm6, mm3
+ por mm7, mm3
+ movq mm5, [ecx+038h]
+ psrlq mm6, 1
+ movq mm3, [ecx+0d0h]
+ psrlq mm5, 8
+ pand mm6, x7fb
+ psllq mm3, 56
+ psubb mm7, mm6
+ por mm3, mm5
+ movq mm6, mm7
+ pxor mm7, mm3
+ por mm3, mm6
+ psrlq mm7, 1
+ pand mm7, x7fb
+ psubb mm3, mm7
+
+ movq mm4, [ecx+048h]
+ movq mm5, [ecx+0e0h]
+ psrlq mm4, 16
+ movq mm7, [ecx+048h]
+ psllq mm5, 48
+ movq mm6, mm7
+ por mm4, mm5
+ pxor mm6, mm4
+ por mm7, mm4
+ movq mm5, [ecx+048h]
+ psrlq mm6, 1
+ movq mm4, [ecx+0e0h]
+ psrlq mm5, 8
+ pand mm6, x7fb
+ psllq mm4, 56
+ psubb mm7, mm6
+ por mm4, mm5
+ movq mm6, mm7
+ pxor mm7, mm4
+ por mm4, mm6
+ psrlq mm7, 1
+ pand mm7, x7fb
+ psubb mm4, mm7
+
+ movq mm5, x7fb
+ pxor mm0, mm5
+ pxor mm1, mm5
+ pxor mm2, mm5
+ pxor mm3, mm5
+ pxor mm4, mm5
+
+ ;mm0 = scorec0
+ ;mm1 = scorel1
+ ;mm2 = scorel2
+ ;mm3 = scorer1
+ ;mm4 = scorer2
+
+ movq mm5, mm3
+ pcmpgtb mm5, mm1 ;(scorer1 > scorel1) == (scorel1 < scorer1)
+
+ pcmpgtb mm4, mm3 ;scorer2 > scorer1
+ pcmpgtb mm2, mm1 ;scorel2 > scorel1
+
+ pcmpgtb mm1, mm0 ;scorel1 > scorec0
+ pcmpgtb mm3, mm0 ;scorer1 > scorec0
+
+ movq mm6, mm1
+ pcmpeqb mm0, mm0
+ por mm6, mm3 ;scorel1 > scorec0 || scorer1 > scorec0
+ pxor mm0, mm6 ;mask_c0
+
+ pand mm3, mm5 ;scorer1 > scorec0 && scorer1 > scorel1
+ pandn mm5, mm1 ;scorel1 > scorec0 && scorel1 >= scorer1
+
+ pand mm4, mm3 ;mask_r2
+ pand mm2, mm5 ;mask_l2
+
+ pxor mm3, mm4 ;mask_r1
+ pxor mm5, mm2 ;mask_l1
+
+ pand mm0, [ecx+058h]
+ pand mm5, [ecx+068h]
+ pand mm2, [ecx+078h]
+ pand mm3, [ecx+088h]
+ pand mm4, [ecx+098h]
+ por mm0, mm5
+ por mm2, mm3
+ por mm0, mm4
+ por mm0, mm2
+ movq [edx+8], mm0
+ // Advance one 160-byte group of input and 16 bytes of output.
+ add ecx, 0a0h
+ add edx, 16
+
+ dec eax
+ jnz xloop
+
+ emms
+ ret 12 // __stdcall: callee removes the three 4-byte arguments
+ }
+ }
#endif
namespace {
+ void ela_L8_scalar(uint8 *dst, const uint8 *srcat, const uint8 *srcab, int w16) {
+     // Scalar scoring pass. For each of the w16*16 positions a 10-byte group
+     // is written to dst:
+     //   [0..4]  absolute top/bottom differences for directions c0, l1, l2, r1, r2
+     //   [5..9]  rounded averages of the matching top/bottom candidate pairs
+     // Both rows are addressed relative to byte 16 of the supplied buffers, with
+     // taps reaching from -3 to +2 around the moving position.
+     const uint8 *top = srcat + 16;
+     const uint8 *bot = srcab + 16;
+     int remaining = w16 << 4;
+
+     do {
+         // Read every tap before anything is stored.
+         const int tl2 = top[-3], tl1 = top[-2], tc0 = top[-1];
+         const int tr1 = top[0],  tr2 = top[1],  tr3 = top[2];
+         const int bl2 = bot[-3], bl1 = bot[-2], bc0 = bot[-1];
+         const int br1 = bot[0],  br2 = bot[1],  br3 = bot[2];
+
+         // Direction scores: each top tap is paired with the mirrored bottom tap.
+         dst[0] = (uint8)abs(tc0 - bc0);
+         dst[1] = (uint8)abs(tl1 - br1);
+         dst[2] = (uint8)abs(tl2 - br2);
+         dst[3] = (uint8)abs(tr1 - bl1);
+         dst[4] = (uint8)abs(tr2 - bl2);
+
+         // Candidate values: average of the pair, rounding halves up.
+         dst[5] = (uint8)((tr1 + br1 + 1) >> 1);
+         dst[6] = (uint8)((tc0 + br2 + 1) >> 1);
+         dst[7] = (uint8)((tl1 + br3 + 1) >> 1);
+         dst[8] = (uint8)((tr2 + bc0 + 1) >> 1);
+         dst[9] = (uint8)((tr3 + bl1 + 1) >> 1);
+
+         dst += 10;
+         ++top;
+         ++bot;
+     } while(--remaining);
+ }
+
+ void nela_L8_scalar(uint8 *dst, const uint8 *elabuf, int w16) {
+     // Scalar selection pass over the 10-byte groups written by the scoring pass.
+     // For each output byte, the direction scores of three consecutive groups
+     // are combined with weights 1-2-1, and the candidate (bytes 5..9 of the
+     // first group) belonging to the winning direction is emitted.
+     // The centre direction wins ties; the second step in a direction is only
+     // examined once the first step already improved on the running best.
+     int remaining = w16 << 4;
+
+     do {
+         const uint8 *g0 = elabuf;       // group holding the candidates
+         const uint8 *g1 = elabuf + 10;  // group weighted twice
+         const uint8 *g2 = elabuf + 20;
+
+         int best = g1[0]*2 + (g0[0] + g2[0]);
+         int pick = 5;
+
+         const int left1 = g1[1]*2 + (g0[1] + g2[1]);
+         if (left1 < best) {
+             best = left1;
+             pick = 6;
+
+             const int left2 = g1[2]*2 + (g0[2] + g2[2]);
+             if (left2 < best) {
+                 best = left2;
+                 pick = 7;
+             }
+         }
+
+         const int right1 = g1[3]*2 + (g0[3] + g2[3]);
+         if (right1 < best) {
+             best = right1;
+             pick = 8;
+
+             const int right2 = g1[4]*2 + (g0[4] + g2[4]);
+             if (right2 < best)
+                 pick = 9;
+         }
+
+         *dst++ = g0[pick];
+         elabuf += 10;
+     } while(--remaining);
+ }
+
+ void BlendScanLine_NELA_scalar(void *dst, const void *srcT, const void *srcB, uint32 w, uint8 *tempBuf) {
+     // Interpolates one scan line of w bytes from the line above (srcT) and the
+     // line below (srcB) using the scalar ELA kernels.
+     //
+     // tempBuf is carved into three regions (wr = w rounded up to 16):
+     //   elabuf : 10*wr bytes of scores/candidates
+     //   topbuf : wr + 32 bytes, padded copy of the top line (data starts at +16)
+     //   botbuf : wr + 32 bytes, padded copy of the bottom line
+     // The output row receives wr bytes, i.e. up to 15 bytes more than w.
+     //
+     // An empty line is skipped: the kernels use do/while loops and must never
+     // be entered with a zero count.
+     if (!w)
+         return;
+
+     const uint8 *srcat = (const uint8 *)srcT;
+     const uint8 *srcab = (const uint8 *)srcB;
+     const uint32 w16 = (w + 15) >> 4;
+     const uint32 wr = w16 << 4;
+
+     uint8 *elabuf = tempBuf;
+     uint8 *topbuf = elabuf + 10*wr;
+     uint8 *botbuf = topbuf + wr + 32;
+
+     // Left padding: the kernel reads three bytes to the left of the first pixel.
+     topbuf[13] = topbuf[14] = topbuf[15] = srcat[0];
+     botbuf[13] = botbuf[14] = botbuf[15] = srcab[0];
+
+     // Copy only the w valid bytes; everything to the right is synthesized
+     // below, so the source rows are never read past their real width.
+     for(uint32 x = 0; x < w; ++x) {
+         topbuf[x+16] = srcat[x];
+         botbuf[x+16] = srcab[x];
+     }
+
+     // Right padding: replicate the last real pixel of EACH row through the end
+     // of the rounded-up width plus the three extra bytes at wr+16..wr+18.
+     // (Previously the bottom row's value was stored into topbuf[wr+16..wr+17]
+     // and botbuf[wr+16..wr+17] were never written.)
+     const uint8 tv = topbuf[w+15];
+     const uint8 bv = botbuf[w+15];
+
+     for(uint32 x = w + 16; x < wr + 19; ++x) {
+         topbuf[x] = tv;
+         botbuf[x] = bv;
+     }
+
+     ela_L8_scalar(elabuf, topbuf, botbuf, w16);
+     nela_L8_scalar((uint8 *)dst, elabuf, w16);
+ }
+
+ void ela_X8R8G8B8_scalar(uint32 *dst, const uint8 *srcat, const uint8 *srcab, int w4) {
+     // Scoring pass for 32-bit pixels. For each of w4 positions, five weighted
+     // colour differences are written to dst: tap k compares the top pixel at
+     // byte offset 4*k with the bottom pixel at byte offset 16 - 4*k, i.e. the
+     // two rows are walked in opposite directions. Both rows start 4 bytes
+     // into the supplied buffers.
+     const uint8 *top = srcat + 4;
+     const uint8 *bot = srcab + 4;
+
+     do {
+         for(int tap = 0; tap < 5; ++tap) {
+             const uint8 *t = top + 4*tap;
+             const uint8 *b = bot + 16 - 4*tap;
+
+             const int diff0 = abs((int)t[0] - (int)b[0]);
+             const int diff1 = abs((int)t[1] - (int)b[1]);
+             const int diff2 = abs((int)t[2] - (int)b[2]);
+
+             // Weights sum to 256; byte 3 of each pixel is ignored.
+             dst[tap] = diff2*54 + diff1*183 + diff0*19;
+         }
+
+         dst += 5;
+         top += 4;
+         bot += 4;
+     } while(--w4);
+ }
+
+#if defined(VD_CPU_X86)
+ void __declspec(naked) __cdecl ela_X8R8G8B8_MMX(uint32 *dst, const uint8 *srcat, const uint8 *srcab, int w4) { // MMX scoring pass for 32-bit pixels: writes five weighted abs-difference dwords (20 bytes) per pixel.
+ static const __declspec(align(16)) uint64 kCoeff = 0x00003600b70013ull; // 16-bit weights 0x13 (19), 0xb7 (183), 0x36 (54), 0 for bytes 0..3 of a pixel
+
+ __asm {
+ push ebp
+ push edi
+ push esi
+ push ebx
+ // Four registers were pushed, so the arguments start at [esp+4+16].
+ mov ebx, [esp+4+16] // ebx = dst
+ mov ecx, [esp+8+16] // ecx = srcat
+ mov edx, [esp+12+16] // edx = srcab
+ add ecx, 4
+ add edx, 4
+ mov esi, [esp+16+16] // esi = w4 (loop count)
+ movq mm6, qword ptr [kCoeff]
+ pxor mm7, mm7
+
+ align 16
+xloop:
+ movd mm0, [ecx]
+ movd mm2, [edx + 16]
+ movq mm1, mm0
+ psubusb mm0, mm2
+ psubusb mm2, mm1
+ por mm0, mm2
+ punpcklbw mm0, mm7
+ pmaddwd mm0, mm6
+ movq mm1, mm0
+ psrlq mm0, 32
+ paddd mm0, mm1
+ movd [ebx], mm0
+ // Above: per-byte |top - bottom|, widened to words, multiplied by the weights and summed into one dword. Repeated below for (top,bottom) byte offsets (4,12), (8,8), (12,4), (16,0).
+ movd mm0, [ecx + 4]
+ movd mm2, [edx + 12]
+ movq mm1, mm0
+ psubusb mm0, mm2
+ psubusb mm2, mm1
+ por mm0, mm2
+ punpcklbw mm0, mm7
+ pmaddwd mm0, mm6
+ movq mm1, mm0
+ psrlq mm0, 32
+ paddd mm0, mm1
+ movd [ebx + 4], mm0
+
+ movd mm0, [ecx + 8]
+ movd mm2, [edx + 8]
+ movq mm1, mm0
+ psubusb mm0, mm2
+ psubusb mm2, mm1
+ por mm0, mm2
+ punpcklbw mm0, mm7
+ pmaddwd mm0, mm6
+ movq mm1, mm0
+ psrlq mm0, 32
+ paddd mm0, mm1
+ movd [ebx + 8], mm0
+
+ movd mm0, [ecx + 12]
+ movd mm2, [edx + 4]
+ movq mm1, mm0
+ psubusb mm0, mm2
+ psubusb mm2, mm1
+ por mm0, mm2
+ punpcklbw mm0, mm7
+ pmaddwd mm0, mm6
+ movq mm1, mm0
+ psrlq mm0, 32
+ paddd mm0, mm1
+ movd [ebx + 12], mm0
+
+ movd mm0, [ecx + 16]
+ movd mm2, [edx]
+ movq mm1, mm0
+ psubusb mm0, mm2
+ psubusb mm2, mm1
+ por mm0, mm2
+ punpcklbw mm0, mm7
+ pmaddwd mm0, mm6
+ movq mm1, mm0
+ psrlq mm0, 32
+ paddd mm0, mm1
+ movd [ebx + 16], mm0
+ // Advance: 20 output bytes and one 4-byte pixel of each source row per iteration.
+ add ebx, 20
+ add ecx, 4
+ add edx, 4
+ dec esi
+ jne xloop
+
+ emms
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret // __cdecl: plain ret, the caller removes the arguments
+ }
+ }
+#endif
+
+ void nela_X8R8G8B8_scalar(uint32 *dst, const uint32 *elabuf, const uint8 *srca, const uint8 *srcb, int w4) {
+     // Selection pass for 32-bit pixels. Each position owns five scores in
+     // elabuf; the scores of three consecutive positions are combined with
+     // weights 1-2-1 and the best direction decides a byte shift of 0, +-4 or
+     // +-8. The output pixel is the per-byte average of the top pixel at
+     // +shift and the bottom pixel at -shift.
+     do {
+         const uint32 *prev = elabuf;
+         const uint32 *cur  = elabuf + 5;   // weighted twice
+         const uint32 *next = elabuf + 10;
+
+         int best = cur[2]*2 + (prev[2] + next[2]);
+         int shift = 0;
+
+         const int left1 = cur[1]*2 + (prev[1] + next[1]);
+         if (left1 < best) {
+             best = left1;
+             shift = -4;
+
+             const int left2 = cur[0]*2 + (prev[0] + next[0]);
+             if (left2 < best) {
+                 best = left2;
+                 shift = -8;
+             }
+         }
+
+         const int right1 = cur[3]*2 + (prev[3] + next[3]);
+         if (right1 < best) {
+             best = right1;
+             shift = 4;
+
+             const int right2 = cur[4]*2 + (prev[4] + next[4]);
+             if (right2 < best)
+                 shift = 8;
+         }
+
+         elabuf += 5;
+
+         const uint32 a = *(const uint32 *)(srca + shift);
+         const uint32 b = *(const uint32 *)(srcb - shift);
+
+         // Four byte-wise averages in one 32-bit operation: masking with 0xfe
+         // per byte keeps the shift from leaking bits between channels.
+         *dst++ = (a|b) - (((a^b) & 0xfefefefe) >> 1);
+
+         srca += 4;
+         srcb += 4;
+     } while(--w4);
+ }
+
+ void BlendScanLine_NELA_X8R8G8B8_scalar(void *dst, const void *srcT, const void *srcB, uint32 w, void *tempBuf) {
+     // Interpolates one line of 32-bit pixels from the line above (srcT) and
+     // below (srcB) with the scalar kernels. (w + 3) >> 2 pixels are processed.
+     // tempBuf holds the score buffer (5 dwords per pixel) followed by padded
+     // copies of both source lines (4 pad pixels on each side).
+     const uint32 *topSrc = (const uint32 *)srcT;
+     const uint32 *botSrc = (const uint32 *)srcB;
+     const uint32 pixels = (w + 3) >> 2;
+
+     uint32 *scores = (uint32 *)tempBuf;
+     uint32 *topLine = scores + 5*pixels;
+     uint32 *botLine = topLine + pixels + 8;
+
+     // Left padding: four copies of the first pixel.
+     for(uint32 i = 0; i < 4; ++i) {
+         topLine[i] = topSrc[0];
+         botLine[i] = botSrc[0];
+     }
+
+     for(uint32 x = 0; x < pixels; ++x) {
+         topLine[x+4] = topSrc[x];
+         botLine[x+4] = botSrc[x];
+     }
+
+     // Right padding: four copies of the last pixel.
+     for(uint32 i = 4; i < 8; ++i) {
+         topLine[pixels+i] = topLine[pixels+3];
+         botLine[pixels+i] = botLine[pixels+3];
+     }
+
+     ela_X8R8G8B8_scalar(scores, (const uint8 *)topLine, (const uint8 *)botLine, pixels);
+     nela_X8R8G8B8_scalar((uint32 *)dst, scores, (const uint8 *)(topLine + 4), (const uint8 *)(botLine + 4), pixels);
+ }
+
+#if defined(VD_CPU_X86)
+ void BlendScanLine_NELA_X8R8G8B8_MMX(void *dst, const void *srcT, const void *srcB, uint32 w, void *tempBuf) {
+     // Same line set-up as the scalar 32-bit path, but scoring is done by the
+     // MMX kernel; the selection pass stays scalar. (w + 3) >> 2 pixels are
+     // processed. tempBuf holds the score buffer (5 dwords per pixel) followed
+     // by padded copies of both source lines (4 pad pixels on each side).
+     const uint32 *topSrc = (const uint32 *)srcT;
+     const uint32 *botSrc = (const uint32 *)srcB;
+     const uint32 pixels = (w + 3) >> 2;
+
+     uint32 *scores = (uint32 *)tempBuf;
+     uint32 *topLine = scores + 5*pixels;
+     uint32 *botLine = topLine + pixels + 8;
+
+     // Left padding: four copies of the first pixel.
+     for(uint32 i = 0; i < 4; ++i) {
+         topLine[i] = topSrc[0];
+         botLine[i] = botSrc[0];
+     }
+
+     for(uint32 x = 0; x < pixels; ++x) {
+         topLine[x+4] = topSrc[x];
+         botLine[x+4] = botSrc[x];
+     }
+
+     // Right padding: four copies of the last pixel.
+     for(uint32 i = 4; i < 8; ++i) {
+         topLine[pixels+i] = topLine[pixels+3];
+         botLine[pixels+i] = botLine[pixels+3];
+     }
+
+     ela_X8R8G8B8_MMX(scores, (const uint8 *)topLine, (const uint8 *)botLine, pixels);
+     nela_X8R8G8B8_scalar((uint32 *)dst, scores, (const uint8 *)(topLine + 4), (const uint8 *)(botLine + 4), pixels);
+ }
+
+ void BlendScanLine_NELA_MMX_ISSE(void *dst, const void *srcT, const void *srcB, uint32 w, void *tempBuf) {
+     // Interpolates one scan line of w bytes from the line above (srcT) and the
+     // line below (srcB) using the MMX or ISSE kernels (chosen by ISSE_enabled).
+     //
+     // tempBuf is carved into the score buffer (40 dwords per 16-byte group)
+     // followed by padded copies of both source lines (16 pad bytes each side).
+     // The source rows are read in whole dwords up to the width rounded to 16.
+     //
+     // An empty line is skipped: the kernels loop with dec/jnz and must never be
+     // entered with a zero count.
+     if (!w)
+         return;
+
+     const uint32 *srcat = (const uint32 *)srcT;
+     const uint32 *srcab = (const uint32 *)srcB;
+     uint32 w16 = (w + 15) >> 4;
+     uint32 w4 = w16 * 4;
+     uint32 *elabuf = (uint32 *)tempBuf;
+     uint32 *topbuf = elabuf + 40*w16;
+     uint32 *botbuf = topbuf + w4 + 8;
+
+     uint32 woffset = w & 15;
+
+     // Left padding: the first byte of each row replicated across 16 bytes.
+     topbuf[0] = topbuf[1] = topbuf[2] = topbuf[3] = (srcat[0] & 0xff) * 0x01010101;
+     botbuf[0] = botbuf[1] = botbuf[2] = botbuf[3] = (srcab[0] & 0xff) * 0x01010101;
+
+     for(uint32 x=0; x<w4; ++x) {
+         topbuf[x+4] = srcat[x];
+         botbuf[x+4] = srcab[x];
+     }
+
+     // If w is not a multiple of 16, overwrite the bytes past the last real
+     // pixel with that pixel's value.
+     if (woffset) {
+         uint8 *topfinal = (uint8 *)&topbuf[w4] + woffset;
+         uint8 *botfinal = (uint8 *)&botbuf[w4] + woffset;
+         const uint8 tv = topfinal[-1];
+         const uint8 bv = botfinal[-1];
+
+         for(uint32 i = woffset; i < 16; ++i) {
+             *topfinal++ = tv;
+             *botfinal++ = bv;
+         }
+     }
+
+     // Right padding: replicate the last dword of EACH row. (Previously the
+     // second statement stored the bottom value into topbuf[w4+5..w4+7] and left
+     // botbuf[w4+5..w4+7] unset.)
+     topbuf[w4+4] = topbuf[w4+5] = topbuf[w4+6] = topbuf[w4+7] = topbuf[w4+3];
+     botbuf[w4+4] = botbuf[w4+5] = botbuf[w4+6] = botbuf[w4+7] = botbuf[w4+3];
+
+     if (ISSE_enabled) {
+         ela_L8_ISSE(elabuf, topbuf, botbuf, w16);
+         nela_L8_ISSE(dst, elabuf, w16);
+     } else {
+         ela_L8_MMX(elabuf, topbuf, botbuf, w16);
+         nela_L8_MMX(dst, elabuf, w16);
+     }
+ }
+#endif
+
+ void BlendScanLine_NELA_SSE2(void *dst, const void *srcT, const void *srcB, uint32 w, __m128i *tempBuf) {
+     // Interpolates one scan line of w bytes from the line above (srcT) and the
+     // line below (srcB) using the SSE2 kernels. All buffers are accessed as
+     // whole __m128i vectors. tempBuf holds 10 score vectors per 16-byte group
+     // followed by padded copies of both source lines (one pad vector on each
+     // side).
+     const __m128i *topSrc = (const __m128i *)srcT;
+     const __m128i *botSrc = (const __m128i *)srcB;
+     const uint32 vecCount = (w + 15) >> 4;
+     const uint32 tail = w & 15;
+
+     __m128i *scores = tempBuf;
+     __m128i *topLine = scores + 10*vecCount;
+     __m128i *botLine = topLine + vecCount + 2;
+
+     // Left pad vector is a copy of the first source vector.
+     topLine[0] = topSrc[0];
+     botLine[0] = botSrc[0];
+
+     for(uint32 i = 1; i <= vecCount; ++i) {
+         topLine[i] = topSrc[i-1];
+         botLine[i] = botSrc[i-1];
+     }
+
+     // If w is not a multiple of 16, overwrite the bytes past the last real
+     // pixel in the final vector with that pixel's value.
+     if (tail) {
+         uint8 *topFill = (uint8 *)&topLine[vecCount] + tail;
+         uint8 *botFill = (uint8 *)&botLine[vecCount] + tail;
+         const uint8 topLast = topFill[-1];
+         const uint8 botLast = botFill[-1];
+
+         for(uint32 k = tail; k < 16; ++k) {
+             *topFill++ = topLast;
+             *botFill++ = botLast;
+         }
+     }
+
+     // Right pad vector is a copy of the last data vector.
+     topLine[vecCount+1] = topLine[vecCount];
+     botLine[vecCount+1] = botLine[vecCount];
+
+     ela_L8_SSE2(scores, topLine, botLine, vecCount);
+     nela_L8_SSE2((__m128i *)dst, scores, vecCount);
+ }
+
+ void InterpPlane_NELA_X8R8G8B8(void *dst, ptrdiff_t dstpitch, const void *src, ptrdiff_t srcpitch, uint32 w, uint32 h, bool interpField2) {
+     // Rebuilds every other row of a 32-bit plane by ELA interpolation between
+     // the source rows directly above and below it.
+     //   interpField2 == false : rows 2, 4, ... are interpolated; row 0 is copied
+     //   interpField2 == true  : rows 1, 3, ... are interpolated; row h-1 is copied
+     // Only rows that have both neighbours (y + 1 < h) are interpolated. Rows of
+     // the other field are not written here.
+     // NOTE(review): w appears to be a byte count (the line kernels derive the
+     // pixel count as (w + 3) >> 2) - confirm against the callers.
+     //
+     // Empty planes are rejected up front: h is unsigned, so "h - 1" would wrap
+     // for h == 0, and the line kernels must not be entered with a zero width.
+     if (!w || !h)
+         return;
+
+     const uint32 w16 = (w + 15) >> 4;
+     vdfastvector<uint8, vdaligned_alloc<uint8> > tempbuf((12 * w16 + 4) * 16);
+     void *elabuf = tempbuf.data();
+
+     if (!interpField2)
+         memcpy(dst, src, w16 << 4);
+
+     const uint32 y0 = interpField2 ? 1 : 2;
+     for(uint32 y = y0; y + 1 < h; y += 2) {
+         const __m128i *srcat = (const __m128i *)((const char *)src + srcpitch * (y-1));
+         const __m128i *srcab = (const __m128i *)((const char *)src + srcpitch * (y+1));
+
+#if defined(VD_CPU_X86)
+         if (MMX_enabled)
+             BlendScanLine_NELA_X8R8G8B8_MMX((char *)dst + dstpitch*y, srcat, srcab, w, (uint8 *)elabuf);
+         else
+#endif
+             BlendScanLine_NELA_X8R8G8B8_scalar((char *)dst + dstpitch*y, srcat, srcab, w, (uint8 *)elabuf);
+     }
+
+     if (interpField2)
+         memcpy((char *)dst + dstpitch*(h - 1), (const char *)src + srcpitch*(h - 1), w16 << 4);
+ }
+
+ void InterpPlane_NELA(void *dst, ptrdiff_t dstpitch, const void *src, ptrdiff_t srcpitch, uint32 w, uint32 h, bool interpField2) {
+     // Rebuilds every other row of an 8-bit plane by ELA interpolation between
+     // the source rows directly above and below it, using the fastest available
+     // line routine (SSE2, then MMX/ISSE on x86, then scalar).
+     //   interpField2 == false : rows 2, 4, ... are interpolated; row 0 is copied
+     //   interpField2 == true  : rows 1, 3, ... are interpolated; row h-1 is copied
+     // Only rows that have both neighbours (y + 1 < h) are interpolated. Rows of
+     // the other field are not written here.
+     //
+     // Empty planes are rejected up front: h is unsigned, so "h - 1" would wrap
+     // for h == 0, and the line kernels must not be entered with a zero width.
+     if (!w || !h)
+         return;
+
+     const uint32 w16 = (w + 15) >> 4;
+     vdfastvector<uint8, vdaligned_alloc<uint8> > tempbuf((12 * w16 + 4) * 16);
+     void *elabuf = tempbuf.data();
+
+     if (!interpField2)
+         memcpy(dst, src, w16 << 4);
+
+     const uint32 y0 = interpField2 ? 1 : 2;
+     if (SSE2_enabled) {
+         for(uint32 y = y0; y + 1 < h; y += 2) {
+             const __m128i *srcat = (const __m128i *)((const char *)src + srcpitch * (y-1));
+             const __m128i *srcab = (const __m128i *)((const char *)src + srcpitch * (y+1));
+
+             BlendScanLine_NELA_SSE2((char *)dst + dstpitch*y, srcat, srcab, w, (__m128i *)elabuf);
+         }
+     }
+#if defined(VD_CPU_X86)
+     else if (MMX_enabled || ISSE_enabled) {
+         for(uint32 y = y0; y + 1 < h; y += 2) {
+             const __m128i *srcat = (const __m128i *)((const char *)src + srcpitch * (y-1));
+             const __m128i *srcab = (const __m128i *)((const char *)src + srcpitch * (y+1));
+
+             BlendScanLine_NELA_MMX_ISSE((char *)dst + dstpitch*y, srcat, srcab, w, (uint8 *)elabuf);
+         }
+     }
+#endif
+     else {
+         for(uint32 y = y0; y + 1 < h; y += 2) {
+             const __m128i *srcat = (const __m128i *)((const char *)src + srcpitch * (y-1));
+             const __m128i *srcab = (const __m128i *)((const char *)src + srcpitch * (y+1));
+
+             BlendScanLine_NELA_scalar((char *)dst + dstpitch*y, srcat, srcab, w, (uint8 *)elabuf);
+         }
+     }
+
+     if (interpField2)
+         memcpy((char *)dst + dstpitch*(h - 1), (const char *)src + srcpitch*(h - 1), w16 << 4);
+ }
void Average_scalar(void *dst, ptrdiff_t dstPitch, const void *src1, const void *src2, ptrdiff_t srcPitch, uint32 w16, uint32 h) {
uint32 w4 = w16 << 2;
@@ -513,9 +1941,18 @@ mainRowLoop:
}
}
-void DeinterlaceBlend(BYTE* dst, BYTE* src, DWORD w, DWORD h, DWORD dstpitch, DWORD srcpitch)
+void DeinterlaceELA_X8R8G8B8(BYTE* dst, BYTE* src, DWORD w, DWORD h, DWORD dstpitch, DWORD srcpitch, bool topfield) // Public entry point: forwards to InterpPlane_NELA_X8R8G8B8 with the field flag inverted.
{
- BlendPlane(dst, dstpitch, src, srcpitch, w, h);
+ topfield = !topfield; // the callee receives the negated flag; it appears to treat it as interpField2 - confirm against its signature
+
+ InterpPlane_NELA_X8R8G8B8(dst, dstpitch, src, srcpitch, w, h, topfield); // note the argument order: each pitch follows its buffer, then w and h
+}
+
+void DeinterlaceELA(BYTE* dst, BYTE* src, DWORD w, DWORD h, DWORD dstpitch, DWORD srcpitch, bool topfield) // Public entry point: forwards to InterpPlane_NELA with the field flag inverted.
+{
+ topfield = !topfield; // InterpPlane_NELA's last parameter is interpField2, so topfield == true makes it interpolate rows 2,4,... and false makes it interpolate rows 1,3,...
+
+ InterpPlane_NELA(dst, dstpitch, src, srcpitch, w, h, topfield); // the DWORD pitches convert to ptrdiff_t here; NOTE(review): a negative pitch cannot be expressed through this signature
}
void DeinterlaceBob(BYTE* dst, BYTE* src, DWORD w, DWORD h, DWORD dstpitch, DWORD srcpitch, bool topfield)
@@ -524,3 +1961,8 @@ void DeinterlaceBob(BYTE* dst, BYTE* src, DWORD w, DWORD h, DWORD dstpitch, DWOR
InterpPlane_Bob(dst, dstpitch, src, srcpitch, w, h, topfield);
}
+
+void DeinterlaceBlend(BYTE* dst, BYTE* src, DWORD w, DWORD h, DWORD dstpitch, DWORD srcpitch) // Public entry point: thin wrapper around BlendPlane.
+{
+ BlendPlane(dst, dstpitch, src, srcpitch, w, h); // arguments are reordered so each pitch follows its buffer, then w and h
+}
diff --git a/src/DSUtil/vd.cpp b/src/DSUtil/vd.cpp
index 0d7f77aaa..2d6c11c98 100644
--- a/src/DSUtil/vd.cpp
+++ b/src/DSUtil/vd.cpp
@@ -310,10 +310,6 @@ bool BitBltFromI420ToYUY2Interlaced(int w, int h, BYTE* dst, int dstpitch, BYTE*
yv12_yuy2_sse2_interlaced(srcy, srcu, srcv, srcpitch/2, w/2, h, dst, dstpitch);
return(true);
}
- else
- {
- ASSERT(FALSE);
- }
if((g_cpuid.m_flags & CCpuID::mmx) && !(w&7))
{
diff --git a/src/DSUtil/vd.h b/src/DSUtil/vd.h
index 0db586cec..ae167a841 100644
--- a/src/DSUtil/vd.h
+++ b/src/DSUtil/vd.h
@@ -35,4 +35,6 @@ extern bool BitBltFromYUY2ToRGB(int w, int h, BYTE* dst, int dstpitch, int dbpp,
extern bool BitBltFromRGBToRGB(int w, int h, BYTE* dst, int dstpitch, int dbpp, BYTE* src, int srcpitch, int sbpp);
extern void DeinterlaceBlend(BYTE* dst, BYTE* src, DWORD rowbytes, DWORD h, DWORD dstpitch, DWORD srcpitch);
-extern void DeinterlaceBob(BYTE* dst, BYTE* src, DWORD rowbytes, DWORD h, DWORD dstpitch, DWORD srcpitch, bool topfield); \ No newline at end of file
+extern void DeinterlaceBob(BYTE* dst, BYTE* src, DWORD rowbytes, DWORD h, DWORD dstpitch, DWORD srcpitch, bool topfield); // defined in deinterlace.cpp, where the third parameter is named w rather than rowbytes
+extern void DeinterlaceELA_X8R8G8B8(BYTE* dst, BYTE* src, DWORD w, DWORD h, DWORD dstpitch, DWORD srcpitch, bool topfield); // defined in deinterlace.cpp; forwards to InterpPlane_NELA_X8R8G8B8 with topfield negated
+extern void DeinterlaceELA(BYTE* dst, BYTE* src, DWORD w, DWORD h, DWORD dstpitch, DWORD srcpitch, bool topfield); // defined in deinterlace.cpp; forwards to InterpPlane_NELA with topfield negated