//	VirtualDub - Video processing and capture application
//	Copyright (C) 1998-2001 Avery Lee
//
//	This program is free software; you can redistribute it and/or modify
//	it under the terms of the GNU General Public License as published by
//	the Free Software Foundation; either version 2 of the License, or
//	(at your option) any later version.
//
//	This program is distributed in the hope that it will be useful,
//	but WITHOUT ANY WARRANTY; without even the implied warranty of
//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//	GNU General Public License for more details.
//
//	You should have received a copy of the GNU General Public License
//	along with this program; if not, write to the Free Software
//	Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

#include "stdafx.h"


void AvgLines8(BYTE* dst, DWORD h, DWORD pitch)
{
	if(h <= 1)
		return;

	BYTE* s = dst;
	BYTE* d = dst + (h-2)*pitch;

	for(; s < d; s += pitch*2)
	{
		BYTE* tmp = s;

#ifndef _WIN64
		if((g_cpuid.m_flags & CCpuID::sse2) && !((DWORD)tmp&0xf) && !((DWORD)pitch&0xf))
		{
			__asm
			{
				mov		esi, tmp
				mov		ebx, pitch

				mov		ecx, ebx
				shr		ecx, 4

AvgLines8_sse2_loop:
				movdqa	xmm0, [esi]
				pavgb	xmm0, [esi+ebx*2]
				movdqa	[esi+ebx], xmm0
				add		esi, 16

				dec		ecx
				jnz		AvgLines8_sse2_loop

				mov		tmp, esi
			}

			for(ptrdiff_t i = pitch&7; i--; tmp++)
			{
				tmp[pitch] = (tmp[0] + tmp[pitch<<1] + 1) >> 1;
			}
		}
		else if(g_cpuid.m_flags & CCpuID::mmx)
		{
			__asm
			{
				mov		esi, tmp
				mov		ebx, pitch

				mov		ecx, ebx
				shr		ecx, 3

				pxor	mm7, mm7
AvgLines8_mmx_loop:
				movq	mm0, [esi]
				movq	mm1, mm0

				punpcklbw	mm0, mm7
				punpckhbw	mm1, mm7

				movq	mm2, [esi+ebx*2]
				movq	mm3, mm2

				punpcklbw	mm2, mm7
				punpckhbw	mm3, mm7

				paddw	mm0, mm2
				psrlw	mm0, 1

				paddw	mm1, mm3
				psrlw	mm1, 1

				packuswb	mm0, mm1

				movq	[esi+ebx], mm0

				lea		esi, [esi+8]

				dec		ecx
				jnz		AvgLines8_mmx_loop

				mov		tmp, esi
			}

			for(ptrdiff_t i = pitch&7; i--; tmp++)
			{
				tmp[pitch] = (tmp[0] + tmp[pitch<<1] + 1) >> 1;
			}
		}
		else
#endif
		{
			for(ptrdiff_t i = pitch; i--; tmp++)
			{
				tmp[pitch] = (tmp[0] + tmp[pitch<<1] + 1) >> 1;
			}
		}
	}

	if(!(h&1) && h >= 2)
	{
		dst += (h-2)*pitch;
		memcpy(dst + pitch, dst, pitch);
	}

#ifndef _WIN64
	__asm emms;
#endif
}

void AvgLines555(BYTE* dst, DWORD h, DWORD pitch)
{
	if(h <= 1)
		return;

	unsigned __int64 __0x03e003e003e003e0 = 0x03e003e003e003e0;
	unsigned __int64 __0x001f001f001f001f = 0x001f001f001f001f;

	BYTE* s = dst;
	BYTE* d = dst + (h-2)*pitch;

	for(; s < d; s += pitch*2)
	{
		BYTE* tmp = s;

#ifndef _WIN64
		__asm
		{
			mov		esi, tmp
			mov		ebx, pitch

			mov		ecx, ebx
			shr		ecx, 3

			movq	mm6, __0x03e003e003e003e0
			movq	mm7, __0x001f001f001f001f

AvgLines555_loop:
			movq	mm0, [esi]
			movq	mm1, mm0
			movq	mm2, mm0

			psrlw	mm0, 10				// red1 bits: mm0 = 001f001f001f001f
			pand	mm1, mm6			// green1 bits: mm1 = 03e003e003e003e0
			pand	mm2, mm7			// blue1 bits: mm2 = 001f001f001f001f

			movq	mm3, [esi+ebx*2]
			movq	mm4, mm3
			movq	mm5, mm3

			psrlw	mm3, 10				// red2 bits: mm3 = 001f001f001f001f
			pand	mm4, mm6			// green2 bits: mm4 = 03e003e003e003e0
			pand	mm5, mm7			// blue2 bits: mm5 = 001f001f001f001f

			paddw	mm0, mm3
			psrlw	mm0, 1				// (red1+red2)/2
			psllw	mm0, 10				// red bits at 7c007c007c007c00

			paddw	mm1, mm4
			psrlw	mm1, 1				// (green1+green2)/2
			pand	mm1, mm6			// green bits at 03e003e003e003e0

			paddw	mm2, mm5
			psrlw	mm2, 1				// (blue1+blue2)/2
			// blue bits at 001f001f001f001f (no need to pand, lower bits were discareded)

			por		mm0, mm1
			por		mm0, mm2

			movq	[esi+ebx], mm0

			lea		esi, [esi+8]

			dec		ecx
			jnz		AvgLines555_loop

			mov		tmp, esi
		}
#endif

		for(ptrdiff_t i = (pitch&7)>>1; i--; tmp++)
		{
			tmp[pitch] =
				((((*tmp&0x7c00) + (tmp[pitch<<1]&0x7c00)) >> 1)&0x7c00)|
				((((*tmp&0x03e0) + (tmp[pitch<<1]&0x03e0)) >> 1)&0x03e0)|
				((((*tmp&0x001f) + (tmp[pitch<<1]&0x001f)) >> 1)&0x001f);
		}
	}

	if(!(h&1) && h >= 2)
	{
		dst += (h-2)*pitch;
		memcpy(dst + pitch, dst, pitch);
	}

#ifndef _WIN64
	__asm emms;
#endif
}

void AvgLines565(BYTE* dst, DWORD h, DWORD pitch)
{
	if(h <= 1)
		return;

	unsigned __int64 __0x07e007e007e007e0 = 0x07e007e007e007e0;
	unsigned __int64 __0x001f001f001f001f = 0x001f001f001f001f;

	BYTE* s = dst;
	BYTE* d = dst + (h-2)*pitch;

	for(; s < d; s += pitch*2)
	{
		WORD* tmp = (WORD*)s;

#ifndef _WIN64
		__asm
		{
			mov		esi, tmp
			mov		ebx, pitch

			mov		ecx, ebx
			shr		ecx, 3

			movq	mm6, __0x07e007e007e007e0
			movq	mm7, __0x001f001f001f001f

AvgLines565_loop:
			movq	mm0, [esi]
			movq	mm1, mm0
			movq	mm2, mm0

			psrlw	mm0, 11				// red1 bits: mm0 = 001f001f001f001f
			pand	mm1, mm6			// green1 bits: mm1 = 07e007e007e007e0
			pand	mm2, mm7			// blue1 bits: mm2 = 001f001f001f001f

			movq	mm3, [esi+ebx*2]
			movq	mm4, mm3
			movq	mm5, mm3

			psrlw	mm3, 11				// red2 bits: mm3 = 001f001f001f001f
			pand	mm4, mm6			// green2 bits: mm4 = 07e007e007e007e0
			pand	mm5, mm7			// blue2 bits: mm5 = 001f001f001f001f

			paddw	mm0, mm3
			psrlw	mm0, 1				// (red1+red2)/2
			psllw	mm0, 11				// red bits at f800f800f800f800

			paddw	mm1, mm4
			psrlw	mm1, 1				// (green1+green2)/2
			pand	mm1, mm6			// green bits at 03e003e003e003e0

			paddw	mm2, mm5
			psrlw	mm2, 1				// (blue1+blue2)/2
			// blue bits at 001f001f001f001f (no need to pand, lower bits were discareded)

			por		mm0, mm1
			por		mm0, mm2

			movq	[esi+ebx], mm0

			lea		esi, [esi+8]

			dec		ecx
			jnz		AvgLines565_loop

			mov		tmp, esi
		}
#else
		for(ptrdiff_t wd=(pitch>>3); wd--; tmp++)
		{
			tmp[0] =
				((((*tmp&0xf800) + (tmp[pitch<<1]&0xf800)) >> 1)&0xf800)|
				((((*tmp&0x07e0) + (tmp[pitch<<1]&0x07e0)) >> 1)&0x07e0)|
				((((*tmp&0x001f) + (tmp[pitch<<1]&0x001f)) >> 1)&0x001f);
		}
#endif

		for(ptrdiff_t i = (pitch&7)>>1; i--; tmp++)
		{
			tmp[pitch] =
				((((*tmp&0xf800) + (tmp[pitch<<1]&0xf800)) >> 1)&0xf800)|
				((((*tmp&0x07e0) + (tmp[pitch<<1]&0x07e0)) >> 1)&0x07e0)|
				((((*tmp&0x001f) + (tmp[pitch<<1]&0x001f)) >> 1)&0x001f);
		}
	}

	if(!(h&1) && h >= 2)
	{
		dst += (h-2)*pitch;
		memcpy(dst + pitch, dst, pitch);
	}

#ifndef _WIN64
	__asm emms;
#endif
}