Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/mpc-hc/mpc-hc.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorkinddragon <kinddragon@users.sourceforge.net>2010-05-21 04:53:52 +0400
committerkinddragon <kinddragon@users.sourceforge.net>2010-05-21 04:53:52 +0400
commit37f62abd654047d060c86d6c76cd2f6862f89b94 (patch)
tree83eb125bd86f8a685928e290e2ec929ce633bc53 /src/thirdparty/VirtualDub/Kasumi/source
parentdae6425e0c23576dac77c3afae1dc6de22f983d5 (diff)
DSUtil now use new VirtualDub libraries (SSE2 deinterlacing for MPEG2 decoder)
AudioSwitcher rare memory corruption fixed git-svn-id: https://mpc-hc.svn.sourceforge.net/svnroot/mpc-hc/trunk@1907 10f7b99b-c216-0410-bff0-8a66a9350fd8
Diffstat (limited to 'src/thirdparty/VirtualDub/Kasumi/source')
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/a64_resample.asm64620
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/a_bltrgb.asm812
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/a_bltrgb2yuv_mmx.asm652
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/a_bltrgb_mmx.asm806
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/a_bltyuv2rgb_sse2.asm161
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/a_resample_mmx.asm1559
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/a_resample_sse41.asm358
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/a_spanutils_isse.asm193
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/a_stretchrgb_mmx.asm326
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/a_stretchrgb_point.asm96
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/a_triblt.inc24
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/a_triblt_mmx.asm425
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/a_triblt_scalar.asm36
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/a_triblt_sse2.asm197
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/alphablt.cpp76
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/blt.cpp273
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/blt_reference.cpp259
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/blt_reference_pal.cpp545
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/blt_reference_rgb.cpp310
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/blt_reference_yuv.cpp1590
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/blt_reference_yuv2yuv.cpp260
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/blt_reference_yuvrev.cpp530
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/blt_setup.cpp17
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/blt_spanutils.cpp365
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/blt_spanutils_x86.cpp170
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/blt_uberblit.cpp19
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/blt_x86.cpp144
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/pixel.cpp667
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/pixmaputils.cpp519
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/region.cpp1334
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/resample.cpp348
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/resample_kernels.cpp255
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/resample_stages.cpp149
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/resample_stages_reference.cpp425
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/resample_stages_x64.cpp26
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/resample_stages_x86.cpp1277
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/stretchblt_reference.cpp816
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/tables.cpp204
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/triblt.cpp1717
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/uberblit.cpp903
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/uberblit_16f.cpp40
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/uberblit_gen.cpp1597
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/uberblit_resample.cpp623
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/uberblit_resample_special.cpp186
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/uberblit_resample_special_x86.cpp35
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/uberblit_swizzle.cpp89
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/uberblit_swizzle_x86.cpp400
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/uberblit_v210.cpp199
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/uberblit_ycbcr_x86.cpp35
49 files changed, 22667 insertions, 0 deletions
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/a64_resample.asm64 b/src/thirdparty/VirtualDub/Kasumi/source/a64_resample.asm64
new file mode 100644
index 000000000..e6de1eabf
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/a64_resample.asm64
@@ -0,0 +1,620 @@
+; VirtualDub - Video processing and capture application
+; Graphics support library
+; Copyright (C) 1998-2004 Avery Lee
+;
+; This program is free software; you can redistribute it and/or modify
+; it under the terms of the GNU General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or
+; (at your option) any later version.
+;
+; This program is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; GNU General Public License for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with this program; if not, write to the Free Software
+; Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+;
+
+ default rel
+
+ segment .rdata, align=16
+
+ align 16
+roundval dq 0000200000002000h, 0000200000002000h
+
+
+ segment .text
+
+
+%macro VDSAVE 1-*
+
+ %rep %0
+ %rotate -1
+ push %1
+ [pushreg %1]
+ %endrep
+
+%endmacro
+
+%macro VDRESTORE 1-*
+
+ %rep %0
+ pop %1
+
+ %rotate 1
+ %endrep
+
+%endmacro
+
+%macro VDSAVEXMM128 2
+%assign %%count %2 + 1 - %1
+%assign %%stkoffset 0
+%assign %%reg %1
+
+ sub rsp, %%count*16+8
+ [allocstack %%count*16]
+
+ %rep %%count
+ movdqa oword [rsp+%%stkoffset], xmm %+ %%reg
+ [savexmm128 xmm %+ %%reg, %%stkoffset]
+
+ %assign %%stkoffset %%stkoffset + 16
+ %assign %%reg %%reg + 1
+ %endrep
+%endmacro
+
+%macro VDRESTOREXMM128 2
+%assign %%count %2+1-%1
+%assign %%stkoffset %%count*16
+%assign %%reg %2
+
+ %rep %%count
+ %assign %%stkoffset %%stkoffset-16
+ movdqa xmm %+ %%reg, oword [rsp+%%stkoffset]
+
+ %assign %%reg %%reg-1
+ %endrep
+
+ add rsp, %%count*16+8
+%endmacro
+
+;-------------------------------------------------------------------------
+;
+; long vdasm_resize_table_row_SSE2(
+; Pixel *out, // rcx
+; Pixel *in, // rdx
+; int *filter, // r8
+; int filter_width, // r9d
+; PixDim w, // [rsp+40]
+; long accum, // [rsp+48]
+; long frac); // [rsp+56]
+;
+ global vdasm_resize_table_row_SSE2
+proc_frame vdasm_resize_table_row_SSE2
+
+ VDSAVE rbx, rsi, rdi, rbp, r12, r13, r14, r15
+ VDSAVEXMM128 6, 15
+end_prolog
+
+ .parms equ rsp+168+64
+
+ mov r10d, dword [.parms+40]
+ shl r10, 2
+ add rcx, r10
+ neg r10
+ shl r9d, 2 ;filter_width <<= 2
+
+ movaps xmm6, oword [roundval]
+ pxor xmm5, xmm5
+ mov rsi, rdx
+ shr rsi, 2
+
+ mov edi, [.parms+48]
+ mov eax, edi
+ shl edi, 16
+ sar rax, 16
+ add rsi, rax
+ mov ebp, [.parms+56]
+ movsxd r11, ebp
+ shl ebp, 16
+ sar r11, 16
+
+ ;register map
+ ;
+ ;eax temp coefficient pair counter
+ ;rbx temp coefficient pointer
+ ;rcx destination
+ ;rdx temp source
+ ;rsi source/4
+ ;edi accumulator
+ ;ebp fractional increment
+ ;r8 filter
+ ;r9 filter_width*4
+ ;r10 -width*4
+ ;r11 integral increment
+ ;r12
+ ;r13
+ ;r14
+ ;r15
+
+ cmp r9d, 16
+ jz .accel_4coeff
+ cmp r9d, 24
+ jz .accel_6coeff
+
+ test r9d, 8
+ jz .pixelloop_even_pairs
+ cmp r9d, 8
+ jnz .pixelloop_odd_pairs
+
+.pixelloop_single_pairs:
+ mov eax, edi
+ shr eax, 24
+ imul eax, r9d
+
+ lea rdx, [rsi*4]
+
+ movd xmm0, dword [rdx] ;xmm0 = p0
+ movd xmm1, dword [rdx+4] ;xmm1 = p1
+ punpcklbw xmm0, xmm1
+ punpcklbw xmm0, xmm5
+ movq xmm1, qword [r8+rax]
+ pshufd xmm1, xmm1, 01000100b
+ pmaddwd xmm0, xmm1
+
+ movdqa xmm4, xmm6
+ paddd xmm4, xmm0
+
+ psrad xmm4, 14
+ packssdw xmm4, xmm4
+ packuswb xmm4, xmm4
+
+ add edi, ebp
+ adc rsi, r11
+
+ movd dword [rcx+r10], xmm4
+ add r10, 4
+ jnz .pixelloop_single_pairs
+ jmp .xit
+
+.pixelloop_odd_pairs:
+ movdqa xmm4, xmm6
+
+ mov eax, edi
+ shr eax, 24
+ imul eax, r9d
+ lea rbx, [r8+rax]
+
+ lea rdx, [rsi*4]
+ lea rax, [r9-8]
+.coeffloop_odd_pairs:
+ movd xmm0, dword [rdx] ;xmm0 = p0
+ movd xmm1, dword [rdx+4] ;xmm1 = p1
+ movd xmm2, dword [rdx+8] ;xmm2 = p2
+ movd xmm3, dword [rdx+12] ;xmm3 = p3
+ add rdx, 16
+ punpcklbw xmm0, xmm1
+ punpcklbw xmm2, xmm3
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm2, xmm5
+ movq xmm1, qword [rbx]
+ movq xmm3, qword [rbx+8]
+ add rbx, 16
+ pshufd xmm1, xmm1, 01000100b
+ pshufd xmm3, xmm3, 01000100b
+ pmaddwd xmm0, xmm1
+ pmaddwd xmm2, xmm3
+ paddd xmm0, xmm2
+ paddd xmm4, xmm0
+ sub eax, 16
+ jnz .coeffloop_odd_pairs
+
+ movd xmm0, dword [rdx] ;xmm0 = p0
+ movd xmm1, dword [rdx+4] ;xmm1 = p1
+ punpcklbw xmm0, xmm1
+ punpcklbw xmm0, xmm5
+ movq xmm1, qword [rbx]
+ pshufd xmm1, xmm1, 01000100b
+ pmaddwd xmm0, xmm1
+ paddd xmm4, xmm0
+
+ psrad xmm4, 14
+ packssdw xmm4, xmm4
+ packuswb xmm4, xmm4
+
+ add edi, ebp
+ adc rsi, r11
+
+ movd dword [rcx+r10], xmm4
+ add r10, 4
+ jnz .pixelloop_odd_pairs
+ jmp .xit
+
+.pixelloop_even_pairs:
+ movdqa xmm4, xmm6
+
+ mov eax, edi
+ shr eax, 24
+ imul eax, r9d
+ lea rbx, [r8+rax]
+
+ lea rdx, [rsi*4]
+ mov eax, r9d
+.coeffloop_even_pairs:
+ movd xmm0, dword [rdx] ;xmm0 = p0
+ movd xmm1, dword [rdx+4] ;xmm1 = p1
+ movd xmm2, dword [rdx+8] ;xmm2 = p2
+ movd xmm3, dword [rdx+12] ;xmm3 = p3
+ add rdx, 16
+ punpcklbw xmm0, xmm1
+ punpcklbw xmm2, xmm3
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm2, xmm5
+ movq xmm1, qword [rbx]
+ movq xmm3, qword [rbx+8]
+ add rbx, 16
+ pshufd xmm1, xmm1, 01000100b
+ pshufd xmm3, xmm3, 01000100b
+ pmaddwd xmm0, xmm1
+ pmaddwd xmm2, xmm3
+ paddd xmm0, xmm2
+ paddd xmm4, xmm0
+ sub eax, 16
+ jnz .coeffloop_even_pairs
+
+ psrad xmm4, 14
+ packssdw xmm4, xmm4
+ packuswb xmm4, xmm4
+
+ add edi, ebp
+ adc rsi, r11
+
+ movd dword [rcx+r10], xmm4
+ add r10, 4
+ jnz .pixelloop_even_pairs
+
+.xit:
+ VDRESTOREXMM128 6, 15
+ VDRESTORE rbx, rsi, rdi, rbp, r12, r13, r14, r15
+ ret
+
+.accel_4coeff:
+.pixelloop_4coeff:
+ pxor xmm5, xmm5
+ movdqa xmm4, xmm6
+
+ mov eax, 0ff000000h
+ lea rdx, [rsi*4]
+ and eax, edi
+ shr eax, 20
+ lea rbx, [r8+rax]
+
+ movd xmm0, dword [rdx] ;xmm0 = p0
+ movd xmm1, dword [rdx+4] ;xmm1 = p1
+ movd xmm2, dword [rdx+8] ;xmm2 = p2
+ movd xmm3, dword [rdx+12] ;xmm3 = p3
+ punpcklbw xmm0, xmm1
+ punpcklbw xmm2, xmm3
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm2, xmm5
+ movq xmm1, qword [rbx]
+ movq xmm3, qword [rbx+8]
+ pshufd xmm1, xmm1, 01000100b
+ pshufd xmm3, xmm3, 01000100b
+ pmaddwd xmm0, xmm1
+ pmaddwd xmm2, xmm3
+ paddd xmm0, xmm2
+ paddd xmm4, xmm0
+
+ psrad xmm4, 14
+ packssdw xmm4, xmm4
+ packuswb xmm4, xmm4
+
+ add edi, ebp
+ adc rsi, r11
+
+ movd dword [rcx+r10], xmm4
+ add r10, 4
+ jnz .pixelloop_4coeff
+ jmp .xit
+
+.accel_6coeff:
+.pixelloop_6coeff:
+ pxor xmm5, xmm5
+ movdqa xmm4, xmm6
+
+ lea rdx, [rsi*4]
+ mov eax, edi
+ shr eax, 24
+ lea rax, [rax+rax*2]
+ lea rbx, [r8+rax*8]
+
+ movd xmm0, dword [rdx] ;xmm0 = p0
+ movd xmm1, dword [rdx+4] ;xmm1 = p1
+ movd xmm2, dword [rdx+8] ;xmm2 = p2
+ movd xmm3, dword [rdx+12] ;xmm3 = p3
+	movd	xmm8, dword [rdx+16]	;xmm8 = p4
+	movd	xmm9, dword [rdx+20]	;xmm9 = p5
+ punpcklbw xmm0, xmm1
+ punpcklbw xmm2, xmm3
+ punpcklbw xmm8, xmm9
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm8, xmm5
+ movq xmm1, qword [rbx]
+ movq xmm3, qword [rbx+8]
+ movq xmm9, qword [rbx+16]
+ pshufd xmm1, xmm1, 01000100b
+ pshufd xmm3, xmm3, 01000100b
+ pshufd xmm9, xmm9, 01000100b
+ pmaddwd xmm0, xmm1
+ pmaddwd xmm2, xmm3
+ pmaddwd xmm8, xmm9
+ paddd xmm0, xmm2
+ paddd xmm4, xmm0
+ paddd xmm4, xmm8
+
+ psrad xmm4, 14
+ packssdw xmm4, xmm4
+ packuswb xmm4, xmm4
+
+ add edi, ebp
+ adc rsi, r11
+
+ movd dword [rcx+r10], xmm4
+ add r10, 4
+ jnz .pixelloop_6coeff
+ jmp .xit
+endproc_frame
+
+
+;--------------------------------------------------------------------------
+;
+; vdasm_resize_table_col_SSE2(
+; uint32 *dst, // rcx
+; const uint32 *const *srcs, // rdx
+; int *filter, // r8
+; int filter_width, // r9d
+; PixDim w, // [rsp+40] -> r10d
+; );
+;
+ global vdasm_resize_table_col_SSE2
+proc_frame vdasm_resize_table_col_SSE2
+ VDSAVE rbx, rsi, rdi, rbp, r12, r13, r14, r15
+ VDSAVEXMM128 6, 15
+end_prolog
+
+ .parms equ rsp+168+64
+
+ mov r10d, [.parms+40] ;r10d = w
+
+ pxor xmm5, xmm5
+ movdqa xmm4, oword [roundval]
+ xor rbx, rbx ;rbx = source offset
+
+ cmp r9d, 4
+ jz .accel_4coeff
+ cmp r9d, 6
+ jz .accel_6coeff
+
+ shr r9d, 1 ;r9d = filter pair count
+
+.pixelloop:
+ mov rax, rdx ;rax = row pointer table
+ mov rdi, r8 ;rdi = filter
+ mov r11d, r9d ;r11d = filter width counter
+ movdqa xmm2, xmm4
+.coeffloop:
+ mov rsi, [rax]
+
+ movd xmm0, dword [rsi+rbx]
+
+ mov rsi, [rax+8]
+ add rax, 16
+
+ movd xmm1, dword [rsi+rbx]
+ punpcklbw xmm0, xmm1
+
+ punpcklbw xmm0, xmm5
+
+ movq xmm1, qword [rdi]
+ pshufd xmm1, xmm1, 01000100b
+
+ pmaddwd xmm0, xmm1
+
+ paddd xmm2, xmm0
+
+ add rdi,8
+
+ sub r11d,1
+ jne .coeffloop
+
+ psrad xmm2,14
+ packssdw xmm2,xmm2
+ add rbx,4
+ packuswb xmm2,xmm2
+
+ movd dword [rcx],xmm2
+ add rcx,4
+ sub r10d,1
+ jne .pixelloop
+
+.xit:
+ VDRESTOREXMM128 6, 15
+ VDRESTORE rbx, rsi, rdi, rbp, r12, r13, r14, r15
+ ret
+
+.accel_4coeff:
+ mov r12, [rdx]
+ mov r13, [rdx+8]
+ mov r14, [rdx+16]
+ mov r15, [rdx+24]
+ movq xmm8, qword [r8]
+ punpcklqdq xmm8, xmm8
+ movq xmm9, qword [r8+8]
+ punpcklqdq xmm9, xmm9
+
+ sub r10d, 1
+ jc .oddpixel_4coeff
+.pixelloop_4coeff:
+ movq xmm0, qword [r12+rbx]
+ movq xmm1, qword [r13+rbx]
+ movq xmm2, qword [r14+rbx]
+ movq xmm3, qword [r15+rbx]
+
+ punpcklbw xmm0, xmm1
+ punpcklbw xmm2, xmm3
+
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+
+ punpcklbw xmm0, xmm5
+ punpckhbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ punpckhbw xmm3, xmm5
+
+ pmaddwd xmm0, xmm8
+ pmaddwd xmm1, xmm8
+ pmaddwd xmm2, xmm9
+ pmaddwd xmm3, xmm9
+
+ paddd xmm0, xmm4
+ paddd xmm1, xmm4
+ paddd xmm0, xmm2
+ paddd xmm1, xmm3
+
+ psrad xmm0, 14
+ psrad xmm1, 14
+ packssdw xmm0, xmm1
+ packuswb xmm0, xmm0
+
+ movq qword [rcx], xmm0
+ add rcx, 8
+ add rbx, 8
+ sub r10d, 2
+ ja .pixelloop_4coeff
+ jnz .xit
+.oddpixel_4coeff:
+ movd xmm0, dword [r12+rbx]
+ movd xmm1, dword [r13+rbx]
+ movd xmm2, dword [r14+rbx]
+ movd xmm3, dword [r15+rbx]
+
+ punpcklbw xmm0, xmm1
+ punpcklbw xmm2, xmm3
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm2, xmm5
+
+ pmaddwd xmm0, xmm8
+ pmaddwd xmm2, xmm9
+
+ paddd xmm0, xmm4
+ paddd xmm0, xmm2
+
+ psrad xmm0, 14
+ packssdw xmm0, xmm0
+ packuswb xmm0, xmm0
+
+ movd dword [rcx], xmm0
+
+ jmp .xit
+
+.accel_6coeff:
+ mov r12, [rdx]
+ mov r13, [rdx+8]
+ mov r14, [rdx+16]
+ mov r15, [rdx+24]
+ mov rsi, [rdx+32]
+ mov rdx, [rdx+40]
+ movq xmm10, qword [r8]
+ punpcklqdq xmm10, xmm10
+ movq xmm11, qword [r8+8]
+ punpcklqdq xmm11, xmm11
+ movq xmm12, qword [r8+16]
+ punpcklqdq xmm12, xmm12
+
+ sub r10d, 1
+ jc .oddpixel_6coeff
+.pixelloop_6coeff:
+ movq xmm0, qword [r12+rbx]
+ movq xmm1, qword [r13+rbx]
+ movq xmm2, qword [r14+rbx]
+ movq xmm3, qword [r15+rbx]
+ movq xmm8, qword [rsi+rbx]
+ movq xmm9, qword [rdx+rbx]
+
+ punpcklbw xmm0, xmm1
+ punpcklbw xmm2, xmm3
+ punpcklbw xmm8, xmm9
+
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ movdqa xmm9, xmm8
+
+ punpcklbw xmm0, xmm5
+ punpckhbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ punpckhbw xmm3, xmm5
+ punpcklbw xmm8, xmm5
+ punpckhbw xmm9, xmm5
+
+ pmaddwd xmm0, xmm10
+ pmaddwd xmm1, xmm10
+ pmaddwd xmm2, xmm11
+ pmaddwd xmm3, xmm11
+ pmaddwd xmm8, xmm12
+ pmaddwd xmm9, xmm12
+
+ paddd xmm0, xmm4
+ paddd xmm1, xmm4
+ paddd xmm2, xmm8
+ paddd xmm3, xmm9
+ paddd xmm0, xmm2
+ paddd xmm1, xmm3
+
+ psrad xmm0, 14
+ psrad xmm1, 14
+ packssdw xmm0, xmm1
+ packuswb xmm0, xmm0
+
+ movq qword [rcx], xmm0
+ add rcx, 8
+ add rbx, 8
+ sub r10d, 2
+ ja .pixelloop_6coeff
+ jnz .xit
+.oddpixel_6coeff:
+ movd xmm0, dword [r12+rbx]
+ movd xmm1, dword [r13+rbx]
+ movd xmm2, dword [r14+rbx]
+ movd xmm3, dword [r15+rbx]
+ movd xmm8, dword [rsi+rbx]
+ movd xmm9, dword [rdx+rbx]
+
+ punpcklbw xmm0, xmm1
+ punpcklbw xmm2, xmm3
+ punpcklbw xmm8, xmm9
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm8, xmm5
+
+ pmaddwd xmm0, xmm10
+ pmaddwd xmm2, xmm11
+ pmaddwd xmm8, xmm12
+
+ paddd xmm0, xmm4
+ paddd xmm2, xmm8
+ paddd xmm0, xmm2
+
+ psrad xmm0, 14
+ packssdw xmm0, xmm0
+ packuswb xmm0, xmm0
+
+ movd dword [rcx], xmm0
+
+ jmp .xit
+endproc_frame
+
+ end
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/a_bltrgb.asm b/src/thirdparty/VirtualDub/Kasumi/source/a_bltrgb.asm
new file mode 100644
index 000000000..f3503807e
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/a_bltrgb.asm
@@ -0,0 +1,812 @@
+ section .text
+
+ global _vdasm_pixblt_RGB565_to_XRGB1555
+_vdasm_pixblt_RGB565_to_XRGB1555:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov ebp, [esp+20+16]
+ mov edi, [esp+24+16]
+ add ebp, ebp
+ mov edx, [esp+4+16]
+ mov ecx, [esp+12+16]
+ lea edx, [edx+ebp-2]
+ lea ecx, [ecx+ebp-2]
+ neg ebp
+ mov [esp+20+16], ebp
+
+.yloop:
+ mov ebp, [esp+20+16]
+ add ebp, 2
+ jbe .odd
+
+.xloop:
+ mov eax, [ecx+ebp]
+ mov ebx, 0ffc0ffc0h
+
+ and ebx, eax
+ and eax, 0001f001fh
+
+ shr ebx, 1
+
+ add eax, ebx
+
+ mov [edx+ebp], eax
+ add ebp, 4
+
+ jnc .xloop
+ jnz .noodd
+.odd:
+ movzx eax, word [ecx]
+ mov ebx, 0ffc0ffc0h
+ and ebx, eax
+ and eax, 0001f001fh
+ shr ebx, 1
+ add eax, ebx
+ mov [edx], ax
+.noodd:
+ add ecx, [esp+16+16]
+ add edx, [esp+8+16]
+ dec edi
+ jne .yloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+
+ global _vdasm_pixblt_RGB888_to_XRGB1555
+_vdasm_pixblt_RGB888_to_XRGB1555:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov esi,[esp+12+16]
+ mov edi,[esp+4+16]
+
+ mov ebp,[esp+20+16]
+ lea eax,[ebp+ebp]
+ lea ebx,[ebp+eax]
+ sub [esp+8+16],eax
+ sub [esp+16+16],ebx
+
+ mov edx,[esp+24+16]
+.yloop:
+ mov ebp,[esp+20+16]
+ push ebp
+ push edx
+ shr ebp,1
+ jz .checkodd
+.xloop:
+ mov eax,[esi+2] ;u
+ add esi,6 ;v
+
+ mov ebx,eax ;u
+ mov ecx,eax ;v
+ shr ebx,11 ;u
+ and ecx,00f80000h ;v
+ shr eax,17 ;u
+ and ebx,0000001fh ;v
+ shr ecx,14 ;u
+ and eax,00007c00h ;v
+ or ebx,ecx ;u
+ add edi,4 ;v
+ or ebx,eax ;u
+
+ mov ecx,[esi-6] ;v
+ mov edx,ebx ;u
+ mov eax,ecx ;v
+
+ shl edx,16 ;u
+ mov ebx,ecx ;v
+ shr ebx,3 ;u
+ and ecx,0000f800h ;v
+ shr eax,9 ;u
+ and ebx,0000001fh ;v
+ shr ecx,6 ;u
+ and eax,00007c00h ;v
+ or eax,ecx ;u
+ or edx,ebx ;v
+ or edx,eax ;u
+ sub ebp,1 ;v
+ mov [edi-4],edx ;u
+ jne .xloop ;v
+.checkodd:
+ pop edx
+ pop ebp
+ and ebp,1
+ jz .noodd
+ movzx eax,word [esi]
+ movzx ebx,byte [esi+2]
+ shl ebx,16
+ add esi,3
+ add eax,ebx
+
+ mov ebx,eax
+ mov ecx,eax
+ shr ebx,3
+ and ecx,0000f800h
+ shr eax,9
+ and ebx,0000001fh
+ shr ecx,6
+ and eax,00007c00h
+ or ebx,ecx
+ or ebx,eax
+ mov [edi+0],bl
+ mov [edi+1],bh
+ add edi,2
+.noodd:
+
+ add esi,[esp+16+16]
+ add edi,[esp+ 8+16]
+
+ sub edx,1
+ jne .yloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+
+ ret
+
+ global _vdasm_pixblt_XRGB8888_to_XRGB1555
+_vdasm_pixblt_XRGB8888_to_XRGB1555:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov ebp, [esp+20+16]
+ mov edx, [esp+4+16]
+ add ebp, ebp
+ mov ecx, [esp+12+16]
+ lea edx, [edx+ebp-2]
+ lea ecx, [ecx+ebp*2-4]
+ neg ebp
+ mov [esp+20+16], ebp
+
+.yloop:
+ mov ebp, [esp+20+16]
+ add ebp, 2
+ jbe .odd
+
+.xloop:
+ mov eax, [ecx+ebp*2]
+ mov ebx, 00f80000h
+ and ebx, eax
+ mov esi, eax
+ shr ebx, 9
+ and esi, 0000f800h
+ shr esi, 6
+ and eax, 000000f8h
+ shr eax, 3
+ add ebx, esi
+ mov esi, [ecx+ebp*2+4]
+ add eax, ebx
+ mov ebx, esi
+ and esi, 00f80000h
+ shl esi, 7
+ mov edi, ebx
+ and edi, 0000f800h
+ add eax, esi
+ shl edi, 10
+ and ebx, 000000f8h
+ shl ebx, 13
+ add eax, edi
+ add eax, ebx
+ mov [edx+ebp], eax
+ add ebp, 4
+ jnc .xloop
+ jnz .noodd
+.odd:
+ mov eax, [ecx]
+ mov ebx, 00f80000h
+ and ebx, eax
+ mov esi, eax
+ shr ebx, 9
+ and esi, 0000f800h
+ shr esi, 6
+ and eax, 000000f8h
+ shr eax, 3
+ add ebx, esi
+ add eax, ebx
+ mov [edx], ax
+.noodd:
+ add ecx, [esp+16+16]
+ add edx, [esp+8+16]
+ dec dword [esp+24+16]
+ jne .yloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+ global _vdasm_pixblt_XRGB1555_to_RGB565
+_vdasm_pixblt_XRGB1555_to_RGB565:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov ebp, [esp+20+16]
+ mov edi, [esp+24+16]
+ add ebp, ebp
+ mov edx, [esp+4+16]
+ mov ecx, [esp+12+16]
+ lea edx, [edx+ebp-2]
+ lea ecx, [ecx+ebp-2]
+ neg ebp
+ mov [esp+20+16], ebp
+
+.yloop:
+ mov ebp, [esp+20+16]
+ add ebp, 2
+ jbe .odd
+
+.xloop:
+ mov eax, [ecx+ebp]
+ mov ebx, 02000200h
+
+ mov esi, eax
+ and ebx, eax
+
+ shr ebx, 4
+ and esi, 0ffe0ffe0h
+
+ add eax, esi
+
+ add eax, ebx
+
+ mov [edx+ebp], eax
+ add ebp, 4
+
+ jnc .xloop
+ jnz .noodd
+.odd:
+ movzx eax, word [ecx]
+ mov ebx, 02000200h
+ mov esi, eax
+ and ebx, eax
+ shr ebx, 4
+ and esi, 0ffe0ffe0h
+ add eax, esi
+ add eax, ebx
+ mov [edx], ax
+.noodd:
+ add ecx, [esp+16+16]
+ add edx, [esp+8+16]
+ dec edi
+ jne .yloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+ global _vdasm_pixblt_RGB888_to_RGB565
+_vdasm_pixblt_RGB888_to_RGB565:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov esi,[esp+12+16]
+ mov edi,[esp+4+16]
+
+ mov ebp,[esp+20+16]
+ lea eax,[ebp+ebp]
+ lea ebx,[ebp+eax]
+ sub [esp+8+16],eax
+ sub [esp+16+16],ebx
+
+ mov edx,[esp+24+16]
+.yloop:
+ mov ebp,[esp+20+16]
+ push ebp
+ push edx
+ shr ebp,1
+ jz .checkodd
+.xloop:
+ mov eax,[esi+2] ;u
+ add esi,6 ;v
+
+ mov ebx,eax ;u
+ mov ecx,eax ;v
+ shr ebx,11 ;u
+ and ecx,00fc0000h ;v
+ shr eax,16 ;u
+ and ebx,0000001fh ;v
+ shr ecx,13 ;u
+ and eax,0000f800h ;v
+ or ebx,ecx ;u
+ add edi,4 ;v
+ or ebx,eax ;u
+
+ mov ecx,[esi-6] ;v
+ mov edx,ebx ;u
+ mov eax,ecx ;v
+
+ shl edx,16 ;u
+ mov ebx,ecx ;v
+ shr ebx,3 ;u
+ and ecx,0000fc00h ;v
+ shr eax,8 ;u
+ and ebx,0000001fh ;v
+ shr ecx,5 ;u
+ and eax,0000f800h ;v
+ or eax,ecx ;u
+ or edx,ebx ;v
+ or edx,eax ;u
+ sub ebp,1 ;v
+ mov [edi-4],edx ;u
+ jne .xloop ;v
+.checkodd:
+ pop edx
+ pop ebp
+ and ebp,1
+ jz .noodd
+ movzx eax,word [esi]
+ movzx ebx,byte [esi+2]
+ shl ebx,16
+ add esi,3
+ add eax,ebx
+
+ mov ebx,eax
+ mov ecx,eax
+ shr ebx,3
+ and ecx,0000fc00h
+ shr eax,8
+ and ebx,0000001fh
+ shr ecx,5
+ and eax,0000f800h
+ or ebx,ecx
+ or ebx,eax
+ mov [edi+0],bl
+ mov [edi+1],bh
+ add edi,2
+.noodd:
+
+ add esi,[esp+16+16]
+ add edi,[esp+ 8+16]
+
+ sub edx,1
+ jne .yloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+
+ ret
+
+ global _vdasm_pixblt_XRGB8888_to_RGB565
+_vdasm_pixblt_XRGB8888_to_RGB565:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov ebp, [esp+20+16]
+ mov edx, [esp+4+16]
+ add ebp, ebp
+ mov ecx, [esp+12+16]
+ lea edx, [edx+ebp-2]
+ lea ecx, [ecx+ebp*2-4]
+ neg ebp
+ mov [esp+20+16], ebp
+
+.yloop:
+ mov ebp, [esp+20+16]
+ add ebp, 2
+ jbe .odd
+
+.xloop:
+ mov eax, [ecx+ebp*2]
+ mov ebx, 00f80000h
+ and ebx, eax
+ mov esi, eax
+ shr ebx, 8
+ and esi, 0000fc00h
+ shr esi, 5
+ and eax, 000000f8h
+ shr eax, 3
+ add ebx, esi
+ mov esi, [ecx+ebp*2+4]
+ add eax, ebx
+ mov ebx, esi
+ and esi, 00f80000h
+ shl esi, 8
+ mov edi, ebx
+ and edi, 0000fc00h
+ add eax, esi
+ shl edi, 11
+ and ebx, 000000f8h
+ shl ebx, 13
+ add eax, edi
+ add eax, ebx
+ mov [edx+ebp], eax
+ add ebp, 4
+ jnc .xloop
+ jnz .noodd
+.odd:
+ mov eax, [ecx]
+ mov ebx, 00f80000h
+ and ebx, eax
+ mov esi, eax
+ shr ebx, 8
+ and esi, 0000fc00h
+ shr esi, 5
+ and eax, 000000f8h
+ shr eax, 3
+ add ebx, esi
+ add eax, ebx
+ mov [edx], ax
+.noodd:
+ add ecx, [esp+16+16]
+ add edx, [esp+8+16]
+ dec dword [esp+24+16]
+ jne .yloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+
+ global _vdasm_pixblt_XRGB8888_to_RGB888
+_vdasm_pixblt_XRGB8888_to_RGB888:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov esi,[esp+12+16]
+ mov edi,[esp+4+16]
+
+ mov ecx,[esp+20+16]
+ lea eax,[ecx+ecx*2]
+ lea ebx,[ecx*4]
+ sub [esp+8+16],eax
+ sub [esp+16+16],ebx
+
+ mov edx,[esp+24+16]
+.yloop:
+ mov ecx,[esp+20+16]
+ push ecx
+ push edx
+ shr ecx,2
+ jz .checkodd
+.xloop:
+ mov eax,[esi] ;EAX = xxr0g0b0
+ mov ebx,[esi+4] ;EBX = xxr1g1b1
+ mov edx,ebx ;EDX = xxr1g1b1
+ mov ebp,[esi+8] ;EBP = xxr2g2b2
+ shl ebx,24 ;EBX = b1000000
+ and eax,00ffffffh ;EAX = 00r0g0b0
+ shr edx,8 ;EDX = 00xxr1g1
+ or eax,ebx ;EAX = b1r0g0b0
+ mov [edi],eax
+ mov ebx,ebp ;EBX = xxr2g2b2
+ shl ebp,16 ;EBP = g2b20000
+ and edx,0000ffffh ;EDX = 0000r1g1
+ or ebp,edx ;EBP = g2b2r1g1
+ mov eax,[esi+12] ;EAX = xxr3g3b3
+ shr ebx,16 ;EBX = 0000xxr2
+ add edi,12
+ shl eax,8 ;EAX = r3g3b300
+ and ebx,000000ffh ;EBX = 000000r2
+ or eax,ebx ;EAX = r3g3b3r2
+ mov [edi+4-12],ebp
+ add esi,16
+ mov [edi+8-12],eax
+ sub ecx,1
+ jne .xloop
+.checkodd:
+ pop edx
+ pop ecx
+ and ecx,3
+ jz .noodd
+.oddloop:
+ mov eax,[esi]
+ add esi,4
+ mov [edi],ax
+ shr eax,16
+ mov [edi+2],al
+ add edi,3
+ sub ecx,1
+ jnz .oddloop
+.noodd:
+ add esi,[esp+16+16]
+ add edi,[esp+ 8+16]
+
+ sub edx,1
+ jne .yloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+ global _vdasm_pixblt_XRGB1555_to_XRGB8888
+_vdasm_pixblt_XRGB1555_to_XRGB8888:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov ebp, [esp+20+16]
+ add ebp, ebp
+ mov edx, [esp+4+16]
+ mov ecx, [esp+12+16]
+ lea edx, [edx+ebp*2-4]
+ lea ecx, [ecx+ebp-2]
+ neg ebp
+ mov [esp+20+16], ebp
+
+.yloop:
+ mov ebp, [esp+20+16]
+ add ebp, 2
+ jbe .odd
+
+.xloop:
+ mov eax, [ecx+ebp]
+ mov ebx, 00007c00h
+ and ebx, eax
+ mov esi, eax
+ shl ebx, 9
+ and esi, 000003e0h
+ shl esi, 6
+ mov edi, eax
+ and eax, 0000001fh
+ add ebx, esi
+ shl eax, 3
+ mov esi, edi
+ shr edi, 7
+ add eax, ebx
+ and edi, 00f80000h
+ mov ebx, esi
+ shr esi, 13
+ and ebx, 03e00000h
+ shr ebx, 10
+ and esi, 000000f8h
+ add ebx, edi
+ add ebx, esi
+ mov edi, eax
+ and eax, 00e0e0e0h
+ shr eax, 5
+ mov esi, ebx
+ shr ebx, 5
+ add eax, edi
+ and ebx, 00070707h
+ add ebx, esi
+ mov [edx+ebp*2], eax
+ mov [edx+ebp*2+4], ebx
+ add ebp, 4
+ jnc .xloop
+ jnz .noodd
+.odd:
+ movzx eax, word [ecx]
+ mov ebx, 00007c00h
+ and ebx, eax
+ mov esi, eax
+ shl ebx, 9
+ and esi, 000003e0h
+ shl esi, 6
+ and eax, 0000001fh
+ shl eax, 3
+ add ebx, esi
+ add eax, ebx
+ mov ebx, 00e0e0e0h
+ and ebx, eax
+ shr ebx, 5
+ add eax, ebx
+ mov [edx], eax
+.noodd:
+ add ecx, [esp+16+16]
+ add edx, [esp+8+16]
+ dec dword [esp+24+16]
+ jne .yloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+
+ global _vdasm_pixblt_RGB565_to_XRGB8888
+_vdasm_pixblt_RGB565_to_XRGB8888:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov ebp, [esp+20+16]
+ add ebp, ebp
+ mov edx, [esp+4+16]
+ mov ecx, [esp+12+16]
+ lea edx, [edx+ebp*2-4]
+ lea ecx, [ecx+ebp-2]
+ neg ebp
+ mov [esp+20+16], ebp
+
+.yloop:
+ mov ebp, [esp+20+16]
+ add ebp, 2
+ jbe .odd
+
+.xloop:
+ movzx eax, word [ecx+ebp]
+ mov ebx, 0000f800h
+ and ebx, eax
+ mov esi, eax
+ shl ebx, 8
+ mov edi, eax
+ shl eax, 3
+ and esi, 000007e0h
+ and eax, 000000f8h
+ add ebx, eax
+ shl esi, 5
+ mov eax, ebx
+ shr ebx, 5
+ and edi, 00000600h
+ shr edi, 1
+ and ebx, 00070007h
+ add esi, edi
+ add eax, ebx
+ add eax, esi
+ mov [edx+ebp*2], eax
+
+ movzx eax, word [ecx+ebp+2]
+ mov ebx, 0000f800h
+ and ebx, eax
+ mov esi, eax
+ shl ebx, 8
+ mov edi, eax
+ shl eax, 3
+ and esi, 000007e0h
+ and eax, 000000f8h
+ add ebx, eax
+ shl esi, 5
+ mov eax, ebx
+ shr ebx, 5
+ and edi, 00000600h
+ shr edi, 1
+ and ebx, 00070007h
+ add esi, edi
+ add eax, ebx
+ add eax, esi
+ mov [edx+ebp*2+4], eax
+
+ add ebp, 4
+
+ jnc .xloop
+ jnz .noodd
+.odd:
+ movzx eax, word [ecx]
+ mov ebx, 0000f800h
+ and ebx, eax
+ mov esi, eax
+ shl ebx, 8
+ mov edi, eax
+ shl eax, 3
+ and esi, 000007e0h
+ and eax, 000000f8h
+ add ebx, eax
+ shl esi, 5
+ mov eax, ebx
+ shr ebx, 5
+ and edi, 00000600h
+ shr edi, 1
+ and ebx, 00070007h
+ add esi, edi
+ add eax, ebx
+ add eax, esi
+ mov [edx], eax
+.noodd:
+ add ecx, [esp+16+16]
+ add edx, [esp+8+16]
+ dec dword [esp+24+16]
+ jne .yloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+
+ global _vdasm_pixblt_RGB888_to_XRGB8888
+_vdasm_pixblt_RGB888_to_XRGB8888:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov esi,[esp+12+16]
+ mov edi,[esp+4+16]
+
+ mov ecx,[esp+20+16]
+ lea eax,[ecx+ecx*2]
+ lea ebx,[ecx*4]
+ sub [esp+8+16],ebx
+ sub [esp+16+16],eax
+
+ mov edx,[esp+24+16]
+.yloop:
+ mov ebp,[esp+20+16]
+ shr ebp,2
+ push edx
+ jz .checkodd
+.xloop:
+ mov eax,[esi] ;EAX: b1r0g0b0
+ mov ebx,[esi+4] ;EBX: g2b2r1g1
+
+ mov [edi],eax
+ mov ecx,ebx ;ECX: g2b2r1g1
+
+ shr eax,24 ;EAX: ------b1
+ mov edx,[esi+8] ;EDX: r3g3b3r2
+
+ shr ecx,16 ;ECX: ----g2b2
+ add edi,16
+
+ shl ebx,8 ;EBX: b2r1g1--
+ add esi,12
+
+ or eax,ebx ;EAX: b2r1g1b1
+ mov ebx,edx ;EBX: r3g3b3r2
+
+ shr ebx,8 ;EBX: --r3g3b3
+ mov [edi+4-16],eax
+
+ shl edx,16 ;EDX: b3r2----
+ mov [edi+12-16],ebx
+
+ or edx,ecx ;EDX: b3r2g2b2
+ sub ebp,1
+
+ mov [edi+8-16],edx
+ jne .xloop
+
+.checkodd:
+ pop edx
+ mov ebx,[esp+20+16]
+ and ebx,3
+ jz .noodd
+.oddloop:
+ mov ax,[esi]
+ mov cl,[esi+2]
+ mov [edi],ax
+ mov [edi+2],cl
+ add esi,3
+ add edi,4
+ sub ebx,1
+ jne .oddloop
+.noodd:
+
+ add esi,[esp+16+16]
+ add edi,[esp+ 8+16]
+
+ sub edx,1
+ jne .yloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+
+ ret
+
+ end
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/a_bltrgb2yuv_mmx.asm b/src/thirdparty/VirtualDub/Kasumi/source/a_bltrgb2yuv_mmx.asm
new file mode 100644
index 000000000..6a00d826f
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/a_bltrgb2yuv_mmx.asm
@@ -0,0 +1,652 @@
+ section .rdata, rdata
+
+y_co dq 0004a004a004a004ah
+cr_co_r dq 000cc00cc00cc00cch
+cb_co_b dq 00081008100810081h ;note: divided by two
+cr_co_g dq 0ff98ff98ff98ff98h
+cb_co_g dq 0ffceffceffceffceh
+y_bias dq 0fb7afb7afb7afb7ah
+c_bias dq 0ff80ff80ff80ff80h
+interp dq 06000400020000000h
+rb_mask_555 dq 07c1f7c1f7c1f7c1fh
+g_mask_555 dq 003e003e003e003e0h
+rb_mask_565 dq 0f81ff81ff81ff81fh
+g_mask_565 dq 007e007e007e007e0h
+
+cr_coeff dq 000003313e5fc0000h
+cb_coeff dq 000000000f377408dh
+rgb_bias dq 000007f2180887eebh
+
+msb_inv dq 08000800080008000h
+
+ section .text
+
+;============================================================================
+
+%macro YUV411PLANAR_TO_RGB_PROLOG 0 ;save regs and load args: eax=dst, ecx=Y, edx/ebx=chroma planes, ebp=count; mm7=0
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov eax, [esp+4+16]
+ mov ecx, [esp+8+16]
+ mov edx, [esp+12+16]
+ mov ebx, [esp+16+16]
+ mov ebp, [esp+20+16]
+
+ pxor mm7, mm7 ;mm7 = zero, used for byte->word unpack
+%endmacro
+
+%macro YUV411PLANAR_TO_RGB_CORE_MMX 0 ;decode 4 pixels: outputs mm1=red, mm2=blue, mm3=green (packed bytes)
+ movd mm0, dword [ecx] ;mm0 = Y3Y2Y1Y0
+ add ecx, 4
+ punpcklbw mm0, mm7 ;mm0 = Y3 | Y2 | Y1 | Y0
+ movq mm1, mm0
+ pmullw mm0, [y_co]
+ paddw mm1, [y_bias]
+ paddsw mm0, mm0 ;luma = 2*Y*y_co + (Y + y_bias)
+ paddsw mm0, mm1
+
+ movzx esi, word [ebx] ;fetch two chroma samples for interpolation
+ movzx edi, word [edx]
+ add ebx, 1
+ add edx, 1
+
+ movd mm1, esi
+ movd mm2, edi
+
+ punpcklbw mm1, mm7
+ paddw mm1, [c_bias]
+ punpcklwd mm1, mm1 ;broadcast the two chroma words
+ movq mm3, mm1
+ punpckldq mm1, mm1
+ punpckhdq mm3, mm3
+
+ punpcklbw mm2, mm7
+ paddw mm2, [c_bias]
+ punpcklwd mm2, mm2
+ movq mm4, mm2
+ punpckldq mm2, mm2
+ punpckhdq mm4, mm4
+
+ psubw mm3, mm1 ;delta between adjacent chroma samples
+ psubw mm4, mm2
+ paddw mm3, mm3
+ paddw mm4, mm4
+
+ pmulhw mm3, [interp] ;per-pixel fractions 0, 1/4, 2/4, 3/4
+ pmulhw mm4, [interp]
+
+ paddw mm1, mm3
+ paddw mm2, mm4
+
+ movq mm3, mm1
+ movq mm4, mm2
+
+ pmullw mm1, [cr_co_r]
+ pmullw mm2, [cb_co_b]
+ pmullw mm3, [cr_co_g]
+ pmullw mm4, [cb_co_g]
+
+ paddsw mm2, mm2 ;cb_co_b is stored halved (see table comment)
+ paddsw mm1, mm0
+ paddsw mm3, mm4
+ paddsw mm2, mm0
+ paddsw mm3, mm0
+
+ psraw mm1, 7
+ psraw mm2, 7
+ psraw mm3, 7
+
+ packuswb mm1, mm1 ;mm1=R, mm2=B, mm3=G as saturated bytes
+ packuswb mm2, mm2
+ packuswb mm3, mm3
+%endmacro
+
+%macro YUV411PLANAR_TO_RGB_CORE_ISSE 0 ;same contract as CORE_MMX, using pshufw for chroma broadcast
+ movd mm0, dword [ecx] ;mm0 = Y3Y2Y1Y0
+ add ecx, 4
+ punpcklbw mm0, mm7 ;mm0 = Y3 | Y2 | Y1 | Y0
+ movq mm1, mm0
+ pmullw mm0, [y_co]
+ paddw mm1, [y_bias]
+ paddsw mm0, mm0
+ paddsw mm0, mm1
+
+ movzx esi, word [ebx]
+ movzx edi, word [edx]
+ add ebx, 1
+ add edx, 1
+
+ movd mm1, esi
+ movd mm2, edi
+
+ punpcklbw mm1, mm7
+ paddw mm1, [c_bias]
+ pshufw mm3, mm1, 01010101b ;broadcast second chroma word
+ pshufw mm1, mm1, 00000000b ;broadcast first chroma word
+
+ punpcklbw mm2, mm7
+ paddw mm2, [c_bias]
+ pshufw mm4, mm2, 01010101b
+ pshufw mm2, mm2, 00000000b
+
+ psubw mm3, mm1
+ psubw mm4, mm2
+ paddw mm3, mm3
+ paddw mm4, mm4
+
+ pmulhw mm3, [interp]
+ pmulhw mm4, [interp]
+
+ paddw mm1, mm3
+ paddw mm2, mm4
+
+ psllw mm1, 3 ;pre-scale - differs from MMX core; coefficients consumed via pmullw below
+ psllw mm2, 3
+
+ movq mm3, [cr_co_g]
+ movq mm4, [cb_co_g]
+
+ pmullw mm3, mm1
+ pmullw mm4, mm2
+ pmullw mm1, [cr_co_r]
+ pmullw mm2, [cb_co_b]
+
+ paddsw mm2, mm2
+ paddsw mm1, mm0
+ paddsw mm3, mm4
+ paddsw mm2, mm0
+ paddsw mm3, mm0
+
+ psraw mm1, 7
+ psraw mm2, 7
+ psraw mm3, 7
+
+ packuswb mm1, mm1 ;mm1=R, mm2=B, mm3=G as saturated bytes
+ packuswb mm2, mm2
+ packuswb mm3, mm3
+%endmacro
+
+%macro YUV411PLANAR_TO_RGB_EPILOG 0 ;restore regs saved by PROLOG and return (no emms here)
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+%endmacro
+
+ global _vdasm_pixblt_YUV411Planar_to_XRGB1555_scan_MMX
+_vdasm_pixblt_YUV411Planar_to_XRGB1555_scan_MMX: ;one scanline: YUV 4:1:1 planar -> XRGB1555, 4 pixels/iteration
+ YUV411PLANAR_TO_RGB_PROLOG
+.xloop:
+ YUV411PLANAR_TO_RGB_CORE_MMX
+
+ psrlw mm1, 1 ;drop R to 5 bits (pre-pack positions)
+ psrlw mm2, 3 ;drop B to 5 bits
+ punpcklbw mm2, mm1 ;interleave R and B bytes
+ punpcklbw mm3, mm3
+ psllw mm3, 2 ;place G in bits 5-9
+ pand mm2, [rb_mask_555]
+ pand mm3, [g_mask_555]
+ por mm2, mm3
+
+ movq [eax], mm2
+ add eax, 8
+
+ sub ebp, 1
+ jne .xloop
+
+ YUV411PLANAR_TO_RGB_EPILOG
+
+;============================================================================
+
+ global _vdasm_pixblt_YUV411Planar_to_RGB565_scan_MMX
+_vdasm_pixblt_YUV411Planar_to_RGB565_scan_MMX: ;one scanline: YUV 4:1:1 planar -> RGB565, 4 pixels/iteration
+ YUV411PLANAR_TO_RGB_PROLOG
+.xloop:
+ YUV411PLANAR_TO_RGB_CORE_MMX
+
+ psrlw mm2, 3 ;drop B to 5 bits; R keeps all 5 bits in high byte
+ punpcklbw mm2, mm1
+ punpcklbw mm3, mm3
+ psllw mm3, 3 ;place G (6 bits) in bits 5-10
+ pand mm2, [rb_mask_565]
+ pand mm3, [g_mask_565]
+ por mm2, mm3
+
+ movq [eax], mm2
+ add eax, 8
+
+ sub ebp, 1
+ jne .xloop
+
+ YUV411PLANAR_TO_RGB_EPILOG
+
+;============================================================================
+
+ global _vdasm_pixblt_YUV411Planar_to_XRGB8888_scan_MMX
+_vdasm_pixblt_YUV411Planar_to_XRGB8888_scan_MMX: ;one scanline: YUV 4:1:1 planar -> XRGB8888, 4 pixels/iteration
+ YUV411PLANAR_TO_RGB_PROLOG
+.xloop:
+ YUV411PLANAR_TO_RGB_CORE_MMX ;BUGFIX: was PROLOG, which re-pushed regs every pass and left mm1-mm3 undefined
+
+ punpcklbw mm2, mm1 ;interleave B and R bytes
+ punpcklbw mm3, mm3 ;G duplicated into X slot
+ movq mm1, mm2
+ punpcklbw mm1, mm3 ;pixels 0-1: XRGB
+ punpckhbw mm2, mm3 ;pixels 2-3: XRGB
+
+ movq [eax], mm1
+ movq [eax+8], mm2
+ add eax, 16
+
+ sub ebp, 1
+ jne .xloop
+
+ YUV411PLANAR_TO_RGB_EPILOG
+
+;============================================================================
+
+ global _vdasm_pixblt_YUV411Planar_to_XRGB1555_scan_ISSE
+_vdasm_pixblt_YUV411Planar_to_XRGB1555_scan_ISSE: ;ISSE variant of the 411->1555 scanline converter
+ YUV411PLANAR_TO_RGB_PROLOG
+.xloop:
+ YUV411PLANAR_TO_RGB_CORE_ISSE
+
+ psrlw mm1, 1 ;R to 5 bits
+ psrlw mm2, 3 ;B to 5 bits
+ punpcklbw mm2, mm1
+ punpcklbw mm3, mm3
+ psllw mm3, 2 ;G into bits 5-9
+ pand mm2, [rb_mask_555]
+ pand mm3, [g_mask_555]
+ por mm2, mm3
+
+ movq [eax], mm2
+ add eax, 8
+
+ sub ebp, 1
+ jne .xloop
+
+ YUV411PLANAR_TO_RGB_EPILOG
+
+;============================================================================
+
+ global _vdasm_pixblt_YUV411Planar_to_RGB565_scan_ISSE
+_vdasm_pixblt_YUV411Planar_to_RGB565_scan_ISSE: ;ISSE variant of the 411->565 scanline converter
+ YUV411PLANAR_TO_RGB_PROLOG
+.xloop:
+ YUV411PLANAR_TO_RGB_CORE_ISSE
+
+ psrlw mm2, 3 ;B to 5 bits
+ punpcklbw mm2, mm1
+ punpcklbw mm3, mm3
+ psllw mm3, 3 ;G (6 bits) into bits 5-10
+ pand mm2, [rb_mask_565]
+ pand mm3, [g_mask_565]
+ por mm2, mm3
+
+ movq [eax], mm2
+ add eax, 8
+
+ sub ebp, 1
+ jne .xloop
+
+ YUV411PLANAR_TO_RGB_EPILOG
+
+;============================================================================
+
+ global _vdasm_pixblt_YUV411Planar_to_XRGB8888_scan_ISSE
+_vdasm_pixblt_YUV411Planar_to_XRGB8888_scan_ISSE: ;411->8888 with pavgw-based chroma interpolation; mm6 carries prior chroma term
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov eax, [esp+4+16] ;dst
+ mov ecx, [esp+8+16] ;Y plane
+ mov edx, [esp+12+16] ;chroma plane (multiplied by cb coeffs below)
+ mov ebx, [esp+16+16] ;chroma plane (multiplied by cr coeffs below)
+ mov ebp, [esp+20+16] ;count
+
+ pxor mm7, mm7
+
+ movzx esi, byte [ebx] ;prime mm6 with the first chroma RGB contribution
+ movzx edi, byte [edx]
+ add ebx, 1
+ add edx, 1
+
+ movd mm1, esi
+ movd mm2, edi
+
+ psllw mm1, 3
+ psllw mm2, 3
+
+ pshufw mm5, mm1, 0
+ pshufw mm6, mm2, 0
+
+ pmulhw mm5, [cr_coeff]
+ pmulhw mm6, [cb_coeff]
+ paddw mm6, mm5
+ paddw mm6, [rgb_bias]
+
+.xloop:
+ movd mm0, dword [ecx];mm0 = Y3Y2Y1Y0
+ add ecx, 4
+ punpcklbw mm0, mm7 ;mm0 = Y3 | Y2 | Y1 | Y0
+ psllw mm0, 3
+ pmulhw mm0, [y_co]
+ pxor mm0, [msb_inv] ;bias luma so later paddw wraps correctly with rgb_bias
+
+ movzx esi, byte [ebx]
+ movzx edi, byte [edx]
+ add ebx, 1
+ add edx, 1
+
+ movd mm1, esi
+ movd mm2, edi
+
+ psllw mm1, 3
+ psllw mm2, 3
+
+ pshufw mm1, mm1, 0
+ pshufw mm2, mm2, 0
+
+ pmulhw mm1, [cr_coeff]
+ pmulhw mm2, [cb_coeff]
+ paddw mm1, mm2 ;mm1 = next chroma contribution
+ paddw mm1, [rgb_bias]
+
+ movq mm2, mm1
+ pavgw mm2, mm6 ;mm2 = 1/2
+ pshufw mm3, mm0, 00000000b
+ paddw mm3, mm6 ;pixel 0: fraction 0
+ pavgw mm6, mm2 ;mm1 = 1/4
+ pshufw mm4, mm0, 01010101b
+ paddw mm4, mm6 ;pixel 1: fraction 1/4
+ packuswb mm3, mm4
+ movq [eax], mm3
+
+ pshufw mm3, mm0, 10101010b
+ paddw mm3, mm2 ;pixel 2: fraction 1/2
+ pshufw mm0, mm0, 11111111b
+ pavgw mm2, mm1 ;mm2 = 3/4
+ paddw mm2, mm0 ;pixel 3: fraction 3/4
+ packuswb mm3, mm2
+ movq [eax+8], mm3
+
+ movq mm6, mm1 ;carry chroma term to next group
+
+ add eax, 16
+
+ sub ebp, 1
+ jne .xloop
+
+ YUV411PLANAR_TO_RGB_EPILOG
+
+;==========================================================================
+
+%macro YUV444PLANAR_TO_RGB_PROLOG 0 ;load args: eax=dst, ecx=Y, edx/ebx=chroma planes, ebp=count (mm7 cleared by callers)
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov eax, [esp+4+16]
+ mov ecx, [esp+8+16]
+ mov edx, [esp+12+16]
+ mov ebx, [esp+16+16]
+ mov ebp, [esp+20+16]
+%endmacro
+
+%macro YUV444PLANAR_TO_RGB_CORE 0 ;in: mm0=Y words, mm1/mm2=chroma words; out: mm1=R, mm2=B, mm3=G packed bytes
+ movq mm3, mm0
+ pmullw mm0, [y_co]
+ paddw mm1, [c_bias]
+ paddw mm2, [c_bias]
+ paddw mm0, [y_bias] ;NOTE(review): bias added before the doubling here, unlike the 411 cores - confirm intended
+ paddsw mm0, mm0
+ paddsw mm0, mm3
+
+ movq mm3, [cr_co_g]
+ movq mm4, [cb_co_g]
+
+ pmullw mm3, mm1
+ pmullw mm4, mm2
+ pmullw mm1, [cr_co_r]
+ pmullw mm2, [cb_co_b]
+
+ paddsw mm2, mm2 ;cb_co_b is stored halved
+ paddsw mm1, mm0
+ paddsw mm3, mm4
+ paddsw mm2, mm0
+ paddsw mm3, mm0
+
+ psraw mm1, 7
+ psraw mm2, 7
+ psraw mm3, 7
+
+ packuswb mm1, mm1
+ packuswb mm2, mm2
+ packuswb mm3, mm3
+%endmacro
+
+%macro YUV444PLANAR_TO_RGB_EPILOG 0 ;restore regs saved by PROLOG and return (no emms here)
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+%endmacro
+
+;==========================================================================
+
+ global _vdasm_pixblt_YUV444Planar_to_XRGB1555_scan_MMX
+_vdasm_pixblt_YUV444Planar_to_XRGB1555_scan_MMX: ;YUV 4:4:4 planar -> XRGB1555; 4-wide loop plus scalar tail
+ YUV444PLANAR_TO_RGB_PROLOG
+
+ pxor mm7, mm7
+ movq mm5, [rb_mask_555]
+ movq mm6, [g_mask_555]
+
+ sub ebp, 3 ;bias count so "ja" keeps 4-pixel groups only
+ jbe .oddcheck
+.xloop4:
+ movd mm0, dword [ecx];mm0 = Y3Y2Y1Y0
+ movd mm1, dword [ebx]
+ movd mm2, dword [edx]
+ add ecx, 4
+ add ebx, 4
+ add edx, 4
+ punpcklbw mm0, mm7 ;mm0 = Y3 | Y2 | Y1 | Y0
+ punpcklbw mm1, mm7
+ punpcklbw mm2, mm7
+
+ YUV444PLANAR_TO_RGB_CORE
+
+ psrlw mm1, 1
+ psrlw mm2, 3
+ punpcklbw mm2, mm1
+ punpcklbw mm3, mm3
+ psllw mm3, 2
+ pand mm2, mm5
+ pand mm3, mm6
+ por mm2, mm3
+
+ movq [eax], mm2
+ add eax, 8
+
+ sub ebp, 4
+ ja .xloop4
+.oddcheck:
+ add ebp, 3 ;restore remainder count (0-3)
+ jz .noodd
+.xloop:
+ movzx edi, byte [ecx] ;single-pixel tail
+ movd mm0, edi
+ movzx edi, byte [ebx]
+ movd mm1, edi
+ movzx edi, byte [edx]
+ movd mm2, edi
+ add ecx, 1
+ add ebx, 1
+ add edx, 1
+
+ YUV444PLANAR_TO_RGB_CORE
+
+ psrlw mm1, 1
+ psrlw mm2, 3
+ punpcklbw mm2, mm1
+ punpcklbw mm3, mm3
+ psllw mm3, 2
+ pand mm2, mm5
+ pand mm3, mm6
+ por mm2, mm3
+
+ movd edi, mm2
+ mov [eax], di ;store one 16-bit pixel
+ add eax, 2
+
+ sub ebp, 1
+ jnz .xloop
+.noodd:
+ YUV444PLANAR_TO_RGB_EPILOG
+
+;==========================================================================
+
+ global _vdasm_pixblt_YUV444Planar_to_RGB565_scan_MMX
+_vdasm_pixblt_YUV444Planar_to_RGB565_scan_MMX: ;YUV 4:4:4 planar -> RGB565; 4-wide loop plus scalar tail
+ YUV444PLANAR_TO_RGB_PROLOG
+
+ pxor mm7, mm7
+ movq mm5, [rb_mask_565]
+ movq mm6, [g_mask_565]
+
+ sub ebp, 3 ;bias count so "ja" keeps 4-pixel groups only
+ jbe .oddcheck
+.xloop4:
+ movd mm0, dword [ecx];mm0 = Y3Y2Y1Y0
+ movd mm1, dword [ebx]
+ movd mm2, dword [edx]
+ add ecx, 4
+ add ebx, 4
+ add edx, 4
+ punpcklbw mm0, mm7 ;mm0 = Y3 | Y2 | Y1 | Y0
+ punpcklbw mm1, mm7
+ punpcklbw mm2, mm7
+
+ YUV444PLANAR_TO_RGB_CORE
+
+ psrlw mm2, 3
+ punpcklbw mm2, mm1
+ punpcklbw mm3, mm3
+ psllw mm3, 3
+ pand mm2, mm5
+ pand mm3, mm6
+ por mm2, mm3
+
+ movq [eax], mm2
+ add eax, 8
+
+ sub ebp, 4
+ ja .xloop4
+.oddcheck:
+ add ebp, 3 ;restore remainder count (0-3)
+ jz .noodd
+.xloop:
+ movzx edi, byte [ecx] ;single-pixel tail
+ movd mm0, edi
+ movzx edi, byte [ebx]
+ movd mm1, edi
+ movzx edi, byte [edx]
+ movd mm2, edi
+ add ecx, 1
+ add ebx, 1
+ add edx, 1
+
+ YUV444PLANAR_TO_RGB_CORE
+
+ psrlw mm2, 3
+ punpcklbw mm2, mm1
+ punpcklbw mm3, mm3
+ psllw mm3, 3
+ pand mm2, mm5
+ pand mm3, mm6
+ por mm2, mm3
+
+ movd edi, mm2
+ mov [eax], di
+ add eax, 2
+
+ sub ebp, 1
+ jnz .xloop
+.noodd:
+ YUV444PLANAR_TO_RGB_EPILOG
+
+;==========================================================================
+
+ global _vdasm_pixblt_YUV444Planar_to_XRGB8888_scan_MMX
+_vdasm_pixblt_YUV444Planar_to_XRGB8888_scan_MMX: ;YUV 4:4:4 planar -> XRGB8888; 4-wide loop plus scalar tail
+ YUV444PLANAR_TO_RGB_PROLOG
+
+ pxor mm7, mm7
+
+ sub ebp, 3 ;bias count so "ja" keeps 4-pixel groups only
+ jbe .oddcheck
+.xloop4:
+ movd mm0, dword [ecx];mm0 = Y3Y2Y1Y0
+ movd mm1, dword [ebx]
+ movd mm2, dword [edx]
+ add ecx, 4
+ add ebx, 4
+ add edx, 4
+ punpcklbw mm0, mm7 ;mm0 = Y3 | Y2 | Y1 | Y0
+ punpcklbw mm1, mm7
+ punpcklbw mm2, mm7
+
+ YUV444PLANAR_TO_RGB_CORE
+
+ punpcklbw mm2, mm1 ;interleave B and R
+ punpcklbw mm3, mm3 ;G doubled into X slot
+ movq mm1, mm2
+ punpcklbw mm1, mm3 ;pixels 0-1
+ punpckhbw mm2, mm3 ;pixels 2-3
+
+ movq [eax], mm1
+ movq [eax+8], mm2
+ add eax, 16
+
+ sub ebp, 4
+ ja .xloop4
+.oddcheck:
+ add ebp, 3 ;restore remainder count (0-3)
+ jz .noodd
+.xloop:
+ movzx edi, byte [ecx] ;single-pixel tail
+ movd mm0, edi
+ movzx edi, byte [ebx]
+ movd mm1, edi
+ movzx edi, byte [edx]
+ movd mm2, edi
+ add ecx, 1
+ add ebx, 1
+ add edx, 1
+ punpcklbw mm0, mm7 ;mm0 = Y3 | Y2 | Y1 | Y0
+
+ YUV444PLANAR_TO_RGB_CORE
+
+ punpcklbw mm2, mm1
+ punpcklbw mm3, mm3
+ punpcklbw mm2, mm3
+
+ movd dword [eax], mm2
+ add eax, 4
+
+ sub ebp, 1
+ jnz .xloop
+.noodd:
+ YUV444PLANAR_TO_RGB_EPILOG
+
+ end
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/a_bltrgb_mmx.asm b/src/thirdparty/VirtualDub/Kasumi/source/a_bltrgb_mmx.asm
new file mode 100644
index 000000000..aa0b99987
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/a_bltrgb_mmx.asm
@@ -0,0 +1,806 @@
+ section .rdata, rdata
+
+x07b dq 00707070707070707h
+x0200w dq 00200020002000200h
+x001fw dq 0001f001f001f001fh
+xffc0w dq 0ffc0ffc0ffc0ffc0h
+xffe0w dq 0ffe0ffe0ffe0ffe0h
+x2080w dq 02080208020802080h
+x4200w dq 04200420042004200h
+rb_mask5 dq 000f800f800f800f8h
+g_mask5 dq 00000f8000000f800h
+g_mask6 dq 00000fc000000fc00h
+rb_mul_565 dq 02000000420000004h
+rb_mul_555 dq 02000000820000008h
+r_mask_555 dq 07c007c007c007c00h
+g_mask_555 dq 003e003e003e003e0h
+b_mask_555 dq 0001f001f001f001fh
+r_mask_565 dq 0f800f800f800f800h
+g_mask_565 dq 007e007e007e007e0h
+b_mask_565 dq 0001f001f001f001fh
+
+%macro prologue 1 ;push callee-saved regs; arg was used for the (disabled) .fpo debug directive
+ push ebx
+ push esi
+ push edi
+ push ebp
+ ;.fpo (0,%1,4,4,1,0)
+%endmacro
+
+%macro epilogue 0 ;restore regs pushed by prologue (caller still issues ret)
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+%endmacro
+
+ section .text
+
+ global _vdasm_pixblt_RGB565_to_XRGB1555_MMX
+_vdasm_pixblt_RGB565_to_XRGB1555_MMX: ;565 -> 1555: keep low 5 bits of G, shift R and high G bit right by 1
+ prologue 6
+
+ mov ebp, [esp+20+16] ;width in pixels
+ mov edi, [esp+24+16] ;height
+ add ebp, ebp ;width in bytes
+ mov edx, [esp+4+16] ;dst
+ mov ecx, [esp+12+16] ;src
+ lea edx, [edx+ebp-6] ;point 6 bytes before row end; loop counts up through 0
+ lea ecx, [ecx+ebp-6]
+ neg ebp
+ mov [esp+20+16], ebp
+
+ movq mm5, [x001fw]
+ movq mm4, [xffc0w]
+
+.yloop:
+ mov ebp, [esp+20+16]
+ add ebp, 6
+ jbe .odd
+
+.xloop:
+ movq mm0, [ecx+ebp]
+ movq mm1, mm5
+ pand mm1, mm0 ;blue bits pass through
+ pand mm0, mm4 ;red + upper green
+ psrlq mm0, 1
+ paddw mm0, mm1
+ movq [edx+ebp], mm0
+ add ebp, 8
+ jnc .xloop
+
+ sub ebp, 6
+ jz .noodd
+.odd:
+ movzx eax, word [ecx+ebp+6] ;scalar tail, same transform per pixel
+ mov ebx, 0001f001fh
+ and ebx, eax
+ and eax, 0ffc0ffc0h
+ shr eax, 1
+ add eax, ebx
+ mov [edx+ebp+6], ax
+ add ebp, 2
+ jnz .odd
+.noodd:
+ add ecx, [esp+16+16] ;add pitches to advance rows
+ add edx, [esp+8+16]
+ dec edi
+ jne .yloop
+
+ emms
+ epilogue
+ ret
+
+ global _vdasm_pixblt_XRGB8888_to_XRGB1555_MMX
+_vdasm_pixblt_XRGB8888_to_XRGB1555_MMX: ;32-bit XRGB -> 1555 using the pmaddwd pack trick; software-pipelined loop
+ prologue 6
+
+ mov ebp, [esp+20+16] ;width in pixels
+ mov edi, [esp+24+16] ;height
+ add ebp, ebp ;width in dest bytes
+ mov edx, [esp+4+16]
+ mov ecx, [esp+12+16]
+ lea edx, [edx+ebp-14] ;bias pointers so ebp counts up to zero
+ lea ecx, [ecx+ebp*2-28]
+ neg ebp
+ mov [esp+20+16], ebp
+
+ movq mm5,[rb_mul_555]
+ movq mm6,[rb_mask5]
+ movq mm7,[g_mask5]
+
+.yloop:
+ mov ebp, [esp+20+16]
+ add ebp, 14
+ jbe .odd
+
+ ;This code uses the "pmaddwd" trick for 32->16 conversions from Intel's MMX
+ ;Application Notes.
+
+ movq mm0,[ecx+ebp*2] ;allocate 0 (0123)
+ movq mm2,mm0 ;allocate 2 (0 23)
+
+ movq mm1,[ecx+ebp*2+8] ;allocate 1 (0123)
+ movq mm3,mm1 ;allocate 3 (0123)
+ pand mm0,mm6
+ pmaddwd mm0,mm5
+ pand mm1,mm6
+ pmaddwd mm1,mm5
+ pand mm2,mm7
+ pand mm3,mm7
+ jmp .xloopstart ;enter pipelined loop after priming first group
+
+ align 16
+.xloop:
+ movq mm0,[ecx+ebp*2] ;allocate 0 (01234)
+ por mm4,mm2 ;free 2 (01 34)
+
+ por mm3,mm1 ;free 3 (01 34)
+ movq mm2,mm0 ;allocate 2 (0 234)
+
+ movq mm1,[ecx+ebp*2+8] ;allocate 1 (01234)
+ psrld mm4,6
+
+ psrld mm3,6
+ pand mm0,mm6
+
+ packssdw mm4,mm3 ;free 3 (012 4)
+ movq mm3,mm1 ;allocate 3 (01234)
+
+ pmaddwd mm0,mm5
+ pand mm1,mm6
+
+ pmaddwd mm1,mm5
+ pand mm2,mm7
+
+ movq [edx+ebp-8],mm4 ;free 4 (0123 )
+ pand mm3,mm7
+
+.xloopstart:
+ movq mm4,[ecx+ebp*2+16] ;allocate 4 (01234)
+ por mm0,mm2 ;free 2 (01 34)
+
+ por mm1,mm3 ;free 3 (01 4)
+ psrld mm0,6
+
+ movq mm3,[ecx+ebp*2+24] ;allocate 3 (01 34)
+ movq mm2,mm4 ;allocate 2 (01234)
+
+ psrld mm1,6
+ pand mm4,mm6
+
+ packssdw mm0,mm1 ;free 1 (0 234)
+ movq mm1,mm3 ;allocate 1 (01234)
+
+ movq [edx+ebp],mm0 ;free 0 ( 1234)
+ pand mm3,mm6
+
+ pmaddwd mm4,mm5
+ add ebp,16
+
+ pmaddwd mm3,mm5
+ pand mm2,mm7
+
+ pand mm1,mm7
+ jnc .xloop
+
+ por mm4,mm2 ;free 2 (01 34) - drain pipeline
+ por mm3,mm1 ;free 3 (01 34)
+ psrld mm4,6
+ psrld mm3,6
+ packssdw mm4,mm3 ;free 3 (012 4)
+ movq [edx+ebp-8],mm4 ;free 4 (0123 )
+
+.odd:
+ sub ebp, 14
+ jz .noodd
+.oddloop:
+ mov eax, [ecx+ebp*2+28] ;scalar tail: repack one 8888 pixel to 1555
+ mov ebx, 00f80000h
+ mov esi, eax
+ and ebx, eax
+ shr ebx, 9
+ and esi, 0000f800h
+ shr esi, 6
+ and eax, 000000f8h
+ shr eax, 3
+ add esi, ebx
+ add eax, esi
+ mov [edx+ebp+14], ax
+ add ebp, 2
+ jnz .oddloop
+.noodd:
+ add ecx, [esp+16+16]
+ add edx, [esp+8+16]
+ dec edi
+ jne .yloop
+
+ emms
+ epilogue
+ ret
+
+ global _vdasm_pixblt_XRGB1555_to_RGB565_MMX
+_vdasm_pixblt_XRGB1555_to_RGB565_MMX: ;1555 -> 565: shift R/G left one bit and replicate G's top bit into the new LSB
+ prologue 6
+
+ mov ebp, [esp+20+16] ;width in pixels
+ mov edi, [esp+24+16] ;height
+ add ebp, ebp
+ mov edx, [esp+4+16]
+ mov ecx, [esp+12+16]
+ lea edx, [edx+ebp-6] ;bias pointers so ebp counts up to zero
+ lea ecx, [ecx+ebp-6]
+ neg ebp
+ mov [esp+20+16], ebp
+
+ movq mm5, [x0200w]
+ movq mm4, [xffe0w]
+
+.yloop:
+ mov ebp, [esp+20+16]
+ add ebp, 6
+ jbe .odd
+
+.xloop:
+ movq mm0, [ecx+ebp]
+ movq mm1, mm4
+ movq mm2, mm0
+ pand mm1, mm0 ;R+G fields
+ pand mm0, mm5 ;G top bit
+ paddw mm1, mm2 ;doubles R+G in place, B unchanged
+ psrlq mm0, 4
+ paddw mm0, mm1 ;fill the new G LSB
+ movq [edx+ebp], mm0
+ add ebp, 8
+ jnc .xloop
+
+.odd:
+ sub ebp, 6
+ jz .noodd
+.oddloop:
+ movzx eax, word [ecx+ebp+6] ;scalar tail, same transform
+ mov ebx, 02000200h
+ mov esi, eax
+ and ebx, eax
+ shr ebx, 4
+ and esi, 0ffe0ffe0h
+ add eax, esi
+ add eax, ebx
+ mov [edx+ebp+6], ax
+ add ebp, 2
+ jnz .oddloop
+.noodd:
+ add ecx, [esp+16+16]
+ add edx, [esp+8+16]
+ dec edi
+ jne .yloop
+
+ emms
+ epilogue
+ ret
+
+
+ global _vdasm_pixblt_XRGB8888_to_RGB565_MMX
+_vdasm_pixblt_XRGB8888_to_RGB565_MMX: ;32-bit XRGB -> 565 using the pmaddwd pack trick; software-pipelined loop
+ prologue 6
+
+ mov ebp, [esp+20+16] ;width in pixels
+ mov edi, [esp+24+16] ;height
+ add ebp, ebp
+ mov edx, [esp+4+16]
+ mov ecx, [esp+12+16]
+ lea edx, [edx+ebp-14] ;bias pointers so ebp counts up to zero
+ lea ecx, [ecx+ebp*2-28]
+ neg ebp
+ mov [esp+20+16], ebp
+
+ movq mm5,[rb_mul_565]
+ movq mm6,[rb_mask5]
+ movq mm7,[g_mask6]
+
+.yloop:
+ mov ebp, [esp+20+16]
+ add ebp, 14
+ jbe .odd
+
+ ;This code uses the "pmaddwd" trick for 32->16 conversions from Intel's MMX
+ ;Application Notes.
+
+ movq mm0,[ecx+ebp*2] ;allocate 0 (0123)
+ movq mm2,mm0 ;allocate 2 (0 23)
+
+ movq mm1,[ecx+ebp*2+8] ;allocate 1 (0123)
+ movq mm3,mm1 ;allocate 3 (0123)
+ pand mm0,mm6
+ pmaddwd mm0,mm5
+ pand mm1,mm6
+ pmaddwd mm1,mm5
+ pand mm2,mm7
+ pand mm3,mm7
+ jmp .xloopstart ;enter pipelined loop after priming first group
+
+ align 16
+.xloop:
+ movq mm0,[ecx+ebp*2] ;allocate 0 (01234)
+ por mm4,mm2 ;free 2 (01 34)
+
+ por mm3,mm1 ;free 3 (01 34)
+ pslld mm4,16-5
+
+ pslld mm3,16-5
+ movq mm2,mm0 ;allocate 2 (0 234)
+
+ movq mm1,[ecx+ebp*2+8] ;allocate 1 (01234)
+ psrad mm4,16
+
+ psrad mm3,16
+ pand mm0,mm6
+
+ packssdw mm4,mm3 ;free 3 (012 4)
+ movq mm3,mm1 ;allocate 3 (01234)
+
+ pmaddwd mm0,mm5
+ pand mm1,mm6
+
+ pmaddwd mm1,mm5
+ pand mm2,mm7
+
+ movq [edx+ebp-8],mm4 ;free 4 (0123 )
+ pand mm3,mm7
+
+.xloopstart:
+ movq mm4,[ecx+ebp*2+16] ;allocate 4 (01234)
+ por mm0,mm2 ;free 2 (01 34)
+
+ por mm1,mm3 ;free 3 (01 4)
+ pslld mm0,16-5
+
+ movq mm3,[ecx+ebp*2+24] ;allocate 3 (01 34)
+ pslld mm1,16-5
+
+ psrad mm0,16
+ movq mm2,mm4 ;allocate 2 (01234)
+
+ psrad mm1,16
+ pand mm4,mm6
+
+ packssdw mm0,mm1 ;free 1 (0 234)
+ movq mm1,mm3 ;allocate 1 (01234)
+
+ movq [edx+ebp],mm0 ;free 0 ( 1234)
+ pand mm3,mm6
+
+ pmaddwd mm4,mm5
+ add ebp,16
+
+ pmaddwd mm3,mm5
+ pand mm2,mm7
+
+ pand mm1,mm7
+ jnc .xloop
+
+ por mm4,mm2 ;free 2 (01 34) - drain pipeline
+ por mm3,mm1 ;free 3 (01 34)
+ psllq mm4,16-5
+ psllq mm3,16-5
+ psrad mm4,16
+ psrad mm3,16
+ packssdw mm4,mm3 ;free 3 (012 4)
+ movq [edx+ebp-8],mm4 ;free 4 (0123 )
+
+.odd:
+ sub ebp, 14
+ jz .noodd
+.oddloop:
+ mov eax, [ecx+ebp*2+28] ;scalar tail: repack one 8888 pixel to 565
+ mov ebx, 00f80000h
+ mov esi, eax
+ and ebx, eax
+ and eax, 000000f8h
+ shr eax, 3
+ and esi, 0000fc00h
+ shr ebx, 8
+ shr esi, 5
+ add eax, ebx
+ add eax, esi
+ mov [edx+ebp+14], ax
+ add ebp, 2
+ jnz .oddloop
+.noodd:
+ add ecx, [esp+16+16]
+ add edx, [esp+8+16]
+ dec edi
+ jne .yloop
+
+ emms
+ epilogue
+ ret
+
+ global _vdasm_pixblt_XRGB8888_to_RGB888_MMX
+_vdasm_pixblt_XRGB8888_to_RGB888_MMX: ;strip X byte: 8 pixels (32 src bytes) -> 24 dest bytes per iteration
+ prologue 6
+
+ mov esi,[esp+12+16] ;src
+ mov edi,[esp+4+16] ;dst
+
+ mov ecx,[esp+20+16] ;width
+ lea eax,[ecx+ecx*2] ;3*width (dest row bytes)
+ lea ebx,[ecx*4] ;4*width (src row bytes)
+ sub [esp+8+16],eax ;convert pitches to modulos
+ sub [esp+16+16],ebx
+
+ pcmpeqb mm7,mm7
+ psrld mm7,8
+ movq mm6,mm7
+ psllq mm7,32 ;mm7 = high rgb mask
+ psrlq mm6,32 ;mm6 = low rgb mask
+
+ mov ebp,[esp+20+16]
+ mov edx,[esp+24+16] ;height
+ mov eax,[esp+16+16] ;src modulo (stashed in mm0 around the odd loop)
+ mov ebx,[esp+ 8+16] ;dst modulo
+.yloop:
+ mov ecx,ebp
+ shr ecx,3
+ jz .checkodd
+.xloop:
+ movq mm0,[esi] ;mm0 = a1r1g1b1a0r0g0b0
+ movq mm1,mm6
+
+ movq mm2,[esi+8] ;mm2 = a3r3g3b3a2r2g2b2
+ pand mm1,mm0 ;mm1 = ----------r0g0b0
+
+ movq mm3,mm6
+ pand mm0,mm7 ;mm0 = --r1g1b1--------
+
+ movq mm4,mm2
+ pand mm3,mm2 ;mm3 = ----------r2g2b2
+
+ psrlq mm0,8 ;mm0 = ----r1g1b1------
+ pand mm2,mm7 ;mm2 = --r3g3b3--------
+
+ movq mm5,[esi+16] ;mm5 = a5r5g5b5a4r4g4b4
+ psllq mm4,48 ;mm4 = g2b2------------
+
+ por mm0,mm1 ;mm0 = ----r1g1b1r0g0b0
+ psrlq mm3,16 ;mm3 = --------------r2
+
+ por mm0,mm4 ;mm0 = g2b2r1g1b1r0g0b0
+ movq mm1,mm6
+
+ pand mm1,mm5 ;mm1 = ----------r4g4b4
+ psrlq mm2,24 ;mm2 = --------r3g3b3--
+
+ movq [edi],mm0
+ pand mm5,mm7 ;mm5 = --r5g5b5--------
+
+ psllq mm1,32 ;mm1 = --r4g4b4--------
+ movq mm4,mm5 ;mm4 = --r5g5b5--------
+
+ por mm2,mm3 ;mm2 = --------r3g3b3r2
+ psllq mm5,24 ;mm5 = b5--------------
+
+ movq mm3,[esi+24] ;mm3 = a7r7g7b7a6r6g6b6
+ por mm2,mm1 ;mm2 = --r4g4b4r3g3b3r2
+
+ movq mm1,mm6
+ por mm2,mm5 ;mm2 = b5r4g4b4r3g3b3r2
+
+ psrlq mm4,40 ;mm4 = ------------r5g5
+ pand mm1,mm3 ;mm1 = ----------r6g6b6
+
+ psllq mm1,16 ;mm1 = ------r6g6b6----
+ pand mm3,mm7 ;mm3 = --r7g7b7--------
+
+ por mm4,mm1 ;mm4 = ------r6g6b6r5g5
+ psllq mm3,8 ;mm3 = r7g7b7----------
+
+ movq [edi+8],mm2
+ por mm4,mm3 ;mm4 = r7g7b7r6g6b6r5g5
+
+ add esi,32
+ sub ecx,1
+
+ movq [edi+16],mm4 ;mm3
+
+ lea edi,[edi+24]
+ jne .xloop
+
+.checkodd:
+ mov ecx,ebp
+ and ecx,7 ;leftover pixels (width mod 8)
+ jz .noodd
+ movd mm0,eax ;free eax for the byte copies below
+.oddloop:
+ mov eax,[esi]
+ add esi,4
+ mov [edi],ax
+ shr eax,16
+ mov [edi+2],al
+ add edi,3
+ sub ecx,1
+ jnz .oddloop
+ movd eax,mm0 ;restore src modulo
+.noodd:
+ add esi,eax
+ add edi,ebx
+
+ sub edx,1
+ jne .yloop
+
+ emms
+
+ epilogue
+ ret
+
+ global _vdasm_pixblt_XRGB1555_to_XRGB8888_MMX
+_vdasm_pixblt_XRGB1555_to_XRGB8888_MMX: ;1555 -> 8888 with bit-replication so 5-bit channels span full 0-255
+ prologue 6
+
+ mov ebp, [esp+20+16] ;width in pixels
+ mov edi, [esp+24+16] ;height
+ add ebp, ebp
+ mov edx, [esp+4+16]
+ mov ecx, [esp+12+16]
+ lea edx, [edx+ebp*2-12] ;bias pointers so ebp counts up to zero
+ lea ecx, [ecx+ebp-6]
+ neg ebp
+ mov [esp+20+16], ebp
+
+ movq mm5, [r_mask_555]
+ movq mm6, [g_mask_555]
+ movq mm7, [b_mask_555]
+
+.yloop:
+ mov ebp, [esp+20+16]
+ add ebp, 6
+ jbe .odd
+
+.xloop:
+ movq mm0, [ecx+ebp]
+ movq mm1, mm6
+ movq mm2, mm7
+ pand mm1, mm0 ;green
+ pand mm2, mm0 ;blue
+ pand mm0, mm5 ;red
+
+ paddw mm0, mm0
+ pmulhw mm1, [x4200w] ;expand green to 8 bits
+ psllq mm2, 3
+ paddw mm0, mm2 ;combine R and B into byte positions
+ movq mm2, mm0
+ psrlw mm0, 5
+ pand mm0, [x07b] ;replicate top 3 bits into low bits
+ paddw mm0, mm2
+ movq mm2, mm0
+ punpcklbw mm0, mm1 ;merge G to form two XRGB dwords
+ punpckhbw mm2, mm1
+
+ movq [edx+ebp*2], mm0
+ movq [edx+ebp*2+8], mm2
+ add ebp, 8
+ jnc .xloop
+.odd:
+ sub ebp, 6
+ jz .noodd
+.oddloop:
+ movzx eax, word [ecx+ebp+6] ;scalar tail, same expansion
+ mov ebx, 03e0h
+ mov esi, 001fh
+ and ebx, eax
+ and esi, eax
+ and eax, 07c00h
+ shl esi, 3
+ shl ebx, 6
+ shl eax, 9
+ add ebx, esi
+ add eax, ebx
+ mov ebx, eax
+ shr eax, 5
+ and eax, 070707h
+ add eax, ebx
+ mov [edx+ebp*2+12], eax
+ add ebp, 2
+ jnz .oddloop
+.noodd:
+ add ecx, [esp+16+16]
+ add edx, [esp+8+16]
+ dec edi
+ jne .yloop
+
+ emms
+ epilogue
+ ret
+
+
+ global _vdasm_pixblt_RGB565_to_XRGB8888_MMX
+_vdasm_pixblt_RGB565_to_XRGB8888_MMX: ;565 -> 8888 with bit-replication (6-bit green expanded via x2080w multiply)
+ prologue 6
+
+ mov ebp, [esp+20+16] ;width in pixels
+ mov edi, [esp+24+16] ;height
+ add ebp, ebp
+ mov edx, [esp+4+16]
+ mov ecx, [esp+12+16]
+ lea edx, [edx+ebp*2-12] ;bias pointers so ebp counts up to zero
+ lea ecx, [ecx+ebp-6]
+ neg ebp
+ mov [esp+20+16], ebp
+
+ movq mm5, [r_mask_565]
+ movq mm6, [g_mask_565]
+ movq mm7, [b_mask_565]
+
+.yloop:
+ mov ebp, [esp+20+16]
+ add ebp, 6
+ jbe .odd
+
+.xloop:
+ movq mm0, [ecx+ebp]
+ movq mm1, mm6
+ movq mm2, mm7
+ pand mm1, mm0 ;green
+ pand mm2, mm0 ;blue
+ pand mm0, mm5 ;red
+
+ pmulhw mm1, [x2080w] ;expand green to 8 bits
+ psllq mm2, 3
+ paddw mm0, mm2
+ movq mm2, mm0
+ psrlw mm0, 5
+ pand mm0, [x07b] ;replicate top 3 bits of R/B
+ paddw mm0, mm2
+ movq mm2, mm0
+ punpcklbw mm0, mm1
+ punpckhbw mm2, mm1
+
+ movq [edx+ebp*2], mm0
+ movq [edx+ebp*2+8], mm2
+ add ebp, 8
+ jnc .xloop
+
+.odd:
+ sub ebp, 6
+ jz .noodd
+ push edi ;edi holds row count; borrow it for the scalar tail
+.oddloop:
+ movzx eax, word [ecx+ebp+6]
+ mov ebx, 0000f800h
+ and ebx, eax
+ mov esi, eax
+ shl ebx, 8
+ mov edi, eax
+ shl eax, 3
+ and esi, 000007e0h
+ and eax, 000000f8h
+ add ebx, eax
+ shl esi, 5
+ mov eax, ebx
+ shr ebx, 5
+ and edi, 00000600h
+ shr edi, 1
+ and ebx, 00070007h
+ add esi, edi
+ add eax, ebx
+ add eax, esi
+ mov [edx+ebp*2+12], eax
+ add ebp, 2
+ jnz .oddloop
+ pop edi
+.noodd:
+ add ecx, [esp+16+16]
+ add edx, [esp+8+16]
+ dec edi
+ jne .yloop
+
+ emms
+ epilogue
+ ret
+
+
+ global _vdasm_pixblt_RGB888_to_XRGB8888_MMX
+_vdasm_pixblt_RGB888_to_XRGB8888_MMX: ;24-bit -> 32-bit: 8 pixels (24 src bytes -> 32 dest bytes) per iteration
+ prologue 6
+
+ mov esi,[esp+12+16] ;src
+ mov edi,[esp+4+16] ;dst
+
+ mov ecx,[esp+20+16] ;width
+ lea eax,[ecx+ecx*2] ;3*width (src row bytes)
+ lea ebx,[ecx*4] ;4*width (dest row bytes)
+ sub [esp+8+16],ebx ;convert pitches to modulos
+ sub [esp+16+16],eax
+
+ mov edx,[esp+24+16]
+ mov ebx,[esp+20+16]
+ mov ecx,[esp+16+16]
+ mov eax,[esp+ 8+16]
+
+ ;ebx horizontal count backup
+ ;ecx source modulo
+ ;edx vertical count
+ ;esi source
+ ;edi destination
+ ;ebp horizontal count
+
+.yloop:
+ mov ebp,ebx
+ shr ebp,3
+ jz .checkodd
+.xloop:
+ movq mm0,[esi] ;mm0: g2b2r1g1b1r0g0b0
+ movq mm1,mm0 ;
+
+ psrlq mm1,24 ;mm1: ------g2b2r1g1b1
+ movq mm2,mm0 ;
+
+ movq mm3,[esi+8] ;mm3: b5r4g4b4r3g3b3r2
+ punpckldq mm0,mm1 ;mm0: b2r1g1b1b1r0g0b0 [qword 0 ready]
+
+ movq mm4,mm3 ;mm4: b5r4g4b4r3g3b3r2
+ psllq mm3,48 ;mm3: b3r2------------
+
+ movq mm5,mm4 ;mm5: b5r4g4b4r3g3b3r2
+ psrlq mm2,16 ;mm2: ----g2b2--------
+
+ movq mm1,[esi+16] ;mm1: r7g7b7r6g6b6r5g5
+ por mm2,mm3 ;mm2: b3r2g2b2--------
+
+ movq [edi],mm0 ;
+ psllq mm4,24 ;mm4: b4r3g3b3r2------
+
+ movq mm3,mm5 ;mm3: b5r4g4b4r3g3b3r2
+ psrlq mm5,24 ;mm5: ------b5r4g4b4r3
+
+ movq mm0,mm1 ;mm0: r7g7b7r6g6b6r5g5
+ psllq mm1,40 ;mm1: b6r5g5----------
+
+ punpckhdq mm2,mm4 ;mm2: b4r3g3b3b3r2g2b2 [qword 1 ready]
+ por mm1,mm5 ;mm1: b6r5g5b5r4g4b4r3
+
+ movq mm4,mm0 ;mm4: r7g7b7r6g6b6r5g5
+ punpckhdq mm3,mm1 ;mm3: b6r5g5b5b5r4g4b4 [qword 2 ready]
+
+ movq [edi+8],mm2
+ psrlq mm0,16 ;mm0: ----r7g7b7r6g6b6
+
+ movq [edi+16],mm3
+ psrlq mm4,40 ;mm4: ----------r7g7b7
+
+ punpckldq mm0,mm4 ;mm0: --r7g7b7b7r6g6b6 [qword 3 ready]
+ add esi,24
+
+ movq [edi+24],mm0
+
+ add edi,32
+ sub ebp,1
+ jne .xloop
+
+.checkodd:
+ mov ebp,ebx
+ and ebp,7 ;leftover pixels (width mod 8)
+ jz .noodd
+ movd mm7,eax ;free eax for the byte copies below
+.oddloop:
+ mov ax,[esi]
+ mov [edi],ax
+ mov al,[esi+2]
+ mov [edi+2],al
+ add esi,3
+ add edi,4
+ sub ebp,1
+ jne .oddloop
+
+ movd eax,mm7 ;restore dst modulo
+.noodd:
+ add esi,ecx
+ add edi,eax
+
+ sub edx,1
+ jne .yloop
+ emms
+ epilogue
+ ret
+
+ end
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/a_bltyuv2rgb_sse2.asm b/src/thirdparty/VirtualDub/Kasumi/source/a_bltyuv2rgb_sse2.asm
new file mode 100644
index 000000000..87ff13b56
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/a_bltyuv2rgb_sse2.asm
@@ -0,0 +1,161 @@
+ section .rdata, rdata
+
+ align 16
+
+bytemasks dd 000000ffh, 0000ffffh, 00ffffffh
+
+ section .text
+
+;============================================================================
+
+ global _vdasm_pixblt_XRGB8888_to_YUV444Planar_scan_SSE2
+_vdasm_pixblt_XRGB8888_to_YUV444Planar_scan_SSE2: ;(ydst, cbdst, crdst, src, count, coeff_table): 4 pixels/iter, jump table for tail
+ push edi
+ push esi
+ push ebx
+
+ mov eax, [esp+4+12] ;Y output
+ mov ebx, [esp+8+12] ;Cb output
+ mov ecx, [esp+12+12] ;Cr output
+ mov edx, [esp+16+12] ;XRGB source
+ mov esi, [esp+20+12] ;pixel count
+ mov edi, [esp+24+12] ;coefficient/bias table
+
+ pcmpeqb xmm6, xmm6
+ psrlw xmm6, 8 ;xmm6 = 00FF x 8
+
+ sub esi, 4
+ js .postcheck
+.xloop:
+ movdqu xmm2, [edx] ;xmm0 = X3R3G3B3X2R2G2B2X1R1G1B1X0R0G0B0
+ add edx, 16
+ movdqa xmm5, xmm2
+ pand xmm2, xmm6 ;xmm0 = R3 B3 R2 B2 R1 B1 R0 B0
+ psrlw xmm5, 8 ;xmm1 = X3 G3 X2 G2 X1 G1 X0 G0
+ movdqa xmm0, [edi+0] ;coeff_rb_to_y
+ movdqa xmm1, [edi+16] ;coeff_rb_to_u
+ movdqa xmm3, [edi+32] ;coeff_g_to_y
+ movdqa xmm4, [edi+48] ;coeff_g_to_u
+ pmaddwd xmm0, xmm2
+ pmaddwd xmm1, xmm2
+ pmaddwd xmm2, [edi+64] ;coeff_rb_to_v
+ pmaddwd xmm3, xmm5
+ pmaddwd xmm4, xmm5
+ pmaddwd xmm5, [edi+80] ;coeff_g_to_v
+ paddd xmm0, xmm3
+ paddd xmm1, xmm4
+ paddd xmm2, xmm5
+ paddd xmm0, [edi+96] ;bias_y
+ paddd xmm1, [edi+112] ;bias_c
+ paddd xmm2, [edi+112] ;bias_c
+ psrad xmm0, 15 ;drop fixed-point fraction
+ psrad xmm1, 15
+ psrad xmm2, 15
+ packssdw xmm0, xmm0
+ packssdw xmm1, xmm1
+ packssdw xmm2, xmm2
+ packuswb xmm0, xmm0
+ packuswb xmm1, xmm1
+ packuswb xmm2, xmm2
+ movd [eax], xmm0 ;4 Y bytes
+ movd [ebx], xmm1 ;4 Cb bytes
+ movd [ecx], xmm2 ;4 Cr bytes
+ add eax, 4
+ add ebx, 4
+ add ecx, 4
+ sub esi, 4
+ jns .xloop
+.postcheck:
+ jmp dword [.finaltable + esi*4 + 16] ;esi in [-4,-1]: dispatch on remaining pixel count
+.complete:
+ pop ebx
+ pop esi
+ pop edi
+ ret
+
+.finaltable:
+ dd .complete
+ dd .do1
+ dd .do2
+ dd .do3
+
+.finaltable2:
+ dd .fin1
+ dd .fin2
+ dd .fin3
+
+.do1:
+ movd xmm2, [edx] ;1 remaining pixel
+ jmp short .dofinal
+.do2:
+ movq xmm2, [edx] ;2 remaining pixels
+ jmp short .dofinal
+.do3:
+ movq xmm2, [edx] ;3 remaining pixels: pixels 0-1...
+ movd xmm1, [edx+8] ;BUGFIX: was [edx], which re-read pixel 0 instead of pixel 2
+ movlhps xmm2, xmm1 ;...pixel 2 into the high half
+.dofinal:
+ movdqa xmm5, xmm2
+ pand xmm2, xmm6 ;xmm0 = R3 B3 R2 B2 R1 B1 R0 B0
+ psrlw xmm5, 8 ;xmm1 = X3 G3 X2 G2 X1 G1 X0 G0
+ movdqa xmm0, [edi+0] ;coeff_rb_to_y
+ movdqa xmm1, [edi+16] ;coeff_rb_to_u
+ movdqa xmm3, [edi+32] ;coeff_g_to_y
+ movdqa xmm4, [edi+48] ;coeff_g_to_u
+ pmaddwd xmm0, xmm2
+ pmaddwd xmm1, xmm2
+ pmaddwd xmm2, [edi+64] ;coeff_rb_to_v
+ pmaddwd xmm3, xmm5
+ pmaddwd xmm4, xmm5
+ pmaddwd xmm5, [edi+80] ;coeff_g_to_v
+ paddd xmm0, xmm3
+ paddd xmm1, xmm4
+ paddd xmm2, xmm5
+ paddd xmm0, [edi+96] ;bias_y
+ paddd xmm1, [edi+112] ;bias_c
+ paddd xmm2, [edi+112] ;bias_c
+ psrad xmm0, 15
+ psrad xmm1, 15
+ psrad xmm2, 15
+ packssdw xmm0, xmm0
+ packssdw xmm1, xmm1
+ packssdw xmm2, xmm2
+ packuswb xmm0, xmm0
+ packuswb xmm1, xmm1
+ movd xmm7, [bytemasks + esi*4 + 12] ;note: loaded but not used by .fin1-3 below
+ packuswb xmm2, xmm2
+
+ jmp dword [.finaltable2 + esi*4 + 12] ;esi in [-3,-1]
+
+.fin1:
+ movd edx, xmm0 ;store 1 byte per plane
+ mov [eax], dl
+ movd edx, xmm1
+ mov [ebx], dl
+ movd edx, xmm2
+ mov [ecx], dl
+ jmp .complete
+.fin2:
+ movd edx, xmm0 ;store 2 bytes per plane
+ mov [eax], dx
+ movd edx, xmm1
+ mov [ebx], dx
+ movd edx, xmm2
+ mov [ecx], dx
+ jmp .complete
+.fin3:
+ movd edx, xmm0 ;store 3 bytes per plane (word + byte)
+ mov [eax], dx
+ shr edx, 16
+ mov [eax+2], dl
+ movd edx, xmm1
+ mov [ebx], dx
+ shr edx, 16
+ mov [ebx+2], dl
+ movd edx, xmm2
+ mov [ecx], dx
+ shr edx, 16
+ mov [ecx+2], dl
+ jmp .complete
+
+ end
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/a_resample_mmx.asm b/src/thirdparty/VirtualDub/Kasumi/source/a_resample_mmx.asm
new file mode 100644
index 000000000..912c655ab
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/a_resample_mmx.asm
@@ -0,0 +1,1559 @@
+; VirtualDub - Video processing and capture application
+; Graphics support library
+; Copyright (C) 1998-2004 Avery Lee
+;
+; This program is free software; you can redistribute it and/or modify
+; it under the terms of the GNU General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or
+; (at your option) any later version.
+;
+; This program is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; GNU General Public License for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with this program; if not, write to the Free Software
+; Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+;
+ section .rdata, rdata, align=16
+
+; Constants are named after their own hex contents; each replicates a
+; 16- or 32-bit value across a 64-bit quadword for MMX use.
+x0002000200020002 dq 0002000200020002h
+x0004000400040004 dq 0004000400040004h
+x0008000800080008 dq 0008000800080008h
+x0000200000002000 dq 0000200000002000h
+
+ align 16
+; 0x2000 = half of 1<<14: rounding bias added before the arithmetic
+; right shift by 14 used throughout the filter kernels below.
+MMX_roundval dq 0000200000002000h, 0000200000002000h
+
+
+;**************************************************************************
+
+x0000FFFF0000FFFF dq 0000FFFF0000FFFFh
+x0000010100000101 dq 0000010100000101h
+x0100010001000100 dq 0100010001000100h
+
+ section .text
+
+;--------------------------------------------------------------------------
+;_vdasm_resize_interp_row_run_MMX(
+; [esp+ 4] void *dst,
+; [esp+ 8] void *src,
+; [esp+12] ulong width,
+; [esp+16] __int64 xaccum,
+; [esp+24] __int64 x_inc);
+;
+; Stretches one row of 32-bit pixels horizontally with 2-tap linear
+; interpolation: dst[i] = src[p]*(256-f) + src[p+1]*f >> 8, where the
+; source position p:f is 32:32 fixed point (xaccum), advanced by x_inc
+; per output pixel. mm4/mm6 carry the fractional dwords for the SIMD
+; blend while eax:esi + ebx:ecx track the same position in integer
+; registers (add/adc carries the integer step into the pixel index).
+;
+ global _vdasm_resize_interp_row_run_MMX
+_vdasm_resize_interp_row_run_MMX:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov esi, [esp+8+16]
+ mov edi, [esp+4+16]
+ mov ebp, [esp+12+16]
+
+ movd mm4, dword [esp+16+16]
+ pxor mm7, mm7
+ movd mm6, dword [esp+24+16]
+ punpckldq mm4, mm4
+ punpckldq mm6, mm6
+
+ shr esi, 2 ;esi = src/4 (pixel units); [esi*4] re-forms the address below
+
+ mov eax, [esp+16+16]
+ mov ebx, [esp+20+16]
+ add esi, ebx ;fold integer part of xaccum into the pixel index
+ mov ebx, [esp+24+16]
+ mov ecx, [esp+28+16]
+
+ shl ebp,2
+ add edi,ebp
+ neg ebp
+
+.colloop:
+ movd mm1, dword [esi*4+4]
+ movq mm5, mm4
+
+ movd mm0, dword [esi*4]
+ punpcklbw mm1, mm7
+
+ punpcklbw mm0, mm7
+ psrld mm5, 24 ;mm5 = blend fraction f (0..255)
+
+ movq mm3, [x0100010001000100]
+ packssdw mm5, mm5
+
+ pmullw mm1, mm5
+ psubw mm3, mm5 ;mm3 = 256 - f per word
+
+ pmullw mm0, mm3
+ paddd mm4, mm6
+
+ ;stall
+ ;stall
+
+ ;stall
+ ;stall
+
+ paddw mm0, mm1
+
+ psrlw mm0, 8
+ add eax, ebx
+
+ adc esi, ecx ;carry integer part of the x step into the source index
+ packuswb mm0, mm0
+
+ movd dword [edi+ebp],mm0
+
+ add ebp, 4
+ jnz .colloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+
+
+;**************************************************************************
+
+;vdasm_resize_interp_col_run_MMX(
+; [esp+ 4] void *dst,
+; [esp+ 8] void *src1,
+; [esp+12] void *src2,
+; [esp+16] ulong width,
+; [esp+20] ulong yaccum);
+
+; Blends two rows vertically: dst = src1*(256-f) + src2*f >> 8, with
+; f = yaccum>>8 (0..255). mm4 is prepared so each dword holds the word
+; pair (256-f, f); punpcklwd interleaves matching samples of the two
+; rows so one pmaddwd per register half performs the weighted sum.
+
+ global _vdasm_resize_interp_col_run_MMX
+_vdasm_resize_interp_col_run_MMX:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov esi, [esp+8+16]
+ mov edx, [esp+12+16]
+ mov edi, [esp+4+16]
+ mov ebp, [esp+16+16]
+
+ movd mm4, dword [esp+20+16]
+ pxor mm7, mm7
+ punpcklwd mm4, mm4
+ punpckldq mm4, mm4
+ psrlw mm4, 8 ;each word = f (0..255)
+ pxor mm4, [x0000FFFF0000FFFF] ;low word of each dword -> 0xFFFF - f
+ paddw mm4, [x0000010100000101] ;-> (256-f, f) word pairs
+
+ shl ebp, 2
+ add edi, ebp
+ add esi, ebp
+ add edx, ebp
+ neg ebp
+
+.colloop:
+ movd mm0, dword [esi+ebp]
+ movd mm2, dword [edx+ebp]
+
+ punpcklbw mm0, mm7
+ punpcklbw mm2, mm7
+
+ movq mm1, mm0
+ punpcklwd mm0, mm2
+ punpckhwd mm1, mm2
+
+ pmaddwd mm0, mm4
+ pmaddwd mm1, mm4
+
+ psrad mm0, 8
+ psrad mm1, 8
+
+ packssdw mm0, mm1
+ packuswb mm0, mm0
+
+ movd dword [edi+ebp],mm0
+
+ add ebp, 4
+ jnz .colloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+
+;--------------------------------------------------------------------------
+;vdasm_resize_ccint_row_MMX(dst, src, count, xaccum, xinc, tbl);
+;
+; Horizontal 4-tap ("ccint" = cubic interpolation) resampler. For each
+; output pixel the top 8 bits of the x fraction select a 16-byte entry
+; in tbl (two qwords of signed word coefficients); the pixel is the
+; pmaddwd dot product of 4 adjacent source pixels, biased by 0x2000 and
+; arithmetically shifted right by 14.
+;
+; Register budget is so tight that ESP is repurposed as the integer x
+; increment inside the loop: the real stack pointer is parked in a
+; temporary SEH frame at [fs:0] so the stack remains recoverable if a
+; fault occurs. No push/pop may be inserted between the frame setup
+; and teardown.
+
+ global _vdasm_resize_ccint_row_MMX
+_vdasm_resize_ccint_row_MMX:
+ push ebx
+ push esi
+ push edi
+ push ebp
+
+ mov ebx, [esp+4+16] ;ebx = dest addr
+ mov ecx, [esp+12+16] ;ecx = count
+
+ mov ebp, [esp+20+16] ;ebp = increment
+ mov edi, ebp ;edi = increment
+ shl ebp, 16 ;ebp = fractional increment
+ mov esi, [esp+16+16] ;esi = 16:16 position
+ sar edi, 16 ;edi = integer increment
+ mov [esp+20+16], ebp ;xinc = fractional increment
+ mov ebp, esi ;ebp = 16:16 position
+ shr esi, 16 ;esi = integer position
+ shl ebp, 16 ;ebp = fraction
+ mov [esp+16+16], ebp ;xaccum = fraction
+
+ mov eax, [esp+8+16]
+
+ shr ebp, 24 ;ebp = fraction (0...255)
+ mov [esp+8+16], edi
+ shl ebp, 4 ;ebp = fraction*16
+ mov edi, ebp
+ mov ebp, [esp+4+16] ;ebp = destination
+
+ shr eax, 2
+ add eax, esi
+ shl ecx, 2 ;ecx = count*4
+ lea ebp, [ebp+ecx-4]
+ neg ecx ;ecx = -count*4
+
+ movq mm6, [x0000200000002000]
+ pxor mm7, mm7
+
+ mov edx,[esp+16+16] ;edx = fractional accumulator
+ mov esi,[esp+20+16] ;esi = fractional increment
+
+ mov ebx,[esp+24+16] ;ebx = coefficient pointer
+
+ movd mm0,dword [eax*4]
+ movd mm1,dword [eax*4+4]
+ punpcklbw mm0,mm7 ;mm0 = [a1][r1][g1][b1]
+
+ ;borrow stack pointer
+ push 0 ;don't crash
+ push dword [fs:0] ;link a temporary SEH record so esp can be reused
+ mov dword [fs:0], esp
+ mov esp, [esp+8+24] ;esp = integer increment
+ jmp short ccint_loop_MMX_start
+
+ ;EAX source pointer / 4
+ ;EBX coefficient pointer
+ ;ECX count
+ ;EDX fractional accumulator
+ ;ESI fractional increment
+ ;EDI coefficient offset
+ ;ESP integer increment
+ ;EBP destination pointer
+
+ align 16
+ccint_loop_MMX:
+ movd mm0,dword [eax*4]
+ packuswb mm2,mm2 ;mm0 = [a][r][g][b][a][r][g][b]
+
+ movd mm1,dword [eax*4+4]
+ punpcklbw mm0,mm7 ;mm0 = [a1][r1][g1][b1]
+
+ movd dword [ebp+ecx],mm2
+ccint_loop_MMX_start:
+ movq mm4,mm0 ;mm0 = [a1][r1][g1][b1]
+
+ movd mm2,dword [eax*4+8]
+ punpcklbw mm1,mm7 ;mm1 = [a2][r2][g2][b2]
+
+ movd mm3,dword [eax*4+12]
+ punpcklbw mm2,mm7 ;mm2 = [a3][r3][g3][b3]
+
+ punpcklbw mm3,mm7 ;mm3 = [a4][r4][g4][b4]
+ movq mm5,mm2 ;mm2 = [a3][r3][g3][b3]
+
+ add edx,esi ;add fractional increment
+ punpcklwd mm0,mm1 ;mm0 = [g2][g1][b2][b1]
+
+ pmaddwd mm0,[ebx+edi]
+ punpcklwd mm2,mm3 ;mm2 = [g4][g3][b4][b3]
+
+ pmaddwd mm2,[ebx+edi+8]
+ punpckhwd mm4,mm1 ;mm4 = [a2][a1][r2][r1]
+
+ pmaddwd mm4,[ebx+edi]
+ punpckhwd mm5,mm3 ;mm5 = [a4][a3][b4][b3]
+
+ pmaddwd mm5,[ebx+edi+8]
+ paddd mm0,mm6
+
+ adc eax,esp ;add integer increment and fractional bump to offset
+ mov edi,0ff000000h
+
+ paddd mm2,mm0 ;mm0 = [ g ][ b ]
+ paddd mm4,mm6
+
+ psrad mm2,14
+ paddd mm4,mm5 ;mm4 = [ a ][ r ]
+
+ and edi,edx
+ psrad mm4,14
+
+ shr edi,20 ;edi = fraction (0...255)*16
+ add ecx,4
+
+ packssdw mm2,mm4 ;mm0 = [ a ][ r ][ g ][ b ]
+ jnc ccint_loop_MMX
+
+ packuswb mm2,mm2 ;mm0 = [a][r][g][b][a][r][g][b]
+ movd dword [ebp],mm2
+
+ mov esp, dword [fs:0] ;restore the real stack pointer from the SEH frame
+ pop dword [fs:0]
+ pop eax
+
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+
+;--------------------------------------------------------------------------
+;vdasm_resize_ccint_col_MMX(dst, src1, src2, src3, src4, count, tbl);
+
+; Vertical 4-tap filter: combines four source rows into one output row.
+; tbl supplies the coefficients: mm4 = taps for rows 1/2, mm5 = taps
+; for rows 3/4. 14-bit fixed point with 0x2000 rounding bias in mm6.
+; The loop is software-pipelined: the pack/store of each pixel overlaps
+; the loads of the next, hence the .entry jump into the loop middle.
+
+ global _vdasm_resize_ccint_col_MMX
+_vdasm_resize_ccint_col_MMX:
+ push ebx
+ push esi
+ push edi
+ push ebp
+
+ mov ebp, [esp+4+16] ;ebp = dest addr
+ mov esi, [esp+24+16] ;esi = count
+ add esi, esi
+ add esi, esi
+
+ mov eax, [esp+8+16] ;eax = row 1
+ mov ebx, [esp+12+16] ;ebx = row 2
+ mov ecx, [esp+16+16] ;ecx = row 3
+ mov edx, [esp+20+16] ;edx = row 4
+ mov edi, [esp+28+16] ;edi = coefficient ptr
+
+ add eax, esi
+ add ebx, esi
+ add ecx, esi
+ add edx, esi
+ add ebp, esi
+ neg esi
+
+ movq mm4,[edi]
+ movq mm5,[edi+8]
+ movq mm6,[x0000200000002000]
+ pxor mm7,mm7
+
+ movd mm2,dword [eax+esi]
+ movd mm1,dword [ebx+esi] ;mm1 = pixel1
+ punpcklbw mm2,mm7
+ jmp short ccint_col_loop_MMX.entry
+
+ align 16
+ccint_col_loop_MMX:
+ movd mm2,dword [eax+esi] ;mm2 = pixel0
+ packuswb mm0,mm0
+
+ movd mm1,dword [ebx+esi] ;mm1 = pixel1
+ pxor mm7,mm7 ;mm7 was clobbered as a temp below; rezero it
+
+ movd dword [ebp+esi-4],mm0
+ punpcklbw mm2,mm7
+
+ccint_col_loop_MMX.entry:
+ punpcklbw mm1,mm7
+ movq mm0,mm2
+
+ movd mm3,dword [edx+esi] ;mm3 = pixel3
+ punpcklwd mm0,mm1 ;mm0 = [g1][g0][b1][b0]
+
+ pmaddwd mm0,mm4
+ punpckhwd mm2,mm1 ;mm2 = [a1][a0][r1][r0]
+
+ movd mm1,dword [ecx+esi] ;mm1 = pixel2
+ punpcklbw mm3,mm7
+
+ pmaddwd mm2,mm4
+ punpcklbw mm1,mm7
+
+ movq mm7,mm1
+ punpcklwd mm1,mm3 ;mm1 = [g3][g2][b3][b2]
+
+ punpckhwd mm7,mm3 ;mm7 = [a3][a2][r3][r2]
+ pmaddwd mm1,mm5
+
+ pmaddwd mm7,mm5
+ paddd mm0,mm6
+
+ paddd mm2,mm6
+ paddd mm0,mm1
+
+ paddd mm2,mm7
+ psrad mm0,14
+
+ psrad mm2,14
+ add esi,4
+
+ packssdw mm0,mm2
+ jne ccint_col_loop_MMX
+
+ packuswb mm0,mm0
+ movd dword [ebp-4],mm0
+
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+
+;--------------------------------------------------------------------------
+;vdasm_resize_ccint_col_SSE2(dst, src1, src2, src3, src4, count, tbl);
+;
+; Vertical 4-tap filter, SSE2 version: processes two pixels per
+; iteration, with a single-pixel MMX tail for odd widths. tbl supplies
+; the coefficients (taps for rows 1/2 -> xmm4, rows 3/4 -> xmm5);
+; 14-bit fixed point with 0x2000 rounding bias in xmm6. Stores are
+; non-temporal (movntq/movnti); no emms/sfence is issued here --
+; presumably the caller handles that (verify at call sites).
+;
+; Fix: the coefficient/bias registers (xmm4..xmm7) are loaded BEFORE
+; the early branch to the odd-pixel tail. Previously a call with
+; count == 1 jumped straight to ccint_col_SSE2_odd and read
+; xmm4/xmm5/xmm6 uninitialized via movdq2q.
+
+ global _vdasm_resize_ccint_col_SSE2
+_vdasm_resize_ccint_col_SSE2:
+ push ebx
+ push esi
+ push edi
+ push ebp
+
+ mov ebp,[esp + 4 + 16] ;ebp = dest addr
+ mov esi,[esp + 24 + 16] ;esi = count
+ add esi,esi
+ add esi,esi
+
+ mov eax,[esp + 8 + 16] ;eax = row 1
+ mov ebx,[esp + 12 + 16] ;ebx = row 2
+ mov ecx,[esp + 16 + 16] ;ecx = row 3
+ mov edx,[esp + 20 + 16] ;edx = row 4
+ mov edi,[esp + 28 + 16] ;edi = coefficient ptr
+
+ neg esi
+
+ movq xmm4,qword [edi] ;taps for rows 1/2
+ movq xmm5,qword [edi+8] ;taps for rows 3/4
+ punpcklqdq xmm4,xmm4
+ punpcklqdq xmm5,xmm5
+ movq xmm6,[x0000200000002000] ;rounding bias
+ punpcklqdq xmm6,xmm6
+ pxor xmm7,xmm7
+
+ add esi,4
+ jz ccint_col_SSE2_odd ;count == 1: go straight to the scalar tail
+
+ccint_col_loop_SSE2:
+ movq xmm0, qword [eax]
+ add eax, 8
+ movq xmm1, qword [ebx]
+ add ebx, 8
+ movq xmm2, qword [ecx]
+ add ecx, 8
+ movq xmm3, qword [edx]
+ add edx, 8
+ punpcklbw xmm0,xmm1
+ punpcklbw xmm2,xmm3
+ movdqa xmm1,xmm0
+ movdqa xmm3,xmm2
+ punpcklbw xmm0,xmm7
+ punpckhbw xmm1,xmm7
+ punpcklbw xmm2,xmm7
+ punpckhbw xmm3,xmm7
+ pmaddwd xmm0,xmm4
+ pmaddwd xmm1,xmm4
+ pmaddwd xmm2,xmm5
+ pmaddwd xmm3,xmm5
+ paddd xmm0,xmm6
+ paddd xmm1,xmm6
+ paddd xmm0,xmm2
+ paddd xmm1,xmm3
+ psrad xmm0,14
+ psrad xmm1,14
+ packssdw xmm0,xmm1
+ packuswb xmm0,xmm0
+ movdq2q mm0,xmm0
+ movntq [ebp],mm0 ;non-temporal: output row is not re-read soon
+ add ebp,8
+ add esi,8
+ jnc ccint_col_loop_SSE2
+ jnz ccint_col_SSE2_noodd
+ccint_col_SSE2_odd:
+ ;single-pixel tail: same math in MMX registers
+ movd mm0, dword [eax]
+ pxor mm7,mm7
+ movd mm1, dword [ebx]
+ movdq2q mm4,xmm4
+ movd mm2, dword [ecx]
+ movdq2q mm5,xmm5
+ movd mm3, dword [edx]
+ movdq2q mm6,xmm6
+ punpcklbw mm0,mm1
+ punpcklbw mm2,mm3
+ movq mm1,mm0
+ movq mm3,mm2
+ punpcklbw mm0,mm7
+ punpckhbw mm1,mm7
+ punpcklbw mm2,mm7
+ punpckhbw mm3,mm7
+ pmaddwd mm0,mm4
+ pmaddwd mm1,mm4
+ pmaddwd mm2,mm5
+ pmaddwd mm3,mm5
+ paddd mm0,mm6
+ paddd mm2,mm6
+ paddd mm0,mm2
+ paddd mm1,mm3
+ psrad mm0,14
+ psrad mm1,14
+ packssdw mm0,mm1
+ packuswb mm0,mm0
+ movd eax,mm0
+ movnti [ebp],eax
+
+ccint_col_SSE2_noodd:
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+
+
+
+;-------------------------------------------------------------------------
+;
+; long resize_table_row_MMX(Pixel *out, Pixel *in, int *filter, int filter_width, PixDim w, long accum, long frac);
+
+; Horizontal table-driven filter. Per output pixel, the fixed-point
+; accumulator in eax supplies both the integer source position
+; (esi = (accum>>14) & ~3, a byte offset rounded to a pixel pair) and
+; the filter phase (bits of edx) used to select a coefficient row in
+; 'filter'. The pixel is an N-tap pmaddwd dot product, biased by
+; MMX_roundval and >>14. Dedicated fast paths exist for filter_width
+; 4, 6 and 8; the generic path iterates coefficient pairs two at a
+; time and finishes an odd pair count with a scalar final pair.
+
+ .code
+;NOTE(review): '.code' looks like a stray MASM-style directive in this
+;otherwise NASM-syntax file -- verify it assembles as intended.
+
+ global _vdasm_resize_table_row_MMX
+_vdasm_resize_table_row_MMX:
+ push ebp
+ push esi
+ push edi
+ push ebx
+
+ cmp dword [esp+16+16], 4
+ jz .accel_4coeff
+ cmp dword [esp+16+16], 6
+ jz .accel_6coeff
+ cmp dword [esp+16+16], 8
+ jz .accel_8coeff
+
+ mov eax,[esp + 24 + 16]
+ mov ebp,[esp + 20 + 16]
+ mov ebx,[esp + 8 + 16]
+ mov edi,[esp + 4 + 16]
+
+ mov esi,eax
+ mov edx,eax
+
+ pxor mm5,mm5
+
+ mov ecx,[esp + 16 + 16]
+ shr ecx,1 ;ecx = coefficient pair count
+ mov [esp+16+16],ecx
+ test ecx,1
+ jnz .pixelloop_odd_pairs
+
+.pixelloop_even_pairs:
+ shr esi,14
+ and edx,0000ff00h
+ and esi,byte -4
+
+ mov ecx,[esp + 16 + 16]
+ shr edx,5
+ add esi,ebx
+ imul edx,ecx
+ add eax,[esp + 28 + 16]
+ add edx,[esp + 12 + 16]
+
+ movq mm6,[MMX_roundval]
+ pxor mm3,mm3
+ movq mm7,mm6
+ pxor mm2,mm2
+
+.coeffloop_unaligned_even_pairs:
+ movd mm0,dword [esi+0]
+ paddd mm7,mm2 ;accumulate alpha/red (pixels 2/3)
+
+ punpcklbw mm0,[esi+4] ;mm1=[a0][a1][r0][r1][g0][g1][b0][b1]
+ paddd mm6,mm3 ;accumulate green/blue (pixels 2/3)
+
+ movd mm2,dword [esi+8]
+ movq mm1,mm0 ;mm0=[a0][a1][r0][r1][g0][g1][b0][b1]
+
+ punpcklbw mm2,[esi+12] ;mm2=[a2][a3][r2][r3][g2][g3][b2][b3]
+
+ punpckhbw mm0,mm5 ;mm0=[ a0 ][ a1 ][ r0 ][ r1 ]
+ movq mm3,mm2 ;mm3=[a2][a3][r2][r3][g2][g3][b2][b3]
+
+ pmaddwd mm0,[edx] ;mm0=[a0*f0+a1*f1][r0*f0+r1*f1]
+ punpcklbw mm1,mm5 ;mm1=[ g0 ][ g1 ][ b0 ][ b1 ]
+
+ pmaddwd mm1,[edx] ;mm1=[g0*f0+g1*f1][b0*f0+b1*f1]
+ punpckhbw mm2,mm5 ;mm2=[ a2 ][ a3 ][ r0 ][ r1 ]
+
+ pmaddwd mm2,[edx+8] ;mm2=[a2*f2+a3*f3][r2*f2+r3*f3]
+ punpcklbw mm3,mm5 ;mm3=[ g2 ][ g3 ][ b2 ][ b3 ]
+
+ pmaddwd mm3,[edx+8] ;mm3=[g2*f2+g3*f3][b2*f2+b3*f3]
+ paddd mm7,mm0 ;accumulate alpha/red (pixels 0/1)
+
+ paddd mm6,mm1 ;accumulate green/blue (pixels 0/1)
+ add edx,16
+
+ add esi,16
+ sub ecx,2
+
+ jne .coeffloop_unaligned_even_pairs
+
+ paddd mm7,mm2 ;accumulate alpha/red (pixels 2/3)
+ paddd mm6,mm3 ;accumulate green/blue (pixels 2/3)
+
+ psrad mm7,14
+ psrad mm6,14
+
+ packssdw mm6,mm7
+ add edi,4
+
+ packuswb mm6,mm6
+ sub ebp,1
+
+ mov esi,eax
+ mov edx,eax
+
+ movd dword [edi-4],mm6
+ jne .pixelloop_even_pairs
+
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+
+ ret
+
+;----------------------------------------------------------------
+
+.pixelloop_odd_pairs:
+ shr esi,14
+ and edx,0000ff00h
+ and esi,byte -4
+
+ mov ecx,[esp + 16 + 16]
+ shr edx,5
+ add esi,ebx
+ imul edx,ecx
+ add eax,[esp + 28 + 16]
+ sub ecx,1 ;loop below handles pairs in twos; last pair done after it
+ add edx,[esp + 12 + 16]
+
+ movq mm6,[MMX_roundval]
+ pxor mm3,mm3
+ pxor mm2,mm2
+ movq mm7,mm6
+
+.coeffloop_unaligned_odd_pairs:
+ movd mm0,dword [esi+0]
+ paddd mm7,mm2 ;accumulate alpha/red (pixels 2/3)
+
+ punpcklbw mm0,[esi+4] ;mm1=[a0][a1][r0][r1][g0][g1][b0][b1]
+ paddd mm6,mm3 ;accumulate green/blue (pixels 2/3)
+
+ movd mm2,dword [esi+8]
+ movq mm1,mm0 ;mm0=[a0][a1][r0][r1][g0][g1][b0][b1]
+
+ punpcklbw mm2,[esi+12] ;mm2=[a2][a3][r2][r3][g2][g3][b2][b3]
+
+ punpckhbw mm0,mm5 ;mm0=[ a0 ][ a1 ][ r0 ][ r1 ]
+ movq mm3,mm2 ;mm3=[a2][a3][r2][r3][g2][g3][b2][b3]
+
+ pmaddwd mm0,[edx] ;mm0=[a0*f0+a1*f1][r0*f0+r1*f1]
+ punpcklbw mm1,mm5 ;mm1=[ g0 ][ g1 ][ b0 ][ b1 ]
+
+ pmaddwd mm1,[edx] ;mm1=[g0*f0+g1*f1][b0*f0+b1*f1]
+ punpckhbw mm2,mm5 ;mm2=[ a2 ][ a3 ][ r0 ][ r1 ]
+
+ pmaddwd mm2,[edx+8] ;mm2=[a2*f2+a3*f3][r2*f2+r3*f3]
+ punpcklbw mm3,mm5 ;mm3=[ g2 ][ g3 ][ b2 ][ b3 ]
+
+ pmaddwd mm3,[edx+8] ;mm3=[g2*f2+g3*f3][b2*f2+b3*f3]
+ paddd mm7,mm0 ;accumulate alpha/red (pixels 0/1)
+
+ paddd mm6,mm1 ;accumulate green/blue (pixels 0/1)
+ add edx,16
+
+ add esi,16
+ sub ecx,2
+
+ jne .coeffloop_unaligned_odd_pairs
+
+ paddd mm7,mm2 ;accumulate alpha/red (pixels 2/3)
+ paddd mm6,mm3 ;accumulate green/blue (pixels 2/3)
+
+ ;finish up odd pair
+
+ movd mm0,dword [esi] ;mm0 = [x1][r1][g1][b1]
+ punpcklbw mm0,[esi+4] ;mm2 = [x0][x1][r0][r1][g0][g1][b0][b1]
+ movq mm1,mm0
+ punpcklbw mm0,mm5 ;mm0 = [g0][g1][b0][b1]
+ punpckhbw mm1,mm5 ;mm1 = [x0][x1][r0][r1]
+
+ pmaddwd mm0,[edx]
+ pmaddwd mm1,[edx]
+
+ paddd mm6,mm0
+ paddd mm7,mm1
+
+ ;combine into pixel
+
+ psrad mm6,14
+
+ psrad mm7,14
+
+ packssdw mm6,mm7
+ add edi,4
+
+ packuswb mm6,mm6
+ sub ebp,1
+
+ mov esi,eax
+ mov edx,eax
+
+ movd dword [edi-4],mm6
+ jne .pixelloop_odd_pairs
+
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+
+ ret
+
+;----------------------------------------------------------------
+
+.accel_4coeff:
+ mov eax,[esp + 24 + 16]
+ mov ebp,[esp + 20 + 16]
+ add ebp,ebp
+ add ebp,ebp
+ mov ebx,[esp + 8 + 16]
+ mov edi,[esp + 4 + 16]
+ add edi,ebp
+ neg ebp
+
+ mov esi,eax
+ mov edx,eax
+
+ movq mm4,[MMX_roundval]
+ pxor mm5,mm5
+
+ mov ecx,[esp+12+16]
+
+.pixelloop_4coeff:
+ shr esi,14
+ and edx,0000ff00h
+ and esi,byte -4
+
+ shr edx,4
+ add esi,ebx
+ add eax,[esp+28+16]
+ add edx,ecx
+
+ movd mm0,dword [esi+0]
+ movd mm2,dword [esi+8]
+ punpcklbw mm0,[esi+4] ;mm0=[a0][a1][r0][r1][g0][g1][b0][b1]
+
+ movq mm1,mm0 ;mm1=[a0][a1][r0][r1][g0][g1][b0][b1]
+
+ punpckhbw mm0,mm5 ;mm0=[ a0 ][ a1 ][ r0 ][ r1 ]
+
+ pmaddwd mm0,[edx] ;mm0=[a0*f0+a1*f1][r0*f0+r1*f1]
+ punpcklbw mm2,[esi+12] ;mm2=[a2][a3][r2][r3][g2][g3][b2][b3]
+
+ movq mm3,mm2 ;mm3=[a2][a3][r2][r3][g2][g3][b2][b3]
+ punpcklbw mm1,mm5 ;mm1=[ g0 ][ g1 ][ b0 ][ b1 ]
+
+ pmaddwd mm1,[edx] ;mm1=[g0*f0+g1*f1][b0*f0+b1*f1]
+ punpckhbw mm2,mm5 ;mm2=[ a2 ][ a3 ][ r0 ][ r1 ]
+
+ pmaddwd mm2,[edx+8] ;mm2=[a2*f2+a3*f3][r2*f2+r3*f3]
+ punpcklbw mm3,mm5 ;mm3=[ g2 ][ g3 ][ b2 ][ b3 ]
+
+ pmaddwd mm3,[edx+8] ;mm3=[g2*f2+g3*f3][b2*f2+b3*f3]
+ paddd mm0,mm4 ;accumulate alpha/red (pixels 0/1)
+
+ paddd mm1,mm4 ;accumulate green/blue (pixels 0/1)
+
+ paddd mm0,mm2 ;accumulate alpha/red (pixels 2/3)
+ paddd mm1,mm3 ;accumulate green/blue (pixels 2/3)
+
+ psrad mm0,14
+ psrad mm1,14
+
+ packssdw mm1,mm0
+ mov esi,eax
+
+ packuswb mm1,mm1
+ mov edx,eax
+
+ movd dword [edi+ebp],mm1
+ add ebp,4
+ jne .pixelloop_4coeff
+
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+
+ ret
+
+
+;----------------------------------------------------------------
+
+.accel_6coeff:
+ mov eax,[esp + 24 + 16]
+ mov ebp,[esp + 20 + 16]
+ add ebp,ebp
+ add ebp,ebp
+ mov ebx,[esp + 8 + 16]
+ mov edi,[esp + 4 + 16]
+ add edi,ebp
+ neg ebp
+
+ mov esi,eax
+ mov edx,eax
+
+ movq mm4,[MMX_roundval]
+ pxor mm5,mm5
+
+ mov ecx,[esp+12+16]
+
+.pixelloop_6coeff:
+ shr esi,14
+ and edx,0000ff00h
+ and esi,byte -4
+
+ shr edx,5
+ lea edx,[edx+edx*2] ;phase * 24 bytes per 6-coefficient row
+ add esi,ebx
+ add eax,[esp+28+16]
+ add edx,ecx
+
+ movd mm0,dword [esi+0]
+ movd mm2,dword [esi+8]
+ punpcklbw mm0,[esi+4] ;mm0=[a0][a1][r0][r1][g0][g1][b0][b1]
+
+ movq mm1,mm0 ;mm1=[a0][a1][r0][r1][g0][g1][b0][b1]
+
+ punpckhbw mm0,mm5 ;mm0=[ a0 ][ a1 ][ r0 ][ r1 ]
+
+ pmaddwd mm0,[edx] ;mm0=[a0*f0+a1*f1][r0*f0+r1*f1]
+ punpcklbw mm2,[esi+12] ;mm2=[a2][a3][r2][r3][g2][g3][b2][b3]
+
+ movq mm3,mm2 ;mm3=[a2][a3][r2][r3][g2][g3][b2][b3]
+ punpcklbw mm1,mm5 ;mm1=[ g0 ][ g1 ][ b0 ][ b1 ]
+
+ pmaddwd mm1,[edx] ;mm1=[g0*f0+g1*f1][b0*f0+b1*f1]
+ punpckhbw mm2,mm5 ;mm2=[ a2 ][ a3 ][ r0 ][ r1 ]
+
+ pmaddwd mm2,[edx+8] ;mm2=[a2*f2+a3*f3][r2*f2+r3*f3]
+ punpcklbw mm3,mm5 ;mm3=[ g2 ][ g3 ][ b2 ][ b3 ]
+
+ pmaddwd mm3,[edx+8] ;mm3=[g2*f2+g3*f3][b2*f2+b3*f3]
+ paddd mm0,mm4 ;accumulate alpha/red (pixels 0/1)
+
+ paddd mm1,mm4 ;accumulate green/blue (pixels 0/1)
+
+ paddd mm0,mm2 ;accumulate alpha/red (pixels 2/3)
+ paddd mm1,mm3 ;accumulate green/blue (pixels 2/3)
+
+ movd mm6,dword [esi+16]
+
+ punpcklbw mm6,[esi+20] ;mm1=[a0][a1][r0][r1][g0][g1][b0][b1]
+
+ movq mm7,mm6 ;mm0=[a0][a1][r0][r1][g0][g1][b0][b1]
+
+ punpckhbw mm6,mm5 ;mm0=[ a0 ][ a1 ][ r0 ][ r1 ]
+
+ pmaddwd mm6,[edx+16] ;mm0=[a0*f0+a1*f1][r0*f0+r1*f1]
+ punpcklbw mm7,mm5 ;mm1=[ g0 ][ g1 ][ b0 ][ b1 ]
+
+ pmaddwd mm7,[edx+16] ;mm1=[g0*f0+g1*f1][b0*f0+b1*f1]
+ paddd mm0,mm6 ;accumulate alpha/red (pixels 4/5)
+
+ paddd mm1,mm7 ;accumulate green/blue (pixels 4/5)
+
+
+ psrad mm0,14
+ psrad mm1,14
+
+ packssdw mm1,mm0
+ mov esi,eax
+
+ packuswb mm1,mm1
+ mov edx,eax
+
+ movd dword [edi+ebp],mm1
+ add ebp,4
+ jne .pixelloop_6coeff
+
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+
+ ret
+
+;----------------------------------------------------------------
+
+.accel_8coeff:
+ mov eax,[esp + 24 + 16]
+ mov ebp,[esp + 20 + 16]
+ add ebp,ebp
+ add ebp,ebp
+ mov ebx,[esp + 8 + 16]
+ mov edi,[esp + 4 + 16]
+ add edi,ebp
+ neg ebp
+
+ mov esi,eax
+ mov edx,eax
+
+ movq mm4,[MMX_roundval]
+ pxor mm5,mm5
+
+ mov ecx,[esp+12+16]
+
+.pixelloop_8coeff:
+ shr esi,14
+ and edx,0000ff00h
+ and esi,byte -4
+
+ shr edx,3
+ add esi,ebx
+ add eax,[esp+28+16]
+ add edx,ecx
+
+ movd mm0,dword [esi+0]
+ movd mm2,dword [esi+8]
+ punpcklbw mm0,[esi+4] ;mm0=[a0][a1][r0][r1][g0][g1][b0][b1]
+
+ movq mm1,mm0 ;mm1=[a0][a1][r0][r1][g0][g1][b0][b1]
+
+ punpckhbw mm0,mm5 ;mm0=[ a0 ][ a1 ][ r0 ][ r1 ]
+
+ pmaddwd mm0,[edx] ;mm0=[a0*f0+a1*f1][r0*f0+r1*f1]
+ punpcklbw mm2,[esi+12] ;mm2=[a2][a3][r2][r3][g2][g3][b2][b3]
+
+ movq mm3,mm2 ;mm3=[a2][a3][r2][r3][g2][g3][b2][b3]
+ punpcklbw mm1,mm5 ;mm1=[ g0 ][ g1 ][ b0 ][ b1 ]
+
+ pmaddwd mm1,[edx] ;mm1=[g0*f0+g1*f1][b0*f0+b1*f1]
+ punpckhbw mm2,mm5 ;mm2=[ a2 ][ a3 ][ r0 ][ r1 ]
+
+ pmaddwd mm2,[edx+8] ;mm2=[a2*f2+a3*f3][r2*f2+r3*f3]
+ punpcklbw mm3,mm5 ;mm3=[ g2 ][ g3 ][ b2 ][ b3 ]
+
+ pmaddwd mm3,[edx+8] ;mm3=[g2*f2+g3*f3][b2*f2+b3*f3]
+ paddd mm0,mm4 ;accumulate alpha/red (pixels 0/1)
+
+ paddd mm1,mm4 ;accumulate green/blue (pixels 0/1)
+
+ paddd mm0,mm2 ;accumulate alpha/red (pixels 2/3)
+ paddd mm1,mm3 ;accumulate green/blue (pixels 2/3)
+
+
+ movd mm6,dword [esi+16]
+
+ punpcklbw mm6,[esi+20] ;mm1=[a0][a1][r0][r1][g0][g1][b0][b1]
+
+ movd mm2,dword [esi+24]
+
+ punpcklbw mm2,[esi+28] ;mm2=[a2][a3][r2][r3][g2][g3][b2][b3]
+ movq mm7,mm6 ;mm0=[a0][a1][r0][r1][g0][g1][b0][b1]
+
+ punpckhbw mm6,mm5 ;mm0=[ a0 ][ a1 ][ r0 ][ r1 ]
+ movq mm3,mm2 ;mm3=[a2][a3][r2][r3][g2][g3][b2][b3]
+
+ pmaddwd mm6,[edx+16] ;mm0=[a0*f0+a1*f1][r0*f0+r1*f1]
+ punpcklbw mm7,mm5 ;mm1=[ g0 ][ g1 ][ b0 ][ b1 ]
+
+ pmaddwd mm7,[edx+16] ;mm1=[g0*f0+g1*f1][b0*f0+b1*f1]
+ punpckhbw mm2,mm5 ;mm2=[ a2 ][ a3 ][ r0 ][ r1 ]
+
+ pmaddwd mm2,[edx+24] ;mm2=[a2*f2+a3*f3][r2*f2+r3*f3]
+ punpcklbw mm3,mm5 ;mm3=[ g2 ][ g3 ][ b2 ][ b3 ]
+
+ pmaddwd mm3,[edx+24] ;mm3=[g2*f2+g3*f3][b2*f2+b3*f3]
+ paddd mm0,mm6 ;accumulate alpha/red (pixels 4/5)
+
+ paddd mm1,mm7 ;accumulate green/blue (pixels 4/5)
+ paddd mm0,mm2 ;accumulate alpha/red (pixels 6/7)
+
+ paddd mm1,mm3 ;accumulate green/blue (pixels 6/7)
+
+
+ psrad mm0,14
+ psrad mm1,14
+
+ packssdw mm1,mm0
+ mov esi,eax
+
+ packuswb mm1,mm1
+ mov edx,eax
+
+ movd dword [edi+ebp],mm1
+ add ebp,4
+ jne .pixelloop_8coeff
+
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+
+ ret
+
+
+
+
+
+
+
+;-------------------------------------------------------------------------
+;
+; long resize_table_col_MMX(Pixel *out, Pixel **in_table, int *filter, int filter_width, PixDim w, long frac);
+
+; Vertical table-driven filter: in_table holds filter_width row
+; pointers, and each output pixel is the N-tap dot product of the same
+; x offset across those rows. frac*4*filter_width selects the
+; coefficient row up front. Fast paths for 4 and 6 taps; the 6-tap
+; path needs a sixth row-pointer register, so ESP is borrowed via a
+; temporary SEH frame at [fs:0] (no push/pop inside that loop).
+
+ global _vdasm_resize_table_col_MMX
+_vdasm_resize_table_col_MMX:
+ push ebp
+ push esi
+ push edi
+ push ebx
+
+ mov edx,[esp + 12 + 16]
+ mov eax,[esp + 24 + 16]
+ shl eax,2
+ imul eax,[esp + 16 + 16]
+ add edx,eax
+ mov [esp + 12 + 16], edx ;[esp+12+28] = filter pointer
+
+ mov ebp,[esp + 20 + 16] ;ebp = pixel counter
+ mov edi,[esp + 4 + 16] ;edi = destination pointer
+
+ pxor mm5,mm5
+
+ cmp dword [esp+16+16], 4
+ jz .accel_4coeff
+ cmp dword [esp+16+16], 6
+ jz .accel_6coeff
+
+ mov ecx,[esp + 16 + 16]
+ shr ecx,1
+ mov [esp + 16 + 16],ecx ;ecx = filter pair count
+
+ xor ebx,ebx ;ebx = source offset
+
+ mov ecx,[esp + 16 + 16] ;ecx = filter width counter
+.pixelloop:
+ mov eax,[esp + 8 + 16] ;esi = row pointer table
+ movq mm6,[MMX_roundval]
+ movq mm7,mm6
+ pxor mm0,mm0
+ pxor mm1,mm1
+.coeffloop:
+ mov esi,[eax]
+ paddd mm6,mm0
+
+ movd mm0,dword [esi+ebx] ;mm0 = [0][0][0][0][x0][r0][g0][b0]
+ paddd mm7,mm1
+
+ mov esi,[eax+4]
+ add eax,8
+
+ movd mm1,dword [esi+ebx] ;mm1 = [0][0][0][0][x1][r1][g1][b1]
+ punpcklbw mm0,mm1 ;mm0 = [x0][x1][r0][r1][g0][g1][b0][b1]
+
+ movq mm1,mm0
+ punpcklbw mm0,mm5 ;mm0 = [g1][g0][b1][b0]
+
+ pmaddwd mm0,[edx]
+ punpckhbw mm1,mm5 ;mm1 = [x1][x0][r1][r0]
+
+ pmaddwd mm1,[edx]
+ add edx,8
+
+ sub ecx,1
+ jne .coeffloop
+
+ paddd mm6,mm0
+ paddd mm7,mm1
+
+ psrad mm6,14
+ psrad mm7,14
+ add edi,4
+ packssdw mm6,mm7
+ add ebx,4
+ packuswb mm6,mm6
+ sub ebp,1
+
+ mov ecx,[esp + 16 + 16] ;ecx = filter width counter
+ mov edx,[esp + 12 + 16] ;edx = filter bank pointer
+
+ movd dword [edi-4],mm6
+ jne .pixelloop
+
+.xit:
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+ ret
+
+
+
+.accel_4coeff:
+ movq mm2,[edx]
+ movq mm3,[edx+8]
+
+ mov esi,[esp+8+16] ;esi = row pointer table
+ mov eax,[esi]
+ add ebp,ebp
+ mov ebx,[esi+4]
+ add ebp,ebp
+ mov ecx,[esi+8]
+ mov esi,[esi+12]
+ add eax,ebp
+ add ebx,ebp
+ add ecx,ebp
+ add esi,ebp
+ add edi,ebp
+ neg ebp
+
+ ;EAX source 0
+ ;EBX source 1
+ ;ECX source 2
+ ;ESI source 3
+ ;EDI destination
+ ;EBP counter
+
+ movq mm4,[MMX_roundval]
+
+.pixelloop4:
+ movd mm6,dword [eax+ebp] ;mm0 = [0][0][0][0][x0][r0][g0][b0]
+
+ punpcklbw mm6,[ebx+ebp] ;mm0 = [x0][x1][r0][r1][g0][g1][b0][b1]
+
+ movq mm7,mm6
+ punpcklbw mm6,mm5 ;mm0 = [g1][g0][b1][b0]
+
+ pmaddwd mm6,mm2
+ punpckhbw mm7,mm5 ;mm1 = [x1][x0][r1][r0]
+
+ movd mm0,dword [ecx+ebp] ;mm0 = [0][0][0][0][x0][r0][g0][b0]
+ pmaddwd mm7,mm2
+
+ punpcklbw mm0,[esi+ebp] ;mm0 = [x0][x1][r0][r1][g0][g1][b0][b1]
+ paddd mm6,mm4
+
+ movq mm1,mm0
+ punpcklbw mm0,mm5 ;mm0 = [g1][g0][b1][b0]
+
+ pmaddwd mm0,mm3
+ punpckhbw mm1,mm5 ;mm1 = [x1][x0][r1][r0]
+
+ pmaddwd mm1,mm3
+ paddd mm7,mm4
+
+ paddd mm6,mm0
+ paddd mm7,mm1
+
+ psrad mm6,14
+ psrad mm7,14
+ packssdw mm6,mm7
+ packuswb mm6,mm6
+
+ movd dword [edi+ebp],mm6
+
+ add ebp,4
+ jne .pixelloop4
+ jmp .xit
+
+.accel_6coeff:
+ movq mm2,[edx]
+ movq mm3,[edx+8]
+ movq mm4,[edx+16]
+
+ ;borrow ESP for a sixth row pointer; park the real stack pointer in a
+ ;temporary SEH frame at [fs:0] so faults remain recoverable
+ push 0
+ push dword [fs:0]
+ mov dword [fs:0],esp
+
+ mov esp,[esp+8+24] ;esp = row pointer table
+ mov eax,[esp]
+ add ebp,ebp
+ mov ebx,[esp+4]
+ add ebp,ebp
+ mov ecx,[esp+8]
+ mov edx,[esp+12]
+ mov esi,[esp+16]
+ mov esp,[esp+20]
+ add eax,ebp
+ add ebx,ebp
+ add ecx,ebp
+ add edx,ebp
+ add esi,ebp
+ add edi,ebp
+ add esp,ebp
+ neg ebp
+
+ ;EAX source 0
+ ;EBX source 1
+ ;ECX source 2
+ ;EDX source 3
+ ;ESI source 4
+ ;EDI destination
+ ;ESP source 5
+ ;EBP counter
+
+.pixelloop6:
+ movd mm6,dword [eax+ebp] ;mm0 = [0][0][0][0][x0][r0][g0][b0]
+
+ punpcklbw mm6,[ebx+ebp] ;mm0 = [x0][x1][r0][r1][g0][g1][b0][b1]
+
+ movq mm7,mm6
+ punpcklbw mm6,mm5 ;mm0 = [g1][g0][b1][b0]
+
+ movd mm0,dword [ecx+ebp] ;mm0 = [0][0][0][0][x0][r0][g0][b0]
+ punpckhbw mm7,mm5 ;mm1 = [x1][x0][r1][r0]
+
+ punpcklbw mm0,[edx+ebp] ;mm0 = [x0][x1][r0][r1][g0][g1][b0][b1]
+ pmaddwd mm6,mm2
+
+ movq mm1,mm0
+ punpcklbw mm0,mm5 ;mm0 = [g1][g0][b1][b0]
+
+ pmaddwd mm7,mm2
+ punpckhbw mm1,mm5 ;mm1 = [x1][x0][r1][r0]
+
+ paddd mm6,[MMX_roundval]
+ pmaddwd mm0,mm3
+
+ paddd mm7,[MMX_roundval]
+ pmaddwd mm1,mm3
+
+ paddd mm6,mm0
+
+ movd mm0,dword [esi+ebp] ;mm0 = [0][0][0][0][x0][r0][g0][b0]
+ paddd mm7,mm1
+
+ punpcklbw mm0,[esp+ebp] ;mm0 = [x0][x1][r0][r1][g0][g1][b0][b1]
+ movq mm1,mm0
+ punpcklbw mm0,mm5 ;mm0 = [g1][g0][b1][b0]
+ punpckhbw mm1,mm5 ;mm1 = [x1][x0][r1][r0]
+ pmaddwd mm0,mm4
+ pmaddwd mm1,mm4
+ paddd mm6,mm0
+ paddd mm7,mm1
+
+ psrad mm6,14
+ psrad mm7,14
+ packssdw mm6,mm7
+ packuswb mm6,mm6
+
+ movd dword [edi+ebp],mm6
+
+ add ebp,4
+ jne .pixelloop6
+
+ mov esp, dword [fs:0] ;restore the real stack pointer
+ pop dword [fs:0]
+ pop eax
+
+ jmp .xit
+
+
+; SSE2 vertical table filter: same arguments as the MMX version
+; (out, in_table, filter, filter_width, w, frac). The generic path
+; does one pixel per iteration with qword coefficient broadcasts; the
+; 4- and 6-tap fast paths do two pixels per iteration with a
+; single-pixel tail. The 6-tap path borrows ESP for a sixth row
+; pointer via a temporary SEH frame at [fs:0].
+ global _vdasm_resize_table_col_SSE2
+_vdasm_resize_table_col_SSE2:
+ push ebp
+ push esi
+ push edi
+ push ebx
+
+ mov edx,[esp+12+16]
+ mov eax,[esp+24+16]
+ shl eax,2
+ imul eax,[esp+16+16]
+ add edx,eax
+ mov [esp+12+16], edx ;[esp+12+16] = filter pointer
+
+ mov ebp,[esp+20+16] ;ebp = pixel counter
+ mov edi,[esp+4+16] ;edi = destination pointer
+
+ pxor xmm7, xmm7
+ movdqa xmm6, [MMX_roundval]
+
+ cmp dword [esp+16+16], 4
+ jz .accel_4coeff
+ cmp dword [esp+16+16], 6
+ jz .accel_6coeff
+
+ mov ecx,[esp+16+16]
+ shr ecx,1
+ mov [esp+16+16],ecx ;ecx = filter pair count
+
+ xor ebx,ebx ;ebx = source offset
+
+ mov ecx,[esp+16+16] ;ecx = filter width counter
+.pixelloop:
+ mov eax, [esp+8+16] ;esi = row pointer table
+ movdqa xmm4, xmm6
+.coeffloop:
+ mov esi,[eax]
+
+ movd xmm0, dword [esi+ebx]
+
+ mov esi,[eax+4]
+ add eax,8
+
+ movd xmm1, dword [esi+ebx]
+ punpcklbw xmm0, xmm1
+
+ punpcklbw xmm0, xmm7
+
+ movq xmm2, qword [edx]
+ pshufd xmm2, xmm2, 01000100b ;broadcast the coefficient pair
+
+ pmaddwd xmm0, xmm2
+
+ paddd xmm4, xmm0
+
+ add edx,8
+
+ sub ecx,1
+ jne .coeffloop
+
+ psrad xmm4,14
+ add edi,4
+ packssdw xmm4,xmm4
+ add ebx,4
+ packuswb xmm4,xmm4
+ sub ebp,1
+
+ mov ecx,[esp+16+16] ;ecx = filter width counter
+ mov edx,[esp+12+16] ;edx = filter bank pointer
+
+ movd dword [edi-4],xmm4
+ jne .pixelloop
+
+.xit:
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+ ret
+
+.accel_4coeff:
+ shl ebp, 2
+ mov eax, [esp+8+16] ;eax = row pointer table
+ mov esi, [eax+12]
+ mov ecx, [eax+8]
+ mov ebx, [eax+4]
+ mov eax, [eax]
+ lea edi, [edi+ebp-4]
+ neg ebp
+
+ ;registers:
+ ;
+ ;EAX source 0
+ ;EBX source 1
+ ;ECX source 2
+ ;ESI source 3
+ ;EDI destination
+ ;EBP counter
+ ;
+ movq xmm4, qword [edx] ;xmm4 = coeff 0/1
+ movq xmm5, qword [edx+8] ;xmm5 = coeff 2/3
+ punpcklqdq xmm4, xmm4
+ punpcklqdq xmm5, xmm5
+
+ add ebp, 4
+ jz .oddpixel_4coeff
+
+.pixelloop_4coeff_dualpel:
+ movq xmm0, qword [eax]
+ movq xmm1, qword [ebx]
+ movq xmm2, qword [ecx]
+ movq xmm3, qword [esi]
+ add eax,8
+ add ebx,8
+ add ecx,8
+ add esi,8
+ punpcklbw xmm0, xmm1
+ punpcklbw xmm2, xmm3
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ punpcklbw xmm0, xmm7
+ punpckhbw xmm1, xmm7
+ punpcklbw xmm2, xmm7
+ punpckhbw xmm3, xmm7
+ pmaddwd xmm0, xmm4
+ pmaddwd xmm1, xmm4
+ pmaddwd xmm2, xmm5
+ pmaddwd xmm3, xmm5
+ paddd xmm0, xmm2
+ paddd xmm1, xmm3
+ paddd xmm0, xmm6
+ paddd xmm1, xmm6
+ psrad xmm0, 14
+ psrad xmm1, 14
+ packssdw xmm0, xmm1
+ packuswb xmm0, xmm0
+ movq qword [edi+ebp],xmm0
+ add ebp, 8
+ jae .pixelloop_4coeff_dualpel
+ jnz .xit
+
+.oddpixel_4coeff:
+ movd xmm0, dword [eax]
+ movd xmm1, dword [ebx]
+ movd xmm2, dword [ecx]
+ movd xmm3, dword [esi]
+ punpcklbw xmm0, xmm1
+ punpcklbw xmm2, xmm3
+ punpcklbw xmm0, xmm7
+ punpcklbw xmm2, xmm7
+ pmaddwd xmm0, xmm4
+ pmaddwd xmm2, xmm5
+ paddd xmm0, xmm2
+ paddd xmm0, xmm6
+ psrad xmm0, 14
+ packssdw xmm0, xmm0
+ packuswb xmm0, xmm0
+ movd dword [edi],xmm0
+ jmp .xit
+
+
+.accel_6coeff:
+ movq xmm4, qword [edx] ;xmm4 = coeff 0/1
+ movq xmm5, qword [edx+8] ;xmm5 = coeff 2/3
+ movq xmm6, qword [edx+16] ;xmm6 = coeff 4/5
+ punpcklqdq xmm4, xmm4
+ punpcklqdq xmm5, xmm5
+ punpcklqdq xmm6, xmm6
+
+ ;borrow ESP for a sixth row pointer; real stack pointer parked in a
+ ;temporary SEH frame at [fs:0]
+ push 0
+ push dword [fs:0]
+ mov dword [fs:0],esp
+
+ shl ebp, 2
+ mov eax, [esp+8+24] ;eax = row pointer table
+ mov esp, [eax+20]
+ mov esi, [eax+16]
+ mov edx, [eax+12]
+ mov ecx, [eax+8]
+ mov ebx, [eax+4]
+ mov eax, [eax]
+ lea edi, [edi+ebp-4]
+ neg ebp
+
+ ;registers:
+ ;
+ ;EAX source 0
+ ;EBX source 1
+ ;ECX source 2
+ ;EDX source 3
+ ;ESI source 4
+ ;EDI destination
+ ;ESP source 5
+ ;EBP counter
+ ;
+
+ add ebp, 4
+ jz .oddpixel_6coeff
+
+.pixelloop_6coeff_dualpel:
+ movq xmm0, qword [eax]
+ movq xmm1, qword [ebx]
+ movq xmm2, qword [ecx]
+ movq xmm3, qword [edx]
+ add eax,8
+ add ebx,8
+ add ecx,8
+ add edx,8
+ punpcklbw xmm0, xmm1
+ punpcklbw xmm2, xmm3
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ punpcklbw xmm0, xmm7
+ punpckhbw xmm1, xmm7
+ punpcklbw xmm2, xmm7
+ punpckhbw xmm3, xmm7
+ pmaddwd xmm0, xmm4
+ pmaddwd xmm1, xmm4
+ pmaddwd xmm2, xmm5
+ pmaddwd xmm3, xmm5
+ paddd xmm0, xmm2
+ paddd xmm1, xmm3
+
+ movq xmm2, qword [esi]
+ movq xmm3, qword [esp]
+ add esi, 8
+ add esp, 8
+ punpcklbw xmm2, xmm3
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm7
+ punpckhbw xmm3, xmm7
+ pmaddwd xmm2, xmm6
+ pmaddwd xmm3, xmm6
+ paddd xmm0, xmm2
+ paddd xmm1, xmm3
+ paddd xmm0, [MMX_roundval]
+ paddd xmm1, [MMX_roundval]
+ psrad xmm0, 14
+ psrad xmm1, 14
+ packssdw xmm0, xmm1
+ packuswb xmm0, xmm0
+ movq qword [edi+ebp],xmm0
+ add ebp, 8
+ jae .pixelloop_6coeff_dualpel
+ jnz .xit_6coeff
+
+.oddpixel_6coeff:
+ movd xmm0, dword [eax]
+ movd xmm1, dword [ebx]
+ movd xmm2, dword [ecx]
+ movd xmm3, dword [edx]
+ punpcklbw xmm0, xmm1
+ punpcklbw xmm2, xmm3
+ movd xmm1, dword [esi]
+ movd xmm3, dword [esp]
+ punpcklbw xmm0, xmm7
+ punpcklbw xmm2, xmm7
+ pmaddwd xmm0, xmm4
+ punpcklbw xmm1, xmm3
+ pmaddwd xmm2, xmm5
+ punpcklbw xmm1, xmm7
+ pmaddwd xmm1, xmm6
+ paddd xmm0, xmm2
+ paddd xmm1, [MMX_roundval]
+ paddd xmm0, xmm1
+ psrad xmm0, 14
+ packssdw xmm0, xmm0
+ packuswb xmm0, xmm0
+ movd dword [edi],xmm0
+
+.xit_6coeff:
+ mov esp, dword [fs:0] ;restore the real stack pointer
+ pop dword [fs:0]
+ pop eax
+ jmp .xit
+
+
+ end
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/a_resample_sse41.asm b/src/thirdparty/VirtualDub/Kasumi/source/a_resample_sse41.asm
new file mode 100644
index 000000000..cf7332cb2
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/a_resample_sse41.asm
@@ -0,0 +1,358 @@
+ segment .rdata, align=16
+
+round dq 0000000000002000h
+colround dq 0000200000002000h
+
+ segment .text
+
+ global _vdasm_resize_table_row_8_k8_4x_SSE41
+_vdasm_resize_table_row_8_k8_4x_SSE41:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ movq xmm6, [round]
+ pshufd xmm6, xmm6, 0
+
+ mov ebp, [esp + 4 + 16] ;ebp = dst
+ mov esi, [esp + 12 + 16] ;esi = width
+ mov edi, [esp + 16 + 16] ;edi = kernel
+.yloop:
+ ;eax = temp
+ ;ebx = temp
+ ;ecx = temp
+ ;edx = temp
+ ;esi = horiz counter
+ ;edi = filter list
+ ;ebp = destination
+
+ mov eax, [edi+0]
+ mov ebx, [edi+4]
+ mov ecx, [edi+8]
+ mov edx, [esp+8+16]
+ add eax, edx
+ add ebx, edx
+ add ecx, edx
+ add edx, [edi+12]
+
+ pmovzxbw xmm0, [eax]
+ pmaddwd xmm0, [edi+10h]
+ pmovzxbw xmm1, [ebx]
+ pmaddwd xmm1, [edi+20h]
+ pmovzxbw xmm2, [ecx]
+ pmaddwd xmm2, [edi+30h]
+ pmovzxbw xmm3, [edx]
+ pmaddwd xmm3, [edi+40h]
+ add edi, 50h
+ phaddd xmm0, xmm1
+ phaddd xmm2, xmm3
+ phaddd xmm0, xmm2
+ paddd xmm0, xmm6
+ psrad xmm0, 14
+ packssdw xmm0, xmm0
+ packuswb xmm0, xmm0
+ movd [ebp], xmm0
+
+ add ebp, 4
+ sub esi, 1
+ jne .yloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+ global _vdasm_resize_table_row_8_k16_4x_SSE41
+_vdasm_resize_table_row_8_k16_4x_SSE41:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ movq xmm6, [round]
+ pshufd xmm6, xmm6, 0
+
+ mov ebp, [esp + 4 + 16] ;ebp = dst
+ mov esi, [esp + 12 + 16] ;esi = width
+ mov edi, [esp + 16 + 16] ;edi = kernel
+.yloop:
+ ;eax = temp
+ ;ebx = temp
+ ;ecx = temp
+ ;edx = temp
+ ;esi = horiz counter
+ ;edi = filter list
+ ;ebp = destination
+
+ mov eax, [edi+0]
+ mov ebx, [edi+4]
+ mov ecx, [edi+8]
+ mov edx, [esp+8+16]
+ add eax, edx
+ add ebx, edx
+ add ecx, edx
+ add edx, [edi+12]
+
+ pmovzxbw xmm0, [eax]
+ pmaddwd xmm0, [edi+10h]
+ pmovzxbw xmm1, [ebx]
+ pmaddwd xmm1, [edi+20h]
+ pmovzxbw xmm2, [ecx]
+ pmaddwd xmm2, [edi+30h]
+ pmovzxbw xmm3, [edx]
+ pmaddwd xmm3, [edi+40h]
+ pmovzxbw xmm4, [eax+8]
+ pmaddwd xmm4, [edi+50h]
+ pmovzxbw xmm5, [ebx+8]
+ pmaddwd xmm5, [edi+60h]
+ paddd xmm0, xmm4
+ pmovzxbw xmm4, [ecx+8]
+ pmaddwd xmm4, [edi+70h]
+ paddd xmm1, xmm5
+ pmovzxbw xmm5, [edx+8]
+ pmaddwd xmm5, [edi+80h]
+ paddd xmm2, xmm4
+ paddd xmm3, xmm5
+ add edi, 90h
+ phaddd xmm0, xmm1
+ phaddd xmm2, xmm3
+ phaddd xmm0, xmm2
+ paddd xmm0, xmm6
+ psrad xmm0, 14
+ packssdw xmm0, xmm0
+ packuswb xmm0, xmm0
+ movd [ebp], xmm0
+
+ add ebp, 4
+ sub esi, 1
+ jne .yloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+ global _vdasm_resize_table_row_8_SSE41
+_vdasm_resize_table_row_8_SSE41:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ pxor xmm7, xmm7
+ movq xmm6, [round]
+
+ mov edi, [esp + 4 + 16] ;edi = dst
+ mov ebx, [esp + 8 + 16] ;ebx = src
+ mov ebp, [esp + 12 + 16] ;ebp = width
+ mov edx, [esp + 16 + 16] ;edx = kernel
+.yloop:
+ ;eax = temp
+ ;ebx = source base address
+ ;ecx = (temp) source
+ ;edx = filter list
+ ;esi = (temp) kernel width
+ ;edi = destination
+ ;ebp = horiz counter
+
+ mov eax, [edx]
+ add edx, 16
+ lea ecx, [ebx + eax]
+ mov esi, [esp + 20 + 16] ;esi = kernel width
+
+ movq xmm2, xmm6
+.xloop:
+ pmovzxbw xmm0, [ecx]
+ add ecx, 8
+ pmaddwd xmm0, [edx]
+ paddd xmm2, xmm0
+ add edx, 16
+ sub esi, 8
+ jne .xloop
+
+ phaddd xmm2, xmm2
+ phaddd xmm2, xmm2
+ psrad xmm2, 14
+ packssdw xmm2, xmm2
+ packuswb xmm2, xmm2
+ movd eax, xmm2
+ mov [edi], al
+ add edi, 1
+ sub ebp, 1
+ jne .yloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+
+ global _vdasm_resize_table_col_8_k2_SSE41
+_vdasm_resize_table_col_8_k2_SSE41:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ movq xmm6, [colround]
+ pshufd xmm6, xmm6, 0
+
+ mov esi, [esp + 4 + 16] ;esi = dst
+ mov edi, [esp + 16 + 16] ;edi = kernel
+ mov ebp, [esp + 12 + 16] ;ebp = width
+
+ movq xmm7, [edi]
+ pshufd xmm7, xmm7, 0
+
+ mov edx, [esp + 8 + 16] ;ebx = srcs
+ mov eax, [edx+0]
+ mov ebx, [edx+4]
+ add eax, ebp
+ add ebx, ebp
+ neg ebp
+
+.yloop:
+ ;eax = row0
+ ;ebx = row1
+ ;ecx =
+ ;edx =
+ ;edi = kernel
+ ;esi = dest
+ ;ebp = width counter
+
+ movd xmm0, [eax+ebp]
+ movd xmm2, [ebx+ebp]
+ punpcklbw xmm0, xmm2
+ pmovzxbw xmm0, xmm0
+ pmaddwd xmm0, xmm7
+
+ paddd xmm0, xmm6
+
+ psrad xmm0, 14
+ packssdw xmm0, xmm0
+ packuswb xmm0, xmm0
+ movd [esi], xmm0
+ add esi, 4
+ add ebp, 4
+ jnz .yloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+ global _vdasm_resize_table_col_8_k4_SSE41
+_vdasm_resize_table_col_8_k4_SSE41:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ movq xmm7, [colround]
+ pshufd xmm7, xmm7, 0
+
+ mov esi, [esp + 4 + 16] ;esi = dst
+ mov edi, [esp + 16 + 16] ;edi = kernel
+
+ movdqu xmm6, [edi]
+ pshufd xmm5, xmm6, 0
+ pshufd xmm6, xmm6, 0aah
+
+ mov edx, [esp + 8 + 16] ;ebx = srcs
+ mov ebp, [esp + 12 + 16]
+ mov eax, [edx+0]
+ mov ebx, [edx+4]
+ mov ecx, [edx+8]
+ mov edx, [edx+12]
+ lea eax, [eax+ebp-4]
+ lea ebx, [ebx+ebp-4]
+ lea ecx, [ecx+ebp-4]
+ lea edx, [edx+ebp-4]
+ lea esi, [esi+ebp-4]
+ neg ebp
+ add ebp,4
+ jz .odd
+.yloop:
+ ;eax = row0
+ ;ebx = row1
+ ;ecx = row2
+ ;edx = row3
+ ;edi = kernel
+ ;esi = dest
+ ;ebp = width counter
+
+ movd xmm0, [eax+ebp]
+ movd xmm1, [ebx+ebp]
+ punpcklbw xmm0, xmm1
+
+ movd xmm1, [ecx+ebp]
+ movd xmm2, [edx+ebp]
+ punpcklbw xmm1, xmm2
+
+ movd xmm2, [eax+ebp+4]
+ movd xmm3, [ebx+ebp+4]
+ punpcklbw xmm2, xmm3
+
+ movd xmm3, [ecx+ebp+4]
+ movd xmm4, [edx+ebp+4]
+ punpcklbw xmm3, xmm4
+
+ pmovzxbw xmm0, xmm0
+ pmaddwd xmm0, xmm5
+
+ pmovzxbw xmm1, xmm1
+ pmaddwd xmm1, xmm6
+
+ pmovzxbw xmm2, xmm2
+ pmaddwd xmm2, xmm5
+
+ pmovzxbw xmm3, xmm3
+ pmaddwd xmm3, xmm6
+
+ paddd xmm0, xmm1
+ paddd xmm2, xmm3
+
+ paddd xmm0, xmm7
+ paddd xmm2, xmm7
+
+ psrad xmm0, 14
+ psrad xmm2, 14
+
+ packssdw xmm0, xmm2
+ packuswb xmm0, xmm0
+ movq [esi+ebp], xmm0
+ add ebp, 8
+ js .yloop
+ jnz .noodd
+
+.odd:
+ movd xmm0, [eax]
+ movd xmm1, [ebx]
+ movd xmm2, [ecx]
+ movd xmm3, [edx]
+ punpcklbw xmm0, xmm1
+ punpcklbw xmm2, xmm3
+ pmovzxbw xmm0, xmm0
+ pmovzxbw xmm2, xmm2
+ pmaddwd xmm0, xmm5
+ pmaddwd xmm2, xmm6
+ paddd xmm0, xmm2
+ paddd xmm0, xmm7
+ psrad xmm0, 14
+ packssdw xmm0, xmm0
+ packuswb xmm0, xmm0
+ movd [esi], xmm0
+.noodd:
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+ end
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/a_spanutils_isse.asm b/src/thirdparty/VirtualDub/Kasumi/source/a_spanutils_isse.asm
new file mode 100644
index 000000000..3fe7cedbc
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/a_spanutils_isse.asm
@@ -0,0 +1,193 @@
+ section .rdata, rdata, align=16
+
+xfefefefefefefefe dq 0fefefefefefefefeh
+xe0e0e0e0e0e0e0e0 dq 0e0e0e0e0e0e0e0e0h
+x0002000200020002 dq 00002000200020002h
+
+ section .text
+
+;==============================================================================
+ global _vdasm_horiz_expand2x_coaligned_ISSE
+_vdasm_horiz_expand2x_coaligned_ISSE:
+ mov ecx, [esp+8]
+ mov edx, [esp+4]
+ mov eax, [esp+12]
+.xloop:
+ movq mm0, [ecx]
+ movq mm1, mm0
+ pavgb mm0, [ecx+1]
+ movq mm2, mm1
+ punpcklbw mm1, mm0
+ punpckhbw mm2, mm0
+
+ movq [edx], mm1
+ movq [edx+8], mm2
+ add edx, 16
+ add ecx, 8
+
+ sub eax, 16
+ jne .xloop
+ ret
+
+;==============================================================================
+ global _vdasm_vert_average_13_ISSE
+_vdasm_vert_average_13_ISSE:
+ push ebx
+ mov ebx, [esp+12+4]
+ mov ecx, [esp+8+4]
+ mov edx, [esp+4+4]
+ mov eax, [esp+16+4]
+
+ add ebx, eax
+ add ecx, eax
+ add edx, eax
+ neg eax
+
+ pcmpeqb mm7, mm7
+.xloop:
+ movq mm0, [ebx+eax]
+ movq mm1, [ecx+eax]
+ movq mm2, mm0
+
+ movq mm3, [ebx+eax+8]
+ pxor mm0, mm7
+ pxor mm1, mm7
+
+ movq mm4, [ecx+eax+8]
+ movq mm5, mm3
+ pxor mm3, mm7
+
+ pxor mm4, mm7
+ pavgb mm0, mm1
+ pavgb mm3, mm4
+
+ pxor mm0, mm7
+ pxor mm3, mm7
+ pavgb mm0, mm2
+
+ movq [edx+eax], mm0
+ pavgb mm3, mm5
+
+ movq [edx+eax+8], mm3
+ add eax, 16
+ jne .xloop
+
+ pop ebx
+ ret
+
+;==============================================================================
+ global _vdasm_vert_average_17_ISSE
+_vdasm_vert_average_17_ISSE:
+ push ebx
+ mov ebx, [esp+12+4]
+ mov ecx, [esp+8+4]
+ mov edx, [esp+4+4]
+ mov eax, [esp+16+4]
+
+ add ebx, eax
+ add ecx, eax
+ add edx, eax
+ neg eax
+
+ ;r = avgup(avgdown(avgdown(a, b), a), a)
+ ; = pavgb(~pavgb(pavgb(~a, ~b), ~a), a)
+
+ pcmpeqb mm7, mm7
+.xloop:
+ movq mm0, [ecx+eax]
+ movq mm1, [ebx+eax]
+ movq mm2, mm0
+ pxor mm0, mm7 ;~a
+ pxor mm1, mm7 ;~b
+ pavgb mm1, mm0 ;pavgb(~a, ~b) = ~avgdown(a, b)
+ pavgb mm1, mm0 ;pavgb(~avgdown(a, b), ~a) = ~avgdown(avgdown(a, b), a)
+ pxor mm1, mm7 ;avgdown(avgdown(a, b), a)
+ pavgb mm1, mm2 ;pavgb(avgdown(avgdown(a, b), a), a) = round((7*a + b)/8)
+ movq [edx+eax], mm1
+
+ add eax, 8
+ jne .xloop
+
+ pop ebx
+ ret
+
+;==============================================================================
+ global _vdasm_vert_average_35_ISSE
+_vdasm_vert_average_35_ISSE:
+ push ebx
+ mov ebx, [esp+12+4]
+ mov ecx, [esp+8+4]
+ mov edx, [esp+4+4]
+ mov eax, [esp+16+4]
+
+ add ebx, eax
+ add ecx, eax
+ add edx, eax
+ neg eax
+
+ ;r = avgup(avgdown(avgdown(a, b), b), a)
+ ; = pavgb(~pavgb(pavgb(~a, ~b), ~b), a)
+
+ pcmpeqb mm7, mm7
+.xloop:
+ movq mm0, [ecx+eax]
+ movq mm1, [ebx+eax]
+ movq mm2, mm0
+ pxor mm0, mm7 ;~a
+ pxor mm1, mm7 ;~b
+ pavgb mm0, mm1 ;avgup(~a, ~b) = ~avgdown(a, b)
+ pavgb mm0, mm1 ;avgup(~avgdown(a, b), ~b) = ~avgdown(avgdown(a, b), b)
+ pxor mm0, mm7 ;avgdown(avgdown(a, b), b)
+ pavgb mm0, mm2 ;avgup(avgdown(avgdown(a, b), b), a) = round((5*a + 3*b) / 8)
+ movq [edx+eax], mm0
+
+ add eax, 8
+ jne .xloop
+
+ pop ebx
+ ret
+
+;==============================================================================
+ global _vdasm_horiz_expand4x_coaligned_MMX
+_vdasm_horiz_expand4x_coaligned_MMX:
+ mov edx, [esp+4]
+ mov ecx, [esp+8]
+ mov eax, [esp+12]
+ movq mm6, qword [x0002000200020002]
+ pxor mm7, mm7
+.xloop:
+ movd mm0, [ecx]
+ movd mm1, [ecx+1]
+ add ecx, 4
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm1, mm0 ;x1
+ movq mm2, mm1
+ paddw mm1, mm6 ;x1 + 2
+ movq mm3, mm1
+ paddw mm2, mm2 ;x2
+ paddw mm3, mm2 ;x3 + 2
+ paddw mm2, mm6 ;x2 + 2
+ psraw mm1, 2 ;x1/4
+ psraw mm2, 2 ;x2/4
+ psraw mm3, 2 ;x3/4
+ paddw mm1, mm0
+ paddw mm2, mm0
+ paddw mm3, mm0
+ movd mm0, [ecx-4]
+ packuswb mm1, mm1
+ packuswb mm2, mm2
+ packuswb mm3, mm3
+ punpcklbw mm0, mm1
+ punpcklbw mm2, mm3
+ movq mm1, mm0
+ punpcklwd mm0, mm2
+ punpckhwd mm1, mm2
+
+ movq [edx], mm0
+ movq [edx+8], mm1
+ add edx, 16
+ sub eax, 1
+ jne .xloop
+
+ ret
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/a_stretchrgb_mmx.asm b/src/thirdparty/VirtualDub/Kasumi/source/a_stretchrgb_mmx.asm
new file mode 100644
index 000000000..3db442fa2
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/a_stretchrgb_mmx.asm
@@ -0,0 +1,326 @@
+ segment .rdata, align=16
+
+x0020w dq 00020002000200020h
+rb_mask_555 dq 07c1f7c1f7c1f7c1fh
+g_mask_555 dq 003e003e003e003e0h
+rb_mask_888 dq 000ff00ff00ff00ffh
+g_mask_888 dq 00000ff000000ff00h
+
+ segment .text
+
+ struc VDPixmapReferenceStretchBltBilinearParameters
+.dst resd 1
+.src resd 1
+.u resd 1
+.uinc resd 1
+.dudx resd 1
+
+.xprepos resd 1
+.xpostpos resd 1
+.xprecopy resd 1
+.xpostcopy resd 1
+.xmidsize resd 1
+ endstruc
+
+
+
+ global _vdasm_stretchbltV_XRGB1555_to_XRGB1555_MMX
+_vdasm_stretchbltV_XRGB1555_to_XRGB1555_MMX:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov eax, [esp+20+16]
+ and eax, 0f8000000h
+ mov ebx, [esp+8+16]
+ mov ecx, [esp+12+16]
+ jz .noreverse
+ xchg ebx, ecx
+ js .noreverse
+ neg eax
+ xchg ebx, ecx
+.noreverse:
+ shr eax, 16
+ mov [esp+20+16], eax
+ mov edx, [esp+4+16]
+ mov eax, [esp+16+16]
+ add eax, eax
+ lea ebx, [ebx+eax-6]
+ lea ecx, [ecx+eax-6]
+ lea edx, [edx+eax-6]
+ neg eax
+
+ movd mm4, dword [esp+20+16]
+ punpcklwd mm4, mm4
+ punpckldq mm4, mm4
+
+ movq mm6, [rb_mask_555]
+ movq mm7, [g_mask_555]
+
+.xstart:
+ add eax, 6
+ jbe .doodd
+.xloop:
+ movq mm0, [ebx+eax]
+ movq mm1, [ecx+eax]
+ movq mm2, mm7
+ movq mm3, mm7
+
+ pand mm2, mm0
+ pand mm3, mm1
+ pand mm0, mm6
+ pand mm1, mm6
+
+ psubw mm3, mm2
+ psubw mm1, mm0
+
+ pmulhw mm3, mm4
+ pmulhw mm1, mm4
+
+ psubw mm0, mm1
+ psubw mm2, mm3
+
+ pand mm0, mm6
+ pand mm2, mm7
+
+ paddw mm0, mm2
+
+ movq [edx+eax], mm0
+ add eax, 8
+ jnc .xloop
+
+.doodd:
+ sub eax, 6
+ jz .noodd
+.odd:
+ movzx esi, word [ebx+eax+6]
+ movd mm0, esi
+ movzx esi, word [ecx+eax+6]
+ movd mm1, esi
+ movq mm2, mm7
+ movq mm3, mm7
+
+ pand mm2, mm0
+ pand mm3, mm1
+ pand mm0, mm6
+ pand mm1, mm6
+
+ psubw mm3, mm2
+ psubw mm1, mm0
+
+ pmulhw mm3, mm4
+ pmulhw mm1, mm4
+
+ psubw mm0, mm1
+ psubw mm2, mm3
+
+ pand mm0, mm6
+ pand mm2, mm7
+
+ paddw mm0, mm2
+
+ movd esi, mm0
+ mov [edx+eax+6], si
+ add eax,2
+ jne .odd
+
+.noodd:
+ emms
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+
+ global _vdasm_stretchbltH_XRGB8888_to_XRGB8888_MMX
+_vdasm_stretchbltH_XRGB8888_to_XRGB8888_MMX:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov edx, [esp+4+16]
+
+ mov ebx, [edx+VDPixmapReferenceStretchBltBilinearParameters.src]
+ mov edi, [edx+VDPixmapReferenceStretchBltBilinearParameters.dst]
+
+ mov ecx, [edx+VDPixmapReferenceStretchBltBilinearParameters.xprecopy]
+ or ecx, ecx
+ jz .noprecopy
+ mov eax, [edx+VDPixmapReferenceStretchBltBilinearParameters.xprepos]
+ mov eax, [ebx+eax]
+ lea ebp, [ecx*4]
+ sub edi, ebp
+ rep stosd
+.noprecopy:
+ mov ebp, [edx+VDPixmapReferenceStretchBltBilinearParameters.xmidsize]
+ add ebp, ebp
+ add ebp, ebp
+ add edi, ebp
+ neg ebp
+
+ mov esi, [edx+VDPixmapReferenceStretchBltBilinearParameters.u]
+ mov eax, [edx+VDPixmapReferenceStretchBltBilinearParameters.dudx]
+ mov edx, [edx+VDPixmapReferenceStretchBltBilinearParameters.uinc]
+ movd mm2, esi
+ movd mm3, eax
+ shr ebx, 2
+
+ movq mm5, mm2
+ punpcklwd mm5, mm5
+ punpckhdq mm5, mm5
+ movq mm4, mm5
+ psraw mm4, 15
+
+.xloop:
+ movd mm0, dword [ebx*4]
+ pxor mm7, mm7
+ movd mm1, dword [ebx*4+4]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm1, mm0
+ pand mm4, mm1
+ pmulhw mm1, mm5
+ paddw mm1, mm4
+ paddw mm0, mm1
+ packuswb mm0, mm0
+ movd dword [edi+ebp], mm0
+
+ add esi, eax
+ adc ebx, edx
+
+ paddd mm2, mm3
+ movq mm5, mm2
+ punpcklwd mm5, mm5
+ punpckhdq mm5, mm5
+ movq mm4, mm5
+ psraw mm4, 15
+ add ebp, 4
+ jnz .xloop
+
+ mov edx, [esp+4+16]
+ mov ecx, [edx+VDPixmapReferenceStretchBltBilinearParameters.xpostcopy]
+ or ecx, ecx
+ jz .nopostcopy
+ mov eax, [edx+VDPixmapReferenceStretchBltBilinearParameters.xpostpos]
+ add eax, [edx+VDPixmapReferenceStretchBltBilinearParameters.src]
+ mov eax, [eax]
+ rep stosd
+.nopostcopy:
+
+ emms
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+ global _vdasm_stretchbltV_XRGB8888_to_XRGB8888_MMX
+_vdasm_stretchbltV_XRGB8888_to_XRGB8888_MMX:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov eax, [esp+20+16]
+ and eax, 0ff000000h
+ mov ebx, [esp+8+16]
+ mov ecx, [esp+12+16]
+ jz .noreverse
+ xchg ebx, ecx
+ js .noreverse
+ neg eax
+ xchg ebx, ecx
+.noreverse:
+ shr eax, 16
+ mov [esp+20+16], eax
+ mov edx, [esp+4+16]
+ mov eax, [esp+16+16]
+ add eax, eax
+ add eax, eax
+ lea ebx, [ebx+eax-4]
+ lea ecx, [ecx+eax-4]
+ lea edx, [edx+eax-4]
+ neg eax
+
+ movd mm4, dword [esp+20+16]
+ punpcklwd mm4, mm4
+ punpckldq mm4, mm4
+
+ movq mm6, [rb_mask_888]
+ movq mm7, [g_mask_888]
+
+.xstart:
+ add eax, 4
+ jbe .doodd
+.xloop:
+ movq mm0, [ebx+eax]
+ movq mm1, [ecx+eax]
+ movq mm2, mm0
+ movq mm3, mm1
+ psrlw mm2, 8
+ psrlw mm3, 8
+ pand mm0, mm6
+ pand mm1, mm6
+
+ psubw mm3, mm2
+ psubw mm1, mm0
+
+ pmulhw mm3, mm4
+ pmulhw mm1, mm4
+
+ psubw mm0, mm1
+ psubw mm2, mm3
+
+ pand mm0, mm6
+
+ psllw mm2, 8
+
+ paddw mm0, mm2
+
+ movq qword [edx+eax], mm0
+ add eax, 8
+ jnc .xloop
+
+.doodd:
+ sub eax, 4
+ jz .noodd
+.odd:
+ movd mm0, dword [ebx]
+ movd mm1, dword [ecx]
+ movq mm2, mm0
+ movq mm3, mm1
+ psrlw mm2, 8
+ psrlw mm3, 8
+ pand mm0, mm6
+ pand mm1, mm6
+
+ psubw mm3, mm2
+ psubw mm1, mm0
+
+ pmulhw mm3, mm4
+ pmulhw mm1, mm4
+
+ psubw mm0, mm1
+ psubw mm2, mm3
+
+ pand mm0, mm6
+
+ psllw mm2, 8
+
+ paddw mm0, mm2
+
+ movd dword [edx], mm0
+
+.noodd:
+ emms
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+
+ end
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/a_stretchrgb_point.asm b/src/thirdparty/VirtualDub/Kasumi/source/a_stretchrgb_point.asm
new file mode 100644
index 000000000..dca765b92
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/a_stretchrgb_point.asm
@@ -0,0 +1,96 @@
+ segment .text
+
+ struc scaleinfo
+.dst resd 1
+.src resd 1
+.xaccum resd 1
+.xfracinc resd 1
+.xintinc resd 1
+.count resd 1
+ endstruc
+
+ global _vdasm_resize_point32
+_vdasm_resize_point32:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov eax, [esp+4+16]
+
+ mov ebx, [eax+scaleinfo.xaccum]
+ mov ecx, [eax+scaleinfo.xfracinc]
+ mov edx, [eax+scaleinfo.src]
+ mov esi, [eax+scaleinfo.xintinc]
+ mov edi, [eax+scaleinfo.dst]
+ mov ebp, [eax+scaleinfo.count]
+.xloop:
+ mov eax,[edx*4]
+ add ebx,ecx
+ adc edx,esi
+ mov [edi+ebp],eax
+ add ebp,4
+ jne .xloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+ global _vdasm_resize_point32_MMX
+_vdasm_resize_point32_MMX:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov eax, [esp+4+16]
+
+ push 0
+ push dword [fs:0]
+ mov dword [fs:0], esp
+
+ mov ebx, [eax+scaleinfo.xaccum]
+ mov esp, [eax+scaleinfo.xfracinc]
+ mov edx, [eax+scaleinfo.src]
+ mov esi, [eax+scaleinfo.xintinc]
+ mov edi, [eax+scaleinfo.dst]
+ mov ebp, [eax+scaleinfo.count]
+
+ mov eax, ebx
+ mov ecx, edx
+ add ebx, esp
+ adc edx, esi
+ add esp, esp
+ adc esi, esi
+
+ add ebp, 4
+ jz .odd
+.dualloop:
+ movd mm0, dword [ecx*4]
+ punpckldq mm0,[edx*4]
+ add eax,esp
+ adc ecx,esi
+ add ebx,esp
+ adc edx,esi
+ movq [edi+ebp-4],mm0
+
+ add ebp,8
+ jnc .dualloop
+ jnz .noodd
+.odd:
+ mov eax, [ecx*4]
+ mov [edi-4], eax
+.noodd:
+ mov esp, dword [fs:0]
+ pop eax
+ pop eax
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+ end
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/a_triblt.inc b/src/thirdparty/VirtualDub/Kasumi/source/a_triblt.inc
new file mode 100644
index 000000000..fb969c56f
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/a_triblt.inc
@@ -0,0 +1,24 @@
+ struc span
+.u resd 1
+.v resd 1
+ endstruc
+
+ struc mipspan
+.u resd 1
+.v resd 1
+.lambda resd 1
+ endstruc
+
+ struc mipmap
+.bits resd 1
+.pitch resd 1
+.uvmul resd 1
+ resd 1
+ endstruc
+
+ struc texinfo
+.mips resd 16*4
+.dst resd 1
+.src resd 1
+.w resd 1
+ endstruc
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/a_triblt_mmx.asm b/src/thirdparty/VirtualDub/Kasumi/source/a_triblt_mmx.asm
new file mode 100644
index 000000000..3836488aa
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/a_triblt_mmx.asm
@@ -0,0 +1,425 @@
+ segment .rdata, align=16
+
+correct dq 0000800000008000h
+round dq 0000200000002000h
+round1 dq 0000020000000200h
+round2 dq 0002000000020000h
+
+ segment .text
+
+ %include "a_triblt.inc"
+
+ extern _kVDCubicInterpTableFX14_075_MMX
+
+;--------------------------------------------------------------------------
+ global _vdasm_triblt_span_bilinear_mmx
+_vdasm_triblt_span_bilinear_mmx:
+ push ebp
+ push edi
+ push esi
+ push ebx
+ mov edi,[esp+4+16]
+ mov edx,[edi+texinfo.dst]
+ mov ebp,[edi+texinfo.w]
+ shl ebp,2
+ mov ebx,[edi+texinfo.mips+mipmap.bits]
+ add edx,ebp
+ mov esi,[edi+texinfo.mips+mipmap.pitch]
+ neg ebp
+ movd mm6,[edi+texinfo.mips+mipmap.uvmul]
+ pxor mm7,mm7
+ mov edi,[edi+texinfo.src]
+.xloop:
+ movq mm4,[edi]
+ movq mm0,mm4
+ psrld mm0,16
+ movq mm5,mm4
+ packssdw mm0,mm0
+ pmaddwd mm0,mm6
+ add edi,8
+ punpcklwd mm4,mm4
+ punpckldq mm4,mm4
+ movd ecx,mm0
+ add ecx,ebx
+ psrlw mm4,1
+ movd mm0,dword [ecx]
+ movd mm1,dword [ecx+4]
+ punpcklbw mm0,mm7
+ movd mm2,dword [ecx+esi]
+ punpcklbw mm1,mm7
+ movd mm3,dword [ecx+esi+4]
+ punpcklbw mm2,mm7
+ punpcklbw mm3,mm7
+ psubw mm1,mm0
+ psubw mm3,mm2
+ paddw mm1,mm1
+ paddw mm3,mm3
+ pmulhw mm1,mm4
+ pmulhw mm3,mm4
+ punpckhwd mm5,mm5
+ punpckldq mm5,mm5
+ paddw mm0,mm1
+ psrlw mm5,1
+ paddw mm2,mm3
+ psubw mm2,mm0
+ paddw mm2,mm2
+ pmulhw mm2,mm5
+ paddw mm0,mm2
+ packuswb mm0,mm0
+ movd dword [edx+ebp],mm0
+ add ebp,4
+ jnc .xloop
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ emms
+ ret
+
+;--------------------------------------------------------------------------
+ global _vdasm_triblt_span_trilinear_mmx
+_vdasm_triblt_span_trilinear_mmx:
+ push ebp
+ push edi
+ push esi
+ push ebx
+ mov esi,[esp+4+16]
+ mov edx,[esi+texinfo.dst]
+ mov ebp,[esi+texinfo.w]
+ shl ebp,2
+ add edx,ebp
+ neg ebp
+ mov edi,[esi+texinfo.src]
+ pxor mm7,mm7
+.xloop:
+ movd mm6,[edi+mipspan.u]
+ punpckldq mm6,[edi+mipspan.v]
+ mov eax,[edi+mipspan.lambda]
+ shr eax,4
+ and eax,byte -16
+ movd mm2,eax
+ psrlq mm2,4
+ psrld mm6,mm2
+ paddd mm6,[correct]
+
+ ;fetch mipmap 1
+ mov ebx,[esi+eax+mipmap.pitch]
+ movd mm1,[esi+eax+mipmap.uvmul]
+ movq mm4,mm6
+ movq mm0,mm6
+ psrld mm0,16
+ packssdw mm0,mm0
+ pmaddwd mm0,mm1
+ movq mm5,mm4
+ punpcklwd mm4,mm4
+ punpckldq mm4,mm4
+ punpckhwd mm5,mm5
+ punpckldq mm5,mm5
+ movd ecx,mm0
+ add ecx,[esi+eax+mipmap.bits]
+ psrlw mm4,1
+ movd mm0,dword [ecx]
+ movd mm1,dword [ecx+4]
+ punpcklbw mm0,mm7
+ movd mm2,dword [ecx+ebx]
+ punpcklbw mm1,mm7
+ movd mm3,dword [ecx+ebx+4]
+ punpcklbw mm2,mm7
+ punpcklbw mm3,mm7
+ psubw mm1,mm0
+ psubw mm3,mm2
+ paddw mm1,mm1
+ paddw mm3,mm3
+ pmulhw mm1,mm4
+ pmulhw mm3,mm4
+ paddw mm0,mm1
+ psrlw mm5,1
+ paddw mm2,mm3
+ psubw mm2,mm0
+ paddw mm2,mm2
+ pmulhw mm2,mm5
+ paddw mm0,mm2
+
+ ;fetch mipmap 2
+ mov ebx,[esi+eax+16+mipmap.pitch]
+ movd mm1,[esi+eax+16+mipmap.uvmul]
+ paddd mm6,[correct]
+ psrld mm6,1
+ movq mm4,mm6
+ psrld mm6,16
+ packssdw mm6,mm6
+ pmaddwd mm6,mm1
+ movq mm5,mm4
+ punpcklwd mm4,mm4
+ punpckldq mm4,mm4
+ punpckhwd mm5,mm5
+ punpckldq mm5,mm5
+ movd ecx,mm6
+ add ecx,[esi+eax+16+mipmap.bits]
+ psrlw mm4,1
+ movd mm6,dword [ecx]
+ movd mm1,dword [ecx+4]
+ punpcklbw mm6,mm7
+ movd mm2,dword [ecx+ebx]
+ punpcklbw mm1,mm7
+ movd mm3,dword [ecx+ebx+4]
+ punpcklbw mm2,mm7
+ punpcklbw mm3,mm7
+ psubw mm1,mm6
+ psubw mm3,mm2
+ paddw mm1,mm1
+ paddw mm3,mm3
+ pmulhw mm1,mm4
+ pmulhw mm3,mm4
+ paddw mm6,mm1
+ psrlw mm5,1
+ paddw mm2,mm3
+ psubw mm2,mm6
+ paddw mm2,mm2
+ pmulhw mm2,mm5
+ paddw mm6,mm2
+
+ ;blend mips
+ movd mm1,[edi+mipspan.lambda]
+ punpcklwd mm1,mm1
+ punpckldq mm1,mm1
+ psllw mm1,8
+ psrlq mm1,1
+ psubw mm6,mm0
+ paddw mm6,mm6
+ pmulhw mm6,mm1
+ paddw mm0,mm6
+ packuswb mm0,mm0
+
+ movd dword [edx+ebp],mm0
+ add edi, mipspan_size
+ add ebp,4
+ jnc .xloop
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ emms
+ ret
+
+;--------------------------------------------------------------------------
+%macro .SETUPADDR 1
+ ;compute mipmap index and UV
+ movd mm0, [edi + mipspan.u]
+ punpckldq mm0, [edi + mipspan.v]
+ mov ebx, [edi + mipspan.lambda]
+ shr ebx, 4
+ and ebx, byte -16
+
+ add ebx, mipmap_size*%1
+ movd mm2, ebx
+ add ebx, [esp + .af_mipbase]
+ psrlq mm2, 4
+ psrad mm0, mm2
+ paddd mm0, [correct]
+ movq mm1, mm0
+ psrlq mm1, 32
+
+ ;compute horizontal filters
+ movd ecx, mm0
+ shr ecx, 4
+ and ecx, 0ff0h
+ add ecx, _kVDCubicInterpTableFX14_075_MMX
+
+ ;compute vertical filter
+ movd edx, mm1
+ and edx, 0ff00h
+ shr edx, 4
+ add edx, _kVDCubicInterpTableFX14_075_MMX
+
+ ;compute texel address
+ movd mm1, [ebx + mipmap.uvmul]
+ psrld mm0, 16
+ packssdw mm0, mm0
+ pmaddwd mm0, mm1
+ movd eax, mm0
+ add eax, [ebx + mipmap.bits]
+%endmacro
+
+%macro .HCUBIC 4
+ movd %1, dword [eax]
+ punpcklbw %1, qword [eax+4]
+ movd %3, dword [eax+8]
+ punpcklbw %3, qword [eax+12]
+ movq %2, %1
+ movq %4, %3
+ punpcklbw %1, mm7
+ pmaddwd %1, [ecx]
+ punpcklbw %3, mm7
+ pmaddwd %3, [ecx+8]
+ punpckhbw %2, mm7
+ pmaddwd %2, [ecx]
+ punpckhbw %4, mm7
+ pmaddwd %4, [ecx+8]
+ paddd %1, %3
+ paddd %2, %4
+%endmacro
+
+%macro .VCUBIC 1
+ .HCUBIC mm0, mm1, mm2, mm3
+ add eax, %1
+
+ .HCUBIC mm4, mm5, mm2, mm3
+ add eax, %1
+
+ movq mm2, [round1]
+
+ paddd mm0, mm2
+ paddd mm1, mm2
+ paddd mm4, mm2
+ paddd mm5, mm2
+
+ psrad mm0, 10
+ psrad mm1, 10
+ psrad mm4, 10
+ psrad mm5, 10
+
+ packssdw mm0, mm0
+ packssdw mm1, mm1
+ packssdw mm4, mm4
+ packssdw mm5, mm5
+
+ punpcklwd mm0, mm4
+ punpcklwd mm1, mm5
+
+ movq mm3, [edx]
+
+ pmaddwd mm0, mm3
+ pmaddwd mm1, mm3
+
+ movq [esp + .af_htemp0], mm0
+ movq [esp + .af_htemp1], mm1
+
+ .HCUBIC mm0, mm1, mm2, mm3
+ add eax, %1
+ .HCUBIC mm4, mm5, mm2, mm3
+
+ movq mm2, [round1]
+
+ paddd mm0, mm2
+ paddd mm1, mm2
+ paddd mm4, mm2
+ paddd mm5, mm2
+
+ psrad mm0, 10
+ psrad mm1, 10
+ psrad mm4, 10
+ psrad mm5, 10
+
+ packssdw mm0, mm0
+ packssdw mm1, mm1
+ packssdw mm4, mm4
+ packssdw mm5, mm5
+
+ punpcklwd mm0, mm4
+ punpcklwd mm1, mm5
+
+ movq mm2, [round2]
+ movq mm3, [edx + 8]
+
+ pmaddwd mm0, mm3
+ pmaddwd mm1, mm3
+
+ paddd mm0, [esp + .af_htemp0]
+ paddd mm1, [esp + .af_htemp1]
+
+ paddd mm0, mm2
+ paddd mm1, mm2
+
+ psrad mm0, 18
+ psrad mm1, 18
+ packssdw mm0, mm1
+%endmacro
+
+ global _vdasm_triblt_span_bicubic_mip_linear_mmx
+_vdasm_triblt_span_bicubic_mip_linear_mmx:
+
+;parameters
+%define .p_texinfo 20
+
+;aligned frame
+%define .af_htemp0 0
+%define .af_htemp1 8
+%define .af_vtemp0 16
+%define .af_mipbase 24
+%define .af_prevesp 28
+%define .afsize 32
+
+ push ebp
+ lea ebp, [esp-12]
+ push edi
+ push esi
+ push ebx
+
+ sub esp, .afsize
+ and esp, -8
+
+ mov [esp + .af_prevesp], ebp
+
+ mov ebx, [ebp + .p_texinfo]
+ mov ebp, [ebx + texinfo.dst]
+ mov esi, [ebx + texinfo.w]
+ shl esi, 2
+ add ebp,esi
+ neg esi
+
+ mov edi, [ebx + texinfo.src]
+ mov [esp + .af_mipbase], ebx
+ pxor mm7, mm7
+
+.xloop:
+
+ ;registers:
+ ; eax base texel address
+ ; ebx first mip info
+ ; ecx horizontal filter
+ ; edx vertical filter
+ ; esi horizontal count
+ ; edi mipspan
+ ; ebp destination
+
+ ;fetch mipmap 1
+ .SETUPADDR 0
+ .VCUBIC [ebx+mipmap.pitch]
+
+ movq [esp + .af_vtemp0], mm0
+
+ ;fetch mipmap 2
+ .SETUPADDR 1
+ .VCUBIC [ebx+mipmap.pitch]
+
+ ;blend mips
+ movq mm1, [esp + .af_vtemp0]
+
+ psubw mm0, mm1
+
+ movd mm3,[edi+mipspan.lambda]
+ punpcklwd mm3,mm3
+ punpckldq mm3,mm3
+ psllw mm3,8
+ psrlq mm3,1
+
+ paddw mm0,mm0
+ pmulhw mm0,mm3
+ paddw mm0,mm1
+ packuswb mm0,mm0
+
+ movd dword [ebp+esi],mm0
+ add edi, mipspan_size
+ add esi,4
+ jnc .xloop
+
+ mov esp, [esp + .af_prevesp]
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ emms
+ ret
+
+ end
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/a_triblt_scalar.asm b/src/thirdparty/VirtualDub/Kasumi/source/a_triblt_scalar.asm
new file mode 100644
index 000000000..c550634f3
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/a_triblt_scalar.asm
@@ -0,0 +1,36 @@
+ segment .text
+
+ %include "a_triblt.inc"
+
+ global _vdasm_triblt_span_point
+_vdasm_triblt_span_point:
+ push ebp
+ push edi
+ push esi
+ push ebx
+ mov eax,[esp+4+16]
+ mov ebp,[eax+texinfo.w]
+ mov ebx,[eax+texinfo.mips+mipmap.pitch]
+ shl ebp,2
+ mov edi,[eax+texinfo.src]
+ mov edx,[eax+texinfo.dst]
+ mov ecx,[eax+texinfo.mips+mipmap.bits]
+ sar ebx,2
+ add edx,ebp
+ neg ebp
+.xloop:
+ mov eax,[edi+span.v]
+ imul eax,ebx
+ add eax,[edi+span.u]
+ add edi,8
+ mov eax,[ecx+eax*4]
+ mov [edx+ebp],eax
+ add ebp,4
+ jnc .xloop
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+ end
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/a_triblt_sse2.asm b/src/thirdparty/VirtualDub/Kasumi/source/a_triblt_sse2.asm
new file mode 100644
index 000000000..54514b317
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/a_triblt_sse2.asm
@@ -0,0 +1,197 @@
+ segment .rdata, align=16
+
+correct dq 0000800000008000h, 0000800000008000h
+round dq 0000200000002000h, 0000200000002000h
+round1 dq 0000020000000200h, 0000020000000200h
+round2 dq 0002000000020000h, 0002000000020000h
+
+ segment .text
+
+ %include "a_triblt.inc"
+
+ extern _kVDCubicInterpTableFX14_075_MMX
+
+;--------------------------------------------------------------------------
+ global _vdasm_triblt_span_bicubic_mip_linear_sse2
+_vdasm_triblt_span_bicubic_mip_linear_sse2:
+
+;parameters
+%define .p_texinfo 20
+
+;aligned frame
+%define .af_vtemp0 0
+%define .af_mipbase 16
+%define .af_prevesp 20
+%define .afsize 24
+
+ push ebp
+ lea ebp, [esp-12]
+ push edi
+ push esi
+ push ebx
+
+ sub esp, .afsize
+ and esp, -16
+
+ mov [esp + .af_prevesp], ebp
+
+ mov ebx, [ebp + .p_texinfo]
+ mov ebp, [ebx + texinfo.dst]
+ mov esi, [ebx + texinfo.w]
+ shl esi, 2
+ add ebp,esi
+ neg esi
+
+ mov edi, [ebx + texinfo.src]
+ mov [esp + .af_mipbase], ebx
+ pxor xmm7, xmm7
+
+.xloop:
+
+ ;registers:
+ ; eax base texel address
+ ; ebx first mip info
+ ; ecx horizontal filter
+ ; edx vertical filter
+ ; esi horizontal count
+ ; edi mipspan
+ ; ebp destination
+
+%macro .SETUPADDR 1
+ ;compute mipmap index and UV
+ movd xmm0, [edi + mipspan.u]
+ movd xmm1, [edi + mipspan.v]
+ punpckldq xmm0, xmm1
+ mov ebx, [edi + mipspan.lambda]
+ shr ebx, 4
+ and ebx, byte -16
+
+ add ebx, mipmap_size*%1
+ movd xmm2, ebx
+ add ebx, [esp + .af_mipbase]
+ psrlq xmm2, 4
+ psrad xmm0, xmm2
+ paddd xmm0, [correct]
+ pshufd xmm1, xmm0, 01010101b
+
+ ;compute horizontal filters
+ movd ecx, xmm0
+ shr ecx, 4
+ and ecx, 0ff0h
+ add ecx, _kVDCubicInterpTableFX14_075_MMX
+
+ ;compute vertical filter
+ movd edx, xmm1
+ and edx, 0ff00h
+ shr edx, 4
+ add edx, _kVDCubicInterpTableFX14_075_MMX
+
+ ;compute texel address
+ movd xmm1, [ebx + mipmap.uvmul]
+ psrld xmm0, 16
+ packssdw xmm0, xmm0
+ pmaddwd xmm0, xmm1
+ movd eax, xmm0
+ add eax, [ebx + mipmap.bits]
+%endmacro
+
+%macro .HCUBIC 4
+ movd %1, dword [eax]
+ movd %3, dword [eax+4]
+ movd %2, dword [eax+8]
+ movd %4, dword [eax+12]
+ punpcklbw %1, %3
+ punpcklbw %2, %4
+ punpcklbw %1, xmm7
+ punpcklbw %2, xmm7
+ movdqa %3, [ecx]
+ pshufd %4, %3, 11101110b
+ pshufd %3, %3, 01000100b
+ pmaddwd %1, %3
+ pmaddwd %2, %4
+ paddd %1, %2
+%endmacro
+
+%macro .VCUBIC 1
+ .HCUBIC xmm0, xmm4, xmm5, xmm6
+ add eax, %1
+ .HCUBIC xmm1, xmm4, xmm5, xmm6
+ add eax, %1
+ .HCUBIC xmm2, xmm4, xmm5, xmm6
+ add eax, %1
+ .HCUBIC xmm3, xmm4, xmm5, xmm6
+
+ movq xmm4, [round1]
+
+ paddd xmm0, xmm4
+
+ paddd xmm1, xmm4
+ psrad xmm0, 10
+
+ paddd xmm2, xmm4
+ psrad xmm1, 10
+ packssdw xmm0, xmm0
+
+ paddd xmm3, xmm4
+ psrad xmm2, 10
+ packssdw xmm1, xmm1
+
+ movdqa xmm5, [edx]
+ psrad xmm3, 10
+ punpcklwd xmm0, xmm1
+
+ packssdw xmm2, xmm2
+ packssdw xmm3, xmm3
+ pshufd xmm4, xmm5, 01000100b
+
+ pmaddwd xmm0, xmm4
+ punpcklwd xmm2, xmm3
+
+ pshufd xmm5, xmm5, 11101110b
+
+ pmaddwd xmm2, xmm5
+ paddd xmm0, xmm2
+ paddd xmm0, [round2]
+ psrad xmm0, 18
+
+ packssdw xmm0, xmm0
+%endmacro
+
+ ;fetch mipmap 1
+ .SETUPADDR 0
+ .VCUBIC [ebx+mipmap.pitch]
+
+ movq [esp + .af_vtemp0], xmm0
+
+ ;fetch mipmap 2
+ .SETUPADDR 1
+ .VCUBIC [ebx+mipmap.pitch]
+
+ ;blend mips
+ movq xmm1, [esp + .af_vtemp0]
+
+ psubw xmm0, xmm1
+
+ movd xmm3, [edi+mipspan.lambda]
+ pshuflw xmm3, xmm3, 0
+ psllw xmm3, 8
+ psrlq xmm3, 1
+
+ paddw xmm0, xmm0
+ pmulhw xmm0, xmm3
+ paddw xmm0, xmm1
+ packuswb xmm0, xmm0
+
+ movd dword [ebp+esi], xmm0
+ add edi, mipspan_size
+ add esi,4
+ jnc .xloop
+
+ mov esp, [esp + .af_prevesp]
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+ end
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/alphablt.cpp b/src/thirdparty/VirtualDub/Kasumi/source/alphablt.cpp
new file mode 100644
index 000000000..a292ca2bd
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/alphablt.cpp
@@ -0,0 +1,76 @@
+#include <vd2/system/math.h>
+#include <vd2/system/cpuaccel.h>
+#include <vd2/Kasumi/pixmap.h>
+#include <vd2/Kasumi/pixmaputils.h>
+#include <vd2/Kasumi/pixmapops.h>
+
+void VDPixmapBltAlphaConst8(uint8 *dst, ptrdiff_t dstpitch, const uint8 *src, ptrdiff_t srcpitch, uint32 w, uint32 h, uint32 ialpha);
+
+bool VDPixmapBltAlphaConst(const VDPixmap& dst, const VDPixmap& src, float alpha) {
+ if (!(alpha >= 0.0f))
+ alpha = 0.0f;
+ else if (!(alpha <= 1.0f))
+ alpha = 1.0f;
+
+ uint32 ialpha = VDRoundToInt32(alpha * 256.0f);
+
+ // format check
+ if (dst.format != src.format || !src.format)
+ return false;
+
+ // degenerate case check
+ if (!dst.w || !dst.h)
+ return false;
+
+ // size check
+ if (src.w != dst.w || src.h != dst.h)
+ return false;
+
+ // check for formats that are not 8bpp
+ switch(src.format) {
+ case nsVDPixmap::kPixFormat_Pal1:
+ case nsVDPixmap::kPixFormat_Pal2:
+ case nsVDPixmap::kPixFormat_Pal4:
+ case nsVDPixmap::kPixFormat_Pal8:
+ case nsVDPixmap::kPixFormat_RGB565:
+ case nsVDPixmap::kPixFormat_XRGB1555:
+ return false;
+ }
+
+ const VDPixmapFormatInfo& formatInfo = VDPixmapGetInfo(src.format);
+
+ const int qw = -(-dst.w >> formatInfo.qwbits);
+ const int qh = -(-dst.h >> formatInfo.qhbits);
+ const int auxw = -(-dst.w >> formatInfo.auxwbits);
+ const int auxh = -(-dst.h >> formatInfo.auxhbits);
+
+ switch(formatInfo.auxbufs) {
+ case 2:
+ VDPixmapBltAlphaConst8((uint8 *)dst.data3, dst.pitch3, (const uint8 *)src.data3, src.pitch3, auxw, auxh, ialpha);
+ case 1:
+ VDPixmapBltAlphaConst8((uint8 *)dst.data2, dst.pitch2, (const uint8 *)src.data2, src.pitch2, auxw, auxh, ialpha);
+ case 0:
+ VDPixmapBltAlphaConst8((uint8 *)dst.data, dst.pitch, (const uint8 *)src.data, src.pitch, formatInfo.qsize * qw, qh, ialpha);
+ }
+
+ return true;
+}
+
+void VDPixmapBltAlphaConst8(uint8 *dst, ptrdiff_t dstpitch, const uint8 *src, ptrdiff_t srcpitch, uint32 w, uint32 h, uint32 ialpha) {
+ dstpitch -= w;
+ srcpitch -= w;
+ do {
+ uint32 w2 = w;
+ do {
+ sint32 sc = *src;
+ sint32 dc = *dst;
+
+ *dst = dc + (((sc-dc)*ialpha + 128) >> 8);
+ ++src;
+ ++dst;
+ } while(--w2);
+
+ src += srcpitch;
+ dst += dstpitch;
+ } while(--h);
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/blt.cpp b/src/thirdparty/VirtualDub/Kasumi/source/blt.cpp
new file mode 100644
index 000000000..75e5542a9
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/blt.cpp
@@ -0,0 +1,273 @@
+#include <vector>
+#include <vd2/system/memory.h>
+#include <vd2/system/cpuaccel.h>
+#include <vd2/system/vdstl.h>
+#include <vd2/Kasumi/pixmap.h>
+#include <vd2/Kasumi/pixmaputils.h>
+#include <vd2/Kasumi/pixmapops.h>
+
+#if _MSC_VER >= 1300
+ #define VDNOINLINE __declspec(noinline)
+#else
+ #define VDNOINLINE
+#endif
+
+using namespace nsVDPixmap;
+
+namespace {
+ typedef void (*tpPalettedBlitter)(void *dst, ptrdiff_t dstpitch, const void *src, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h, const void *pal);
+ typedef void (*tpChunkyBlitter)(void *dst, ptrdiff_t dstpitch, const void *src, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h);
+ typedef void (*tpPlanarBlitter)(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h);
+}
+
+bool VDPixmapBltDirect(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h);
+
+void VDPixmapBltDirectPalettedConversion(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h, tpPalettedBlitter pBlitter) {
+ uint8 palbytes[256 * 3];
+
+ int palsize;
+
+ switch(src.format) {
+ case kPixFormat_Pal1:
+ palsize = 2;
+ break;
+ case kPixFormat_Pal2:
+ palsize = 4;
+ break;
+ case kPixFormat_Pal4:
+ palsize = 16;
+ break;
+ case kPixFormat_Pal8:
+ palsize = 256;
+ break;
+ default:
+ VDNEVERHERE;
+ }
+
+ VDASSERT(src.palette);
+
+ VDPixmap srcpal = { (void *)src.palette, NULL, palsize, 1, 0, kPixFormat_XRGB8888 };
+ VDPixmap dstpal = { palbytes, NULL, palsize, 1, 0, dst.format };
+
+ VDVERIFY(VDPixmapBltDirect(dstpal, srcpal, palsize, 1));
+
+ pBlitter(dst.data, dst.pitch, src.data, src.pitch, w, h, palbytes);
+}
+
+tpVDPixBltTable VDPixmapGetBlitterTable() {
+#if defined(_WIN32) && defined(_M_IX86)
+ static tpVDPixBltTable pBltTable;
+
+ if (CPUGetEnabledExtensions() & CPUF_SUPPORTS_MMX) {
+ return VDGetPixBltTableX86MMX();
+ } else {
+ return VDGetPixBltTableX86Scalar();
+ }
+#else
+ static tpVDPixBltTable pBltTable = VDGetPixBltTableReference();
+ return pBltTable;
+#endif
+}
+
+bool VDPixmapBltDirect(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h) {
+ if ((unsigned)src.format >= kPixFormat_Max_Standard) {
+ VDASSERT(false);
+ return false;
+ }
+
+ if ((unsigned)dst.format >= kPixFormat_Max_Standard) {
+ VDASSERT(false);
+ return false;
+ }
+
+ const VDPixmapFormatInfo& srcinfo = VDPixmapGetInfo(src.format);
+
+ if (src.format == dst.format) {
+ int qw = w;
+ int qh = h;
+
+ if (srcinfo.qchunky) {
+ qw = (qw + srcinfo.qw - 1) / srcinfo.qw;
+ qh = -(-h >> srcinfo.qhbits);
+ }
+
+ const int auxw = -(-w >> srcinfo.auxwbits);
+ const int auxh = -(-h >> srcinfo.auxhbits);
+
+ switch(srcinfo.auxbufs) {
+ case 2:
+ VDMemcpyRect(dst.data3, dst.pitch3, src.data3, src.pitch3, srcinfo.auxsize * auxw, auxh);
+ case 1:
+ VDMemcpyRect(dst.data2, dst.pitch2, src.data2, src.pitch2, srcinfo.auxsize * auxw, auxh);
+ case 0:
+ VDMemcpyRect(dst.data, dst.pitch, src.data, src.pitch, srcinfo.qsize * qw, qh);
+ }
+
+ return true;
+ }
+
+ VDPixmapBlitterFn pBlitter = VDPixmapGetBlitterTable()[src.format][dst.format];
+
+ if (!pBlitter)
+ return false;
+
+ pBlitter(dst, src, w, h);
+ return true;
+}
+
+bool VDPixmapIsBltPossible(int dst_format, int src_format) {
+ if (src_format == dst_format)
+ return true;
+
+ tpVDPixBltTable tab(VDPixmapGetBlitterTable());
+
+ if (tab[src_format][dst_format])
+ return true;
+
+ const VDPixmapFormatInfo& srcinfo = VDPixmapGetInfo(src_format);
+ const VDPixmapFormatInfo& dstinfo = VDPixmapGetInfo(dst_format);
+
+ if (srcinfo.auxbufs > 0 || dstinfo.auxbufs > 0)
+ return false; // fail, planar buffers involved (can't do scanlines independently)
+
+ return (tab[src_format][kPixFormat_YUV444_XVYU] && tab[kPixFormat_YUV444_XVYU][dst_format])
+ ||(tab[src_format][kPixFormat_XRGB8888] && tab[kPixFormat_XRGB8888][dst_format]);
+}
+
+bool VDNOINLINE VDPixmapBltTwoStage(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h) {
+ const VDPixmapFormatInfo& srcinfo = VDPixmapGetInfo(src.format);
+ const VDPixmapFormatInfo& dstinfo = VDPixmapGetInfo(dst.format);
+
+ if (srcinfo.auxbufs > 0 || dstinfo.auxbufs > 0)
+ return false; // fail, planar buffers involved
+
+ if (srcinfo.qh > 1)
+ return false; // fail, vertically packed formats involved
+
+ if (srcinfo.palsize)
+ return false; // fail, paletted formats involved
+
+	// Allocate a 4xW buffer and try round-tripping through either
+	// RGB32 or XVYU.
+ vdblock<uint32> tempBuf;
+
+ tpVDPixBltTable tab(VDPixmapGetBlitterTable());
+
+ VDPixmap linesrc(src);
+ VDPixmap linedst(dst);
+ VDPixmap linetmp = {};
+
+ if (w < 1024) {
+ linetmp.data = _alloca(sizeof(uint32) * w);
+ } else {
+ tempBuf.resize(w + 1);
+ linetmp.data = tempBuf.data();
+ }
+ linetmp.pitch = 0;
+ linetmp.format = kPixFormat_YUV444_XVYU;
+ linetmp.w = w;
+ linetmp.h = 1;
+
+ VDPixmapBlitterFn pb1 = tab[src.format][kPixFormat_YUV444_XVYU];
+ VDPixmapBlitterFn pb2 = tab[kPixFormat_YUV444_XVYU][dst.format];
+ if (!pb1 || !pb2) {
+ pb1 = tab[src.format][kPixFormat_XRGB8888];
+ pb2 = tab[kPixFormat_XRGB8888][dst.format];
+ if (!pb1 || !pb2)
+ return false;
+
+ linetmp.format = kPixFormat_XRGB8888;
+ }
+
+ do {
+ pb1(linetmp, linesrc, w, 1);
+ pb2(linedst, linetmp, w, 1);
+ vdptrstep(linesrc.data, linesrc.pitch);
+ vdptrstep(linedst.data, linedst.pitch);
+ } while(--h);
+ return true;
+}
+
+bool VDPixmapBltFast(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h) {
+ if (VDPixmapBltDirect(dst, src, w, h))
+ return true;
+
+ // Oro... let's see if we can do a two-stage conversion.
+ return VDPixmapBltTwoStage(dst, src, w, h);
+}
+
+bool VDPixmapBlt(const VDPixmap& dst, const VDPixmap& src) {
+ vdpixsize w = std::min<vdpixsize>(src.w, dst.w);
+ vdpixsize h = std::min<vdpixsize>(src.h, dst.h);
+
+ if (!w || !h)
+ return true;
+
+ return VDPixmapBltFast(dst, src, w, h);
+}
+
+bool VDPixmapBlt(const VDPixmap& dst, vdpixpos x1, vdpixpos y1, const VDPixmap& src, vdpixpos x2, vdpixpos y2, vdpixsize w, vdpixsize h) {
+ if (x1 < 0) {
+ x2 -= x1;
+ w -= x1;
+ x1 = 0;
+ }
+
+ if (y1 < 0) {
+ y2 -= y1;
+ h -= y1;
+ y1 = 0;
+ }
+
+ if (x2 < 0) {
+ x1 -= x2;
+ w -= x2;
+ x2 = 0;
+ }
+
+ if (y2 < 0) {
+ y1 -= y2;
+ h -= y2;
+ y2 = 0;
+ }
+
+ if (w > dst.w - x1)
+ w = dst.w - x1;
+
+ if (h > dst.h - y1)
+ h = dst.h - y1;
+
+ if (w > src.w - x2)
+ w = src.w - x2;
+
+ if (h > src.h - y2)
+ h = src.h - y2;
+
+ if (w>=0 && h >= 0) {
+ VDPixmap dst2(VDPixmapOffset(dst, x1, y1));
+ VDPixmap src2(VDPixmapOffset(src, x2, y2));
+
+ return VDPixmapBltFast(dst2, src2, w, h);
+ }
+
+ return true;
+}
+
+extern bool VDPixmapStretchBltNearest_reference(const VDPixmap& dst, sint32 x1, sint32 y1, sint32 x2, sint32 y2, const VDPixmap& src, sint32 u1, sint32 v1, sint32 u2, sint32 v2);
+extern bool VDPixmapStretchBltBilinear_reference(const VDPixmap& dst, sint32 x1, sint32 y1, sint32 x2, sint32 y2, const VDPixmap& src, sint32 u1, sint32 v1, sint32 u2, sint32 v2);
+
+bool VDPixmapStretchBltNearest(const VDPixmap& dst, const VDPixmap& src) {
+ return VDPixmapStretchBltNearest(dst, 0, 0, dst.w<<16, dst.h<<16, src, 0, 0, src.w<<16, src.h<<16);
+}
+
+bool VDPixmapStretchBltNearest(const VDPixmap& dst, sint32 x1, sint32 y1, sint32 x2, sint32 y2, const VDPixmap& src, sint32 u1, sint32 v1, sint32 u2, sint32 v2) {
+ return VDPixmapStretchBltNearest_reference(dst, x1, y1, x2, y2, src, u1, v1, u2, v2);
+}
+
+bool VDPixmapStretchBltBilinear(const VDPixmap& dst, const VDPixmap& src) {
+ return VDPixmapStretchBltBilinear(dst, 0, 0, dst.w<<16, dst.h<<16, src, 0, 0, src.w<<16, src.h<<16);
+}
+
+bool VDPixmapStretchBltBilinear(const VDPixmap& dst, sint32 x1, sint32 y1, sint32 x2, sint32 y2, const VDPixmap& src, sint32 u1, sint32 v1, sint32 u2, sint32 v2) {
+ return VDPixmapStretchBltBilinear_reference(dst, x1, y1, x2, y2, src, u1, v1, u2, v2);
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/blt_reference.cpp b/src/thirdparty/VirtualDub/Kasumi/source/blt_reference.cpp
new file mode 100644
index 000000000..c4dccce9f
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/blt_reference.cpp
@@ -0,0 +1,259 @@
+#include <vd2/system/vdtypes.h>
+#include <vd2/Kasumi/pixmap.h>
+#include <vd2/Kasumi/pixmaputils.h>
+#include "blt_setup.h"
+
+#define DECLARE_PALETTED(x, y) extern void VDPixmapBlt_##x##_to_##y##_reference(void *dst0, ptrdiff_t dstpitch, const void *src0, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h, const void *pal0)
+#define DECLARE_RGB(x, y) extern void VDPixmapBlt_##x##_to_##y##_reference(void *dst0, ptrdiff_t dstpitch, const void *src0, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h)
+#define DECLARE_YUV(x, y) extern void VDPixmapBlt_##x##_to_##y##_reference(void *dst0, ptrdiff_t dstpitch, const void *src0, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h)
+#define DECLARE_YUV_REV(x, y) void VDPixmapBlt_##x##_to_##y##_reference(void *dst0, ptrdiff_t dstpitch, const void *src0, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h)
+#define DECLARE_YUV_PLANAR(x, y) extern void VDPixmapBlt_##x##_to_##y##_reference(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h)
+
+DECLARE_RGB(RGB565, XRGB1555);
+DECLARE_RGB(RGB888, XRGB1555);
+DECLARE_RGB(XRGB8888, XRGB1555);
+DECLARE_RGB(XRGB1555, RGB565);
+DECLARE_RGB(RGB888, RGB565);
+DECLARE_RGB(XRGB8888, RGB565);
+DECLARE_RGB(XRGB1555, RGB888);
+DECLARE_RGB(RGB565, RGB888);
+DECLARE_RGB(XRGB8888, RGB888);
+DECLARE_RGB(XRGB1555, XRGB8888);
+DECLARE_RGB(RGB565, XRGB8888);
+DECLARE_RGB(RGB888, XRGB8888);
+
+DECLARE_PALETTED(Pal1, Any8);
+DECLARE_PALETTED(Pal1, Any16);
+DECLARE_PALETTED(Pal1, Any24);
+DECLARE_PALETTED(Pal1, Any32);
+DECLARE_PALETTED(Pal2, Any8);
+DECLARE_PALETTED(Pal2, Any16);
+DECLARE_PALETTED(Pal2, Any24);
+DECLARE_PALETTED(Pal2, Any32);
+DECLARE_PALETTED(Pal4, Any8);
+DECLARE_PALETTED(Pal4, Any16);
+DECLARE_PALETTED(Pal4, Any24);
+DECLARE_PALETTED(Pal4, Any32);
+DECLARE_PALETTED(Pal8, Any8);
+DECLARE_PALETTED(Pal8, Any16);
+DECLARE_PALETTED(Pal8, Any24);
+DECLARE_PALETTED(Pal8, Any32);
+
+DECLARE_YUV(XVYU, UYVY);
+DECLARE_YUV(XVYU, YUYV);
+DECLARE_YUV(Y8, UYVY);
+DECLARE_YUV(Y8, YUYV);
+DECLARE_YUV(UYVY, Y8);
+DECLARE_YUV(YUYV, Y8);
+DECLARE_YUV(UYVY, YUYV);
+DECLARE_YUV_PLANAR(YUV411, YV12);
+
+DECLARE_YUV(UYVY, XRGB1555);
+DECLARE_YUV(UYVY, RGB565);
+DECLARE_YUV(UYVY, RGB888);
+DECLARE_YUV(UYVY, XRGB8888);
+DECLARE_YUV(YUYV, XRGB1555);
+DECLARE_YUV(YUYV, RGB565);
+DECLARE_YUV(YUYV, RGB888);
+DECLARE_YUV(YUYV, XRGB8888);
+DECLARE_YUV(Y8, XRGB1555);
+DECLARE_YUV(Y8, RGB565);
+DECLARE_YUV(Y8, RGB888);
+DECLARE_YUV(Y8, XRGB8888);
+
+DECLARE_YUV_REV(XRGB1555, Y8);
+DECLARE_YUV_REV(RGB565, Y8);
+DECLARE_YUV_REV(RGB888, Y8);
+DECLARE_YUV_REV(XRGB8888, Y8);
+
+DECLARE_YUV_REV(XRGB1555, XVYU);
+DECLARE_YUV_REV(RGB565, XVYU);
+DECLARE_YUV_REV(RGB888, XVYU);
+DECLARE_YUV_REV(XRGB8888, XVYU);
+
+DECLARE_YUV_PLANAR(YV12, XRGB1555);
+DECLARE_YUV_PLANAR(YV12, RGB565);
+DECLARE_YUV_PLANAR(YV12, RGB888);
+DECLARE_YUV_PLANAR(YV12, XRGB8888);
+
+DECLARE_YUV_PLANAR(YUV411, XRGB1555);
+DECLARE_YUV_PLANAR(YUV411, RGB565);
+DECLARE_YUV_PLANAR(YUV411, RGB888);
+DECLARE_YUV_PLANAR(YUV411, XRGB8888);
+
+extern void VDPixmapBlt_YUVPlanar_decode_reference(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h);
+extern void VDPixmapBlt_YUVPlanar_encode_reference(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h);
+extern void VDPixmapBlt_YUVPlanar_convert_reference(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h);
+extern void VDPixmapBlt_UberblitAdapter(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h);
+
+using namespace nsVDPixmap;
+
+void VDPixmapInitBlittersReference(VDPixmapBlitterTable& table) {
+ // use uberblit as the baseline
+ VDPixmapFormatSubset uberblitSrcFormats;
+ VDPixmapFormatSubset uberblitDstFormats;
+
+ uberblitSrcFormats =
+ kPixFormat_Pal1,
+ kPixFormat_Pal2,
+ kPixFormat_Pal4,
+ kPixFormat_Pal8,
+ kPixFormat_XRGB1555,
+ kPixFormat_RGB565,
+ kPixFormat_RGB888,
+ kPixFormat_XRGB8888,
+ kPixFormat_Y8,
+ kPixFormat_YUV422_UYVY,
+ kPixFormat_YUV422_YUYV,
+ kPixFormat_YUV444_XVYU,
+ kPixFormat_YUV444_Planar,
+ kPixFormat_YUV422_Planar,
+ kPixFormat_YUV422_Planar_16F,
+ kPixFormat_YUV420_Planar,
+ kPixFormat_YUV411_Planar,
+ kPixFormat_YUV410_Planar,
+ kPixFormat_YUV422_Planar_Centered,
+ kPixFormat_YUV420_Planar_Centered,
+ kPixFormat_YUV422_V210,
+ kPixFormat_YUV422_UYVY_709,
+ kPixFormat_YUV420_NV12;
+
+ uberblitDstFormats =
+ kPixFormat_XRGB1555,
+ kPixFormat_RGB565,
+ kPixFormat_RGB888,
+ kPixFormat_XRGB8888,
+ kPixFormat_Y8,
+ kPixFormat_YUV422_UYVY,
+ kPixFormat_YUV422_YUYV,
+ kPixFormat_YUV444_XVYU,
+ kPixFormat_YUV444_Planar,
+ kPixFormat_YUV422_Planar,
+ kPixFormat_YUV422_Planar_16F,
+ kPixFormat_YUV420_Planar,
+ kPixFormat_YUV411_Planar,
+ kPixFormat_YUV410_Planar,
+ kPixFormat_YUV422_Planar_Centered,
+ kPixFormat_YUV420_Planar_Centered,
+ kPixFormat_YUV422_V210,
+ kPixFormat_YUV422_UYVY_709,
+ kPixFormat_YUV420_NV12;
+
+ table.AddBlitter(uberblitSrcFormats, uberblitDstFormats, VDPixmapBlt_UberblitAdapter);
+
+ // standard formats
+
+ table.AddBlitter(kPixFormat_Pal1, kPixFormat_Y8, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal1_to_Any8_reference>);
+ table.AddBlitter(kPixFormat_Pal1, kPixFormat_XRGB1555, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal1_to_Any16_reference>);
+ table.AddBlitter(kPixFormat_Pal1, kPixFormat_RGB565, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal1_to_Any16_reference>);
+ table.AddBlitter(kPixFormat_Pal1, kPixFormat_RGB888, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal1_to_Any24_reference>);
+ table.AddBlitter(kPixFormat_Pal1, kPixFormat_XRGB8888, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal1_to_Any32_reference>);
+ table.AddBlitter(kPixFormat_Pal2, kPixFormat_Y8, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal2_to_Any8_reference>);
+ table.AddBlitter(kPixFormat_Pal2, kPixFormat_XRGB1555, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal2_to_Any16_reference>);
+ table.AddBlitter(kPixFormat_Pal2, kPixFormat_RGB565, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal2_to_Any16_reference>);
+ table.AddBlitter(kPixFormat_Pal2, kPixFormat_RGB888, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal2_to_Any24_reference>);
+ table.AddBlitter(kPixFormat_Pal2, kPixFormat_XRGB8888, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal2_to_Any32_reference>);
+ table.AddBlitter(kPixFormat_Pal4, kPixFormat_Y8, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal4_to_Any8_reference>);
+ table.AddBlitter(kPixFormat_Pal4, kPixFormat_XRGB1555, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal4_to_Any16_reference>);
+ table.AddBlitter(kPixFormat_Pal4, kPixFormat_RGB565, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal4_to_Any16_reference>);
+ table.AddBlitter(kPixFormat_Pal4, kPixFormat_RGB888, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal4_to_Any24_reference>);
+ table.AddBlitter(kPixFormat_Pal4, kPixFormat_XRGB8888, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal4_to_Any32_reference>);
+ table.AddBlitter(kPixFormat_Pal8, kPixFormat_Y8, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal8_to_Any8_reference>);
+ table.AddBlitter(kPixFormat_Pal8, kPixFormat_XRGB1555, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal8_to_Any16_reference>);
+ table.AddBlitter(kPixFormat_Pal8, kPixFormat_RGB565, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal8_to_Any16_reference>);
+ table.AddBlitter(kPixFormat_Pal8, kPixFormat_RGB888, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal8_to_Any24_reference>);
+ table.AddBlitter(kPixFormat_Pal8, kPixFormat_XRGB8888, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal8_to_Any32_reference>);
+
+ table.AddBlitter(kPixFormat_XRGB1555, kPixFormat_RGB565, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_XRGB1555_to_RGB565_reference>);
+ table.AddBlitter(kPixFormat_XRGB1555, kPixFormat_RGB888, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_XRGB1555_to_RGB888_reference>);
+ table.AddBlitter(kPixFormat_XRGB1555, kPixFormat_XRGB8888, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_XRGB1555_to_XRGB8888_reference>);
+ table.AddBlitter(kPixFormat_RGB565, kPixFormat_XRGB1555, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_RGB565_to_XRGB1555_reference>);
+ table.AddBlitter(kPixFormat_RGB565, kPixFormat_RGB888, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_RGB565_to_RGB888_reference>);
+ table.AddBlitter(kPixFormat_RGB565, kPixFormat_XRGB8888, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_RGB565_to_XRGB8888_reference>);
+ table.AddBlitter(kPixFormat_RGB888, kPixFormat_XRGB1555, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_RGB888_to_XRGB1555_reference>);
+ table.AddBlitter(kPixFormat_RGB888, kPixFormat_RGB565, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_RGB888_to_RGB565_reference>);
+ table.AddBlitter(kPixFormat_RGB888, kPixFormat_XRGB8888, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_RGB888_to_XRGB8888_reference>);
+ table.AddBlitter(kPixFormat_XRGB8888, kPixFormat_XRGB1555, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_XRGB8888_to_XRGB1555_reference>);
+ table.AddBlitter(kPixFormat_XRGB8888, kPixFormat_RGB565, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_XRGB8888_to_RGB565_reference>);
+ table.AddBlitter(kPixFormat_XRGB8888, kPixFormat_RGB888, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_XRGB8888_to_RGB888_reference>);
+
+ table.AddBlitter(kPixFormat_YUV444_XVYU, kPixFormat_YUV422_UYVY, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_XVYU_to_UYVY_reference>);
+ table.AddBlitter(kPixFormat_YUV444_XVYU, kPixFormat_YUV422_YUYV, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_XVYU_to_YUYV_reference>);
+ table.AddBlitter(kPixFormat_Y8, kPixFormat_YUV422_UYVY, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_Y8_to_UYVY_reference>);
+ table.AddBlitter(kPixFormat_Y8, kPixFormat_YUV422_YUYV, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_Y8_to_YUYV_reference>);
+ table.AddBlitter(kPixFormat_YUV422_UYVY, kPixFormat_Y8, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_UYVY_to_Y8_reference>);
+ table.AddBlitter(kPixFormat_YUV422_YUYV, kPixFormat_Y8, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_YUYV_to_Y8_reference>);
+
+ table.AddBlitter(kPixFormat_YUV422_UYVY, kPixFormat_XRGB1555, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_UYVY_to_XRGB1555_reference>);
+ table.AddBlitter(kPixFormat_YUV422_UYVY, kPixFormat_RGB565, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_UYVY_to_RGB565_reference>);
+ table.AddBlitter(kPixFormat_YUV422_UYVY, kPixFormat_RGB888, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_UYVY_to_RGB888_reference>);
+ table.AddBlitter(kPixFormat_YUV422_UYVY, kPixFormat_XRGB8888, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_UYVY_to_XRGB8888_reference>);
+ table.AddBlitter(kPixFormat_YUV422_YUYV, kPixFormat_XRGB1555, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_YUYV_to_XRGB1555_reference>);
+ table.AddBlitter(kPixFormat_YUV422_YUYV, kPixFormat_RGB565, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_YUYV_to_RGB565_reference>);
+ table.AddBlitter(kPixFormat_YUV422_YUYV, kPixFormat_RGB888, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_YUYV_to_RGB888_reference>);
+ table.AddBlitter(kPixFormat_YUV422_YUYV, kPixFormat_XRGB8888, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_YUYV_to_XRGB8888_reference>);
+ table.AddBlitter(kPixFormat_Y8, kPixFormat_XRGB1555, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_Y8_to_XRGB1555_reference>);
+ table.AddBlitter(kPixFormat_Y8, kPixFormat_RGB565, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_Y8_to_RGB565_reference>);
+ table.AddBlitter(kPixFormat_Y8, kPixFormat_RGB888, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_Y8_to_RGB888_reference>);
+ table.AddBlitter(kPixFormat_Y8, kPixFormat_XRGB8888, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_Y8_to_XRGB8888_reference>);
+
+ table.AddBlitter(kPixFormat_XRGB1555, kPixFormat_YUV444_XVYU, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_XRGB1555_to_XVYU_reference>);
+ table.AddBlitter(kPixFormat_RGB565, kPixFormat_YUV444_XVYU, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_RGB565_to_XVYU_reference>);
+ table.AddBlitter(kPixFormat_RGB888, kPixFormat_YUV444_XVYU, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_RGB888_to_XVYU_reference>);
+ table.AddBlitter(kPixFormat_XRGB8888, kPixFormat_YUV444_XVYU, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_XRGB8888_to_XVYU_reference>);
+
+ table.AddBlitter(kPixFormat_XRGB1555, kPixFormat_Y8, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_XRGB1555_to_Y8_reference>);
+ table.AddBlitter(kPixFormat_RGB565, kPixFormat_Y8, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_RGB565_to_Y8_reference>);
+ table.AddBlitter(kPixFormat_RGB888, kPixFormat_Y8, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_RGB888_to_Y8_reference>);
+ table.AddBlitter(kPixFormat_XRGB8888, kPixFormat_Y8, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_XRGB8888_to_Y8_reference>);
+
+ table.AddBlitter(kPixFormat_YUV411_Planar, kPixFormat_YUV420_Planar, VDPixmapBlt_YUV411_to_YV12_reference);
+
+ table.AddBlitter(kPixFormat_YUV422_UYVY, kPixFormat_YUV422_YUYV, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_UYVY_to_YUYV_reference>);
+ table.AddBlitter(kPixFormat_YUV422_YUYV, kPixFormat_YUV422_UYVY, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_UYVY_to_YUYV_reference>); // not an error -- same routine
+
+ //////////////////////////////////////////////////////////
+
+ VDPixmapFormatSubset srcFormats;
+ VDPixmapFormatSubset dstFormats;
+
+ srcFormats = kPixFormat_YUV444_Planar,
+ kPixFormat_YUV422_Planar,
+ kPixFormat_YUV420_Planar,
+ kPixFormat_YUV411_Planar,
+ kPixFormat_YUV410_Planar,
+ kPixFormat_YUV422_Planar_Centered,
+ kPixFormat_YUV420_Planar_Centered;
+
+ dstFormats = kPixFormat_XRGB1555, kPixFormat_RGB565, kPixFormat_RGB888, kPixFormat_XRGB8888, kPixFormat_YUV422_UYVY, kPixFormat_YUV422_YUYV;
+
+ table.AddBlitter(srcFormats, dstFormats, VDPixmapBlt_YUVPlanar_decode_reference);
+
+ //////////////////////////////////////////////////////////
+
+ dstFormats = kPixFormat_YUV444_Planar, kPixFormat_YUV422_Planar, kPixFormat_YUV420_Planar, kPixFormat_YUV411_Planar, kPixFormat_YUV410_Planar, kPixFormat_YUV422_Planar_Centered, kPixFormat_YUV420_Planar_Centered;
+ srcFormats = kPixFormat_XRGB1555, kPixFormat_RGB565, kPixFormat_RGB888, kPixFormat_XRGB8888, kPixFormat_YUV422_UYVY, kPixFormat_YUV422_YUYV;
+
+ table.AddBlitter(srcFormats, dstFormats, VDPixmapBlt_YUVPlanar_encode_reference);
+
+ //////////////////////////////////////////////////////////
+
+ srcFormats = kPixFormat_YUV444_Planar, kPixFormat_YUV422_Planar, kPixFormat_YUV420_Planar, kPixFormat_YUV411_Planar, kPixFormat_YUV410_Planar, kPixFormat_Y8, kPixFormat_YUV422_Planar_Centered, kPixFormat_YUV420_Planar_Centered;
+ dstFormats = kPixFormat_YUV444_Planar, kPixFormat_YUV422_Planar, kPixFormat_YUV420_Planar, kPixFormat_YUV411_Planar, kPixFormat_YUV410_Planar, kPixFormat_Y8, kPixFormat_YUV422_Planar_Centered, kPixFormat_YUV420_Planar_Centered;
+
+ table.AddBlitter(srcFormats, dstFormats, VDPixmapBlt_YUVPlanar_convert_reference);
+}
+
+tpVDPixBltTable VDGetPixBltTableReferenceInternal() {
+ static VDPixmapBlitterTable sReferenceTable;
+
+ VDPixmapInitBlittersReference(sReferenceTable);
+
+ return sReferenceTable.mTable;
+}
+
+tpVDPixBltTable VDGetPixBltTableReference() {
+ static tpVDPixBltTable spTable = VDGetPixBltTableReferenceInternal();
+
+ return spTable;
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/blt_reference_pal.cpp b/src/thirdparty/VirtualDub/Kasumi/source/blt_reference_pal.cpp
new file mode 100644
index 000000000..4a103de3b
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/blt_reference_pal.cpp
@@ -0,0 +1,545 @@
+#include <vd2/system/vdtypes.h>
+#include <vd2/Kasumi/pixmap.h>
+#include <vd2/Kasumi/pixmaputils.h>
+
+#define DECLARE_PALETTED(x, y) void VDPixmapBlt_##x##_to_##y##_reference(void *dst0, ptrdiff_t dstpitch, const void *src0, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h, const void *pal0)
+
+///////////////////////////////////////////////////////////////////////////
+//
+// RGB blitters: Pal1 ->
+//
+///////////////////////////////////////////////////////////////////////////
+
+DECLARE_PALETTED(Pal1, Any8) {
+ const uint8 *src = (const uint8 *)src0;
+ uint8 *dst = (uint8 *)dst0;
+ const uint8 *pal = (const uint8 *)pal0;
+
+ src += (w-1) >> 3;
+ dst += (w-1) & ~7;
+
+ srcpitch += (w+7) >> 3;
+ dstpitch += (w+7) & ~7;
+
+ do {
+ int wt = w;
+
+ uint8 v = src[0] >> ((-wt) & 7);
+
+ switch(wt & 7) {
+ do {
+ v = src[0];
+
+ case 0: dst[7] = pal[v&1]; v >>= 1;
+ case 7: dst[6] = pal[v&1]; v >>= 1;
+ case 6: dst[5] = pal[v&1]; v >>= 1;
+ case 5: dst[4] = pal[v&1]; v >>= 1;
+ case 4: dst[3] = pal[v&1]; v >>= 1;
+ case 3: dst[2] = pal[v&1]; v >>= 1;
+ case 2: dst[1] = pal[v&1]; v >>= 1;
+ case 1: dst[0] = pal[v&1]; v >>= 1;
+
+ dst -= 8;
+ --src;
+ } while((wt -= 8) > 0);
+ }
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_PALETTED(Pal1, Any16) {
+ const uint8 *src = (const uint8 *)src0;
+ uint16 *dst = (uint16 *)dst0;
+ const uint16 *pal = (const uint16 *)pal0;
+
+ src += (w-1) >> 3;
+ dst += (w-1) & ~7;
+
+ srcpitch += (w+7) >> 3;
+ dstpitch += ((w+7) & ~7) * 2;
+
+ do {
+ int wt = w;
+
+ uint8 v = src[0] >> ((-wt) & 7);
+
+ switch(wt & 7) {
+ do {
+ v = src[0];
+
+ case 0: dst[7] = pal[v&1]; v >>= 1;
+ case 7: dst[6] = pal[v&1]; v >>= 1;
+ case 6: dst[5] = pal[v&1]; v >>= 1;
+ case 5: dst[4] = pal[v&1]; v >>= 1;
+ case 4: dst[3] = pal[v&1]; v >>= 1;
+ case 3: dst[2] = pal[v&1]; v >>= 1;
+ case 2: dst[1] = pal[v&1]; v >>= 1;
+ case 1: dst[0] = pal[v&1]; v >>= 1;
+
+ dst -= 8;
+ --src;
+ } while((wt -= 8) > 0);
+ }
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_PALETTED(Pal1, Any24) {
+ const uint8 *src = (const uint8 *)src0;
+ uint8 *dst = (uint8 *)dst0;
+ const uint8 *pal = (const uint8 *)pal0;
+
+ src += (w-1) >> 3;
+ dst += ((w-1) & ~7) * 3;
+
+ srcpitch += (w+7) >> 3;
+ dstpitch += ((w+7) & ~7) * 3;
+
+ do {
+ int wt = w;
+
+ uint8 v = src[0] >> ((-wt) & 7);
+ const uint8 *pe;
+
+ switch(wt & 7) {
+ do {
+ v = src[0];
+
+ case 0: pe = &pal[3*(v&1)]; dst[7*3+0] = pe[0]; dst[7*3+1] = pe[1]; dst[7*3+2] = pe[2]; v >>= 1;
+ case 7: pe = &pal[3*(v&1)]; dst[6*3+0] = pe[0]; dst[6*3+1] = pe[1]; dst[6*3+2] = pe[2]; v >>= 1;
+ case 6: pe = &pal[3*(v&1)]; dst[5*3+0] = pe[0]; dst[5*3+1] = pe[1]; dst[5*3+2] = pe[2]; v >>= 1;
+ case 5: pe = &pal[3*(v&1)]; dst[4*3+0] = pe[0]; dst[4*3+1] = pe[1]; dst[4*3+2] = pe[2]; v >>= 1;
+ case 4: pe = &pal[3*(v&1)]; dst[3*3+0] = pe[0]; dst[3*3+1] = pe[1]; dst[3*3+2] = pe[2]; v >>= 1;
+ case 3: pe = &pal[3*(v&1)]; dst[2*3+0] = pe[0]; dst[2*3+1] = pe[1]; dst[2*3+2] = pe[2]; v >>= 1;
+ case 2: pe = &pal[3*(v&1)]; dst[1*3+0] = pe[0]; dst[1*3+1] = pe[1]; dst[1*3+2] = pe[2]; v >>= 1;
+ case 1: pe = &pal[3*(v&1)]; dst[0*3+0] = pe[0]; dst[0*3+1] = pe[1]; dst[0*3+2] = pe[2]; v >>= 1;
+
+ dst -= 24;
+ --src;
+ } while((wt -= 8) > 0);
+ }
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_PALETTED(Pal1, Any32) {
+ const uint8 *src = (const uint8 *)src0;
+ uint32 *dst = (uint32 *)dst0;
+ const uint32 *pal = (const uint32 *)pal0;
+
+ src += (w-1) >> 3;
+ dst += (w-1) & ~7;
+
+ srcpitch += (w+7) >> 3;
+ dstpitch += ((w+7) & ~7) * 4;
+
+ do {
+ int wt = w;
+
+ uint8 v = src[0] >> ((-wt) & 7);
+
+ switch(wt & 7) {
+ do {
+ v = src[0];
+
+ case 0: dst[7] = pal[v&1]; v >>= 1;
+ case 7: dst[6] = pal[v&1]; v >>= 1;
+ case 6: dst[5] = pal[v&1]; v >>= 1;
+ case 5: dst[4] = pal[v&1]; v >>= 1;
+ case 4: dst[3] = pal[v&1]; v >>= 1;
+ case 3: dst[2] = pal[v&1]; v >>= 1;
+ case 2: dst[1] = pal[v&1]; v >>= 1;
+ case 1: dst[0] = pal[v&1]; v >>= 1;
+
+ dst -= 8;
+ --src;
+ } while((wt -= 8) > 0);
+ }
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+///////////////////////////////////////////////////////////////////////////
+//
+// RGB blitters: Pal2 ->
+//
+///////////////////////////////////////////////////////////////////////////
+
+DECLARE_PALETTED(Pal2, Any8) {
+ const uint8 *src = (const uint8 *)src0;
+ uint8 *dst = (uint8 *)dst0;
+ const uint8 *pal = (const uint8 *)pal0;
+
+ src += (w-1) >> 2;
+ dst += (w-1) & ~3;
+
+ srcpitch += (w+3) >> 2;
+ dstpitch += (w+3) & ~3;
+
+ do {
+ int wt = w;
+
+ uint8 v = src[0] >> (((-wt) & 3)*2);
+
+ switch(wt & 3) {
+ do {
+ v = src[0];
+
+ case 0: dst[3] = pal[v&3]; v >>= 2;
+ case 3: dst[2] = pal[v&3]; v >>= 2;
+ case 2: dst[1] = pal[v&3]; v >>= 2;
+ case 1: dst[0] = pal[v&3]; v >>= 2;
+
+ dst -= 4;
+ --src;
+ } while((wt -= 4) > 0);
+ }
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_PALETTED(Pal2, Any16) {
+ const uint8 *src = (const uint8 *)src0;
+ uint16 *dst = (uint16 *)dst0;
+ const uint16 *pal = (const uint16 *)pal0;
+
+ src += (w-1) >> 2;
+ dst += (w-1) & ~3;
+
+ srcpitch += (w+3) >> 2;
+ dstpitch += ((w+3) & ~3) * 2;
+
+ do {
+ int wt = w;
+
+ uint8 v = src[0] >> (((-wt) & 3)*2);
+
+ switch(wt & 3) {
+ do {
+ v = src[0];
+
+ case 0: dst[3] = pal[v&3]; v >>= 2;
+ case 3: dst[2] = pal[v&3]; v >>= 2;
+ case 2: dst[1] = pal[v&3]; v >>= 2;
+ case 1: dst[0] = pal[v&3]; v >>= 2;
+
+ dst -= 4;
+ --src;
+ } while((wt -= 4) > 0);
+ }
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_PALETTED(Pal2, Any24) {
+ const uint8 *src = (const uint8 *)src0;
+ uint8 *dst = (uint8 *)dst0;
+ const uint8 *pal = (const uint8 *)pal0;
+
+ src += (w-1) >> 2;
+ dst += ((w-1) & ~3) * 3;
+
+ srcpitch += (w+3) >> 2;
+ dstpitch += ((w+3) & ~3) * 3;
+
+ do {
+ int wt = w;
+
+ uint8 v = src[0] >> (((-wt) & 3)*2);
+ const uint8 *pe;
+
+ switch(wt & 3) {
+ do {
+ v = src[0];
+
+ case 0: pe = &pal[3*(v&3)]; dst[3*3+0] = pe[0]; dst[3*3+1] = pe[1]; dst[3*3+2] = pe[2]; v >>= 2;
+ case 3: pe = &pal[3*(v&3)]; dst[2*3+0] = pe[0]; dst[2*3+1] = pe[1]; dst[2*3+2] = pe[2]; v >>= 2;
+ case 2: pe = &pal[3*(v&3)]; dst[1*3+0] = pe[0]; dst[1*3+1] = pe[1]; dst[1*3+2] = pe[2]; v >>= 2;
+ case 1: pe = &pal[3*(v&3)]; dst[0*3+0] = pe[0]; dst[0*3+1] = pe[1]; dst[0*3+2] = pe[2]; v >>= 2;
+
+ dst -= 12;
+ --src;
+ } while((wt -= 4) > 0);
+ }
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_PALETTED(Pal2, Any32) {
+ const uint8 *src = (const uint8 *)src0;
+ uint32 *dst = (uint32 *)dst0;
+ const uint32 *pal = (const uint32 *)pal0;
+
+ src += (w-1) >> 2;
+ dst += (w-1) & ~3;
+
+ srcpitch += (w+3) >> 2;
+ dstpitch += ((w+3) & ~3) * 4;
+
+ do {
+ int wt = w;
+
+ uint8 v = src[0] >> (((-wt) & 3)*2);
+
+ switch(wt & 3) {
+ do {
+ v = src[0];
+
+ case 0: dst[3] = pal[v&3]; v >>= 2;
+ case 3: dst[2] = pal[v&3]; v >>= 2;
+ case 2: dst[1] = pal[v&3]; v >>= 2;
+ case 1: dst[0] = pal[v&3]; v >>= 2;
+
+ dst -= 4;
+ --src;
+ } while((wt -= 4) > 0);
+ }
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+///////////////////////////////////////////////////////////////////////////
+//
+// RGB blitters: Pal4 ->
+//
+///////////////////////////////////////////////////////////////////////////
+
+DECLARE_PALETTED(Pal4, Any8) {
+ const uint8 *src = (const uint8 *)src0;
+ uint8 *dst = (uint8 *)dst0;
+ const uint8 *pal = (const uint8 *)pal0;
+
+ src += (w-1) >> 1;
+ dst += ((w-1) & ~1);
+
+ srcpitch += (w+1) >> 1;
+ dstpitch += (w+1) & ~1;
+
+ do {
+ int wt = w;
+
+ uint8 v = src[0] >> (((-wt) & 1)*4);
+
+ switch(wt & 1) {
+ do {
+ v = src[0];
+
+ case 0: dst[1] = pal[v&15]; v >>= 4;
+ case 1: dst[0] = pal[v&15]; v >>= 4;
+
+ dst -= 2;
+ --src;
+ } while((wt -= 2) > 0);
+ }
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_PALETTED(Pal4, Any16) {
+ const uint8 *src = (const uint8 *)src0;
+ uint16 *dst = (uint16 *)dst0;
+ const uint16 *pal = (const uint16 *)pal0;
+
+ src += (w-1) >> 1;
+ dst += ((w-1) & ~1);
+
+ srcpitch += (w+1) >> 1;
+ dstpitch += ((w+1) & ~1) * 2;
+
+ do {
+ int wt = w;
+
+ uint8 v = src[0] >> (((-wt) & 1)*4);
+
+ switch(wt & 1) {
+ do {
+ v = src[0];
+
+ case 0: dst[1] = pal[v&15]; v >>= 4;
+ case 1: dst[0] = pal[v&15]; v >>= 4;
+
+ dst -= 2;
+ --src;
+ } while((wt -= 2) > 0);
+ }
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_PALETTED(Pal4, Any24) {
+ const uint8 *src = (const uint8 *)src0;
+ uint8 *dst = (uint8 *)dst0;
+ const uint8 *pal = (const uint8 *)pal0;
+
+ src += (w-1) >> 1;
+ dst += ((w-1) & ~1) * 3;
+
+ srcpitch += (w+1) >> 1;
+ dstpitch += ((w+1) & ~1) * 3;
+
+ do {
+ int wt = w;
+
+ uint8 v = src[0] >> (((-wt) & 1)*4);
+ const uint8 *pe;
+
+ switch(wt & 1) {
+ do {
+ v = src[0];
+
+ case 0: pe = &pal[3*(v&15)]; dst[1*3+0] = pe[0]; dst[1*3+1] = pe[1]; dst[1*3+2] = pe[2]; v >>= 4;
+ case 1: pe = &pal[3*(v&15)]; dst[0*3+0] = pe[0]; dst[0*3+1] = pe[1]; dst[0*3+2] = pe[2]; v >>= 4;
+
+ dst -= 6;
+ --src;
+ } while((wt -= 2) > 0);
+ }
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_PALETTED(Pal4, Any32) {
+ const uint8 *src = (const uint8 *)src0;
+ uint32 *dst = (uint32 *)dst0;
+ const uint32 *pal = (const uint32 *)pal0;
+
+ src += (w-1) >> 1;
+ dst += ((w-1) & ~1);
+
+ srcpitch += (w+1) >> 1;
+ dstpitch += ((w+1) & ~1) * 4;
+
+ do {
+ int wt = w;
+
+ uint8 v = src[0] >> (((-wt) & 1)*4);
+
+ switch(wt & 1) {
+ do {
+ v = src[0];
+
+ case 0: dst[1] = pal[v&15]; v >>= 4;
+ case 1: dst[0] = pal[v&15]; v >>= 4;
+
+ dst -= 2;
+ --src;
+ } while((wt -= 2) > 0);
+ }
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+///////////////////////////////////////////////////////////////////////////
+//
+// RGB blitters: Pal8 ->
+//
+///////////////////////////////////////////////////////////////////////////
+
+DECLARE_PALETTED(Pal8, Any8) {
+ const uint8 *src = (const uint8 *)src0;
+ uint8 *dst = (uint8 *)dst0;
+ const uint8 *pal = (const uint8 *)pal0;
+
+ srcpitch -= w;
+ dstpitch -= w;
+
+ do {
+ int wt = w;
+
+ do {
+ *dst++ = pal[*src++];
+ } while(--wt);
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_PALETTED(Pal8, Any16) {
+ const uint8 *src = (const uint8 *)src0;
+ uint16 *dst = (uint16 *)dst0;
+ const uint16 *pal = (const uint16 *)pal0;
+
+ srcpitch -= w;
+ dstpitch -= w*2;
+
+ do {
+ int wt = w;
+
+ do {
+ *dst++ = pal[*src++];
+ } while(--wt);
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_PALETTED(Pal8, Any24) {
+ const uint8 *src = (const uint8 *)src0;
+ uint8 *dst = (uint8 *)dst0;
+ const uint8 *pal = (const uint8 *)pal0;
+
+ srcpitch -= w;
+ dstpitch -= w*3;
+
+ do {
+ int wt = w;
+ do {
+ const uint8 *pe = &pal[3**src++];
+
+ dst[0] = pe[0];
+ dst[1] = pe[1];
+ dst[2] = pe[2];
+ dst += 3;
+ } while(--wt);
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_PALETTED(Pal8, Any32) {
+ const uint8 *src = (const uint8 *)src0;
+ uint32 *dst = (uint32 *)dst0;
+ const uint32 *pal = (const uint32 *)pal0;
+
+ srcpitch -= w;
+ dstpitch -= w*4;
+
+ do {
+ int wt = w;
+
+ do {
+ *dst++ = pal[*src++];
+ } while(--wt);
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/blt_reference_rgb.cpp b/src/thirdparty/VirtualDub/Kasumi/source/blt_reference_rgb.cpp
new file mode 100644
index 000000000..ea49f260d
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/blt_reference_rgb.cpp
@@ -0,0 +1,310 @@
+#include <vd2/system/vdtypes.h>
+#include <vd2/Kasumi/pixmap.h>
+#include <vd2/Kasumi/pixmaputils.h>
+
+#define DECLARE_RGB(x, y) void VDPixmapBlt_##x##_to_##y##_reference(void *dst0, ptrdiff_t dstpitch, const void *src0, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h)
+
+///////////////////////////////////////////////////////////////////////////
+//
+// RGB blitters: -> XRGB1555
+//
+///////////////////////////////////////////////////////////////////////////
+
+DECLARE_RGB(RGB565, XRGB1555) {
+ const uint16 *src = (const uint16 *)src0;
+ uint16 *dst = (uint16 *)dst0;
+
+ srcpitch -= 2*w;
+ dstpitch -= 2*w;
+
+ do {
+ int wt = w;
+
+ do {
+ const uint32 px = *src++;
+ *dst++ = (px&0x001f) + ((px&0xffc0)>>1);
+ } while(--wt);
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_RGB(RGB888, XRGB1555) {
+ const uint8 *src = (const uint8 *)src0;
+ uint16 *dst = (uint16 *)dst0;
+
+ srcpitch -= 3*w;
+ dstpitch -= 2*w;
+
+ do {
+ int wt = w;
+
+ do {
+ const uint32 r = ((uint32)src[2] & 0xf8) << 7;
+ const uint32 g = ((uint32)src[1] & 0xf8) << 2;
+ const uint32 b = (uint32)src[0] >> 3;
+ src += 3;
+
+ *dst++ = (uint16)(r + g + b);
+ } while(--wt);
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_RGB(XRGB8888, XRGB1555) {
+ const uint8 *src = (const uint8 *)src0;
+ uint16 *dst = (uint16 *)dst0;
+
+ srcpitch -= 4*w;
+ dstpitch -= 2*w;
+
+ do {
+ int wt = w;
+
+ do {
+ const uint32 r = ((uint32)src[2] & 0xf8) << 7;
+ const uint32 g = ((uint32)src[1] & 0xf8) << 2;
+ const uint32 b = (uint32)src[0] >> 3;
+ src += 4;
+
+ *dst++ = (uint16)(r + g + b);
+ } while(--wt);
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+///////////////////////////////////////////////////////////////////////////
+//
+// RGB blitters: -> RGB565
+//
+///////////////////////////////////////////////////////////////////////////
+
+DECLARE_RGB(XRGB1555, RGB565) {
+ const uint16 *src = (const uint16 *)src0;
+ uint16 *dst = (uint16 *)dst0;
+
+ srcpitch -= 2*w;
+ dstpitch -= 2*w;
+
+ do {
+ int wt = w;
+
+ do {
+ const uint32 px = *src++;
+ *dst++ = (uint16)(px + (px&0xffe0) + ((px&0x0200)>>4));
+ } while(--wt);
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_RGB(RGB888, RGB565) {
+ const uint8 *src = (const uint8 *)src0;
+ uint16 *dst = (uint16 *)dst0;
+
+ srcpitch -= 3*w;
+ dstpitch -= 2*w;
+
+ do {
+ int wt = w;
+
+ do {
+ const uint32 r = ((uint32)src[2] & 0xf8) << 8;
+ const uint32 g = ((uint32)src[1] & 0xfc) << 3;
+ const uint32 b = (uint32)src[0] >> 3;
+ src += 3;
+
+ *dst++ = (uint16)(r + g + b);
+ } while(--wt);
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_RGB(XRGB8888, RGB565) {
+ const uint8 *src = (const uint8 *)src0;
+ uint16 *dst = (uint16 *)dst0;
+
+ srcpitch -= 4*w;
+ dstpitch -= 2*w;
+
+ do {
+ int wt = w;
+
+ do {
+ const uint32 r = ((uint32)src[2] & 0xf8) << 8;
+ const uint32 g = ((uint32)src[1] & 0xfc) << 3;
+ const uint32 b = (uint32)src[0] >> 3;
+ src += 4;
+
+ *dst++ = (uint16)(r + g + b);
+ } while(--wt);
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+///////////////////////////////////////////////////////////////////////////
+//
+// RGB blitters: -> RGB888
+//
+///////////////////////////////////////////////////////////////////////////
+
+DECLARE_RGB(XRGB1555, RGB888) {
+ const uint16 *src = (const uint16 *)src0;
+ uint8 *dst = (uint8 *)dst0;
+
+ srcpitch -= 2*w;
+ dstpitch -= 3*w;
+
+ do {
+ int wt = w;
+
+ do {
+ const uint32 px = *src++;
+ uint32 rb = px & 0x7c1f;
+ uint32 g = px & 0x03e0;
+
+ rb += rb<<5;
+ g += g<<5;
+
+ dst[0] = (uint8)(rb>>2);
+ dst[1] = (uint8)(g>>7);
+ dst[2] = (uint8)(rb>>12);
+ dst += 3;
+ } while(--wt);
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_RGB(RGB565, RGB888) {
+ const uint16 *src = (const uint16 *)src0;
+ uint8 *dst = (uint8 *)dst0;
+
+ srcpitch -= 2*w;
+ dstpitch -= 3*w;
+
+ do {
+ int wt = w;
+
+ do {
+ const uint32 px = *src++;
+ uint32 rb = px & 0xf81f;
+ uint32 g = px & 0x07e0;
+
+ rb += rb<<5;
+ g += g<<6;
+
+ dst[0] = (uint8)(rb>>2);
+ dst[1] = (uint8)(g>>9);
+ dst[2] = (uint8)(rb>>13);
+ dst += 3;
+ } while(--wt);
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_RGB(XRGB8888, RGB888) {
+ const uint8 *src = (const uint8 *)src0;
+ uint8 *dst = (uint8 *)dst0;
+
+ srcpitch -= 4*w;
+ dstpitch -= 3*w;
+
+ do {
+ int wt = w;
+
+ do {
+ dst[0] = src[0];
+ dst[1] = src[1];
+ dst[2] = src[2];
+ dst += 3;
+ src += 4;
+ } while(--wt);
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+///////////////////////////////////////////////////////////////////////////
+//
+// RGB blitters: -> XRGB8888
+//
+///////////////////////////////////////////////////////////////////////////
+
+DECLARE_RGB(XRGB1555, XRGB8888) {
+ const uint16 *src = (const uint16 *)src0;
+ uint32 *dst = (uint32 *)dst0;
+
+ srcpitch -= 2*w;
+ dstpitch -= 4*w;
+
+ do {
+ int wt = w;
+
+ do {
+ const uint32 px = *src++;
+ const uint32 rgb = ((px & 0x7c00) << 9) + ((px & 0x03e0) << 6) + ((px & 0x001f) << 3);
+
+ *dst++ = rgb + ((rgb & 0xe0e0e0)>>5);
+ } while(--wt);
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_RGB(RGB565, XRGB8888) {
+ const uint16 *src = (const uint16 *)src0;
+ uint32 *dst = (uint32 *)dst0;
+
+ srcpitch -= 2*w;
+ dstpitch -= 4*w;
+
+ do {
+ int wt = w;
+
+ do {
+ const uint32 px = *src++;
+ const uint32 rb = ((px & 0xf800) << 8) + ((px & 0x001f) << 3);
+ const uint32 g = ((px & 0x07e0) << 5) + (px & 0x0300);
+
+ *dst++ = rb + ((rb & 0xe000e0)>>5) + g;
+ } while(--wt);
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_RGB(RGB888, XRGB8888) {
+ const uint8 *src = (const uint8 *)src0;
+ uint32 *dst = (uint32 *)dst0;
+
+ srcpitch -= 3*w;
+ dstpitch -= 4*w;
+
+ do {
+ int wt = w;
+
+ do {
+ *dst++ = (uint32)src[0] + ((uint32)src[1]<<8) + ((uint32)src[2]<<16);
+ src += 3;
+ } while(--wt);
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/blt_reference_yuv.cpp b/src/thirdparty/VirtualDub/Kasumi/source/blt_reference_yuv.cpp
new file mode 100644
index 000000000..6f40eeaa0
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/blt_reference_yuv.cpp
@@ -0,0 +1,1590 @@
+#include <vd2/system/vdtypes.h>
+#include <vd2/system/vdstl.h>
+#include <vd2/system/cpuaccel.h>
+#include <vd2/system/memory.h>
+#include <vd2/Kasumi/pixmap.h>
+#include <vd2/Kasumi/pixmaputils.h>
+
+#include "blt_spanutils.h"
+
+#ifdef _M_IX86
+ #include "blt_spanutils_x86.h"
+#endif
+
+using namespace nsVDPixmapSpanUtils;
+
+namespace {
+ struct YCbCrToRGB {
+ sint16 y_tab[256];
+ sint16 r_cr_tab[256];
+ sint16 b_cb_tab[256];
+ sint16 g_cr_tab[256];
+ sint16 g_cb_tab[256];
+ uint8 cliptab[277+256+279];
+ uint16 cliptab15[277+256+279];
+ uint16 cliptab16[277+256+279];
+
+ YCbCrToRGB() {
+ int i;
+
+ memset(cliptab, 0, 277);
+ memset(cliptab+277+256, 255, 279);
+
+ memset(cliptab15, 0, sizeof cliptab15[0] * 277);
+ memset(cliptab16, 0, sizeof cliptab16[0] * 277);
+ memset(cliptab15+277+256, 0xff, sizeof cliptab15[0] * 279);
+ memset(cliptab16+277+256, 0xff, sizeof cliptab16[0] * 279);
+
+ for(i=0; i<256; ++i) {
+ y_tab[i] = (sint16)(((i-16) * 76309 + 32768) >> 16);
+ r_cr_tab[i] = (sint16)(((i-128) * 104597 + 32768) >> 16);
+ b_cb_tab[i] = (sint16)(((i-128) * 132201 + 32768) >> 16);
+ g_cr_tab[i] = (sint16)(((i-128) * -53279 + 32768) >> 16);
+ g_cb_tab[i] = (sint16)(((i-128) * -25674 + 32768) >> 16);
+ cliptab[i+277] = (uint8)i;
+ cliptab15[i+277] = 0x421 * ((unsigned)i>>3);
+ cliptab16[i+277] = 0x801 * ((unsigned)i>>3) + 0x20 * ((unsigned)i>>2);
+ }
+ }
+ } colorconv;
+
+ struct YCbCrFormatInfo {
+ ptrdiff_t ystep;
+ ptrdiff_t cstep;
+ ptrdiff_t yinc[4];
+ ptrdiff_t cinc[4];
+ sint8 ypos[4];
+ sint8 cbpos[4];
+ sint8 crpos[4];
+ };
+
+ YCbCrFormatInfo g_formatInfo_YUV444_Planar = { -4, -4, {-1,-1,-1,-1}, {-1,-1,-1,-1}, {0,1,2,3}, {0,1,2,3}, {0,1,2,3}};
+ YCbCrFormatInfo g_formatInfo_YUV422_YUYV = { -8, -8, {-1,-1,-1,-1}, {-1,-1,-1,-1}, {0,2,4,6}, {1,1,5,5}, {3,3,7,7}};
+ YCbCrFormatInfo g_formatInfo_YUV422_UYVY = { -8, -8, {-1,-1,-1,-1}, {-1,-1,-1,-1}, {1,3,5,7}, {0,0,4,4}, {2,2,6,6}};
+ YCbCrFormatInfo g_formatInfo_YUV420_YV12 = { -4, -2, {-1,-1,-1,-1}, { 0,-1, 0,-1}, {0,1,2,3}, {0,0,1,1}, {0,0,1,1}};
+ YCbCrFormatInfo g_formatInfo_YUV411_YV12 = { -4, -1, {-1,-1,-1,-1}, {-1,-1,-1,-1}, {0,1,2,3}, {0,0,0,0}, {0,0,0,0}};
+
+ inline uint16 ycbcr_to_1555(uint8 y, uint8 cb0, uint8 cr0) {
+ const uint16 *p = &colorconv.cliptab15[277 + colorconv.y_tab[y]];
+ uint32 r = 0x7c00 & p[colorconv.r_cr_tab[cr0]];
+ uint32 g = 0x03e0 & p[colorconv.g_cr_tab[cr0] + colorconv.g_cb_tab[cb0]];
+ uint32 b = 0x001f & p[colorconv.b_cb_tab[cb0]];
+
+ return r + g + b;
+ }
+
+ inline uint16 ycbcr_to_565(uint8 y, uint8 cb0, uint8 cr0) {
+ const uint16 *p = &colorconv.cliptab16[277 + colorconv.y_tab[y]];
+ uint32 r = 0xf800 & p[colorconv.r_cr_tab[cr0]];
+ uint32 g = 0x07e0 & p[colorconv.g_cr_tab[cr0] + colorconv.g_cb_tab[cb0]];
+ uint32 b = 0x001f & p[colorconv.b_cb_tab[cb0]];
+
+ return r + g + b;
+ }
+
+ inline void ycbcr_to_888(uint8 *dst, uint8 y, uint8 cb0, uint8 cr0) {
+ const uint8 *p = &colorconv.cliptab[277 + colorconv.y_tab[y]];
+ uint8 r = p[colorconv.r_cr_tab[cr0]];
+ uint8 g = p[colorconv.g_cr_tab[cr0] + colorconv.g_cb_tab[cb0]];
+ uint8 b = p[colorconv.b_cb_tab[cb0]];
+
+ dst[0] = b;
+ dst[1] = g;
+ dst[2] = r;
+ }
+
+ inline uint32 ycbcr_to_8888(uint8 y, uint8 cb0, uint8 cr0) {
+ const uint8 *p = &colorconv.cliptab[277 + colorconv.y_tab[y]];
+ uint8 r = p[colorconv.r_cr_tab[cr0]];
+ uint8 g = p[colorconv.g_cr_tab[cr0] + colorconv.g_cb_tab[cb0]];
+ uint8 b = p[colorconv.b_cb_tab[cb0]];
+
+ return (r << 16) + (g << 8) + b;
+ }
+
+ void VDYCbCrToXRGB1555Span(void *dst0, const uint8 *y, const uint8 *cb, const uint8 *cr, uint32 w) {
+ uint16 *dst = (uint16 *)dst0;
+
+ do {
+ *dst++ = ycbcr_to_1555(*y++, *cb++, *cr++);
+ } while(--w);
+ }
+
+ void VDYCbCrToRGB565Span(void *dst0, const uint8 *y, const uint8 *cb, const uint8 *cr, uint32 w) {
+ uint16 *dst = (uint16 *)dst0;
+
+ do {
+ *dst++ = ycbcr_to_565(*y++, *cb++, *cr++);
+ } while(--w);
+ }
+
+ void VDYCbCrToRGB888Span(void *dst0, const uint8 *y, const uint8 *cb, const uint8 *cr, uint32 w) {
+ uint8 *dst = (uint8 *)dst0;
+
+ do {
+ ycbcr_to_888(dst, *y++, *cb++, *cr++);
+ dst += 3;
+ } while(--w);
+ }
+
+ void VDYCbCrToXRGB8888Span(void *dst0, const uint8 *y, const uint8 *cb, const uint8 *cr, uint32 w) {
+ uint32 *dst = (uint32 *)dst0;
+
+ do {
+ *dst++ = ycbcr_to_8888(*y++, *cb++, *cr++);
+ } while(--w);
+ }
+
+ void VDYCbCrToUYVYSpan(void *dst0, const uint8 *y, const uint8 *cb, const uint8 *cr, uint32 w) {
+ uint32 *dst = (uint32 *)dst0;
+
+ if (--w) {
+ do {
+ *dst++ = (uint32)*cb++ + ((uint32)y[0] << 8) + ((uint32)*cr++ << 16) + ((uint32)y[1] << 24);
+ y += 2;
+ } while((sint32)(w-=2)>0);
+ }
+
+ if (!(w & 1))
+ *dst++ = (uint32)*cb + ((uint32)y[0] << 8) + ((uint32)*cr << 16) + ((uint32)y[0] << 24);
+ }
+
+ void VDYCbCrToYUYVSpan(void *dst0, const uint8 *y, const uint8 *cb, const uint8 *cr, uint32 w) {
+ uint32 *dst = (uint32 *)dst0;
+
+ if (--w) {
+ do {
+ *dst++ = (uint32)y[0] + ((uint32)*cb++ << 8) + ((uint32)y[1] << 16) + ((uint32)*cr++ << 24);
+ y += 2;
+ } while((sint32)(w-=2)>0);
+ }
+
+ if (!(w & 1))
+ *dst++ = (uint32)y[0] + ((uint32)*cb << 8) + ((uint32)y[0] << 16) + ((uint32)*cr << 24);
+ }
+
+ void VDYCbCrToRGB1555Generic(void *dst, ptrdiff_t dststride, const void *yrow, ptrdiff_t ystride, const void *cbrow, ptrdiff_t cbstride, const void *crrow, ptrdiff_t crstride, int w, int h, const YCbCrFormatInfo& formatinfo) {
+ const ptrdiff_t ystep = formatinfo.ystep;
+ const ptrdiff_t cstep = formatinfo.cstep;
+ const ptrdiff_t ypos0 = formatinfo.ypos[0];
+ const ptrdiff_t ypos1 = formatinfo.ypos[1];
+ const ptrdiff_t ypos2 = formatinfo.ypos[2];
+ const ptrdiff_t ypos3 = formatinfo.ypos[3];
+ const ptrdiff_t crpos0 = formatinfo.crpos[0];
+ const ptrdiff_t crpos1 = formatinfo.crpos[1];
+ const ptrdiff_t crpos2 = formatinfo.crpos[2];
+ const ptrdiff_t crpos3 = formatinfo.crpos[3];
+ const ptrdiff_t cbpos0 = formatinfo.cbpos[0];
+ const ptrdiff_t cbpos1 = formatinfo.cbpos[1];
+ const ptrdiff_t cbpos2 = formatinfo.cbpos[2];
+ const ptrdiff_t cbpos3 = formatinfo.cbpos[3];
+
+ yrow = (char *)yrow - ystep * ((w-1) >> 2);
+ crrow = (char *)crrow - cstep * ((w-1) >> 2);
+ cbrow = (char *)cbrow - cstep * ((w-1) >> 2);
+ dst = (char *)dst + 2*((w-1) & ~3);
+
+ int y = 0;
+ do {
+ const uint8 *ysrc = (const uint8 *)yrow;
+ const uint8 *crsrc = (const uint8 *)crrow;
+ const uint8 *cbsrc = (const uint8 *)cbrow;
+ uint16 *out = (uint16 *)dst;
+ int w2 = -w;
+
+ switch(w2 & 3) {
+ do {
+ case 0: out[3] = ycbcr_to_1555(ysrc[ypos3], cbsrc[cbpos3], crsrc[crpos3]);
+ case 1: out[2] = ycbcr_to_1555(ysrc[ypos2], cbsrc[cbpos2], crsrc[crpos2]);
+ case 2: out[1] = ycbcr_to_1555(ysrc[ypos1], cbsrc[cbpos1], crsrc[crpos1]);
+ case 3: out[0] = ycbcr_to_1555(ysrc[ypos0], cbsrc[cbpos0], crsrc[crpos0]);
+ out -= 4;
+ ysrc += ystep;
+ crsrc += cstep;
+ cbsrc += cstep;
+ } while((w2 += 4) < 0);
+ }
+
+ dst = (char *)dst + dststride;
+ yrow = (const char *)yrow + (ystride & formatinfo.yinc[y & 3]);
+ cbrow = (const char *)cbrow + (cbstride & formatinfo.cinc[y & 3]);
+ crrow = (const char *)crrow + (crstride & formatinfo.cinc[y & 3]);
+ } while(++y < h);
+ }
+
+ void VDYCbCrToRGB565Generic(void *dst, ptrdiff_t dststride, const void *yrow, ptrdiff_t ystride, const void *cbrow, ptrdiff_t cbstride, const void *crrow, ptrdiff_t crstride, int w, int h, const YCbCrFormatInfo& formatinfo) {
+ const ptrdiff_t ystep = formatinfo.ystep;
+ const ptrdiff_t cstep = formatinfo.cstep;
+ const ptrdiff_t ypos0 = formatinfo.ypos[0];
+ const ptrdiff_t ypos1 = formatinfo.ypos[1];
+ const ptrdiff_t ypos2 = formatinfo.ypos[2];
+ const ptrdiff_t ypos3 = formatinfo.ypos[3];
+ const ptrdiff_t crpos0 = formatinfo.crpos[0];
+ const ptrdiff_t crpos1 = formatinfo.crpos[1];
+ const ptrdiff_t crpos2 = formatinfo.crpos[2];
+ const ptrdiff_t crpos3 = formatinfo.crpos[3];
+ const ptrdiff_t cbpos0 = formatinfo.cbpos[0];
+ const ptrdiff_t cbpos1 = formatinfo.cbpos[1];
+ const ptrdiff_t cbpos2 = formatinfo.cbpos[2];
+ const ptrdiff_t cbpos3 = formatinfo.cbpos[3];
+
+ yrow = (char *)yrow - ystep * ((w-1) >> 2);
+ crrow = (char *)crrow - cstep * ((w-1) >> 2);
+ cbrow = (char *)cbrow - cstep * ((w-1) >> 2);
+ dst = (char *)dst + 2*((w-1) & ~3);
+
+ int y = 0;
+ do {
+ const uint8 *ysrc = (const uint8 *)yrow;
+ const uint8 *crsrc = (const uint8 *)crrow;
+ const uint8 *cbsrc = (const uint8 *)cbrow;
+ uint16 *out = (uint16 *)dst;
+ int w2 = -w;
+
+ switch(w2 & 3) {
+ do {
+ case 0: out[3] = ycbcr_to_565(ysrc[ypos3], cbsrc[cbpos3], crsrc[crpos3]);
+ case 1: out[2] = ycbcr_to_565(ysrc[ypos2], cbsrc[cbpos2], crsrc[crpos2]);
+ case 2: out[1] = ycbcr_to_565(ysrc[ypos1], cbsrc[cbpos1], crsrc[crpos1]);
+ case 3: out[0] = ycbcr_to_565(ysrc[ypos0], cbsrc[cbpos0], crsrc[crpos0]);
+ out -= 4;
+ ysrc += ystep;
+ crsrc += cstep;
+ cbsrc += cstep;
+ } while((w2 += 4) < 0);
+ }
+
+ dst = (char *)dst + dststride;
+ yrow = (const char *)yrow + (ystride & formatinfo.yinc[y & 3]);
+ cbrow = (const char *)cbrow + (cbstride & formatinfo.cinc[y & 3]);
+ crrow = (const char *)crrow + (crstride & formatinfo.cinc[y & 3]);
+ } while(++y < h);
+ }
+
+ void VDYCbCrToRGB888Generic(void *dst, ptrdiff_t dststride, const void *yrow, ptrdiff_t ystride, const void *cbrow, ptrdiff_t cbstride, const void *crrow, ptrdiff_t crstride, int w, int h, const YCbCrFormatInfo& formatinfo) {
+ const ptrdiff_t ystep = formatinfo.ystep;
+ const ptrdiff_t cstep = formatinfo.cstep;
+ const ptrdiff_t ypos0 = formatinfo.ypos[0];
+ const ptrdiff_t ypos1 = formatinfo.ypos[1];
+ const ptrdiff_t ypos2 = formatinfo.ypos[2];
+ const ptrdiff_t ypos3 = formatinfo.ypos[3];
+ const ptrdiff_t crpos0 = formatinfo.crpos[0];
+ const ptrdiff_t crpos1 = formatinfo.crpos[1];
+ const ptrdiff_t crpos2 = formatinfo.crpos[2];
+ const ptrdiff_t crpos3 = formatinfo.crpos[3];
+ const ptrdiff_t cbpos0 = formatinfo.cbpos[0];
+ const ptrdiff_t cbpos1 = formatinfo.cbpos[1];
+ const ptrdiff_t cbpos2 = formatinfo.cbpos[2];
+ const ptrdiff_t cbpos3 = formatinfo.cbpos[3];
+
+ yrow = (char *)yrow - ystep * ((w-1) >> 2);
+ crrow = (char *)crrow - cstep * ((w-1) >> 2);
+ cbrow = (char *)cbrow - cstep * ((w-1) >> 2);
+ dst = (char *)dst + 3*((w-1) & ~3);
+
+ int y = 0;
+ do {
+ const uint8 *ysrc = (const uint8 *)yrow;
+ const uint8 *crsrc = (const uint8 *)crrow;
+ const uint8 *cbsrc = (const uint8 *)cbrow;
+ uint8 *out = (uint8 *)dst;
+ int w2 = -w;
+
+ switch(w2 & 3) {
+ do {
+ case 0: ycbcr_to_888(out+9, ysrc[ypos3], cbsrc[cbpos3], crsrc[crpos3]);
+ case 1: ycbcr_to_888(out+6, ysrc[ypos2], cbsrc[cbpos2], crsrc[crpos2]);
+ case 2: ycbcr_to_888(out+3, ysrc[ypos1], cbsrc[cbpos1], crsrc[crpos1]);
+ case 3: ycbcr_to_888(out, ysrc[ypos0], cbsrc[cbpos0], crsrc[crpos0]);
+ out -= 12;
+ ysrc += ystep;
+ crsrc += cstep;
+ cbsrc += cstep;
+ } while((w2 += 4) < 0);
+ }
+
+ dst = (char *)dst + dststride;
+ yrow = (const char *)yrow + (ystride & formatinfo.yinc[y & 3]);
+ cbrow = (const char *)cbrow + (cbstride & formatinfo.cinc[y & 3]);
+ crrow = (const char *)crrow + (crstride & formatinfo.cinc[y & 3]);
+ } while(++y < h);
+ }
+
+ void VDYCbCrToRGB8888Generic(void *dst, ptrdiff_t dststride, const void *yrow, ptrdiff_t ystride, const void *cbrow, ptrdiff_t cbstride, const void *crrow, ptrdiff_t crstride, int w, int h, const YCbCrFormatInfo& formatinfo) {
+ const ptrdiff_t ystep = formatinfo.ystep;
+ const ptrdiff_t cstep = formatinfo.cstep;
+ const ptrdiff_t ypos0 = formatinfo.ypos[0];
+ const ptrdiff_t ypos1 = formatinfo.ypos[1];
+ const ptrdiff_t ypos2 = formatinfo.ypos[2];
+ const ptrdiff_t ypos3 = formatinfo.ypos[3];
+ const ptrdiff_t crpos0 = formatinfo.crpos[0];
+ const ptrdiff_t crpos1 = formatinfo.crpos[1];
+ const ptrdiff_t crpos2 = formatinfo.crpos[2];
+ const ptrdiff_t crpos3 = formatinfo.crpos[3];
+ const ptrdiff_t cbpos0 = formatinfo.cbpos[0];
+ const ptrdiff_t cbpos1 = formatinfo.cbpos[1];
+ const ptrdiff_t cbpos2 = formatinfo.cbpos[2];
+ const ptrdiff_t cbpos3 = formatinfo.cbpos[3];
+
+ yrow = (char *)yrow - ystep * ((w-1) >> 2);
+ crrow = (char *)crrow - cstep * ((w-1) >> 2);
+ cbrow = (char *)cbrow - cstep * ((w-1) >> 2);
+ dst = (char *)dst + 4*((w-1) & ~3);
+
+ int y = 0;
+ do {
+ const uint8 *ysrc = (const uint8 *)yrow;
+ const uint8 *crsrc = (const uint8 *)crrow;
+ const uint8 *cbsrc = (const uint8 *)cbrow;
+ uint32 *out = (uint32 *)dst;
+ int w2 = -w;
+
+ switch(w2 & 3) {
+ do {
+ case 0: out[3] = ycbcr_to_8888(ysrc[ypos3], cbsrc[cbpos3], crsrc[crpos3]);
+ case 1: out[2] = ycbcr_to_8888(ysrc[ypos2], cbsrc[cbpos2], crsrc[crpos2]);
+ case 2: out[1] = ycbcr_to_8888(ysrc[ypos1], cbsrc[cbpos1], crsrc[crpos1]);
+ case 3: out[0] = ycbcr_to_8888(ysrc[ypos0], cbsrc[cbpos0], crsrc[crpos0]);
+ out -= 4;
+ ysrc += ystep;
+ crsrc += cstep;
+ cbsrc += cstep;
+ } while((w2 += 4) < 0);
+ }
+
+ dst = (char *)dst + dststride;
+ yrow = (const char *)yrow + (ystride & formatinfo.yinc[y & 3]);
+ cbrow = (const char *)cbrow + (cbstride & formatinfo.cinc[y & 3]);
+ crrow = (const char *)crrow + (crstride & formatinfo.cinc[y & 3]);
+ } while(++y < h);
+ }
+}
+
+#define DECLARE_YUV(x, y) void VDPixmapBlt_##x##_to_##y##_reference(void *dst0, ptrdiff_t dstpitch, const void *src0, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h)
+
+DECLARE_YUV(UYVY, XRGB1555) {
+ do {
+ const uint8 *src = (const uint8 *)src0;
+ uint16 *dst = (uint16 *)dst0;
+
+ // convert first pixel
+ int cb, cr;
+ int rc0, gc0, bc0, rc1, gc1, bc1;
+ const uint16 *y;
+
+ cb = src[0];
+ cr = src[2];
+ rc1 = colorconv.r_cr_tab[cr];
+ gc1 = colorconv.g_cr_tab[cr] + colorconv.g_cb_tab[cb];
+ bc1 = colorconv.b_cb_tab[cb];
+
+ y = &colorconv.cliptab15[277 + colorconv.y_tab[src[1]]];
+ *dst++ = (y[rc1] & 0x7c00) + (y[gc1] & 0x3e0) + (y[bc1] & 0x001f);
+
+ // convert pairs of pixels
+ int w2 = w;
+
+ if ((w2 -= 2) > 0) {
+ do {
+ rc0 = rc1;
+ gc0 = gc1;
+ bc0 = bc1;
+
+ cb = src[4];
+ cr = src[6];
+ rc1 = colorconv.r_cr_tab[cr];
+ gc1 = colorconv.g_cr_tab[cr] + colorconv.g_cb_tab[cb];
+ bc1 = colorconv.b_cb_tab[cb];
+
+ y = &colorconv.cliptab15[277 + colorconv.y_tab[src[3]]];
+ dst[0] = (y[(rc0+rc1+1)>>1] & 0x7c00) + (y[(gc0+gc1+1)>>1] & 0x3e0) + (y[(bc0+bc1+1)>>1] & 0x001f);
+
+ y = &colorconv.cliptab15[277 + colorconv.y_tab[src[5]]];
+ dst[1] = (y[rc1] & 0x7c00) + (y[gc1] & 0x3e0) + (y[bc1] & 0x001f);
+
+ dst += 2;
+ src += 4;
+ } while((w2 -= 2) > 0);
+ }
+
+ // handle oddballs
+ if (!(w2 & 1)) {
+ y = &colorconv.cliptab15[277 + colorconv.y_tab[src[3]]];
+ *dst = (y[rc1] & 0x7c00) + (y[gc1] & 0x3e0) + (y[bc1] & 0x001f);
+ }
+
+ vdptrstep(src0, srcpitch);
+ vdptrstep(dst0, dstpitch);
+ } while(--h);
+}
+
+DECLARE_YUV(UYVY, RGB565) {
+ do {
+ const uint8 *src = (const uint8 *)src0;
+ uint16 *dst = (uint16 *)dst0;
+
+ // convert first pixel
+ int cb, cr;
+ int rc0, gc0, bc0, rc1, gc1, bc1;
+ const uint16 *y;
+
+ cb = src[0];
+ cr = src[2];
+ rc1 = colorconv.r_cr_tab[cr];
+ gc1 = colorconv.g_cr_tab[cr] + colorconv.g_cb_tab[cb];
+ bc1 = colorconv.b_cb_tab[cb];
+
+ y = &colorconv.cliptab16[277 + colorconv.y_tab[src[1]]];
+ *dst++ = (y[rc1] & 0xf800) + (y[gc1] & 0x7e0) + (y[bc1] & 0x001f);
+
+ // convert pairs of pixels
+ int w2 = w;
+
+ if ((w2 -= 2) > 0) {
+ do {
+ rc0 = rc1;
+ gc0 = gc1;
+ bc0 = bc1;
+
+ cb = src[4];
+ cr = src[6];
+ rc1 = colorconv.r_cr_tab[cr];
+ gc1 = colorconv.g_cr_tab[cr] + colorconv.g_cb_tab[cb];
+ bc1 = colorconv.b_cb_tab[cb];
+
+ y = &colorconv.cliptab16[277 + colorconv.y_tab[src[3]]];
+ dst[0] = (y[(rc0+rc1+1)>>1] & 0xf800) + (y[(gc0+gc1+1)>>1] & 0x7e0) + (y[(bc0+bc1+1)>>1] & 0x001f);
+
+ y = &colorconv.cliptab16[277 + colorconv.y_tab[src[5]]];
+ dst[1] = (y[rc1] & 0xf800) + (y[gc1] & 0x7e0) + (y[bc1] & 0x001f);
+
+ dst += 2;
+ src += 4;
+ } while((w2 -= 2) > 0);
+ }
+
+ // handle oddballs
+ if (!(w2 & 1)) {
+ y = &colorconv.cliptab16[277 + colorconv.y_tab[src[3]]];
+ *dst = (y[rc1] & 0xf800) + (y[gc1] & 0x7e0) + (y[bc1] & 0x001f);
+ }
+
+ vdptrstep(src0, srcpitch);
+ vdptrstep(dst0, dstpitch);
+ } while(--h);
+}
+
+DECLARE_YUV(UYVY, RGB888) {
+ do {
+ const uint8 *src = (const uint8 *)src0;
+ uint8 *dst = (uint8 *)dst0;
+
+ // convert first pixel
+ int cb, cr;
+ int rc0, gc0, bc0, rc1, gc1, bc1;
+ const uint8 *y;
+
+ cb = src[0];
+ cr = src[2];
+ rc1 = colorconv.r_cr_tab[cr];
+ gc1 = colorconv.g_cr_tab[cr] + colorconv.g_cb_tab[cb];
+ bc1 = colorconv.b_cb_tab[cb];
+
+ y = &colorconv.cliptab[277 + colorconv.y_tab[src[1]]];
+ dst[0] = y[bc1];
+ dst[1] = y[gc1];
+ dst[2] = y[rc1];
+ dst += 3;
+
+ // convert pairs of pixels
+ int w2 = w;
+
+ if ((w2 -= 2) > 0) {
+ do {
+ rc0 = rc1;
+ gc0 = gc1;
+ bc0 = bc1;
+
+ cb = src[4];
+ cr = src[6];
+ rc1 = colorconv.r_cr_tab[cr];
+ gc1 = colorconv.g_cr_tab[cr] + colorconv.g_cb_tab[cb];
+ bc1 = colorconv.b_cb_tab[cb];
+
+ y = &colorconv.cliptab[277 + colorconv.y_tab[src[3]]];
+ dst[0] = y[(bc0+bc1+1)>>1];
+ dst[1] = y[(gc0+gc1+1)>>1];
+ dst[2] = y[(rc0+rc1+1)>>1];
+
+ y = &colorconv.cliptab[277 + colorconv.y_tab[src[5]]];
+ dst[3] = y[bc1];
+ dst[4] = y[gc1];
+ dst[5] = y[rc1];
+
+ dst += 6;
+ src += 4;
+ } while((w2 -= 2) > 0);
+ }
+
+ // handle oddballs
+ if (!(w2 & 1)) {
+ y = &colorconv.cliptab[277 + colorconv.y_tab[src[3]]];
+ dst[0] = y[bc1];
+ dst[1] = y[gc1];
+ dst[2] = y[rc1];
+ }
+
+ vdptrstep(src0, srcpitch);
+ vdptrstep(dst0, dstpitch);
+ } while(--h);
+}
+
+DECLARE_YUV(UYVY, XRGB8888) {
+ do {
+ const uint8 *src = (const uint8 *)src0;
+ uint8 *dst = (uint8 *)dst0;
+
+ // convert first pixel
+ int cb, cr;
+ int rc0, gc0, bc0, rc1, gc1, bc1;
+ const uint8 *y;
+
+ cb = src[0];
+ cr = src[2];
+ rc1 = colorconv.r_cr_tab[cr];
+ gc1 = colorconv.g_cr_tab[cr] + colorconv.g_cb_tab[cb];
+ bc1 = colorconv.b_cb_tab[cb];
+
+ y = &colorconv.cliptab[277 + colorconv.y_tab[src[1]]];
+ dst[0] = y[bc1];
+ dst[1] = y[gc1];
+ dst[2] = y[rc1];
+ dst += 4;
+
+ // convert pairs of pixels
+ int w2 = w;
+
+ if ((w2 -= 2) > 0) {
+ do {
+ rc0 = rc1;
+ gc0 = gc1;
+ bc0 = bc1;
+
+ cb = src[4];
+ cr = src[6];
+ rc1 = colorconv.r_cr_tab[cr];
+ gc1 = colorconv.g_cr_tab[cr] + colorconv.g_cb_tab[cb];
+ bc1 = colorconv.b_cb_tab[cb];
+
+ y = &colorconv.cliptab[277 + colorconv.y_tab[src[3]]];
+ dst[0] = y[(bc0+bc1+1)>>1];
+ dst[1] = y[(gc0+gc1+1)>>1];
+ dst[2] = y[(rc0+rc1+1)>>1];
+
+ y = &colorconv.cliptab[277 + colorconv.y_tab[src[5]]];
+ dst[4] = y[bc1];
+ dst[5] = y[gc1];
+ dst[6] = y[rc1];
+
+ dst += 8;
+ src += 4;
+ } while((w2 -= 2) > 0);
+ }
+
+ // handle oddballs
+ if (!(w2 & 1)) {
+ y = &colorconv.cliptab[277 + colorconv.y_tab[src[3]]];
+ dst[0] = y[bc1];
+ dst[1] = y[gc1];
+ dst[2] = y[rc1];
+ }
+
+ vdptrstep(src0, srcpitch);
+ vdptrstep(dst0, dstpitch);
+ } while(--h);
+}
+
+DECLARE_YUV(YUYV, XRGB1555) {
+ do {
+ const uint8 *src = (const uint8 *)src0;
+ uint16 *dst = (uint16 *)dst0;
+
+ // convert first pixel
+ int cb, cr;
+ int rc0, gc0, bc0, rc1, gc1, bc1;
+ const uint16 *y;
+
+ cb = src[1];
+ cr = src[3];
+ rc1 = colorconv.r_cr_tab[cr];
+ gc1 = colorconv.g_cr_tab[cr] + colorconv.g_cb_tab[cb];
+ bc1 = colorconv.b_cb_tab[cb];
+
+ y = &colorconv.cliptab15[277 + colorconv.y_tab[src[0]]];
+ *dst++ = (y[rc1] & 0x7c00) + (y[gc1] & 0x3e0) + (y[bc1] & 0x001f);
+
+ // convert pairs of pixels
+ int w2 = w;
+
+ if ((w2 -= 2) > 0) {
+ do {
+ rc0 = rc1;
+ gc0 = gc1;
+ bc0 = bc1;
+
+ cb = src[5];
+ cr = src[7];
+ rc1 = colorconv.r_cr_tab[cr];
+ gc1 = colorconv.g_cr_tab[cr] + colorconv.g_cb_tab[cb];
+ bc1 = colorconv.b_cb_tab[cb];
+
+ y = &colorconv.cliptab15[277 + colorconv.y_tab[src[2]]];
+ dst[0] = (y[(rc0+rc1+1)>>1] & 0x7c00) + (y[(gc0+gc1+1)>>1] & 0x3e0) + (y[(bc0+bc1+1)>>1] & 0x001f);
+
+ y = &colorconv.cliptab15[277 + colorconv.y_tab[src[4]]];
+ dst[1] = (y[rc1] & 0x7c00) + (y[gc1] & 0x3e0) + (y[bc1] & 0x001f);
+
+ dst += 2;
+ src += 4;
+ } while((w2 -= 2) > 0);
+ }
+
+ // handle oddballs
+ if (!(w2 & 1)) {
+ y = &colorconv.cliptab15[277 + colorconv.y_tab[src[2]]];
+ *dst = (y[rc1] & 0x7c00) + (y[gc1] & 0x3e0) + (y[bc1] & 0x001f);
+ }
+
+ vdptrstep(src0, srcpitch);
+ vdptrstep(dst0, dstpitch);
+ } while(--h);
+}
+
+DECLARE_YUV(YUYV, RGB565) {
+	// Convert a YUYV (YUY2) 4:2:2 scanline image to 16-bit RGB565.
+	// Identical structure to the XRGB1555 variant; only the clip table
+	// (cliptab16) and the channel masks differ.
+	do {
+		const uint8 *src = (const uint8 *)src0;
+		uint16 *dst = (uint16 *)dst0;
+
+		// convert first pixel (co-sited with the first chroma sample)
+		int cb, cr;
+		int rc0, gc0, bc0, rc1, gc1, bc1;
+		const uint16 *y;
+
+		cb = src[1];
+		cr = src[3];
+		rc1 = colorconv.r_cr_tab[cr];
+		gc1 = colorconv.g_cr_tab[cr] + colorconv.g_cb_tab[cb];
+		bc1 = colorconv.b_cb_tab[cb];
+
+		y = &colorconv.cliptab16[277 + colorconv.y_tab[src[0]]];
+		*dst++ = (y[rc1] & 0xf800) + (y[gc1] & 0x7e0) + (y[bc1] & 0x001f);
+
+		// convert pairs of pixels
+		int w2 = w;
+
+		if ((w2 -= 2) > 0) {
+			do {
+				rc0 = rc1;
+				gc0 = gc1;
+				bc0 = bc1;
+
+				cb = src[5];
+				cr = src[7];
+				rc1 = colorconv.r_cr_tab[cr];
+				gc1 = colorconv.g_cr_tab[cr] + colorconv.g_cb_tab[cb];
+				bc1 = colorconv.b_cb_tab[cb];
+
+				// odd pixel: interpolated chroma; even pixel: co-sited chroma
+				y = &colorconv.cliptab16[277 + colorconv.y_tab[src[2]]];
+				dst[0] = (y[(rc0+rc1+1)>>1] & 0xf800) + (y[(gc0+gc1+1)>>1] & 0x7e0) + (y[(bc0+bc1+1)>>1] & 0x001f);
+
+				y = &colorconv.cliptab16[277 + colorconv.y_tab[src[4]]];
+				dst[1] = (y[rc1] & 0xf800) + (y[gc1] & 0x7e0) + (y[bc1] & 0x001f);
+
+				dst += 2;
+				src += 4;
+			} while((w2 -= 2) > 0);
+		}
+
+		// handle oddballs (trailing pixel for even widths; see 1555 variant)
+		if (!(w2 & 1)) {
+			y = &colorconv.cliptab16[277 + colorconv.y_tab[src[2]]];
+			*dst = (y[rc1] & 0xf800) + (y[gc1] & 0x7e0) + (y[bc1] & 0x001f);
+		}
+
+		vdptrstep(src0, srcpitch);
+		vdptrstep(dst0, dstpitch);
+	} while(--h);
+}
+
+DECLARE_YUV(YUYV, RGB888) {
+	// Convert a YUYV (YUY2) 4:2:2 scanline image to 24-bit BGR (RGB888).
+	// Output byte order is B, G, R (3 bytes/pixel); chroma for odd pixels
+	// is averaged from the neighboring chroma pairs.
+	do {
+		const uint8 *src = (const uint8 *)src0;
+		uint8 *dst = (uint8 *)dst0;
+
+		// convert first pixel
+		int cb, cr;
+		int rc0, gc0, bc0, rc1, gc1, bc1;
+		const uint8 *y;
+
+		cb = src[1];
+		cr = src[3];
+		rc1 = colorconv.r_cr_tab[cr];
+		gc1 = colorconv.g_cr_tab[cr] + colorconv.g_cb_tab[cb];
+		bc1 = colorconv.b_cb_tab[cb];
+
+		y = &colorconv.cliptab[277 + colorconv.y_tab[src[0]]];
+		dst[0] = y[bc1];
+		dst[1] = y[gc1];
+		dst[2] = y[rc1];
+		dst += 3;
+
+		// convert pairs of pixels
+		int w2 = w;
+
+		if ((w2 -= 2) > 0) {
+			do {
+				rc0 = rc1;
+				gc0 = gc1;
+				bc0 = bc1;
+
+				cb = src[5];
+				cr = src[7];
+				rc1 = colorconv.r_cr_tab[cr];
+				gc1 = colorconv.g_cr_tab[cr] + colorconv.g_cb_tab[cb];
+				bc1 = colorconv.b_cb_tab[cb];
+
+				// odd pixel: interpolated chroma
+				y = &colorconv.cliptab[277 + colorconv.y_tab[src[2]]];
+				dst[0] = y[(bc0+bc1+1)>>1];
+				dst[1] = y[(gc0+gc1+1)>>1];
+				dst[2] = y[(rc0+rc1+1)>>1];
+
+				// even pixel: co-sited chroma
+				y = &colorconv.cliptab[277 + colorconv.y_tab[src[4]]];
+				dst[3] = y[bc1];
+				dst[4] = y[gc1];
+				dst[5] = y[rc1];
+
+				dst += 6;
+				src += 4;
+			} while((w2 -= 2) > 0);
+		}
+
+		// handle oddballs (trailing pixel exists only for even widths)
+		if (!(w2 & 1)) {
+			y = &colorconv.cliptab[277 + colorconv.y_tab[src[2]]];
+			dst[0] = y[bc1];
+			dst[1] = y[gc1];
+			dst[2] = y[rc1];
+		}
+
+		vdptrstep(src0, srcpitch);
+		vdptrstep(dst0, dstpitch);
+	} while(--h);
+}
+
+DECLARE_YUV(YUYV, XRGB8888) {
+	// Convert a YUYV (YUY2) 4:2:2 scanline image to 32-bit XRGB8888.
+	// Same scheme as the RGB888 variant, but 4 bytes/pixel; the X (alpha)
+	// byte at offset 3 is left unwritten.
+	do {
+		const uint8 *src = (const uint8 *)src0;
+		uint8 *dst = (uint8 *)dst0;
+
+		// convert first pixel
+		int cb, cr;
+		int rc0, gc0, bc0, rc1, gc1, bc1;
+		const uint8 *y;
+
+		cb = src[1];
+		cr = src[3];
+		rc1 = colorconv.r_cr_tab[cr];
+		gc1 = colorconv.g_cr_tab[cr] + colorconv.g_cb_tab[cb];
+		bc1 = colorconv.b_cb_tab[cb];
+
+		y = &colorconv.cliptab[277 + colorconv.y_tab[src[0]]];
+		dst[0] = y[bc1];
+		dst[1] = y[gc1];
+		dst[2] = y[rc1];
+		dst += 4;
+
+		// convert pairs of pixels
+		int w2 = w;
+
+		if ((w2 -= 2) > 0) {
+			do {
+				rc0 = rc1;
+				gc0 = gc1;
+				bc0 = bc1;
+
+				cb = src[5];
+				cr = src[7];
+				rc1 = colorconv.r_cr_tab[cr];
+				gc1 = colorconv.g_cr_tab[cr] + colorconv.g_cb_tab[cb];
+				bc1 = colorconv.b_cb_tab[cb];
+
+				// odd pixel: interpolated chroma
+				y = &colorconv.cliptab[277 + colorconv.y_tab[src[2]]];
+				dst[0] = y[(bc0+bc1+1)>>1];
+				dst[1] = y[(gc0+gc1+1)>>1];
+				dst[2] = y[(rc0+rc1+1)>>1];
+
+				// even pixel: co-sited chroma (second 4-byte slot)
+				y = &colorconv.cliptab[277 + colorconv.y_tab[src[4]]];
+				dst[4] = y[bc1];
+				dst[5] = y[gc1];
+				dst[6] = y[rc1];
+
+				dst += 8;
+				src += 4;
+			} while((w2 -= 2) > 0);
+		}
+
+		// handle oddballs (trailing pixel exists only for even widths)
+		if (!(w2 & 1)) {
+			y = &colorconv.cliptab[277 + colorconv.y_tab[src[2]]];
+			dst[0] = y[bc1];
+			dst[1] = y[gc1];
+			dst[2] = y[rc1];
+		}
+
+		vdptrstep(src0, srcpitch);
+		vdptrstep(dst0, dstpitch);
+	} while(--h);
+}
+
+DECLARE_YUV(Y8, XRGB1555) {
+	// Expand a grayscale (Y-only) plane to 15-bit XRGB1555: each output
+	// pixel is the scaled, clipped luma replicated into R/G/B via cliptab15.
+	const uint8 *srcrow = (const uint8 *)src0;
+	uint16 *dstrow = (uint16 *)dst0;
+
+	do {
+		const uint8 *s = srcrow;
+		uint16 *d = dstrow;
+
+		for(vdpixsize x = w; x; --x)
+			*d++ = colorconv.cliptab15[colorconv.y_tab[*s++] + 277];
+
+		vdptrstep(srcrow, srcpitch);
+		vdptrstep(dstrow, dstpitch);
+	} while(--h);
+}
+
+DECLARE_YUV(Y8, RGB565) {
+	// Expand a grayscale (Y-only) plane to 16-bit RGB565: each output
+	// pixel is the scaled, clipped luma replicated into R/G/B via cliptab16.
+	const uint8 *srcrow = (const uint8 *)src0;
+	uint16 *dstrow = (uint16 *)dst0;
+
+	do {
+		const uint8 *s = srcrow;
+		uint16 *d = dstrow;
+
+		for(vdpixsize x = w; x; --x)
+			*d++ = colorconv.cliptab16[colorconv.y_tab[*s++] + 277];
+
+		vdptrstep(srcrow, srcpitch);
+		vdptrstep(dstrow, dstpitch);
+	} while(--h);
+}
+
+DECLARE_YUV(Y8, RGB888) {
+	// Expand a grayscale (Y-only) plane to 24-bit RGB888: the converted
+	// luma value is written to all three channel bytes of each pixel.
+	const uint8 *srcrow = (const uint8 *)src0;
+	uint8 *dstrow = (uint8 *)dst0;
+
+	do {
+		uint8 *d = dstrow;
+
+		for(vdpixsize x = 0; x < w; ++x) {
+			const uint8 v = colorconv.cliptab[colorconv.y_tab[srcrow[x]] + 277];
+			d[0] = v;
+			d[1] = v;
+			d[2] = v;
+			d += 3;
+		}
+
+		vdptrstep(srcrow, srcpitch);
+		vdptrstep(dstrow, dstpitch);
+	} while(--h);
+}
+
+DECLARE_YUV(Y8, XRGB8888) {
+	// Expand a grayscale (Y-only) plane to 32-bit XRGB8888: multiplying the
+	// converted luma by 0x010101 replicates it into the B, G and R bytes.
+	const uint8 *srcrow = (const uint8 *)src0;
+	uint32 *dstrow = (uint32 *)dst0;
+
+	do {
+		for(vdpixsize x = 0; x < w; ++x)
+			dstrow[x] = colorconv.cliptab[colorconv.y_tab[srcrow[x]] + 277] * 0x010101;
+
+		vdptrstep(srcrow, srcpitch);
+		vdptrstep(dstrow, dstpitch);
+	} while(--h);
+}
+
+#define DECLARE_YUV_PLANAR(x, y) void VDPixmapBlt_##x##_to_##y##_reference(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h)
+
+
+namespace {
+ typedef void (*tpYUVPlanarFinalDecoder)(void *, const uint8 *, const uint8 *, const uint8 *, uint32);
+ typedef void (*tpYUVPlanarHorizDecoder)(uint8 *dst, const uint8 *src, sint32 w);
+ typedef void (*tpYUVPlanarVertDecoder)(uint8 *dst, const uint8 *const *srcs, sint32 w, uint8 phase);
+}
+
+#ifdef _M_IX86
+ extern "C" void __cdecl vdasm_pixblt_YUV444Planar_to_XRGB1555_scan_MMX(void *dst, const uint8 *y, const uint8 *cb, const uint8 *cr, uint32 count);
+ extern "C" void __cdecl vdasm_pixblt_YUV444Planar_to_RGB565_scan_MMX(void *dst, const uint8 *y, const uint8 *cb, const uint8 *cr, uint32 count);
+ extern "C" void __cdecl vdasm_pixblt_YUV444Planar_to_XRGB8888_scan_MMX(void *dst, const uint8 *y, const uint8 *cb, const uint8 *cr, uint32 count);
+#endif
+
+
+void VDPixmapBlt_YUVPlanar_decode_reference(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h) {
+	// Generic planar-YCbCr -> packed-RGB/422 decoder.  Per output scanline,
+	// the chroma planes are optionally upsampled vertically (vfunc) and
+	// resampled horizontally (hfunc) into scratch buffers, then a per-format
+	// span converter (dfunc) emits the final pixels.
+	const VDPixmapFormatInfo& srcinfo = VDPixmapGetInfo(src.format);
+	int hbits = srcinfo.auxwbits;
+	int vbits = srcinfo.auxhbits;
+
+	// packed 4:2:2 sources behave like half-width chroma planes
+	if (src.format == nsVDPixmap::kPixFormat_YUV422_UYVY || src.format == nsVDPixmap::kPixFormat_YUV422_YUYV)
+		hbits = 1;
+
+	bool h_coaligned = true;
+	bool v_coaligned = false;
+
+	if (src.format == nsVDPixmap::kPixFormat_YUV422_Planar_Centered ||
+		src.format == nsVDPixmap::kPixFormat_YUV420_Planar_Centered) {
+		h_coaligned = false;
+	}
+
+	tpYUVPlanarVertDecoder vfunc = NULL;
+	tpYUVPlanarHorizDecoder hfunc = NULL;
+	uint32 horiz_buffer_size = 0;
+	uint32 vert_buffer_size = 0;
+	uint32 horiz_count = 0;
+	// yaccum/yinc: fixed-point vertical phase in 1/8ths of a chroma row;
+	// reaching 8 advances the chroma row window (cb0/cb1, cr0/cr1).
+	sint32 yaccum = 8;
+	sint32 yinc = 8;
+	uint32 yleft = h;
+
+	switch(vbits*2+v_coaligned) {
+	case 0:		// 4:4:4, 4:2:2
+	case 1:
+		break;
+	case 2:		// 4:2:0 (centered)
+		vfunc = vert_expand2x_centered;
+		// NOTE(review): chroma_srcwidth below is ceil(w>>1); w>>1 may be one
+		// byte short for odd widths -- TODO confirm against table rounding
+		vert_buffer_size = w>>1;
+		yaccum = 6;
+		yinc = 4;
+		yleft >>= 1;
+		break;
+	case 4:		// 4:1:0 (centered)
+		vfunc = vert_expand4x_centered;
+		vert_buffer_size = w>>2;
+		yaccum = 5;
+		yinc = 2;
+		yleft >>= 2;
+		break;
+	default:
+		VDNEVERHERE;
+		return;
+	}
+
+	--yleft;
+
+	tpYUVPlanarFinalDecoder dfunc = NULL;
+
+#ifdef _M_IX86
+	uint32 cpuflags = CPUGetEnabledExtensions();
+
+	if (cpuflags & CPUF_SUPPORTS_MMX) {
+		if (cpuflags & CPUF_SUPPORTS_INTEGER_SSE) {
+			if (vfunc == vert_expand2x_centered)
+				vfunc = vert_expand2x_centered_ISSE;
+		}
+
+		// MMX fast paths for the RGB targets
+		switch(dst.format) {
+		case nsVDPixmap::kPixFormat_XRGB1555:	dfunc = vdasm_pixblt_YUV444Planar_to_XRGB1555_scan_MMX;	break;
+		case nsVDPixmap::kPixFormat_RGB565:		dfunc = vdasm_pixblt_YUV444Planar_to_RGB565_scan_MMX;	break;
+		case nsVDPixmap::kPixFormat_XRGB8888:	dfunc = vdasm_pixblt_YUV444Planar_to_XRGB8888_scan_MMX;	break;
+		}
+	}
+#endif
+
+	// for packed 4:2:2 targets the chroma only needs to reach half width
+	bool halfchroma = false;
+
+	if (!dfunc) {
+		switch(dst.format) {
+		case nsVDPixmap::kPixFormat_XRGB1555:		dfunc = VDYCbCrToXRGB1555Span;	break;
+		case nsVDPixmap::kPixFormat_RGB565:			dfunc = VDYCbCrToRGB565Span;	break;
+		case nsVDPixmap::kPixFormat_RGB888:			dfunc = VDYCbCrToRGB888Span;	break;
+		case nsVDPixmap::kPixFormat_XRGB8888:		dfunc = VDYCbCrToXRGB8888Span;	break;
+		case nsVDPixmap::kPixFormat_YUV422_UYVY:	dfunc = VDYCbCrToUYVYSpan;		halfchroma = true;	break;
+		case nsVDPixmap::kPixFormat_YUV422_YUYV:	dfunc = VDYCbCrToYUYVSpan;		halfchroma = true;	break;
+		default:
+			VDNEVERHERE;
+			return;
+		}
+	}
+
+	// select the horizontal chroma resampler from source subsampling,
+	// alignment, and whether the target wants half-width chroma
+	switch(hbits*2+h_coaligned) {
+	case 0:		// 4:4:4
+	case 1:
+		if (halfchroma) {
+			hfunc = horiz_compress2x_coaligned;
+			horiz_buffer_size = (w + 1) >> 1;
+			horiz_count = w;
+		}
+		break;
+	case 2:		// 4:2:0 MPEG-1 (centered)
+		if (halfchroma) {
+			hfunc = horiz_realign_to_coaligned;
+			horiz_buffer_size = (w + 1) >> 1;
+			horiz_count = (w + 1) >> 1;
+		} else {
+			hfunc = horiz_expand2x_centered;
+			horiz_buffer_size = w;
+			horiz_count = w;
+		}
+		break;
+	case 3:		// 4:2:0/4:2:2 MPEG-2 (coaligned)
+		if (!halfchroma) {
+			hfunc = horiz_expand2x_coaligned;
+			horiz_buffer_size = w;
+			horiz_count = w;
+		}
+		break;
+	case 5:		// 4:1:1 (coaligned)
+		if (halfchroma) {
+			hfunc = horiz_expand2x_coaligned;
+			horiz_buffer_size = (w + 1) >> 1;
+			horiz_count = (w + 1) >> 1;
+		} else {
+			hfunc = horiz_expand4x_coaligned;
+			horiz_buffer_size = w;
+			horiz_count = w;
+		}
+		break;
+
+	default:
+		VDNEVERHERE;
+		return;
+	}
+
+#ifdef _M_IX86
+	if (cpuflags & CPUF_SUPPORTS_INTEGER_SSE) {
+		if (hfunc == horiz_expand2x_coaligned)
+			hfunc = horiz_expand2x_coaligned_ISSE;
+	}
+#endif
+
+	uint32 chroma_srcwidth = -(-w >> srcinfo.auxwbits);	// ceil division
+	horiz_buffer_size = (horiz_buffer_size + 15) & ~15;	// pad to 16 bytes
+	vert_buffer_size = (vert_buffer_size + 15) & ~15;
+
+	// allocate buffers: [cr horiz][cr vert][cb horiz][cb vert]
+
+	vdblock<uint8> tempbuf((horiz_buffer_size + vert_buffer_size)*2 + 1);
+
+	uint8 *const crbufh = tempbuf.data();
+	uint8 *const crbufv = crbufh + horiz_buffer_size;
+	uint8 *const cbbufh = crbufv + vert_buffer_size;
+	uint8 *const cbbufv = cbbufh + horiz_buffer_size;
+
+	// cb0/cb1 (and cr0/cr1) bracket the current vertical interpolation window
+	const uint8 *cb0 = (const uint8*)src.data2;
+	const uint8 *cr0 = (const uint8*)src.data3;
+	const uint8 *cb1 = cb0;
+	const uint8 *cr1 = cr0;
+	const uint8 *y = (const uint8 *)src.data;
+	const ptrdiff_t ypitch = src.pitch;
+	const ptrdiff_t cbpitch = src.pitch2;
+	const ptrdiff_t crpitch = src.pitch3;
+
+	void *out = dst.data;
+	ptrdiff_t outpitch = dst.pitch;
+
+	for(;;) {
+		// advance the chroma row window when a full chroma row is crossed;
+		// the last row is clamped (cb1/cr1 stop advancing when yleft hits 0)
+		if (yaccum >= 8) {
+			yaccum &= 7;
+
+			cb0 = cb1;
+			cr0 = cr1;
+
+			if (yleft > 0) {
+				--yleft;
+				vdptrstep(cb1, cbpitch);
+				vdptrstep(cr1, crpitch);
+			}
+		}
+
+		const uint8 *cr = cr0;
+		const uint8 *cb = cb0;
+
+		// vertical interpolation: cr (phase scaled from 1/8ths to 0..255)
+		if(yaccum & 7) {
+			const uint8 *const srcs[2]={cr0, cr1};
+			vfunc(crbufv, srcs, chroma_srcwidth, (yaccum & 7) << 5);
+			cr = crbufv;
+		}
+
+		// horizontal interpolation: cr
+		if (hfunc) {
+			hfunc(crbufh, cr, horiz_count);
+			cr = crbufh;
+		}
+
+		// vertical interpolation: cb
+		if(yaccum & 7) {
+			const uint8 *const srcs[2]={cb0, cb1};
+			vfunc(cbbufv, srcs, chroma_srcwidth, (yaccum & 7) << 5);
+			cb = cbbufv;
+		}
+
+		// horizontal interpolation: cb
+		if (hfunc) {
+			hfunc(cbbufh, cb, horiz_count);
+			cb = cbbufh;
+		}
+
+		dfunc(out, y, cb, cr, w);
+		vdptrstep(out, outpitch);
+		vdptrstep(y, ypitch);
+
+		if (!--h)
+			break;
+
+		yaccum += yinc;
+	}
+
+#ifdef _M_IX86
+	// leave the FPU usable again after any MMX span routines
+	if (cpuflags & CPUF_SUPPORTS_MMX) {
+		__asm emms
+	}
+#endif
+}
+
+namespace {
+	typedef void (*tpUVBltHorizDecoder)(uint8 *dst, const uint8 *src, sint32 w);
+	typedef void (*tpUVBltVertDecoder)(uint8 *dst, const uint8 *const *srcs, sint32 w, uint8 phase);
+
+	// Rescale one chroma plane between two subsampling factors.  xshift and
+	// yshift are the log2 ratios between source and destination subsampling;
+	// a sliding window of up to 8 filtered source rows feeds the vertical
+	// resampler.  Row positions are tracked in 8.8 fixed point (winposnext).
+	void uvplaneblt(uint8 *dst, ptrdiff_t dstpitch, int dstformat, const uint8 *src, ptrdiff_t srcpitch, int srcformat, vdpixsize w, vdpixsize h) {
+		const VDPixmapFormatInfo& srcinfo = VDPixmapGetInfo(srcformat);
+		const VDPixmapFormatInfo& dstinfo = VDPixmapGetInfo(dstformat);
+
+		int xshift = srcinfo.auxwbits - dstinfo.auxwbits;
+		int yshift = srcinfo.auxhbits - dstinfo.auxhbits;
+
+		tpUVBltHorizDecoder		hfunc = NULL;
+		tpUVBltVertDecoder		vfunc = NULL;
+
+		// horizontal: expand or compress by the width ratio
+		switch(xshift) {
+		case +2:
+			hfunc = horiz_expand4x_coaligned;
+			break;
+		case +1:
+			hfunc = horiz_expand2x_coaligned;
+			break;
+		case 0:
+			break;
+		case -1:
+			hfunc = horiz_compress2x_coaligned;
+			break;
+		case -2:
+			hfunc = horiz_compress4x_coaligned;
+			break;
+		default:
+			VDNEVERHERE;
+			return;
+		}
+
+#ifdef _M_IX86
+		uint32 cpuflags = CPUGetEnabledExtensions();
+
+		if (cpuflags & CPUF_SUPPORTS_INTEGER_SSE) {
+			if (hfunc == horiz_expand2x_coaligned)
+				hfunc = horiz_expand2x_coaligned_ISSE;
+		}
+#endif
+
+		// winsize: source rows the vertical filter reads at once;
+		// winposnext/winstep: initial position and per-output-row advance (8.8)
+		int winsize, winposnext, winstep;
+
+		switch(yshift) {
+		case +2:
+			vfunc = vert_expand4x_centered;
+			winsize = 2;
+			winposnext = 0xa0;
+			winstep = 0x40;
+			break;
+		case +1:
+			vfunc = vert_expand2x_centered;
+			winsize = 2;
+			winposnext = 0xc0;
+			winstep = 0x80;
+			break;
+		case 0:
+			winsize = 1;
+			winposnext = 0;
+			winstep = 0x100;
+			break;
+		case -1:
+			vfunc = vert_compress2x_centered;
+			winsize = 4;
+			winposnext = 0x200;
+			winstep = 0x200;
+			break;
+		case -2:
+			vfunc = vert_compress4x_centered;
+			winsize = 8;
+			winposnext = 0x500;
+			winstep = 0x400;
+			break;
+		default:
+			VDNEVERHERE;
+			return;
+		}
+
+#ifdef _M_IX86
+		if (cpuflags & CPUF_SUPPORTS_INTEGER_SSE) {
+			if (vfunc == vert_expand2x_centered)
+				vfunc = vert_expand2x_centered_ISSE;
+		}
+#endif
+
+		// plane dimensions via ceil division
+		int dsth = -(-h >> dstinfo.auxhbits);
+		int srch = -(-h >> srcinfo.auxhbits);
+		int dstw = -(-w >> dstinfo.auxwbits);
+		int w2 = -(-w >> std::min<int>(dstinfo.auxwbits, srcinfo.auxwbits));
+
+		int winpos = (winposnext>>8) - winsize;
+
+		// ring buffer of row pointers; entries are mirrored at +winsize so a
+		// contiguous slice of winsize pointers can always be taken
+		const uint8 *window[16];
+
+		vdblock<uint8> tmpbuf;
+		ptrdiff_t tmppitch = (w+15) & ~15;
+
+		if (vfunc && hfunc)
+			tmpbuf.resize(tmppitch * winsize);
+
+		do {
+			int desiredpos = winposnext >> 8;
+
+			// pull source rows into the window, horizontally resampling each;
+			// rows are clamped to [0, srch-1] at the plane edges
+			while(winpos < desiredpos) {
+				const uint8 *srcrow = vdptroffset(src, srcpitch * std::max<int>(0, std::min<int>(srch-1, ++winpos)));
+				int winoffset = (winpos-1) & (winsize-1);
+
+				if (hfunc) {
+					uint8 *dstrow = vfunc ? tmpbuf.data() + tmppitch * winoffset : dst;
+					hfunc(dstrow, srcrow, w2);
+					srcrow = dstrow;
+				}
+
+				window[winoffset] = window[winoffset + winsize] = srcrow;
+			}
+
+			if (vfunc)
+				vfunc(dst, window + (winpos & (winsize-1)), dstw, winposnext & 255);
+			else if (!hfunc)
+				memcpy(dst, window[winpos & (winsize-1)], dstw);
+
+			winposnext += winstep;
+			vdptrstep(dst, dstpitch);
+		} while(--dsth);
+
+#ifdef _M_IX86
+		// flush MMX state possibly left by the ISSE helpers
+		if (cpuflags & CPUF_SUPPORTS_MMX) {
+			__asm emms
+		}
+#endif
+	}
+}
+
+void VDPixmapBlt_YUVPlanar_convert_reference(const VDPixmap& dstpm, const VDPixmap& srcpm, vdpixsize w, vdpixsize h) {
+	// Copy the luma plane unchanged, then rescale or synthesize chroma.
+	VDMemcpyRect(dstpm.data, dstpm.pitch, srcpm.data, srcpm.pitch, dstpm.w, dstpm.h);
+
+	const bool srcHasChroma = srcpm.format != nsVDPixmap::kPixFormat_Y8;
+	const bool dstHasChroma = dstpm.format != nsVDPixmap::kPixFormat_Y8;
+
+	if (!dstHasChroma)
+		return;		// grayscale target: source chroma (if any) is discarded
+
+	if (srcHasChroma) {
+		// YCbCr -> YCbCr: resample both chroma planes to the new subsampling
+		uvplaneblt((uint8 *)dstpm.data2, dstpm.pitch2, dstpm.format, (uint8 *)srcpm.data2, srcpm.pitch2, srcpm.format, w, h);
+		uvplaneblt((uint8 *)dstpm.data3, dstpm.pitch3, dstpm.format, (uint8 *)srcpm.data3, srcpm.pitch3, srcpm.format, w, h);
+	} else {
+		// grayscale source: fill chroma planes with the neutral value 0x80
+		const VDPixmapFormatInfo& info = VDPixmapGetInfo(dstpm.format);
+		const vdpixsize cw = -(-w >> info.auxwbits);
+		const vdpixsize ch = -(-h >> info.auxhbits);
+		VDMemset8Rect(dstpm.data2, dstpm.pitch2, 0x80, cw, ch);
+		VDMemset8Rect(dstpm.data3, dstpm.pitch3, 0x80, cw, ch);
+	}
+}
+
+extern "C" void vdasm_pixblt_YUV411Planar_to_XRGB1555_scan_MMX(void *dst, const void *y, const void *cb, const void *cr, unsigned count);
+extern "C" void vdasm_pixblt_YUV411Planar_to_RGB565_scan_MMX(void *dst, const void *y, const void *cb, const void *cr, unsigned count);
+extern "C" void vdasm_pixblt_YUV411Planar_to_XRGB8888_scan_MMX(void *dst, const void *y, const void *cb, const void *cr, unsigned count);
+extern "C" void vdasm_pixblt_YUV411Planar_to_XRGB1555_scan_ISSE(void *dst, const void *y, const void *cb, const void *cr, unsigned count);
+extern "C" void vdasm_pixblt_YUV411Planar_to_RGB565_scan_ISSE(void *dst, const void *y, const void *cb, const void *cr, unsigned count);
+extern "C" void vdasm_pixblt_YUV411Planar_to_XRGB8888_scan_ISSE(void *dst, const void *y, const void *cb, const void *cr, unsigned count);
+
+DECLARE_YUV_PLANAR(YUV411, XRGB1555) {
+	// 4:1:1 planar YCbCr -> 15-bit XRGB1555.  Each group of four luma
+	// samples shares one chroma pair; intermediate pixels use 1/4-step
+	// linear chroma interpolation.  Right-edge leftovers (1-4 pixels) reuse
+	// the last chroma sample unfiltered.
+	uint16 *out = (uint16 *)dst.data;
+	const ptrdiff_t opitch = dst.pitch;
+	const uint8 *yrow = (const uint8 *)src.data;
+	const uint8 *cbrow = (const uint8 *)src.data2;
+	const uint8 *crrow = (const uint8 *)src.data3;
+	const ptrdiff_t ypitch = src.pitch;
+	const ptrdiff_t cbpitch = src.pitch2;
+	const ptrdiff_t crpitch = src.pitch3;
+
+	vdpixsize wpairs = (w-1)>>2;	// full 4-pixel groups with a right neighbor
+	vdpixsize wleft = w - (wpairs<<2);
+
+	do {
+		uint16 *p = out;
+		const uint8 *y = yrow;
+		const uint8 *cb = cbrow;
+		const uint8 *cr = crrow;
+		vdpixsize wt;
+
+		if (wpairs > 0) {
+#ifdef _M_AMD64
+			wt = wpairs;
+
+			do {
+				const unsigned cb0 = cb[0];
+				const unsigned cb1 = cb[1];
+				const unsigned cr0 = cr[0];
+				const unsigned cr1 = cr[1];
+
+				// chroma weights 4:0, 3:1, 2:2, 1:3 across the group
+				p[0] = ycbcr_to_1555(y[0], cb0, cr0);
+				p[1] = ycbcr_to_1555(y[1], (3*cb0+cb1+2)>>2, (3*cr0+cr1+2)>>2);
+				p[2] = ycbcr_to_1555(y[2], (cb0+cb1+1)>>1, (cr0+cr1+1)>>1);
+				p[3] = ycbcr_to_1555(y[3], (cb0+3*cb1+2)>>2, (cr0+3*cr1+2)>>2);
+
+				y += 4;
+				p += 4;
+				++cb;
+				++cr;
+			} while(--wt);
+#else
+			// NOTE(review): calls the ISSE variant unconditionally --
+			// presumably the dispatcher guarantees ISSE here; confirm
+			vdasm_pixblt_YUV411Planar_to_XRGB1555_scan_ISSE(p, y, cb, cr, wpairs);
+			// skip past the pixels the assembly routine consumed
+			y += 4*wpairs;
+			cr += wpairs;
+			cb += wpairs;
+			p += 4*wpairs;
+#endif
+		}
+
+		if (wleft > 0) {
+			wt = wleft;
+
+			const uint8 cr0 = *cr;
+			const uint8 cb0 = *cb;
+
+			do {
+				*p++ = ycbcr_to_1555(*y++, cb0, cr0);
+			} while(--wt);
+		}
+
+		vdptrstep(out, opitch);
+		vdptrstep(yrow, ypitch);
+		vdptrstep(cbrow, cbpitch);
+		vdptrstep(crrow, crpitch);
+	} while(--h);
+
+#ifndef _M_AMD64
+	__asm emms
+#endif
+}
+
+DECLARE_YUV_PLANAR(YUV411, RGB565) {
+	// 4:1:1 planar YCbCr -> 16-bit RGB565.  Each group of four luma samples
+	// shares one chroma pair; intermediate pixels use 1/4-step linear chroma
+	// interpolation.  Right-edge leftovers (1-4 pixels) reuse the last
+	// chroma sample unfiltered.
+	uint16 *out = (uint16 *)dst.data;
+	const ptrdiff_t opitch = dst.pitch;
+	const uint8 *yrow = (const uint8 *)src.data;
+	const uint8 *cbrow = (const uint8 *)src.data2;
+	const uint8 *crrow = (const uint8 *)src.data3;
+	const ptrdiff_t ypitch = src.pitch;
+	const ptrdiff_t cbpitch = src.pitch2;
+	const ptrdiff_t crpitch = src.pitch3;
+
+	vdpixsize wpairs = (w-1)>>2;	// full 4-pixel groups with a right neighbor
+	vdpixsize wleft = w - (wpairs<<2);
+
+	do {
+		uint16 *p = out;
+		const uint8 *y = yrow;
+		const uint8 *cb = cbrow;
+		const uint8 *cr = crrow;
+		vdpixsize wt;
+
+		if (wpairs > 0) {
+#ifdef _M_AMD64
+			wt = wpairs;
+
+			do {
+				const unsigned cb0 = cb[0];
+				const unsigned cb1 = cb[1];
+				const unsigned cr0 = cr[0];
+				const unsigned cr1 = cr[1];
+
+				// chroma weights 4:0, 3:1, 2:2, 1:3 across the group
+				p[0] = ycbcr_to_565(y[0], cb0, cr0);
+				p[1] = ycbcr_to_565(y[1], (3*cb0+cb1+2)>>2, (3*cr0+cr1+2)>>2);
+				p[2] = ycbcr_to_565(y[2], (cb0+cb1+1)>>1, (cr0+cr1+1)>>1);
+				p[3] = ycbcr_to_565(y[3], (cb0+3*cb1+2)>>2, (cr0+3*cr1+2)>>2);
+
+				y += 4;
+				p += 4;
+				++cb;
+				++cr;
+			} while(--wt);
+#else
+			vdasm_pixblt_YUV411Planar_to_RGB565_scan_ISSE(p, y, cb, cr, wpairs);
+			// FIX: advance past the pixels the assembly routine consumed so
+			// the leftover-pixel loop below starts at the correct position
+			// (matches the XRGB1555/XRGB8888 variants, which already do this)
+			y += 4*wpairs;
+			cr += wpairs;
+			cb += wpairs;
+			p += 4*wpairs;
+#endif
+		}
+
+		if (wleft > 0) {
+			wt = wleft;
+
+			const uint8 cr0 = *cr;
+			const uint8 cb0 = *cb;
+
+			do {
+				*p++ = ycbcr_to_565(*y++, cb0, cr0);
+			} while(--wt);
+		}
+
+		vdptrstep(out, opitch);
+		vdptrstep(yrow, ypitch);
+		vdptrstep(cbrow, cbpitch);
+		vdptrstep(crrow, crpitch);
+	} while(--h);
+
+#ifndef _M_AMD64
+	__asm emms
+#endif
+}
+
+DECLARE_YUV_PLANAR(YUV411, RGB888) {
+	// 4:1:1 planar YCbCr -> 24-bit RGB888 (3 bytes per pixel).  Each group
+	// of four luma samples shares one chroma pair; intermediate pixels use
+	// 1/4-step linear chroma interpolation.  Right-edge leftovers reuse the
+	// last chroma sample unfiltered.
+	uint8 *out = (uint8 *)dst.data;
+	const ptrdiff_t opitch = dst.pitch;
+	const uint8 *yrow = (const uint8 *)src.data;
+	const uint8 *cbrow = (const uint8 *)src.data2;
+	const uint8 *crrow = (const uint8 *)src.data3;
+	const ptrdiff_t ypitch = src.pitch;
+	const ptrdiff_t cbpitch = src.pitch2;
+	const ptrdiff_t crpitch = src.pitch3;
+
+	vdpixsize wpairs = (w-1)>>2;	// full 4-pixel groups with a right neighbor
+	vdpixsize wleft = w - (wpairs<<2);
+
+	do {
+		uint8 *p = out;
+		const uint8 *y = yrow;
+		const uint8 *cb = cbrow;
+		const uint8 *cr = crrow;
+		vdpixsize wt;
+
+		if (wpairs > 0) {
+			wt = wpairs;
+
+			do {
+				const unsigned cb0 = cb[0];
+				const unsigned cb1 = cb[1];
+				const unsigned cr0 = cr[0];
+				const unsigned cr1 = cr[1];
+
+				// chroma weights 4:0, 3:1, 2:2, 1:3 across the group
+				ycbcr_to_888(p+0, y[0], cb0, cr0);
+				ycbcr_to_888(p+3, y[1], (3*cb0+cb1+2)>>2, (3*cr0+cr1+2)>>2);
+				ycbcr_to_888(p+6, y[2], (cb0+cb1+1)>>1, (cr0+cr1+1)>>1);
+				ycbcr_to_888(p+9, y[3], (cb0+3*cb1+2)>>2, (cr0+3*cr1+2)>>2);
+
+				y += 4;
+				p += 12;
+				++cb;
+				++cr;
+			} while(--wt);
+		}
+
+		if (wleft > 0) {
+			wt = wleft;
+
+			const uint8 cr0 = *cr;
+			const uint8 cb0 = *cb;
+
+			do {
+				ycbcr_to_888(p, *y++, cb0, cr0);
+				// FIX: RGB888 pixels are 3 bytes wide (the group loop above
+				// advances 12 bytes per 4 pixels); advancing by 4 here left
+				// gap bytes and overran the row for leftover pixels
+				p += 3;
+			} while(--wt);
+		}
+
+		vdptrstep(out, opitch);
+		vdptrstep(yrow, ypitch);
+		vdptrstep(cbrow, cbpitch);
+		vdptrstep(crrow, crpitch);
+	} while(--h);
+}
+
+DECLARE_YUV_PLANAR(YUV411, XRGB8888) {
+	// 4:1:1 planar YCbCr -> 32-bit XRGB8888.  Each group of four luma
+	// samples shares one chroma pair; intermediate pixels use 1/4-step
+	// linear chroma interpolation.  Right-edge leftovers reuse the last
+	// chroma sample unfiltered.
+	uint32 *out = (uint32 *)dst.data;
+	const ptrdiff_t opitch = dst.pitch;
+	const uint8 *yrow = (const uint8 *)src.data;
+	const uint8 *cbrow = (const uint8 *)src.data2;
+	const uint8 *crrow = (const uint8 *)src.data3;
+	const ptrdiff_t ypitch = src.pitch;
+	const ptrdiff_t cbpitch = src.pitch2;
+	const ptrdiff_t crpitch = src.pitch3;
+
+	vdpixsize wpairs = (w-1)>>2;	// full 4-pixel groups with a right neighbor
+	vdpixsize wleft = w - (wpairs<<2);
+
+	do {
+		uint32 *p = out;
+		const uint8 *y = yrow;
+		const uint8 *cb = cbrow;
+		const uint8 *cr = crrow;
+		vdpixsize wt;
+
+		if (wpairs > 0) {
+#ifdef _M_AMD64
+			wt = wpairs;
+
+			do {
+				const unsigned cb0 = cb[0];
+				const unsigned cb1 = cb[1];
+				const unsigned cr0 = cr[0];
+				const unsigned cr1 = cr[1];
+
+				// chroma weights 4:0, 3:1, 2:2, 1:3 across the group
+				p[0] = ycbcr_to_8888(y[0], cb0, cr0);
+				p[1] = ycbcr_to_8888(y[1], (3*cb0+cb1+2)>>2, (3*cr0+cr1+2)>>2);
+				p[2] = ycbcr_to_8888(y[2], (cb0+cb1+1)>>1, (cr0+cr1+1)>>1);
+				p[3] = ycbcr_to_8888(y[3], (cb0+3*cb1+2)>>2, (cr0+3*cr1+2)>>2);
+
+				y += 4;
+				p += 4;
+				++cb;
+				++cr;
+			} while(--wt);
+#else
+			// NOTE(review): uses the MMX scan routine here while the 1555/565
+			// variants call the ISSE ones -- looks intentional, but confirm
+			vdasm_pixblt_YUV411Planar_to_XRGB8888_scan_MMX(p, y, cb, cr, wpairs);
+			// skip past the pixels the assembly routine consumed
+			y += 4*wpairs;
+			cr += wpairs;
+			cb += wpairs;
+			p += 4*wpairs;
+#endif
+		}
+
+		if (wleft > 0) {
+			wt = wleft;
+
+			const uint8 cr0 = *cr;
+			const uint8 cb0 = *cb;
+
+			do {
+				*p++ = ycbcr_to_8888(*y++, cb0, cr0);
+			} while(--wt);
+		}
+
+		vdptrstep(out, opitch);
+		vdptrstep(yrow, ypitch);
+		vdptrstep(cbrow, cbpitch);
+		vdptrstep(crrow, crpitch);
+	} while(--h);
+
+#ifndef _M_AMD64
+	__asm emms
+#endif
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/blt_reference_yuv2yuv.cpp b/src/thirdparty/VirtualDub/Kasumi/source/blt_reference_yuv2yuv.cpp
new file mode 100644
index 000000000..b581e9bf7
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/blt_reference_yuv2yuv.cpp
@@ -0,0 +1,260 @@
+#include <vd2/system/vdtypes.h>
+#include <vd2/system/memory.h>
+#include <vd2/system/vdstl.h>
+#include <vd2/Kasumi/pixmap.h>
+#include <vd2/Kasumi/pixmaputils.h>
+
+#include "bitutils.h"
+#include "blt_spanutils.h"
+
+#define DECLARE_YUV(x, y) void VDPixmapBlt_##x##_to_##y##_reference(void *dst0, ptrdiff_t dstpitch, const void *src0, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h)
+#define DECLARE_YUV_PLANAR(x, y) void VDPixmapBlt_##x##_to_##y##_reference(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h)
+
+using namespace nsVDPixmapBitUtils;
+using namespace nsVDPixmapSpanUtils;
+
+DECLARE_YUV(XVYU, UYVY) {
+	// Convert packed 4:4:4 XVYU (32bpp) to packed 4:2:2 UYVY.  Luma is kept
+	// per pixel; chroma is downsampled with a 1-2-1 filter over neighboring
+	// pixel pairs.  wt counts upward from -w so loop tests compare to zero.
+	uint32 *dst = (uint32 *)dst0;
+	const uint32 *src = (const uint32 *)src0;
+
+	srcpitch -= (w&~1)*4;
+	dstpitch -= (w&~1)*2;
+
+	do {
+		vdpixsize wt = w;
+
+		wt = -wt;
+
+		if (++wt) {
+			uint32 a, b, c;
+
+			// first output word: left chroma tap replicated (edge clamp)
+			a = src[0];
+			b = src[1];
+			*dst++ = (avg_8888_121(a, a, b) & 0xff00ff) + (a & 0xff00) + ((b & 0xff00)<<16);
+			src += 2;
+
+			if ((wt+=2) < 0) {
+				do {
+					// interior: 1-2-1 filtered chroma, two luma samples
+					a = src[-1];
+					b = src[0];
+					c = src[1];
+
+					*dst++ = (avg_8888_121(a, b, c) & 0xff00ff) + (b & 0xff00) + ((c & 0xff00)<<16);
+					src += 2;
+				} while((wt+=2) < 0);
+			}
+		}
+
+		// even widths leave one trailing source pixel: copy it through
+		if (!(wt&1))
+			*dst = *src;
+
+		vdptrstep(src, srcpitch);
+		vdptrstep(dst, dstpitch);
+	} while(--h);
+}
+
+DECLARE_YUV(XVYU, YUYV) {
+	// Convert packed 4:4:4 XVYU (32bpp) to packed 4:2:2 YUYV.  Same 1-2-1
+	// chroma downsampling as the UYVY variant, but with luma/chroma byte
+	// lanes swapped for the YUYV ordering.
+	uint32 *dst = (uint32 *)dst0;
+	const uint32 *src = (const uint32 *)src0;
+
+	srcpitch -= (w&~1)*4;
+	dstpitch -= (w&~1)*2;
+
+	do {
+		vdpixsize wt = w;
+
+		wt = -wt;
+
+		if (++wt) {
+			uint32 a, b, c;
+
+			// first output word: left chroma tap replicated (edge clamp)
+			a = src[0];
+			b = src[1];
+			*dst++ = ((avg_8888_121(a, a, b) & 0xff00ff)<<8) + ((a & 0xff00)>>8) + ((b & 0xff00)<<8);
+			src += 2;
+
+			if ((wt+=2)<0) {
+				do {
+					a = src[-1];
+					b = src[0];
+					c = src[1];
+
+					*dst++ = ((avg_8888_121(a, b, c) & 0xff00ff)<<8) + ((b & 0xff00)>>8) + ((c & 0xff00)<<8);
+					src += 2;
+				} while((wt+=2) < 0);
+			}
+		}
+
+		// even widths leave one trailing source pixel: byte-swap it through
+		if (!(wt&1)) {
+			uint32 v = *src;
+			*dst = ((v&0xff00ff)<<8) + ((v&0xff00ff00)>>8);
+		}
+
+		vdptrstep(src, srcpitch);
+		vdptrstep(dst, dstpitch);
+	} while(--h);
+}
+
+DECLARE_YUV(UYVY, YUYV) {	// also YUYV->UYVY
+	// UYVY <-> YUYV is a plain byte swap within each 16-bit word, so the
+	// same routine converts in both directions.  Processes the row as
+	// 32-bit words (two pixels each), rounding odd widths up.
+	const uint32 *srcp = (const uint32 *)src0;
+	uint32 *dstp = (uint32 *)dst0;
+	const vdpixsize words = (w + 1) >> 1;
+
+	do {
+		for(vdpixsize i = 0; i < words; ++i) {
+			const uint32 v = srcp[i];
+			dstp[i] = ((v >> 8) & 0x00ff00ff) | ((v << 8) & 0xff00ff00);
+		}
+
+		vdptrstep(srcp, srcpitch);
+		vdptrstep(dstp, dstpitch);
+	} while(--h);
+}
+
+DECLARE_YUV(UYVY, Y8) {
+	// Extract the luma plane from UYVY: Y samples sit at odd byte offsets.
+	const uint8 *srcrow = (const uint8 *)src0;
+	uint8 *dstrow = (uint8 *)dst0;
+
+	do {
+		for(vdpixsize x = 0; x < w; ++x)
+			dstrow[x] = srcrow[2*x + 1];
+
+		vdptrstep(srcrow, srcpitch);
+		vdptrstep(dstrow, dstpitch);
+	} while(--h);
+}
+
+DECLARE_YUV(YUYV, Y8) {
+	// Extract the luma plane from YUYV: Y samples sit at even byte offsets.
+	const uint8 *srcrow = (const uint8 *)src0;
+	uint8 *dstrow = (uint8 *)dst0;
+
+	do {
+		for(vdpixsize x = 0; x < w; ++x)
+			dstrow[x] = srcrow[2*x];
+
+		vdptrstep(srcrow, srcpitch);
+		vdptrstep(dstrow, dstpitch);
+	} while(--h);
+}
+
+DECLARE_YUV(Y8, UYVY) {
+	// Expand grayscale to UYVY with neutral (0x80) chroma.  For odd widths
+	// the trailing half-pair is completed by replicating the last luma
+	// sample (this writes into the pitch padding past 2*w bytes).
+	const uint8 *srcrow = (const uint8 *)src0;
+	uint8 *dstrow = (uint8 *)dst0;
+
+	do {
+		uint8 *d = dstrow;
+
+		for(vdpixsize x = 0; x < w; ++x) {
+			d[0] = 0x80;
+			d[1] = srcrow[x];
+			d += 2;
+		}
+
+		if (w & 1) {
+			d[0] = 0x80;
+			d[1] = d[-1];
+		}
+
+		vdptrstep(srcrow, srcpitch);
+		vdptrstep(dstrow, dstpitch);
+	} while(--h);
+}
+
+DECLARE_YUV(Y8, YUYV) {
+	// Expand grayscale to YUYV with neutral (0x80) chroma.  For odd widths
+	// the trailing half-pair is completed by replicating the last luma
+	// sample (this writes into the pitch padding past 2*w bytes).
+	const uint8 *srcrow = (const uint8 *)src0;
+	uint8 *dstrow = (uint8 *)dst0;
+
+	do {
+		uint8 *d = dstrow;
+
+		for(vdpixsize x = 0; x < w; ++x) {
+			d[0] = srcrow[x];
+			d[1] = 0x80;
+			d += 2;
+		}
+
+		if (w & 1) {
+			d[0] = d[-1];
+			d[1] = 0x80;
+		}
+
+		vdptrstep(srcrow, srcpitch);
+		vdptrstep(dstrow, dstpitch);
+	} while(--h);
+}
+
+DECLARE_YUV_PLANAR(YUV411, YV12) {
+	// 4:1:1 -> 4:2:0 (YV12): luma is copied unchanged; each chroma plane is
+	// compressed 2x vertically and expanded 2x horizontally.  The same loop
+	// runs twice, once per chroma plane.
+	VDMemcpyRect(dst.data, dst.pitch, src.data, src.pitch, w, h);
+
+	// NOTE(review): both helpers are handed the full w here, while the
+	// chroma planes are only w/4 (src) and w/2 (dst) wide -- verify the
+	// helpers' count convention before relying on edge behavior
+	vdblock<uint8> tmprow(w);
+	const uint8 *srcp = (const uint8 *)src.data2;
+	ptrdiff_t srcpitch = src.pitch2;
+	uint8 *dstp = (uint8 *)dst.data2;
+	ptrdiff_t dstpitch = dst.pitch2;
+	const uint8 *src1, *src2;
+
+	// Cb plane: average vertical row pairs (last row duplicated for odd h),
+	// then widen horizontally
+	vdpixsize h2;
+	for(h2 = h; h2 > 0; h2 -= 2) {
+		src1 = srcp;
+		vdptrstep(srcp, srcpitch);
+		if (h2 > 1)
+			src2 = srcp;
+		else
+			src2 = src1;
+		vdptrstep(srcp, srcpitch);
+
+		const uint8 *sources[2] = {src1, src2};
+
+		vert_compress2x_centered_fast(tmprow.data(), sources, w, 0);
+		horiz_expand2x_coaligned(dstp, tmprow.data(), w);
+
+		vdptrstep(dstp, dstpitch);
+	}
+
+	// Cr plane: identical processing
+	srcp = (const uint8 *)src.data3;
+	srcpitch = src.pitch3;
+	dstp = (uint8 *)dst.data3;
+	dstpitch = dst.pitch3;
+	for(h2 = h; h2 > 0; h2 -= 2) {
+		src1 = srcp;
+		vdptrstep(srcp, srcpitch);
+		if (h2 > 1)
+			src2 = srcp;
+		else
+			src2 = src1;
+		vdptrstep(srcp, srcpitch);
+
+		const uint8 *sources[2] = {src1, src2};
+		vert_compress2x_centered_fast(tmprow.data(), sources, w, 0);
+		horiz_expand2x_coaligned(dstp, tmprow.data(), w);
+
+		vdptrstep(dstp, dstpitch);
+	}
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/blt_reference_yuvrev.cpp b/src/thirdparty/VirtualDub/Kasumi/source/blt_reference_yuvrev.cpp
new file mode 100644
index 000000000..d6f38bf65
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/blt_reference_yuvrev.cpp
@@ -0,0 +1,530 @@
+#include <vd2/system/cpuaccel.h>
+#include <vd2/system/vdtypes.h>
+#include <vd2/system/vdstl.h>
+#include <vd2/Kasumi/pixmap.h>
+#include <vd2/Kasumi/pixmaputils.h>
+#include "blt_spanutils.h"
+
+#ifdef _M_IX86
+ #include "blt_spanutils_x86.h"
+#endif
+
+using namespace nsVDPixmapSpanUtils;
+
+namespace {
+ // From Jim Blinn's "Dirty Pixels":
+ //
+ // Y = .299R + .587G + .114B
+ // Cr = 0.713(R-Y)
+ // Cb = 0.564(B-Y)
+ //
+ // IY = 219Y + 16 = ((yt = 1052IR + 2065IG + 401IB) + 67584) >> 12
+ // ICr = 224Cr + 128 = (yt*2987 - 10507932IR + 2155872256) >> 24
+ // ICb = 224Cb + 128 = (yt*2363 - 8312025IB + 2155872256) >> 24
+
+	// XRGB8888 -> packed XVYU. Fixed-point BT.601-style weights (see the
+	// derivation comment above); shifts are reduced from >>12/>>24 so Y
+	// lands in bits 8-15 and Cr in bits 16-23 before masking.
+	void ConvertRGB32ToXVYU32(uint32 *dst, const uint8 *src, sint32 count) {
+		do {
+			const sint32 r = src[2];
+			const sint32 g = src[1];
+			const sint32 b = src[0];
+			const sint32 yt = 1052*r + 2065*g + 401*b;
+			const sint32 y = (yt + 67584) >> 4;		// <<8 alignment shift
+			const sint32 cr = (10507932*r - yt*2987 + 2155872256U) >> 8;	// <<16 alignment shift
+			const sint32 cb = ( 8312025*b - yt*2363 + 2155872256U) >> 24;
+
+			*dst++ = (y&0xff00) + cb + (cr&0xff0000);	// VYU order
+			src += 4;
+		} while(--count);
+	}
+
+	// RGB888 -> packed XVYU; same math as the 32-bit path with a 3-byte
+	// source stride.
+	void ConvertRGB24ToXVYU32(uint32 *dst, const uint8 *src, sint32 count) {
+		do {
+			const sint32 r = src[2];
+			const sint32 g = src[1];
+			const sint32 b = src[0];
+			const sint32 yt = 1052*r + 2065*g + 401*b;
+			const sint32 y = (yt + 67584) >> 4;		// <<8 alignment shift
+			const sint32 cr = (10507932*r - yt*2987 + 2155872256U) >> 8;	// <<16 alignment shift
+			const sint32 cb = ( 8312025*b - yt*2363 + 2155872256U) >> 24;
+
+			*dst++ = (y&0xff00) + cb + (cr&0xff0000);	// VYU order
+			src += 3;
+		} while(--count);
+	}
+
+	// RGB565 -> packed XVYU; weights are rescaled for 5/6/5-bit channels
+	// (e.g. 2065*255/63 ~= 8358 for the 6-bit green).
+	void ConvertRGB16ToXVYU32(uint32 *dst, const uint16 *src, sint32 count) {
+		do {
+			const sint16 px = *src++;
+			const sint32 r = (px & 0xf800) >> 11;
+			const sint32 g = (px & 0x07e0) >> 5;
+			const sint32 b = (px & 0x001f);
+			const sint32 yt = 8652*r + 8358*g + 3299*b;
+			const sint32 y = (yt + 67584) >> 4;		// <<8 alignment shift
+			const sint32 cr = (86436217*r - yt*2987 + 2155872256U) >> 8;	// <<16 alignment shift
+			const sint32 cb = (68373108*b - yt*2363 + 2155872256U) >> 24;
+
+			*dst++ = (y&0xff00) + cb + (cr&0xff0000);	// VYU order
+		} while(--count);
+	}
+
+	// XRGB1555 -> packed XVYU; weights rescaled for 5/5/5-bit channels
+	// (2065*255/31 ~= 16986 for the 5-bit green).
+	void ConvertRGB15ToXVYU32(uint32 *dst, const uint16 *src, sint32 count) {
+		do {
+			const sint16 px = *src++;
+			const sint32 r = (px & 0x7c00) >> 10;
+			const sint32 g = (px & 0x03e0) >> 5;
+			const sint32 b = (px & 0x001f);
+			const sint32 yt = 8652*r + 16986*g + 3299*b;
+			const sint32 y = (yt + 67584) >> 4;		// <<8 alignment shift
+			const sint32 cr = (86436217*r - yt*2987 + 2155872256U) >> 8;	// <<16 alignment shift
+			const sint32 cb = (68373108*b - yt*2363 + 2155872256U) >> 24;
+
+			*dst++ = (y&0xff00) + cb + (cr&0xff0000);	// VYU order
+		} while(--count);
+	}
+
+	// XRGB8888 -> luma-only (Y8), 16-235 range via the +67584 (16.5<<12) bias.
+	void ConvertRGB32ToY8(uint8 *dst, const uint8 *src, sint32 count) {
+		do {
+			const sint32 r = src[2];
+			const sint32 g = src[1];
+			const sint32 b = src[0];
+			*dst++ = (uint8)((1052*r + 2065*g + 401*b + 67584) >> 12);
+			src += 4;
+		} while(--count);
+	}
+
+	// RGB888 -> luma-only (Y8); same math as the 32-bit path, 3-byte stride.
+	void ConvertRGB24ToY8(uint8 *dst, const uint8 *src, sint32 count) {
+		do {
+			const sint32 r = src[2];
+			const sint32 g = src[1];
+			const sint32 b = src[0];
+			*dst++ = (uint8)((1052*r + 2065*g + 401*b + 67584) >> 12);
+			src += 3;
+		} while(--count);
+	}
+
+	// RGB565 -> luma-only (Y8); weights rescaled for 5/6/5-bit channels.
+	void ConvertRGB16ToY8(uint8 *dst, const uint16 *src, sint32 count) {
+		do {
+			const sint16 px = *src++;
+			const sint32 r = (px & 0xf800) >> 11;
+			const sint32 g = (px & 0x07e0) >> 5;
+			const sint32 b = (px & 0x001f);
+			*dst++ = (uint8)((8652*r + 8358*g + 3299*b + 67584) >> 12);
+		} while(--count);
+	}
+
+	// XRGB1555 -> luma-only (Y8); weights rescaled for 5/5/5-bit channels.
+	void ConvertRGB15ToY8(uint8 *dst, const uint16 *src, sint32 count) {
+		do {
+			const sint16 px = *src++;
+			const sint32 r = (px & 0x7c00) >> 10;
+			const sint32 g = (px & 0x03e0) >> 5;
+			const sint32 b = (px & 0x001f);
+			*dst++ = (uint8)((8652*r + 16986*g + 3299*b + 67584) >> 12);
+		} while(--count);
+	}
+}
+
+#define DECLARE_YUV_REV(x, y) void VDPixmapBlt_##x##_to_##y##_reference(void *dst0, ptrdiff_t dstpitch, const void *src0, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h)
+
+// Row loop: XRGB1555 -> packed XVYU, one converter call per scanline.
+DECLARE_YUV_REV(XRGB1555, XVYU) {
+	do {
+		ConvertRGB15ToXVYU32((uint32 *)dst0, (const uint16 *)src0, w);
+
+		vdptrstep(dst0, dstpitch);
+		vdptrstep(src0, srcpitch);
+	} while(--h);
+}
+
+// Row loop: RGB565 -> packed XVYU, one converter call per scanline.
+DECLARE_YUV_REV(RGB565, XVYU) {
+	do {
+		ConvertRGB16ToXVYU32((uint32 *)dst0, (const uint16 *)src0, w);
+
+		vdptrstep(dst0, dstpitch);
+		vdptrstep(src0, srcpitch);
+	} while(--h);
+}
+
+// Row loop: RGB888 -> packed XVYU, one converter call per scanline.
+DECLARE_YUV_REV(RGB888, XVYU) {
+	do {
+		ConvertRGB24ToXVYU32((uint32 *)dst0, (const uint8 *)src0, w);
+
+		vdptrstep(dst0, dstpitch);
+		vdptrstep(src0, srcpitch);
+	} while(--h);
+}
+
+// Row loop: XRGB8888 -> packed XVYU, one converter call per scanline.
+DECLARE_YUV_REV(XRGB8888, XVYU) {
+	do {
+		ConvertRGB32ToXVYU32((uint32 *)dst0, (const uint8 *)src0, w);
+
+		vdptrstep(dst0, dstpitch);
+		vdptrstep(src0, srcpitch);
+	} while(--h);
+}
+
+// Row loop: XRGB1555 -> Y8 (luma only), one converter call per scanline.
+DECLARE_YUV_REV(XRGB1555, Y8) {
+	do {
+		ConvertRGB15ToY8((uint8 *)dst0, (const uint16 *)src0, w);
+
+		vdptrstep(dst0, dstpitch);
+		vdptrstep(src0, srcpitch);
+	} while(--h);
+}
+
+// Row loop: RGB565 -> Y8 (luma only), one converter call per scanline.
+DECLARE_YUV_REV(RGB565, Y8) {
+	do {
+		ConvertRGB16ToY8((uint8 *)dst0, (const uint16 *)src0, w);
+
+		vdptrstep(dst0, dstpitch);
+		vdptrstep(src0, srcpitch);
+	} while(--h);
+}
+
+// Row loop: RGB888 -> Y8 (luma only), one converter call per scanline.
+DECLARE_YUV_REV(RGB888, Y8) {
+	do {
+		ConvertRGB24ToY8((uint8 *)dst0, (const uint8 *)src0, w);
+
+		vdptrstep(dst0, dstpitch);
+		vdptrstep(src0, srcpitch);
+	} while(--h);
+}
+
+// Row loop: XRGB8888 -> Y8 (luma only), one converter call per scanline.
+DECLARE_YUV_REV(XRGB8888, Y8) {
+	do {
+		ConvertRGB32ToY8((uint8 *)dst0, (const uint8 *)src0, w);
+
+		vdptrstep(dst0, dstpitch);
+		vdptrstep(src0, srcpitch);
+	} while(--h);
+}
+
+
+
+
+
+namespace {
+	// XRGB8888 -> planar Y/Cb/Cr, one byte per plane per pixel.
+	// No chroma subsampling here; the caller filters afterward.
+	void ConvertRGB32ToYUVPlanar(uint8 *ydst, uint8 *cbdst, uint8 *crdst, const void *src0, sint32 count) {
+		const uint8 *src = (const uint8 *)src0;
+
+		do {
+			const sint32 r = src[2];
+			const sint32 g = src[1];
+			const sint32 b = src[0];
+			const sint32 yt = 1052*r + 2065*g + 401*b;
+			*ydst++ = (yt + 67584) >> 12;
+			*crdst++ = (10507932*r - yt*2987 + 2155872256U) >> 24;
+			*cbdst++ = ( 8312025*b - yt*2363 + 2155872256U) >> 24;
+			src += 4;
+		} while(--count);
+	}
+
+	// RGB888 -> planar Y/Cb/Cr; same math as the 32-bit path, 3-byte stride.
+	void ConvertRGB24ToYUVPlanar(uint8 *ydst, uint8 *cbdst, uint8 *crdst, const void *src0, sint32 count) {
+		const uint8 *src = (const uint8 *)src0;
+
+		do {
+			const sint32 r = src[2];
+			const sint32 g = src[1];
+			const sint32 b = src[0];
+			const sint32 yt = 1052*r + 2065*g + 401*b;
+			*ydst++ = (yt + 67584) >> 12;
+			*crdst++ = (10507932*r - yt*2987 + 2155872256U) >> 24;
+			*cbdst++ = ( 8312025*b - yt*2363 + 2155872256U) >> 24;
+			src += 3;
+		} while(--count);
+	}
+
+	// RGB565 -> planar Y/Cb/Cr; weights rescaled for 5/6/5-bit channels.
+	void ConvertRGB16ToYUVPlanar(uint8 *ydst, uint8 *cbdst, uint8 *crdst, const void *src0, sint32 count) {
+		const uint16 *src = (const uint16 *)src0;
+
+		do {
+			const sint16 px = *src++;
+			const sint32 r = (px & 0xf800) >> 11;
+			const sint32 g = (px & 0x07e0) >> 5;
+			const sint32 b = (px & 0x001f);
+			const sint32 yt = 8652*r + 8358*g + 3299*b;
+			*ydst++ = (yt + 67584) >> 12;
+			*crdst++ = (86436217*r - yt*2987 + 2155872256U) >> 24;
+			*cbdst++ = (68373108*b - yt*2363 + 2155872256U) >> 24;
+		} while(--count);
+	}
+
+	// XRGB1555 -> planar Y/Cb/Cr; weights rescaled for 5/5/5-bit channels.
+	void ConvertRGB15ToYUVPlanar(uint8 *ydst, uint8 *cbdst, uint8 *crdst, const void *src0, sint32 count) {
+		const uint16 *src = (const uint16 *)src0;
+
+		do {
+			const sint16 px = *src++;
+			const sint32 r = (px & 0x7c00) >> 10;
+			const sint32 g = (px & 0x03e0) >> 5;
+			const sint32 b = (px & 0x001f);
+			const sint32 yt = 8652*r + 16986*g + 3299*b;
+			*ydst++ = (yt + 67584) >> 12;
+			*crdst++ = (86436217*r - yt*2987 + 2155872256U) >> 24;
+			*cbdst++ = (68373108*b - yt*2363 + 2155872256U) >> 24;
+		} while(--count);
+	}
+
+	// UYVY -> planar: one luma byte per pixel, one Cb/Cr pair per two
+	// pixels. count is in luma samples; the mid-loop break handles odd
+	// counts (last macropixel only half used).
+	void ConvertUYVYToYUVPlanar(uint8 *ydst, uint8 *cbdst, uint8 *crdst, const void *src0, sint32 count) {
+		const uint8 *src = (const uint8 *)src0;
+
+		do {
+			*cbdst++ = src[0];
+			*ydst++ = src[1];
+			*crdst++ = src[2];
+			if (!--count)
+				break;
+			*ydst++ = src[3];
+			src += 4;
+		} while(--count);
+	}
+
+	// YUYV -> planar: same as the UYVY path with byte positions swapped
+	// (Y first, then Cb, second Y, Cr).
+	void ConvertYUYVToYUVPlanar(uint8 *ydst, uint8 *cbdst, uint8 *crdst, const void *src0, sint32 count) {
+		const uint8 *src = (const uint8 *)src0;
+
+		do {
+			*cbdst++ = src[1];
+			*ydst++ = src[0];
+			*crdst++ = src[3];
+			if (!--count)
+				break;
+			*ydst++ = src[2];
+			src += 4;
+		} while(--count);
+	}
+}
+
+// Encode a packed RGB/YUV source into a planar YUV destination.
+//
+// A per-row color converter (cfunc), an optional horizontal chroma filter
+// (hfunc) and an optional vertical chroma filter (vfunc) are selected from
+// the source/destination formats; rows are then streamed through scratch
+// buffers. The vertical path runs a ring of `winsize` chroma rows, with
+// edge rows duplicated at top/bottom.
+//
+// Fix: the Y8 fast path allocated only `tmpsize` bytes for its throwaway
+// chroma buffer but pointed crdst at cbdst + tmpsize, so every cfunc call
+// wrote up to w bytes past the end of the allocation (heap overflow). The
+// sibling branch above correctly allocates tmpsize * 2; this one now does
+// too.
+void VDPixmapBlt_YUVPlanar_encode_reference(const VDPixmap& dstbm, const VDPixmap& srcbm, vdpixsize w, vdpixsize h) {
+	void (*cfunc)(uint8 *ydst, uint8 *cbdst, uint8 *crdst, const void *src, sint32 w) = NULL;
+	void (*hfunc)(uint8 *dst, const uint8 *src, sint32 w) = NULL;
+	void (*vfunc)(uint8 *dst, const uint8 *const *sources, sint32 w, uint8 phase) = NULL;
+
+	bool halfchroma = false;
+
+	switch(srcbm.format) {
+	case nsVDPixmap::kPixFormat_XRGB1555:
+		cfunc = ConvertRGB15ToYUVPlanar;
+		break;
+	case nsVDPixmap::kPixFormat_RGB565:
+		cfunc = ConvertRGB16ToYUVPlanar;
+		break;
+	case nsVDPixmap::kPixFormat_RGB888:
+		cfunc = ConvertRGB24ToYUVPlanar;
+		break;
+	case nsVDPixmap::kPixFormat_XRGB8888:
+		cfunc = ConvertRGB32ToYUVPlanar;
+		break;
+	case nsVDPixmap::kPixFormat_YUV422_UYVY:
+		cfunc = ConvertUYVYToYUVPlanar;
+		halfchroma = true;
+		break;
+	case nsVDPixmap::kPixFormat_YUV422_YUYV:
+		cfunc = ConvertYUYVToYUVPlanar;
+		halfchroma = true;
+		break;
+	default:
+		VDNEVERHERE;
+		return;
+	}
+
+	// w2/h2: chroma plane dimensions; winsize/winstep/winposnext control
+	// the vertical filter window ring below.
+	vdpixsize w2 = w;
+	vdpixsize h2 = h;
+	int winstep = 1;
+	int winsize = 1;
+	int winposnext = 0;
+	vdpixsize chroma_srcw = w;
+
+	switch(dstbm.format) {
+
+	case nsVDPixmap::kPixFormat_YUV444_Planar:
+		if (halfchroma)
+			hfunc = horiz_expand2x_coaligned;
+		break;
+
+	case nsVDPixmap::kPixFormat_YUV422_Planar:
+		if (halfchroma)
+			chroma_srcw = (chroma_srcw + 1) >> 1;
+		else
+			hfunc = horiz_compress2x_coaligned;
+
+		w2 = (w2+1) >> 1;
+		break;
+
+	case nsVDPixmap::kPixFormat_YUV422_Planar_Centered:
+		if (halfchroma) {
+			chroma_srcw = (chroma_srcw + 1) >> 1;
+			hfunc = horiz_realign_to_centered;
+		} else
+			hfunc = horiz_compress2x_centered;
+
+		w2 = (w2+1) >> 1;
+		break;
+
+	case nsVDPixmap::kPixFormat_YUV420_Planar:
+		if (halfchroma)
+			chroma_srcw = (chroma_srcw + 1) >> 1;
+		else
+			hfunc = horiz_compress2x_coaligned;
+
+		vfunc = vert_compress2x_centered;
+		winstep = 2;
+		winposnext = 2;
+		winsize = 4;
+		h2 = (h+1) >> 1;
+		w2 = (w2+1) >> 1;
+		break;
+
+	case nsVDPixmap::kPixFormat_YUV420_Planar_Centered:
+		if (halfchroma) {
+			chroma_srcw = (chroma_srcw + 1) >> 1;
+			hfunc = horiz_realign_to_centered;
+		} else
+			hfunc = horiz_compress2x_centered;
+
+		vfunc = vert_compress2x_centered;
+		winstep = 2;
+		winposnext = 2;
+		winsize = 4;
+		h2 = (h+1) >> 1;
+		w2 = (w2+1) >> 1;
+		break;
+
+	case nsVDPixmap::kPixFormat_YUV411_Planar:
+		if (halfchroma) {
+			chroma_srcw = (chroma_srcw + 1) >> 1;
+			hfunc = horiz_compress2x_coaligned;
+		} else
+			hfunc = horiz_compress4x_coaligned;
+		w2 = (w2+1) >> 2;
+		break;
+
+	case nsVDPixmap::kPixFormat_YUV410_Planar:
+		if (halfchroma) {
+			chroma_srcw = (chroma_srcw + 1) >> 1;
+			hfunc = horiz_compress2x_coaligned;
+		} else
+			hfunc = horiz_compress4x_coaligned;
+		vfunc = vert_compress4x_centered;
+		winsize = 8;
+		winposnext = 5;
+		winstep = 4;
+		h2 = (h+3) >> 2;
+		w2 = (w2+3) >> 2;
+		break;
+	}
+
+#ifdef _M_IX86
+	uint32 cpuflags = CPUGetEnabledExtensions();
+
+	if (cpuflags & CPUF_SUPPORTS_INTEGER_SSE) {
+		if (hfunc == horiz_expand2x_coaligned)
+			hfunc = horiz_expand2x_coaligned_ISSE;
+	}
+#endif
+
+	const uint8 *src = (const uint8 *)srcbm.data;
+	const ptrdiff_t srcpitch = srcbm.pitch;
+
+	uint8 *ydst = (uint8 *)dstbm.data;
+	uint8 *cbdst = (uint8 *)dstbm.data2;
+	uint8 *crdst = (uint8 *)dstbm.data3;
+	const ptrdiff_t ydstpitch = dstbm.pitch;
+	const ptrdiff_t cbdstpitch = dstbm.pitch2;
+	const ptrdiff_t crdstpitch = dstbm.pitch3;
+
+	if (!vfunc) {
+		if (hfunc) {
+			// Horizontal-only chroma filtering: convert into scratch rows,
+			// then filter into the destination planes.
+			uint32 tmpsize = (w + 15) & ~15;
+
+			vdblock<uint8> tmp(tmpsize * 2);
+			uint8 *const cbtmp = tmp.data();
+			uint8 *const crtmp = cbtmp + tmpsize;
+
+			do {
+				cfunc(ydst, cbtmp, crtmp, src, w);
+				src += srcpitch;
+				ydst += ydstpitch;
+				hfunc(cbdst, cbtmp, chroma_srcw);
+				hfunc(crdst, crtmp, chroma_srcw);
+				cbdst += cbdstpitch;
+				crdst += crdstpitch;
+			} while(--h);
+		} else if (dstbm.format == nsVDPixmap::kPixFormat_Y8) {
+			// wasteful, but oh well
+			uint32 tmpsize = (w2+15)&~15;
+			vdblock<uint8> tmp(tmpsize * 2);	// x2: scratch holds BOTH Cb and Cr rows (crdst = cbdst + tmpsize); the original tmpsize-only allocation overflowed
+
+			cbdst = tmp.data();
+			crdst = cbdst + tmpsize;
+
+			do {
+				cfunc(ydst, cbdst, crdst, src, w);
+				src += srcpitch;
+				ydst += ydstpitch;
+			} while(--h2);
+		} else {
+			// Direct 4:4:4 conversion into the destination planes.
+			do {
+				cfunc(ydst, cbdst, crdst, src, w);
+				src += srcpitch;
+				ydst += ydstpitch;
+				cbdst += cbdstpitch;
+				crdst += crdstpitch;
+			} while(--h2);
+		}
+	} else {
+		// Vertical (possibly plus horizontal) chroma filtering via a ring
+		// of winsize chroma rows, mirrored so vfunc can index a window.
+		const uint32 tmpsize = w2;
+
+		vdblock<uint8> tmpbuf(tmpsize * (winsize + 1) * 2 + 2 * w);
+
+		uint8 *cbwindow[16];
+		uint8 *crwindow[16];
+
+		uint8 *p = tmpbuf.data();
+		for(int i=0; i<winsize; ++i) {
+			cbwindow[i] = cbwindow[winsize+i] = p;
+			p += tmpsize;
+			crwindow[i] = crwindow[winsize+i] = p;
+			p += tmpsize;
+		}
+
+		uint8 *cbtmp = p;
+		uint8 *crtmp = p + w;
+
+		int winoffset;
+		int winpos = winposnext - winsize;
+		bool firstline = true;
+
+		do {
+			while(winpos < winposnext) {
+				winoffset = ++winpos & (winsize - 1);
+
+				bool valid = (unsigned)(winpos-1) < (unsigned)(h-1);	// -1 because we generate line 0 as the first window line
+				if (valid || firstline) {
+					if (hfunc) {
+						cfunc(ydst, cbtmp, crtmp, src, w);
+						hfunc(cbwindow[winoffset + winsize - 1], cbtmp, chroma_srcw);
+						hfunc(crwindow[winoffset + winsize - 1], crtmp, chroma_srcw);
+					} else {
+						cfunc(ydst, cbwindow[winoffset + winsize - 1], crwindow[winoffset + winsize - 1], src, w);
+					}
+					src += srcpitch;
+					ydst += ydstpitch;
+					firstline = false;
+				} else {
+					// dupe last generated line -- could be done by pointer swabbing, but I'm lazy
+					memcpy(cbwindow[winoffset + winsize - 1], cbwindow[winoffset + winsize - 2], w2);
+					memcpy(crwindow[winoffset + winsize - 1], crwindow[winoffset + winsize - 2], w2);
+				}
+			}
+			winposnext += winstep;
+
+			vfunc(cbdst, cbwindow + winoffset, w2, 0);
+			vfunc(crdst, crwindow + winoffset, w2, 0);
+			cbdst += cbdstpitch;
+			crdst += crdstpitch;
+		} while(--h2);
+	}
+
+#ifdef _M_IX86
+	if (cpuflags & CPUF_SUPPORTS_MMX) {
+		__asm emms
+	}
+#endif
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/blt_setup.cpp b/src/thirdparty/VirtualDub/Kasumi/source/blt_setup.cpp
new file mode 100644
index 000000000..ce999221a
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/blt_setup.cpp
@@ -0,0 +1,17 @@
+#include "blt_setup.h"
+
+// Reset the dispatch table: every (src, dst) entry becomes a null blitter.
+void VDPixmapBlitterTable::Clear() {
+	memset(mTable, 0, sizeof mTable);
+}
+
+// Register one blitter for every (src, dst) pair in the cross product of
+// the two subsets; identical-format pairs are skipped (identity copy).
+// NOTE(review): dstFormats is taken by non-const reference though it is
+// only read — consider const for symmetry with srcFormats.
+void VDPixmapBlitterTable::AddBlitter(const VDPixmapFormatSubset& srcFormats, VDPixmapFormatSubset& dstFormats, VDPixmapBlitterFn blitter) {
+	for(int i=0; i<srcFormats.mFormatCount; ++i) {
+		int srcFormat = srcFormats.mFormats[i];
+		for(int j=0; j<dstFormats.mFormatCount; ++j) {
+			int dstFormat = dstFormats.mFormats[j];
+
+			if (srcFormat != dstFormat)
+				mTable[srcFormat][dstFormat] = blitter;
+		}
+	}
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/blt_spanutils.cpp b/src/thirdparty/VirtualDub/Kasumi/source/blt_spanutils.cpp
new file mode 100644
index 000000000..6baeeca36
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/blt_spanutils.cpp
@@ -0,0 +1,365 @@
+#include "blt_spanutils.h"
+#include "bitutils.h"
+
+using namespace nsVDPixmapBitUtils;
+
+namespace nsVDPixmapSpanUtils {
+	// 2x horizontal upsample, centered phase: interior samples use the
+	// [3 1]/4 and [1 3]/4 interpolation pair; w counts output samples.
+	// The negated-counter loop style is shared by all span routines here.
+	void horiz_expand2x_centered(uint8 *dst, const uint8 *src, sint32 w) {
+		w = -w;
+
+		*dst++ = *src;
+
+		if (++w) {
+			if (++w) {
+				do {
+					dst[0] = (uint8)((3*src[0] + src[1] + 2)>>2);
+					dst[1] = (uint8)((src[0] + 3*src[1] + 2)>>2);
+					dst += 2;
+					++src;
+				} while((w+=2)<0);
+			}
+
+			if (!(w & 1)) {
+				*dst = src[0];	// replicate the final sample
+			}
+		}
+	}
+
+	// 2x horizontal upsample, coaligned phase: even outputs copy the
+	// source, odd outputs are the [1 1]/2 average; trailing outputs
+	// replicate the last sample. w counts output samples.
+	void horiz_expand2x_coaligned(uint8 *dst, const uint8 *src, sint32 w) {
+		w = -w;
+
+		if ((w+=2) < 0) {
+			do {
+				dst[0] = src[0];
+				dst[1] = (uint8)((src[0] + src[1] + 1)>>1);
+				dst += 2;
+				++src;
+			} while((w+=2)<0);
+		}
+
+		w -= 2;
+		while(w < 0) {
+			++w;
+			*dst++ = src[0];
+		}
+	}
+
+	// 4x horizontal upsample, coaligned phase: per source pair, outputs at
+	// weights 1, 3/4+1/4, 1/2+1/2, 1/4+3/4; tail replicates the last
+	// sample. w counts output samples.
+	void horiz_expand4x_coaligned(uint8 *dst, const uint8 *src, sint32 w) {
+		w = -w;
+
+		if ((w+=4) < 0) {
+			do {
+				dst[0] = src[0];
+				dst[1] = (uint8)((3*src[0] + src[1] + 2)>>2);
+				dst[2] = (uint8)((src[0] + src[1] + 1)>>1);
+				dst[3] = (uint8)((src[0] + 3*src[1] + 2)>>2);
+				dst += 4;
+				++src;
+			} while((w+=4)<0);
+		}
+
+		w -= 4;
+		while(w < 0) {
+			++w;
+			*dst++ = src[0];
+		}
+	}
+
+	// 2:1 horizontal downsample, coaligned phase: [1 2 1]/4 kernel with
+	// renormalized edge kernels ([3 1]/4 first, [1 3]/4 last). w is the
+	// source width.
+	void horiz_compress2x_coaligned(uint8 *dst, const uint8 *src, sint32 w) {
+		if (w == 1) {
+			*dst = *src;
+			return;
+		}
+
+		*dst++ = (uint8)((3*src[0] + src[1] + 2) >> 2);
+		++src;
+		--w;
+
+		while(w >= 3) {
+			w -= 2;
+			*dst++ = (uint8)((src[0] + 2*src[1] + src[2] + 2) >> 2);
+			src += 2;
+		}
+
+		if (w >= 2)
+			*dst++ = (uint8)((src[0] + 3*src[1] + 2) >> 2);
+	}
+
+	// 2:1 horizontal downsample, centered phase: [1 3 3 1]/8 kernel with
+	// renormalized edge kernels; all kernels sum to 8. w is the source
+	// width.
+	void horiz_compress2x_centered(uint8 *dst, const uint8 *src, sint32 w) {
+		if (w == 1) {
+			*dst = *src;
+			return;
+		}
+
+		if (w == 2) {
+			*dst = (uint8)((src[0] + src[1] + 1) >> 1);
+			return;
+		}
+
+		*dst++ = (uint8)((4*src[0] + 3*src[1] + src[2] + 4) >> 3);
+		--w;
+		++src;
+
+		while(w >= 4) {
+			w -= 2;
+			*dst++ = (uint8)(((src[0] + src[3]) + 3*(src[1] + src[2]) + 4) >> 3);
+			src += 2;
+		}
+
+		switch(w) {
+			case 3:
+				*dst++ = (uint8)((src[0] + 3*src[1] + 4*src[2] + 4) >> 3);
+				break;
+			case 2:
+				*dst++ = (uint8)((src[0] + 7*src[1] + 4) >> 3);
+				break;
+		}
+	}
+
+	// 4:1 horizontal downsample, coaligned phase: [1 4 6 4 1]/16 kernel
+	// with renormalized edge kernels (each sums to 16). w is the source
+	// width.
+	void horiz_compress4x_coaligned(uint8 *dst, const uint8 *src, sint32 w) {
+		if (w == 1) {
+			*dst = *src;
+			return;
+		}
+
+		if (w == 2) {
+			*dst++ = (uint8)((11*src[0] + 5*src[1] + 8) >> 4);
+			return;
+		}
+
+		*dst++ = (uint8)((11*src[0] + 4*src[1] + src[2] + 8) >> 4);
+		src += 2;
+		w -= 2;
+
+		while(w >= 5) {
+			w -= 4;
+			*dst++ = (uint8)(((src[0] + src[4]) + 4*(src[1] + src[3]) + 6*src[2] + 8) >> 4);
+			src += 4;
+		}
+
+		switch(w) {
+			case 4:
+				*dst = (uint8)((src[0] + 4*src[1] + 6*src[2] + 5*src[3] + 8) >> 4);
+				break;
+			case 3:
+				*dst = (uint8)((src[0] + 4*src[1] + 11*src[2] + 8) >> 4);
+				break;
+		}
+	}
+
+	// 4:1 horizontal downsample, centered phase: [1 7 21 35 35 21 7 1]/128
+	// interior kernel with renormalized edge kernels — each edge kernel's
+	// weights (shown in the comments) sum to 128. w is the source width.
+	//
+	// Fix: the w==3 case read src[1] twice (29+99+0 weighting, i.e. the
+	// w==2 kernel) instead of applying the documented 29/35/64 taps to
+	// src[0..2]; the last tap now reads src[2].
+	void horiz_compress4x_centered(uint8 *dst, const uint8 *src, sint32 w) {
+
+		switch(w) {
+			case 1:
+				*dst = *src;
+				return;
+			case 2:		// 29 99
+				*dst = (uint8)((29*src[0] + 99*src[1] + 64) >> 7);
+				return;
+			case 3:		// 29 35 64
+				*dst = (uint8)((29*src[0] + 35*src[1] + 64*src[2] + 64) >> 7);
+				return;
+			case 4:		// 29 35 35 29
+				*dst = (uint8)((29*src[0] + 35*(src[1] + src[2]) + 29*src[3] + 64) >> 7);
+				return;
+			case 5:		// 29 35 35 21 8
+					// 1 7 120
+				dst[0] = (uint8)((29*src[0] + 35*(src[1] + src[2]) + 21*src[3] + 8*src[4] + 64) >> 7);
+				dst[1] = (uint8)((src[2] + 7*src[3] + 120*src[4] + 64) >> 7);
+				return;
+		}
+
+		*dst++ = (uint8)((29*src[0] + 35*(src[1] + src[2]) + 21*src[3] + 7*src[4] + src[5] + 64) >> 7);
+		src += 2;
+		w -= 2;
+
+		while(w >= 8) {
+			w -= 4;
+			*dst++ = (uint8)(((src[0] + src[7]) + 7*(src[1] + src[6]) + 21*(src[2] + src[5]) + 35*(src[3] + src[4]) + 64) >> 7);
+			src += 4;
+		}
+
+		switch(w) {
+			case 4:		// 1 7 21 99
+				*dst = (uint8)((src[0] + 7*src[1] + 21*src[2] + 99*src[3] + 64) >> 7);
+				break;
+			case 5:		// 1 7 21 35 64
+				*dst = (uint8)((src[0] + 7*src[1] + 21*src[2] + 35*src[3] + 64*src[4] + 64) >> 7);
+				break;
+			case 6:		// 1 7 21 35 35 29
+				*dst = (uint8)((src[0] + 7*src[1] + 21*src[2] + 29*src[5] + 35*(src[3] + src[4]) + 64) >> 7);
+				break;
+			case 7:		// 1 7 21 35 35 21 8
+					// 1 7 120
+				dst[0] = (uint8)((src[0] + 7*src[1] + 8*src[6] + 21*(src[2] + src[5]) + 35*(src[3] + src[4]) + 64) >> 7);
+				dst[1] = (uint8)((src[4] + 7*src[5] + 120*src[6] + 64) >> 7);
+				break;
+		}
+	}
+
+	// Shift coaligned chroma samples right by a quarter luma sample so
+	// they land on the centered grid; see the phase diagram below.
+	void horiz_realign_to_centered(uint8 *dst, const uint8 *src, sint32 w) {
+		// luma samples: Y Y Y Y Y
+		// coaligned: C C C
+		// centered: C C
+		//
+		// To realign coaligned samples to centered, we need to shift them
+		// right by a quarter sample in chroma space. This can be done via
+		// a [3 1]/4 filter.
+
+		for(sint32 i=1; i<w; ++i) {
+			dst[0] = (uint8)((3*(uint32)src[0] + (uint32)src[1] + 2) >> 2);
+			++dst;
+			++src;
+		}
+
+		*dst++ = *src++;	// last sample has no right neighbor: copy
+	}
+
+	// Inverse of horiz_realign_to_centered: shift centered chroma left a
+	// quarter luma sample back onto the coaligned grid.
+	void horiz_realign_to_coaligned(uint8 *dst, const uint8 *src, sint32 w) {
+		// luma samples: Y Y Y Y Y
+		// coaligned: C C C
+		// centered: C C
+		//
+		// To realign centered samples to coaligned, we need to shift them
+		// left by a quarter sample in chroma space. This can be done via
+		// a [1 3]/4 filter.
+
+		*dst++ = *src++;	// first sample has no left neighbor: copy
+
+		for(sint32 i=1; i<w; ++i) {
+			dst[0] = (uint8)(((uint32)src[-1] + 3*(uint32)src[0] + 2) >> 2);
+			++dst;
+			++src;
+		}
+	}
+
+	// Vertical 2x expand, centered: output = (3*near + far + 2)/4. phase
+	// selects which source line is "near". The fast path computes the
+	// weighted average 4 bytes at a time with the classic SWAR trick:
+	// ab = floor-avg(a,b), result = ceil-avg(a,ab) ~= (3a+b+2)/4.
+	// NOTE(review): the uint32 path assumes rows are safely readable and
+	// writable in 4-byte units — confirm row padding at call sites.
+	void vert_expand2x_centered(uint8 *dst, const uint8 *const *srcs, sint32 w, uint8 phase) {
+		const uint8 *src3 = srcs[0];
+		const uint8 *src1 = srcs[1];
+
+		if (phase >= 128)
+			std::swap(src1, src3);
+
+		sint32 w4 = w>>2;
+		w &= 3;
+
+		if (w4) {
+			const uint32 *src34 = (const uint32 *)src3;
+			const uint32 *src14 = (const uint32 *)src1;
+			uint32 *dst4 = ( uint32 *)dst;
+
+			do {
+				const uint32 a = *src34++;
+				const uint32 b = *src14++;
+				const uint32 ab = (a&b) + (((a^b)&0xfefefefe)>>1);	// floor avg per byte
+
+				*dst4++ = (a|ab) - (((a^ab)&0xfefefefe)>>1);		// ceil avg(a, ab)
+			} while(--w4);
+
+			src3 = (const uint8 *)src34;
+			src1 = (const uint8 *)src14;
+			dst = ( uint8 *)dst4;
+		}
+
+		if (w) {
+			do {
+				*dst++ = (uint8)((*src1++ + 3**src3++ + 2) >> 2);
+			} while(--w);
+		}
+	}
+
+	// Vertical 4x expand, centered: the top two phase bits select one of
+	// the four interpolation weights 1/8, 3/8, 5/8, 7/8 between the two
+	// source lines.
+	void vert_expand4x_centered(uint8 *dst, const uint8 *const *srcs, sint32 w, uint8 phase) {
+		const uint8 *src3 = srcs[0];
+		const uint8 *src1 = srcs[1];
+
+		switch(phase & 0xc0) {
+			case 0x00:
+				do {
+					*dst++ = (uint8)((1**src1++ + 7**src3++ + 4) >> 3);
+				} while(--w);
+				break;
+			case 0x40:
+				do {
+					*dst++ = (uint8)((3**src1++ + 5**src3++ + 4) >> 3);
+				} while(--w);
+				break;
+			case 0x80:
+				do {
+					*dst++ = (uint8)((5**src1++ + 3**src3++ + 4) >> 3);
+				} while(--w);
+				break;
+			case 0xc0:
+				do {
+					*dst++ = (uint8)((7**src1++ + 1**src3++ + 4) >> 3);
+				} while(--w);
+				break;
+			default:
+				VDNEVERHERE;
+		}
+	}
+
+	// Fast vertical 2:1 compress: plain [1 1]/2 average of two lines
+	// ("fast" variant: ignores phase and the wider centered kernel).
+	// Bulk path averages 4 bytes at a time via avg_8888_11.
+	// NOTE(review): the uint32 path reads/writes up to 3 bytes past w when
+	// w is not a multiple of 4? No — it stops 3 early (w+=3 pre-bias) and
+	// the byte loop finishes the tail.
+	void vert_compress2x_centered_fast(uint8 *dst, const uint8 *const *srcarray, sint32 w, uint8 phase) {
+		const uint8 *src1 = srcarray[0];
+		const uint8 *src2 = srcarray[1];
+
+		w = -w;
+		w += 3;
+
+		while(w < 0) {
+			*(uint32 *)dst = avg_8888_11(*(uint32 *)src1, *(uint32 *)src2);
+			dst += 4;
+			src1 += 4;
+			src2 += 4;
+			w += 4;
+		}
+
+		w -= 3;
+
+		while(w < 0) {
+			*dst = (uint8)((*src1 + *src2 + 1)>>1);
+			++dst;
+			++src1;
+			++src2;
+			++w;
+		}
+	}
+
+	// Vertical 2:1 compress, centered: [1 3 3 1]/8 kernel over a 4-line
+	// window (phase unused).
+	void vert_compress2x_centered(uint8 *dst, const uint8 *const *srcarray, sint32 w, uint8 phase) {
+		const uint8 *src1 = srcarray[0];
+		const uint8 *src2 = srcarray[1];
+		const uint8 *src3 = srcarray[2];
+		const uint8 *src4 = srcarray[3];
+
+		w = -w;
+
+		while(w < 0) {
+			*dst++ = (uint8)(((*src1++ + *src4++) + 3*(*src2++ + *src3++) + 4)>>3);
+			++w;
+		}
+	}
+
+	// Vertical 4:1 compress, centered: [1 7 21 35 35 21 7 1]/128 kernel
+	// over an 8-line window (phase unused).
+	void vert_compress4x_centered(uint8 *dst, const uint8 *const *srcarray, sint32 w, uint8 phase) {
+		const uint8 *src1 = srcarray[0];
+		const uint8 *src2 = srcarray[1];
+		const uint8 *src3 = srcarray[2];
+		const uint8 *src4 = srcarray[3];
+		const uint8 *src5 = srcarray[4];
+		const uint8 *src6 = srcarray[5];
+		const uint8 *src7 = srcarray[6];
+		const uint8 *src8 = srcarray[7];
+
+		w = -w;
+
+		while(w < 0) {
+			int sum18 = *src1++ + *src8++;
+			int sum27 = *src2++ + *src7++;
+			int sum36 = *src3++ + *src6++;
+			int sum45 = *src4++ + *src5++;
+
+			*dst++ = (uint8)((sum18 + 7*sum27 + 21*sum36 + 35*sum45 + 64) >> 7);
+
+			++w;
+		}
+	}
+}
+
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/blt_spanutils_x86.cpp b/src/thirdparty/VirtualDub/Kasumi/source/blt_spanutils_x86.cpp
new file mode 100644
index 000000000..ea9e0599a
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/blt_spanutils_x86.cpp
@@ -0,0 +1,170 @@
+// VirtualDub - Video processing and capture application
+// Graphics support library
+// Copyright (C) 1998-2007 Avery Lee
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+#include "blt_spanutils_x86.h"
+
+#ifdef _MSC_VER
+ #pragma warning(disable: 4799) // warning C4799: function 'nsVDPixmapSpanUtils::vdasm_horiz_expand2x_coaligned_ISSE' has no EMMS instruction
+#endif
+
+extern "C" void __cdecl vdasm_horiz_expand2x_coaligned_ISSE(void *dst, const void *src, uint32 count);
+extern "C" void __cdecl vdasm_horiz_expand4x_coaligned_MMX(void *dst, const void *src, uint32 count);
+extern "C" void __cdecl vdasm_vert_average_13_ISSE(void *dst, const void *src1, const void *src3, uint32 count);
+extern "C" void __cdecl vdasm_vert_average_17_ISSE(void *dst, const void *src1, const void *src3, uint32 count);
+extern "C" void __cdecl vdasm_vert_average_35_ISSE(void *dst, const void *src1, const void *src3, uint32 count);
+
+namespace nsVDPixmapSpanUtils {
+
+	// ISSE-accelerated 2x coaligned expand: bulk of the row is handled by
+	// the asm kernel in 16-output chunks; the scalar tail below is
+	// identical to the reference horiz_expand2x_coaligned.
+	void horiz_expand2x_coaligned_ISSE(uint8 *dst, const uint8 *src, sint32 w) {
+		if (w >= 17) {
+			uint32 fastcount = (w - 1) & ~15;
+
+			vdasm_horiz_expand2x_coaligned_ISSE(dst, src, fastcount);
+			dst += fastcount;
+			src += fastcount >> 1;
+			w -= fastcount;
+		}
+
+		w = -w;
+		if ((w+=2) < 0) {
+			do {
+				dst[0] = src[0];
+				dst[1] = (uint8)((src[0] + src[1] + 1)>>1);
+				dst += 2;
+				++src;
+			} while((w+=2)<0);
+		}
+
+		w -= 2;
+		while(w < 0) {
+			++w;
+			*dst++ = src[0];
+		}
+	}
+
+	// MMX-accelerated 4x coaligned expand: asm kernel processes
+	// fastcount*16 outputs (count passed in 16-output units); scalar tail
+	// matches the reference horiz_expand4x_coaligned.
+	void horiz_expand4x_coaligned_MMX(uint8 *dst, const uint8 *src, sint32 w) {
+		if (w >= 17) {
+			uint32 fastcount = (w - 1) >> 4;
+
+			vdasm_horiz_expand4x_coaligned_MMX(dst, src, fastcount);
+			dst += fastcount << 4;
+			src += fastcount << 2;
+			w -= fastcount << 4;
+		}
+
+		w = -w;
+		if ((w+=4) < 0) {
+			do {
+				dst[0] = src[0];
+				dst[1] = (uint8)((3*src[0] + src[1] + 2)>>2);
+				dst[2] = (uint8)((src[0] + src[1] + 1)>>1);
+				dst[3] = (uint8)((src[0] + 3*src[1] + 2)>>2);
+				dst += 4;
+				++src;
+			} while((w+=4)<0);
+		}
+
+		w -= 4;
+		while(w < 0) {
+			++w;
+			*dst++ = src[0];
+		}
+	}
+
+	// ISSE vertical 2x centered expand: (3*near + far)/4 via the asm 1:3
+	// averaging kernel for the 16-byte-aligned bulk, scalar tail after.
+	void vert_expand2x_centered_ISSE(uint8 *dst, const uint8 *const *srcs, sint32 w, uint8 phase) {
+		const uint8 *src3 = srcs[0];
+		const uint8 *src1 = srcs[1];
+
+		if (phase >= 128)
+			std::swap(src1, src3);
+
+		uint32 fastcount = w & ~15;
+
+		if (fastcount) {
+			vdasm_vert_average_13_ISSE(dst, src1, src3, fastcount);
+			dst += fastcount;
+			src1 += fastcount;
+			src3 += fastcount;
+			w -= fastcount;
+		}
+
+		if (w) {
+			do {
+				*dst++ = (uint8)((*src1++ + 3**src3++ + 2) >> 2);
+			} while(--w);
+		}
+	}
+
+	// Weighted line blend (src1 + 7*src7)/8: asm kernel for the 8-byte
+	// bulk, scalar tail after.
+	void vert_average_1_7_ISSE(uint8 *dst, const uint8 *src7, const uint8 *src1, sint32 w) {
+		uint32 fastcount = w & ~7;
+
+		if (fastcount) {
+			vdasm_vert_average_17_ISSE(dst, src1, src7, fastcount);
+			dst += fastcount;
+			src1 += fastcount;
+			src7 += fastcount;
+			w -= fastcount;
+		}
+
+		if (w) {
+			do {
+				*dst++ = (uint8)((*src1++ + 7**src7++ + 4) >> 3);
+			} while(--w);
+		}
+	}
+
+	// Weighted line blend (3*src1 + 5*src7)/8: asm kernel for the 8-byte
+	// bulk, scalar tail after.
+	void vert_average_3_5_ISSE(uint8 *dst, const uint8 *src7, const uint8 *src1, sint32 w) {
+		uint32 fastcount = w & ~7;
+
+		if (fastcount) {
+			vdasm_vert_average_35_ISSE(dst, src1, src7, fastcount);
+			dst += fastcount;
+			src1 += fastcount;
+			src7 += fastcount;
+			w -= fastcount;
+		}
+
+		if (w) {
+			do {
+				*dst++ = (uint8)((3**src1++ + 5**src7++ + 4) >> 3);
+			} while(--w);
+		}
+	}
+
+	// ISSE vertical 4x centered expand: the top two phase bits pick the
+	// 1/8, 3/8, 5/8 or 7/8 blend, dispatched to the 1:7 and 3:5 helpers
+	// with operands swapped for the mirrored phases.
+	void vert_expand4x_centered_ISSE(uint8 *dst, const uint8 *const *srcs, sint32 w, uint8 phase) {
+		const uint8 *src1 = srcs[0];
+		const uint8 *src2 = srcs[1];
+
+		switch(phase & 0xc0) {
+			case 0x00:
+				vert_average_1_7_ISSE(dst, src2, src1, w);
+				break;
+			case 0x40:
+				vert_average_3_5_ISSE(dst, src2, src1, w);
+				break;
+			case 0x80:
+				vert_average_3_5_ISSE(dst, src1, src2, w);
+				break;
+			case 0xc0:
+				vert_average_1_7_ISSE(dst, src1, src2, w);
+				break;
+			default:
+				VDNEVERHERE;
+		}
+	}
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/blt_uberblit.cpp b/src/thirdparty/VirtualDub/Kasumi/source/blt_uberblit.cpp
new file mode 100644
index 000000000..dcaa20907
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/blt_uberblit.cpp
@@ -0,0 +1,19 @@
+#include <vd2/system/vdalloc.h>
+#include <vd2/Kasumi/pixmap.h>
+#include "uberblit.h"
+
+// Fallback blit path: clamps the requested size to both surfaces and
+// delegates to a dynamically constructed uberblit pipeline; the blitter
+// is owned (and destroyed) via vdautoptr.
+void VDPixmapBlt_UberblitAdapter(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h) {
+	vdautoptr<IVDPixmapBlitter> blitter(VDPixmapCreateBlitter(dst, src));
+
+	if (w > src.w)
+		w = src.w;
+	if (w > dst.w)
+		w = dst.w;
+	if (h > src.h)
+		h = src.h;
+	if (h > dst.h)
+		h = dst.h;
+
+	vdrect32 r(0, 0, w, h);
+	blitter->Blit(dst, &r, src);
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/blt_x86.cpp b/src/thirdparty/VirtualDub/Kasumi/source/blt_x86.cpp
new file mode 100644
index 000000000..af1519c5b
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/blt_x86.cpp
@@ -0,0 +1,144 @@
+#include <vd2/system/vdtypes.h>
+#include <vd2/Kasumi/pixmap.h>
+#include <vd2/Kasumi/pixmaputils.h>
+#include "blt_setup.h"
+
+void VDPixmapInitBlittersReference(VDPixmapBlitterTable& table);
+
+#define DECLARE_PALETTED(x, y) extern void VDPixmapBlt_##x##_to_##y##_reference(void *dst0, ptrdiff_t dstpitch, const void *src0, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h, const void *pal0);
+#define DECLARE_RGB(x, y) extern void VDPixmapBlt_##x##_to_##y##_reference(void *dst0, ptrdiff_t dstpitch, const void *src0, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h);
+#define DECLARE_RGB_ASM(x, y) extern "C" void vdasm_pixblt_##x##_to_##y(void *dst0, ptrdiff_t dstpitch, const void *src0, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h);
+#define DECLARE_RGB_ASM_MMX(x, y) extern "C" void vdasm_pixblt_##x##_to_##y##_MMX(void *dst0, ptrdiff_t dstpitch, const void *src0, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h);
+#define DECLARE_YUV(x, y) extern void VDPixmapBlt_##x##_to_##y##_reference(void *dst0, ptrdiff_t dstpitch, const void *src0, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h);
+#define DECLARE_YUV_REV(x, y) void VDPixmapBlt_##x##_to_##y##_reference(void *dst0, ptrdiff_t dstpitch, const void *src0, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h)
+#define DECLARE_YUV_PLANAR(x, y) extern void VDPixmapBlt_##x##_to_##y##_reference(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h);
+
+ DECLARE_RGB_ASM(RGB565, XRGB1555); DECLARE_RGB_ASM_MMX(RGB565, XRGB1555);
+ DECLARE_RGB_ASM(RGB888, XRGB1555);
+ DECLARE_RGB_ASM(XRGB8888, XRGB1555); DECLARE_RGB_ASM_MMX(XRGB8888, XRGB1555);
+ DECLARE_RGB_ASM(XRGB1555, RGB565); DECLARE_RGB_ASM_MMX(XRGB1555, RGB565);
+ DECLARE_RGB_ASM(RGB888, RGB565);
+ DECLARE_RGB_ASM(XRGB8888, RGB565); DECLARE_RGB_ASM_MMX(XRGB8888, RGB565);
+DECLARE_RGB(XRGB1555, RGB888);
+DECLARE_RGB(RGB565, RGB888);
+ DECLARE_RGB_ASM(XRGB8888, RGB888); DECLARE_RGB_ASM_MMX(XRGB8888, RGB888);
+ DECLARE_RGB_ASM(XRGB1555, XRGB8888); DECLARE_RGB_ASM_MMX(XRGB1555, XRGB8888);
+ DECLARE_RGB_ASM(RGB565, XRGB8888); DECLARE_RGB_ASM_MMX(RGB565, XRGB8888);
+ DECLARE_RGB_ASM(RGB888, XRGB8888); DECLARE_RGB_ASM_MMX(RGB888, XRGB8888);
+
+DECLARE_PALETTED(Pal1, Any8);
+DECLARE_PALETTED(Pal1, Any16);
+DECLARE_PALETTED(Pal1, Any24);
+DECLARE_PALETTED(Pal1, Any32);
+DECLARE_PALETTED(Pal2, Any8);
+DECLARE_PALETTED(Pal2, Any16);
+DECLARE_PALETTED(Pal2, Any24);
+DECLARE_PALETTED(Pal2, Any32);
+DECLARE_PALETTED(Pal4, Any8);
+DECLARE_PALETTED(Pal4, Any16);
+DECLARE_PALETTED(Pal4, Any24);
+DECLARE_PALETTED(Pal4, Any32);
+DECLARE_PALETTED(Pal8, Any8);
+DECLARE_PALETTED(Pal8, Any16);
+DECLARE_PALETTED(Pal8, Any24);
+DECLARE_PALETTED(Pal8, Any32);
+
+DECLARE_YUV(XVYU, UYVY);
+DECLARE_YUV(XVYU, YUYV);
+DECLARE_YUV(Y8, UYVY);
+DECLARE_YUV(Y8, YUYV);
+DECLARE_YUV(UYVY, Y8);
+DECLARE_YUV(YUYV, Y8);
+DECLARE_YUV(UYVY, YUYV);
+DECLARE_YUV_PLANAR(YUV411, YV12);
+
+DECLARE_YUV(UYVY, XRGB1555);
+DECLARE_YUV(UYVY, RGB565);
+DECLARE_YUV(UYVY, RGB888);
+DECLARE_YUV(UYVY, XRGB8888);
+DECLARE_YUV(YUYV, XRGB1555);
+DECLARE_YUV(YUYV, RGB565);
+DECLARE_YUV(YUYV, RGB888);
+DECLARE_YUV(YUYV, XRGB8888);
+DECLARE_YUV(Y8, XRGB1555);
+DECLARE_YUV(Y8, RGB565);
+DECLARE_YUV(Y8, RGB888);
+DECLARE_YUV(Y8, XRGB8888);
+
+DECLARE_YUV_REV(XRGB1555, Y8);
+DECLARE_YUV_REV(RGB565, Y8);
+DECLARE_YUV_REV(RGB888, Y8);
+DECLARE_YUV_REV(XRGB8888, Y8);
+
+DECLARE_YUV_REV(XRGB1555, XVYU);
+DECLARE_YUV_REV(RGB565, XVYU);
+DECLARE_YUV_REV(RGB888, XVYU);
+DECLARE_YUV_REV(XRGB8888, XVYU);
+
+DECLARE_YUV_PLANAR(YV12, XRGB1555);
+DECLARE_YUV_PLANAR(YV12, RGB565);
+DECLARE_YUV_PLANAR(YV12, RGB888);
+DECLARE_YUV_PLANAR(YV12, XRGB8888);
+
+DECLARE_YUV_PLANAR(YUV411, XRGB1555);
+DECLARE_YUV_PLANAR(YUV411, RGB565);
+DECLARE_YUV_PLANAR(YUV411, RGB888);
+DECLARE_YUV_PLANAR(YUV411, XRGB8888);
+
+extern void VDPixmapBlt_YUVPlanar_decode_reference(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h);
+extern void VDPixmapBlt_YUVPlanar_encode_reference(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h);
+extern void VDPixmapBlt_YUVPlanar_convert_reference(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h);
+
+using namespace nsVDPixmap;
+
+// Populates 'table' for x86 scalar operation: starts from the portable
+// reference blitters, then overrides the RGB<->RGB conversions with the
+// scalar assembler versions declared above.
+void VDPixmapInitBlittersX86(VDPixmapBlitterTable& table) {
+	VDPixmapInitBlittersReference(table);
+
+	table.AddBlitter(kPixFormat_XRGB1555, kPixFormat_RGB565, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_XRGB1555_to_RGB565>);
+	table.AddBlitter(kPixFormat_XRGB1555, kPixFormat_XRGB8888, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_XRGB1555_to_XRGB8888>);
+	table.AddBlitter(kPixFormat_RGB565, kPixFormat_XRGB1555, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_RGB565_to_XRGB1555>);
+	table.AddBlitter(kPixFormat_RGB565, kPixFormat_XRGB8888, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_RGB565_to_XRGB8888>);
+	table.AddBlitter(kPixFormat_RGB888, kPixFormat_XRGB1555, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_RGB888_to_XRGB1555>);
+	table.AddBlitter(kPixFormat_RGB888, kPixFormat_RGB565, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_RGB888_to_RGB565>);
+	table.AddBlitter(kPixFormat_RGB888, kPixFormat_XRGB8888, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_RGB888_to_XRGB8888>);
+	table.AddBlitter(kPixFormat_XRGB8888, kPixFormat_XRGB1555, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_XRGB8888_to_XRGB1555>);
+	table.AddBlitter(kPixFormat_XRGB8888, kPixFormat_RGB565, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_XRGB8888_to_RGB565>);
+	table.AddBlitter(kPixFormat_XRGB8888, kPixFormat_RGB888, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_XRGB8888_to_RGB888>);
+}
+
+// Builds (once per call) and returns the scalar x86 blitter table.
+// NOTE(review): the static table is re-initialized on every invocation;
+// callers cache the result behind a function-local static, so this is only
+// an issue if called directly and concurrently — confirm single-threaded use.
+tpVDPixBltTable VDGetPixBltTableX86ScalarInternal() {
+	static VDPixmapBlitterTable sReferenceTable;
+
+	VDPixmapInitBlittersX86(sReferenceTable);
+
+	return sReferenceTable.mTable;
+}
+
+// Builds the MMX blitter table: scalar x86 entries first, then the
+// RGB<->RGB conversions overridden with MMX assembler versions.
+// (RGB888->XRGB1555 and RGB888->RGB565 have no MMX variant and keep
+// the scalar assembler entries.)
+tpVDPixBltTable VDGetPixBltTableX86MMXInternal() {
+	static VDPixmapBlitterTable sReferenceTable;
+
+	VDPixmapInitBlittersX86(sReferenceTable);
+
+	sReferenceTable.AddBlitter(kPixFormat_XRGB1555, kPixFormat_RGB565, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_XRGB1555_to_RGB565_MMX>);
+	sReferenceTable.AddBlitter(kPixFormat_XRGB1555, kPixFormat_XRGB8888, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_XRGB1555_to_XRGB8888_MMX>);
+	sReferenceTable.AddBlitter(kPixFormat_RGB565, kPixFormat_XRGB1555, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_RGB565_to_XRGB1555_MMX>);
+	sReferenceTable.AddBlitter(kPixFormat_RGB565, kPixFormat_XRGB8888, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_RGB565_to_XRGB8888_MMX>);
+	sReferenceTable.AddBlitter(kPixFormat_RGB888, kPixFormat_XRGB8888, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_RGB888_to_XRGB8888_MMX>);
+	sReferenceTable.AddBlitter(kPixFormat_XRGB8888, kPixFormat_XRGB1555, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_XRGB8888_to_XRGB1555_MMX>);
+	sReferenceTable.AddBlitter(kPixFormat_XRGB8888, kPixFormat_RGB565, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_XRGB8888_to_RGB565_MMX>);
+	sReferenceTable.AddBlitter(kPixFormat_XRGB8888, kPixFormat_RGB888, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_XRGB8888_to_RGB888_MMX>);
+
+	return sReferenceTable.mTable;
+}
+
+// Returns the lazily-initialized scalar x86 blit table. Initialization
+// happens on first call via the function-local static (pre-C++11: not
+// guaranteed thread-safe).
+tpVDPixBltTable VDGetPixBltTableX86Scalar() {
+	static tpVDPixBltTable spTable = VDGetPixBltTableX86ScalarInternal();
+
+	return spTable;
+}
+
+// Returns the lazily-initialized MMX blit table. Initialization happens on
+// first call via the function-local static (pre-C++11: not guaranteed
+// thread-safe).
+tpVDPixBltTable VDGetPixBltTableX86MMX() {
+	static tpVDPixBltTable spTable = VDGetPixBltTableX86MMXInternal();
+
+	return spTable;
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/pixel.cpp b/src/thirdparty/VirtualDub/Kasumi/source/pixel.cpp
new file mode 100644
index 000000000..45797ca4b
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/pixel.cpp
@@ -0,0 +1,667 @@
+// VirtualDub - Video processing and capture application
+// Graphics support library
+// Copyright (C) 1998-2007 Avery Lee
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+#include <vd2/system/math.h>
+#include <vd2/system/halffloat.h>
+#include <vd2/Kasumi/pixmap.h>
+#include <vd2/Kasumi/pixel.h>
+
+// Point-samples one pixel from a pixmap, clamping (x, y) to the image
+// bounds, and returns it as packed 0x00RRGGBB. Palettized formats are
+// looked up in px.palette; planar YCbCr formats convert the interpolated
+// chroma via VDConvertYCbCrToRGB; all remaining formats fall back to the
+// bilinear RGB24 sampler at this pixel's center.
+uint32 VDPixmapSample(const VDPixmap& px, sint32 x, sint32 y) {
+	if (x >= px.w)
+		x = px.w - 1;
+	if (y >= px.h)
+		y = px.h - 1;
+	if (x < 0)
+		x = 0;
+	if (y < 0)
+		y = 0;
+
+	switch(px.format) {
+	case nsVDPixmap::kPixFormat_Pal1:
+		{
+			uint8 idx = ((const uint8 *)px.data + px.pitch*y)[x >> 3];
+
+			// bits are packed MSB-first within each byte
+			return px.palette[(idx >> (7 - (x & 7))) & 1];
+		}
+
+	case nsVDPixmap::kPixFormat_Pal2:
+		{
+			uint8 idx = ((const uint8 *)px.data + px.pitch*y)[x >> 2];
+
+			return px.palette[(idx >> (6 - (x & 3)*2)) & 3];
+		}
+
+	case nsVDPixmap::kPixFormat_Pal4:
+		{
+			uint8 idx = ((const uint8 *)px.data + px.pitch*y)[x >> 1];
+
+			// even pixel sits in the high nibble
+			if (!(x & 1))
+				idx >>= 4;
+
+			return px.palette[idx & 15];
+		}
+
+	case nsVDPixmap::kPixFormat_Pal8:
+		{
+			uint8 idx = ((const uint8 *)px.data + px.pitch*y)[x];
+
+			return px.palette[idx];
+		}
+
+	case nsVDPixmap::kPixFormat_XRGB1555:
+		{
+			uint16 c = ((const uint16 *)((const uint8 *)px.data + px.pitch*y))[x];
+			uint32 r = c & 0x7c00;
+			uint32 g = c & 0x03e0;
+			uint32 b = c & 0x001f;
+			uint32 rgb = (r << 9) + (g << 6) + (b << 3);
+
+			// replicate top 3 bits into the bottom 3 to span the full 8-bit range
+			return rgb + ((rgb >> 5) & 0x070707);
+		}
+		break;
+
+	case nsVDPixmap::kPixFormat_RGB565:
+		{
+			uint16 c = ((const uint16 *)((const uint8 *)px.data + px.pitch*y))[x];
+			uint32 r = c & 0xf800;
+			uint32 g = c & 0x07e0;
+			uint32 b = c & 0x001f;
+			uint32 rb = (r << 8) + (b << 3);
+
+			// red/blue replicate 3 bits, green replicates 2 (it has 6 source bits)
+			return rb + ((rb >> 5) & 0x070007) + (g << 5) + ((g >> 1) & 0x0300);
+		}
+		break;
+
+	case nsVDPixmap::kPixFormat_RGB888:
+		{
+			// byte order in memory is B, G, R
+			const uint8 *src = (const uint8 *)px.data + px.pitch*y + 3*x;
+			uint32 b = src[0];
+			uint32 g = src[1];
+			uint32 r = src[2];
+
+			return (r << 16) + (g << 8) + b;
+		}
+		break;
+
+	case nsVDPixmap::kPixFormat_XRGB8888:
+		return ((const uint32 *)((const uint8 *)px.data + px.pitch*y))[x];
+
+	case nsVDPixmap::kPixFormat_Y8:
+		{
+			uint8 luma = ((const uint8 *)px.data + px.pitch*y)[x];
+
+			// expand studio swing [16, 235] to full range and splat to gray
+			// NOTE(review): luma outside [16, 235] under/overflows the 8-bit
+			// channel here — confirm inputs are always studio swing
+			return ((luma - 16)*255/219) * 0x010101;
+		}
+		break;
+
+	case nsVDPixmap::kPixFormat_YUV444_Planar:
+		// chroma is full resolution: sample all three planes directly
+		return VDConvertYCbCrToRGB(VDPixmapSample8(px.data, px.pitch, x, y), VDPixmapSample8(px.data2, px.pitch2, x, y), VDPixmapSample8(px.data3, px.pitch3, x, y));
+
+	case nsVDPixmap::kPixFormat_YUV422_Planar:
+		{
+			// chroma: half horizontal resolution, full vertical (coords in 24.8)
+			// NOTE(review): v has no +128 center bias, unlike u — confirm intended
+			sint32 u = (x << 7) + 128;
+			sint32 v = (y << 8);
+			uint32 w2 = px.w >> 1;
+			uint32 h2 = px.h;
+
+			return VDConvertYCbCrToRGB(
+				VDPixmapSample8(px.data, px.pitch, x, y),
+				VDPixmapInterpolateSample8(px.data2, px.pitch2, w2, h2, u, v),
+				VDPixmapInterpolateSample8(px.data3, px.pitch3, w2, h2, u, v));
+		}
+
+	case nsVDPixmap::kPixFormat_YUV420_Planar:
+		{
+			// chroma: half resolution in both axes
+			sint32 u = (x << 7) + 128;
+			sint32 v = (y << 7);
+			uint32 w2 = px.w >> 1;
+			uint32 h2 = px.h >> 1;
+
+			return VDConvertYCbCrToRGB(
+				VDPixmapSample8(px.data, px.pitch, x, y),
+				VDPixmapInterpolateSample8(px.data2, px.pitch2, w2, h2, u, v),
+				VDPixmapInterpolateSample8(px.data3, px.pitch3, w2, h2, u, v));
+		}
+
+	case nsVDPixmap::kPixFormat_YUV411_Planar:
+		{
+			// chroma: quarter horizontal resolution, full vertical
+			sint32 u = (x << 6) + 128;
+			sint32 v = (y << 8);
+			uint32 w2 = px.w >> 2;
+			uint32 h2 = px.h;
+
+			return VDConvertYCbCrToRGB(
+				VDPixmapSample8(px.data, px.pitch, x, y),
+				VDPixmapInterpolateSample8(px.data2, px.pitch2, w2, h2, u, v),
+				VDPixmapInterpolateSample8(px.data3, px.pitch3, w2, h2, u, v));
+		}
+
+	case nsVDPixmap::kPixFormat_YUV410_Planar:
+		{
+			// chroma: quarter resolution in both axes
+			sint32 u = (x << 6) + 128;
+			sint32 v = (y << 6);
+			uint32 w2 = px.w >> 2;
+			uint32 h2 = px.h >> 2;
+
+			return VDConvertYCbCrToRGB(
+				VDPixmapSample8(px.data, px.pitch, x, y),
+				VDPixmapInterpolateSample8(px.data2, px.pitch2, w2, h2, u, v),
+				VDPixmapInterpolateSample8(px.data3, px.pitch3, w2, h2, u, v));
+		}
+
+	default:
+		// other formats: bilinear-sample at the center of pixel (x, y)
+		return VDPixmapInterpolateSampleRGB24(px, (x << 8) + 128, (y << 8) + 128);
+	}
+}
+
+// Bilinearly samples an 8-bit plane at 24.8 fixed-point coordinates
+// (128 = center of pixel 0), clamping to the plane edges, and returns the
+// rounded 8-bit result.
+uint8 VDPixmapInterpolateSample8(const void *data, ptrdiff_t pitch, uint32 w, uint32 h, sint32 x_256, sint32 y_256) {
+	// bias coordinates to integer
+	x_256 -= 128;
+	y_256 -= 128;
+
+	// clamp coordinates (branchless clamp to [0, (w-1)<<8] x [0, (h-1)<<8])
+	x_256 &= ~(x_256 >> 31);
+	y_256 &= ~(y_256 >> 31);
+
+	uint32 w_256 = (w - 1) << 8;
+	uint32 h_256 = (h - 1) << 8;
+	x_256 ^= (x_256 ^ w_256) & ((x_256 - w_256) >> 31);
+	y_256 ^= (y_256 ^ h_256) & ((y_256 - h_256) >> 31);
+
+	// Fix: the horizontal integer offset (x_256 >> 8) was missing here, so
+	// the sample was always taken from columns 0/1 regardless of x_256
+	// (compare the VDPixmapInterpolateSample8*To24 variants below).
+	const uint8 *row0 = (const uint8 *)data + pitch * (y_256 >> 8) + (x_256 >> 8);
+	const uint8 *row1 = row0;
+
+	if ((uint32)y_256 < h_256)
+		row1 += pitch;
+
+	// step is 0 on the last column so the right edge is not overread
+	ptrdiff_t xstep = (uint32)x_256 < w_256 ? 1 : 0;
+	sint32 xoffset = x_256 & 255;
+	sint32 yoffset = y_256 & 255;
+	sint32 p00 = row0[0];
+	sint32 p10 = row0[xstep];
+	sint32 p01 = row1[0];
+	sint32 p11 = row1[xstep];
+	sint32 p0 = (p00 << 8) + (p10 - p00)*xoffset;
+	sint32 p1 = (p01 << 8) + (p11 - p01)*xoffset;
+	sint32 p = ((p0 << 8) + (p1 - p0)*yoffset + 0x8000) >> 16;	// +0x8000 rounds 16.16 -> int
+
+	return (uint8)p;
+}
+
+// Bilinearly samples an 8-bit plane at 24.8 fixed-point coordinates
+// (128 = pixel center), clamping to the edges, and returns the sample in
+// 16.16 fixed point (0..255 << 16) without rounding.
+uint32 VDPixmapInterpolateSample8To24(const void *data, ptrdiff_t pitch, uint32 w, uint32 h, sint32 x_256, sint32 y_256) {
+	// bias coordinates to integer
+	x_256 -= 128;
+	y_256 -= 128;
+
+	// clamp coordinates (branchless clamp to [0, (w-1)<<8] x [0, (h-1)<<8])
+	x_256 &= ~(x_256 >> 31);
+	y_256 &= ~(y_256 >> 31);
+
+	uint32 w_256 = (w - 1) << 8;
+	uint32 h_256 = (h - 1) << 8;
+	x_256 ^= (x_256 ^ w_256) & ((x_256 - w_256) >> 31);
+	y_256 ^= (y_256 ^ h_256) & ((y_256 - h_256) >> 31);
+
+	const uint8 *row0 = (const uint8 *)data + pitch * (y_256 >> 8) + (x_256 >> 8);
+	const uint8 *row1 = row0;
+
+	if ((uint32)y_256 < h_256)
+		row1 += pitch;
+
+	// step is 0 on the last column/row so the edges are not overread
+	ptrdiff_t xstep = (uint32)x_256 < w_256 ? 1 : 0;
+	sint32 xoffset = x_256 & 255;
+	sint32 yoffset = y_256 & 255;
+	sint32 p00 = row0[0];
+	sint32 p10 = row0[xstep];
+	sint32 p01 = row1[0];
+	sint32 p11 = row1[xstep];
+	sint32 p0 = (p00 << 8) + (p10 - p00)*xoffset;
+	sint32 p1 = (p01 << 8) + (p11 - p01)*xoffset;
+	sint32 p = (p0 << 8) + (p1 - p0)*yoffset;
+
+	return p;
+}
+
+// Same as VDPixmapInterpolateSample8To24, but for 8-bit samples interleaved
+// two bytes apart (e.g. the luma bytes of a YUYV/UYVY surface). Returns the
+// sample in 16.16 fixed point without rounding.
+uint32 VDPixmapInterpolateSample8x2To24(const void *data, ptrdiff_t pitch, uint32 w, uint32 h, sint32 x_256, sint32 y_256) {
+	// bias coordinates to integer
+	x_256 -= 128;
+	y_256 -= 128;
+
+	// clamp coordinates (branchless clamp to [0, (w-1)<<8] x [0, (h-1)<<8])
+	x_256 &= ~(x_256 >> 31);
+	y_256 &= ~(y_256 >> 31);
+
+	uint32 w_256 = (w - 1) << 8;
+	uint32 h_256 = (h - 1) << 8;
+	x_256 ^= (x_256 ^ w_256) & ((x_256 - w_256) >> 31);
+	y_256 ^= (y_256 ^ h_256) & ((y_256 - h_256) >> 31);
+
+	// *2: two bytes per horizontal sample
+	const uint8 *row0 = (const uint8 *)data + pitch * (y_256 >> 8) + (x_256 >> 8)*2;
+	const uint8 *row1 = row0;
+
+	if ((uint32)y_256 < h_256)
+		row1 += pitch;
+
+	ptrdiff_t xstep = (uint32)x_256 < w_256 ? 2 : 0;
+	sint32 xoffset = x_256 & 255;
+	sint32 yoffset = y_256 & 255;
+	sint32 p00 = row0[0];
+	sint32 p10 = row0[xstep];
+	sint32 p01 = row1[0];
+	sint32 p11 = row1[xstep];
+	sint32 p0 = (p00 << 8) + (p10 - p00)*xoffset;
+	sint32 p1 = (p01 << 8) + (p11 - p01)*xoffset;
+	sint32 p = (p0 << 8) + (p1 - p0)*yoffset;
+
+	return p;
+}
+
+// Same as VDPixmapInterpolateSample8To24, but for 8-bit samples interleaved
+// four bytes apart (e.g. one channel of an XRGB8888/XVYU surface, or the
+// chroma bytes of a YUYV/UYVY surface). Returns 16.16 fixed point.
+uint32 VDPixmapInterpolateSample8x4To24(const void *data, ptrdiff_t pitch, uint32 w, uint32 h, sint32 x_256, sint32 y_256) {
+	// bias coordinates to integer
+	x_256 -= 128;
+	y_256 -= 128;
+
+	// clamp coordinates (branchless clamp to [0, (w-1)<<8] x [0, (h-1)<<8])
+	x_256 &= ~(x_256 >> 31);
+	y_256 &= ~(y_256 >> 31);
+
+	uint32 w_256 = (w - 1) << 8;
+	uint32 h_256 = (h - 1) << 8;
+	x_256 ^= (x_256 ^ w_256) & ((x_256 - w_256) >> 31);
+	y_256 ^= (y_256 ^ h_256) & ((y_256 - h_256) >> 31);
+
+	// *4: four bytes per horizontal sample
+	const uint8 *row0 = (const uint8 *)data + pitch * (y_256 >> 8) + (x_256 >> 8)*4;
+	const uint8 *row1 = row0;
+
+	if ((uint32)y_256 < h_256)
+		row1 += pitch;
+
+	ptrdiff_t xstep = (uint32)x_256 < w_256 ? 4 : 0;
+	sint32 xoffset = x_256 & 255;
+	sint32 yoffset = y_256 & 255;
+	sint32 p00 = row0[0];
+	sint32 p10 = row0[xstep];
+	sint32 p01 = row1[0];
+	sint32 p11 = row1[xstep];
+	sint32 p0 = (p00 << 8) + (p10 - p00)*xoffset;
+	sint32 p1 = (p01 << 8) + (p11 - p01)*xoffset;
+	sint32 p = (p0 << 8) + (p1 - p0)*yoffset;
+
+	return p;
+}
+
+// Bilinearly samples a plane of 16-bit half-float values at 24.8 fixed-point
+// coordinates, clamping to the edges, and returns the result as float.
+// NOTE(review): the fractional weight is scaled by 1/255 rather than 1/256,
+// giving a slight upward bias versus the integer paths — confirm intended.
+float VDPixmapInterpolateSample16F(const void *data, ptrdiff_t pitch, uint32 w, uint32 h, sint32 x_256, sint32 y_256) {
+	// bias coordinates to integer
+	x_256 -= 128;
+	y_256 -= 128;
+
+	// clamp coordinates (branchless clamp to [0, (w-1)<<8] x [0, (h-1)<<8])
+	x_256 &= ~(x_256 >> 31);
+	y_256 &= ~(y_256 >> 31);
+
+	uint32 w_256 = (w - 1) << 8;
+	uint32 h_256 = (h - 1) << 8;
+	x_256 ^= (x_256 ^ w_256) & ((x_256 - w_256) >> 31);
+	y_256 ^= (y_256 ^ h_256) & ((y_256 - h_256) >> 31);
+
+	// *2: two bytes per half-float sample
+	const uint16 *row0 = (const uint16 *)((const uint8 *)data + pitch * (y_256 >> 8) + (x_256 >> 8)*2);
+	const uint16 *row1 = row0;
+
+	if ((uint32)y_256 < h_256)
+		row1 = (const uint16 *)((const char *)row1 + pitch);
+
+	ptrdiff_t xstep = (uint32)x_256 < w_256 ? 1 : 0;
+	float xoffset = (float)(x_256 & 255) * (1.0f / 255.0f);
+	float yoffset = (float)(y_256 & 255) * (1.0f / 255.0f);
+
+	float p00;
+	float p10;
+	float p01;
+	float p11;
+	VDConvertHalfToFloat(row0[0], &p00);
+	VDConvertHalfToFloat(row0[xstep], &p10);
+	VDConvertHalfToFloat(row1[0], &p01);
+	VDConvertHalfToFloat(row1[xstep], &p11);
+
+	float p0 = p00 + (p10 - p00)*xoffset;
+	float p1 = p01 + (p11 - p01)*xoffset;
+
+	return p0 + (p1 - p0)*yoffset;
+}
+
+// Internal helpers for VDPixmapInterpolateSampleRGB24.
+namespace {
+	// Bilinearly blends four packed 8888 pixels with 8-bit fractions xf/yf,
+	// processing the red/blue and alpha/green channel pairs in parallel.
+	// p0/p1 are the top row, p2/p3 the bottom row.
+	uint32 Lerp8888(uint32 p0, uint32 p1, uint32 p2, uint32 p3, uint32 xf, uint32 yf) {
+		uint32 rb0 = p0 & 0x00ff00ff;
+		uint32 ag0 = p0 & 0xff00ff00;
+		uint32 rb1 = p1 & 0x00ff00ff;
+		uint32 ag1 = p1 & 0xff00ff00;
+		uint32 rb2 = p2 & 0x00ff00ff;
+		uint32 ag2 = p2 & 0xff00ff00;
+		uint32 rb3 = p3 & 0x00ff00ff;
+		uint32 ag3 = p3 & 0xff00ff00;
+
+		// horizontal lerp on each row, then vertical lerp between rows;
+		// +0x00800080 rounds each channel
+		uint32 rbt = (rb0 + ((( rb1 - rb0 )*xf + 0x00800080) >> 8)) & 0x00ff00ff;
+		uint32 agt = (ag0 + ((((ag1 >> 8) - (ag0 >> 8))*xf + 0x00800080) )) & 0xff00ff00;
+		uint32 rbb = (rb2 + ((( rb3 - rb2 )*xf + 0x00800080) >> 8)) & 0x00ff00ff;
+		uint32 agb = (ag2 + ((((ag3 >> 8) - (ag2 >> 8))*xf + 0x00800080) )) & 0xff00ff00;
+		uint32 rb = (rbt + ((( rbb - rbt )*yf + 0x00800080) >> 8)) & 0x00ff00ff;
+		uint32 ag = (agt + ((((agb >> 8) - (agt >> 8))*yf + 0x00800080) )) & 0xff00ff00;
+
+		return rb + ag;
+	}
+
+	// Bilinearly samples a Y8 plane at (x1, y1) in 24.8, rescales studio
+	// swing to full range (0x100000 = 16<<16), and splats to gray 0xRRGGBB.
+	uint32 InterpPlanarY8(const VDPixmap& px, sint32 x1, sint32 y1) {
+		sint32 y = VDPixmapInterpolateSample8To24(px.data, px.pitch, px.w, px.h, x1, y1);
+
+		return VDClampedRoundFixedToUint8Fast((float)(y-0x100000) * (1.1643836f/65536.0f/255.0f))*0x010101;
+	}
+
+	// Bilinearly samples a planar YCbCr surface (luma at x1/y1, chroma at
+	// x23/y23 over a w23 x h23 plane) and converts to 0xRRGGBB via the
+	// Rec.601 matrix below (coefficients in the comment).
+	uint32 InterpPlanarYCC888(const VDPixmap& px, sint32 x1, sint32 y1, sint32 x23, sint32 y23, uint32 w23, uint32 h23) {
+		float y = (float)(sint32)VDPixmapInterpolateSample8To24(px.data, px.pitch, px.w, px.h, x1, y1);
+		float cb = (float)(sint32)VDPixmapInterpolateSample8To24(px.data2, px.pitch2, w23, h23, x23, y23);
+		float cr = (float)(sint32)VDPixmapInterpolateSample8To24(px.data3, px.pitch3, w23, h23, x23, y23);
+
+		// ! 1.1643836 - 5.599D-17 1.5960268 - 222.92157 !
+		// ! 1.1643836 - 0.3917623 - 0.8129676 135.57529 !
+		// ! 1.1643836 2.0172321 - 1.110D-16 - 276.83585 !
+		uint32 ir = VDClampedRoundFixedToUint8Fast((1.1643836f/65536.0f/255.0f)*y + (1.5960268f/65536.0f/255.0f)*cr - (222.92157f / 255.0f));
+		uint32 ig = VDClampedRoundFixedToUint8Fast((1.1643836f/65536.0f/255.0f)*y - (0.3917623f/65536.0f/255.0f)*cb - (0.8129676f/65536.0f/255.0f)*cr + (135.57529f / 255.0f));
+		uint32 ib = VDClampedRoundFixedToUint8Fast((1.1643836f/65536.0f/255.0f)*y + (2.0172321f/65536.0f/255.0f)*cb - (276.83585f / 255.0f));
+
+		return (ir << 16) + (ig << 8) + ib;
+	}
+
+	// Converts a YCbCr triplet in 16.16-scaled 8-bit range (as returned by
+	// the *To24 samplers) to packed 0xRRGGBB using the Rec.601 matrix.
+	uint32 ConvertYCC72ToRGB24(sint32 iy, sint32 icb, sint32 icr) {
+		float y = (float)iy;
+		float cb = (float)icb;
+		float cr = (float)icr;
+
+		// ! 1.1643836 - 5.599D-17 1.5960268 - 222.92157 !
+		// ! 1.1643836 - 0.3917623 - 0.8129676 135.57529 !
+		// ! 1.1643836 2.0172321 - 1.110D-16 - 276.83585 !
+		uint32 ir = VDClampedRoundFixedToUint8Fast((1.1643836f/65536.0f/255.0f)*y + (1.5960268f/65536.0f/255.0f)*cr - (222.92157f / 255.0f));
+		uint32 ig = VDClampedRoundFixedToUint8Fast((1.1643836f/65536.0f/255.0f)*y - (0.3917623f/65536.0f/255.0f)*cb - (0.8129676f/65536.0f/255.0f)*cr + (135.57529f / 255.0f));
+		uint32 ib = VDClampedRoundFixedToUint8Fast((1.1643836f/65536.0f/255.0f)*y + (2.0172321f/65536.0f/255.0f)*cb - (276.83585f / 255.0f));
+
+		return (ir << 16) + (ig << 8) + ib;
+	}
+
+	// Same as ConvertYCC72ToRGB24, but with the Rec.709 matrix.
+	uint32 ConvertYCC72ToRGB24_709(sint32 iy, sint32 icb, sint32 icr) {
+		float y = (float)iy;
+		float cb = (float)icb;
+		float cr = (float)icr;
+
+		// ! 1.1643836 - 2.932D-17 1.7927411 - 248.10099 !
+		// ! 1.1643836 - 0.2132486 - 0.5329093 76.87808 !
+		// ! 1.1643836 2.1124018 - 5.551D-17 - 289.01757 !
+		uint32 ir = VDClampedRoundFixedToUint8Fast((1.1643836f/65536.0f/255.0f)*y + (1.7927411f/65536.0f/255.0f)*cr - (248.10099f / 255.0f));
+		uint32 ig = VDClampedRoundFixedToUint8Fast((1.1643836f/65536.0f/255.0f)*y - (0.2132486f/65536.0f/255.0f)*cb - (0.5329093f/65536.0f/255.0f)*cr + (76.87808f / 255.0f));
+		uint32 ib = VDClampedRoundFixedToUint8Fast((1.1643836f/65536.0f/255.0f)*y + (2.1124018f/65536.0f/255.0f)*cb - (289.01757f / 255.0f));
+
+		return (ir << 16) + (ig << 8) + ib;
+	}
+
+	// Extracts one 10-bit luma sample from a v210 surface (6 luma samples
+	// packed into each group of four dwords). w/h are unused; x/y must be
+	// pre-clamped by the caller.
+	uint32 SampleV210_Y(const void *src, ptrdiff_t srcpitch, sint32 x, sint32 y, uint32 w, uint32 h) {
+		const uint32 *p = (const uint32 *)((const char *)src + srcpitch*y) + (x / 6)*4;
+
+		switch((uint32)x % 6) {
+		default:
+		case 0:	return (p[0] >> 10) & 0x3ff;
+		case 1:	return (p[1] >>  0) & 0x3ff;
+		case 2:	return (p[1] >> 20) & 0x3ff;
+		case 3:	return (p[2] >> 10) & 0x3ff;
+		case 4:	return (p[3] >>  0) & 0x3ff;
+		case 5:	return (p[3] >> 20) & 0x3ff;
+		}
+	}
+
+	// Extracts one 10-bit Cb sample from a v210 surface (3 chroma pairs per
+	// four-dword group). x is in chroma samples (half luma resolution).
+	uint32 SampleV210_Cb(const void *src, ptrdiff_t srcpitch, sint32 x, sint32 y, uint32 w, uint32 h) {
+		const uint32 *p = (const uint32 *)((const char *)src + srcpitch*y) + (x / 3)*4;
+
+		switch((uint32)x % 3) {
+		default:
+		case 0:	return (p[0] >>  0) & 0x3ff;
+		case 1:	return (p[1] >> 10) & 0x3ff;
+		case 2:	return (p[2] >> 20) & 0x3ff;
+		}
+	}
+
+	// Extracts one 10-bit Cr sample from a v210 surface.
+	uint32 SampleV210_Cr(const void *src, ptrdiff_t srcpitch, sint32 x, sint32 y, uint32 w, uint32 h) {
+		const uint32 *p = (const uint32 *)((const char *)src + srcpitch*y) + (x / 3)*4;
+
+		switch((uint32)x % 3) {
+		default:
+		case 0:	return (p[0] >> 20) & 0x3ff;
+		case 1:	return (p[2] >>  0) & 0x3ff;
+		case 2:	return (p[3] >> 10) & 0x3ff;
+		}
+	}
+}
+
+// Bilinearly samples any supported pixmap format at 24.8 fixed-point
+// coordinates (128 = center of pixel 0) and returns packed 0x00RRGGBB.
+// RGB/paletted formats interpolate four point samples; YCbCr formats
+// interpolate luma and chroma at their native resolutions and convert
+// via the Rec.601 (or Rec.709, where named) matrix. Unsupported formats
+// return 0 (black).
+uint32 VDPixmapInterpolateSampleRGB24(const VDPixmap& px, sint32 x_256, sint32 y_256) {
+	switch(px.format) {
+	case nsVDPixmap::kPixFormat_Pal1:
+	case nsVDPixmap::kPixFormat_Pal2:
+	case nsVDPixmap::kPixFormat_Pal4:
+	case nsVDPixmap::kPixFormat_Pal8:
+	case nsVDPixmap::kPixFormat_RGB565:
+	case nsVDPixmap::kPixFormat_RGB888:
+	case nsVDPixmap::kPixFormat_XRGB1555:
+	case nsVDPixmap::kPixFormat_XRGB8888:
+		{
+			// take four clamped point samples and blend them in 8888 space
+			x_256 -= 128;
+			y_256 -= 128;
+			int ix = x_256 >> 8;
+			int iy = y_256 >> 8;
+			uint32 p0 = VDPixmapSample(px, ix, iy);
+			uint32 p1 = VDPixmapSample(px, ix+1, iy);
+			uint32 p2 = VDPixmapSample(px, ix, iy+1);
+			uint32 p3 = VDPixmapSample(px, ix+1, iy+1);
+
+			return Lerp8888(p0, p1, p2, p3, x_256 & 255, y_256 & 255);
+		}
+		break;
+
+	case nsVDPixmap::kPixFormat_Y8:
+		return InterpPlanarY8(px, x_256, y_256);
+
+	case nsVDPixmap::kPixFormat_YUV422_UYVY:
+		// UYVY byte order: U0 Y0 V0 Y1 — luma every 2 bytes at +1,
+		// chroma every 4 bytes at +0/+2, half horizontal resolution
+		return ConvertYCC72ToRGB24(
+			VDPixmapInterpolateSample8x2To24((const char *)px.data + 1, px.pitch, px.w, px.h, x_256, y_256),
+			VDPixmapInterpolateSample8x4To24((const char *)px.data + 0, px.pitch, (px.w + 1) >> 1, px.h, (x_256 >> 1) + 128, y_256),
+			VDPixmapInterpolateSample8x4To24((const char *)px.data + 2, px.pitch, (px.w + 1) >> 1, px.h, (x_256 >> 1) + 128, y_256)
+			);
+
+	case nsVDPixmap::kPixFormat_YUV422_YUYV:
+		// YUYV byte order: Y0 U0 Y1 V0
+		return ConvertYCC72ToRGB24(
+			VDPixmapInterpolateSample8x2To24((const char *)px.data + 0, px.pitch, px.w, px.h, x_256, y_256),
+			VDPixmapInterpolateSample8x4To24((const char *)px.data + 1, px.pitch, (px.w + 1) >> 1, px.h, (x_256 >> 1) + 128, y_256),
+			VDPixmapInterpolateSample8x4To24((const char *)px.data + 3, px.pitch, (px.w + 1) >> 1, px.h, (x_256 >> 1) + 128, y_256)
+			);
+
+	case nsVDPixmap::kPixFormat_YUV444_XVYU:
+		// packed 4:4:4 — all channels at full resolution, 4 bytes/pixel
+		return ConvertYCC72ToRGB24(
+			VDPixmapInterpolateSample8x4To24((const char *)px.data + 1, px.pitch, px.w, px.h, x_256, y_256),
+			VDPixmapInterpolateSample8x4To24((const char *)px.data + 0, px.pitch, px.w, px.h, x_256, y_256),
+			VDPixmapInterpolateSample8x4To24((const char *)px.data + 2, px.pitch, px.w, px.h, x_256, y_256)
+			);
+
+	case nsVDPixmap::kPixFormat_YUV422_UYVY_709:
+		return ConvertYCC72ToRGB24_709(
+			VDPixmapInterpolateSample8x2To24((const char *)px.data + 1, px.pitch, px.w, px.h, x_256, y_256),
+			VDPixmapInterpolateSample8x4To24((const char *)px.data + 0, px.pitch, (px.w + 1) >> 1, px.h, (x_256 >> 1) + 128, y_256),
+			VDPixmapInterpolateSample8x4To24((const char *)px.data + 2, px.pitch, (px.w + 1) >> 1, px.h, (x_256 >> 1) + 128, y_256)
+			);
+
+	case nsVDPixmap::kPixFormat_YUV420_NV12:
+		// NV12: full-res luma plane + interleaved CbCr plane at half res
+		return ConvertYCC72ToRGB24(
+			VDPixmapInterpolateSample8To24(px.data, px.pitch, px.w, px.h, x_256, y_256),
+			VDPixmapInterpolateSample8x2To24((const char *)px.data2 + 0, px.pitch2, (px.w + 1) >> 1, (px.h + 1) >> 1, (x_256 >> 1) + 128, y_256 >> 1),
+			VDPixmapInterpolateSample8x2To24((const char *)px.data2 + 1, px.pitch2, (px.w + 1) >> 1, (px.h + 1) >> 1, (x_256 >> 1) + 128, y_256 >> 1)
+			);
+
+	case nsVDPixmap::kPixFormat_YUV444_Planar:
+		return InterpPlanarYCC888(px, x_256, y_256, x_256, y_256, px.w, px.h);
+
+	case nsVDPixmap::kPixFormat_YUV422_Planar:
+		return InterpPlanarYCC888(px, x_256, y_256, (x_256 >> 1) + 128, y_256, (px.w + 1) >> 1, px.h);
+
+	case nsVDPixmap::kPixFormat_YUV411_Planar:
+		return InterpPlanarYCC888(px, x_256, y_256, (x_256 >> 2) + 128, y_256, (px.w + 3) >> 2, px.h);
+
+	case nsVDPixmap::kPixFormat_YUV420_Planar:
+		return InterpPlanarYCC888(px, x_256, y_256, (x_256 >> 1) + 128, y_256 >> 1, (px.w + 1) >> 1, (px.h + 1) >> 1);
+
+	case nsVDPixmap::kPixFormat_YUV410_Planar:
+		return InterpPlanarYCC888(px, x_256, y_256, (x_256 >> 2) + 128, y_256 >> 2, (px.w + 3) >> 2, (px.h + 3) >> 2);
+
+	case nsVDPixmap::kPixFormat_YUV420_Planar_Centered:
+		// centered chroma siting: no +128 horizontal bias
+		return InterpPlanarYCC888(px, x_256, y_256, x_256 >> 1, y_256 >> 1, (px.w + 1) >> 1, (px.h + 1) >> 1);
+
+	case nsVDPixmap::kPixFormat_YUV422_Planar_Centered:
+		return InterpPlanarYCC888(px, x_256, y_256, x_256 >> 1, y_256, (px.w + 1) >> 1, px.h);
+
+	case nsVDPixmap::kPixFormat_YUV422_Planar_16F:
+		{
+			// half-float planes; samples are already normalized floats
+			float y = VDPixmapInterpolateSample16F(px.data, px.pitch, px.w, px.h, x_256, y_256);
+			float cb = VDPixmapInterpolateSample16F(px.data2, px.pitch2, (px.w + 1) >> 1, px.h, (x_256 >> 1) + 128, y_256);
+			float cr = VDPixmapInterpolateSample16F(px.data3, px.pitch3, (px.w + 1) >> 1, px.h, (x_256 >> 1) + 128, y_256);
+
+			uint32 ir = VDClampedRoundFixedToUint8Fast(1.1643836f*y + 1.5960268f*cr - (222.92157f / 255.0f));
+			uint32 ig = VDClampedRoundFixedToUint8Fast(1.1643836f*y - 0.3917623f*cb - 0.8129676f*cr + (135.57529f / 255.0f));
+			uint32 ib = VDClampedRoundFixedToUint8Fast(1.1643836f*y + 2.0172321f*cb - (276.83585f / 255.0f));
+
+			return (ir << 16) + (ig << 8) + ib;
+		}
+
+	case nsVDPixmap::kPixFormat_YUV422_V210:
+		{
+			// 10-bit 4:2:2; no generic byte-offset sampler fits the packing,
+			// so bilinear filtering is done manually in float.
+			sint32 luma_x = x_256 - 128;
+			sint32 luma_y = y_256 - 128;
+
+			if (luma_x < 0)
+				luma_x = 0;
+
+			if (luma_y < 0)
+				luma_y = 0;
+
+			if (luma_x > (sint32)((px.w - 1) << 8))
+				luma_x = (sint32)((px.w - 1) << 8);
+
+			if (luma_y > (sint32)((px.h - 1) << 8))
+				luma_y = (sint32)((px.h - 1) << 8);
+
+			sint32 luma_ix = luma_x >> 8;
+			sint32 luma_iy = luma_y >> 8;
+			float luma_fx = (float)(luma_x & 255) * (1.0f / 255.0f);
+			float luma_fy = (float)(luma_y & 255) * (1.0f / 255.0f);
+
+			float y0 = SampleV210_Y(px.data, px.pitch, luma_ix+0, luma_iy+0, px.w, px.h) * (1.0f / 1023.0f);
+			float y1 = SampleV210_Y(px.data, px.pitch, luma_ix+1, luma_iy+0, px.w, px.h) * (1.0f / 1023.0f);
+			float y2 = SampleV210_Y(px.data, px.pitch, luma_ix+0, luma_iy+1, px.w, px.h) * (1.0f / 1023.0f);
+			float y3 = SampleV210_Y(px.data, px.pitch, luma_ix+1, luma_iy+1, px.w, px.h) * (1.0f / 1023.0f);
+			float yt = y0 + (y1 - y0)*luma_fx;
+			float yb = y2 + (y3 - y2)*luma_fx;
+			float yr = yt + (yb - yt)*luma_fy;
+
+			// chroma grid: half horizontal resolution, full vertical
+			uint32 chroma_w = (px.w + 1) >> 1;
+			uint32 chroma_h = px.h;
+			sint32 chroma_x = x_256 >> 1;
+			sint32 chroma_y = y_256 - 128;
+
+			if (chroma_x < 0)
+				chroma_x = 0;
+
+			if (chroma_y < 0)
+				chroma_y = 0;
+
+			if (chroma_x > (sint32)((chroma_w - 1) << 8))
+				chroma_x = (sint32)((chroma_w - 1) << 8);
+
+			if (chroma_y > (sint32)((chroma_h - 1) << 8))
+				chroma_y = (sint32)((chroma_h - 1) << 8);
+
+			sint32 chroma_ix = chroma_x >> 8;
+			sint32 chroma_iy = chroma_y >> 8;
+			float chroma_fx = (float)(chroma_x & 255) * (1.0f / 255.0f);
+			float chroma_fy = (float)(chroma_y & 255) * (1.0f / 255.0f);
+
+			float cb0 = SampleV210_Cb(px.data, px.pitch, chroma_ix+0, chroma_iy+0, px.w, px.h) * (1.0f / 1023.0f);
+			float cb1 = SampleV210_Cb(px.data, px.pitch, chroma_ix+1, chroma_iy+0, px.w, px.h) * (1.0f / 1023.0f);
+			float cb2 = SampleV210_Cb(px.data, px.pitch, chroma_ix+0, chroma_iy+1, px.w, px.h) * (1.0f / 1023.0f);
+			float cb3 = SampleV210_Cb(px.data, px.pitch, chroma_ix+1, chroma_iy+1, px.w, px.h) * (1.0f / 1023.0f);
+			float cbt = cb0 + (cb1 - cb0)*chroma_fx;
+			float cbb = cb2 + (cb3 - cb2)*chroma_fx;
+			float cbr = cbt + (cbb - cbt)*chroma_fy;
+
+			float cr0 = SampleV210_Cr(px.data, px.pitch, chroma_ix+0, chroma_iy+0, px.w, px.h) * (1.0f / 1023.0f);
+			float cr1 = SampleV210_Cr(px.data, px.pitch, chroma_ix+1, chroma_iy+0, px.w, px.h) * (1.0f / 1023.0f);
+			float cr2 = SampleV210_Cr(px.data, px.pitch, chroma_ix+0, chroma_iy+1, px.w, px.h) * (1.0f / 1023.0f);
+			float cr3 = SampleV210_Cr(px.data, px.pitch, chroma_ix+1, chroma_iy+1, px.w, px.h) * (1.0f / 1023.0f);
+			float crt = cr0 + (cr1 - cr0)*chroma_fx;
+			float crb = cr2 + (cr3 - cr2)*chroma_fx;
+			float crr = crt + (crb - crt)*chroma_fy;
+
+			uint32 ir = VDClampedRoundFixedToUint8Fast(1.1643836f*yr + 1.5960268f*crr - (222.92157f / 255.0f));
+			uint32 ig = VDClampedRoundFixedToUint8Fast(1.1643836f*yr - 0.3917623f*cbr - 0.8129676f*crr + (135.57529f / 255.0f));
+			uint32 ib = VDClampedRoundFixedToUint8Fast(1.1643836f*yr + 2.0172321f*cbr - (276.83585f / 255.0f));
+
+			return (ir << 16) + (ig << 8) + ib;
+		}
+		break;
+
+	default:
+		// unsupported formats sample as black
+		return 0;
+	}
+}
+
+// Converts one studio-swing YCbCr triplet to packed 0x00RRGGBB using 16.16
+// fixed-point Rec.601 coefficients (76309/65536 = 1.164, 104597 = 1.596,
+// 53279 = 0.813, 25674 = 0.392, 132201 = 2.017) with branchless clamping
+// of each channel to [0, 255].
+uint32 VDConvertYCbCrToRGB(uint8 y0, uint8 cb0, uint8 cr0) {
+	sint32 y = y0 - 16;
+	sint32 cb = cb0 - 128;
+	sint32 cr = cr0 - 128;
+
+	sint32 y2 = y * 76309 + 0x8000;		// +0x8000 pre-rounds the 16.16 result
+	sint32 r = y2 + cr * 104597;
+	sint32 g = y2 + cr * -53279 + cb * -25674;
+	sint32 b = y2 + cb * 132201;
+
+	// branchless clamp of each 16.16 value to [0, 0xffffff]
+	r &= ~(r >> 31);
+	g &= ~(g >> 31);
+	b &= ~(b >> 31);
+	r += (0xffffff - r) & ((0xffffff - r) >> 31);
+	g += (0xffffff - g) & ((0xffffff - g) >> 31);
+	b += (0xffffff - b) & ((0xffffff - b) >> 31);
+
+	// keep the integer byte of each channel and pack
+	return (r & 0xff0000) + ((g & 0xff0000) >> 8) + (b >> 16);
+}
+
+// Convenience overload: unpacks 0x00RRGGBB and forwards to the three-channel
+// converter below.
+uint32 VDConvertRGBToYCbCr(uint32 c) {
+	return VDConvertRGBToYCbCr((uint8)(c >> 16), (uint8)(c >> 8), (uint8)c);
+}
+
+// Converts one full-range RGB triplet to packed YCbCr: Cb in bits 0-7,
+// Y in bits 8-15, Cr in bits 16-23. Coefficients 1052/2065/401 are the
+// luma weights in 4.12 fixed point (presumably Rec.601 studio swing —
+// they match 0.257/0.504/0.098 * 4096; confirm against the decoder side).
+uint32 VDConvertRGBToYCbCr(uint8 r8, uint8 g8, uint8 b8) {
+	sint32 r = r8;
+	sint32 g = g8;
+	sint32 b = b8;
+	sint32 yt = 1052*r + 2065*g + 401*b;
+	sint32 y = (yt + 0x10800) >> 4;		// add offset; Y byte lands in bits 8-15
+	// Fix: the Cr intermediates (10507932*r and yt*2987 reach ~2.68e9) exceed
+	// INT_MAX and overflowed signed 32-bit arithmetic (undefined behavior).
+	// Computing in unsigned arithmetic is well-defined modulo 2^32 and yields
+	// the identical bit pattern; 0x80800000 supplies the +128 chroma offset
+	// plus rounding before the shift.
+	sint32 cr = (sint32)((10507932U*r - 2987U*yt + 0x80800000U) >> 8);
+	sint32 cb = (sint32)(( 8312025U*b - 2363U*yt + 0x80800000U) >> 24);
+
+	return (uint8)cb + (y & 0xff00) + (cr&0xff0000);
+} \ No newline at end of file
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/pixmaputils.cpp b/src/thirdparty/VirtualDub/Kasumi/source/pixmaputils.cpp
new file mode 100644
index 000000000..635cbf3c0
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/pixmaputils.cpp
@@ -0,0 +1,519 @@
+#include <vd2/Kasumi/pixmaputils.h>
+#include <vd2/system/memory.h>
+
+extern VDPixmapFormatInfo g_vdPixmapFormats[] = {
+ // name qchnk qw qh qwb qhb qs ab aw ah as ps
+ /* Null */ { "null", false, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 },
+ /* Pal1 */ { "Pal1", true, 8, 1, 3, 0, 1, 0, 0, 0, 0, 2 },
+ /* Pal2 */ { "Pal2", true, 4, 1, 2, 0, 1, 0, 0, 0, 0, 4 },
+ /* Pal4 */ { "Pal4", true, 2, 1, 1, 0, 1, 0, 0, 0, 0, 16 },
+ /* Pal8 */ { "Pal8", false, 1, 1, 0, 0, 1, 0, 0, 0, 0, 256 },
+ /* RGB16_555 */ { "XRGB1555", false, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0 },
+ /* RGB16_565 */ { "RGB565", false, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0 },
+ /* RGB24 */ { "RGB888", false, 1, 1, 0, 0, 3, 0, 0, 0, 0, 0 },
+ /* RGB32 */ { "XRGB8888", false, 1, 1, 0, 0, 4, 0, 0, 0, 0, 0 },
+ /* Y8 */ { "Y8", false, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0 },
+ /* YUV422_UYVY */ { "UYVY", true, 2, 1, 1, 0, 4, 0, 0, 0, 0, 0 },
+ /* YUV422_YUYV */ { "YUYV", true, 2, 1, 1, 0, 4, 0, 0, 0, 0, 0 },
+ /* YUV444_XVYU */ { "XVYU", false, 1, 1, 0, 0, 4, 0, 0, 0, 0, 0 },
+ /* YUV444_Planar */ { "YUV444", false, 1, 1, 0, 0, 1, 2, 0, 0, 1, 0 },
+ /* YUV422_Planar */ { "YUV422", false, 1, 1, 0, 0, 1, 2, 1, 0, 1, 0 },
+ /* YUV420_Planar */ { "YUV420", false, 1, 1, 0, 0, 1, 2, 1, 1, 1, 0 },
+ /* YUV411_Planar */ { "YUV411", false, 1, 1, 0, 0, 1, 2, 2, 0, 1, 0 },
+ /* YUV410_Planar */ { "YUV410", false, 1, 1, 0, 0, 1, 2, 2, 2, 1, 0 },
+ /* YUV422_Planar_Centered */ { "YUV422C", false, 1, 1, 0, 0, 1, 2, 1, 0, 1, 0 },
+ /* YUV420_Planar_Centered */ { "YUV420C", false, 1, 1, 0, 0, 1, 2, 1, 1, 1, 0 },
+ /* YUV422_Planar_16F */ { "YUV422_16F", false, 1, 1, 0, 0, 2, 2, 1, 0, 2, 0 },
+ /* V210 */ { "v210", true,24, 1, 2, 0, 64, 0, 0, 0, 1, 0 },
+ /* YUV422_UYVY_709 */ { "UYVY-709", true, 2, 1, 1, 0, 4, 0, 0, 0, 0, 0 },
+ /* NV12 */ { "NV12", false, 1, 1, 0, 0, 1, 1, 1, 1, 2, 0 },
+};
+
+#ifdef _DEBUG
+ bool VDIsValidPixmapPlane(const void *p, ptrdiff_t pitch, vdpixsize w, vdpixsize h) {
+ bool isvalid;
+
+ if (pitch < 0)
+ isvalid = VDIsValidReadRegion((const char *)p + pitch*(h-1), (-pitch)*(h-1)+w);
+ else
+ isvalid = VDIsValidReadRegion(p, pitch*(h-1)+w);
+
+ if (!isvalid) {
+ VDDEBUG("Kasumi: Invalid pixmap plane detected.\n"
+ " Base=%p, pitch=%d, size=%dx%d (bytes)\n", p, (int)pitch, w, h);
+ }
+
+ return isvalid;
+ }
+
+ bool VDAssertValidPixmap(const VDPixmap& px) {
+ const VDPixmapFormatInfo& info = VDPixmapGetInfo(px.format);
+
+ if (px.format) {
+ if (!VDIsValidPixmapPlane(px.data, px.pitch, -(-px.w / info.qw)*info.qsize, -(-px.h >> info.qhbits))) {
+ VDDEBUG("Kasumi: Invalid primary plane detected in pixmap.\n"
+ " Pixmap info: format=%d (%s), dimensions=%dx%d\n", px.format, info.name, px.w, px.h);
+ VDASSERT(!"Kasumi: Invalid primary plane detected in pixmap.\n");
+ return false;
+ }
+
+ if (info.palsize)
+ if (!VDIsValidReadRegion(px.palette, sizeof(uint32) * info.palsize)) {
+ VDDEBUG("Kasumi: Invalid palette detected in pixmap.\n"
+ " Pixmap info: format=%d (%s), dimensions=%dx%d\n", px.format, info.name, px.w, px.h);
+ VDASSERT(!"Kasumi: Invalid palette detected in pixmap.\n");
+ return false;
+ }
+
+ if (info.auxbufs) {
+ const vdpixsize auxw = -(-px.w >> info.auxwbits);
+ const vdpixsize auxh = -(-px.h >> info.auxhbits);
+
+ if (!VDIsValidPixmapPlane(px.data2, px.pitch2, auxw * info.auxsize, auxh)) {
+ VDDEBUG("Kasumi: Invalid Cb plane detected in pixmap.\n"
+ " Pixmap info: format=%d (%s), dimensions=%dx%d\n", px.format, info.name, px.w, px.h);
+ VDASSERT(!"Kasumi: Invalid Cb plane detected in pixmap.\n");
+ return false;
+ }
+
+ if (info.auxbufs > 2) {
+ if (!VDIsValidPixmapPlane(px.data3, px.pitch3, auxw * info.auxsize, auxh)) {
+ VDDEBUG("Kasumi: Invalid Cr plane detected in pixmap.\n"
+ " Pixmap info: format=%d, dimensions=%dx%d\n", px.format, px.w, px.h);
+ VDASSERT(!"Kasumi: Invalid Cr plane detected in pixmap.\n");
+ return false;
+ }
+ }
+ }
+ }
+
+ return true;
+ }
+#endif
+
+VDPixmap VDPixmapOffset(const VDPixmap& src, vdpixpos x, vdpixpos y) {
+ VDPixmap temp(src);
+ const VDPixmapFormatInfo& info = VDPixmapGetInfo(temp.format);
+
+ if (info.qchunky) {
+ x = (x + info.qw - 1) / info.qw;
+ y >>= info.qhbits;
+ }
+
+ switch(info.auxbufs) {
+ case 2:
+ temp.data3 = (char *)temp.data3 + (x >> info.auxwbits)*info.auxsize + (y >> info.auxhbits)*temp.pitch3;
+ case 1:
+ temp.data2 = (char *)temp.data2 + (x >> info.auxwbits)*info.auxsize + (y >> info.auxhbits)*temp.pitch2;
+ case 0:
+ temp.data = (char *)temp.data + x*info.qsize + y*temp.pitch;
+ }
+
+ return temp;
+}
+
+VDPixmapLayout VDPixmapLayoutOffset(const VDPixmapLayout& src, vdpixpos x, vdpixpos y) {
+ VDPixmapLayout temp(src);
+ const VDPixmapFormatInfo& info = VDPixmapGetInfo(temp.format);
+
+ if (info.qchunky) {
+ x = (x + info.qw - 1) / info.qw;
+ y = -(-y >> info.qhbits);
+ }
+
+ switch(info.auxbufs) {
+ case 2:
+ temp.data3 += -(-x >> info.auxwbits)*info.auxsize + -(-y >> info.auxhbits)*temp.pitch3;
+ case 1:
+ temp.data2 += -(-x >> info.auxwbits)*info.auxsize + -(-y >> info.auxhbits)*temp.pitch2;
+ case 0:
+ temp.data += x*info.qsize + y*temp.pitch;
+ }
+
+ return temp;
+}
+
+uint32 VDPixmapCreateLinearLayout(VDPixmapLayout& layout, int format, vdpixsize w, vdpixsize h, int alignment) {
+ const ptrdiff_t alignmask = alignment - 1;
+
+ const VDPixmapFormatInfo& srcinfo = VDPixmapGetInfo(format);
+ sint32 qw = (w + srcinfo.qw - 1) / srcinfo.qw;
+ sint32 qh = -(-h >> srcinfo.qhbits);
+ sint32 subw = -(-w >> srcinfo.auxwbits);
+ sint32 subh = -(-h >> srcinfo.auxhbits);
+ sint32 auxsize = srcinfo.auxsize;
+
+ ptrdiff_t mainpitch = (srcinfo.qsize * qw + alignmask) & ~alignmask;
+ size_t mainsize = mainpitch * qh;
+
+ layout.data = 0;
+ layout.pitch = mainpitch;
+ layout.palette = NULL;
+ layout.data2 = 0;
+ layout.pitch2 = 0;
+ layout.data3 = 0;
+ layout.pitch3 = 0;
+ layout.w = w;
+ layout.h = h;
+ layout.format = format;
+
+ if (srcinfo.auxbufs >= 1) {
+ ptrdiff_t subpitch = (subw * auxsize + alignmask) & ~alignmask;
+ size_t subsize = subpitch * subh;
+
+ layout.data2 = mainsize;
+ layout.pitch2 = subpitch;
+ mainsize += subsize;
+
+ if (srcinfo.auxbufs >= 2) {
+ layout.data3 = mainsize;
+ layout.pitch3 = subpitch;
+ mainsize += subsize;
+ }
+ }
+
+ return mainsize;
+}
+
+void VDPixmapFlipV(VDPixmap& px) {
+ const VDPixmapFormatInfo& srcinfo = VDPixmapGetInfo(px.format);
+ sint32 w = px.w;
+ sint32 h = px.h;
+ sint32 qw = (w + srcinfo.qw - 1) / srcinfo.qw;
+ sint32 qh = -(-h >> srcinfo.qhbits);
+ sint32 subh = -(-h >> srcinfo.auxhbits);
+
+ vdptrstep(px.data, px.pitch * (qh - 1));
+ px.pitch = -px.pitch;
+
+ if (srcinfo.auxbufs >= 1) {
+ vdptrstep(px.data2, px.pitch2 * (subh - 1));
+ px.pitch2 = -px.pitch2;
+
+ if (srcinfo.auxbufs >= 2) {
+ vdptrstep(px.data3, px.pitch3 * (subh - 1));
+ px.pitch3 = -px.pitch3;
+ }
+ }
+}
+
+void VDPixmapLayoutFlipV(VDPixmapLayout& layout) {
+ const VDPixmapFormatInfo& srcinfo = VDPixmapGetInfo(layout.format);
+ sint32 w = layout.w;
+ sint32 h = layout.h;
+ sint32 qw = (w + srcinfo.qw - 1) / srcinfo.qw;
+ sint32 qh = -(-h >> srcinfo.qhbits);
+ sint32 subh = -(-h >> srcinfo.auxhbits);
+
+ layout.data += layout.pitch * (qh - 1);
+ layout.pitch = -layout.pitch;
+
+ if (srcinfo.auxbufs >= 1) {
+ layout.data2 += layout.pitch2 * (subh - 1);
+ layout.pitch2 = -layout.pitch2;
+
+ if (srcinfo.auxbufs >= 2) {
+ layout.data3 += layout.pitch3 * (subh - 1);
+ layout.pitch3 = -layout.pitch3;
+ }
+ }
+}
+
+uint32 VDPixmapLayoutGetMinSize(const VDPixmapLayout& layout) {
+ const VDPixmapFormatInfo& srcinfo = VDPixmapGetInfo(layout.format);
+ sint32 w = layout.w;
+ sint32 h = layout.h;
+ sint32 qw = (w + srcinfo.qw - 1) / srcinfo.qw;
+ sint32 qh = -(-h >> srcinfo.qhbits);
+ sint32 subh = -(-h >> srcinfo.auxhbits);
+
+ uint32 limit = layout.data;
+ if (layout.pitch >= 0)
+ limit += layout.pitch * qh;
+ else
+ limit -= layout.pitch;
+
+ if (srcinfo.auxbufs >= 1) {
+ uint32 limit2 = layout.data2;
+
+ if (layout.pitch2 >= 0)
+ limit2 += layout.pitch2 * subh;
+ else
+ limit2 -= layout.pitch2;
+
+ if (limit < limit2)
+ limit = limit2;
+
+ if (srcinfo.auxbufs >= 2) {
+ uint32 limit3 = layout.data3;
+
+ if (layout.pitch3 >= 0)
+ limit3 += layout.pitch3 * subh;
+ else
+ limit3 -= layout.pitch3;
+
+ if (limit < limit3)
+ limit = limit3;
+ }
+ }
+
+ return limit;
+}
+
+VDPixmap VDPixmapExtractField(const VDPixmap& src, bool field2) {
+ VDPixmap px(src);
+
+ if (field2) {
+ const VDPixmapFormatInfo& info = VDPixmapGetInfo(px.format);
+
+ if (px.data) {
+ if (info.qh == 1)
+ vdptrstep(px.data, px.pitch);
+
+ if (!info.auxhbits) {
+ vdptrstep(px.data2, px.pitch2);
+ vdptrstep(px.data3, px.pitch3);
+ }
+ }
+ }
+
+ px.h >>= 1;
+ px.pitch += px.pitch;
+ px.pitch2 += px.pitch2;
+ px.pitch3 += px.pitch3;
+ return px;
+}
+
+///////////////////////////////////////////////////////////////////////////
+
+VDPixmapBuffer::VDPixmapBuffer(const VDPixmap& src)
+ : mpBuffer(NULL)
+ , mLinearSize(0)
+{
+ assign(src);
+}
+
+VDPixmapBuffer::VDPixmapBuffer(const VDPixmapBuffer& src)
+ : mpBuffer(NULL)
+ , mLinearSize(0)
+{
+ assign(src);
+}
+
+VDPixmapBuffer::VDPixmapBuffer(const VDPixmapLayout& layout) {
+ init(layout);
+}
+
+VDPixmapBuffer::~VDPixmapBuffer() {
+#ifdef _DEBUG
+ validate();
+#endif
+
+ delete[] mpBuffer;
+}
+
+void VDPixmapBuffer::init(sint32 width, sint32 height, int f) {
+	const VDPixmapFormatInfo& srcinfo = VDPixmapGetInfo(f);
+	sint32 qw = (width + srcinfo.qw - 1) / srcinfo.qw;			// quantum-chunk counts (ceil division)
+	sint32 qh = -(-height >> srcinfo.qhbits);
+	sint32 subw = -(-width >> srcinfo.auxwbits);				// chroma/aux plane dimensions
+	sint32 subh = -(-height >> srcinfo.auxhbits);
+	ptrdiff_t mainpitch = (srcinfo.qsize * qw + 15) & ~15;		// rows padded to 16-byte alignment
+	ptrdiff_t subpitch = (srcinfo.auxsize * subw + 15) & ~15;
+	size_t mainsize = mainpitch * qh;
+	size_t subsize = subpitch * subh;
+	size_t totalsize = mainsize + subsize*srcinfo.auxbufs + 4 * srcinfo.palsize;
+
+#ifdef _DEBUG
+	totalsize += 28;		// 16-byte guard header + 12-byte tail sentinel (checked in validate())
+#endif
+
+	if (mLinearSize != totalsize) {
+		clear();
+		mpBuffer = new char[totalsize + 15];					// +15 so we can align the base to 16
+		mLinearSize = totalsize;
+	}
+
+	char *p = mpBuffer + (-(int)(uintptr)mpBuffer & 15);
+
+#ifdef _DEBUG
+	*(uint32 *)p = totalsize;
+	for(int i=0; i<12; ++i)
+		p[4+i] = (char)(0xa0 + i);								// underflow canary bytes
+
+	p += 16;
+#endif
+
+	data = p;
+	pitch = mainpitch;
+	p += mainsize;
+
+	palette = NULL;
+	data2 = NULL;
+	pitch2 = 0;		// FIX: pitches are ptrdiff_t, not pointers — NULL was a type error
+	data3 = NULL;
+	pitch3 = 0;		// FIX: ditto
+	w = width;
+	h = height;
+	format = f;
+
+	if (srcinfo.auxbufs >= 1) {
+		data2 = p;
+		pitch2 = subpitch;
+		p += subsize;
+	}
+
+	if (srcinfo.auxbufs >= 2) {
+		data3 = p;
+		pitch3 = subpitch;
+		p += subsize;
+	}
+
+	if (srcinfo.palsize) {
+		palette = (const uint32 *)p;
+		p += srcinfo.palsize * 4;
+	}
+
+#ifdef _DEBUG
+	for(int j=0; j<12; ++j)
+		p[j] = (char)(0xb0 + j);								// overflow canary bytes
+#endif
+}
+
+void VDPixmapBuffer::init(const VDPixmapLayout& layout) {
+ const VDPixmapFormatInfo& srcinfo = VDPixmapGetInfo(layout.format);
+ sint32 qw = (layout.w + srcinfo.qw - 1) / srcinfo.qw;
+ sint32 qh = -(-layout.h >> srcinfo.qhbits);
+ sint32 subw = -(-layout.w >> srcinfo.auxwbits);
+ sint32 subh = -(-layout.h >> srcinfo.auxhbits);
+
+ ptrdiff_t mino=0, maxo=0;
+
+ if (layout.pitch < 0) {
+ mino = std::min<ptrdiff_t>(mino, layout.data + layout.pitch * (qh-1));
+ maxo = std::max<ptrdiff_t>(maxo, layout.data - layout.pitch);
+ } else {
+ mino = std::min<ptrdiff_t>(mino, layout.data);
+ maxo = std::max<ptrdiff_t>(maxo, layout.data + layout.pitch*qh);
+ }
+
+ if (srcinfo.auxbufs >= 1) {
+ if (layout.pitch2 < 0) {
+ mino = std::min<ptrdiff_t>(mino, layout.data2 + layout.pitch2 * (subh-1));
+ maxo = std::max<ptrdiff_t>(maxo, layout.data2 - layout.pitch2);
+ } else {
+ mino = std::min<ptrdiff_t>(mino, layout.data2);
+ maxo = std::max<ptrdiff_t>(maxo, layout.data2 + layout.pitch2*subh);
+ }
+
+ if (srcinfo.auxbufs >= 2) {
+ if (layout.pitch3 < 0) {
+ mino = std::min<ptrdiff_t>(mino, layout.data3 + layout.pitch3 * (subh-1));
+ maxo = std::max<ptrdiff_t>(maxo, layout.data3 - layout.pitch3);
+ } else {
+ mino = std::min<ptrdiff_t>(mino, layout.data3);
+ maxo = std::max<ptrdiff_t>(maxo, layout.data3 + layout.pitch3*subh);
+ }
+ }
+ }
+
+ ptrdiff_t linsize = ((maxo - mino + 3) & ~(uintptr)3);
+
+ ptrdiff_t totalsize = linsize + 4*srcinfo.palsize;
+
+#ifdef _DEBUG
+ totalsize += 28;
+#endif
+
+ if (mLinearSize != totalsize) {
+ clear();
+ mpBuffer = new char[totalsize + 15];
+ mLinearSize = totalsize;
+ }
+
+ char *p = mpBuffer + (-(int)(uintptr)mpBuffer & 15);
+
+#ifdef _DEBUG
+ *(uint32 *)p = totalsize - 28;
+ for(int i=0; i<12; ++i)
+ p[4+i] = (char)(0xa0 + i);
+
+ p += 16;
+#endif
+
+ w = layout.w;
+ h = layout.h;
+ format = layout.format;
+ data = p + layout.data - mino;
+ data2 = p + layout.data2 - mino;
+ data3 = p + layout.data3 - mino;
+ pitch = layout.pitch;
+ pitch2 = layout.pitch2;
+ pitch3 = layout.pitch3;
+ palette = NULL;
+
+ if (srcinfo.palsize) {
+ palette = (const uint32 *)(p + linsize);
+ memcpy((void *)palette, layout.palette, 4*srcinfo.palsize);
+ }
+
+#ifdef _DEBUG
+ for(int j=0; j<12; ++j)
+ p[totalsize + j - 28] = (char)(0xb0 + j);
+#endif
+
+ VDAssertValidPixmap(*this);
+}
+
+void VDPixmapBuffer::assign(const VDPixmap& src) {
+ if (!src.format) {
+ delete[] mpBuffer;
+ mpBuffer = NULL;
+ data = NULL;
+ format = 0;
+ } else {
+ init(src.w, src.h, src.format);
+
+ const VDPixmapFormatInfo& srcinfo = VDPixmapGetInfo(src.format);
+ int qw = (src.w + srcinfo.qw - 1) / srcinfo.qw;
+ int qh = -(-src.h >> srcinfo.qhbits);
+ int subw = -(-src.w >> srcinfo.auxwbits);
+ int subh = -(-src.h >> srcinfo.auxhbits);
+
+ if (srcinfo.palsize)
+ memcpy((void *)palette, src.palette, 4 * srcinfo.palsize);
+
+ switch(srcinfo.auxbufs) {
+ case 2:
+ VDMemcpyRect(data3, pitch3, src.data3, src.pitch3, subw, subh);
+ case 1:
+ VDMemcpyRect(data2, pitch2, src.data2, src.pitch2, subw, subh);
+ case 0:
+ VDMemcpyRect(data, pitch, src.data, src.pitch, qw * srcinfo.qsize, qh);
+ }
+ }
+}
+
+void VDPixmapBuffer::swap(VDPixmapBuffer& dst) {
+ std::swap(mpBuffer, dst.mpBuffer);
+ std::swap(mLinearSize, dst.mLinearSize);
+ std::swap(static_cast<VDPixmap&>(*this), static_cast<VDPixmap&>(dst));
+}
+
+#ifdef _DEBUG
+void VDPixmapBuffer::validate() {
+ if (mpBuffer) {
+ char *p = (char *)(((uintptr)mpBuffer + 15) & ~(uintptr)15);
+
+ // verify head bytes
+ for(int i=0; i<12; ++i)
+ if (p[i+4] != (char)(0xa0 + i))
+ VDASSERT(!"VDPixmapBuffer: Buffer underflow detected.\n");
+
+ // verify tail bytes
+ for(int j=0; j<12; ++j)
+ if (p[mLinearSize - 12 + j] != (char)(0xb0 + j))
+ VDASSERT(!"VDPixmapBuffer: Buffer overflow detected.\n");
+ }
+}
+#endif \ No newline at end of file
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/region.cpp b/src/thirdparty/VirtualDub/Kasumi/source/region.cpp
new file mode 100644
index 000000000..283f43cf8
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/region.cpp
@@ -0,0 +1,1334 @@
+// VirtualDub - Video processing and capture application
+// Graphics support library
+// Copyright (C) 1998-2007 Avery Lee
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+#include <vd2/Kasumi/pixmap.h>
+#include <vd2/Kasumi/pixmaputils.h>
+#include <vd2/Kasumi/region.h>
+#include <vd2/system/math.h>
+#include <vd2/system/vdstl.h>
+
+void VDPixmapRegion::swap(VDPixmapRegion& x) {
+ mSpans.swap(x.mSpans);
+ std::swap(mBounds, x.mBounds);
+}
+
+VDPixmapPathRasterizer::VDPixmapPathRasterizer()
+ : mpEdgeBlocks(NULL)
+ , mpFreeEdgeBlocks(NULL)
+ , mEdgeBlockIdx(kEdgeBlockMax)
+ , mpScanBuffer(NULL)
+{
+ ClearScanBuffer();
+}
+
+VDPixmapPathRasterizer::VDPixmapPathRasterizer(const VDPixmapPathRasterizer&)
+ : mpEdgeBlocks(NULL)
+ , mpFreeEdgeBlocks(NULL)
+ , mEdgeBlockIdx(kEdgeBlockMax)
+ , mpScanBuffer(NULL)
+{
+ ClearScanBuffer();
+}
+
+VDPixmapPathRasterizer::~VDPixmapPathRasterizer() {
+ Clear();
+ FreeEdgeLists();
+}
+
+VDPixmapPathRasterizer& VDPixmapPathRasterizer::operator=(const VDPixmapPathRasterizer&) {
+ return *this;
+}
+
+void VDPixmapPathRasterizer::Clear() {
+ ClearEdgeList();
+ ClearScanBuffer();
+}
+
+void VDPixmapPathRasterizer::QuadraticBezier(const vdint2 *pts) {
+ int x0 = pts[0].x;
+ int x1 = pts[1].x;
+ int x2 = pts[2].x;
+ int y0 = pts[0].y;
+ int y1 = pts[1].y;
+ int y2 = pts[2].y;
+
+ // P = (1-t)^2*P0 + 2t(1-t)*P1 + t^2*P2
+ // P = (1-2t+t^2)P0 + 2(t-t^2)P1 + t^2*P2
+ // P = (P0-2P1+P2)t^2 + 2(P1-P0)t + P0
+
+ int cx2 = x0-2*x1+x2;
+ int cx1 = -2*x0+2*x1;
+ int cx0 = x0;
+
+ int cy2 = y0-2*y1+y2;
+ int cy1 = -2*y0+2*y1;
+ int cy0 = y0;
+
+ // This equation is from Graphics Gems I.
+ //
+ // The idea is that since we're approximating a cubic curve with lines,
+ // any error we incur is due to the curvature of the line, which we can
+ // estimate by calculating the maximum acceleration of the curve. For
+ // a cubic, the acceleration (second derivative) is a line, meaning that
+ // the absolute maximum acceleration must occur at either the beginning
+ // (|c2|) or the end (|c2+c3|). Our bounds here are a little more
+ // conservative than that, but that's okay.
+ //
+ // If the acceleration of the parametric formula is zero (c2 = c3 = 0),
+ // that component of the curve is linear and does not incur any error.
+ // If a=0 for both X and Y, the curve is a line segment and we can
+ // use a step size of 1.
+
+ int maxaccel1 = abs(cy2);
+ int maxaccel2 = abs(cx2);
+
+ int maxaccel = maxaccel1 > maxaccel2 ? maxaccel1 : maxaccel2;
+ int h = 1;
+
+ while(maxaccel > 8 && h < 1024) {
+ maxaccel >>= 2;
+ h += h;
+ }
+
+ int lastx = x0;
+ int lasty = y0;
+
+ // compute forward differences
+ sint64 h1 = (sint64)(0x40000000 / h) << 2;
+ sint64 h2 = h1/h;
+
+ sint64 ax0 = (sint64)cx0 << 32;
+ sint64 ax1 = h1*(sint64)cx1 + h2*(sint64)cx2;
+ sint64 ax2 = 2*h2*(sint64)cx2;
+
+ sint64 ay0 = (sint64)cy0 << 32;
+ sint64 ay1 = h1*(sint64)cy1 + h2*(sint64)cy2;
+ sint64 ay2 = 2*h2*(sint64)cy2;
+
+ // round, not truncate
+ ax0 += 0x80000000;
+ ay0 += 0x80000000;
+
+ do {
+ ax0 += ax1;
+ ax1 += ax2;
+ ay0 += ay1;
+ ay1 += ay2;
+
+ int xi = (int)((uint64)ax0 >> 32);
+ int yi = (int)((uint64)ay0 >> 32);
+
+ FastLine(lastx, lasty, xi, yi);
+ lastx = xi;
+ lasty = yi;
+ } while(--h);
+}
+
+void VDPixmapPathRasterizer::CubicBezier(const vdint2 *pts) {
+ int x0 = pts[0].x;
+ int x1 = pts[1].x;
+ int x2 = pts[2].x;
+ int x3 = pts[3].x;
+ int y0 = pts[0].y;
+ int y1 = pts[1].y;
+ int y2 = pts[2].y;
+ int y3 = pts[3].y;
+
+ int cx3 = - x0+3*x1-3*x2+x3;
+ int cx2 = 3*x0-6*x1+3*x2;
+ int cx1 = -3*x0+3*x1;
+ int cx0 = x0;
+
+ int cy3 = - y0+3*y1-3*y2+y3;
+ int cy2 = 3*y0-6*y1+3*y2;
+ int cy1 = -3*y0+3*y1;
+ int cy0 = y0;
+
+ // This equation is from Graphics Gems I.
+ //
+ // The idea is that since we're approximating a cubic curve with lines,
+ // any error we incur is due to the curvature of the line, which we can
+ // estimate by calculating the maximum acceleration of the curve. For
+ // a cubic, the acceleration (second derivative) is a line, meaning that
+ // the absolute maximum acceleration must occur at either the beginning
+ // (|c2|) or the end (|c2+c3|). Our bounds here are a little more
+ // conservative than that, but that's okay.
+ //
+ // If the acceleration of the parametric formula is zero (c2 = c3 = 0),
+ // that component of the curve is linear and does not incur any error.
+ // If a=0 for both X and Y, the curve is a line segment and we can
+ // use a step size of 1.
+
+ int maxaccel1 = abs(2*cy2) + abs(6*cy3);
+ int maxaccel2 = abs(2*cx2) + abs(6*cx3);
+
+ int maxaccel = maxaccel1 > maxaccel2 ? maxaccel1 : maxaccel2;
+ int h = 1;
+
+ while(maxaccel > 8 && h < 1024) {
+ maxaccel >>= 2;
+ h += h;
+ }
+
+ int lastx = x0;
+ int lasty = y0;
+
+ // compute forward differences
+ sint64 h1 = (sint64)(0x40000000 / h) << 2;
+ sint64 h2 = h1/h;
+ sint64 h3 = h2/h;
+
+ sint64 ax0 = (sint64)cx0 << 32;
+ sint64 ax1 = h1*(sint64)cx1 + h2*(sint64)cx2 + h3*(sint64)cx3;
+ sint64 ax2 = 2*h2*(sint64)cx2 + 6*h3*(sint64)cx3;
+ sint64 ax3 = 6*h3*(sint64)cx3;
+
+ sint64 ay0 = (sint64)cy0 << 32;
+ sint64 ay1 = h1*(sint64)cy1 + h2*(sint64)cy2 + h3*(sint64)cy3;
+ sint64 ay2 = 2*h2*(sint64)cy2 + 6*h3*(sint64)cy3;
+ sint64 ay3 = 6*h3*(sint64)cy3;
+
+ // round, not truncate
+ ax0 += 0x80000000;
+ ay0 += 0x80000000;
+
+ do {
+ ax0 += ax1;
+ ax1 += ax2;
+ ax2 += ax3;
+ ay0 += ay1;
+ ay1 += ay2;
+ ay2 += ay3;
+
+ int xi = (int)((uint64)ax0 >> 32);
+ int yi = (int)((uint64)ay0 >> 32);
+
+ FastLine(lastx, lasty, xi, yi);
+ lastx = xi;
+ lasty = yi;
+ } while(--h);
+}
+
+void VDPixmapPathRasterizer::Line(const vdint2& pt1, const vdint2& pt2) {
+ FastLine(pt1.x, pt1.y, pt2.x, pt2.y);
+}
+
+void VDPixmapPathRasterizer::FastLine(int x0, int y0, int x1, int y1) {
+ int flag = 1;
+
+ if (y1 == y0)
+ return;
+
+ if (y1 < y0) {
+ int t;
+
+ t=x0; x0=x1; x1=t;
+ t=y0; y0=y1; y1=t;
+ flag = 0;
+ }
+
+ int dy = y1-y0;
+ int xacc = x0<<13;
+
+ // prestep y0 down
+ int iy0 = (y0+3) >> 3;
+ int iy1 = (y1+3) >> 3;
+
+ if (iy0 < iy1) {
+ int invslope = (x1-x0)*65536/dy;
+
+ int prestep = (4-y0) & 7;
+ xacc += (invslope * prestep)>>3;
+
+ if (iy0 < mScanYMin || iy1 > mScanYMax) {
+ ReallocateScanBuffer(iy0, iy1);
+ VDASSERT(iy0 >= mScanYMin && iy1 <= mScanYMax);
+ }
+
+ while(iy0 < iy1) {
+ int ix = (xacc+32767)>>16;
+
+ if (mEdgeBlockIdx >= kEdgeBlockMax) {
+ if (mpFreeEdgeBlocks) {
+ EdgeBlock *newBlock = mpFreeEdgeBlocks;
+ mpFreeEdgeBlocks = mpFreeEdgeBlocks->next;
+ newBlock->next = mpEdgeBlocks;
+ mpEdgeBlocks = newBlock;
+ } else {
+ mpEdgeBlocks = new EdgeBlock(mpEdgeBlocks);
+ }
+
+ mEdgeBlockIdx = 0;
+ }
+
+ Edge& e = mpEdgeBlocks->edges[mEdgeBlockIdx];
+ Scan& s = mpScanBufferBiased[iy0];
+ VDASSERT(iy0 >= mScanYMin && iy0 < mScanYMax);
+ ++mEdgeBlockIdx;
+
+ e.posandflag = ix*2+flag;
+ e.next = s.chain;
+ s.chain = &e;
+ ++s.count;
+
+ ++iy0;
+ xacc += invslope;
+ }
+ }
+}
+
+void VDPixmapPathRasterizer::ScanConvert(VDPixmapRegion& region) {
+ // Convert the edges to spans. We couldn't do this before because some of
+ // the regions may have winding numbers >+1 and it would have been a pain
+ // to try to adjust the spans on the fly. We use one heap to detangle
+ // a scanline's worth of edges from the singly-linked lists, and another
+ // to collect the actual scans.
+ vdfastvector<int> heap;
+
+ region.mSpans.clear();
+ int xmin = INT_MAX;
+ int xmax = INT_MIN;
+ int ymin = INT_MAX;
+ int ymax = INT_MIN;
+
+ for(int y=mScanYMin; y<mScanYMax; ++y) {
+ uint32 flipcount = mpScanBufferBiased[y].count;
+
+ if (!flipcount)
+ continue;
+
+ // Keep the edge heap from doing lots of stupid little reallocates.
+ if (heap.capacity() < flipcount)
+ heap.resize((flipcount + 63)&~63);
+
+ // Detangle scanline into edge heap.
+ int *heap0 = heap.data();
+ int *heap1 = heap0;
+ for(const Edge *ptr = mpScanBufferBiased[y].chain; ptr; ptr = ptr->next)
+ *heap1++ = ptr->posandflag;
+
+ VDASSERT(heap1 - heap0 == flipcount);
+
+ // Sort edge heap. Note that we conveniently made the opening edges
+ // one more than closing edges at the same spot, so we won't have any
+ // problems with abutting spans.
+
+ std::sort(heap0, heap1);
+
+#if 0
+ while(heap0 != heap1) {
+ int x = *heap0++ >> 1;
+ region.mSpans.push_back((y<<16) + x + 0x80008000);
+ region.mSpans.push_back((y<<16) + x + 0x80008001);
+ }
+ continue;
+#endif
+
+ // Trim any odd edges off, since we can never close on one.
+ if (flipcount & 1)
+ --heap1;
+
+ // Process edges and add spans. Since we only check for a non-zero
+ // winding number, it doesn't matter which way the outlines go. Also, since
+ // the parity always flips after each edge regardless of direction, we can
+ // process the edges in pairs.
+
+ size_t spanstart = region.mSpans.size();
+
+ int x_left;
+ int count = 0;
+ while(heap0 != heap1) {
+ int x = *heap0++;
+
+ if (!count)
+ x_left = (x>>1);
+
+ count += (x&1);
+
+ x = *heap0++;
+
+ count += (x&1);
+
+ if (!--count) {
+ int x_right = (x>>1);
+
+ if (x_right > x_left) {
+ region.mSpans.push_back((y<<16) + x_left + 0x80008000);
+ region.mSpans.push_back((y<<16) + x_right + 0x80008000);
+
+ }
+ }
+ }
+
+ size_t spanend = region.mSpans.size();
+
+ if (spanend > spanstart) {
+ if (ymin > y)
+ ymin = y;
+
+ if (ymax < y)
+ ymax = y;
+
+ int x1 = (region.mSpans[spanstart] & 0xffff) - 0x8000;
+ int x2 = (region.mSpans[spanend-1] & 0xffff) - 0x8000;
+
+ if (xmin > x1)
+ xmin = x1;
+
+ if (xmax < x2)
+ xmax = x2;
+ }
+ }
+
+ if (xmax > xmin) {
+ region.mBounds.set(xmin, ymin, xmax, ymax);
+ } else {
+ region.mBounds.set(0, 0, 0, 0);
+ }
+
+ // Dump the edge and scan buffers, since we no longer need them.
+ ClearEdgeList();
+ ClearScanBuffer();
+}
+
+void VDPixmapPathRasterizer::ClearEdgeList() {
+ if (mpEdgeBlocks) {
+ EdgeBlock *block = mpEdgeBlocks;
+
+ while(EdgeBlock *next = block->next)
+ block = next;
+
+ block->next = mpFreeEdgeBlocks;
+ mpFreeEdgeBlocks = mpEdgeBlocks;
+ mpEdgeBlocks = NULL;
+ }
+
+ mEdgeBlockIdx = kEdgeBlockMax;
+}
+
+void VDPixmapPathRasterizer::FreeEdgeLists() {
+ ClearEdgeList();
+
+ while(EdgeBlock *block = mpFreeEdgeBlocks) {
+ mpFreeEdgeBlocks = block->next;
+
+ delete block;
+ }
+}
+
+void VDPixmapPathRasterizer::ClearScanBuffer() {
+ delete[] mpScanBuffer;
+ mpScanBuffer = mpScanBufferBiased = NULL;
+ mScanYMin = 0;
+ mScanYMax = 0;
+}
+
+void VDPixmapPathRasterizer::ReallocateScanBuffer(int ymin, int ymax) {
+ //
+ // check if there actually is a scan buffer to avoid unintentionally pinning at zero
+ if (mpScanBuffer) {
+ int nicedelta = (mScanYMax - mScanYMin);
+
+ if (ymin < mScanYMin) {
+ int yminnice = mScanYMin - nicedelta;
+ if (ymin > yminnice)
+ ymin = yminnice;
+
+ ymin &= ~31;
+ } else
+ ymin = mScanYMin;
+
+ if (ymax > mScanYMax) {
+ int ymaxnice = mScanYMax + nicedelta;
+ if (ymax < ymaxnice)
+ ymax = ymaxnice;
+
+ ymax = (ymax + 31) & ~31;
+ } else
+ ymax = mScanYMax;
+
+ VDASSERT(ymin <= mScanYMin && ymax >= mScanYMax);
+ }
+
+ // reallocate scan buffer
+ Scan *pNewBuffer = new Scan[ymax - ymin];
+ Scan *pNewBufferBiased = pNewBuffer - ymin;
+
+ if (mpScanBuffer) {
+ memcpy(pNewBufferBiased + mScanYMin, mpScanBufferBiased + mScanYMin, (mScanYMax - mScanYMin) * sizeof(Scan));
+ delete[] mpScanBuffer;
+
+ // zero new areas of scan buffer
+ for(int y=ymin; y<mScanYMin; ++y) {
+ pNewBufferBiased[y].chain = NULL;
+ pNewBufferBiased[y].count = 0;
+ }
+
+ for(int y=mScanYMax; y<ymax; ++y) {
+ pNewBufferBiased[y].chain = NULL;
+ pNewBufferBiased[y].count = 0;
+ }
+ } else {
+ for(int y=ymin; y<ymax; ++y) {
+ pNewBufferBiased[y].chain = NULL;
+ pNewBufferBiased[y].count = 0;
+ }
+ }
+
+ mpScanBuffer = pNewBuffer;
+ mpScanBufferBiased = pNewBufferBiased;
+ mScanYMin = ymin;
+ mScanYMax = ymax;
+}
+
+bool VDPixmapFillRegion(const VDPixmap& dst, const VDPixmapRegion& region, int x, int y, uint32 color) {
+ if (dst.format != nsVDPixmap::kPixFormat_XRGB8888)
+ return false;
+
+ // fast out
+ if (region.mSpans.empty())
+ return true;
+
+ // check if vertical clipping is required
+ const size_t n = region.mSpans.size();
+ uint32 start = 0;
+ uint32 end = n;
+
+ uint32 spanmin = (-x) + ((-y) << 16) + 0x80008000;
+
+ if (region.mSpans.front() < spanmin) {
+ uint32 lo = 0, hi = n;
+
+ // compute top clip
+ while(lo < hi) {
+ int mid = ((lo + hi) >> 1) & ~1;
+
+ if (region.mSpans[mid + 1] < spanmin)
+ lo = mid + 2;
+ else
+ hi = mid;
+ }
+
+ start = lo;
+
+ // check for total top clip
+ if (start >= n)
+ return true;
+ }
+
+ uint32 spanlimit = (dst.w - x) + ((dst.h - y - 1) << 16) + 0x80008000;
+
+ if (region.mSpans.back() > spanlimit) {
+ // compute bottom clip
+ int lo = start;
+ int hi = n;
+
+ while(lo < hi) {
+ int mid = ((lo + hi) >> 1) & ~1;
+
+ if (region.mSpans[mid] >= spanlimit)
+ hi = mid;
+ else
+ lo = mid+2;
+ }
+
+ end = lo;
+
+ // check for total bottom clip
+ if (start >= end)
+ return true;
+ }
+
+ // fill region
+ const uint32 *pSpan = &region.mSpans[start];
+ const uint32 *pEnd = &region.mSpans[0] + end;
+ int lasty = -1;
+ uint32 *dstp;
+
+ for(; pSpan != pEnd; pSpan += 2) {
+ uint32 span0 = pSpan[0];
+ uint32 span1 = pSpan[1];
+
+ uint32 py = (span0 >> 16) - 0x8000 + y;
+ uint32 px = (span0 & 0xffff) - 0x8000 + x;
+ uint32 w = span1-span0;
+
+ VDASSERT(py < (uint32)dst.h);
+ VDASSERT(px < (uint32)dst.w);
+ VDASSERT(dst.w - (int)px >= (int)w);
+
+ if (lasty != py)
+ dstp = (uint32 *)vdptroffset(dst.data, dst.pitch * py);
+
+ uint32 *p = dstp + px;
+ do {
+ *p++ = color;
+ } while(--w);
+ }
+
+ return true;
+}
+
+namespace {
+ void RenderABuffer32(const VDPixmap& dst, int y, const uint8 *alpha, uint32 w, uint32 color) {
+ if (!w)
+ return;
+
+ // update dest pointer
+ uint32 *dstp = (uint32 *)vdptroffset(dst.data, dst.pitch * y);
+
+ const uint32 color_rb = color & 0x00FF00FF;
+ const uint32 color_g = color & 0x0000FF00;
+ do {
+ const uint32 px = *dstp;
+ const uint32 px_rb = px & 0x00FF00FF;
+ const uint32 px_g = px & 0x0000FF00;
+ const sint32 a = *alpha++;
+
+ const uint32 result_rb = (((px_rb << 6) + ((sint32)(color_rb - px_rb)*a + 0x00200020)) & 0x3FC03FC0);
+ const uint32 result_g = (((px_g << 6) + ((sint32)(color_g - px_g )*a + 0x00002000)) & 0x003FC000);
+
+ *dstp++ = (result_rb + result_g) >> 6;
+ } while(--w);
+ }
+
+ void RenderABuffer8(const VDPixmap& dst, int y, const uint8 *alpha, uint32 w, uint32 color) {
+ if (!w)
+ return;
+
+ // update dest pointer
+ uint8 *dstp = (uint8 *)vdptroffset(dst.data, dst.pitch * y);
+
+ do {
+ const uint8 px = *dstp;
+ const sint8 a = *alpha++;
+
+ *dstp++ = px + (((sint32)(color - px) * a + 32) >> 6);
+ } while(--w);
+ }
+
+ void RenderABuffer8_128(const VDPixmap& dst, int y, const uint8 *alpha, uint32 w, uint32 color) {
+ if (!w)
+ return;
+
+ // update dest pointer
+ uint8 *dstp = (uint8 *)vdptroffset(dst.data, dst.pitch * y);
+
+ do {
+ const uint8 px = *dstp;
+ const sint16 a = *alpha++;
+
+ *dstp++ = px + (((sint32)(color - px) * a + 64) >> 7);
+ } while(--w);
+ }
+
+ void RenderABuffer8_256(const VDPixmap& dst, int y, const uint16 *alpha, uint32 w, uint32 color) {
+ if (!w)
+ return;
+
+ // update dest pointer
+ uint8 *dstp = (uint8 *)vdptroffset(dst.data, dst.pitch * y);
+
+ do {
+ const uint8 px = *dstp;
+ const sint32 a = *alpha++;
+
+ *dstp++ = px + (((sint32)(color - px) * a + 128) >> 8);
+ } while(--w);
+ }
+
+ void RenderABuffer8_1024(const VDPixmap& dst, int y, const uint16 *alpha, uint32 w, uint32 color) {
+ if (!w)
+ return;
+
+ // update dest pointer
+ uint8 *dstp = (uint8 *)vdptroffset(dst.data, dst.pitch * y);
+
+ do {
+ const uint8 px = *dstp;
+ const sint32 a = *alpha++;
+
+ *dstp++ = px + (((sint32)(color - px) * a + 512) >> 10);
+ } while(--w);
+ }
+}
+
+bool VDPixmapFillRegionAntialiased_32x_32x(const VDPixmap& dst, const VDPixmapRegion& region, int x, int y, uint32 color) {
+	if (dst.format != nsVDPixmap::kPixFormat_Y8)
+		return false;
+
+	// fast out
+	if (region.mSpans.empty())
+		return true;
+
+	// check if vertical clipping is required
+	const size_t n = region.mSpans.size();
+	uint32 start = 0;
+	uint32 end = n;
+
+	uint32 spanmin = -x + (-y << 16) + 0x80008000;				// spans are (y+0x8000)<<16 | (x+0x8000)
+
+	if (region.mSpans.front() < spanmin) {
+		// find first span : x2 > spanmin
+		start = std::upper_bound(region.mSpans.begin(), region.mSpans.end(), spanmin) - region.mSpans.begin();
+		start &= ~1;											// spans come in (left,right) pairs — keep alignment
+
+		// check for total top clip
+		if (start >= n)
+			return true;
+	}
+
+	uint32 spanlimit = (dst.w*32 - x) + (((dst.h*32 - y) - 1) << 16) + 0x80008000;
+
+	if (region.mSpans.back() > spanlimit) {
+		// find last span : x1 < spanlimit
+		end = std::lower_bound(region.mSpans.begin(), region.mSpans.end(), spanlimit) - region.mSpans.begin();
+
+		end = (end + 1) & ~1;
+
+		// check for total bottom clip
+		if (start >= end)
+			return true;
+	}
+
+	// allocate A-buffer (coverage accumulator, one uint16 per output pixel)
+	vdfastvector<uint16> abuffer(dst.w, 0);
+
+	// fill region
+	const uint32 *pSpan = &region.mSpans[start];
+	const uint32 *pEnd = &region.mSpans[0] + end;
+	int lasty = -1;
+
+	sint32 dstw32 = dst.w * 32;									// region coords are 32x supersampled
+	sint32 dsth32 = dst.h * 32;
+
+	for(; pSpan != pEnd; pSpan += 2) {
+		uint32 span0 = pSpan[0];
+		uint32 span1 = pSpan[1];
+
+		sint32 py = (span0 >> 16) - 0x8000 + y;
+
+		if ((uint32)py >= (uint32)dsth32)
+			continue;
+
+		sint32 px1 = (span0 & 0xffff) - 0x8000 + x;
+		sint32 px2 = (span1 & 0xffff) - 0x8000 + x;
+		sint32 w = span1-span0;									// NOTE(review): unused in this routine
+
+		if (lasty != py) {
+			if (((lasty ^ py) & 0xFFFFFFE0)) {					// crossed a 32-subline boundary: new output row
+				if (lasty >= 0) {
+					// flush scanline
+
+					RenderABuffer8_1024(dst, lasty >> 5, abuffer.data(), dst.w, color);
+				}
+
+				memset(abuffer.data(), 0, abuffer.size() * sizeof(abuffer[0]));
+			}
+			lasty = py;
+		}
+
+		if (px1 < 0)
+			px1 = 0;
+		if (px2 > dstw32)
+			px2 = dstw32;
+
+		if (px1 >= px2)
+			continue;
+
+		uint32 ix1 = px1 >> 5;
+		uint32 ix2 = px2 >> 5;
+		uint16 *p1 = abuffer.data() + ix1;
+		uint16 *p2 = abuffer.data() + ix2;
+
+		if (p1 == p2) {
+			p1[0] += (px2 - px1);								// span lives inside a single pixel
+		} else {
+			if (px1 & 31) {
+				p1[0] += 32 - (px1 & 31);						// partial left pixel
+				++p1;
+			}
+
+			while(p1 != p2) {
+				p1[0] += 32;									// fully covered pixels
+				++p1;
+			}
+
+			if (px2 & 31)
+				p1[0] += px2 & 31;	// FIX: was (px2 & 32) — that added 0 or 32 instead of the 1..31 partial coverage of the right pixel
+		}
+	}
+
+	if (lasty >= 0)
+		RenderABuffer8_1024(dst, lasty >> 5, abuffer.data(), dst.w, color);
+
+	return true;
+}
+
+// Fill a Y8 plane with an antialiased region rasterized at 16x16 subpixel
+// resolution (x/y in 1/16-pixel units).  Same algorithm as the 32x32
+// variant: coverage is accumulated into a 16-bit A-buffer (max 16*16 = 256
+// per pixel) and flushed per output scanline via RenderABuffer8_256.
+// Returns false only for non-Y8 destinations.
+bool VDPixmapFillRegionAntialiased_16x_16x(const VDPixmap& dst, const VDPixmapRegion& region, int x, int y, uint32 color) {
+	if (dst.format != nsVDPixmap::kPixFormat_Y8)
+		return false;
+
+	// fast out
+	if (region.mSpans.empty())
+		return true;
+
+	// check if vertical clipping is required
+	const size_t n = region.mSpans.size();
+	uint32 start = 0;
+	uint32 end = n;
+
+	uint32 spanmin = -x + (-y << 16) + 0x80008000;
+
+	if (region.mSpans.front() < spanmin) {
+		// find first span : x2 > spanmin (round down to an x1/x2 pair boundary)
+		start = std::upper_bound(region.mSpans.begin(), region.mSpans.end(), spanmin) - region.mSpans.begin();
+		start &= ~1;
+
+		// check for total top clip
+		if (start >= n)
+			return true;
+	}
+
+	uint32 spanlimit = (dst.w*16 - x) + (((dst.h*16 - y) - 1) << 16) + 0x80008000;
+
+	if (region.mSpans.back() > spanlimit) {
+		// find last span : x1 < spanlimit
+		end = std::lower_bound(region.mSpans.begin(), region.mSpans.end(), spanlimit) - region.mSpans.begin();
+
+		end = (end + 1) & ~1;
+
+		// check for total bottom clip
+		if (start >= end)
+			return true;
+	}
+
+	// allocate A-buffer (per-output-pixel coverage accumulators)
+	vdfastvector<uint16> abuffer(dst.w, 0);
+
+	// fill region
+	const uint32 *pSpan = &region.mSpans[start];
+	const uint32 *pEnd = &region.mSpans[0] + end;
+	int lasty = -1;
+
+	sint32 dstw16 = dst.w * 16;
+	sint32 dsth16 = dst.h * 16;
+
+	for(; pSpan != pEnd; pSpan += 2) {
+		uint32 span0 = pSpan[0];
+		uint32 span1 = pSpan[1];
+
+		sint32 py = (span0 >> 16) - 0x8000 + y;
+
+		// unsigned compare rejects py < 0 and py >= dsth16 in one test
+		if ((uint32)py >= (uint32)dsth16)
+			continue;
+
+		sint32 px1 = (span0 & 0xffff) - 0x8000 + x;
+		sint32 px2 = (span1 & 0xffff) - 0x8000 + x;
+		sint32 w = span1-span0;
+
+		if (lasty != py) {
+			// flush only when crossing into a new output row (py / 16)
+			if (((lasty ^ py) & 0xFFFFFFF0)) {
+				if (lasty >= 0) {
+					// flush scanline
+
+					RenderABuffer8_256(dst, lasty >> 4, abuffer.data(), dst.w, color);
+				}
+
+				memset(abuffer.data(), 0, abuffer.size() * sizeof(abuffer[0]));
+			}
+			lasty = py;
+		}
+
+		// horizontal clip in subpixel units
+		if (px1 < 0)
+			px1 = 0;
+		if (px2 > dstw16)
+			px2 = dstw16;
+
+		if (px1 >= px2)
+			continue;
+
+		uint32 ix1 = px1 >> 4;
+		uint32 ix2 = px2 >> 4;
+		uint16 *p1 = abuffer.data() + ix1;
+		uint16 *p2 = abuffer.data() + ix2;
+
+		if (p1 == p2) {
+			// span contained within a single output pixel
+			p1[0] += (px2 - px1);
+		} else {
+			// left partial pixel
+			if (px1 & 15) {
+				p1[0] += 16 - (px1 & 15);
+				++p1;
+			}
+
+			// fully covered interior pixels
+			while(p1 != p2) {
+				p1[0] += 16;
+				++p1;
+			}
+
+			// right partial pixel
+			if (px2 & 15)
+				p1[0] += px2 & 15;
+		}
+	}
+
+	// flush the final scanline
+	if (lasty >= 0)
+		RenderABuffer8_256(dst, lasty >> 4, abuffer.data(), dst.w, color);
+
+	return true;
+}
+
+// Fill a plane with an antialiased region rasterized at 16x horizontal by
+// 8x vertical subpixel resolution (x in 1/16 px, y in 1/8 px).  Coverage
+// fits 16*8 = 128 per pixel, so an 8-bit A-buffer suffices; scanlines are
+// composited via RenderABuffer8_128 (defined earlier in this file).
+// Returns false only for unsupported destination formats.
+bool VDPixmapFillRegionAntialiased_16x_8x(const VDPixmap& dst, const VDPixmapRegion& region, int x, int y, uint32 color) {
+	if (dst.format != nsVDPixmap::kPixFormat_XRGB8888 && dst.format != nsVDPixmap::kPixFormat_Y8)
+		return false;
+
+	// fast out
+	if (region.mSpans.empty())
+		return true;
+
+	// check if vertical clipping is required
+	const size_t n = region.mSpans.size();
+	uint32 start = 0;
+	uint32 end = n;
+
+	uint32 spanmin = -x + (-y << 16) + 0x80008000;
+
+	if (region.mSpans.front() < spanmin) {
+		// find first span : x2 > spanmin (round down to an x1/x2 pair boundary)
+		start = std::upper_bound(region.mSpans.begin(), region.mSpans.end(), spanmin) - region.mSpans.begin();
+		start &= ~1;
+
+		// check for total top clip
+		if (start >= n)
+			return true;
+	}
+
+	uint32 spanlimit = (dst.w*16 - x) + (((dst.h*8 - y) - 1) << 16) + 0x80008000;
+
+	if (region.mSpans.back() > spanlimit) {
+		// find last span : x1 < spanlimit
+		end = std::lower_bound(region.mSpans.begin(), region.mSpans.end(), spanlimit) - region.mSpans.begin();
+
+		end = (end + 1) & ~1;
+
+		// check for total bottom clip
+		if (start >= end)
+			return true;
+	}
+
+	// allocate A-buffer (8-bit accumulators; max coverage 128 cannot overflow)
+	vdfastvector<uint8> abuffer(dst.w, 0);
+
+	// fill region
+	const uint32 *pSpan = &region.mSpans[start];
+	const uint32 *pEnd = &region.mSpans[0] + end;
+	int lasty = -1;
+
+	sint32 dstw16 = dst.w * 16;
+	sint32 dsth8 = dst.h * 8;
+
+	for(; pSpan != pEnd; pSpan += 2) {
+		uint32 span0 = pSpan[0];
+		uint32 span1 = pSpan[1];
+
+		sint32 py = (span0 >> 16) - 0x8000 + y;
+
+		// unsigned compare rejects py < 0 and py >= dsth8 in one test
+		if ((uint32)py >= (uint32)dsth8)
+			continue;
+
+		sint32 px1 = (span0 & 0xffff) - 0x8000 + x;
+		sint32 px2 = (span1 & 0xffff) - 0x8000 + x;
+		sint32 w = span1-span0;
+
+		if (lasty != py) {
+			// flush only when crossing into a new output row (py / 8)
+			if (((lasty ^ py) & 0xFFFFFFF8)) {
+				if (lasty >= 0) {
+					// flush scanline
+
+					RenderABuffer8_128(dst, lasty >> 3, abuffer.data(), dst.w, color);
+				}
+
+				memset(abuffer.data(), 0, abuffer.size());
+			}
+			lasty = py;
+		}
+
+		// horizontal clip in subpixel units
+		if (px1 < 0)
+			px1 = 0;
+		if (px2 > dstw16)
+			px2 = dstw16;
+
+		if (px1 >= px2)
+			continue;
+
+		uint32 ix1 = px1 >> 4;
+		uint32 ix2 = px2 >> 4;
+		uint8 *p1 = abuffer.data() + ix1;
+		uint8 *p2 = abuffer.data() + ix2;
+
+		if (p1 == p2) {
+			// span contained within a single output pixel
+			p1[0] += (px2 - px1);
+		} else {
+			// left partial pixel
+			if (px1 & 15) {
+				p1[0] += 16 - (px1 & 15);
+				++p1;
+			}
+
+			// fully covered interior pixels
+			while(p1 != p2) {
+				p1[0] += 16;
+				++p1;
+			}
+
+			// right partial pixel
+			if (px2 & 15)
+				p1[0] += px2 & 15;
+		}
+	}
+
+	// flush the final scanline
+	if (lasty >= 0)
+		RenderABuffer8_128(dst, lasty >> 3, abuffer.data(), dst.w, color);
+
+	return true;
+}
+
+// Fill a pixmap with an antialiased region rasterized at 8x8 subpixel
+// resolution (x/y in 1/8-pixel units).  For planar YUV formats the call is
+// decomposed: the Y plane is filled at 8x8, and the chroma planes are
+// filled through the higher-subsampling variants (32x32 for 4:1:0, 16x16
+// for 4:2:0, 16x8 for 4:2:2) so the *same* region can be reused at reduced
+// plane resolution.  Direct rasterization supports XRGB8888 and Y8 only;
+// other formats return false.
+bool VDPixmapFillRegionAntialiased8x(const VDPixmap& dst, const VDPixmapRegion& region, int x, int y, uint32 color) {
+	if (dst.format == nsVDPixmap::kPixFormat_YUV444_Planar ||
+		dst.format == nsVDPixmap::kPixFormat_YUV422_Planar ||
+		dst.format == nsVDPixmap::kPixFormat_YUV420_Planar ||
+		dst.format == nsVDPixmap::kPixFormat_YUV410_Planar) {
+		// wrap each plane as a standalone Y8 pixmap
+		VDPixmap pxY;
+		VDPixmap pxCb;
+		VDPixmap pxCr;
+
+		pxY.format = nsVDPixmap::kPixFormat_Y8;
+		pxY.data = dst.data;
+		pxY.pitch = dst.pitch;
+		pxY.w = dst.w;
+		pxY.h = dst.h;
+
+		pxCb.format = nsVDPixmap::kPixFormat_Y8;
+		pxCb.data = dst.data2;
+		pxCb.pitch = dst.pitch2;
+		pxCb.w = dst.w;
+		pxCb.h = dst.h;
+
+		pxCr.format = nsVDPixmap::kPixFormat_Y8;
+		pxCr.data = dst.data3;
+		pxCr.pitch = dst.pitch3;
+		pxCr.w = dst.w;
+		pxCr.h = dst.h;
+
+		// color is packed as (Cr << 16) | (Y << 8) | Cb
+		uint32 colorY = (color >> 8) & 0xff;
+		uint32 colorCb = (color >> 0) & 0xff;
+		uint32 colorCr = (color >> 16) & 0xff;
+
+		VDPixmapFillRegionAntialiased8x(pxY, region, x, y, colorY);
+
+		switch(dst.format) {
+		case nsVDPixmap::kPixFormat_YUV410_Planar:
+			// chroma plane is 1/4 size; 8x subpixels become 32x in chroma space
+			pxCr.w = pxCb.w = dst.w >> 2;
+			pxCr.h = pxCb.h = dst.h >> 2;
+			x >>= 2;
+			y >>= 2;
+			VDPixmapFillRegionAntialiased_32x_32x(pxCb, region, x, y, colorCb);
+			VDPixmapFillRegionAntialiased_32x_32x(pxCr, region, x, y, colorCr);
+			return true;
+		case nsVDPixmap::kPixFormat_YUV420_Planar:
+			// chroma plane is 1/2 size both ways; 8x subpixels become 16x
+			pxCr.w = pxCb.w = dst.w >> 1;
+			pxCr.h = pxCb.h = dst.h >> 1;
+			x >>= 1;
+			y >>= 1;
+			// NOTE(review): +2 in 1/16-pel units = 1/8 chroma pixel shift --
+			// presumably compensates horizontal chroma siting; confirm against
+			// upstream VirtualDub before changing.
+			x += 2;
+			VDPixmapFillRegionAntialiased_16x_16x(pxCb, region, x, y, colorCb);
+			VDPixmapFillRegionAntialiased_16x_16x(pxCr, region, x, y, colorCr);
+			return true;
+		case nsVDPixmap::kPixFormat_YUV422_Planar:
+			// chroma plane is 1/2 width only; vertical stays at 8x
+			pxCr.w = pxCb.w = dst.w >> 1;
+			x >>= 1;
+			// NOTE(review): same chroma-siting shift as the 4:2:0 case above
+			x += 2;
+			VDPixmapFillRegionAntialiased_16x_8x(pxCb, region, x, y, colorCb);
+			VDPixmapFillRegionAntialiased_16x_8x(pxCr, region, x, y, colorCr);
+			return true;
+		case nsVDPixmap::kPixFormat_YUV444_Planar:
+			// full-resolution chroma: recurse at the same 8x scale
+			VDPixmapFillRegionAntialiased8x(pxCb, region, x, y, colorCb);
+			VDPixmapFillRegionAntialiased8x(pxCr, region, x, y, colorCr);
+			return true;
+		}
+	}
+
+	if (dst.format != nsVDPixmap::kPixFormat_XRGB8888 && dst.format != nsVDPixmap::kPixFormat_Y8)
+		return false;
+
+	// fast out
+	if (region.mSpans.empty())
+		return true;
+
+	// check if vertical clipping is required
+	const size_t n = region.mSpans.size();
+	uint32 start = 0;
+	uint32 end = n;
+
+	uint32 spanmin = -x + (-y << 16) + 0x80008000;
+
+	if (region.mSpans.front() < spanmin) {
+		// find first span : x2 > spanmin (round down to an x1/x2 pair boundary)
+		start = std::upper_bound(region.mSpans.begin(), region.mSpans.end(), spanmin) - region.mSpans.begin();
+		start &= ~1;
+
+		// check for total top clip
+		if (start >= n)
+			return true;
+	}
+
+	uint32 spanlimit = (dst.w*8 - x) + (((dst.h*8 - y) - 1) << 16) + 0x80008000;
+
+	if (region.mSpans.back() > spanlimit) {
+		// find last span : x1 < spanlimit
+		end = std::lower_bound(region.mSpans.begin(), region.mSpans.end(), spanlimit) - region.mSpans.begin();
+
+		end = (end + 1) & ~1;
+
+		// check for total bottom clip
+		if (start >= end)
+			return true;
+	}
+
+	// allocate A-buffer (8-bit accumulators; max coverage 8*8 = 64)
+	vdfastvector<uint8> abuffer(dst.w, 0);
+
+	// fill region
+	const uint32 *pSpan = &region.mSpans[start];
+	const uint32 *pEnd = &region.mSpans[0] + end;
+	int lasty = -1;
+
+	sint32 dstw8 = dst.w * 8;
+	sint32 dsth8 = dst.h * 8;
+
+	for(; pSpan != pEnd; pSpan += 2) {
+		uint32 span0 = pSpan[0];
+		uint32 span1 = pSpan[1];
+
+		sint32 py = (span0 >> 16) - 0x8000 + y;
+
+		// unsigned compare rejects py < 0 and py >= dsth8 in one test
+		if ((uint32)py >= (uint32)dsth8)
+			continue;
+
+		sint32 px1 = (span0 & 0xffff) - 0x8000 + x;
+		sint32 px2 = (span1 & 0xffff) - 0x8000 + x;
+		sint32 w = span1-span0;
+
+		if (lasty != py) {
+			// flush only when crossing into a new output row (py / 8)
+			if (((lasty ^ py) & 0xFFFFFFF8)) {
+				if (lasty >= 0) {
+					// flush scanline
+
+					if (dst.format == nsVDPixmap::kPixFormat_XRGB8888)
+						RenderABuffer32(dst, lasty >> 3, abuffer.data(), dst.w, color);
+					else
+						RenderABuffer8(dst, lasty >> 3, abuffer.data(), dst.w, color);
+				}
+
+				memset(abuffer.data(), 0, abuffer.size());
+			}
+			lasty = py;
+		}
+
+		// horizontal clip in subpixel units
+		if (px1 < 0)
+			px1 = 0;
+		if (px2 > dstw8)
+			px2 = dstw8;
+
+		if (px1 >= px2)
+			continue;
+
+		uint32 ix1 = px1 >> 3;
+		uint32 ix2 = px2 >> 3;
+		uint8 *p1 = abuffer.data() + ix1;
+		uint8 *p2 = abuffer.data() + ix2;
+
+		if (p1 == p2) {
+			// span contained within a single output pixel
+			p1[0] += (px2 - px1);
+		} else {
+			// left partial pixel
+			if (px1 & 7) {
+				p1[0] += 8 - (px1 & 7);
+				++p1;
+			}
+
+			// fully covered interior pixels
+			while(p1 != p2) {
+				p1[0] += 8;
+				++p1;
+			}
+
+			// right partial pixel
+			if (px2 & 7)
+				p1[0] += px2 & 7;
+		}
+	}
+
+	// flush the final scanline
+	if (lasty >= 0) {
+		if (dst.format == nsVDPixmap::kPixFormat_XRGB8888)
+			RenderABuffer32(dst, lasty >> 3, abuffer.data(), dst.w, color);
+		else
+			RenderABuffer8(dst, lasty >> 3, abuffer.data(), dst.w, color);
+	}
+
+	return true;
+}
+
+// Build a filled disc of radius 'r' (in span units) centered at the origin.
+// One span pair is emitted per row y in [-ir, ir]; spans are packed as
+// 0x80008000-biased (y << 16) + x values, with x2 stored exclusive (+1).
+void VDPixmapCreateRoundRegion(VDPixmapRegion& dst, float r) {
+	int ir = VDCeilToInt(r);
+	float r2 = r*r;
+
+	dst.mSpans.clear();
+	// left/right bounds are known up front; top/bottom are refined below
+	dst.mBounds.set(-ir, 0, ir+1, 0);
+
+	for(int y = -ir; y <= ir; ++y) {
+		// half-width of the disc at this row
+		int dx = VDCeilToInt(sqrtf(r2 - y*y));
+
+		if (dx > 0) {
+			dst.mSpans.push_back(0x80008000 + (y << 16) - dx);
+			dst.mSpans.push_back(0x80008001 + (y << 16) + dx);
+			if (dst.mBounds.top > y)
+				dst.mBounds.top = y;
+			if (dst.mBounds.bottom < y)
+				dst.mBounds.bottom = y;
+		}
+	}
+}
+
+// Merge region r1 with a translated/widened copy of r2 into dst:
+// each r2 span [x1,x2] becomes [x1+dx1, x2+dx2] shifted down by dy, and the
+// result is the union of that with r1, written as non-overlapping sorted
+// spans.  r1's spans are assumed non-overlapping; r2's widened spans may
+// overlap each other and are coalesced on the fly.  The output size is
+// bounded by |r1| + |r2|, which the initial resize reserves.
+void VDPixmapConvolveRegion(VDPixmapRegion& dst, const VDPixmapRegion& r1, const VDPixmapRegion& r2, int dx1, int dx2, int dy) {
+	dst.mSpans.clear();
+	dst.mSpans.resize(r1.mSpans.size()+r2.mSpans.size());
+
+	const uint32 *itA = r1.mSpans.data();
+	const uint32 *itAE = itA + r1.mSpans.size();
+	const uint32 *itB = r2.mSpans.data();
+	const uint32 *itBE = itB + r2.mSpans.size();
+	uint32 *dstp0 = dst.mSpans.data();
+	uint32 *dstp = dst.mSpans.data();
+
+	// offsets applied to r2's span starts (offset1) and ends (offset2)
+	uint32 offset1 = (dy<<16) + dx1;
+	uint32 offset2 = (dy<<16) + dx2;
+
+	while(itA != itAE && itB != itBE) {
+		uint32 x1;
+		uint32 x2;
+
+		if (itB[0] + offset1 < itA[0]) {
+			// B span is earlier. Use it.
+			x1 = itB[0] + offset1;
+			x2 = itB[1] + offset2;
+			itB += 2;
+
+			// B spans *can* overlap, due to the widening.
+			while(itB != itBE && itB[0]+offset1 <= x2) {
+				uint32 bx2 = itB[1] + offset2;
+				if (x2 < bx2)
+					x2 = bx2;
+
+				itB += 2;
+			}
+
+			// jump into the merge loop at the A-side check
+			goto a_start;
+		} else {
+			// A span is earlier. Use it.
+			x1 = itA[0];
+			x2 = itA[1];
+			itA += 2;
+
+			// A spans don't overlap, so begin merge loop with B first.
+		}
+
+		// alternately absorb B and A spans that overlap the growing [x1,x2]
+		for(;;) {
+			// If we run out of B spans or the B span doesn't overlap,
+			// then the next A span can't either (because A spans don't
+			// overlap) and we exit.
+
+			if (itB == itBE || itB[0]+offset1 > x2)
+				break;
+
+			do {
+				uint32 bx2 = itB[1] + offset2;
+				if (x2 < bx2)
+					x2 = bx2;
+
+				itB += 2;
+			} while(itB != itBE && itB[0]+offset1 <= x2);
+
+			// If we run out of A spans or the A span doesn't overlap,
+			// then the next B span can't either, because we would have
+			// consumed all overlapping B spans in the above loop.
+a_start:
+			if (itA == itAE || itA[0] > x2)
+				break;
+
+			do {
+				uint32 ax2 = itA[1];
+				if (x2 < ax2)
+					x2 = ax2;
+
+				itA += 2;
+			} while(itA != itAE && itA[0] <= x2);
+		}
+
+		// Flush span.
+		dstp[0] = x1;
+		dstp[1] = x2;
+		dstp += 2;
+	}
+
+	// Copy over leftover spans.  Remaining A spans are already coalesced.
+	memcpy(dstp, itA, sizeof(uint32)*(itAE - itA));
+	dstp += itAE - itA;
+
+	while(itB != itBE) {
+		// B span is earlier. Use it.
+		uint32 x1 = itB[0] + offset1;
+		uint32 x2 = itB[1] + offset2;
+		itB += 2;
+
+		// B spans *can* overlap, due to the widening.
+		while(itB != itBE && itB[0]+offset1 <= x2) {
+			uint32 bx2 = itB[1] + offset2;
+			if (x2 < bx2)
+				x2 = bx2;
+
+			itB += 2;
+		}
+
+		dstp[0] = x1;
+		dstp[1] = x2;
+		dstp += 2;
+	}
+
+	// shrink to the number of spans actually produced
+	dst.mSpans.resize(dstp - dst.mSpans.data());
+}
+
+// Morphological dilation: convolve region r1 by structuring element r2.
+// Each span of r2 contributes one translated/widened union pass; dst is
+// built incrementally by ping-ponging with a temporary region, so cost is
+// O(|r2| spans) passes over the growing result.
+void VDPixmapConvolveRegion(VDPixmapRegion& dst, const VDPixmapRegion& r1, const VDPixmapRegion& r2) {
+	VDPixmapRegion temp;
+
+	const uint32 *src1 = r2.mSpans.data();
+	const uint32 *src2 = src1 + r2.mSpans.size();
+
+	dst.mSpans.clear();
+	while(src1 != src2) {
+		// unpack one structuring-element span (0x80008000-biased)
+		uint32 p1 = src1[0];
+		uint32 p2 = src1[1];
+		src1 += 2;
+
+		// accumulate: dst = (previous dst) union (r1 shifted by this span)
+		temp.mSpans.swap(dst.mSpans);
+		VDPixmapConvolveRegion(dst, temp, r1, (p1 & 0xffff) - 0x8000, (p2 & 0xffff) - 0x8000, (p1 >> 16) - 0x8000);
+	}
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/resample.cpp b/src/thirdparty/VirtualDub/Kasumi/source/resample.cpp
new file mode 100644
index 000000000..4d1aef5f5
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/resample.cpp
@@ -0,0 +1,348 @@
+// VirtualDub - Video processing and capture application
+// Graphics support library
+// Copyright (C) 1998-2004 Avery Lee
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+#include <float.h>
+#include <math.h>
+#include <vd2/system/vdalloc.h>
+#include <vd2/system/vdstl.h>
+#include <vd2/system/memory.h>
+#include <vd2/system/math.h>
+#include <vd2/system/cpuaccel.h>
+#include <vd2/Kasumi/pixmap.h>
+#include <vd2/Kasumi/pixmaputils.h>
+#include <vd2/Kasumi/resample.h>
+#include "uberblit_gen.h"
+
+///////////////////////////////////////////////////////////////////////////
+//
+// the resampler (finally)
+//
+///////////////////////////////////////////////////////////////////////////
+
+// Concrete IVDPixmapResampler.  Init() compiles uberblit pipelines -- one
+// for the primary plane and, for planar YUV formats, a second shared by
+// both chroma planes -- which Process() then replays for each frame.
+class VDPixmapResampler : public IVDPixmapResampler {
+public:
+	VDPixmapResampler();
+	~VDPixmapResampler();
+
+	// A = bicubic spline coefficient (default -0.6); takes effect at next Init()
+	void SetSplineFactor(double A) { mSplineFactor = A; }
+	void SetFilters(FilterMode h, FilterMode v, bool interpolationOnly);
+	bool Init(uint32 dw, uint32 dh, int dstformat, uint32 sw, uint32 sh, int srcformat);
+	bool Init(const vdrect32f& dstrect, uint32 dw, uint32 dh, int dstformat, const vdrect32f& srcrect, uint32 sw, uint32 sh, int srcformat);
+	void Shutdown();
+
+	void Process(const VDPixmap& dst, const VDPixmap& src);
+
+protected:
+	void ApplyFilters(VDPixmapUberBlitterGenerator& gen, uint32 dw, uint32 dh, float xoffset, float yoffset, float xfactor, float yfactor);
+
+	vdautoptr<IVDPixmapBlitter> mpBlitter;	// plane 0 (luma / packed) blitter
+	vdautoptr<IVDPixmapBlitter> mpBlitter2;	// chroma-plane blitter (planar formats only)
+	double mSplineFactor;
+	FilterMode mFilterH;
+	FilterMode mFilterV;
+	bool mbInterpOnly;
+
+	vdrect32 mDstRectPlane0;	// integral dest rect for plane 0
+	vdrect32 mDstRectPlane12;	// integral dest rect for planes 1/2
+};
+
+// Factory: caller owns the returned resampler.
+IVDPixmapResampler *VDCreatePixmapResampler() { return new VDPixmapResampler; }
+
+// Defaults: bicubic in both axes with A = -0.6, full filtering mode.
+VDPixmapResampler::VDPixmapResampler()
+	: mSplineFactor(-0.6)
+	, mFilterH(kFilterCubic)
+	, mFilterV(kFilterCubic)
+	, mbInterpOnly(false)
+{
+}
+
+VDPixmapResampler::~VDPixmapResampler() {
+	Shutdown();
+}
+
+// Select horizontal/vertical filter modes; takes effect at next Init().
+// interpolationOnly disables the decimation-aware filter scaling.
+void VDPixmapResampler::SetFilters(FilterMode h, FilterMode v, bool interpolationOnly) {
+	mFilterH = h;
+	mFilterV = v;
+	mbInterpOnly = interpolationOnly;
+}
+
+// Convenience overload: resample the full source surface onto the full
+// destination surface (no sub-rect cropping).
+bool VDPixmapResampler::Init(uint32 dw, uint32 dh, int dstformat, uint32 sw, uint32 sh, int srcformat) {
+	const vdrect32f srcRect(0.0f, 0.0f, (float)sw, (float)sh);
+	const vdrect32f dstRect(0.0f, 0.0f, (float)dw, (float)dh);
+
+	return Init(dstRect, dw, dh, dstformat, srcRect, sw, sh, srcformat);
+}
+
+// Full Init: map srcrect onto dstrect within a dw x dh destination of
+// 'dstformat'.  Handles flipped rects, clips the destination rect to the
+// surface, derives half-pel-correct sampling offsets, and compiles the
+// uberblit pipeline(s).  Formats must match and be one of the supported
+// set; returns false on unsupported formats or pipeline build failure.
+bool VDPixmapResampler::Init(const vdrect32f& dstrect0, uint32 dw, uint32 dh, int dstformat, const vdrect32f& srcrect0, uint32 sw, uint32 sh, int srcformat) {
+	Shutdown();
+
+	if (dstformat != srcformat || (
+		srcformat != nsVDPixmap::kPixFormat_XRGB8888 &&
+		srcformat != nsVDPixmap::kPixFormat_Y8 &&
+		srcformat != nsVDPixmap::kPixFormat_YUV444_Planar &&
+		srcformat != nsVDPixmap::kPixFormat_YUV422_Planar &&
+		srcformat != nsVDPixmap::kPixFormat_YUV420_Planar &&
+		srcformat != nsVDPixmap::kPixFormat_YUV411_Planar &&
+		srcformat != nsVDPixmap::kPixFormat_YUV410_Planar
+		))
+		return false;
+
+	// convert destination flips to source flips
+	vdrect32f dstrect(dstrect0);
+	vdrect32f srcrect(srcrect0);
+
+	if (dstrect.left > dstrect.right) {
+		std::swap(dstrect.left, dstrect.right);
+		std::swap(srcrect.left, srcrect.right);
+	}
+
+	if (dstrect.top > dstrect.bottom) {
+		std::swap(dstrect.top, dstrect.bottom);
+		std::swap(srcrect.top, srcrect.bottom);
+	}
+
+	// compute source step factors
+	// NOTE(review): assumes dstrect has nonzero extent -- a degenerate rect
+	// divides by zero here; confirm callers guarantee this.
+	float xfactor = (float)srcrect.width() / (float)dstrect.width();
+	float yfactor = (float)srcrect.height() / (float)dstrect.height();
+
+	// clip destination rect (adjusting the source rect proportionally)
+	if (dstrect.left < 0) {
+		float clipx1 = -dstrect.left;
+		srcrect.left += xfactor * clipx1;
+		dstrect.left = 0.0f;
+	}
+
+	if (dstrect.top < 0) {
+		float clipy1 = -dstrect.top;
+		srcrect.top += yfactor * clipy1;
+		dstrect.top = 0.0f;
+	}
+
+	float clipx2 = dstrect.right - (float)dw;
+	if (clipx2 > 0) {
+		srcrect.right -= xfactor * clipx2;
+		dstrect.right = (float)dw;
+	}
+
+	float clipy2 = dstrect.bottom - (float)dh;
+	if (clipy2 > 0) {
+		srcrect.bottom -= yfactor * clipy2;
+		dstrect.bottom = (float)dh;
+	}
+
+	// compute plane 0 dest rect in integral quanta (round to pixel centers)
+	const VDPixmapFormatInfo& formatInfo = VDPixmapGetInfo(dstformat);
+	mDstRectPlane0.left = VDCeilToInt(dstrect.left - 0.5f);
+	mDstRectPlane0.top = VDCeilToInt(dstrect.top - 0.5f);
+	mDstRectPlane0.right = VDCeilToInt(dstrect.right - 0.5f);
+	mDstRectPlane0.bottom = VDCeilToInt(dstrect.bottom - 0.5f);
+
+	// compute plane 0 stepping parameters (source position of the first
+	// destination pixel center)
+	float xoffset = (((float)mDstRectPlane0.left + 0.5f) - dstrect.left) * xfactor + srcrect.left;
+	float yoffset = (((float)mDstRectPlane0.top + 0.5f) - dstrect.top ) * yfactor + srcrect.top;
+
+	// compute plane 1/2 dest rect and stepping parameters
+	float xoffset2 = 0.0f;
+	float yoffset2 = 0.0f;
+
+	if (formatInfo.auxbufs > 0) {
+		float xf2 = (float)(1 << formatInfo.auxwbits);
+		float yf2 = (float)(1 << formatInfo.auxhbits);
+		float invxf2 = 1.0f / xf2;
+		float invyf2 = 1.0f / yf2;
+
+		// convert source and dest rects to plane 1/2 space
+		vdrect32f srcrect2(srcrect);
+		vdrect32f dstrect2(dstrect);
+
+		srcrect2.scale(invxf2, invyf2);
+		dstrect2.scale(invxf2, invyf2);
+
+		// per-format chroma siting adjustments (horizontal co-siting)
+		switch(srcformat) {
+			case nsVDPixmap::kPixFormat_YUV444_Planar:
+				break;
+			case nsVDPixmap::kPixFormat_YUV422_Planar:
+				srcrect2.translate(0.25f, 0.0f);
+				dstrect2.translate(0.25f, 0.0f);
+				break;
+			case nsVDPixmap::kPixFormat_YUV420_Planar:
+				srcrect2.translate(0.25f, 0.0f);
+				dstrect2.translate(0.25f, 0.0f);
+				break;
+			case nsVDPixmap::kPixFormat_YUV411_Planar:
+				srcrect2.translate(0.375f, 0.0f);
+				dstrect2.translate(0.375f, 0.0f);
+				break;
+			case nsVDPixmap::kPixFormat_YUV410_Planar:
+				break;
+			default:
+				VDASSERT(false);
+		}
+
+		mDstRectPlane12.left = VDCeilToInt(dstrect2.left - 0.5f);
+		mDstRectPlane12.top = VDCeilToInt(dstrect2.top - 0.5f);
+		mDstRectPlane12.right = VDCeilToInt(dstrect2.right - 0.5f);
+		mDstRectPlane12.bottom = VDCeilToInt(dstrect2.bottom - 0.5f);
+
+		// xfactor/yfactor are unchanged: both rects scaled by the same amount
+		xoffset2 = (((float)mDstRectPlane12.left + 0.5f) - dstrect2.left) * xfactor + srcrect2.left;
+		yoffset2 = (((float)mDstRectPlane12.top + 0.5f) - dstrect2.top ) * yfactor + srcrect2.top;
+	}
+
+	// build the plane-0 pipeline, plus a shared chroma pipeline for planar YUV
+	VDPixmapUberBlitterGenerator gen;
+
+	switch(srcformat) {
+		case nsVDPixmap::kPixFormat_XRGB8888:
+			gen.ldsrc(0, 0, 0, 0, sw, sh, VDPixmapGetFormatTokenFromFormat(srcformat), sw*4);
+			ApplyFilters(gen, mDstRectPlane0.width(), mDstRectPlane0.height(), xoffset, yoffset, xfactor, yfactor);
+			break;
+
+		case nsVDPixmap::kPixFormat_Y8:
+			gen.ldsrc(0, 0, 0, 0, sw, sh, kVDPixType_8, sw);
+			ApplyFilters(gen, mDstRectPlane0.width(), mDstRectPlane0.height(), xoffset, yoffset, xfactor, yfactor);
+			break;
+
+		case nsVDPixmap::kPixFormat_YUV444_Planar:
+		case nsVDPixmap::kPixFormat_YUV422_Planar:
+		case nsVDPixmap::kPixFormat_YUV420_Planar:
+		case nsVDPixmap::kPixFormat_YUV411_Planar:
+		case nsVDPixmap::kPixFormat_YUV410_Planar:
+			gen.ldsrc(0, 0, 0, 0, sw, sh, kVDPixType_8, sw);
+			ApplyFilters(gen, mDstRectPlane0.width(), mDstRectPlane0.height(), xoffset, yoffset, xfactor, yfactor);
+
+			{
+				// subsampled chroma plane dimensions (round up)
+				const VDPixmapFormatInfo& info = VDPixmapGetInfo(dstformat);
+				uint32 subsw = -(-(sint32)sw >> info.auxwbits);
+				uint32 subsh = -(-(sint32)sh >> info.auxhbits);
+
+				VDPixmapUberBlitterGenerator gen2;
+				gen2.ldsrc(0, 0, 0, 0, subsw, subsh, kVDPixType_8, subsw);
+				ApplyFilters(gen2, mDstRectPlane12.width(), mDstRectPlane12.height(), xoffset2, yoffset2, xfactor, yfactor);
+				mpBlitter2 = gen2.create();
+				if (!mpBlitter2)
+					return false;
+			}
+			break;
+	}
+
+	mpBlitter = gen.create();
+	if (!mpBlitter)
+		return false;
+
+	return true;
+}
+
+// Release both compiled blitters (vdautoptr frees on reassignment).
+void VDPixmapResampler::Shutdown() {
+	mpBlitter = NULL;
+	mpBlitter2 = NULL;
+}
+
+// Execute the pipelines compiled by Init().  dst/src must have the format
+// and dimensions Init() was given; a no-op if Init() was never called or
+// failed.  For planar YUV, the chroma planes are wrapped as Y8 pixmaps and
+// run through the shared chroma blitter.
+void VDPixmapResampler::Process(const VDPixmap& dst, const VDPixmap& src) {
+	if (!mpBlitter)
+		return;
+
+	switch(dst.format) {
+		case nsVDPixmap::kPixFormat_XRGB8888:
+		case nsVDPixmap::kPixFormat_Y8:
+			mpBlitter->Blit(dst, &mDstRectPlane0, src);
+			break;
+
+		case nsVDPixmap::kPixFormat_YUV444_Planar:
+		case nsVDPixmap::kPixFormat_YUV422_Planar:
+		case nsVDPixmap::kPixFormat_YUV420_Planar:
+		case nsVDPixmap::kPixFormat_YUV411_Planar:
+		case nsVDPixmap::kPixFormat_YUV410_Planar:
+			// blit primary plane
+			mpBlitter->Blit(dst, &mDstRectPlane0, src);
+
+			// slice and blit secondary planes
+			{
+				const VDPixmapFormatInfo& formatInfo = VDPixmapGetInfo(dst.format);
+				VDPixmap pxdst;
+				pxdst.format = nsVDPixmap::kPixFormat_Y8;
+				// -(-w >> bits) == ceil(w / 2^bits)
+				pxdst.w = -(-dst.w >> formatInfo.auxwbits);
+				pxdst.h = -(-dst.h >> formatInfo.auxhbits);
+				pxdst.pitch = dst.pitch2;
+				pxdst.data = dst.data2;
+
+				VDPixmap pxsrc;
+				pxsrc.format = nsVDPixmap::kPixFormat_Y8;
+				pxsrc.w = -(-src.w >> formatInfo.auxwbits);
+				pxsrc.h = -(-src.h >> formatInfo.auxhbits);
+				pxsrc.pitch = src.pitch2;
+				pxsrc.data = src.data2;
+
+				// Cb plane
+				mpBlitter2->Blit(pxdst, &mDstRectPlane12, pxsrc);
+
+				// Cr plane: same geometry, third data/pitch pair
+				pxdst.pitch = dst.pitch3;
+				pxdst.data = dst.data3;
+				pxsrc.pitch = src.pitch3;
+				pxsrc.data = src.data3;
+				mpBlitter2->Blit(pxdst, &mDstRectPlane12, pxsrc);
+			}
+			break;
+	}
+}
+
+// Append the configured horizontal then vertical filter stages to the
+// pipeline generator for a dw x dh output, with the given source offsets
+// (source position of the first output pixel center) and step factors.
+void VDPixmapResampler::ApplyFilters(VDPixmapUberBlitterGenerator& gen, uint32 dw, uint32 dh, float xoffset, float yoffset, float xfactor, float yfactor) {
+	switch(mFilterH) {
+		case kFilterPoint:
+			gen.pointh(xoffset, xfactor, dw);
+			break;
+
+		case kFilterLinear:
+			gen.linearh(xoffset, xfactor, dw, mbInterpOnly);
+			break;
+
+		case kFilterCubic:
+			gen.cubich(xoffset, xfactor, dw, (float)mSplineFactor, mbInterpOnly);
+			break;
+
+		case kFilterLanczos3:
+			gen.lanczos3h(xoffset, xfactor, dw);
+			break;
+	}
+
+	switch(mFilterV) {
+		case kFilterPoint:
+			gen.pointv(yoffset, yfactor, dh);
+			break;
+
+		case kFilterLinear:
+			gen.linearv(yoffset, yfactor, dh, mbInterpOnly);
+			break;
+
+		case kFilterCubic:
+			gen.cubicv(yoffset, yfactor, dh, (float)mSplineFactor, mbInterpOnly);
+			break;
+
+		case kFilterLanczos3:
+			gen.lanczos3v(yoffset, yfactor, dh);
+			break;
+	}
+}
+
+// Convenience wrapper: resample src into dst using the same filter on both
+// axes.  Returns false if the format pair is unsupported or the pipeline
+// could not be built.
+bool VDPixmapResample(const VDPixmap& dst, const VDPixmap& src, IVDPixmapResampler::FilterMode filter) {
+	VDPixmapResampler resampler;
+
+	resampler.SetFilters(filter, filter, false);
+
+	if (resampler.Init(dst.w, dst.h, dst.format, src.w, src.h, src.format)) {
+		resampler.Process(dst, src);
+		return true;
+	}
+
+	return false;
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/resample_kernels.cpp b/src/thirdparty/VirtualDub/Kasumi/source/resample_kernels.cpp
new file mode 100644
index 000000000..010364e1a
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/resample_kernels.cpp
@@ -0,0 +1,255 @@
+#include <math.h>
+#include <vd2/Kasumi/resample_kernels.h>
+
+///////////////////////////////////////////////////////////////////////////
+//
+// utility functions
+//
+///////////////////////////////////////////////////////////////////////////
+
+namespace {
+	// 16.16 fixed-point multiply with round-to-nearest, via 64-bit intermediate.
+	inline sint32 scale32x32_fp16(sint32 x, sint32 y) {
+		return (sint32)(((sint64)x * y + 0x8000) >> 16);
+	}
+
+	// sin(x)/x with the removable singularity at 0 handled explicitly.
+	inline double sinc(double x) {
+		return fabs(x) < 1e-9 ? 1.0 : sin(x) / x;
+	}
+}
+
+///////////////////////////////////////////////////////////////////////////
+//
+// VDResamplerAxis
+//
+///////////////////////////////////////////////////////////////////////////
+
+// Set the per-pixel source step (16.16 fixed point) for this axis.
+void VDResamplerAxis::Init(sint32 dudx) {
+	this->dudx = dudx;
+}
+
+// Partition 'count' output pixels into five consecutive regions based on
+// where the filter kernel falls relative to the source extent [0, w):
+//   precopy  - kernel entirely left of source (replicate edge)
+//   preclip  - kernel straddles the left edge (clamped taps)
+//   active   - kernel fully inside the source (fast path)
+//   postclip - kernel straddles the right edge
+//   postcopy - kernel entirely right of source
+// u0 is the 16.16 source coordinate of the first output pixel; kernel_width
+// is the tap count.  If the kernel is wider than the source, the pre/post
+// clip regions merge into dx_dualclip.
+void VDResamplerAxis::Compute(sint32 count, sint32 u0, sint32 w, sint32 kernel_width) {
+	u = u0;
+	dx = count;
+
+	sint32 du_kern	= (kernel_width-1) << 16;
+	sint32 u2 = u + dudx*(dx-1);
+	sint32 u_limit	= w << 16;
+
+	dx_precopy	= 0;
+	dx_preclip	= 0;
+	dx_active	= 0;
+	dx_postclip = 0;
+	dx_postcopy = 0;
+	dx_dualclip	= 0;
+
+	sint32 dx_temp = dx;
+	sint32 u_start = u;
+
+	// (desired - u0 + (dudx-1)) / dudx : first pixel >= desired
+
+	sint32 dudx_m1_mu0	= dudx - 1 - u;
+	sint32 first_preclip	= (dudx_m1_mu0 + 0x10000 - du_kern) / dudx;
+	sint32 first_active		= (dudx_m1_mu0                    ) / dudx;
+	sint32 first_postclip	= (dudx_m1_mu0 + u_limit - du_kern) / dudx;
+	sint32 first_postcopy	= (dudx_m1_mu0 + u_limit - 0x10000) / dudx;
+
+	// clamp: boundaries must be monotonic and within [0, dx]
+	if (first_preclip < 0)
+		first_preclip = 0;
+	if (first_active < first_preclip)
+		first_active = first_preclip;
+	if (first_postclip < first_active)
+		first_postclip = first_active;
+	if (first_postcopy < first_postclip)
+		first_postcopy = first_postclip;
+	if (first_preclip > dx)
+		first_preclip = dx;
+	if (first_active > dx)
+		first_active = dx;
+	if (first_postclip > dx)
+		first_postclip = dx;
+	if (first_postcopy > dx)
+		first_postcopy = dx;
+
+	// determine widths
+
+	dx_precopy	= first_preclip;
+	dx_preclip	= first_active - first_preclip;
+	dx_active	= first_postclip - first_active;
+	dx_postclip	= first_postcopy - first_postclip;
+	dx_postcopy = dx - first_postcopy;
+
+	// sanity checks (debug builds only)
+	sint32 pos0 = dx_precopy;
+	sint32 pos1 = pos0 + dx_preclip;
+	sint32 pos2 = pos1 + dx_active;
+	sint32 pos3 = pos2 + dx_postclip;
+
+	VDASSERT(!((dx_precopy|dx_preclip|dx_active|dx_postcopy|dx_postclip) & 0x80000000));
+	VDASSERT(dx_precopy + dx_preclip + dx_active + dx_postcopy + dx_postclip == dx);
+
+	VDASSERT(!pos0			|| u_start + dudx*(pos0 - 1) <  0x10000 - du_kern);	// precopy -> preclip
+	VDASSERT( pos0 >= pos1	|| u_start + dudx*(pos0    ) >= 0x10000 - du_kern);
+	VDASSERT( pos1 <= pos0	|| u_start + dudx*(pos1 - 1) <  0);					// preclip -> active
+	VDASSERT( pos1 >= pos2	|| u_start + dudx*(pos1    ) >= 0 || !dx_active);
+	VDASSERT( pos2 <= pos1	|| u_start + dudx*(pos2 - 1) <  u_limit - du_kern || !dx_active);	// active -> postclip
+	VDASSERT( pos2 >= pos3	|| u_start + dudx*(pos2    ) >= u_limit - du_kern);
+	VDASSERT( pos3 <= pos2	|| u_start + dudx*(pos3 - 1) <  u_limit - 0x10000);	// postclip -> postcopy
+	VDASSERT( pos3 >= dx	|| u_start + dudx*(pos3    ) >= u_limit - 0x10000);
+
+	// advance u past the precopy region so it points at the first real sample
+	u += dx_precopy * dudx;
+
+	// test for overlapping clipping regions
+	if (!dx_active && kernel_width > w) {
+		dx_dualclip = dx_preclip + dx_postclip;
+		dx_preclip = dx_postclip = 0;
+	}
+}
+
+///////////////////////////////////////////////////////////////////////////
+//
+// VDResamplerLinearFilter
+//
+///////////////////////////////////////////////////////////////////////////
+
+// Triangle (bilinear) filter.  twofc is twice the cutoff frequency; tap
+// count scales with 1/twofc so decimation widens the kernel.
+VDResamplerLinearFilter::VDResamplerLinearFilter(double twofc)
+	: mScale(twofc)
+	, mTaps((int)ceil(1.0 / twofc) * 2)
+{
+}
+
+int VDResamplerLinearFilter::GetFilterWidth() const {
+	return mTaps;
+}
+
+// Triangle function: peaks at 1 for t=0, reaches 0 at |t| = 1/mScale.
+// The t + fabs(t) trick yields 2*t when positive and 0 when negative
+// (note: result is therefore twice the unit triangle).
+double VDResamplerLinearFilter::EvaluateFilter(double t) const {
+	t = 1.0f - fabs(t)*mScale;
+
+	return t + fabs(t);
+}
+
+// Emit mTaps coefficients for a kernel centered at fractional 'offset'
+// (in output-pixel units, 0..1).
+void VDResamplerLinearFilter::GenerateFilter(float *dst, double offset) const {
+	double pos = -((double)((mTaps>>1)-1) + offset) * mScale;
+
+	for(unsigned i=0; i<mTaps; ++i) {
+		double t = 1.0 - fabs(pos);
+
+		*dst++ = (float)(t+fabs(t));
+		pos += mScale;
+	}
+}
+
+// Emit 256 phase-shifted kernels (offset = k/256), mTaps floats each.
+void VDResamplerLinearFilter::GenerateFilterBank(float *dst) const {
+	for(int offset=0; offset<256; ++offset) {
+		GenerateFilter(dst, offset * (1.0f / 256.0f));
+		dst += mTaps;
+	}
+}
+
+///////////////////////////////////////////////////////////////////////////
+//
+// VDResamplerCubicFilter
+//
+///////////////////////////////////////////////////////////////////////////
+
+// Keys bicubic filter with free parameter A (commonly -0.5 to -1.0).
+// mA* are the |t| < 1 polynomial coefficients, mB* the 1 <= |t| < 2 ones;
+// support is 4 source pixels, widened by 1/twofc for decimation.
+VDResamplerCubicFilter::VDResamplerCubicFilter(double twofc, double A)
+	: mScale(twofc)
+	, mA0( 1.0  )
+	, mA2(-3.0-A)
+	, mA3( 2.0+A)
+	, mB0(-4.0*A)
+	, mB1( 8.0*A)
+	, mB2(-5.0*A)
+	, mB3(     A)
+	, mTaps((int)ceil(2.0 / twofc)*2)
+{
+}
+
+int VDResamplerCubicFilter::GetFilterWidth() const { return mTaps; }
+
+// Piecewise cubic: near segment for |t| < 1, far segment for 1 <= |t| < 2,
+// zero outside the support.
+double VDResamplerCubicFilter::EvaluateFilter(double t) const {
+	t = fabs(t)*mScale;
+
+	if (t < 1.0)
+		return mA0 + (t*t)*(mA2 + t*mA3);
+	else if (t < 2.0)
+		return mB0 + t*(mB1 + t*(mB2 + t*mB3));
+	else
+		return 0;
+}
+
+// Emit mTaps coefficients for a kernel centered at fractional 'offset'.
+void VDResamplerCubicFilter::GenerateFilter(float *dst, double offset) const {
+	double pos = -((double)((mTaps>>1)-1) + offset) * mScale;
+
+	for(unsigned i=0; i<mTaps; ++i) {
+		double t = fabs(pos);
+		double v = 0;
+
+		if (t < 1.0)
+			v = mA0 + (t*t)*(mA2 + t*mA3);
+		else if (t < 2.0)
+			v = mB0 + t*(mB1 + t*(mB2 + t*mB3));
+
+		*dst++ = (float)v;
+		pos += mScale;
+	}
+}
+
+// Emit 256 phase-shifted kernels (offset = k/256), mTaps floats each.
+void VDResamplerCubicFilter::GenerateFilterBank(float *dst) const {
+	for(int offset=0; offset<256; ++offset) {
+		GenerateFilter(dst, offset * (1.0f / 256.0f));
+		dst += mTaps;
+	}
+}
+
+///////////////////////////////////////////////////////////////////////////
+//
+// VDResamplerLanczos3Filter
+//
+///////////////////////////////////////////////////////////////////////////
+
+// Lanczos-3 windowed-sinc filter: sinc(pi t) * sinc(pi t / 3) over a
+// 6-pixel support, widened by 1/twofc for decimation.
+VDResamplerLanczos3Filter::VDResamplerLanczos3Filter(double twofc)
+	: mScale(twofc)
+	, mTaps((int)ceil(3.0 / twofc)*2)
+{
+}
+
+int VDResamplerLanczos3Filter::GetFilterWidth() const {
+	return mTaps;
+}
+
+double VDResamplerLanczos3Filter::EvaluateFilter(double t) const {
+	static const double pi  = 3.1415926535897932384626433832795;	// pi
+	static const double pi3 = 1.0471975511965977461542144610932;	// pi/3
+
+	t *= mScale;
+
+	if (fabs(t) < 3.0)
+		return sinc(pi*t) * sinc(pi3*t);
+	else
+		return 0.0;
+}
+
+// Emit mTaps coefficients for a kernel centered at fractional 'offset'.
+void VDResamplerLanczos3Filter::GenerateFilter(float *dst, double offset) const {
+	static const double pi  = 3.1415926535897932384626433832795;	// pi
+	static const double pi3 = 1.0471975511965977461542144610932;	// pi/3
+
+	double t = -(((double)((mTaps>>1)-1) + offset) * mScale);
+
+	for(unsigned i=0; i<mTaps; ++i) {
+		double v = 0;
+
+		if (fabs(t) < 3.0)
+			v = sinc(pi*t) * sinc(pi3*t);
+
+		*dst++ = (float)v;
+		t += mScale;
+	}
+}
+
+// Emit 256 phase-shifted kernels (offset = k/256), mTaps floats each.
+void VDResamplerLanczos3Filter::GenerateFilterBank(float *dst) const {
+	for(int offset=0; offset<256; ++offset) {
+		GenerateFilter(dst, offset * (1.0f / 256.0f));
+		dst += mTaps;
+	}
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/resample_stages.cpp b/src/thirdparty/VirtualDub/Kasumi/source/resample_stages.cpp
new file mode 100644
index 000000000..fcea6c669
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/resample_stages.cpp
@@ -0,0 +1,149 @@
+#include <vd2/system/math.h>
+#include <vd2/system/vdstl.h>
+#include <vd2/Kasumi/resample_kernels.h>
+#include "resample_stages.h"
+
+VDSteppedAllocator::VDSteppedAllocator(size_t initialSize)
+ : mpHead(NULL)
+ , mpAllocNext(NULL)
+ , mAllocLeft(0)
+ , mAllocNext(initialSize)
+ , mAllocInit(initialSize)
+{
+}
+
+VDSteppedAllocator::~VDSteppedAllocator() {
+ clear();
+}
+
+void VDSteppedAllocator::clear() {
+ while(Block *p = mpHead) {
+ mpHead = mpHead->next;
+ free(p);
+ }
+ mAllocLeft = 0;
+ mAllocNext = mAllocInit;
+}
+
+void *VDSteppedAllocator::allocate(size_type n) {
+ n = (n+15) & ~15;
+ if (mAllocLeft < n) {
+ mAllocLeft = mAllocNext;
+ mAllocNext += (mAllocNext >> 1);
+ if (mAllocLeft < n)
+ mAllocLeft = n;
+
+ Block *t = (Block *)malloc(sizeof(Block) + mAllocLeft);
+
+ if (mpHead)
+ mpHead->next = t;
+
+ mpHead = t;
+ mpHead->next = NULL;
+
+ mpAllocNext = (char *)(mpHead + 1);
+ }
+
+ void *p = mpAllocNext;
+ mpAllocNext += n;
+ mAllocLeft -= n;
+ return p;
+}
+
+void VDResamplerGenerateTable(sint32 *dst, const IVDResamplerFilter& filter) {
+ const unsigned width = filter.GetFilterWidth();
+ vdblock<float> filters(width * 256);
+ float *src = filters.data();
+
+ filter.GenerateFilterBank(src);
+
+ for(unsigned phase=0; phase < 256; ++phase) {
+ float sum = 0;
+
+ for(unsigned i=0; i<width; ++i)
+ sum += src[i];
+
+ float scalefac = 16384.0f / sum;
+
+ for(unsigned j=0; j<width; j += 2) {
+ int v0 = VDRoundToIntFast(src[j+0] * scalefac);
+ int v1 = VDRoundToIntFast(src[j+1] * scalefac);
+
+ dst[j+0] = v0;
+ dst[j+1] = v1;
+ }
+
+ src += width;
+ dst += width;
+ }
+}
+
+void VDResamplerGenerateTableF(float *dst, const IVDResamplerFilter& filter) {
+ const unsigned width = filter.GetFilterWidth();
+ filter.GenerateFilterBank(dst);
+
+ for(unsigned phase=0; phase < 256; ++phase) {
+ float sum = 0;
+
+ for(unsigned i=0; i<width; ++i)
+ sum += dst[i];
+
+ float scalefac = 1.0f / sum;
+
+ for(unsigned j=0; j<width; ++j)
+ *dst++ *= scalefac;
+ }
+}
+
+void VDResamplerGenerateTable2(sint32 *dst, const IVDResamplerFilter& filter, sint32 count, sint32 u0, sint32 dudx) {
+ const unsigned width = filter.GetFilterWidth();
+ vdblock<float> filters(width);
+ float *src = filters.data();
+
+ filter.GenerateFilterBank(src);
+
+ for(sint32 i=0; i<count; ++i) {
+ sint32 u = u0 + dudx*i;
+
+ *dst++ = u >> 16;
+ filter.GenerateFilter(src, (double)(u & 0xffff) / 65536.0);
+
+ float sum = 0;
+ for(uint32 j=0; j<width; ++j)
+ sum += src[j];
+
+ float scalefac = 16384.0f / sum;
+
+ sint32 isum = 0;
+ for(uint32 j=0; j<width; ++j) {
+ sint32 v = VDRoundToIntFast(src[j] * scalefac);
+
+ dst[j] = v;
+ isum += v;
+ }
+
+ sint32 ierr = 16384 - isum;
+ sint32 idelta = 2*(ierr >> 31) - 1;
+ while(ierr) {
+ for(uint32 j=0; j<width && ierr; ++j) {
+ if (!dst[j])
+ continue;
+
+ dst[j] += idelta;
+ ierr -= idelta;
+ }
+ }
+
+ dst += width;
+ }
+}
+
+void VDResamplerSwizzleTable(sint32 *dst, unsigned pairs) {
+ do {
+ sint32 v0 = dst[0];
+ sint32 v1 = dst[1];
+
+ dst[0] = dst[1] = (v0 & 0xffff) + (v1<<16);
+ dst += 2;
+ } while(--pairs);
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/resample_stages_reference.cpp b/src/thirdparty/VirtualDub/Kasumi/source/resample_stages_reference.cpp
new file mode 100644
index 000000000..94bee7c9e
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/resample_stages_reference.cpp
@@ -0,0 +1,425 @@
+#include <vd2/system/memory.h>
+#include <vd2/system/cpuaccel.h>
+#include <vd2/Kasumi/pixmaputils.h>
+#include "resample_stages_reference.h"
+#include <vd2/Kasumi/resample_kernels.h>
+#include "blt_spanutils.h"
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+int VDResamplerRowStageSeparablePoint8::GetWindowSize() const {
+ return 1;
+}
+
+void VDResamplerRowStageSeparablePoint8::Process(void *dst0, const void *src0, uint32 w, uint32 u, uint32 dudx) {
+ uint8 *dst = (uint8 *)dst0;
+ const uint8 *src = (const uint8 *)src0;
+
+ do {
+ *dst++ = src[u>>16];
+ u += dudx;
+ } while(--w);
+}
+
+int VDResamplerRowStageSeparablePoint16::GetWindowSize() const {
+ return 1;
+}
+
+void VDResamplerRowStageSeparablePoint16::Process(void *dst0, const void *src0, uint32 w, uint32 u, uint32 dudx) {
+ uint16 *dst = (uint16 *)dst0;
+ const uint16 *src = (const uint16 *)src0;
+
+ do {
+ *dst++ = src[u>>16];
+ u += dudx;
+ } while(--w);
+}
+
+int VDResamplerRowStageSeparablePoint32::GetWindowSize() const {
+ return 1;
+}
+
+void VDResamplerRowStageSeparablePoint32::Process(void *dst0, const void *src0, uint32 w, uint32 u, uint32 dudx) {
+ uint32 *dst = (uint32 *)dst0;
+ const uint32 *src = (const uint32 *)src0;
+
+ do {
+ *dst++ = src[u>>16];
+ u += dudx;
+ } while(--w);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+int VDResamplerRowStageSeparableLinear8::GetWindowSize() const {return 2;}
+void VDResamplerRowStageSeparableLinear8::Process(void *dst0, const void *src0, uint32 w, uint32 u, uint32 dudx) {
+ uint8 *dst = (uint8 *)dst0;
+ const uint8 *src = (const uint8 *)src0;
+
+ do {
+ const sint32 iu = u>>16;
+ const uint32 p0 = src[iu];
+ const uint32 p1 = src[iu+1];
+ const uint32 f = (u >> 8) & 0xff;
+
+ *dst++ = (uint8)(p0 + (((sint32)(p1 - p0)*f + 0x80)>>8));
+ u += dudx;
+ } while(--w);
+}
+
+void VDResamplerRowStageSeparableLinear8_phaseZeroStepHalf::Process(void *dst0, const void *src0, uint32 w, uint32 u, uint32 dudx) {
+ uint8 *dst = (uint8 *)dst0;
+ const uint8 *src = (const uint8 *)src0;
+
+ VDASSERT(!u && dudx == 0x8000);
+
+ nsVDPixmapSpanUtils::horiz_expand2x_coaligned(dst, src, w);
+}
+
+int VDResamplerRowStageSeparableLinear32::GetWindowSize() const {return 2;}
+void VDResamplerRowStageSeparableLinear32::Process(void *dst0, const void *src0, uint32 w, uint32 u, uint32 dudx) {
+ uint32 *dst = (uint32 *)dst0;
+ const uint32 *src = (const uint32 *)src0;
+
+ do {
+ const sint32 iu = u>>16;
+ const uint32 p0 = src[iu];
+ const uint32 p1 = src[iu+1];
+ const uint32 f = (u >> 8) & 0xff;
+
+ const uint32 p0_rb = p0 & 0xff00ff;
+ const uint32 p1_rb = p1 & 0xff00ff;
+ const uint32 p0_g = p0 & 0xff00;
+ const uint32 p1_g = p1 & 0xff00;
+
+ *dst++ = ((p0_rb + (((p1_rb - p0_rb)*f + 0x800080)>>8)) & 0xff00ff)
+ + ((p0_g + (((p1_g - p0_g )*f + 0x008000)>>8)) & 0x00ff00);
+ u += dudx;
+ } while(--w);
+}
+
+int VDResamplerColStageSeparableLinear8::GetWindowSize() const {return 2;}
+void VDResamplerColStageSeparableLinear8::Process(void *dst0, const void *const *srcarray, uint32 w, sint32 phase) {
+ uint8 *dst = (uint8 *)dst0;
+ const uint8 *src0 = (const uint8 *)srcarray[0];
+ const uint8 *src1 = (const uint8 *)srcarray[1];
+ const uint32 f = (phase >> 8) & 0xff;
+
+ do {
+ const uint32 p0 = *src0++;
+ const uint32 p1 = *src1++;
+
+ *dst++ = (uint8)(p0 + (((p1 - p0)*f + 0x80)>>8));
+ } while(--w);
+}
+
+int VDResamplerColStageSeparableLinear32::GetWindowSize() const {return 2;}
+void VDResamplerColStageSeparableLinear32::Process(void *dst0, const void *const *srcarray, uint32 w, sint32 phase) {
+ uint32 *dst = (uint32 *)dst0;
+ const uint32 *src0 = (const uint32 *)srcarray[0];
+ const uint32 *src1 = (const uint32 *)srcarray[1];
+ const uint32 f = (phase >> 8) & 0xff;
+
+ do {
+ const uint32 p0 = *src0++;
+ const uint32 p1 = *src1++;
+
+ const uint32 p0_rb = p0 & 0xff00ff;
+ const uint32 p1_rb = p1 & 0xff00ff;
+ const uint32 p0_g = p0 & 0xff00;
+ const uint32 p1_g = p1 & 0xff00;
+
+ *dst++ = ((p0_rb + (((p1_rb - p0_rb)*f + 0x800080)>>8)) & 0xff00ff)
+ + ((p0_g + (((p1_g - p0_g )*f + 0x008000)>>8)) & 0x00ff00);
+ } while(--w);
+}
+
+VDResamplerRowStageSeparableTable8::VDResamplerRowStageSeparableTable8(const IVDResamplerFilter& filter) {
+ mFilterBank.resize(filter.GetFilterWidth() * 256);
+ VDResamplerGenerateTable(mFilterBank.data(), filter);
+}
+
+int VDResamplerRowStageSeparableTable8::GetWindowSize() const {return (int)mFilterBank.size() >> 8;}
+
+void VDResamplerRowStageSeparableTable8::Process(void *dst0, const void *src0, uint32 w, uint32 u, uint32 dudx) {
+ uint8 *dst = (uint8 *)dst0;
+ const uint8 *src = (const uint8 *)src0;
+ const unsigned ksize = (int)mFilterBank.size() >> 8;
+ const sint32 *filterBase = mFilterBank.data();
+
+ do {
+ const uint8 *src2 = src + (u>>16);
+ const sint32 *filter = filterBase + ksize*((u>>8)&0xff);
+ u += dudx;
+
+ int b = 0x2000;
+ for(unsigned i = ksize; i; --i) {
+ uint8 p = *src2++;
+ sint32 coeff = *filter++;
+
+ b += (sint32)p*coeff;
+ }
+
+ b >>= 14;
+
+ if ((uint32)b >= 0x00000100)
+ b = ~b >> 31;
+
+ *dst++ = (uint8)b;
+ } while(--w);
+}
+
+VDResamplerRowStageSeparableTable32::VDResamplerRowStageSeparableTable32(const IVDResamplerFilter& filter) {
+ mFilterBank.resize(filter.GetFilterWidth() * 256);
+ VDResamplerGenerateTable(mFilterBank.data(), filter);
+}
+
+int VDResamplerRowStageSeparableTable32::GetWindowSize() const {return (int)mFilterBank.size() >> 8;}
+
+void VDResamplerRowStageSeparableTable32::Process(void *dst0, const void *src0, uint32 w, uint32 u, uint32 dudx) {
+ uint32 *dst = (uint32 *)dst0;
+ const uint32 *src = (const uint32 *)src0;
+ const unsigned ksize = (int)mFilterBank.size() >> 8;
+ const sint32 *filterBase = mFilterBank.data();
+
+ do {
+ const uint32 *src2 = src + (u>>16);
+ const sint32 *filter = filterBase + ksize*((u>>8)&0xff);
+ u += dudx;
+
+ int r = 0x2000, g = 0x2000, b = 0x2000;
+ for(unsigned i = ksize; i; --i) {
+ uint32 p = *src2++;
+ sint32 coeff = *filter++;
+
+ r += ((p>>16)&0xff)*coeff;
+ g += ((p>> 8)&0xff)*coeff;
+ b += ((p )&0xff)*coeff;
+ }
+
+ r <<= 2;
+ g >>= 6;
+ b >>= 14;
+
+ if ((uint32)r >= 0x01000000)
+ r = ~r >> 31;
+ if ((uint32)g >= 0x00010000)
+ g = ~g >> 31;
+ if ((uint32)b >= 0x00000100)
+ b = ~b >> 31;
+
+ *dst++ = (r & 0xff0000) + (g & 0xff00) + (b & 0xff);
+ } while(--w);
+}
+
+VDResamplerRowStageSeparableTable32Fx4::VDResamplerRowStageSeparableTable32Fx4(const IVDResamplerFilter& filter) {
+ mFilterBank.resize(filter.GetFilterWidth() * 256);
+ VDResamplerGenerateTableF(mFilterBank.data(), filter);
+}
+
+int VDResamplerRowStageSeparableTable32Fx4::GetWindowSize() const {return (int)mFilterBank.size() >> 8;}
+
+void VDResamplerRowStageSeparableTable32Fx4::Process(void *dst0, const void *src0, uint32 w, uint32 u, uint32 dudx) {
+ float *dst = (float *)dst0;
+ const float *src = (const float *)src0;
+ const unsigned ksize = (int)mFilterBank.size() >> 8;
+ const float *filterBase = mFilterBank.data();
+
+ do {
+ const float *src2 = src + (u>>16)*4;
+ const float *filter = filterBase + ksize*((u>>8)&0xff);
+ u += dudx;
+
+ float r = 0, g = 0, b = 0, a = 0;
+ for(unsigned i = ksize; i; --i) {
+ float coeff = *filter++;
+
+ r += coeff * src2[0];
+ g += coeff * src2[1];
+ b += coeff * src2[2];
+ a += coeff * src2[3];
+ src2 += 4;
+ }
+
+ dst[0] = r;
+ dst[1] = g;
+ dst[2] = b;
+ dst[3] = a;
+ dst += 4;
+ } while(--w);
+}
+
+VDResamplerRowStageSeparableTable32F::VDResamplerRowStageSeparableTable32F(const IVDResamplerFilter& filter) {
+ mFilterBank.resize(filter.GetFilterWidth() * 256);
+ VDResamplerGenerateTableF(mFilterBank.data(), filter);
+}
+
+int VDResamplerRowStageSeparableTable32F::GetWindowSize() const {return (int)mFilterBank.size() >> 8;}
+
+void VDResamplerRowStageSeparableTable32F::Process(void *dst0, const void *src0, uint32 w, uint32 u, uint32 dudx) {
+ float *dst = (float *)dst0;
+ const float *src = (const float *)src0;
+ const unsigned ksize = (int)mFilterBank.size() >> 8;
+ const float *filterBase = mFilterBank.data();
+
+ VDCPUCleanupExtensions();
+
+ do {
+ const float *src2 = src + (u>>16);
+ const float *filter = filterBase + ksize*((u>>8)&0xff);
+ u += dudx;
+
+ float r = 0;
+ for(unsigned i = ksize; i; --i) {
+ float coeff = *filter++;
+
+ r += coeff * src2[0];
+ ++src2;
+ }
+
+ dst[0] = r;
+ ++dst;
+ } while(--w);
+}
+
+VDResamplerColStageSeparableTable8::VDResamplerColStageSeparableTable8(const IVDResamplerFilter& filter) {
+ mFilterBank.resize(filter.GetFilterWidth() * 256);
+ VDResamplerGenerateTable(mFilterBank.data(), filter);
+}
+
+int VDResamplerColStageSeparableTable8::GetWindowSize() const {return (int)mFilterBank.size() >> 8;}
+
+void VDResamplerColStageSeparableTable8::Process(void *dst0, const void *const *src0, uint32 w, sint32 phase) {
+ uint8 *dst = (uint8 *)dst0;
+ const uint8 *const *src = (const uint8 *const *)src0;
+ const unsigned ksize = (unsigned)mFilterBank.size() >> 8;
+ const sint32 *filter = &mFilterBank[((phase>>8)&0xff) * ksize];
+
+ for(uint32 i=0; i<w; ++i) {
+ int b = 0x2000;
+ const sint32 *filter2 = filter;
+ const uint8 *const *src2 = src;
+
+ for(unsigned j = ksize; j; --j) {
+ sint32 p = (*src2++)[i];
+ sint32 coeff = *filter2++;
+
+ b += p*coeff;
+ }
+
+ b >>= 14;
+
+ if ((uint32)b >= 0x00000100)
+ b = ~b >> 31;
+
+ *dst++ = (uint8)b;
+ }
+}
+
+VDResamplerColStageSeparableTable32::VDResamplerColStageSeparableTable32(const IVDResamplerFilter& filter) {
+ mFilterBank.resize(filter.GetFilterWidth() * 256);
+ VDResamplerGenerateTable(mFilterBank.data(), filter);
+}
+
+int VDResamplerColStageSeparableTable32::GetWindowSize() const {return (int)mFilterBank.size() >> 8;}
+
+void VDResamplerColStageSeparableTable32::Process(void *dst0, const void *const *src0, uint32 w, sint32 phase) {
+ uint32 *dst = (uint32 *)dst0;
+ const uint32 *const *src = (const uint32 *const *)src0;
+ const unsigned ksize = (unsigned)mFilterBank.size() >> 8;
+ const sint32 *filter = &mFilterBank[((phase>>8)&0xff) * ksize];
+
+ for(uint32 i=0; i<w; ++i) {
+ int r = 0x2000, g = 0x2000, b = 0x2000;
+ const sint32 *filter2 = filter;
+ const uint32 *const *src2 = src;
+
+ for(unsigned j = ksize; j; --j) {
+ uint32 p = (*src2++)[i];
+ sint32 coeff = *filter2++;
+
+ r += ((p>>16)&0xff)*coeff;
+ g += ((p>> 8)&0xff)*coeff;
+ b += ((p )&0xff)*coeff;
+ }
+
+ r <<= 2;
+ g >>= 6;
+ b >>= 14;
+
+ if ((uint32)r >= 0x01000000)
+ r = ~r >> 31;
+ if ((uint32)g >= 0x00010000)
+ g = ~g >> 31;
+ if ((uint32)b >= 0x00000100)
+ b = ~b >> 31;
+
+ *dst++ = (r & 0xff0000) + (g & 0xff00) + (b & 0xff);
+ }
+}
+
+VDResamplerColStageSeparableTable32F::VDResamplerColStageSeparableTable32F(const IVDResamplerFilter& filter) {
+ mFilterBank.resize(filter.GetFilterWidth() * 256);
+ VDResamplerGenerateTableF(mFilterBank.data(), filter);
+}
+
+int VDResamplerColStageSeparableTable32F::GetWindowSize() const {return (int)mFilterBank.size() >> 8;}
+
+void VDResamplerColStageSeparableTable32F::Process(void *dst0, const void *const *src0, uint32 w, sint32 phase) {
+ float *dst = (float *)dst0;
+ const float *const *src = (const float *const *)src0;
+ const unsigned ksize = (unsigned)mFilterBank.size() >> 8;
+ const float *filter = &mFilterBank[((phase>>8)&0xff) * ksize];
+
+ for(uint32 i=0; i<w; ++i) {
+ float r = 0;
+ const float *filter2 = filter;
+ const float *const *src2 = src;
+
+ for(unsigned j = ksize; j; --j) {
+ const float *p = (*src2++) + i;
+ float coeff = *filter2++;
+
+ r += p[0]*coeff;
+ }
+
+ dst[0] = r;
+ ++dst;
+ }
+}
+
+VDResamplerColStageSeparableTable32Fx4::VDResamplerColStageSeparableTable32Fx4(const IVDResamplerFilter& filter) {
+ mFilterBank.resize(filter.GetFilterWidth() * 256);
+ VDResamplerGenerateTableF(mFilterBank.data(), filter);
+}
+
+int VDResamplerColStageSeparableTable32Fx4::GetWindowSize() const {return (int)mFilterBank.size() >> 8;}
+
+void VDResamplerColStageSeparableTable32Fx4::Process(void *dst0, const void *const *src0, uint32 w, sint32 phase) {
+ float *dst = (float *)dst0;
+ const float *const *src = (const float *const *)src0;
+ const unsigned ksize = (unsigned)mFilterBank.size() >> 8;
+ const float *filter = &mFilterBank[((phase>>8)&0xff) * ksize];
+
+ for(uint32 i=0; i<w; ++i) {
+ float r = 0, g = 0, b = 0, a = 0;
+ const float *filter2 = filter;
+ const float *const *src2 = src;
+
+ for(unsigned j = ksize; j; --j) {
+ const float *p = (*src2++) + i*4;
+ float coeff = *filter2++;
+
+ r += p[0]*coeff;
+ g += p[1]*coeff;
+ b += p[2]*coeff;
+ a += p[3]*coeff;
+ }
+
+ dst[0] = r;
+ dst[1] = g;
+ dst[2] = b;
+ dst[3] = a;
+ dst += 4;
+ }
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/resample_stages_x64.cpp b/src/thirdparty/VirtualDub/Kasumi/source/resample_stages_x64.cpp
new file mode 100644
index 000000000..a206d37d8
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/resample_stages_x64.cpp
@@ -0,0 +1,26 @@
+#include "resample_stages_x64.h"
+
+extern "C" long vdasm_resize_table_col_SSE2(uint32 *out, const uint32 *const*in_table, const int *filter, int filter_width, uint32 w);
+extern "C" long vdasm_resize_table_row_SSE2(uint32 *out, const uint32 *in, const int *filter, int filter_width, uint32 w, long accum, long frac);
+
+VDResamplerSeparableTableRowStageSSE2::VDResamplerSeparableTableRowStageSSE2(const IVDResamplerFilter& filter)
+ : VDResamplerRowStageSeparableTable32(filter)
+{
+ VDResamplerSwizzleTable(mFilterBank.data(), (uint32)mFilterBank.size() >> 1);
+}
+
+void VDResamplerSeparableTableRowStageSSE2::Process(void *dst, const void *src, uint32 w, uint32 u, uint32 dudx) {
+ vdasm_resize_table_row_SSE2((uint32 *)dst, (const uint32 *)src, (const int *)mFilterBank.data(), (int)mFilterBank.size() >> 8, w, u, dudx);
+}
+
+VDResamplerSeparableTableColStageSSE2::VDResamplerSeparableTableColStageSSE2(const IVDResamplerFilter& filter)
+ : VDResamplerColStageSeparableTable32(filter)
+{
+ VDResamplerSwizzleTable(mFilterBank.data(), (uint32)mFilterBank.size() >> 1);
+}
+
+void VDResamplerSeparableTableColStageSSE2::Process(void *dst, const void *const *src, uint32 w, sint32 phase) {
+ const unsigned filtSize = (unsigned)mFilterBank.size() >> 8;
+
+ vdasm_resize_table_col_SSE2((uint32*)dst, (const uint32 *const *)src, (const int *)mFilterBank.data() + filtSize*((phase >> 8) & 0xff), filtSize, w);
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/resample_stages_x86.cpp b/src/thirdparty/VirtualDub/Kasumi/source/resample_stages_x86.cpp
new file mode 100644
index 000000000..bc4db574f
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/resample_stages_x86.cpp
@@ -0,0 +1,1277 @@
+#include <numeric>
+#include "blt_spanutils_x86.h"
+#include "resample_stages_x86.h"
+#include <vd2/Kasumi/resample_kernels.h>
+
+#ifdef _MSC_VER
+ #pragma warning(disable: 4799) // warning C4799: function 'vdasm_resize_table_row_8_k8_4x_MMX' has no EMMS instruction
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+
+extern "C" void vdasm_resize_table_row_8_k8_4x_SSE41(void *dst, const void *src, uint32 width, const void *kernel);
+extern "C" void vdasm_resize_table_row_8_k16_4x_SSE41(void *dst, const void *src, uint32 width, const void *kernel);
+extern "C" void vdasm_resize_table_row_8_SSE41(void *dst, const void *src, uint32 width, const void *kernel, uint32 kwidth);
+extern "C" void vdasm_resize_table_col_8_k2_SSE41(void *dst, const void *const *srcs, uint32 width, const void *kernel);
+extern "C" void vdasm_resize_table_col_8_k4_SSE41(void *dst, const void *const *srcs, uint32 width, const void *kernel);
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace {
+ struct ScaleInfo {
+ void *dst;
+ uintptr src;
+ uint32 accum;
+ uint32 fracinc;
+ sint32 intinc;
+ uint32 count;
+ };
+
+ extern "C" void vdasm_resize_point32(const ScaleInfo *);
+}
+
+int VDResamplerSeparablePointRowStageX86::GetWindowSize() const {return 1;}
+void VDResamplerSeparablePointRowStageX86::Process(void *dst, const void *src, uint32 w, uint32 u, uint32 dudx) {
+ ScaleInfo info;
+
+ info.dst = (uint32 *)dst + w;
+ info.src = ((uintptr)src >> 2) + (u>>16);
+ info.accum = u<<16;
+ info.fracinc = dudx << 16;
+ info.intinc = (sint32)dudx >> 16;
+ info.count = -(sint32)w*4;
+
+ vdasm_resize_point32(&info);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+void VDResamplerRowStageSeparableLinear8_phaseZeroStepHalf_ISSE::Process(void *dst0, const void *src0, uint32 w, uint32 u, uint32 dudx) {
+ uint8 *dst = (uint8 *)dst0;
+ const uint8 *src = (const uint8 *)src0;
+
+ nsVDPixmapSpanUtils::horiz_expand2x_coaligned_ISSE(dst, src, w);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+extern "C" void vdasm_resize_point32_MMX(const ScaleInfo *);
+extern "C" void vdasm_resize_interp_row_run_MMX(void *dst, const void *src, uint32 width, sint64 xaccum, sint64 x_inc);
+extern "C" void vdasm_resize_interp_col_run_MMX(void *dst, const void *src1, const void *src2, uint32 width, uint32 yaccum);
+extern "C" void vdasm_resize_ccint_row_MMX(void *dst, const void *src, uint32 count, uint32 xaccum, sint32 xinc, const void *tbl);
+extern "C" void vdasm_resize_ccint_col_MMX(void *dst, const void *src1, const void *src2, const void *src3, const void *src4, uint32 count, const void *tbl);
+extern "C" long vdasm_resize_table_col_MMX(uint32 *out, const uint32 *const*in_table, const int *filter, int filter_width, uint32 w, long frac);
+extern "C" long vdasm_resize_table_row_MMX(uint32 *out, const uint32 *in, const int *filter, int filter_width, uint32 w, long accum, long frac);
+
+int VDResamplerSeparablePointRowStageMMX::GetWindowSize() const {return 1;}
+void VDResamplerSeparablePointRowStageMMX::Process(void *dst, const void *src, uint32 w, uint32 u, uint32 dudx) {
+ ScaleInfo info;
+
+ info.dst = (uint32 *)dst + w;
+ info.src = ((uintptr)src >> 2) + (u>>16);
+ info.accum = u<<16;
+ info.fracinc = dudx << 16;
+ info.intinc = (sint32)dudx >> 16;
+ info.count = -(sint32)w*4;
+
+ vdasm_resize_point32_MMX(&info);
+}
+
+int VDResamplerSeparableLinearRowStageMMX::GetWindowSize() const {return 2;}
+void VDResamplerSeparableLinearRowStageMMX::Process(void *dst0, const void *src0, uint32 w, uint32 u, uint32 dudx) {
+ vdasm_resize_interp_row_run_MMX(dst0, src0, w, (sint64)u << 16, (sint64)dudx << 16);
+}
+
+int VDResamplerSeparableLinearColStageMMX::GetWindowSize() const {return 2;}
+void VDResamplerSeparableLinearColStageMMX::Process(void *dst0, const void *const *srcarray, uint32 w, sint32 phase) {
+ vdasm_resize_interp_col_run_MMX(dst0, srcarray[0], srcarray[1], w, phase);
+}
+
+VDResamplerSeparableCubicRowStageMMX::VDResamplerSeparableCubicRowStageMMX(double A)
+ : mFilterBank(1024)
+{
+ sint32 *p = mFilterBank.data();
+ VDResamplerGenerateTable(p, VDResamplerCubicFilter(1.0, A));
+ VDResamplerSwizzleTable(p, 512);
+}
+
+int VDResamplerSeparableCubicRowStageMMX::GetWindowSize() const {return 4;}
+void VDResamplerSeparableCubicRowStageMMX::Process(void *dst0, const void *src0, uint32 w, uint32 u, uint32 dudx) {
+ vdasm_resize_ccint_row_MMX(dst0, src0, w, u, dudx, mFilterBank.data());
+}
+
+VDResamplerSeparableCubicColStageMMX::VDResamplerSeparableCubicColStageMMX(double A)
+ : mFilterBank(1024)
+{
+ sint32 *p = mFilterBank.data();
+ VDResamplerGenerateTable(p, VDResamplerCubicFilter(1.0, A));
+ VDResamplerSwizzleTable(p, 512);
+}
+
+int VDResamplerSeparableCubicColStageMMX::GetWindowSize() const {return 4;}
+void VDResamplerSeparableCubicColStageMMX::Process(void *dst0, const void *const *srcarray, uint32 w, sint32 phase) {
+ vdasm_resize_ccint_col_MMX(dst0, srcarray[0], srcarray[1], srcarray[2], srcarray[3], w, mFilterBank.data() + ((phase>>6)&0x3fc));
+}
+
+VDResamplerSeparableTableRowStage8MMX::VDResamplerSeparableTableRowStage8MMX(const IVDResamplerFilter& filter)
+ : VDResamplerRowStageSeparableTable32(filter)
+ , mLastSrcWidth(0)
+ , mLastDstWidth(0)
+ , mLastU(0)
+ , mLastDUDX(0)
+{
+ mAlignedKernelWidth = (GetWindowSize() + 6) & ~3;
+ mAlignedKernelSize = mAlignedKernelWidth + 4;
+}
+
+void VDResamplerSeparableTableRowStage8MMX::Init(const VDResamplerAxis& axis, uint32 srcw) {
+ uint32 w = axis.dx_preclip + axis.dx_active + axis.dx_postclip + axis.dx_dualclip;
+
+ if (mLastSrcWidth != srcw || mLastDstWidth != w || mLastU != axis.u || mLastDUDX != axis.dudx) {
+ mLastSrcWidth = srcw;
+ mLastDstWidth = w;
+ mLastU = axis.u;
+ mLastDUDX = axis.dudx;
+
+ RedoRowFilters(axis, w, srcw);
+ }
+}
+
+void VDResamplerSeparableTableRowStage8MMX::RedoRowFilters(const VDResamplerAxis& axis, uint32 w, uint32 srcw) {
+ int kstride = mFilterBank.size() >> 8;
+ int ksize = mAlignedKernelWidth;
+ int kesize = mAlignedKernelSize;
+
+ mRowKernels.clear();
+ mRowKernelSize = w * kesize;
+
+ mRowKernels.resize(mRowKernelSize * 4, 0);
+
+ for(int byteOffset = 0; byteOffset < 4; ++byteOffset) {
+ sint16 *dst = mRowKernels.data() + mRowKernelSize * byteOffset;
+ int ksizeThisOffset = std::min<int>(ksize, (byteOffset + srcw + 3) & ~3);
+
+ mKernelSizeByOffset[byteOffset] = ksizeThisOffset;
+
+ sint32 u = axis.u;
+ sint32 uoffmin = -byteOffset;
+ sint32 uoffmax = ((srcw + byteOffset + 3) & ~3) - byteOffset - ksizeThisOffset;
+ for(uint32 i=0; i<w; ++i) {
+ sint32 uoffset = u >> 16;
+ sint32 uoffset2 = ((uoffset + byteOffset) & ~3) - byteOffset;
+
+ if (uoffset2 < uoffmin)
+ uoffset2 = uoffmin;
+
+ if (uoffset2 > uoffmax)
+ uoffset2 = uoffmax;
+
+ VDASSERT(uoffset2 + ksizeThisOffset <= (((sint32)srcw + byteOffset + 3) & ~3));
+
+ *(sint32 *)dst = uoffset2;
+ dst += 2;
+ *dst++ = 0;
+ *dst++ = 0;
+
+ uint32 phase = (u >> 8) & 255;
+ const sint32 *src = &mFilterBank[kstride * phase];
+
+ sint32 start = 0;
+ sint32 end = kstride;
+
+ int dstoffset = uoffset - uoffset2;
+
+ // check for filter kernel overlapping left source boundary
+ if (uoffset < 0)
+ start = -uoffset;
+
+ // check for filter kernel overlapping right source boundary
+ if (uoffset + end > (sint32)srcw)
+ end = srcw - uoffset;
+
+ VDASSERT(dstoffset + start >= 0);
+ VDASSERT(dstoffset + end <= ksizeThisOffset);
+
+ sint16 *dst2 = dst + dstoffset;
+ dst += ksizeThisOffset;
+
+ for(int j=start; j<end; ++j)
+ dst2[j] = src[j];
+
+ if (start > 0)
+ dst2[start] = std::accumulate(src, src+start, dst2[start]);
+
+ if (end < kstride)
+ dst2[end - 1] = std::accumulate(src+end, src+kstride, dst2[end - 1]);
+
+ u += axis.dudx;
+ }
+ }
+
+ // swizzle rows where optimization is possible
+ vdfastvector<sint16> temp;
+
+ int quads = w >> 2;
+ int quadRemainder = w & 3;
+
+ for(int byteOffset = 0; byteOffset < 4; ++byteOffset) {
+ int ksizeThisOffset = mKernelSizeByOffset[byteOffset];
+ int kpairs = ksizeThisOffset >> 2;
+
+ if (ksizeThisOffset < 8 || ksizeThisOffset > 12) {
+ mbQuadOptimizationEnabled[byteOffset] = false;
+ } else {
+ ptrdiff_t unswizzledStride = (ksizeThisOffset >> 1) + 2;
+
+ mbQuadOptimizationEnabled[byteOffset] = true;
+ mTailOffset[byteOffset] = quads * (8 + ksizeThisOffset*4);
+
+ uint32 *dst = (uint32 *)&mRowKernels[mRowKernelSize * byteOffset];
+ temp.resize(mRowKernelSize);
+ memcpy(temp.data(), dst, mRowKernelSize*2);
+
+ const uint32 *src0 = (const uint32 *)temp.data();
+ const uint32 *src1 = src0 + unswizzledStride;
+ const uint32 *src2 = src1 + unswizzledStride;
+ const uint32 *src3 = src2 + unswizzledStride;
+ ptrdiff_t srcskip = unswizzledStride * 3;
+
+ for(int q = 0; q < quads; ++q) {
+ dst[0] = src0[0];
+ dst[1] = src1[0];
+ dst[2] = src2[0];
+ dst[3] = src3[0];
+ src0 += 2;
+ src1 += 2;
+ src2 += 2;
+ src3 += 2;
+ dst += 4;
+
+ for(int p = 0; p < kpairs; ++p) {
+ dst[0] = src0[0];
+ dst[1] = src0[1];
+ dst[2] = src1[0];
+ dst[3] = src1[1];
+ dst[4] = src2[0];
+ dst[5] = src2[1];
+ dst[6] = src3[0];
+ dst[7] = src3[1];
+ dst += 8;
+ src0 += 2;
+ src1 += 2;
+ src2 += 2;
+ src3 += 2;
+ }
+
+ src0 += srcskip;
+ src1 += srcskip;
+ src2 += srcskip;
+ src3 += srcskip;
+ }
+
+ memcpy(dst, src0, unswizzledStride * 4 * quadRemainder);
+
+ VDASSERT(dst + unswizzledStride * quadRemainder <= (void *)(mRowKernels.data() + (mRowKernelSize * (byteOffset + 1))));
+ }
+ }
+}
+
+void __declspec(naked) vdasm_resize_table_row_8_k8_4x_MMX(void *dst, const void *src, uint32 width, const void *kernel) {
+ static const __declspec(align(8)) __int64 kRound = 0x0000000000002000;
+ __asm {
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ pxor mm7, mm7
+ movq mm6, kRound
+
+ mov ebp, [esp + 4 + 16] ;ebp = dst
+ mov esi, [esp + 12 + 16] ;esi = width
+ mov edi, [esp + 16 + 16] ;edi = kernel
+yloop:
+ ;eax = temp
+ ;ebx = temp
+ ;ecx = temp
+ ;edx = temp
+ ;esi = horiz counter
+ ;edi = filter list
+ ;ebp = destination
+
+ mov eax, [edi+0]
+ mov ebx, [edi+4]
+ mov ecx, [edi+8]
+ mov edx, [esp+8+16]
+ add eax, edx
+ add ebx, edx
+ add ecx, edx
+ add edx, [edi+12]
+
+ movd mm0, [eax]
+ punpcklbw mm0, mm7
+
+ pmaddwd mm0, [edi+16]
+ movd mm1, [ebx]
+ punpcklbw mm1, mm7
+
+ pmaddwd mm1, [edi+24]
+ movd mm2, [ecx]
+ punpcklbw mm2, mm7
+
+ pmaddwd mm2, [edi+32]
+ movd mm3, [edx]
+ punpcklbw mm3, mm7
+
+ pmaddwd mm3, [edi+40]
+ movd mm4, [eax+4]
+ paddd mm0, mm6
+
+ movd mm5, [ebx+4]
+ punpcklbw mm4, mm7
+ paddd mm1, mm6
+
+ pmaddwd mm4, [edi+48]
+ punpcklbw mm5, mm7
+ paddd mm2, mm6
+
+ pmaddwd mm5, [edi+56]
+ paddd mm3, mm6
+ paddd mm0, mm4
+
+ paddd mm1, mm5
+ movd mm4, [ecx+4]
+ punpcklbw mm4, mm7
+
+ pmaddwd mm4, [edi+64]
+ movd mm5, [edx+4]
+ punpcklbw mm5, mm7
+
+ pmaddwd mm5, [edi+72]
+ paddd mm2, mm4
+ paddd mm3, mm5
+
+ movq mm4, mm0
+ punpckldq mm0, mm1
+ movq mm5, mm2
+ punpckldq mm2, mm3
+ punpckhdq mm4, mm1
+ punpckhdq mm5, mm3
+ paddd mm0, mm4
+ paddd mm2, mm5
+ psrad mm0, 14
+ psrad mm2, 14
+
+ packssdw mm0, mm2
+ packuswb mm0, mm0
+
+ add edi, 80
+
+ movd [ebp], mm0
+ add ebp, 4
+ sub esi, 1
+ jne yloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+ }
+}
+
+void __declspec(naked) vdasm_resize_table_row_8_k12_4x_MMX(void *dst, const void *src, uint32 width, const void *kernel) {
+ static const __declspec(align(8)) __int64 kRound = 0x0000200000002000;
+ __asm {
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ pxor mm7, mm7
+ movq mm6, kRound
+
+ mov ebp, [esp + 4 + 16] ;ebp = dst
+ mov esi, [esp + 12 + 16] ;esi = width
+ mov edi, [esp + 16 + 16] ;edi = kernel
+yloop:
+ ;eax = temp
+ ;ebx = temp
+ ;ecx = temp
+ ;edx = temp
+ ;esi = horiz counter
+ ;edi = filter list
+ ;ebp = destination
+
+ mov eax, [edi+0]
+ mov ebx, [edi+4]
+ mov ecx, [edi+8]
+ mov edx, [esp+8+16]
+ add eax, edx
+ add ebx, edx
+ add ecx, edx
+ add edx, [edi+12]
+
+ movd mm0, [eax]
+ punpcklbw mm0, mm7
+
+ pmaddwd mm0, [edi+16]
+ movd mm1, [ebx]
+ punpcklbw mm1, mm7
+
+ pmaddwd mm1, [edi+24]
+ movd mm2, [ecx]
+ punpcklbw mm2, mm7
+
+ pmaddwd mm2, [edi+32]
+ movd mm3, [edx]
+ punpcklbw mm3, mm7
+
+ pmaddwd mm3, [edi+40]
+ movd mm4, [eax+4]
+ punpcklbw mm4, mm7
+
+ pmaddwd mm4, [edi+48]
+ movd mm5, [ebx+4]
+ punpcklbw mm5, mm7
+
+ pmaddwd mm5, [edi+56]
+ paddd mm0, mm4
+ paddd mm1, mm5
+
+ movd mm4, [ecx+4]
+ punpcklbw mm4, mm7
+ movd mm5, [edx+4]
+
+ pmaddwd mm4, [edi+64]
+ punpcklbw mm5, mm7
+ paddd mm2, mm4
+
+ pmaddwd mm5, [edi+72]
+ movd mm4, [eax+8]
+ punpcklbw mm4, mm7
+
+ paddd mm3, mm5
+ movd mm5, [ebx+8]
+ punpcklbw mm5, mm7
+
+ pmaddwd mm4, [edi+80]
+ paddd mm0, mm4
+ movd mm4, [ecx+8]
+
+ pmaddwd mm5, [edi+88]
+ paddd mm1, mm5
+ punpcklbw mm4, mm7
+
+ pmaddwd mm4, [edi+96]
+ movd mm5, [edx+8]
+ punpcklbw mm5, mm7
+
+ pmaddwd mm5, [edi+104]
+ paddd mm2, mm4
+ paddd mm3, mm5
+
+ movq mm4, mm0
+ punpckldq mm0, mm1
+ movq mm5, mm2
+ punpckldq mm2, mm3
+ punpckhdq mm4, mm1
+ punpckhdq mm5, mm3
+ paddd mm0, mm4
+ paddd mm2, mm5
+ paddd mm0, mm6
+ paddd mm2, mm6
+ psrad mm0, 14
+ psrad mm2, 14
+
+ packssdw mm0, mm2
+ packuswb mm0, mm0
+
+ add edi, 112
+
+ movd [ebp], mm0
+ add ebp, 4
+ sub esi, 1
+ jne yloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+ }
+}
+
+// Generic 8-bpp horizontal table resampler (MMX, one output byte per pass).
+// The kernel stream holds, per output pixel, a 32-bit source byte offset
+// followed by 'kwidth' 16-bit coefficients (kwidth must be a multiple of 4).
+// Each output byte is the rounded 1.14 fixed-point dot product, clamped to
+// [0,255] by the packssdw/packuswb pair.
+// NOTE(review): no emms before ret -- presumably the caller clears MMX state;
+// confirm against the call sites.
+void __declspec(naked) vdasm_resize_table_row_8_MMX(void *dst, const void *src, uint32 width, const void *kernel, uint32 kwidth) {
+	static const __declspec(align(8)) __int64 kRound = 0x0000000000002000;	// 0.5 in 1.14 fixed point (low dword only)
+	__asm {
+		push ebp
+		push edi
+		push esi
+		push ebx
+
+		pxor mm7, mm7		;mm7 = zero, for byte->word unpacking
+		movq mm6, kRound
+
+		mov edi, [esp + 4 + 16]		;edi = dst
+		mov ebx, [esp + 8 + 16]		;ebx = src
+		mov ebp, [esp + 12 + 16]	;ebp = width
+		mov edx, [esp + 16 + 16]	;edx = kernel
+yloop:
+		;eax = temp
+		;ebx = source base address
+		;ecx = (temp) source
+		;edx = filter list
+		;esi = (temp) kernel width
+		;edi = destination
+		;ebp = horiz counter
+
+		mov eax, [edx]			;fetch this pixel's source byte offset
+		add edx, 8			;skip offset dword + padding to coefficients
+		lea ecx, [ebx + eax]
+		mov esi, [esp + 20 + 16]	;esi = kernel width
+
+		movq mm2, mm6			;seed the accumulator with the rounding bias
+xloop:
+		movd mm0, [ecx]
+		punpcklbw mm0, mm7
+		add ecx, 4
+		pmaddwd mm0, [edx]
+		paddd mm2, mm0
+		add edx, 8
+		sub esi, 4
+		jne xloop
+
+		;fold the two partial dword sums in mm2 -- the total lands in the
+		;high dword of mm0 (its low dword is a don't-care)
+		punpckldq mm0, mm2
+		paddd mm0, mm2
+		psrad mm0, 14
+		psrlq mm0, 32
+		packssdw mm0, mm0
+		packuswb mm0, mm0
+		movd eax, mm0
+		mov [edi], al
+		add edi, 1
+		sub ebp, 1
+		jne yloop
+
+		pop ebx
+		pop esi
+		pop edi
+		pop ebp
+		ret
+	}
+}
+
+// Horizontal resample of one 8-bpp row (MMX fast paths).
+// mRowKernels holds four swizzled kernel sets, one per dword misalignment of
+// the source pointer, so the set is selected by the low 2 bits of 'src'.
+// When the quad-optimized layout exists for this offset, four output pixels
+// are produced per iteration by the k8/k12 routines, and the remaining 1-3
+// pixels fall back to the generic routine using the tail portion of the
+// kernel stream (mTailOffset).
+void VDResamplerSeparableTableRowStage8MMX::Process(void *dst, const void *src, uint32 w) {
+	int byteOffset = (int)(ptrdiff_t)src & 3;
+	const sint16 *ksrc = &mRowKernels[mRowKernelSize * byteOffset];
+#if 0
+	// Disabled scalar reference: per pixel, fetch offset, accumulate the dot
+	// product with 0x8000 bias, >>14, then branchless clamp to [0,255].
+	int kwidth = mAlignedKernelWidth;
+	uint8 *dst2 = (uint8 *)dst;
+
+	do {
+		int offset = ksrc[0];
+		ksrc += 4;
+
+		const uint8 *src2 = (const uint8 *)src + offset;
+		sint32 accum = 0x8000;
+		for(int i=0; i<kwidth; ++i) {
+			accum += (sint32)src2[i] * (*ksrc++);
+		}
+
+		accum >>= 14;
+
+		accum &= ~(accum >> 31);
+		accum |= (255 - accum) >> 31;
+
+		*dst2++ = (uint8)accum;
+
+	} while(--w);
+#else
+	int ksize = mKernelSizeByOffset[byteOffset];
+	if (mbQuadOptimizationEnabled[byteOffset]) {
+		if (w >= 4) {
+			if (ksize == 12) {
+				// 12-tap kernel: 4 pixels per iteration.
+				vdasm_resize_table_row_8_k12_4x_MMX(dst, src, w >> 2, ksrc);
+
+#if 0
+				// Disabled scalar model of the k12 4x swizzled layout:
+				// 8 words of offsets, then coefficient rows interleaved
+				// across the four output pixels (hence the index pattern).
+				int w4 = w >> 2;
+				uint8 *dst2 = (uint8 *)dst;
+				const uint8 *src2 = (const uint8 *)src;
+				const sint16 *ksrc2 = ksrc;
+
+				do {
+					int off0 = ksrc2[0];
+					int off1 = ksrc2[2];
+					int off2 = ksrc2[4];
+					int off3 = ksrc2[6];
+					const uint8 *d0 = src2 + off0;
+					const uint8 *d1 = src2 + off1;
+					const uint8 *d2 = src2 + off2;
+					const uint8 *d3 = src2 + off3;
+
+					int acc0 = 0;
+					int acc1 = 0;
+					int acc2 = 0;
+					int acc3 = 0;
+
+					acc0	+= d0[ 0]*ksrc2[ 8]
+						+  d0[ 1]*ksrc2[ 9]
+						+  d0[ 2]*ksrc2[ 10]
+						+  d0[ 3]*ksrc2[ 11]
+						+  d0[ 4]*ksrc2[ 24]
+						+  d0[ 5]*ksrc2[ 25]
+						+  d0[ 6]*ksrc2[ 26]
+						+  d0[ 7]*ksrc2[ 27]
+						+  d0[ 8]*ksrc2[ 40]
+						+  d0[ 9]*ksrc2[ 41]
+						+  d0[10]*ksrc2[ 42]
+						+  d0[11]*ksrc2[ 43];
+
+					acc0 = (acc0 + 0x2000) >> 14;
+					if (acc0 < 0) acc0 = 0; else if (acc0 > 255) acc0 = 255;
+
+					acc1	+= d1[ 0]*ksrc2[ 12]
+						+  d1[ 1]*ksrc2[ 13]
+						+  d1[ 2]*ksrc2[ 14]
+						+  d1[ 3]*ksrc2[ 15]
+						+  d1[ 4]*ksrc2[ 28]
+						+  d1[ 5]*ksrc2[ 29]
+						+  d1[ 6]*ksrc2[ 30]
+						+  d1[ 7]*ksrc2[ 31]
+						+  d1[ 8]*ksrc2[ 44]
+						+  d1[ 9]*ksrc2[ 45]
+						+  d1[10]*ksrc2[ 46]
+						+  d1[11]*ksrc2[ 47];
+
+					acc1 = (acc1 + 0x2000) >> 14;
+					if (acc1 < 0) acc1 = 0; else if (acc1 > 255) acc1 = 255;
+
+					acc2	+= d2[ 0]*ksrc2[ 16]
+						+  d2[ 1]*ksrc2[ 17]
+						+  d2[ 2]*ksrc2[ 18]
+						+  d2[ 3]*ksrc2[ 19]
+						+  d2[ 4]*ksrc2[ 32]
+						+  d2[ 5]*ksrc2[ 33]
+						+  d2[ 6]*ksrc2[ 34]
+						+  d2[ 7]*ksrc2[ 35]
+						+  d2[ 8]*ksrc2[ 48]
+						+  d2[ 9]*ksrc2[ 49]
+						+  d2[10]*ksrc2[ 50]
+						+  d2[11]*ksrc2[ 51];
+
+					acc2 = (acc2 + 0x2000) >> 14;
+					if (acc2 < 0) acc2 = 0; else if (acc2 > 255) acc2 = 255;
+
+					acc3	+= d3[ 0]*ksrc2[ 20]
+						+  d3[ 1]*ksrc2[ 21]
+						+  d3[ 2]*ksrc2[ 22]
+						+  d3[ 3]*ksrc2[ 23]
+						+  d3[ 4]*ksrc2[ 36]
+						+  d3[ 5]*ksrc2[ 37]
+						+  d3[ 6]*ksrc2[ 38]
+						+  d3[ 7]*ksrc2[ 39]
+						+  d3[ 8]*ksrc2[ 52]
+						+  d3[ 9]*ksrc2[ 53]
+						+  d3[10]*ksrc2[ 54]
+						+  d3[11]*ksrc2[ 55];
+
+					acc3 = (acc3 + 0x2000) >> 14;
+					if (acc3 < 0) acc3 = 0; else if (acc3 > 255) acc3 = 255;
+
+					ksrc2 += 56;
+
+					dst2[0] = (uint8)acc0;
+					dst2[1] = (uint8)acc1;
+					dst2[2] = (uint8)acc2;
+					dst2[3] = (uint8)acc3;
+					dst2 += 4;
+				} while(--w4);
+#endif
+			} else
+				vdasm_resize_table_row_8_k8_4x_MMX(dst, src, w >> 2, ksrc);
+		}
+
+		// Remaining 1-3 pixels: generic routine over the unswizzled tail.
+		if (w & 3)
+			vdasm_resize_table_row_8_MMX((char *)dst + (w & ~3), src, w & 3, ksrc + mTailOffset[byteOffset], ksize);
+	} else {
+		vdasm_resize_table_row_8_MMX(dst, src, w, ksrc, ksize);
+	}
+#endif
+}
+
+// Incremental-u entry point: forward to the shared MMX row resampler.
+// The bank holds 256 phases, so the per-phase kernel width is size()>>8.
+void VDResamplerSeparableTableRowStage8MMX::Process(void *dst, const void *src, uint32 w, uint32 u, uint32 dudx) {
+	const int *kernels = (const int *)mFilterBank.data();
+	const int kernelWidth = (int)mFilterBank.size() >> 8;
+
+	vdasm_resize_table_row_MMX((uint32 *)dst, (const uint32 *)src, kernels, kernelWidth, w, u, dudx);
+}
+
+// Builds the 32-bpp row stage and swizzles the coefficient bank into the
+// interleaved-pair layout the MMX pmaddwd loops expect.
+VDResamplerSeparableTableRowStageMMX::VDResamplerSeparableTableRowStageMMX(const IVDResamplerFilter& filter)
+	: VDResamplerRowStageSeparableTable32(filter)
+{
+	VDResamplerSwizzleTable(mFilterBank.data(), (unsigned)mFilterBank.size() >> 1);
+}
+
+// Delegate the 32-bpp row resample to the shared MMX routine; the bank
+// stores 256 phases, so the kernel width per phase is size()>>8.
+void VDResamplerSeparableTableRowStageMMX::Process(void *dst, const void *src, uint32 w, uint32 u, uint32 dudx) {
+	const int *kernels = (const int *)mFilterBank.data();
+	const int kernelWidth = (int)mFilterBank.size() >> 8;
+
+	vdasm_resize_table_row_MMX((uint32 *)dst, (const uint32 *)src, kernels, kernelWidth, w, u, dudx);
+}
+
+///////////////////////////////////////////////////////////////////////////
+
+// Builds the 8-bpp column stage and swizzles the coefficient bank into the
+// interleaved-pair layout consumed by the MMX column kernels below.
+VDResamplerSeparableTableColStage8MMX::VDResamplerSeparableTableColStage8MMX(const IVDResamplerFilter& filter)
+	: VDResamplerColStageSeparableTable8(filter)
+{
+	VDResamplerSwizzleTable(mFilterBank.data(), (unsigned)mFilterBank.size() >> 1);
+}
+
+// Vertical resample, 8 bpp, 2-tap kernel (MMX). Produces four output bytes
+// per iteration: dst[x] = clamp(round((row0[x]*k0 + row1[x]*k1) >> 14)).
+// 'width' must be a multiple of 4.
+void __declspec(naked) vdasm_resize_table_col_8_k2_MMX(void *dst, const void *const *srcs, uint32 width, const void *kernel) {
+	static const __declspec(align(8)) __int64 kRound = 0x0000200000002000;	// 0.5 in 1.14, both dwords
+
+	__asm {
+		push ebp
+		push edi
+		push esi
+		push ebx
+
+		pxor mm7, mm7
+		movq mm6, kRound
+
+		mov esi, [esp + 4 + 16]		;esi = dst
+		mov edi, [esp + 16 + 16]	;edi = kernel
+		mov ebp, [esp + 12 + 16]	;ebp = width
+
+		movq mm5, [edi]			;mm5 = swizzled 2-tap coefficient pair
+
+		mov edx, [esp + 8 + 16]		;edx = srcs
+		mov eax, [edx+0]
+		mov ebx, [edx+4]
+		;bias the row pointers by width and count ebp up to zero
+		add eax, ebp
+		add ebx, ebp
+		neg ebp
+yloop:
+		;eax = row0
+		;ebx = row1
+		;ecx =
+		;edx =
+		;edi = kernel
+		;esi = dest
+		;ebp = width counter
+
+		movd mm0, [eax+ebp]
+		punpcklbw mm0, mm7
+		movd mm2, [ebx+ebp]
+		punpcklbw mm2, mm7
+		;interleave the two rows word-wise so pmaddwd pairs them per column
+		movq mm1, mm0
+		punpcklwd mm0, mm2
+		punpckhwd mm1, mm2
+		pmaddwd mm0, mm5
+		pmaddwd mm1, mm5
+
+		paddd mm0, mm6
+		paddd mm1, mm6
+
+		psrad mm0, 14
+		psrad mm1, 14
+		packssdw mm0, mm1
+		packuswb mm0, mm0
+		movd [esi], mm0
+		add esi, 4
+		add ebp, 4
+		jne yloop
+
+		pop ebx
+		pop esi
+		pop edi
+		pop ebp
+		ret
+	}
+}
+
+// Vertical resample, 8 bpp, 4-tap kernel (MMX). Four output bytes per
+// iteration, rows 0/1 accumulated with the first coefficient pair and rows
+// 2/3 with the second, rounded in 1.14 and clamped. 'width' must be a
+// multiple of 4.
+void __declspec(naked) vdasm_resize_table_col_8_k4_MMX(void *dst, const void *const *srcs, uint32 width, const void *kernel) {
+	static const __declspec(align(8)) __int64 kRound = 0x0000200000002000;	// 0.5 in 1.14, both dwords
+
+	__asm {
+		push ebp
+		push edi
+		push esi
+		push ebx
+
+		pxor mm7, mm7
+		movq mm6, kRound
+
+		mov esi, [esp + 4 + 16]		;esi = dst
+		mov edi, [esp + 16 + 16]	;edi = kernel
+		xor ebp, ebp
+
+		mov edx, [esp + 8 + 16]		;edx = srcs
+		mov eax, [edx+0]
+		mov ebx, [edx+4]
+		mov ecx, [edx+8]
+		mov edx, [edx+12]
+yloop:
+		;eax = row0
+		;ebx = row1
+		;ecx = row2
+		;edx = row3
+		;edi = kernel
+		;esi = dest
+		;ebp = width counter
+
+		movd mm0, [eax+ebp]
+		punpcklbw mm0, mm7
+		movd mm2, [ebx+ebp]
+		punpcklbw mm2, mm7
+		movq mm1, mm0
+		punpcklwd mm0, mm2
+		movq mm5, [edi]
+		punpckhwd mm1, mm2
+		pmaddwd mm0, mm5
+		pmaddwd mm1, mm5
+
+		paddd mm0, mm6
+		paddd mm1, mm6
+
+		;rows 2 and 3 with the second coefficient pair
+		movd mm3, [ecx+ebp]
+		punpcklbw mm3, mm7
+		movd mm2, [edx+ebp]
+		punpcklbw mm2, mm7
+		movq mm4, mm3
+		punpcklwd mm3, mm2
+		movq mm5, [edi+8]
+		punpckhwd mm4, mm2
+		pmaddwd mm3, mm5
+		pmaddwd mm4, mm5
+
+		paddd mm0, mm3
+		paddd mm1, mm4
+
+		psrad mm0, 14
+		psrad mm1, 14
+		packssdw mm0, mm1
+		packuswb mm0, mm0
+		add ebp, 4
+		movd [esi], mm0
+		add esi, 4
+		cmp ebp, [esp + 12 + 16]
+		jb yloop
+
+		pop ebx
+		pop esi
+		pop edi
+		pop ebp
+		ret
+	}
+}
+
+// Vertical resample, 8 bpp, arbitrary even kernel width (MMX). The inner
+// loop consumes the source-row list and coefficients two rows at a time;
+// accumulators are pre-seeded with the 1.14 rounding bias. 'width' must be
+// a multiple of 4 and 'kwidth' a multiple of 2.
+void __declspec(naked) vdasm_resize_table_col_8_MMX(void *dst, const void *const *srcs, uint32 width, const void *kernel, uint32 kwidth) {
+	static const __declspec(align(8)) __int64 kRound = 0x0000200000002000;	// 0.5 in 1.14, both dwords
+
+	__asm {
+		push ebp
+		push edi
+		push esi
+		push ebx
+
+		pxor mm7, mm7
+		movq mm6, kRound
+
+		mov edi, [esp + 4 + 16]		;edi = dst
+		xor ebp, ebp
+yloop:
+		mov edx, [esp + 16 + 16]	;edx = kernel
+		mov ebx, [esp + 8 + 16]		;ebx = srcs
+		mov esi, [esp + 20 + 16]	;esi = kwidth
+		movq mm3, mm6
+		movq mm4, mm6
+xloop:
+		;two source rows per pass, paired word-wise for pmaddwd
+		mov ecx, [ebx]
+		movd mm0, [ecx+ebp]
+		punpcklbw mm0, mm7
+		mov ecx, [ebx+4]
+		movd mm2, [ecx+ebp]
+		punpcklbw mm2, mm7
+		movq mm1, mm0
+		punpcklwd mm0, mm2
+		punpckhwd mm1, mm2
+		movq mm5, [edx]
+		pmaddwd mm0, mm5
+		pmaddwd mm1, mm5
+
+		paddd mm3, mm0
+		paddd mm4, mm1
+		add ebx, 8
+		add edx, 8
+		sub esi, 2
+		jne xloop
+
+		psrad mm3, 14
+		psrad mm4, 14
+		packssdw mm3, mm4
+		packuswb mm3, mm3
+		movd [edi], mm3
+		add edi, 4
+		add ebp, 4
+		cmp ebp, [esp + 12 + 16]
+		jb yloop
+
+		pop ebx
+		pop esi
+		pop edi
+		pop ebp
+		ret
+	}
+}
+
+// Vertical resample of one 8-bpp row. The multiple-of-4 prefix is handled by
+// width-specialized MMX routines; the trailing w&3 pixels are computed in
+// scalar code against the same swizzled coefficient table (coefficient pairs
+// are interleaved per group of 4 output columns, hence filter2 += 4 while
+// only [0] and [1] are read).
+void VDResamplerSeparableTableColStage8MMX::Process(void *dst0, const void *const *src0, uint32 w, sint32 phase) {
+	uint8 *dst = (uint8 *)dst0;
+	const uint8 *const *src = (const uint8 *const *)src0;
+	const unsigned ksize = (unsigned)mFilterBank.size() >> 8;	// taps per phase; the bank holds 256 phases
+	const sint16 *filter = (const sint16 *)&mFilterBank[((phase>>8)&0xff) * ksize];
+
+	int w4 = w & ~3;
+
+	if (w4) {
+		switch(ksize) {
+		case 2:
+			vdasm_resize_table_col_8_k2_MMX(dst, (const void *const *)src, w4, filter);
+			break;
+
+		case 4:
+			vdasm_resize_table_col_8_k4_MMX(dst, (const void *const *)src, w4, filter);
+			break;
+
+		default:
+			vdasm_resize_table_col_8_MMX(dst, (const void *const *)src, w4, filter, ksize);
+			break;
+		}
+	}
+
+	for(uint32 i=w4; i<w; ++i) {
+		int b = 0x2000;		// 0.5 in 1.14 fixed point
+		const sint16 *filter2 = filter;
+		const uint8 *const *src2 = src;
+
+		for(unsigned j = ksize; j; j -= 2) {
+			sint32 p0 = (*src2++)[i];
+			sint32 p1 = (*src2++)[i];
+			sint32 coeff0 = filter2[0];
+			sint32 coeff1 = filter2[1];
+			filter2 += 4;
+
+			b += p0*coeff0;
+			b += p1*coeff1;
+		}
+
+		b >>= 14;
+
+		// Branchless clamp: if out of [0,255], b<0 yields 0 and b>255 yields 255.
+		if ((uint32)b >= 0x00000100)
+			b = ~b >> 31;
+
+		dst[i] = (uint8)b;
+	}
+}
+
+///////////////////////////////////////////////////////////////////////////
+
+// Builds the 32-bpp column stage and swizzles the coefficient bank into the
+// interleaved-pair layout required by the MMX column resampler.
+VDResamplerSeparableTableColStageMMX::VDResamplerSeparableTableColStageMMX(const IVDResamplerFilter& filter)
+	: VDResamplerColStageSeparableTable32(filter)
+{
+	VDResamplerSwizzleTable(mFilterBank.data(), (unsigned)mFilterBank.size() >> 1);
+}
+
+// Delegate the 32-bpp column resample to the shared MMX routine; the
+// 8-bit phase selects one of the 256 kernels in the bank.
+void VDResamplerSeparableTableColStageMMX::Process(void *dst, const void *const *src, uint32 w, sint32 phase) {
+	const int *kernels = (const int *)mFilterBank.data();
+	const int kernelWidth = (int)mFilterBank.size() >> 8;
+	const int phaseIndex = (phase >> 8) & 0xff;
+
+	vdasm_resize_table_col_MMX((uint32*)dst, (const uint32 *const *)src, kernels, kernelWidth, w, phaseIndex);
+}
+
+///////////////////////////////////////////////////////////////////////////
+//
+// resampler stages (SSE2, x86)
+//
+///////////////////////////////////////////////////////////////////////////
+
+extern "C" long vdasm_resize_table_col_SSE2(uint32 *out, const uint32 *const*in_table, const int *filter, int filter_width, uint32 w, long frac);
+extern "C" long vdasm_resize_table_row_SSE2(uint32 *out, const uint32 *in, const int *filter, int filter_width, uint32 w, long accum, long frac);
+extern "C" void vdasm_resize_ccint_col_SSE2(void *dst, const void *src1, const void *src2, const void *src3, const void *src4, uint32 count, const void *tbl);
+
+// SSE2 cubic column stage: reuses the MMX base's coefficient setup; only
+// Process() differs (SSE2 inner loop).
+VDResamplerSeparableCubicColStageSSE2::VDResamplerSeparableCubicColStageSSE2(double A)
+	: VDResamplerSeparableCubicColStageMMX(A)
+{
+}
+
+// Blends four source rows with the 4-coefficient group selected by the
+// phase: (phase>>6)&0x3fc equals ((phase>>8)&0xff)*4, i.e. 8-bit phase
+// index scaled to groups of four bank entries.
+void VDResamplerSeparableCubicColStageSSE2::Process(void *dst0, const void *const *srcarray, uint32 w, sint32 phase) {
+	vdasm_resize_ccint_col_SSE2(dst0, srcarray[0], srcarray[1], srcarray[2], srcarray[3], w, mFilterBank.data() + ((phase>>6)&0x3fc));
+}
+
+// SSE2 row stage: inherits the MMX stage's swizzled coefficient setup.
+VDResamplerSeparableTableRowStageSSE2::VDResamplerSeparableTableRowStageSSE2(const IVDResamplerFilter& filter)
+	: VDResamplerSeparableTableRowStageMMX(filter)
+{
+}
+
+// Row resample for the SSE2 stage.
+// NOTE(review): this calls the MMX row routine even though
+// vdasm_resize_table_row_SSE2 is declared above -- presumably intentional
+// (the SSE2 row path may not be profitable); confirm against upstream.
+void VDResamplerSeparableTableRowStageSSE2::Process(void *dst, const void *src, uint32 w, uint32 u, uint32 dudx) {
+	const int *kernels = (const int *)mFilterBank.data();
+	const int kernelWidth = (int)mFilterBank.size() >> 8;
+
+	vdasm_resize_table_row_MMX((uint32 *)dst, (const uint32 *)src, kernels, kernelWidth, w, u, dudx);
+}
+
+// SSE2 column stage: inherits the MMX stage's swizzled coefficient setup.
+VDResamplerSeparableTableColStageSSE2::VDResamplerSeparableTableColStageSSE2(const IVDResamplerFilter& filter)
+	: VDResamplerSeparableTableColStageMMX(filter)
+{
+}
+
+// Delegate the 32-bpp column resample to the SSE2 routine; the 8-bit phase
+// selects one of the 256 kernels in the bank.
+void VDResamplerSeparableTableColStageSSE2::Process(void *dst, const void *const *src, uint32 w, sint32 phase) {
+	const int *kernels = (const int *)mFilterBank.data();
+	const int kernelWidth = (int)mFilterBank.size() >> 8;
+	const int phaseIndex = (phase >> 8) & 0xff;
+
+	vdasm_resize_table_col_SSE2((uint32*)dst, (const uint32 *const *)src, kernels, kernelWidth, w, phaseIndex);
+}
+
+///////////////////////////////////////////////////////////////////////////
+//
+// resampler stages (SSE4.1, x86)
+//
+///////////////////////////////////////////////////////////////////////////
+
+// SSE4.1 8-bpp row stage. The kernel width is the filter window rounded up
+// to a multiple of 8 with headroom ((w+15)&~7); mAlignedKernelSize adds 16
+// more entries per kernel -- presumably header/offset padding for the
+// swizzled stream built in RedoRowFilters (TODO confirm).
+// The cached-geometry members start zeroed so the first Init() always
+// rebuilds the kernels.
+VDResamplerSeparableTableRowStage8SSE41::VDResamplerSeparableTableRowStage8SSE41(const IVDResamplerFilter& filter)
+	: VDResamplerRowStageSeparableTable32(filter)
+	, mLastSrcWidth(0)
+	, mLastDstWidth(0)
+	, mLastU(0)
+	, mLastDUDX(0)
+{
+	mAlignedKernelWidth = (GetWindowSize() + 15) & ~7;
+	mAlignedKernelSize = mAlignedKernelWidth + 16;
+}
+
+// Rebuilds the precomputed row kernels, but only when the resampling
+// geometry (source width, destination width, start coordinate or step)
+// actually changed since the last call.
+void VDResamplerSeparableTableRowStage8SSE41::Init(const VDResamplerAxis& axis, uint32 srcw) {
+	uint32 w = axis.dx_preclip + axis.dx_active + axis.dx_postclip + axis.dx_dualclip;
+
+	if (mLastSrcWidth != srcw || mLastDstWidth != w || mLastU != axis.u || mLastDUDX != axis.dudx) {
+		mLastSrcWidth = srcw;
+		mLastDstWidth = w;
+		mLastU = axis.u;
+		mLastDUDX = axis.dudx;
+
+		RedoRowFilters(axis, w, srcw);
+	}
+}
+
+// Rebuilds the per-misalignment row kernel tables.
+// For each of the 8 possible source byte offsets, a kernel stream is built
+// with one entry per output pixel: an 8-aligned 32-bit source offset plus
+// six zero padding words, followed by ksizeThisOffset coefficients.
+// Coefficients that would sample outside the source are folded (accumulated)
+// into the nearest valid tap, which implements edge clamping. Offsets whose
+// kernels fit in 8..16 taps are then reswizzled into the 4-pixel interleaved
+// layout consumed by the 4x fast-path routines.
+void VDResamplerSeparableTableRowStage8SSE41::RedoRowFilters(const VDResamplerAxis& axis, uint32 w, uint32 srcw) {
+	int kstride = mFilterBank.size() >> 8;		// taps per phase in the bank (256 phases)
+	int ksize = mAlignedKernelWidth;
+	int kesize = mAlignedKernelSize;
+
+	mRowKernels.clear();
+	mRowKernelSize = w * kesize;
+
+	mRowKernels.resize(mRowKernelSize * 8, 0);
+
+	for(int byteOffset = 0; byteOffset < 8; ++byteOffset) {
+		sint16 *dst = mRowKernels.data() + mRowKernelSize * byteOffset;
+		// Kernel width actually needed for this misalignment, capped by the
+		// rounded-up source extent.
+		int ksizeThisOffset = std::min<int>(ksize, (byteOffset + srcw + 7) & ~7);
+
+		mKernelSizeByOffset[byteOffset] = ksizeThisOffset;
+
+		sint32 u = axis.u;
+		sint32 uoffmin = -byteOffset;
+		sint32 uoffmax = ((srcw + byteOffset + 7) & ~7) - byteOffset - ksizeThisOffset;
+		for(uint32 i=0; i<w; ++i) {
+			sint32 uoffset = u >> 16;
+			// Snap the load base down to 8-byte alignment relative to the
+			// misaligned source pointer, then clamp to the valid range.
+			sint32 uoffset2 = ((uoffset + byteOffset) & ~7) - byteOffset;
+
+			if (uoffset2 < uoffmin)
+				uoffset2 = uoffmin;
+
+			if (uoffset2 > uoffmax)
+				uoffset2 = uoffmax;
+
+			// Entry header: 32-bit offset + six zero words of padding.
+			*(sint32 *)dst = uoffset2;
+			dst += 2;
+			*dst++ = 0;
+			*dst++ = 0;
+			*dst++ = 0;
+			*dst++ = 0;
+			*dst++ = 0;
+			*dst++ = 0;
+
+			uint32 phase = (u >> 8) & 255;
+			const sint32 *src = &mFilterBank[kstride * phase];
+
+			sint32 start = 0;
+			sint32 end = kstride;
+
+			int dstoffset = uoffset - uoffset2;
+
+			// check for filter kernel overlapping left source boundary
+			if (uoffset < 0)
+				start = -uoffset;
+
+			// check for filter kernel overlapping right source boundary
+			if (uoffset + end > (sint32)srcw)
+				end = srcw - uoffset;
+
+			VDASSERT(dstoffset + start >= 0);
+			VDASSERT(dstoffset + end <= ksizeThisOffset);
+
+			sint16 *dst2 = dst + dstoffset;
+			dst += ksizeThisOffset;
+
+			for(int j=start; j<end; ++j)
+				dst2[j] = src[j];
+
+			// Fold out-of-range taps into the first/last valid tap (edge clamp).
+			if (start > 0)
+				dst2[start] = std::accumulate(src, src+start, dst2[start]);
+
+			if (end < kstride)
+				dst2[end - 1] = std::accumulate(src+end, src+kstride, dst2[end - 1]);
+
+			u += axis.dudx;
+		}
+	}
+
+	// swizzle rows where optimization is possible
+	vdfastvector<sint16> temp;
+
+	int quads = w >> 2;
+	int quadRemainder = w & 3;
+
+	for(int byteOffset = 0; byteOffset < 8; ++byteOffset) {
+		int ksizeThisOffset = mKernelSizeByOffset[byteOffset];
+		int kpairs = ksizeThisOffset >> 3;
+
+		if (ksizeThisOffset < 8 || ksizeThisOffset > 16) {
+			mbQuadOptimizationEnabled[byteOffset] = false;
+		} else {
+			// Stride of one unswizzled entry, in dwords (header = 4 dwords).
+			ptrdiff_t unswizzledStride = (ksizeThisOffset >> 1) + 4;
+
+			mbQuadOptimizationEnabled[byteOffset] = true;
+			mTailOffset[byteOffset] = quads * (8 + ksizeThisOffset*4);
+
+			uint32 *dst = (uint32 *)&mRowKernels[mRowKernelSize * byteOffset];
+			temp.resize(mRowKernelSize);
+			memcpy(temp.data(), dst, mRowKernelSize*2);
+
+			const uint32 *src0 = (const uint32 *)temp.data();
+			const uint32 *src1 = src0 + unswizzledStride;
+			const uint32 *src2 = src1 + unswizzledStride;
+			const uint32 *src3 = src2 + unswizzledStride;
+			ptrdiff_t srcskip = unswizzledStride * 3;
+
+			for(int q = 0; q < quads; ++q) {
+				// Four offsets, then coefficients interleaved in groups of
+				// four dwords per pixel.
+				dst[0] = src0[0];
+				dst[1] = src1[0];
+				dst[2] = src2[0];
+				dst[3] = src3[0];
+				src0 += 4;
+				src1 += 4;
+				src2 += 4;
+				src3 += 4;
+				dst += 4;
+
+				for(int p = 0; p < kpairs; ++p) {
+					dst[ 0] = src0[0];
+					dst[ 1] = src0[1];
+					dst[ 2] = src0[2];
+					dst[ 3] = src0[3];
+					dst[ 4] = src1[0];
+					dst[ 5] = src1[1];
+					dst[ 6] = src1[2];
+					dst[ 7] = src1[3];
+					dst[ 8] = src2[0];
+					dst[ 9] = src2[1];
+					dst[10] = src2[2];
+					dst[11] = src2[3];
+					dst[12] = src3[0];
+					dst[13] = src3[1];
+					dst[14] = src3[2];
+					dst[15] = src3[3];
+					dst += 16;
+					src0 += 4;
+					src1 += 4;
+					src2 += 4;
+					src3 += 4;
+				}
+
+				src0 += srcskip;
+				src1 += srcskip;
+				src2 += srcskip;
+				src3 += srcskip;
+			}
+
+			// Leftover 1-3 entries stay in the unswizzled layout (tail path).
+			memcpy(dst, src0, unswizzledStride * 4 * quadRemainder);
+		}
+	}
+}
+
+// Horizontal resample of one 8-bpp row (SSE4.1 fast paths). The kernel set
+// is selected by the source pointer's misalignment (low 3 bits); groups of
+// four output pixels use the k8/k16 routines and the remaining 1-3 pixels
+// fall back to the generic routine with the tail kernel stream.
+void VDResamplerSeparableTableRowStage8SSE41::Process(void *dst, const void *src, uint32 w) {
+	int byteOffset = (int)(ptrdiff_t)src & 7;
+	const sint16 *ksrc = &mRowKernels[mRowKernelSize * byteOffset];
+
+	int ksize = mKernelSizeByOffset[byteOffset];
+	if (mbQuadOptimizationEnabled[byteOffset]) {
+		if (w >= 4) {
+			if (ksize == 16)
+				vdasm_resize_table_row_8_k16_4x_SSE41(dst, src, w >> 2, ksrc);
+			else
+				vdasm_resize_table_row_8_k8_4x_SSE41(dst, src, w >> 2, ksrc);
+		}
+
+		if (w & 3)
+			vdasm_resize_table_row_8_SSE41((char *)dst + (w & ~3), src, w & 3, ksrc + mTailOffset[byteOffset], ksize);
+	} else {
+		vdasm_resize_table_row_8_SSE41(dst, src, w, ksrc, ksize);
+	}
+}
+
+// Incremental-u (32-bpp) entry point: forward to the shared MMX routine;
+// the bank stores 256 phases, so the kernel width per phase is size()>>8.
+void VDResamplerSeparableTableRowStage8SSE41::Process(void *dst, const void *src, uint32 w, uint32 u, uint32 dudx) {
+	const int *kernels = (const int *)mFilterBank.data();
+	const int kernelWidth = (int)mFilterBank.size() >> 8;
+
+	vdasm_resize_table_row_MMX((uint32 *)dst, (const uint32 *)src, kernels, kernelWidth, w, u, dudx);
+}
+
+///////////////////////////////////////////////////////////////////////////
+
+// SSE4.1 8-bpp column stage: swizzles the coefficient bank into the
+// interleaved-pair layout shared with the MMX column kernels.
+VDResamplerSeparableTableColStage8SSE41::VDResamplerSeparableTableColStage8SSE41(const IVDResamplerFilter& filter)
+	: VDResamplerColStageSeparableTable8(filter)
+{
+	VDResamplerSwizzleTable(mFilterBank.data(), (unsigned)mFilterBank.size() >> 1);
+}
+
+// Vertical resample of one 8-bpp row (SSE4.1). 2- and 4-tap kernels use the
+// SSE4.1 routines; other widths fall back to the generic MMX routine. The
+// trailing w&3 pixels are computed in scalar code against the swizzled
+// coefficient table (pairs interleaved per 4 columns, hence filter2 += 4).
+void VDResamplerSeparableTableColStage8SSE41::Process(void *dst0, const void *const *src0, uint32 w, sint32 phase) {
+	uint8 *dst = (uint8 *)dst0;
+	const uint8 *const *src = (const uint8 *const *)src0;
+	const unsigned ksize = (unsigned)mFilterBank.size() >> 8;	// taps per phase; bank holds 256 phases
+	const sint16 *filter = (const sint16 *)&mFilterBank[((phase>>8)&0xff) * ksize];
+
+	int w4 = w & ~3;
+
+	if (w4) {
+		switch(ksize) {
+		case 2:
+			vdasm_resize_table_col_8_k2_SSE41(dst, (const void *const *)src, w4, filter);
+			break;
+
+		case 4:
+			vdasm_resize_table_col_8_k4_SSE41(dst, (const void *const *)src, w4, filter);
+			break;
+
+		default:
+			vdasm_resize_table_col_8_MMX(dst, (const void *const *)src, w4, filter, ksize);
+			break;
+		}
+	}
+
+	for(uint32 i=w4; i<w; ++i) {
+		int b = 0x2000;		// 0.5 in 1.14 fixed point
+		const sint16 *filter2 = filter;
+		const uint8 *const *src2 = src;
+
+		for(unsigned j = ksize; j; j -= 2) {
+			sint32 p0 = (*src2++)[i];
+			sint32 p1 = (*src2++)[i];
+			sint32 coeff0 = filter2[0];
+			sint32 coeff1 = filter2[1];
+			filter2 += 4;
+
+			b += p0*coeff0;
+			b += p1*coeff1;
+		}
+
+		b >>= 14;
+
+		// Branchless clamp: if out of [0,255], b<0 yields 0 and b>255 yields 255.
+		if ((uint32)b >= 0x00000100)
+			b = ~b >> 31;
+
+		dst[i] = (uint8)b;
+	}
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/stretchblt_reference.cpp b/src/thirdparty/VirtualDub/Kasumi/source/stretchblt_reference.cpp
new file mode 100644
index 000000000..3afdec910
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/stretchblt_reference.cpp
@@ -0,0 +1,816 @@
+#include <vd2/system/memory.h>
+#include <vd2/system/cpuaccel.h>
+#include <vd2/Kasumi/pixmap.h>
+#include <vd2/Kasumi/pixmaputils.h>
+
+namespace {
+	// Per-blit state for the nearest-neighbor reference loops.
+	// Horizontal sampling uses a 32-bit fraction 'u' stepped by 'dudx' plus
+	// an integer step 'uinc'; vertical sampling mirrors that with 'v'/'dvdy'
+	// and the whole-row increment 'srcinc'.
+	struct VDPixmapReferenceStretchBltParameters {
+		void *dst;			// current destination row
+		ptrdiff_t dstpitch;		// destination row pitch in bytes
+		const void *src;		// current source row (already offset horizontally)
+		ptrdiff_t srcpitch;		// source row pitch in bytes (carry step)
+		ptrdiff_t srcinc;		// integer vertical step per output row, in bytes
+		sint32 dx;			// main-span width in pixels
+		sint32 dy;			// number of output rows
+		uint32 u;			// horizontal source fraction (32-bit)
+		uint32 uinc;			// integer horizontal step per output pixel
+		uint32 dudx;			// horizontal fraction increment
+		uint32 v;			// vertical source fraction (32-bit)
+		uint32 dvdy;			// vertical fraction increment
+		sint32 xprecopy;		// pixels replicated before the main span
+		sint32 xpostcopy;		// pixels replicated after the main span
+		ptrdiff_t xprepos;		// byte offset of the left replicate texel
+		ptrdiff_t xpostpos;		// byte offset of the right replicate texel
+
+		// Step to the next output row: advance by the integer row step and
+		// add one extra source row when the 32-bit fraction wraps (carry).
+		void advance() {
+			dst = (char *)dst + dstpitch;
+			src = (char *)src + srcinc;
+
+			uint32 vt = v + dvdy;
+
+			if (vt < v)
+				src = (char *)src + srcpitch;
+
+			v = vt;
+		}
+	};
+}
+
+// Nearest-neighbor blit, 1 byte/pixel. Out-of-range left/right columns
+// replicate the edge texel at xprepos/xpostpos. Horizontal stepping: srcp
+// advances by the integer step (uinc) plus one extra pixel whenever the
+// 32-bit fraction wraps (ut < u detects the carry).
+void VDPixmapStretchBlt_Any8_nearest_reference(VDPixmapReferenceStretchBltParameters params) {
+	do {
+		uint8 *dstp = (uint8 *)params.dst;
+		const uint8 *srcp = (const uint8 *)params.src;
+		uint32 u = params.u;
+
+		if (params.xprecopy) {
+			VDMemset8(dstp, *(const uint8 *)((const char *)params.src + params.xprepos), params.xprecopy);
+			dstp += params.xprecopy;
+		}
+
+		sint32 wt = params.dx;
+
+		if (wt > 0)
+			do {
+				*dstp++ = *srcp;
+
+				uint32 ut = u + params.dudx;
+				srcp += ut<u;		// carry out of the fraction -> one extra pixel
+				srcp += params.uinc;
+				u = ut;
+			} while(--wt);
+
+		if (params.xpostcopy)
+			VDMemset8(dstp, *(const uint8 *)((const char *)params.src + params.xpostpos), params.xpostcopy);
+
+		params.advance();
+	} while(--params.dy);
+}
+
+// Nearest-neighbor blit, 2 bytes/pixel. Same structure as the 8-bpp loop:
+// edge replication outside the main span, fractional stepping with carry.
+void VDPixmapStretchBlt_Any16_nearest_reference(VDPixmapReferenceStretchBltParameters params) {
+	do {
+		uint16 *dstp = (uint16 *)params.dst;
+		const uint16 *srcp = (const uint16 *)params.src;
+		uint32 u = params.u;
+
+		if (params.xprecopy) {
+			VDMemset16(dstp, *(const uint16 *)((const char *)params.src + params.xprepos), params.xprecopy);
+			dstp += params.xprecopy;
+		}
+
+		sint32 wt = params.dx;
+
+		if (wt > 0)
+			do {
+				*dstp++ = *srcp;
+
+				uint32 ut = u + params.dudx;
+				srcp += ut<u;		// carry out of the fraction -> one extra pixel
+				srcp += params.uinc;
+				u = ut;
+			} while(--wt);
+
+		if (params.xpostcopy)
+			VDMemset16(dstp, *(const uint16 *)((const char *)params.src + params.xpostpos), params.xpostcopy);
+
+		params.advance();
+	} while(--params.dy);
+}
+
+// Nearest-neighbor blit, 3 bytes/pixel. No 24-bit memset exists, so the
+// replicate regions copy the three edge-texel bytes in a manual loop; the
+// fractional stepping multiplies both the carry and uinc by 3 bytes.
+void VDPixmapStretchBlt_Any24_nearest_reference(VDPixmapReferenceStretchBltParameters params) {
+	do {
+		uint8 *dstp = (uint8 *)params.dst;
+		const uint8 *srcp = (const uint8 *)params.src;
+		uint32 u = params.u;
+
+		if (params.xprecopy) {
+			const uint8 *repsrc = (const uint8 *)params.src + params.xprepos;
+			const uint8 p0 = repsrc[0];
+			const uint8 p1 = repsrc[1];
+			const uint8 p2 = repsrc[2];
+
+			for(sint32 i=0; i<params.xprecopy; ++i) {
+				dstp[0] = p0;
+				dstp[1] = p1;
+				dstp[2] = p2;
+				dstp += 3;
+			}
+		}
+
+		sint32 wt = params.dx;
+
+		if (wt > 0)
+			do {
+				dstp[0] = srcp[0];
+				dstp[1] = srcp[1];
+				dstp[2] = srcp[2];
+				dstp += 3;
+
+				uint32 ut = u + params.dudx;
+				srcp += (ut<u)*3;	// carry out of the fraction -> one extra pixel
+				srcp += params.uinc*3;
+				u = ut;
+			} while(--wt);
+
+		if (params.xpostcopy) {
+			const uint8 *repsrc = (const uint8 *)params.src + params.xpostpos;
+			const uint8 p0 = repsrc[0];
+			const uint8 p1 = repsrc[1];
+			const uint8 p2 = repsrc[2];
+
+			for(sint32 i=0; i<params.xpostcopy; ++i) {
+				dstp[0] = p0;
+				dstp[1] = p1;
+				dstp[2] = p2;
+				dstp += 3;
+			}
+		}
+
+		params.advance();
+	} while(--params.dy);
+}
+
+// Nearest-neighbor blit, 4 bytes/pixel. Same structure as the 8-bpp loop:
+// edge replication outside the main span, fractional stepping with carry.
+void VDPixmapStretchBlt_Any32_nearest_reference(VDPixmapReferenceStretchBltParameters params) {
+	do {
+		uint32 *dstp = (uint32 *)params.dst;
+		const uint32 *srcp = (const uint32 *)params.src;
+		uint32 u = params.u;
+
+		if (params.xprecopy) {
+			VDMemset32(dstp, *(const uint32 *)((const char *)params.src + params.xprepos), params.xprecopy);
+			dstp += params.xprecopy;
+		}
+
+		sint32 wt = params.dx;
+		if (wt > 0)
+			do {
+				*dstp++ = *srcp;
+
+				uint32 ut = u + params.dudx;
+				srcp += ut<u;		// carry out of the fraction -> one extra pixel
+				srcp += params.uinc;
+				u = ut;
+			} while(--wt);
+
+		if (params.xpostcopy)
+			VDMemset32(dstp, *(const uint32 *)((const char *)params.src + params.xpostpos), params.xpostcopy);
+
+		params.advance();
+	} while(--params.dy);
+}
+
+///////////////////////////////////////////////////////////////////////////
+
+namespace {
+	// Splits a destination span of dx pixels into [precopy][main][postcopy]
+	// bands for nearest-neighbor sampling: pixels whose 32.32 source
+	// coordinate falls left of the source replicate the texel at xprepos,
+	// pixels falling right of it replicate the texel at xpostpos, and the
+	// in-range middle is sampled normally. On return, u64 has been advanced
+	// past the precopy band. 'du' is the source extent in pixels.
+	void VDSetupNearestSamplingParameters(sint64& u64, sint64 dudx, sint32 dx, sint32 du, sint32& xprecopy, sint32& xprepos, sint32& xmain, sint32& xpostcopy, sint32& xpostpos) {
+		sint64 ulo = u64;
+		sint64 uhi = u64 + dudx * (dx - 1);
+		sint64 tdudx = dudx;
+		const sint64 ulimit = ((sint64)du << 32);
+
+		xprepos = 0;
+		xpostpos = du-1;
+
+		if (!tdudx) {
+			// Degenerate gradient: every output pixel samples the same column.
+			if (u64 < 0)
+				xprecopy = dx;
+			else if (u64 >= ulimit)
+				xpostcopy = dx;		// FIX: right overflow must replicate the right edge (was xprecopy = dx, which used the left texel)
+			else
+				xmain = dx;
+		} else {
+			// Work with an increasing coordinate; remember to swap the bands
+			// back below if the gradient was negative.
+			if (tdudx < 0) {
+				std::swap(ulo, uhi);
+				tdudx = -tdudx;
+			}
+
+			if (ulo < 0) {
+				if (uhi < 0)
+					xprecopy = dx;
+				else
+					xprecopy = (sint32)((-ulo-1) / tdudx) + 1;
+
+				VDASSERT(xprecopy <= 0 || (uint64)ulo >= (uint64)ulimit);
+				VDASSERT(xprecopy <= 0 || (uint64)(ulo + tdudx * (xprecopy-1)) >= (uint64)ulimit);
+			}
+
+			if (uhi >= ulimit) {
+				if (ulo >= ulimit)
+					xpostcopy = dx;
+				else
+					xpostcopy = (sint32)((uhi - ulimit) / tdudx) + 1;
+
+				VDASSERT(xpostcopy <= 0 || (uint64)uhi >= (uint64)ulimit);
+				VDASSERT(xpostcopy <= 0 || (uint64)(uhi - tdudx * (xpostcopy - 1)) >= (uint64)ulimit);
+			}
+
+			if (dudx < 0) {
+				std::swap(xprecopy, xpostcopy);
+				std::swap(xprepos, xpostpos);
+			}
+
+			xmain = dx - (xprecopy + xpostcopy);
+		}
+
+		// sanity-check parameters
+
+		VDASSERT(xprecopy>=0 && xprecopy <= dx);
+		VDASSERT(xpostcopy>=0 && xpostcopy <= dx);
+		VDASSERT(xmain>=0 && xmain <= dx);
+
+		VDASSERT(xprecopy <= 0 || (uint64)u64 >= (uint64)ulimit);
+		VDASSERT(xprecopy <= 0 || (uint64)(u64 + dudx * (xprecopy-1)) >= (uint64)ulimit);
+		VDASSERT(xmain <= 0 || (uint64)(u64 + dudx * xprecopy) < (uint64)ulimit);
+		VDASSERT(xmain <= 0 || (uint64)(u64 + dudx * (xprecopy+xmain-1)) < (uint64)ulimit);
+		VDASSERT(xpostcopy <= 0 || (uint64)(u64 + dudx * (xprecopy + xmain)) >= (uint64)ulimit);
+		VDASSERT(xpostcopy <= 0 || (uint64)(u64 + dudx * (xprecopy + xmain + xpostcopy - 1)) >= (uint64)ulimit);
+
+		u64 += dudx * xprecopy;
+	}
+}
+
+// Nearest-neighbor stretch/shrink blit (reference implementation).
+// Destination and source coordinates are 16.16 fixed point; gradients are
+// computed in 32.32 so per-pixel/per-row stepping carries the sub-pixel
+// fraction exactly. Out-of-source regions replicate the edge texels, and the
+// blit is issued as up to three vertical bands (pre/main/post).
+// Returns false if the format is unsupported or differs between src and dst.
+bool VDPixmapStretchBltNearest_reference(const VDPixmap& dst, sint32 x1, sint32 y1, sint32 x2, sint32 y2, const VDPixmap& src, sint32 u1, sint32 v1, sint32 u2, sint32 v2) {
+	// we don't support format conversion
+	if (dst.format != src.format)
+		return false;
+
+	void (*pBlitter)(VDPixmapReferenceStretchBltParameters);
+	int bpp;
+
+	switch(src.format) {
+	case nsVDPixmap::kPixFormat_Pal8:
+		pBlitter = VDPixmapStretchBlt_Any8_nearest_reference;
+		bpp = 1;
+		break;
+	case nsVDPixmap::kPixFormat_XRGB1555:
+	case nsVDPixmap::kPixFormat_RGB565:
+		pBlitter = VDPixmapStretchBlt_Any16_nearest_reference;
+		bpp = 2;
+		break;
+	case nsVDPixmap::kPixFormat_RGB888:
+		pBlitter = VDPixmapStretchBlt_Any24_nearest_reference;
+		bpp = 3;
+		break;
+	case nsVDPixmap::kPixFormat_XRGB8888:
+		pBlitter = VDPixmapStretchBlt_Any32_nearest_reference;
+		bpp = 4;
+		break;
+	default:
+		return false;
+	}
+
+	// preemptive clip to prevent gradient calculations from crashing
+	if (x2 == x1 || y2 == y1)
+		return true;
+
+	// translate destination flips into source flips
+	if (x1 > x2) {
+		std::swap(x1, x2);
+		std::swap(u1, u2);
+	}
+
+	if (y1 > y2) {
+		std::swap(y1, y2);
+		std::swap(v1, v2);
+	}
+
+	// compute gradients
+	sint32 dx = x2 - x1;
+	sint32 dy = y2 - y1;
+	sint32 du = u2 - u1;
+	sint32 dv = v2 - v1;
+	sint64 dudx = ((sint64)du << 32) / dx;		// must truncate toward zero to prevent overflow
+	sint64 dvdy = ((sint64)dv << 32) / dy;
+
+	// prestep top-left point to pixel center and convert destination coordinates to integer
+	sint64 u64 = (sint64)u1 << 16;
+	sint64 v64 = (sint64)v1 << 16;
+	sint32 prestepx = (0x8000 - x1) & 0xffff;
+	sint32 prestepy = (0x8000 - y1) & 0xffff;
+
+	u64 += (dudx * prestepx) >> 16;
+	v64 += (dvdy * prestepy) >> 16;
+
+	sint32 x1i = (x1 + 0x8000) >> 16;
+	sint32 y1i = (y1 + 0x8000) >> 16;
+	sint32 x2i = (x2 + 0x8000) >> 16;
+	sint32 y2i = (y2 + 0x8000) >> 16;
+
+	// destination clipping
+	if (x1i < 0) {
+		u64 -= dudx * x1i;		// x1i negative: steps the source coordinate forward
+		x1i = 0;
+	}
+
+	if (y1i < 0) {
+		v64 -= dvdy * y1i;
+		y1i = 0;
+	}
+
+	if (x2i > dst.w)
+		x2i = dst.w;
+
+	if (y2i > dst.h)
+		y2i = dst.h;
+
+	if (x1i >= x2i || y1i >= y2i)
+		return true;
+
+	// Calculate horizontal clip parameters
+	sint32 xprecopy = 0, xpostcopy = 0;
+	int xprepos = 0;
+	int xpostpos = src.w-1;
+	int xmain = 0;
+
+	VDSetupNearestSamplingParameters(u64, dudx, x2i-x1i, src.w, xprecopy, xprepos, xmain, xpostcopy, xpostpos);
+
+	// Calculate vertical clip parameters
+	sint32 yprecopy = 0, ypostcopy = 0;
+	int yprepos = 0;
+	int ypostpos = src.h-1;
+	int ymain = 0;
+
+	VDSetupNearestSamplingParameters(v64, dvdy, y2i-y1i, src.h, yprecopy, yprepos, ymain, ypostcopy, ypostpos);
+
+	// set up parameter block
+	VDPixmapReferenceStretchBltParameters params;
+
+	char *srcbase = (char *)src.data + (sint32)(u64 >> 32) * bpp;
+
+	params.dst = (char *)dst.data + y1i * dst.pitch + x1i * bpp;
+	params.dstpitch = dst.pitch;
+	params.src = srcbase + (sint32)(v64 >> 32) * src.pitch;
+	params.srcpitch = src.pitch;
+	params.srcinc = (sint32)(dvdy >> 32) * src.pitch;
+	params.dx = xmain;
+	params.dy = ymain;
+	params.u = (uint32)u64;
+	params.uinc = (uint32)(dudx >> 32);
+	params.dudx = (uint32)dudx;
+	params.v = (uint32)v64;
+	params.dvdy = (uint32)dvdy;
+	params.xprecopy = xprecopy;
+	params.xprepos = (xprepos - (sint32)(u64 >> 32)) * bpp;		// relative to params.src
+	params.xpostcopy = xpostcopy;
+	params.xpostpos = (xpostpos - (sint32)(u64 >> 32)) * bpp;
+
+	// band above the source: replicate the top source row
+	if (yprecopy > 0) {
+		VDPixmapReferenceStretchBltParameters preparams(params);
+
+		preparams.src = srcbase + yprepos * src.pitch;
+		preparams.srcinc = 0;
+		preparams.dy = yprecopy;
+		preparams.v = 0;
+		preparams.dvdy = 0;
+
+		pBlitter(preparams);
+
+		params.dst = (char *)params.dst + params.dstpitch * yprecopy;
+	}
+
+	// in-range band
+	if (ymain > 0)
+		pBlitter(params);
+
+	// band below the source: replicate the bottom source row
+	if (ypostcopy > 0) {
+		VDPixmapReferenceStretchBltParameters postparams(params);
+
+		postparams.dst = (char *)params.dst + params.dstpitch * params.dy;
+		postparams.src = srcbase + ypostpos * src.pitch;
+		postparams.srcpitch = 0;
+		postparams.srcinc = 0;
+		postparams.dy = ypostcopy;
+		postparams.v = 0;
+		postparams.dvdy = 0;
+
+		pBlitter(postparams);
+	}
+	return true;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace {
+	// Linear blend of two XRGB1555 pixels, f in [0,31]. Red+blue are lerped
+	// together in one masked term and green in another; each field carries a
+	// rounding bias before the >>5.
+	uint32 lerp_XRGB1555(sint32 a, sint32 b, sint32 f) {
+		sint32 a_rb = a & 0x7c1f;
+		sint32 a_g = a & 0x03e0;
+		sint32 b_rb = b & 0x7c1f;
+		sint32 b_g = b & 0x03e0;
+
+		const sint32 rb = (a_rb + (((b_rb - a_rb)*f + 0x4010) >> 5)) & 0x7c1f;
+		const sint32 g = (a_g + (((b_g - a_g )*f + 0x0200) >> 5)) & 0x03e0;
+
+		return rb + g;
+	}
+
+	// Linear blend of two XRGB8888 pixels, f in [0,255], same channel-pair
+	// technique (red+blue masked together, green separate).
+	uint32 lerp_XRGB8888(sint32 a, sint32 b, sint32 f) {
+		sint32 a_rb = a & 0xff00ff;
+		sint32 a_g = a & 0x00ff00;
+		sint32 b_rb = b & 0xff00ff;
+		sint32 b_g = b & 0x00ff00;
+
+		const uint32 rb = (a_rb + (((b_rb - a_rb)*f + 0x00800080) >> 8)) & 0xff00ff;
+		const uint32 g = (a_g + (((b_g - a_g )*f + 0x00008000) >> 8)) & 0x00ff00;
+
+		return rb + g;
+	}
+
+	// Bilinear blend of four XRGB8888 pixels (a=TL, b=TR, c=BL, d=BR),
+	// x and y in [0,255]: two horizontal lerps, then one vertical.
+	// NOTE(review): the final vertical lerp carries no rounding bias, unlike
+	// the horizontal ones -- presumably intentional; confirm against upstream.
+	uint32 bilerp_RGB888(sint32 a, sint32 b, sint32 c, sint32 d, sint32 x, sint32 y) {
+		sint32 a_rb = a & 0xff00ff;
+		sint32 a_g = a & 0x00ff00;
+		sint32 b_rb = b & 0xff00ff;
+		sint32 b_g = b & 0x00ff00;
+		sint32 c_rb = c & 0xff00ff;
+		sint32 c_g = c & 0x00ff00;
+		sint32 d_rb = d & 0xff00ff;
+		sint32 d_g = d & 0x00ff00;
+
+		const uint32 top_rb = (a_rb + (((b_rb - a_rb)*x + 0x00800080) >> 8)) & 0xff00ff;
+		const uint32 top_g = (a_g + (((b_g - a_g )*x + 0x00008000) >> 8)) & 0x00ff00;
+		const uint32 bot_rb = (c_rb + (((d_rb - c_rb)*x + 0x00800080) >> 8)) & 0xff00ff;
+		const uint32 bot_g = (c_g + (((d_g - c_g )*x + 0x00008000) >> 8)) & 0x00ff00;
+
+		const uint32 final_rb = (top_rb + (((bot_rb - top_rb)*y) >> 8)) & 0xff00ff;
+		const uint32 final_g = (top_g + (((bot_g - top_g )*y) >> 8)) & 0x00ff00;
+
+		return final_rb + final_g;
+	}
+
+	// Bilinear blend of four XRGB1555 pixels, x and y in [0,31].
+	uint32 bilerp_XRGB1555(sint32 a, sint32 b, sint32 c, sint32 d, sint32 x, sint32 y) {
+		sint32 a_rb = a & 0x7c1f;
+		sint32 a_g = a & 0x03e0;
+		sint32 b_rb = b & 0x7c1f;
+		sint32 b_g = b & 0x03e0;
+		sint32 c_rb = c & 0x7c1f;
+		sint32 c_g = c & 0x03e0;
+		sint32 d_rb = d & 0x7c1f;
+		sint32 d_g = d & 0x03e0;
+
+		const sint32 top_rb = (a_rb + (((b_rb - a_rb)*x + 0x4010) >> 5)) & 0x7c1f;
+		const sint32 top_g = (a_g + (((b_g - a_g )*x + 0x0200) >> 5)) & 0x03e0;
+		const sint32 bot_rb = (c_rb + (((d_rb - c_rb)*x + 0x4010) >> 5)) & 0x7c1f;
+		const sint32 bot_g = (c_g + (((d_g - c_g )*x + 0x0200) >> 5)) & 0x03e0;
+
+		const sint32 final_rb = (top_rb + (((bot_rb - top_rb)*y + 0x4010) >> 5)) & 0x7c1f;
+		const sint32 final_g = (top_g + (((bot_g - top_g )*y + 0x0200) >> 5)) & 0x03e0;
+
+		return final_rb + final_g;
+	}
+
+	// Bilinear blend of four RGB565 pixels, x and y in [0,63] (green has 6
+	// bits, so both fractions use the 6-bit scale).
+	uint32 bilerp_RGB565(sint32 a, sint32 b, sint32 c, sint32 d, sint32 x, sint32 y) {
+		sint32 a_rb = a & 0xf81f;
+		sint32 a_g = a & 0x07e0;
+		sint32 b_rb = b & 0xf81f;
+		sint32 b_g = b & 0x07e0;
+		sint32 c_rb = c & 0xf81f;
+		sint32 c_g = c & 0x07e0;
+		sint32 d_rb = d & 0xf81f;
+		sint32 d_g = d & 0x07e0;
+
+		const sint32 top_rb = (a_rb + (((b_rb - a_rb)*x + 0x8010) >> 6)) & 0xf81f;
+		const sint32 top_g = (a_g + (((b_g - a_g )*x + 0x0400) >> 6)) & 0x07e0;
+		const sint32 bot_rb = (c_rb + (((d_rb - c_rb)*x + 0x8010) >> 6)) & 0xf81f;
+		const sint32 bot_g = (c_g + (((d_g - c_g )*x + 0x0400) >> 6)) & 0x07e0;
+
+		const sint32 final_rb = (top_rb + (((bot_rb - top_rb)*y + 0x8010) >> 6)) & 0xf81f;
+		const sint32 final_g = (top_g + (((bot_g - top_g )*y + 0x0400) >> 6)) & 0x07e0;
+
+		return final_rb + final_g;
+	}
+}
+
+///////////////////////////////////////////////////////////////////////////
+
+namespace {
	// Per-row parameter block for the horizontal bilinear filter routines.
	// Filled in by VDPixmapStretchBltBilinear_reference and consumed by both
	// the C and the MMX horizontal filters.
	struct VDPixmapReferenceStretchBltBilinearParameters {
		void *dst;				// destination row (filter also writes xprecopy pixels before it)
		const void *src;		// source row
		uint32 u;				// fractional source u for the first middle pixel (top bits are the lerp fraction)
		uint32 uinc;			// whole-pixel source step per destination pixel (signed value stored in a uint32)
		uint32 dudx;			// fractional source step per destination pixel

		ptrdiff_t xprepos;		// byte offset from src of the pixel replicated across the left edge
		ptrdiff_t xpostpos;		// byte offset from src of the pixel replicated across the right edge
		sint32 xprecopy;		// number of left-edge pixels filled by replication
		sint32 xpostcopy;		// number of right-edge pixels filled by replication
		sint32 xmidsize;		// number of normally filtered (middle) pixels
	};
+
+ void VDPixmapStretchBiH_XRGB1555_to_XRGB1555(const VDPixmapReferenceStretchBltBilinearParameters& params) {
+ uint16 *dst = (uint16 *)params.dst;
+ const uint16 *src = (const uint16 *)params.src;
+
+ if (params.xprecopy)
+ VDMemset16(dst - params.xprecopy, *(const uint16 *)((const char *)params.src + params.xprepos), params.xprecopy);
+
+ if (params.xmidsize) {
+ sint32 w = params.xmidsize;
+ uint32 u = params.u;
+ const uint32 dudx = params.dudx;
+ const ptrdiff_t uinc = params.uinc;
+
+ do {
+ *dst++ = lerp_XRGB1555(src[0], src[1], u >> 27);
+
+ const uint32 ut = u + dudx;
+ src += uinc + (ut < u);
+ u = ut;
+ } while(--w);
+ }
+
+ if (params.xpostcopy)
+ VDMemset16(dst, *(const uint16 *)((const char *)params.src + params.xpostpos), params.xpostcopy);
+ }
+
+ void VDPixmapStretchBiH_XRGB8888_to_XRGB8888(const VDPixmapReferenceStretchBltBilinearParameters& params) {
+ uint32 *dst = (uint32 *)params.dst;
+ const uint32 *src = (const uint32 *)params.src;
+
+ if (params.xprecopy)
+ VDMemset32(dst - params.xprecopy, *(const uint32 *)((const char *)params.src + params.xprepos), params.xprecopy);
+
+ if (params.xmidsize) {
+ sint32 w = params.xmidsize;
+ uint32 u = params.u;
+ const uint32 dudx = params.dudx;
+ const ptrdiff_t uinc = params.uinc;
+
+ do {
+ *dst++ = lerp_XRGB8888(src[0], src[1], u >> 24);
+
+ const uint32 ut = u + dudx;
+ src += uinc + (ut < u);
+ u = ut;
+ } while(--w);
+ }
+
+ if (params.xpostcopy)
+ VDMemset32(dst, *(const uint32 *)((const char *)params.src + params.xpostpos), params.xpostcopy);
+ }
+
+ void VDPixmapStretchBiV_XRGB1555_to_XRGB1555(void *dstv, const void *src1v, const void *src2v, sint32 w, uint32 f) {
+ uint16 *dst = (uint16 *)dstv;
+ const uint16 *src1 = (const uint16 *)src1v;
+ const uint16 *src2 = (const uint16 *)src2v;
+
+ f >>= 27;
+
+ do {
+ *dst++ = lerp_XRGB1555(*src1++, *src2++, f);
+ } while(--w);
+ }
+
+ void VDPixmapStretchBiV_XRGB8888_to_XRGB8888(void *dstv, const void *src1v, const void *src2v, sint32 w, uint32 f) {
+ uint32 *dst = (uint32 *)dstv;
+ const uint32 *src1 = (const uint32 *)src1v;
+ const uint32 *src2 = (const uint32 *)src2v;
+
+ f >>= 24;
+
+ do {
+ *dst++ = lerp_XRGB8888(*src1++, *src2++, f);
+ } while(--w);
+ }
+}
+
+#ifdef _M_IX86
+extern "C" void vdasm_stretchbltH_XRGB8888_to_XRGB8888_MMX(const VDPixmapReferenceStretchBltBilinearParameters&);
+
+extern "C" void vdasm_stretchbltV_XRGB1555_to_XRGB1555_MMX(void *dstv, const void *src1v, const void *src2v, sint32 w, uint32 f);
+extern "C" void vdasm_stretchbltV_XRGB8888_to_XRGB8888_MMX(void *dstv, const void *src1v, const void *src2v, sint32 w, uint32 f);
+#endif
+
// Reference implementation of a bilinear stretch blit.  Maps the source
// rectangle (u1,v1)-(u2,v2) onto the destination rectangle (x1,y1)-(x2,y2);
// all eight coordinates are 16.16 fixed point.  Flipped destination
// rectangles are handled by mirroring the source; the destination is clipped
// to dst's bounds.  Returns false when the request cannot be handled (source
// coordinates outside the source bitmap, differing formats, or an
// unsupported format); returns true otherwise, including when everything
// clips away.
bool VDPixmapStretchBltBilinear_reference(const VDPixmap& dst, sint32 x1, sint32 y1, sint32 x2, sint32 y2, const VDPixmap& src, sint32 u1, sint32 v1, sint32 u2, sint32 v2) {
	// preemptive clip to prevent gradient calculations from crashing
	if (x2 == x1 || y2 == y1)
		return true;

	// we don't support source clipping
	if ((uint32)u1 > (uint32)(src.w << 16) || (uint32)v1 > (uint32)(src.h << 16))
		return false;

	if ((uint32)u2 > (uint32)(src.w << 16) || (uint32)v2 > (uint32)(src.h << 16))
		return false;

	// we don't support format changes (yet)
	if (dst.format != src.format)
		return false;

	// format determination: select C or MMX horizontal/vertical filter
	// routines and the pixel size for this format
	void (*pHorizontalFilter)(const VDPixmapReferenceStretchBltBilinearParameters& params);
	void (*pVerticalFilter)(void *dstv, const void *src1v, const void *src2v, sint32 w, uint32 f);
	int bpp;

#pragma vdpragma_TODO("fixme this is b0rken")
	switch(src.format) {
	case nsVDPixmap::kPixFormat_XRGB1555:
		pHorizontalFilter = VDPixmapStretchBiH_XRGB1555_to_XRGB1555;
#ifdef _M_IX86
		if (CPUGetEnabledExtensions() & CPUF_SUPPORTS_MMX)
			pVerticalFilter = vdasm_stretchbltV_XRGB1555_to_XRGB1555_MMX;
		else
#endif
			pVerticalFilter = VDPixmapStretchBiV_XRGB1555_to_XRGB1555;
		bpp = 2;
		break;
	case nsVDPixmap::kPixFormat_XRGB8888:
#ifdef _M_IX86
		if (CPUGetEnabledExtensions() & CPUF_SUPPORTS_MMX) {
			pHorizontalFilter = vdasm_stretchbltH_XRGB8888_to_XRGB8888_MMX;
			pVerticalFilter = vdasm_stretchbltV_XRGB8888_to_XRGB8888_MMX;
		} else
#endif
		{
			pHorizontalFilter = VDPixmapStretchBiH_XRGB8888_to_XRGB8888;
			pVerticalFilter = VDPixmapStretchBiV_XRGB8888_to_XRGB8888;
		}
		bpp = 4;
		break;
	default:
		return false;
	}

	// translate destination flips into source flips
	if (x1 > x2) {
		std::swap(x1, x2);
		std::swap(u1, u2);
	}

	if (y1 > y2) {
		std::swap(y1, y2);
		std::swap(v1, v2);
	}

	// compute gradients: source step per destination pixel, 32.32 fixed point
	sint32 dx = x2 - x1;
	sint32 dy = y2 - y1;
	sint32 du = u2 - u1;
	sint32 dv = v2 - v1;
	sint64 dudx = ((sint64)du << 32) / dx;		// must truncate toward zero to prevent overflow
	sint64 dvdy = ((sint64)dv << 32) / dy;

	// prestep top-left point to pixel center and convert destination coordinates to integer
	sint64 u64 = (sint64)u1 << 16;		// source coordinates widened to 32.32
	sint64 v64 = (sint64)v1 << 16;
	sint32 prestepx = (0x8000 - x1) & 0xffff;
	sint32 prestepy = (0x8000 - y1) & 0xffff;

	u64 += (dudx * prestepx) >> 16;
	v64 += (dvdy * prestepy) >> 16;

	sint32 x1i = (x1 + 0x8000) >> 16;
	sint32 y1i = (y1 + 0x8000) >> 16;
	sint32 x2i = (x2 + 0x8000) >> 16;
	sint32 y2i = (y2 + 0x8000) >> 16;

	// destination clipping: advance u/v to compensate for clipped rows/columns
	if (x1i < 0) {
		u64 -= dudx * x1i;
		x1i = 0;
	}

	if (y1i < 0) {
		v64 -= dvdy * y1i;
		y1i = 0;
	}

	if (x2i > dst.w)
		x2i = dst.w;

	if (y2i > dst.h)
		y2i = dst.h;

	if (x1i >= x2i || y1i >= y2i)
		return true;

	// bias by half a source pixel so the integer part of u64/v64 selects the
	// upper/left neighbor of each 2x2 filter footprint
	u64 -= 0x80000000;
	v64 -= 0x80000000;

	int xprepos = 0;		// source pixel replicated across the left edge
	int xpostpos = src.w-1;	// source pixel replicated across the right edge

	// lowest/highest u touched across the destination span
	sint64 ulo = u64;
	sint64 uhi = u64 + dudx * (x2i - x1i - 1);
	sint64 tdudx = dudx;

	if (ulo > uhi) {
		std::swap(ulo, uhi);
		tdudx = -tdudx;
	}

	// count destination pixels whose filter footprint falls off the left or
	// right source edge; those are filled by replication instead of filtering
	int xprecopy = 0;
	int xpostcopy = 0;

	if (ulo < 0) {
		xprecopy = (int)((1 - ulo) / tdudx) + 1;
	}

	const sint64 ulimit = ((sint64)(src.w-1) << 32);

	if (uhi >= ulimit)
		xpostcopy = (int)((uhi - ulimit - 1) / tdudx) + 1;

	// for mirrored blits the edge roles are swapped
	if (dudx < 0) {
		std::swap(xprecopy, xpostcopy);
		std::swap(xprepos, xpostpos);
	}

	u64 += dudx * xprecopy;		// skip over the replicated left-edge pixels
	const int xtotal = x2i - x1i;
	int xmidcopy = (x2i - x1i) - (xprecopy + xpostcopy);
	const sint32 ui = (sint32)(u64 >> 32);

	// set up parameter block

	VDPixmapReferenceStretchBltBilinearParameters params;

	params.u = (uint32)u64;
	params.uinc = (sint32)(dudx >> 32);
	params.dudx = (sint32)dudx;
	params.xprecopy = xprecopy;
	params.xprepos = (xprepos - ui) * bpp;		// byte offsets relative to srcp (which already includes ui)
	params.xpostcopy = xpostcopy;
	params.xpostpos = (xpostpos - ui) * bpp;
	params.xmidsize = xmidcopy;

	void *dstp = (char *)dst.data + y1i * dst.pitch + x1i * bpp;
	const void *srcp = (char *)src.data + ui * bpp;

	// two-row cache of horizontally filtered source rows; rows are only
	// refiltered when the vertical sweep moves onto new source lines
	VDPixmapBuffer window(xtotal, 2, src.format);

	void *pTempRow1 = window.data;
	void *pTempRow2 = (char *)window.data + window.pitch;
	int windowbottom = dvdy > 0 ? -0x7fffffff : 0x7fffffff;	// sentinel forces the initial fill

	do {
		sint32 iv = (sint32)(v64 >> 32);
		sint32 iv_bottom = iv + 1;

		// clamp the two-row footprint at the top/bottom of the source
		if (iv < 0)
			iv = iv_bottom = 0;

		if (iv >= src.h-1)
			iv = iv_bottom = src.h-1;

		if (dvdy < 0) {
			// sweeping upward: slide the window up, filtering rows as needed
			if (windowbottom > iv_bottom+1)
				windowbottom = iv_bottom+1;

			while(windowbottom > iv) {
				std::swap(pTempRow1, pTempRow2);

				--windowbottom;

				// the horizontal filter writes xprecopy pixels before dst,
				// so point dst past the replicated region
				params.dst = (char *)pTempRow1 + bpp * params.xprecopy;
				params.src = vdptroffset(srcp, windowbottom * src.pitch);

				pHorizontalFilter(params);
			}
		} else {
			// sweeping downward: slide the window down
			if (windowbottom < iv-1)
				windowbottom = iv-1;

			while(windowbottom < iv_bottom) {
				std::swap(pTempRow1, pTempRow2);

				++windowbottom;

				params.dst = (char *)pTempRow2 + bpp * params.xprecopy;
				params.src = vdptroffset(srcp, windowbottom * src.pitch);

				pHorizontalFilter(params);
			}
		}

		// blend the two cached rows into the destination; a clamped
		// (degenerate) footprint replicates a single row with fraction 0
		if (iv == iv_bottom)
			if (dvdy < 0)
				pVerticalFilter(dstp, pTempRow1, pTempRow1, xtotal, 0);
			else
				pVerticalFilter(dstp, pTempRow2, pTempRow2, xtotal, 0);
		else
			pVerticalFilter(dstp, pTempRow1, pTempRow2, xtotal, (uint32)v64);

		v64 += dvdy;
		dstp = (char *)dstp + dst.pitch;
	} while(++y1i < y2i);

	return true;
}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/tables.cpp b/src/thirdparty/VirtualDub/Kasumi/source/tables.cpp
new file mode 100644
index 000000000..bf1987500
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/tables.cpp
@@ -0,0 +1,204 @@
+// Automatically generated by Asuka 'maketables'. DO NOT EDIT!
+
+#include <vd2/system/vdtypes.h>
+
+extern "C" const sint32 kVDCubicInterpTableFX14_075[256][4]={
+ { 0, 16384, 0, 0 }, { -48, 16384, 48, 0 }, { -95, 16383, 97, -1 }, { -141, 16380, 147, -2 },
+ { -186, 16375, 198, -3 }, { -231, 16371, 249, -5 }, { -275, 16365, 301, -7 }, { -318, 16357, 354, -9 },
+ { -360, 16349, 407, -12 }, { -402, 16340, 461, -15 }, { -443, 16329, 516, -18 }, { -484, 16318, 572, -22 },
+ { -523, 16305, 628, -26 }, { -562, 16291, 685, -30 }, { -601, 16278, 742, -35 }, { -638, 16262, 800, -40 },
+ { -675, 16245, 859, -45 }, { -711, 16228, 918, -51 }, { -747, 16209, 978, -56 }, { -782, 16190, 1039, -63 },
+ { -816, 16169, 1100, -69 }, { -849, 16147, 1162, -76 }, { -882, 16124, 1225, -83 }, { -915, 16101, 1288, -90 },
+ { -946, 16077, 1351, -98 }, { -977, 16052, 1415, -106 }, { -1007, 16025, 1480, -114 }, { -1037, 15998, 1545, -122 },
+ { -1066, 15970, 1611, -131 }, { -1094, 15940, 1678, -140 }, { -1122, 15910, 1745, -149 }, { -1149, 15879, 1812, -158 },
+ { -1176, 15848, 1880, -168 }, { -1202, 15815, 1949, -178 }, { -1227, 15781, 2018, -188 }, { -1252, 15747, 2087, -198 },
+ { -1276, 15712, 2157, -209 }, { -1300, 15676, 2228, -220 }, { -1323, 15639, 2299, -231 }, { -1345, 15601, 2370, -242 },
+ { -1367, 15562, 2442, -253 }, { -1388, 15523, 2514, -265 }, { -1409, 15482, 2587, -276 }, { -1429, 15441, 2660, -288 },
+ { -1448, 15399, 2734, -301 }, { -1467, 15356, 2808, -313 }, { -1486, 15312, 2883, -325 }, { -1504, 15268, 2958, -338 },
+ { -1521, 15223, 3033, -351 }, { -1538, 15177, 3109, -364 }, { -1554, 15130, 3185, -377 }, { -1570, 15084, 3261, -391 },
+ { -1585, 15035, 3338, -404 }, { -1600, 14986, 3416, -418 }, { -1614, 14936, 3493, -431 }, { -1627, 14885, 3571, -445 },
+ { -1641, 14834, 3650, -459 }, { -1653, 14783, 3728, -474 }, { -1665, 14730, 3807, -488 }, { -1677, 14676, 3887, -502 },
+ { -1688, 14623, 3966, -517 }, { -1699, 14568, 4046, -531 }, { -1709, 14512, 4127, -546 }, { -1719, 14457, 4207, -561 },
+ { -1728, 14400, 4288, -576 }, { -1737, 14343, 4369, -591 }, { -1745, 14284, 4451, -606 }, { -1753, 14226, 4532, -621 },
+ { -1760, 14167, 4614, -637 }, { -1767, 14107, 4696, -652 }, { -1774, 14047, 4779, -668 }, { -1780, 13986, 4861, -683 },
+ { -1785, 13924, 4944, -699 }, { -1791, 13861, 5028, -714 }, { -1795, 13798, 5111, -730 }, { -1800, 13736, 5194, -746 },
+ { -1804, 13671, 5278, -761 }, { -1807, 13606, 5362, -777 }, { -1810, 13541, 5446, -793 }, { -1813, 13475, 5531, -809 },
+ { -1815, 13409, 5615, -825 }, { -1817, 13342, 5700, -841 }, { -1818, 13275, 5784, -857 }, { -1819, 13207, 5869, -873 },
+ { -1820, 13139, 5954, -889 }, { -1820, 13069, 6040, -905 }, { -1820, 13000, 6125, -921 }, { -1820, 12930, 6211, -937 },
+ { -1819, 12860, 6296, -953 }, { -1818, 12789, 6382, -969 }, { -1816, 12717, 6468, -985 }, { -1815, 12647, 6553, -1001 },
+ { -1812, 12574, 6639, -1017 }, { -1810, 12502, 6725, -1033 }, { -1807, 12427, 6812, -1048 }, { -1804, 12354, 6898, -1064 },
+ { -1800, 12280, 6984, -1080 }, { -1796, 12206, 7070, -1096 }, { -1792, 12130, 7157, -1111 }, { -1787, 12055, 7243, -1127 },
+ { -1782, 11980, 7329, -1143 }, { -1777, 11903, 7416, -1158 }, { -1772, 11827, 7502, -1173 }, { -1766, 11751, 7588, -1189 },
+ { -1760, 11673, 7675, -1204 }, { -1753, 11595, 7761, -1219 }, { -1747, 11517, 7848, -1234 }, { -1740, 11439, 7934, -1249 },
+ { -1733, 11361, 8020, -1264 }, { -1725, 11281, 8107, -1279 }, { -1717, 11202, 8193, -1294 }, { -1709, 11123, 8279, -1309 },
+ { -1701, 11043, 8365, -1323 }, { -1692, 10962, 8451, -1337 }, { -1684, 10883, 8537, -1352 }, { -1675, 10802, 8623, -1366 },
+ { -1665, 10720, 8709, -1380 }, { -1656, 10640, 8794, -1394 }, { -1646, 10557, 8880, -1407 }, { -1636, 10476, 8965, -1421 },
+ { -1626, 10393, 9051, -1434 }, { -1615, 10311, 9136, -1448 }, { -1604, 10228, 9221, -1461 }, { -1594, 10146, 9306, -1474 },
+ { -1582, 10062, 9391, -1487 }, { -1571, 9979, 9475, -1499 }, { -1560, 9896, 9560, -1512 }, { -1548, 9812, 9644, -1524 },
+ { -1536, 9728, 9728, -1536 }, { -1524, 9644, 9812, -1548 }, { -1512, 9560, 9896, -1560 }, { -1499, 9475, 9979, -1571 },
+ { -1487, 9391, 10062, -1582 }, { -1474, 9306, 10146, -1594 }, { -1461, 9221, 10228, -1604 }, { -1448, 9136, 10311, -1615 },
+ { -1434, 9051, 10393, -1626 }, { -1421, 8965, 10476, -1636 }, { -1407, 8880, 10557, -1646 }, { -1394, 8795, 10639, -1656 },
+ { -1380, 8709, 10720, -1665 }, { -1366, 8624, 10801, -1675 }, { -1352, 8538, 10882, -1684 }, { -1337, 8450, 10963, -1692 },
+ { -1323, 8365, 11043, -1701 }, { -1309, 8279, 11123, -1709 }, { -1294, 8192, 11203, -1717 }, { -1279, 8106, 11282, -1725 },
+ { -1264, 8020, 11361, -1733 }, { -1249, 7934, 11439, -1740 }, { -1234, 7847, 11518, -1747 }, { -1219, 7760, 11596, -1753 },
+ { -1204, 7675, 11673, -1760 }, { -1189, 7589, 11750, -1766 }, { -1173, 7502, 11827, -1772 }, { -1158, 7415, 11904, -1777 },
+ { -1143, 7329, 11980, -1782 }, { -1127, 7243, 12055, -1787 }, { -1111, 7156, 12131, -1792 }, { -1096, 7070, 12206, -1796 },
+ { -1080, 6984, 12280, -1800 }, { -1064, 6898, 12354, -1804 }, { -1048, 6811, 12428, -1807 }, { -1033, 6726, 12501, -1810 },
+ { -1017, 6639, 12574, -1812 }, { -1001, 6554, 12646, -1815 }, { -985, 6467, 12718, -1816 }, { -969, 6382, 12789, -1818 },
+ { -953, 6296, 12860, -1819 }, { -937, 6211, 12930, -1820 }, { -921, 6125, 13000, -1820 }, { -905, 6039, 13070, -1820 },
+ { -889, 5954, 13139, -1820 }, { -873, 5869, 13207, -1819 }, { -857, 5784, 13275, -1818 }, { -841, 5700, 13342, -1817 },
+ { -825, 5615, 13409, -1815 }, { -809, 5531, 13475, -1813 }, { -793, 5446, 13541, -1810 }, { -777, 5362, 13606, -1807 },
+ { -761, 5278, 13671, -1804 }, { -746, 5195, 13735, -1800 }, { -730, 5111, 13798, -1795 }, { -714, 5028, 13861, -1791 },
+ { -699, 4944, 13924, -1785 }, { -683, 4862, 13985, -1780 }, { -668, 4780, 14046, -1774 }, { -652, 4696, 14107, -1767 },
+ { -637, 4614, 14167, -1760 }, { -621, 4532, 14226, -1753 }, { -606, 4450, 14285, -1745 }, { -591, 4369, 14343, -1737 },
+ { -576, 4288, 14400, -1728 }, { -561, 4207, 14457, -1719 }, { -546, 4126, 14513, -1709 }, { -531, 4046, 14568, -1699 },
+ { -517, 3966, 14623, -1688 }, { -502, 3886, 14677, -1677 }, { -488, 3807, 14730, -1665 }, { -474, 3728, 14783, -1653 },
+ { -459, 3650, 14834, -1641 }, { -445, 3570, 14886, -1627 }, { -431, 3493, 14936, -1614 }, { -418, 3416, 14986, -1600 },
+ { -404, 3338, 15035, -1585 }, { -391, 3262, 15083, -1570 }, { -377, 3185, 15130, -1554 }, { -364, 3109, 15177, -1538 },
+ { -351, 3033, 15223, -1521 }, { -338, 2958, 15268, -1504 }, { -325, 2882, 15313, -1486 }, { -313, 2808, 15356, -1467 },
+ { -301, 2734, 15399, -1448 }, { -288, 2660, 15441, -1429 }, { -276, 2587, 15482, -1409 }, { -265, 2514, 15523, -1388 },
+ { -253, 2442, 15562, -1367 }, { -242, 2370, 15601, -1345 }, { -231, 2299, 15639, -1323 }, { -220, 2228, 15676, -1300 },
+ { -209, 2157, 15712, -1276 }, { -198, 2087, 15747, -1252 }, { -188, 2017, 15782, -1227 }, { -178, 1949, 15815, -1202 },
+ { -168, 1880, 15848, -1176 }, { -158, 1811, 15880, -1149 }, { -149, 1744, 15911, -1122 }, { -140, 1677, 15941, -1094 },
+ { -131, 1611, 15970, -1066 }, { -122, 1545, 15998, -1037 }, { -114, 1480, 16025, -1007 }, { -106, 1415, 16052, -977 },
+ { -98, 1351, 16077, -946 }, { -90, 1288, 16101, -915 }, { -83, 1224, 16125, -882 }, { -76, 1162, 16147, -849 },
+ { -69, 1100, 16169, -816 }, { -63, 1040, 16189, -782 }, { -56, 978, 16209, -747 }, { -51, 919, 16227, -711 },
+ { -45, 859, 16245, -675 }, { -40, 800, 16262, -638 }, { -35, 743, 16277, -601 }, { -30, 684, 16292, -562 },
+ { -26, 628, 16305, -523 }, { -22, 572, 16318, -484 }, { -18, 516, 16329, -443 }, { -15, 462, 16339, -402 },
+ { -12, 407, 16349, -360 }, { -9, 354, 16357, -318 }, { -7, 302, 16364, -275 }, { -5, 250, 16370, -231 },
+ { -3, 198, 16375, -186 }, { -2, 148, 16379, -141 }, { -1, 98, 16382, -95 }, { 0, 49, 16383, -48 },
+};
+
+#ifdef _M_IX86
+extern "C" const __declspec(align(16)) sint16 kVDCubicInterpTableFX14_075_MMX[256][8]={
+ { 0, 16384, 0, 16384, 0, 0, 0, 0 }, { -48, 16384, -48, 16384, 48, 0, 48, 0 },
+ { -95, 16383, -95, 16383, 97, -1, 97, -1 }, { -141, 16380, -141, 16380, 147, -2, 147, -2 },
+ { -186, 16375, -186, 16375, 198, -3, 198, -3 }, { -231, 16371, -231, 16371, 249, -5, 249, -5 },
+ { -275, 16365, -275, 16365, 301, -7, 301, -7 }, { -318, 16357, -318, 16357, 354, -9, 354, -9 },
+ { -360, 16349, -360, 16349, 407, -12, 407, -12 }, { -402, 16340, -402, 16340, 461, -15, 461, -15 },
+ { -443, 16329, -443, 16329, 516, -18, 516, -18 }, { -484, 16318, -484, 16318, 572, -22, 572, -22 },
+ { -523, 16305, -523, 16305, 628, -26, 628, -26 }, { -562, 16291, -562, 16291, 685, -30, 685, -30 },
+ { -601, 16278, -601, 16278, 742, -35, 742, -35 }, { -638, 16262, -638, 16262, 800, -40, 800, -40 },
+ { -675, 16245, -675, 16245, 859, -45, 859, -45 }, { -711, 16228, -711, 16228, 918, -51, 918, -51 },
+ { -747, 16209, -747, 16209, 978, -56, 978, -56 }, { -782, 16190, -782, 16190, 1039, -63, 1039, -63 },
+ { -816, 16169, -816, 16169, 1100, -69, 1100, -69 }, { -849, 16147, -849, 16147, 1162, -76, 1162, -76 },
+ { -882, 16124, -882, 16124, 1225, -83, 1225, -83 }, { -915, 16101, -915, 16101, 1288, -90, 1288, -90 },
+ { -946, 16077, -946, 16077, 1351, -98, 1351, -98 }, { -977, 16052, -977, 16052, 1415, -106, 1415, -106 },
+ { -1007, 16025, -1007, 16025, 1480, -114, 1480, -114 }, { -1037, 15998, -1037, 15998, 1545, -122, 1545, -122 },
+ { -1066, 15970, -1066, 15970, 1611, -131, 1611, -131 }, { -1094, 15940, -1094, 15940, 1678, -140, 1678, -140 },
+ { -1122, 15910, -1122, 15910, 1745, -149, 1745, -149 }, { -1149, 15879, -1149, 15879, 1812, -158, 1812, -158 },
+ { -1176, 15848, -1176, 15848, 1880, -168, 1880, -168 }, { -1202, 15815, -1202, 15815, 1949, -178, 1949, -178 },
+ { -1227, 15781, -1227, 15781, 2018, -188, 2018, -188 }, { -1252, 15747, -1252, 15747, 2087, -198, 2087, -198 },
+ { -1276, 15712, -1276, 15712, 2157, -209, 2157, -209 }, { -1300, 15676, -1300, 15676, 2228, -220, 2228, -220 },
+ { -1323, 15639, -1323, 15639, 2299, -231, 2299, -231 }, { -1345, 15601, -1345, 15601, 2370, -242, 2370, -242 },
+ { -1367, 15562, -1367, 15562, 2442, -253, 2442, -253 }, { -1388, 15523, -1388, 15523, 2514, -265, 2514, -265 },
+ { -1409, 15482, -1409, 15482, 2587, -276, 2587, -276 }, { -1429, 15441, -1429, 15441, 2660, -288, 2660, -288 },
+ { -1448, 15399, -1448, 15399, 2734, -301, 2734, -301 }, { -1467, 15356, -1467, 15356, 2808, -313, 2808, -313 },
+ { -1486, 15312, -1486, 15312, 2883, -325, 2883, -325 }, { -1504, 15268, -1504, 15268, 2958, -338, 2958, -338 },
+ { -1521, 15223, -1521, 15223, 3033, -351, 3033, -351 }, { -1538, 15177, -1538, 15177, 3109, -364, 3109, -364 },
+ { -1554, 15130, -1554, 15130, 3185, -377, 3185, -377 }, { -1570, 15084, -1570, 15084, 3261, -391, 3261, -391 },
+ { -1585, 15035, -1585, 15035, 3338, -404, 3338, -404 }, { -1600, 14986, -1600, 14986, 3416, -418, 3416, -418 },
+ { -1614, 14936, -1614, 14936, 3493, -431, 3493, -431 }, { -1627, 14885, -1627, 14885, 3571, -445, 3571, -445 },
+ { -1641, 14834, -1641, 14834, 3650, -459, 3650, -459 }, { -1653, 14783, -1653, 14783, 3728, -474, 3728, -474 },
+ { -1665, 14730, -1665, 14730, 3807, -488, 3807, -488 }, { -1677, 14676, -1677, 14676, 3887, -502, 3887, -502 },
+ { -1688, 14623, -1688, 14623, 3966, -517, 3966, -517 }, { -1699, 14568, -1699, 14568, 4046, -531, 4046, -531 },
+ { -1709, 14512, -1709, 14512, 4127, -546, 4127, -546 }, { -1719, 14457, -1719, 14457, 4207, -561, 4207, -561 },
+ { -1728, 14400, -1728, 14400, 4288, -576, 4288, -576 }, { -1737, 14343, -1737, 14343, 4369, -591, 4369, -591 },
+ { -1745, 14284, -1745, 14284, 4451, -606, 4451, -606 }, { -1753, 14226, -1753, 14226, 4532, -621, 4532, -621 },
+ { -1760, 14167, -1760, 14167, 4614, -637, 4614, -637 }, { -1767, 14107, -1767, 14107, 4696, -652, 4696, -652 },
+ { -1774, 14047, -1774, 14047, 4779, -668, 4779, -668 }, { -1780, 13986, -1780, 13986, 4861, -683, 4861, -683 },
+ { -1785, 13924, -1785, 13924, 4944, -699, 4944, -699 }, { -1791, 13861, -1791, 13861, 5028, -714, 5028, -714 },
+ { -1795, 13798, -1795, 13798, 5111, -730, 5111, -730 }, { -1800, 13736, -1800, 13736, 5194, -746, 5194, -746 },
+ { -1804, 13671, -1804, 13671, 5278, -761, 5278, -761 }, { -1807, 13606, -1807, 13606, 5362, -777, 5362, -777 },
+ { -1810, 13541, -1810, 13541, 5446, -793, 5446, -793 }, { -1813, 13475, -1813, 13475, 5531, -809, 5531, -809 },
+ { -1815, 13409, -1815, 13409, 5615, -825, 5615, -825 }, { -1817, 13342, -1817, 13342, 5700, -841, 5700, -841 },
+ { -1818, 13275, -1818, 13275, 5784, -857, 5784, -857 }, { -1819, 13207, -1819, 13207, 5869, -873, 5869, -873 },
+ { -1820, 13139, -1820, 13139, 5954, -889, 5954, -889 }, { -1820, 13069, -1820, 13069, 6040, -905, 6040, -905 },
+ { -1820, 13000, -1820, 13000, 6125, -921, 6125, -921 }, { -1820, 12930, -1820, 12930, 6211, -937, 6211, -937 },
+ { -1819, 12860, -1819, 12860, 6296, -953, 6296, -953 }, { -1818, 12789, -1818, 12789, 6382, -969, 6382, -969 },
+ { -1816, 12717, -1816, 12717, 6468, -985, 6468, -985 }, { -1815, 12647, -1815, 12647, 6553, -1001, 6553, -1001 },
+ { -1812, 12574, -1812, 12574, 6639, -1017, 6639, -1017 }, { -1810, 12502, -1810, 12502, 6725, -1033, 6725, -1033 },
+ { -1807, 12427, -1807, 12427, 6812, -1048, 6812, -1048 }, { -1804, 12354, -1804, 12354, 6898, -1064, 6898, -1064 },
+ { -1800, 12280, -1800, 12280, 6984, -1080, 6984, -1080 }, { -1796, 12206, -1796, 12206, 7070, -1096, 7070, -1096 },
+ { -1792, 12130, -1792, 12130, 7157, -1111, 7157, -1111 }, { -1787, 12055, -1787, 12055, 7243, -1127, 7243, -1127 },
+ { -1782, 11980, -1782, 11980, 7329, -1143, 7329, -1143 }, { -1777, 11903, -1777, 11903, 7416, -1158, 7416, -1158 },
+ { -1772, 11827, -1772, 11827, 7502, -1173, 7502, -1173 }, { -1766, 11751, -1766, 11751, 7588, -1189, 7588, -1189 },
+ { -1760, 11673, -1760, 11673, 7675, -1204, 7675, -1204 }, { -1753, 11595, -1753, 11595, 7761, -1219, 7761, -1219 },
+ { -1747, 11517, -1747, 11517, 7848, -1234, 7848, -1234 }, { -1740, 11439, -1740, 11439, 7934, -1249, 7934, -1249 },
+ { -1733, 11361, -1733, 11361, 8020, -1264, 8020, -1264 }, { -1725, 11281, -1725, 11281, 8107, -1279, 8107, -1279 },
+ { -1717, 11202, -1717, 11202, 8193, -1294, 8193, -1294 }, { -1709, 11123, -1709, 11123, 8279, -1309, 8279, -1309 },
+ { -1701, 11043, -1701, 11043, 8365, -1323, 8365, -1323 }, { -1692, 10962, -1692, 10962, 8451, -1337, 8451, -1337 },
+ { -1684, 10883, -1684, 10883, 8537, -1352, 8537, -1352 }, { -1675, 10802, -1675, 10802, 8623, -1366, 8623, -1366 },
+ { -1665, 10720, -1665, 10720, 8709, -1380, 8709, -1380 }, { -1656, 10640, -1656, 10640, 8794, -1394, 8794, -1394 },
+ { -1646, 10557, -1646, 10557, 8880, -1407, 8880, -1407 }, { -1636, 10476, -1636, 10476, 8965, -1421, 8965, -1421 },
+ { -1626, 10393, -1626, 10393, 9051, -1434, 9051, -1434 }, { -1615, 10311, -1615, 10311, 9136, -1448, 9136, -1448 },
+ { -1604, 10228, -1604, 10228, 9221, -1461, 9221, -1461 }, { -1594, 10146, -1594, 10146, 9306, -1474, 9306, -1474 },
+ { -1582, 10062, -1582, 10062, 9391, -1487, 9391, -1487 }, { -1571, 9979, -1571, 9979, 9475, -1499, 9475, -1499 },
+ { -1560, 9896, -1560, 9896, 9560, -1512, 9560, -1512 }, { -1548, 9812, -1548, 9812, 9644, -1524, 9644, -1524 },
+ { -1536, 9728, -1536, 9728, 9728, -1536, 9728, -1536 }, { -1524, 9644, -1524, 9644, 9812, -1548, 9812, -1548 },
+ { -1512, 9560, -1512, 9560, 9896, -1560, 9896, -1560 }, { -1499, 9475, -1499, 9475, 9979, -1571, 9979, -1571 },
+ { -1487, 9391, -1487, 9391, 10062, -1582, 10062, -1582 }, { -1474, 9306, -1474, 9306, 10146, -1594, 10146, -1594 },
+ { -1461, 9221, -1461, 9221, 10228, -1604, 10228, -1604 }, { -1448, 9136, -1448, 9136, 10311, -1615, 10311, -1615 },
+ { -1434, 9051, -1434, 9051, 10393, -1626, 10393, -1626 }, { -1421, 8965, -1421, 8965, 10476, -1636, 10476, -1636 },
+ { -1407, 8880, -1407, 8880, 10557, -1646, 10557, -1646 }, { -1394, 8795, -1394, 8795, 10639, -1656, 10639, -1656 },
+ { -1380, 8709, -1380, 8709, 10720, -1665, 10720, -1665 }, { -1366, 8624, -1366, 8624, 10801, -1675, 10801, -1675 },
+ { -1352, 8538, -1352, 8538, 10882, -1684, 10882, -1684 }, { -1337, 8450, -1337, 8450, 10963, -1692, 10963, -1692 },
+ { -1323, 8365, -1323, 8365, 11043, -1701, 11043, -1701 }, { -1309, 8279, -1309, 8279, 11123, -1709, 11123, -1709 },
+ { -1294, 8192, -1294, 8192, 11203, -1717, 11203, -1717 }, { -1279, 8106, -1279, 8106, 11282, -1725, 11282, -1725 },
+ { -1264, 8020, -1264, 8020, 11361, -1733, 11361, -1733 }, { -1249, 7934, -1249, 7934, 11439, -1740, 11439, -1740 },
+ { -1234, 7847, -1234, 7847, 11518, -1747, 11518, -1747 }, { -1219, 7760, -1219, 7760, 11596, -1753, 11596, -1753 },
+ { -1204, 7675, -1204, 7675, 11673, -1760, 11673, -1760 }, { -1189, 7589, -1189, 7589, 11750, -1766, 11750, -1766 },
+ { -1173, 7502, -1173, 7502, 11827, -1772, 11827, -1772 }, { -1158, 7415, -1158, 7415, 11904, -1777, 11904, -1777 },
+ { -1143, 7329, -1143, 7329, 11980, -1782, 11980, -1782 }, { -1127, 7243, -1127, 7243, 12055, -1787, 12055, -1787 },
+ { -1111, 7156, -1111, 7156, 12131, -1792, 12131, -1792 }, { -1096, 7070, -1096, 7070, 12206, -1796, 12206, -1796 },
+ { -1080, 6984, -1080, 6984, 12280, -1800, 12280, -1800 }, { -1064, 6898, -1064, 6898, 12354, -1804, 12354, -1804 },
+ { -1048, 6811, -1048, 6811, 12428, -1807, 12428, -1807 }, { -1033, 6726, -1033, 6726, 12501, -1810, 12501, -1810 },
+ { -1017, 6639, -1017, 6639, 12574, -1812, 12574, -1812 }, { -1001, 6554, -1001, 6554, 12646, -1815, 12646, -1815 },
+ { -985, 6467, -985, 6467, 12718, -1816, 12718, -1816 }, { -969, 6382, -969, 6382, 12789, -1818, 12789, -1818 },
+ { -953, 6296, -953, 6296, 12860, -1819, 12860, -1819 }, { -937, 6211, -937, 6211, 12930, -1820, 12930, -1820 },
+ { -921, 6125, -921, 6125, 13000, -1820, 13000, -1820 }, { -905, 6039, -905, 6039, 13070, -1820, 13070, -1820 },
+ { -889, 5954, -889, 5954, 13139, -1820, 13139, -1820 }, { -873, 5869, -873, 5869, 13207, -1819, 13207, -1819 },
+ { -857, 5784, -857, 5784, 13275, -1818, 13275, -1818 }, { -841, 5700, -841, 5700, 13342, -1817, 13342, -1817 },
+ { -825, 5615, -825, 5615, 13409, -1815, 13409, -1815 }, { -809, 5531, -809, 5531, 13475, -1813, 13475, -1813 },
+ { -793, 5446, -793, 5446, 13541, -1810, 13541, -1810 }, { -777, 5362, -777, 5362, 13606, -1807, 13606, -1807 },
+ { -761, 5278, -761, 5278, 13671, -1804, 13671, -1804 }, { -746, 5195, -746, 5195, 13735, -1800, 13735, -1800 },
+ { -730, 5111, -730, 5111, 13798, -1795, 13798, -1795 }, { -714, 5028, -714, 5028, 13861, -1791, 13861, -1791 },
+ { -699, 4944, -699, 4944, 13924, -1785, 13924, -1785 }, { -683, 4862, -683, 4862, 13985, -1780, 13985, -1780 },
+ { -668, 4780, -668, 4780, 14046, -1774, 14046, -1774 }, { -652, 4696, -652, 4696, 14107, -1767, 14107, -1767 },
+ { -637, 4614, -637, 4614, 14167, -1760, 14167, -1760 }, { -621, 4532, -621, 4532, 14226, -1753, 14226, -1753 },
+ { -606, 4450, -606, 4450, 14285, -1745, 14285, -1745 }, { -591, 4369, -591, 4369, 14343, -1737, 14343, -1737 },
+ { -576, 4288, -576, 4288, 14400, -1728, 14400, -1728 }, { -561, 4207, -561, 4207, 14457, -1719, 14457, -1719 },
+ { -546, 4126, -546, 4126, 14513, -1709, 14513, -1709 }, { -531, 4046, -531, 4046, 14568, -1699, 14568, -1699 },
+ { -517, 3966, -517, 3966, 14623, -1688, 14623, -1688 }, { -502, 3886, -502, 3886, 14677, -1677, 14677, -1677 },
+ { -488, 3807, -488, 3807, 14730, -1665, 14730, -1665 }, { -474, 3728, -474, 3728, 14783, -1653, 14783, -1653 },
+ { -459, 3650, -459, 3650, 14834, -1641, 14834, -1641 }, { -445, 3570, -445, 3570, 14886, -1627, 14886, -1627 },
+ { -431, 3493, -431, 3493, 14936, -1614, 14936, -1614 }, { -418, 3416, -418, 3416, 14986, -1600, 14986, -1600 },
+ { -404, 3338, -404, 3338, 15035, -1585, 15035, -1585 }, { -391, 3262, -391, 3262, 15083, -1570, 15083, -1570 },
+ { -377, 3185, -377, 3185, 15130, -1554, 15130, -1554 }, { -364, 3109, -364, 3109, 15177, -1538, 15177, -1538 },
+ { -351, 3033, -351, 3033, 15223, -1521, 15223, -1521 }, { -338, 2958, -338, 2958, 15268, -1504, 15268, -1504 },
+ { -325, 2882, -325, 2882, 15313, -1486, 15313, -1486 }, { -313, 2808, -313, 2808, 15356, -1467, 15356, -1467 },
+ { -301, 2734, -301, 2734, 15399, -1448, 15399, -1448 }, { -288, 2660, -288, 2660, 15441, -1429, 15441, -1429 },
+ { -276, 2587, -276, 2587, 15482, -1409, 15482, -1409 }, { -265, 2514, -265, 2514, 15523, -1388, 15523, -1388 },
+ { -253, 2442, -253, 2442, 15562, -1367, 15562, -1367 }, { -242, 2370, -242, 2370, 15601, -1345, 15601, -1345 },
+ { -231, 2299, -231, 2299, 15639, -1323, 15639, -1323 }, { -220, 2228, -220, 2228, 15676, -1300, 15676, -1300 },
+ { -209, 2157, -209, 2157, 15712, -1276, 15712, -1276 }, { -198, 2087, -198, 2087, 15747, -1252, 15747, -1252 },
+ { -188, 2017, -188, 2017, 15782, -1227, 15782, -1227 }, { -178, 1949, -178, 1949, 15815, -1202, 15815, -1202 },
+ { -168, 1880, -168, 1880, 15848, -1176, 15848, -1176 }, { -158, 1811, -158, 1811, 15880, -1149, 15880, -1149 },
+ { -149, 1744, -149, 1744, 15911, -1122, 15911, -1122 }, { -140, 1677, -140, 1677, 15941, -1094, 15941, -1094 },
+ { -131, 1611, -131, 1611, 15970, -1066, 15970, -1066 }, { -122, 1545, -122, 1545, 15998, -1037, 15998, -1037 },
+ { -114, 1480, -114, 1480, 16025, -1007, 16025, -1007 }, { -106, 1415, -106, 1415, 16052, -977, 16052, -977 },
+ { -98, 1351, -98, 1351, 16077, -946, 16077, -946 }, { -90, 1288, -90, 1288, 16101, -915, 16101, -915 },
+ { -83, 1224, -83, 1224, 16125, -882, 16125, -882 }, { -76, 1162, -76, 1162, 16147, -849, 16147, -849 },
+ { -69, 1100, -69, 1100, 16169, -816, 16169, -816 }, { -63, 1040, -63, 1040, 16189, -782, 16189, -782 },
+ { -56, 978, -56, 978, 16209, -747, 16209, -747 }, { -51, 919, -51, 919, 16227, -711, 16227, -711 },
+ { -45, 859, -45, 859, 16245, -675, 16245, -675 }, { -40, 800, -40, 800, 16262, -638, 16262, -638 },
+ { -35, 743, -35, 743, 16277, -601, 16277, -601 }, { -30, 684, -30, 684, 16292, -562, 16292, -562 },
+ { -26, 628, -26, 628, 16305, -523, 16305, -523 }, { -22, 572, -22, 572, 16318, -484, 16318, -484 },
+ { -18, 516, -18, 516, 16329, -443, 16329, -443 }, { -15, 462, -15, 462, 16339, -402, 16339, -402 },
+ { -12, 407, -12, 407, 16349, -360, 16349, -360 }, { -9, 354, -9, 354, 16357, -318, 16357, -318 },
+ { -7, 302, -7, 302, 16364, -275, 16364, -275 }, { -5, 250, -5, 250, 16370, -231, 16370, -231 },
+ { -3, 198, -3, 198, 16375, -186, 16375, -186 }, { -2, 148, -2, 148, 16379, -141, 16379, -141 },
+ { -1, 98, -1, 98, 16382, -95, 16382, -95 }, { 0, 49, 0, 49, 16383, -48, 16383, -48 },
+};
+
+#endif
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/triblt.cpp b/src/thirdparty/VirtualDub/Kasumi/source/triblt.cpp
new file mode 100644
index 000000000..8fe16138a
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/triblt.cpp
@@ -0,0 +1,1717 @@
+// VirtualDub - Video processing and capture application
+// Graphics support library
+// Copyright (C) 1998-2008 Avery Lee
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+#include <math.h>
+#include <vector>
+#include <vd2/system/math.h>
+#include <vd2/system/cpuaccel.h>
+#include <vd2/system/vdalloc.h>
+#include <vd2/Kasumi/pixmap.h>
+#include <vd2/Kasumi/pixmaputils.h>
+#include <vd2/Kasumi/pixmapops.h>
+#include <vd2/Kasumi/resample.h>
+#include <vd2/Kasumi/tables.h>
+#include <vd2/Kasumi/triblt.h>
+
+namespace {
+ uint32 lerp_RGB888(sint32 a, sint32 b, sint32 x) {
+ sint32 a_rb = a & 0xff00ff;
+ sint32 a_g = a & 0x00ff00;
+ sint32 b_rb = b & 0xff00ff;
+ sint32 b_g = b & 0x00ff00;
+
+ const uint32 top_rb = (a_rb + (((b_rb - a_rb)*x + 0x00800080) >> 8)) & 0xff00ff;
+ const uint32 top_g = (a_g + (((b_g - a_g )*x + 0x00008000) >> 8)) & 0x00ff00;
+
+ return top_rb + top_g;
+ }
+
+ uint32 bilerp_RGB888(sint32 a, sint32 b, sint32 c, sint32 d, sint32 x, sint32 y) {
+ sint32 a_rb = a & 0xff00ff;
+ sint32 a_g = a & 0x00ff00;
+ sint32 b_rb = b & 0xff00ff;
+ sint32 b_g = b & 0x00ff00;
+ sint32 c_rb = c & 0xff00ff;
+ sint32 c_g = c & 0x00ff00;
+ sint32 d_rb = d & 0xff00ff;
+ sint32 d_g = d & 0x00ff00;
+
+ const uint32 top_rb = (a_rb + (((b_rb - a_rb)*x + 0x00800080) >> 8)) & 0xff00ff;
+ const uint32 top_g = (a_g + (((b_g - a_g )*x + 0x00008000) >> 8)) & 0x00ff00;
+ const uint32 bot_rb = (c_rb + (((d_rb - c_rb)*x + 0x00800080) >> 8)) & 0xff00ff;
+ const uint32 bot_g = (c_g + (((d_g - c_g )*x + 0x00008000) >> 8)) & 0x00ff00;
+
+ const uint32 final_rb = (top_rb + (((bot_rb - top_rb)*y) >> 8)) & 0xff00ff;
+ const uint32 final_g = (top_g + (((bot_g - top_g )*y) >> 8)) & 0x00ff00;
+
+ return final_rb + final_g;
+ }
+
+ uint32 bicubic_RGB888(const uint32 *src0, const uint32 *src1, const uint32 *src2, const uint32 *src3, sint32 x, sint32 y) {
+ const uint32 p00 = src0[0];
+ const uint32 p01 = src0[1];
+ const uint32 p02 = src0[2];
+ const uint32 p03 = src0[3];
+ const uint32 p10 = src1[0];
+ const uint32 p11 = src1[1];
+ const uint32 p12 = src1[2];
+ const uint32 p13 = src1[3];
+ const uint32 p20 = src2[0];
+ const uint32 p21 = src2[1];
+ const uint32 p22 = src2[2];
+ const uint32 p23 = src2[3];
+ const uint32 p30 = src3[0];
+ const uint32 p31 = src3[1];
+ const uint32 p32 = src3[2];
+ const uint32 p33 = src3[3];
+
+ const sint32 *htab = kVDCubicInterpTableFX14_075[x];
+ const sint32 *vtab = kVDCubicInterpTableFX14_075[y];
+
+ const int ch0 = htab[0];
+ const int ch1 = htab[1];
+ const int ch2 = htab[2];
+ const int ch3 = htab[3];
+ const int cv0 = vtab[0];
+ const int cv1 = vtab[1];
+ const int cv2 = vtab[2];
+ const int cv3 = vtab[3];
+
+ int r0 = ((int)((p00>>16)&0xff) * ch0 + (int)((p01>>16)&0xff) * ch1 + (int)((p02>>16)&0xff) * ch2 + (int)((p03>>16)&0xff) * ch3 + 128) >> 8;
+ int g0 = ((int)((p00>> 8)&0xff) * ch0 + (int)((p01>> 8)&0xff) * ch1 + (int)((p02>> 8)&0xff) * ch2 + (int)((p03>> 8)&0xff) * ch3 + 128) >> 8;
+ int b0 = ((int)((p00 )&0xff) * ch0 + (int)((p01 )&0xff) * ch1 + (int)((p02 )&0xff) * ch2 + (int)((p03 )&0xff) * ch3 + 128) >> 8;
+ int r1 = ((int)((p10>>16)&0xff) * ch0 + (int)((p11>>16)&0xff) * ch1 + (int)((p12>>16)&0xff) * ch2 + (int)((p13>>16)&0xff) * ch3 + 128) >> 8;
+ int g1 = ((int)((p10>> 8)&0xff) * ch0 + (int)((p11>> 8)&0xff) * ch1 + (int)((p12>> 8)&0xff) * ch2 + (int)((p13>> 8)&0xff) * ch3 + 128) >> 8;
+ int b1 = ((int)((p10 )&0xff) * ch0 + (int)((p11 )&0xff) * ch1 + (int)((p12 )&0xff) * ch2 + (int)((p13 )&0xff) * ch3 + 128) >> 8;
+ int r2 = ((int)((p20>>16)&0xff) * ch0 + (int)((p21>>16)&0xff) * ch1 + (int)((p22>>16)&0xff) * ch2 + (int)((p23>>16)&0xff) * ch3 + 128) >> 8;
+ int g2 = ((int)((p20>> 8)&0xff) * ch0 + (int)((p21>> 8)&0xff) * ch1 + (int)((p22>> 8)&0xff) * ch2 + (int)((p23>> 8)&0xff) * ch3 + 128) >> 8;
+ int b2 = ((int)((p20 )&0xff) * ch0 + (int)((p21 )&0xff) * ch1 + (int)((p22 )&0xff) * ch2 + (int)((p23 )&0xff) * ch3 + 128) >> 8;
+ int r3 = ((int)((p30>>16)&0xff) * ch0 + (int)((p31>>16)&0xff) * ch1 + (int)((p32>>16)&0xff) * ch2 + (int)((p33>>16)&0xff) * ch3 + 128) >> 8;
+ int g3 = ((int)((p30>> 8)&0xff) * ch0 + (int)((p31>> 8)&0xff) * ch1 + (int)((p32>> 8)&0xff) * ch2 + (int)((p33>> 8)&0xff) * ch3 + 128) >> 8;
+ int b3 = ((int)((p30 )&0xff) * ch0 + (int)((p31 )&0xff) * ch1 + (int)((p32 )&0xff) * ch2 + (int)((p33 )&0xff) * ch3 + 128) >> 8;
+
+ int r = (r0 * cv0 + r1 * cv1 + r2 * cv2 + r3 * cv3 + (1<<19)) >> 20;
+ int g = (g0 * cv0 + g1 * cv1 + g2 * cv2 + g3 * cv3 + (1<<19)) >> 20;
+ int b = (b0 * cv0 + b1 * cv1 + b2 * cv2 + b3 * cv3 + (1<<19)) >> 20;
+
+ if (r<0) r=0; else if (r>255) r=255;
+ if (g<0) g=0; else if (g>255) g=255;
+ if (b<0) b=0; else if (b>255) b=255;
+
+ return (r<<16) + (g<<8) + b;
+ }
+}
+
+namespace {
+ enum {
+ kTop = 1,
+ kBottom = 2,
+ kLeft = 4,
+ kRight = 8,
+ kNear = 16,
+ kFar = 32
+ };
+
+ struct VDTriBltMipInfo {
+ const uint32 *mip;
+ ptrdiff_t pitch;
+ uint32 uvmul, _pad;
+ };
+
+ struct VDTriBltInfo {
+ VDTriBltMipInfo mips[16];
+ uint32 *dst;
+ const uint32 *src;
+ sint32 width;
+ const int *cubictab;
+ };
+
+ struct VDTriBltGenInfo {
+ float u;
+ float v;
+ float rhw;
+ float dudx;
+ float dvdx;
+ float drhwdx;
+ };
+
+ typedef void (*VDTriBltSpanFunction)(const VDTriBltInfo *);
+ typedef void (*VDTriBltGenFunction)(const VDTriBltGenInfo *);
+
+ void vd_triblt_span_point(const VDTriBltInfo *pInfo) {
+ sint32 w = -pInfo->width;
+ uint32 *dst = pInfo->dst + pInfo->width;
+ const uint32 *src = pInfo->src;
+ const uint32 *texture = pInfo->mips[0].mip;
+ const ptrdiff_t texpitch = pInfo->mips[0].pitch;
+
+ do {
+ dst[w] = vdptroffset(texture, texpitch * src[1])[src[0]];
+ src += 2;
+ } while(++w);
+ }
+
+ void vd_triblt_span_bilinear(const VDTriBltInfo *pInfo) {
+ sint32 w = -pInfo->width;
+ uint32 *dst = pInfo->dst + pInfo->width;
+ const uint32 *src = pInfo->src;
+ const uint32 *texture = pInfo->mips[0].mip;
+ const ptrdiff_t texpitch = pInfo->mips[0].pitch;
+
+ do {
+ const sint32 u = src[0];
+ const sint32 v = src[1];
+ src += 2;
+ const uint32 *src1 = vdptroffset(texture, texpitch * (v>>8)) + (u>>8);
+ const uint32 *src2 = vdptroffset(src1, texpitch);
+
+ dst[w] = bilerp_RGB888(src1[0], src1[1], src2[0], src2[1], u&255, v&255);
+ } while(++w);
+ }
+
+ void vd_triblt_span_trilinear(const VDTriBltInfo *pInfo) {
+ sint32 w = -pInfo->width;
+ uint32 *dst = pInfo->dst + pInfo->width;
+ const uint32 *src = pInfo->src;
+
+ do {
+ sint32 u = src[0];
+ sint32 v = src[1];
+ const sint32 lambda = src[2];
+ src += 3;
+
+ const sint32 lod = lambda >> 8;
+
+ const uint32 *texture1 = pInfo->mips[lod].mip;
+ const ptrdiff_t texpitch1 = pInfo->mips[lod].pitch;
+ const uint32 *texture2 = pInfo->mips[lod+1].mip;
+ const ptrdiff_t texpitch2 = pInfo->mips[lod+1].pitch;
+
+ u >>= lod;
+ v >>= lod;
+
+ u += 128;
+ v += 128;
+
+ const uint32 *src1 = vdptroffset(texture1, texpitch1 * (v>>8)) + (u>>8);
+ const uint32 *src2 = vdptroffset(src1, texpitch1);
+ const uint32 p1 = bilerp_RGB888(src1[0], src1[1], src2[0], src2[1], u&255, v&255);
+
+ u += 128;
+ v += 128;
+
+ const uint32 *src3 = vdptroffset(texture2, texpitch2 * (v>>9)) + (u>>9);
+ const uint32 *src4 = vdptroffset(src3, texpitch2);
+ const uint32 p2 = bilerp_RGB888(src3[0], src3[1], src4[0], src4[1], (u>>1)&255, (v>>1)&255);
+
+ dst[w] = lerp_RGB888(p1, p2, lambda & 255);
+ } while(++w);
+ }
+
+ void vd_triblt_span_bicubic_mip_linear(const VDTriBltInfo *pInfo) {
+ sint32 w = -pInfo->width;
+ uint32 *dst = pInfo->dst + pInfo->width;
+ const uint32 *src = pInfo->src;
+
+ do {
+ sint32 u = src[0];
+ sint32 v = src[1];
+ const sint32 lambda = src[2];
+ src += 3;
+
+ const sint32 lod = lambda >> 8;
+
+ const uint32 *texture1 = pInfo->mips[lod].mip;
+ const ptrdiff_t texpitch1 = pInfo->mips[lod].pitch;
+ const uint32 *texture2 = pInfo->mips[lod+1].mip;
+ const ptrdiff_t texpitch2 = pInfo->mips[lod+1].pitch;
+
+ u >>= lod;
+ v >>= lod;
+
+ u += 128;
+ v += 128;
+
+ const uint32 *src1 = vdptroffset(texture1, texpitch1 * (v>>8)) + (u>>8);
+ const uint32 *src2 = vdptroffset(src1, texpitch1);
+ const uint32 *src3 = vdptroffset(src2, texpitch1);
+ const uint32 *src4 = vdptroffset(src3, texpitch1);
+ const uint32 p1 = bicubic_RGB888(src1, src2, src3, src4, u&255, v&255);
+
+ u += 128;
+ v += 128;
+
+ const uint32 *src5 = vdptroffset(texture2, texpitch2 * (v>>9)) + (u>>9);
+ const uint32 *src6 = vdptroffset(src5, texpitch2);
+ const uint32 *src7 = vdptroffset(src6, texpitch2);
+ const uint32 *src8 = vdptroffset(src7, texpitch2);
+ const uint32 p2 = bicubic_RGB888(src5, src6, src7, src8, (u>>1)&255, (v>>1)&255);
+
+ dst[w] = lerp_RGB888(p1, p2, lambda & 255);
+ } while(++w);
+ }
+
+#ifdef _M_IX86
+ extern "C" void vdasm_triblt_span_bilinear_mmx(const VDTriBltInfo *pInfo);
+ extern "C" void vdasm_triblt_span_trilinear_mmx(const VDTriBltInfo *pInfo);
+ extern "C" void vdasm_triblt_span_bicubic_mip_linear_mmx(const VDTriBltInfo *pInfo);
+ extern "C" void vdasm_triblt_span_bicubic_mip_linear_sse2(const VDTriBltInfo *pInfo);
+ extern "C" void vdasm_triblt_span_point(const VDTriBltInfo *pInfo);
+#endif
+
+ struct VDTriBltTransformedVertex {
+ float x, y, z;
+ union {
+ float w;
+ float rhw;
+ };
+ float r, g, b, a;
+ float u, v;
+ int outcode;
+
+ void interp(const VDTriBltTransformedVertex *v1, const VDTriBltTransformedVertex *v2, float alpha) {
+ x = v1->x + alpha * (v2->x - v1->x);
+ y = v1->y + alpha * (v2->y - v1->y);
+ z = v1->z + alpha * (v2->z - v1->z);
+ w = v1->w + alpha * (v2->w - v1->w);
+
+ r = v1->r + alpha * (v2->r - v1->r);
+ g = v1->g + alpha * (v2->g - v1->g);
+ b = v1->b + alpha * (v2->b - v1->b);
+ a = v1->a + alpha * (v2->a - v1->a);
+
+ u = v1->u + alpha * (v2->u - v1->u);
+ v = v1->v + alpha * (v2->v - v1->v);
+
+ outcode = (x < -w ? kLeft : 0)
+ + (x > +w ? kRight : 0)
+ + (y < -w ? kTop : 0)
+ + (y > +w ? kBottom : 0)
+ + (z < -w ? kNear : 0)
+ + (z > +w ? kFar : 0);
+ }
+ };
+
+ void TransformVerts(VDTriBltTransformedVertex *dst, const VDTriBltVertex *src, int nVerts, const float xform[16]) {
+ const float xflocal[16]={
+ xform[ 0], xform[ 1], xform[ 2], xform[ 3],
+ xform[ 4], xform[ 5], xform[ 6], xform[ 7],
+ xform[ 8], xform[ 9], xform[10], xform[11],
+ xform[12], xform[13], xform[14], xform[15],
+ };
+
+ if (nVerts <= 0)
+ return;
+
+ do {
+ const float x0 = src->x;
+ const float y0 = src->y;
+ const float z0 = src->z;
+
+ const float w = x0*xflocal[12] + y0*xflocal[13] + z0*xflocal[14] + xflocal[15];
+ const float x = x0*xflocal[ 0] + y0*xflocal[ 1] + z0*xflocal[ 2] + xflocal[ 3];
+ const float y = x0*xflocal[ 4] + y0*xflocal[ 5] + z0*xflocal[ 6] + xflocal[ 7];
+ const float z = x0*xflocal[ 8] + y0*xflocal[ 9] + z0*xflocal[10] + xflocal[11];
+
+ int outcode = 0;
+
+ if (x < -w) outcode += kLeft;
+ if (x > w) outcode += kRight;
+ if (y < -w) outcode += kTop;
+ if (y > w) outcode += kBottom;
+ if (z < -w) outcode += kNear;
+ if (z > w) outcode += kFar;
+
+ dst->x = x;
+ dst->y = y;
+ dst->z = z;
+ dst->w = w;
+ dst->u = src->u;
+ dst->v = src->v;
+ dst->r = 1.0f;
+ dst->g = 1.0f;
+ dst->b = 1.0f;
+ dst->a = 1.0f;
+ dst->outcode = outcode;
+
+ ++src;
+ ++dst;
+ } while(--nVerts);
+ }
+
+ void TransformVerts(VDTriBltTransformedVertex *dst, const VDTriColorVertex *src, int nVerts, const float xform[16]) {
+ const float xflocal[16]={
+ xform[ 0], xform[ 1], xform[ 2], xform[ 3],
+ xform[ 4], xform[ 5], xform[ 6], xform[ 7],
+ xform[ 8], xform[ 9], xform[10], xform[11],
+ xform[12], xform[13], xform[14], xform[15],
+ };
+
+ if (nVerts <= 0)
+ return;
+
+ do {
+ const float x0 = src->x;
+ const float y0 = src->y;
+ const float z0 = src->z;
+
+ const float w = x0*xflocal[12] + y0*xflocal[13] + z0*xflocal[14] + xflocal[15];
+ const float x = x0*xflocal[ 0] + y0*xflocal[ 1] + z0*xflocal[ 2] + xflocal[ 3];
+ const float y = x0*xflocal[ 4] + y0*xflocal[ 5] + z0*xflocal[ 6] + xflocal[ 7];
+ const float z = x0*xflocal[ 8] + y0*xflocal[ 9] + z0*xflocal[10] + xflocal[11];
+
+ int outcode = 0;
+
+ if (x < -w) outcode += kLeft;
+ if (x > w) outcode += kRight;
+ if (y < -w) outcode += kTop;
+ if (y > w) outcode += kBottom;
+ if (z < -w) outcode += kNear;
+ if (z > w) outcode += kFar;
+
+ dst->x = x;
+ dst->y = y;
+ dst->z = z;
+ dst->w = w;
+ dst->u = 0.0f;
+ dst->v = 0.0f;
+ dst->r = src->r;
+ dst->g = src->g;
+ dst->b = src->b;
+ dst->a = src->a;
+ dst->outcode = outcode;
+
+ ++src;
+ ++dst;
+ } while(--nVerts);
+ }
+
+ struct VDTriangleSetupInfo {
+ const VDTriBltTransformedVertex *pt, *pr, *pl;
+ VDTriBltTransformedVertex tmp0, tmp1, tmp2;
+ };
+
+ void SetupTri(
+ VDTriangleSetupInfo& setup,
+ VDPixmap& dst,
+ const VDTriBltTransformedVertex *vx0,
+ const VDTriBltTransformedVertex *vx1,
+ const VDTriBltTransformedVertex *vx2,
+ const VDTriBltFilterMode *filterMode
+ )
+ {
+ setup.tmp0 = *vx0;
+ setup.tmp1 = *vx1;
+ setup.tmp2 = *vx2;
+
+ // adjust UVs for filter mode
+ if (filterMode) {
+ switch(*filterMode) {
+ case kTriBltFilterBilinear:
+ setup.tmp0.u += 0.5f;
+ setup.tmp0.v += 0.5f;
+ setup.tmp1.u += 0.5f;
+ setup.tmp1.v += 0.5f;
+ setup.tmp2.u += 0.5f;
+ setup.tmp2.v += 0.5f;
+ case kTriBltFilterTrilinear:
+ case kTriBltFilterBicubicMipLinear:
+ setup.tmp0.u *= 256.0f;
+ setup.tmp0.v *= 256.0f;
+ setup.tmp1.u *= 256.0f;
+ setup.tmp1.v *= 256.0f;
+ setup.tmp2.u *= 256.0f;
+ setup.tmp2.v *= 256.0f;
+ break;
+ case kTriBltFilterPoint:
+ setup.tmp0.u += 1.0f;
+ setup.tmp0.v += 1.0f;
+ setup.tmp1.u += 1.0f;
+ setup.tmp1.v += 1.0f;
+ setup.tmp2.u += 1.0f;
+ setup.tmp2.v += 1.0f;
+ break;
+ }
+ }
+
+ // do perspective divide and NDC space conversion
+ const float xscale = dst.w * 0.5f;
+ const float yscale = dst.h * 0.5f;
+
+ setup.tmp0.rhw = 1.0f / setup.tmp0.w;
+ setup.tmp0.x = (1.0f+setup.tmp0.x*setup.tmp0.rhw)*xscale;
+ setup.tmp0.y = (1.0f+setup.tmp0.y*setup.tmp0.rhw)*yscale;
+ setup.tmp0.u *= setup.tmp0.rhw;
+ setup.tmp0.v *= setup.tmp0.rhw;
+ setup.tmp0.r *= setup.tmp0.rhw;
+ setup.tmp0.g *= setup.tmp0.rhw;
+ setup.tmp0.b *= setup.tmp0.rhw;
+ setup.tmp0.a *= setup.tmp0.rhw;
+ setup.tmp1.rhw = 1.0f / setup.tmp1.w;
+ setup.tmp1.x = (1.0f+setup.tmp1.x*setup.tmp1.rhw)*xscale;
+ setup.tmp1.y = (1.0f+setup.tmp1.y*setup.tmp1.rhw)*yscale;
+ setup.tmp1.u *= setup.tmp1.rhw;
+ setup.tmp1.v *= setup.tmp1.rhw;
+ setup.tmp1.r *= setup.tmp1.rhw;
+ setup.tmp1.g *= setup.tmp1.rhw;
+ setup.tmp1.b *= setup.tmp1.rhw;
+ setup.tmp1.a *= setup.tmp1.rhw;
+ setup.tmp2.rhw = 1.0f / setup.tmp2.w;
+ setup.tmp2.x = (1.0f+setup.tmp2.x*setup.tmp2.rhw)*xscale;
+ setup.tmp2.y = (1.0f+setup.tmp2.y*setup.tmp2.rhw)*yscale;
+ setup.tmp2.u *= setup.tmp2.rhw;
+ setup.tmp2.v *= setup.tmp2.rhw;
+ setup.tmp2.r *= setup.tmp2.rhw;
+ setup.tmp2.g *= setup.tmp2.rhw;
+ setup.tmp2.b *= setup.tmp2.rhw;
+ setup.tmp2.a *= setup.tmp2.rhw;
+
+ // verify clipping
+ VDASSERT(setup.tmp0.x >= 0 && setup.tmp0.x <= dst.w);
+ VDASSERT(setup.tmp1.x >= 0 && setup.tmp1.x <= dst.w);
+ VDASSERT(setup.tmp2.x >= 0 && setup.tmp2.x <= dst.w);
+ VDASSERT(setup.tmp0.y >= 0 && setup.tmp0.y <= dst.h);
+ VDASSERT(setup.tmp1.y >= 0 && setup.tmp1.y <= dst.h);
+ VDASSERT(setup.tmp2.y >= 0 && setup.tmp2.y <= dst.h);
+
+ vx0 = &setup.tmp0;
+ vx1 = &setup.tmp1;
+ vx2 = &setup.tmp2;
+
+ const VDTriBltTransformedVertex *pt, *pl, *pr;
+
+ // sort points
+ if (vx0->y < vx1->y) // 1 < 2
+ if (vx0->y < vx2->y) { // 1 < 2,3
+ pt = vx0;
+ pr = vx1;
+ pl = vx2;
+ } else { // 3 < 1 < 2
+ pt = vx2;
+ pr = vx0;
+ pl = vx1;
+ }
+ else // 2 < 1
+ if (vx1->y < vx2->y) { // 2 < 1,3
+ pt = vx1;
+ pr = vx2;
+ pl = vx0;
+ } else { // 3 < 2 < 1
+ pt = vx2;
+ pr = vx0;
+ pl = vx1;
+ }
+
+ setup.pl = pl;
+ setup.pt = pt;
+ setup.pr = pr;
+ }
+
+ void RenderTri(VDPixmap& dst, const VDPixmap *const *pSources, int nMipmaps,
+ const VDTriBltTransformedVertex *vx0,
+ const VDTriBltTransformedVertex *vx1,
+ const VDTriBltTransformedVertex *vx2,
+ VDTriBltFilterMode filterMode,
+ float mipMapLODBias)
+ {
+ VDTriangleSetupInfo setup;
+
+ SetupTri(setup, dst, vx0, vx1, vx2, &filterMode);
+
+ const VDTriBltTransformedVertex *pt = setup.pt, *pl = setup.pl, *pr = setup.pr;
+
+ const float x10 = pl->x - pt->x;
+ const float x20 = pr->x - pt->x;
+ const float y10 = pl->y - pt->y;
+ const float y20 = pr->y - pt->y;
+ const float A = x20*y10 - x10*y20;
+
+ if (A <= 0.f)
+ return;
+
+ float invA = 0.f;
+ if (A >= 1e-5f)
+ invA = 1.0f / A;
+
+ float x10_A = x10 * invA;
+ float x20_A = x20 * invA;
+ float y10_A = y10 * invA;
+ float y20_A = y20 * invA;
+
+ float u10 = pl->u - pt->u;
+ float u20 = pr->u - pt->u;
+ float v10 = pl->v - pt->v;
+ float v20 = pr->v - pt->v;
+ float rhw10 = pl->rhw - pt->rhw;
+ float rhw20 = pr->rhw - pt->rhw;
+
+ float dudx = u20*y10_A - u10*y20_A;
+ float dudy = u10*x20_A - u20*x10_A;
+ float dvdx = v20*y10_A - v10*y20_A;
+ float dvdy = v10*x20_A - v20*x10_A;
+ float drhwdx = rhw20*y10_A - rhw10*y20_A;
+ float drhwdy = rhw10*x20_A - rhw20*x10_A;
+
+ // Compute edge walking parameters
+
+ float dxl1=0, dxr1=0, dul1=0, dvl1=0, drhwl1=0;
+ float dxl2=0, dxr2=0, dul2=0, dvl2=0, drhwl2=0;
+
+ // Compute left-edge interpolation parameters for first half.
+
+ if (pl->y != pt->y) {
+ dxl1 = (pl->x - pt->x) / (pl->y - pt->y);
+
+ dul1 = dudy + dxl1 * dudx;
+ dvl1 = dvdy + dxl1 * dvdx;
+ drhwl1 = drhwdy + dxl1 * drhwdx;
+ }
+
+ // Compute right-edge interpolation parameters for first half.
+
+ if (pr->y != pt->y) {
+ dxr1 = (pr->x - pt->x) / (pr->y - pt->y);
+ }
+
+ // Compute third-edge interpolation parameters.
+
+ if (pr->y != pl->y) {
+ dxl2 = (pr->x - pl->x) / (pr->y - pl->y);
+
+ dul2 = dudy + dxl2 * dudx;
+ dvl2 = dvdy + dxl2 * dvdx;
+ drhwl2 = drhwdy + dxl2 * drhwdx;
+
+ dxr2 = dxl2;
+ }
+
+ // Initialize parameters for first half.
+ //
+ // We place pixel centers at (x+0.5, y+0.5).
+
+ double xl, xr, ul, vl, rhwl, yf;
+ int y, y1, y2;
+
+ // y_start < y+0.5 to include pixel y.
+
+ y = (int)floor(pt->y + 0.5);
+ yf = (y+0.5) - pt->y;
+
+ xl = pt->x + dxl1 * yf;
+ xr = pt->x + dxr1 * yf;
+ ul = pt->u + dul1 * yf;
+ vl = pt->v + dvl1 * yf;
+ rhwl = pt->rhw + drhwl1 * yf;
+
+ // Initialize parameters for second half.
+
+ double xl2, xr2, ul2, vl2, rhwl2;
+
+ if (pl->y > pr->y) { // Left edge is long side
+ dxl2 = dxl1;
+ dul2 = dul1;
+ dvl2 = dvl1;
+ drhwl2 = drhwl1;
+
+ y1 = (int)floor(pr->y + 0.5);
+ y2 = (int)floor(pl->y + 0.5);
+
+ yf = (y1+0.5) - pr->y;
+
+ // Step left edge.
+
+ xl2 = xl + dxl1 * (y1 - y);
+ ul2 = ul + dul1 * (y1 - y);
+ vl2 = vl + dvl1 * (y1 - y);
+ rhwl2 = rhwl + drhwl1 * (y1 - y);
+
+ // Prestep right edge.
+
+ xr2 = pr->x + dxr2 * yf;
+ } else { // Right edge is long side
+ dxr2 = dxr1;
+
+ y1 = (int)floor(pl->y + 0.5);
+ y2 = (int)floor(pr->y + 0.5);
+
+ yf = (y1+0.5) - pl->y;
+
+ // Prestep left edge.
+
+ xl2 = pl->x + dxl2 * yf;
+ ul2 = pl->u + dul2 * yf;
+ vl2 = pl->v + dvl2 * yf;
+ rhwl2 = pl->rhw + drhwl2 * yf;
+
+ // Step right edge.
+
+ xr2 = xr + dxr1 * (y1 - y);
+ }
+
+ // rasterize
+ const ptrdiff_t dstpitch = dst.pitch;
+ uint32 *dstp = (uint32 *)((char *)dst.data + dstpitch * y);
+
+ VDTriBltInfo texinfo;
+ VDTriBltSpanFunction drawSpan;
+ uint32 cpuflags = CPUGetEnabledExtensions();
+
+ bool triBlt16 = false;
+
+ switch(filterMode) {
+ case kTriBltFilterBicubicMipLinear:
+#ifdef _M_IX86
+ if (cpuflags & CPUF_SUPPORTS_SSE2) {
+ drawSpan = vdasm_triblt_span_bicubic_mip_linear_sse2;
+ triBlt16 = true;
+ } else if (cpuflags & CPUF_SUPPORTS_MMX) {
+ drawSpan = vdasm_triblt_span_bicubic_mip_linear_mmx;
+ triBlt16 = true;
+ } else
+#endif
+ drawSpan = vd_triblt_span_bicubic_mip_linear;
+ break;
+ case kTriBltFilterTrilinear:
+#ifdef _M_IX86
+ if (cpuflags & CPUF_SUPPORTS_MMX) {
+ drawSpan = vdasm_triblt_span_trilinear_mmx;
+ triBlt16 = true;
+ } else
+#endif
+ drawSpan = vd_triblt_span_trilinear;
+ break;
+ case kTriBltFilterBilinear:
+#ifdef _M_IX86
+ if (cpuflags & CPUF_SUPPORTS_MMX) {
+ drawSpan = vdasm_triblt_span_bilinear_mmx;
+ triBlt16 = true;
+ } else
+#endif
+ drawSpan = vd_triblt_span_bilinear;
+ break;
+ case kTriBltFilterPoint:
+ drawSpan = vd_triblt_span_point;
+ break;
+ }
+
+ float rhobase = sqrtf(std::max<float>(dudx*dudx + dvdx*dvdx, dudy*dudy + dvdy*dvdy) * (1.0f / 65536.0f)) * powf(2.0f, mipMapLODBias);
+
+ if (triBlt16) {
+ ul *= 256.0f;
+ vl *= 256.0f;
+ ul2 *= 256.0f;
+ vl2 *= 256.0f;
+ dul1 *= 256.0f;
+ dvl1 *= 256.0f;
+ dul2 *= 256.0f;
+ dvl2 *= 256.0f;
+ dudx *= 256.0f;
+ dvdx *= 256.0f;
+ dudy *= 256.0f;
+ dvdy *= 256.0f;
+ }
+
+ int minx1 = (int)floor(std::min<float>(std::min<float>(pl->x, pr->x), pt->x) + 0.5);
+ int maxx2 = (int)floor(std::max<float>(std::max<float>(pl->x, pr->x), pt->x) + 0.5);
+
+ uint32 *const spanptr = new uint32[3 * (maxx2 - minx1)];
+
+ while(y < y2) {
+ if (y == y1) {
+ xl = xl2;
+ xr = xr2;
+ ul = ul2;
+ vl = vl2;
+ rhwl = rhwl2;
+ dxl1 = dxl2;
+ dxr1 = dxr2;
+ dul1 = dul2;
+ dvl1 = dvl2;
+ drhwl1 = drhwl2;
+ }
+
+ int x1, x2;
+ double xf;
+ double u, v, rhw;
+
+ // x_left must be less than (x+0.5) to include pixel x.
+
+ x1 = (int)floor(xl + 0.5);
+ x2 = (int)floor(xr + 0.5);
+ xf = (x1+0.5) - xl;
+
+ u = ul + xf * dudx;
+ v = vl + xf * dvdx;
+ rhw = rhwl + xf * drhwdx;
+
+ int x = x1;
+ uint32 *spanp = spanptr;
+
+ float w = 1.0f / (float)rhw;
+
+ if (x < x2) {
+ if (filterMode >= kTriBltFilterTrilinear) {
+ do {
+ int utexel = VDRoundToIntFastFullRange(u * w);
+ int vtexel = VDRoundToIntFastFullRange(v * w);
+ union{ float f; sint32 i; } rho = {rhobase * w};
+
+ int lambda = ((rho.i - 0x3F800000) >> (23-8));
+ if (lambda < 0)
+ lambda = 0;
+ if (lambda >= (nMipmaps<<8)-256)
+ lambda = (nMipmaps<<8)-257;
+
+ spanp[0] = utexel;
+ spanp[1] = vtexel;
+ spanp[2] = lambda;
+ spanp += 3;
+
+ u += dudx;
+ v += dvdx;
+ rhw += drhwdx;
+
+ w *= (2.0f - w*(float)rhw);
+ } while(++x < x2);
+ } else {
+ do {
+ int utexel = VDFloorToInt(u * w);
+ int vtexel = VDFloorToInt(v * w);
+
+ spanp[0] = utexel;
+ spanp[1] = vtexel;
+ spanp += 2;
+
+ u += dudx;
+ v += dvdx;
+ rhw += drhwdx;
+
+ w *= (2.0f - w*(float)rhw);
+ } while(++x < x2);
+ }
+ }
+
+ for(int i=0; i<nMipmaps; ++i) {
+ texinfo.mips[i].mip = (const uint32 *)pSources[i]->data;
+ texinfo.mips[i].pitch = pSources[i]->pitch;
+ texinfo.mips[i].uvmul = (pSources[i]->pitch << 16) + 4;
+ }
+ texinfo.dst = dstp+x1;
+ texinfo.src = spanptr;
+ texinfo.width = x2-x1;
+
+ if (texinfo.width>0)
+ drawSpan(&texinfo);
+
+ dstp = vdptroffset(dstp, dstpitch);
+ xl += dxl1;
+ xr += dxr1;
+ ul += dul1;
+ vl += dvl1;
+ rhwl += drhwl1;
+
+ ++y;
+ }
+
+ delete[] spanptr;
+ }
+
+ void FillTri(VDPixmap& dst, uint32 c,
+ const VDTriBltTransformedVertex *vx0,
+ const VDTriBltTransformedVertex *vx1,
+ const VDTriBltTransformedVertex *vx2
+ )
+ {
+
+ VDTriangleSetupInfo setup;
+
+ SetupTri(setup, dst, vx0, vx1, vx2, NULL);
+
+ const VDTriBltTransformedVertex *pt = setup.pt, *pl = setup.pl, *pr = setup.pr;
+
+ // Compute edge walking parameters
+ float dxl1=0, dxr1=0;
+ float dxl2=0, dxr2=0;
+
+ float x_lt = pl->x - pt->x;
+ float x_rt = pr->x - pt->x;
+ float x_rl = pr->x - pl->x;
+ float y_lt = pl->y - pt->y;
+ float y_rt = pr->y - pt->y;
+ float y_rl = pr->y - pl->y;
+
+ // reject backfaces
+ if (x_lt*y_rt >= x_rt*y_lt)
+ return;
+
+ // Compute left-edge interpolation parameters for first half.
+ if (pl->y != pt->y)
+ dxl1 = x_lt / y_lt;
+
+ // Compute right-edge interpolation parameters for first half.
+ if (pr->y != pt->y)
+ dxr1 = x_rt / y_rt;
+
+ // Compute third-edge interpolation parameters.
+ if (pr->y != pl->y) {
+ dxl2 = x_rl / y_rl;
+
+ dxr2 = dxl2;
+ }
+
+ // Initialize parameters for first half.
+ //
+ // We place pixel centers at (x+0.5, y+0.5).
+
+ double xl, xr, yf;
+ int y, y1, y2;
+
+ // y_start < y+0.5 to include pixel y.
+
+ y = (int)floor(pt->y + 0.5);
+ yf = (y+0.5) - pt->y;
+
+ xl = pt->x + dxl1 * yf;
+ xr = pt->x + dxr1 * yf;
+
+ // Initialize parameters for second half.
+ double xl2, xr2;
+
+ if (pl->y > pr->y) { // Left edge is long side
+ dxl2 = dxl1;
+
+ y1 = (int)floor(pr->y + 0.5);
+ y2 = (int)floor(pl->y + 0.5);
+
+ yf = (y1+0.5) - pr->y;
+
+ // Prestep right edge.
+ xr2 = pr->x + dxr2 * yf;
+
+ // Step left edge.
+ xl2 = xl + dxl1 * (y1 - y);
+ } else { // Right edge is long side
+ dxr2 = dxr1;
+
+ y1 = (int)floor(pl->y + 0.5);
+ y2 = (int)floor(pr->y + 0.5);
+
+ yf = (y1+0.5) - pl->y;
+
+ // Prestep left edge.
+ xl2 = pl->x + dxl2 * yf;
+
+ // Step right edge.
+ xr2 = xr + dxr1 * (y1 - y);
+ }
+
+ // rasterize
+ const ptrdiff_t dstpitch = dst.pitch;
+ uint32 *dstp = (uint32 *)((char *)dst.data + dstpitch * y);
+
+ while(y < y2) {
+ if (y == y1) {
+ xl = xl2;
+ xr = xr2;
+ dxl1 = dxl2;
+ dxr1 = dxr2;
+ }
+
+ int x1, x2;
+ double xf;
+
+ // x_left must be less than (x+0.5) to include pixel x.
+
+ x1 = (int)floor(xl + 0.5);
+ x2 = (int)floor(xr + 0.5);
+ xf = (x1+0.5) - xl;
+
+ while(x1 < x2)
+ dstp[x1++] = c;
+
+ dstp = vdptroffset(dstp, dstpitch);
+ xl += dxl1;
+ xr += dxr1;
+ ++y;
+ }
+ }
+
+ void FillTriGrad(VDPixmap& dst,
+ const VDTriBltTransformedVertex *vx0,
+ const VDTriBltTransformedVertex *vx1,
+ const VDTriBltTransformedVertex *vx2
+ )
+ {
+
+ VDTriangleSetupInfo setup;
+
+ SetupTri(setup, dst, vx0, vx1, vx2, NULL);
+
+ const VDTriBltTransformedVertex *pt = setup.pt, *pl = setup.pl, *pr = setup.pr;
+ const float x10 = pl->x - pt->x;
+ const float x20 = pr->x - pt->x;
+ const float y10 = pl->y - pt->y;
+ const float y20 = pr->y - pt->y;
+ const float A = x20*y10 - x10*y20;
+
+ if (A <= 0.f)
+ return;
+
+ float invA = 0.f;
+ if (A >= 1e-5f)
+ invA = 1.0f / A;
+
+ float x10_A = x10 * invA;
+ float x20_A = x20 * invA;
+ float y10_A = y10 * invA;
+ float y20_A = y20 * invA;
+
+ float r10 = pl->r - pt->r;
+ float r20 = pr->r - pt->r;
+ float g10 = pl->g - pt->g;
+ float g20 = pr->g - pt->g;
+ float b10 = pl->b - pt->b;
+ float b20 = pr->b - pt->b;
+ float a10 = pl->a - pt->a;
+ float a20 = pr->a - pt->a;
+ float rhw10 = pl->rhw - pt->rhw;
+ float rhw20 = pr->rhw - pt->rhw;
+
+ float drdx = r20*y10_A - r10*y20_A;
+ float drdy = r10*x20_A - r20*x10_A;
+ float dgdx = g20*y10_A - g10*y20_A;
+ float dgdy = g10*x20_A - g20*x10_A;
+ float dbdx = b20*y10_A - b10*y20_A;
+ float dbdy = b10*x20_A - b20*x10_A;
+ float dadx = a20*y10_A - a10*y20_A;
+ float dady = a10*x20_A - a20*x10_A;
+ float drhwdx = rhw20*y10_A - rhw10*y20_A;
+ float drhwdy = rhw10*x20_A - rhw20*x10_A;
+
+ // Compute edge walking parameters
+ float dxl1=0;
+ float drl1=0;
+ float dgl1=0;
+ float dbl1=0;
+ float dal1=0;
+ float drhwl1=0;
+ float dxr1=0;
+ float dxl2=0;
+ float drl2=0;
+ float dgl2=0;
+ float dbl2=0;
+ float dal2=0;
+ float drhwl2=0;
+ float dxr2=0;
+
+ float x_lt = pl->x - pt->x;
+ float x_rt = pr->x - pt->x;
+ float x_rl = pr->x - pl->x;
+ float y_lt = pl->y - pt->y;
+ float y_rt = pr->y - pt->y;
+ float y_rl = pr->y - pl->y;
+
+ // Compute left-edge interpolation parameters for first half.
+ if (pl->y != pt->y) {
+ dxl1 = x_lt / y_lt;
+ drl1 = drdy + dxl1 * drdx;
+ dgl1 = dgdy + dxl1 * dgdx;
+ dbl1 = dbdy + dxl1 * dbdx;
+ dal1 = dady + dxl1 * dadx;
+ drhwl1 = drhwdy + dxl1 * drhwdx;
+ }
+
+ // Compute right-edge interpolation parameters for first half.
+ if (pr->y != pt->y)
+ dxr1 = x_rt / y_rt;
+
+ // Compute third-edge interpolation parameters.
+ if (pr->y != pl->y) {
+ dxl2 = x_rl / y_rl;
+
+ drl2 = drdy + dxl2 * drdx;
+ dgl2 = dgdy + dxl2 * dgdx;
+ dbl2 = dbdy + dxl2 * dbdx;
+ dal2 = dady + dxl2 * dadx;
+ drhwl2 = drhwdy + dxl2 * drhwdx;
+
+ dxr2 = dxl2;
+ }
+
+ // Initialize parameters for first half.
+ //
+ // We place pixel centers at (x+0.5, y+0.5).
+
+ double xl, xr, yf;
+ double rl, gl, bl, al, rhwl;
+ double rl2, gl2, bl2, al2, rhwl2;
+ int y, y1, y2;
+
+ // y_start < y+0.5 to include pixel y.
+
+ y = (int)floor(pt->y + 0.5);
+ yf = (y+0.5) - pt->y;
+
+ xl = pt->x + dxl1 * yf;
+ xr = pt->x + dxr1 * yf;
+ rl = pt->r + drl1 * yf;
+ gl = pt->g + dgl1 * yf;
+ bl = pt->b + dbl1 * yf;
+ al = pt->a + dal1 * yf;
+ rhwl = pt->rhw + drhwl1 * yf;
+
+ // Initialize parameters for second half.
+ double xl2, xr2;
+
+ if (pl->y > pr->y) { // Left edge is long side
+ dxl2 = dxl1;
+ drl2 = drl1;
+ dgl2 = dgl1;
+ dbl2 = dbl1;
+ dal2 = dal1;
+ drhwl2 = drhwl1;
+
+ y1 = (int)floor(pr->y + 0.5);
+ y2 = (int)floor(pl->y + 0.5);
+
+ yf = (y1+0.5) - pr->y;
+
+ // Step left edge.
+ xl2 = xl + dxl1 * (y1 - y);
+ rl2 = rl + drl1 * (y1 - y);
+ gl2 = gl + dgl1 * (y1 - y);
+ bl2 = bl + dbl1 * (y1 - y);
+ al2 = al + dal1 * (y1 - y);
+ rhwl2 = rhwl + drhwl1 * (y1 - y);
+
+ // Prestep right edge.
+ xr2 = pr->x + dxr2 * yf;
+ } else { // Right edge is long side
+ dxr2 = dxr1;
+
+ y1 = (int)floor(pl->y + 0.5);
+ y2 = (int)floor(pr->y + 0.5);
+
+ yf = (y1+0.5) - pl->y;
+
+ // Prestep left edge.
+ xl2 = pl->x + dxl2 * yf;
+ rl2 = pl->r + drl2 * yf;
+ gl2 = pl->g + dgl2 * yf;
+ bl2 = pl->b + dbl2 * yf;
+ al2 = pl->a + dal2 * yf;
+ rhwl2 = pl->rhw + drhwl2 * yf;
+
+ // Step right edge.
+ xr2 = xr + dxr2 * (y1 - y);
+ }
+
+ // rasterize
+ const ptrdiff_t dstpitch = dst.pitch;
+ char *dstp0 = (char *)dst.data + dstpitch * y;
+
+ while(y < y2) {
+ if (y == y1) {
+ xl = xl2;
+ xr = xr2;
+ rl = rl2;
+ gl = gl2;
+ bl = bl2;
+ al = al2;
+ rhwl = rhwl2;
+ dxl1 = dxl2;
+ drl1 = drl2;
+ dgl1 = dgl2;
+ dbl1 = dbl2;
+ dal1 = dal2;
+ drhwl1 = drhwl2;
+ dxr1 = dxr2;
+ }
+
+ int x1, x2;
+ double xf;
+ double r, g, b, a, rhw;
+
+ // x_left must be less than (x+0.5) to include pixel x.
+
+ x1 = (int)floor(xl + 0.5);
+ x2 = (int)floor(xr + 0.5);
+ xf = (x1+0.5) - xl;
+
+ r = rl + xf * drdx;
+ g = gl + xf * dgdx;
+ b = bl + xf * dbdx;
+ a = al + xf * dadx;
+ rhw = rhwl + xf * drhwdx;
+
+ float w = 1.0f / (float)rhw;
+
+ if (x1 < x2) {
+ if (dst.format == nsVDPixmap::kPixFormat_XRGB8888) {
+ uint32 *dstp = (uint32 *)dstp0;
+
+ do {
+ float sr = (float)(r * w);
+ float sg = (float)(g * w);
+ float sb = (float)(b * w);
+ float sa = (float)(a * w);
+
+ uint8 ir = VDClampedRoundFixedToUint8Fast(sr);
+ uint8 ig = VDClampedRoundFixedToUint8Fast(sg);
+ uint8 ib = VDClampedRoundFixedToUint8Fast(sb);
+ uint8 ia = VDClampedRoundFixedToUint8Fast(sa);
+
+ dstp[x1] = ((uint32)ia << 24) + ((uint32)ir << 16) + ((uint32)ig << 8) + ib;
+
+ r += drdx;
+ g += dgdx;
+ b += dbdx;
+ a += dadx;
+ rhw += drhwdx;
+
+ w *= (2.0f - w*(float)rhw);
+ } while(++x1 < x2);
+ } else {
+ uint8 *dstp = (uint8 *)dstp0;
+
+ do {
+ float sg = (float)(g * w);
+
+ uint8 ig = VDClampedRoundFixedToUint8Fast(sg);
+
+ dstp[x1] = ig;
+
+ g += dgdx;
+ rhw += drhwdx;
+
+ w *= (2.0f - w*(float)rhw);
+ } while(++x1 < x2);
+ }
+ }
+
+ dstp0 = vdptroffset(dstp0, dstpitch);
+ xl += dxl1;
+ rl += drl1;
+ gl += dgl1;
+ bl += dbl1;
+ al += dal1;
+ rhwl += drhwl1;
+ xr += dxr1;
+ ++y;
+ }
+ }
+
+ struct VDTriClipWorkspace {
+ VDTriBltTransformedVertex *vxheapptr[2][19];
+ VDTriBltTransformedVertex vxheap[21];
+ };
+
+ VDTriBltTransformedVertex **VDClipTriangle(VDTriClipWorkspace& ws,
+ const VDTriBltTransformedVertex *vx0,
+ const VDTriBltTransformedVertex *vx1,
+ const VDTriBltTransformedVertex *vx2,
+ int orflags) {
+ // Each line segment can intersect all six planes, meaning the maximum bound is
+ // 18 vertices. Add 3 for the original.
+
+ VDTriBltTransformedVertex *vxheapnext;
+ VDTriBltTransformedVertex **vxlastheap = ws.vxheapptr[0], **vxnextheap = ws.vxheapptr[1];
+
+ ws.vxheap[0] = *vx0;
+ ws.vxheap[1] = *vx1;
+ ws.vxheap[2] = *vx2;
+
+ vxlastheap[0] = &ws.vxheap[0];
+ vxlastheap[1] = &ws.vxheap[1];
+ vxlastheap[2] = &ws.vxheap[2];
+ vxlastheap[3] = NULL;
+
+ vxheapnext = ws.vxheap + 3;
+
+ // Current Next Action
+ // ------- ---- ------
+ // Unclipped Unclipped Copy vertex
+ // Unclipped Clipped Copy vertex and add intersection
+ // Clipped Unclipped Add intersection
+ // Clipped Clipped No action
+
+#define DOCLIP(cliptype, _sign_, cliparg) \
+ if (orflags & k##cliptype) { \
+ VDTriBltTransformedVertex **src = vxlastheap; \
+ VDTriBltTransformedVertex **dst = vxnextheap; \
+ \
+ while(*src) { \
+ VDTriBltTransformedVertex *cur = *src; \
+ VDTriBltTransformedVertex *next = src[1]; \
+ \
+ if (!next) \
+ next = vxlastheap[0]; \
+ \
+ if (!(cur->outcode & k##cliptype)) \
+ *dst++ = cur; \
+ \
+ if ((cur->outcode ^ next->outcode) & k##cliptype) { \
+ double alpha = (cur->w _sign_ cur->cliparg) / ((cur->w _sign_ cur->cliparg) - (next->w _sign_ next->cliparg)); \
+ \
+ if (alpha >= 0.0 && alpha <= 1.0) { \
+ vxheapnext->interp(cur, next, (float)alpha); \
+ vxheapnext->cliparg = -(_sign_ vxheapnext->w); \
+ *dst++ = vxheapnext++; \
+ } \
+ } \
+ ++src; \
+ } \
+ *dst = NULL; \
+ if (dst < vxnextheap+3) return NULL; \
+ src = vxlastheap; vxlastheap = vxnextheap; vxnextheap = src; \
+ }
+
+
+ DOCLIP(Far, -, z);
+ DOCLIP(Near, +, z);
+ DOCLIP(Bottom, -, y);
+ DOCLIP(Top, +, y);
+ DOCLIP(Right, -, x);
+ DOCLIP(Left, +, x);
+
+#undef DOCLIP
+
+ return vxlastheap;
+ }
+
+ void RenderClippedTri(VDPixmap& dst, const VDPixmap *const *pSources, int nMipmaps,
+ const VDTriBltTransformedVertex *vx0,
+ const VDTriBltTransformedVertex *vx1,
+ const VDTriBltTransformedVertex *vx2,
+ VDTriBltFilterMode filterMode,
+ float mipMapLODBias,
+ int orflags)
+ {
+
+ VDTriBltTransformedVertex *vxheapnext;
+ VDTriBltTransformedVertex vxheap[21];
+
+ VDTriBltTransformedVertex *vxheapptr[2][19];
+ VDTriBltTransformedVertex **vxlastheap = vxheapptr[0], **vxnextheap = vxheapptr[1];
+
+ vxheap[0] = *vx0;
+ vxheap[1] = *vx1;
+ vxheap[2] = *vx2;
+
+ vxlastheap[0] = &vxheap[0];
+ vxlastheap[1] = &vxheap[1];
+ vxlastheap[2] = &vxheap[2];
+ vxlastheap[3] = NULL;
+
+ vxheapnext = vxheap + 3;
+
+ // Current Next Action
+ // ------- ---- ------
+ // Unclipped Unclipped Copy vertex
+ // Unclipped Clipped Copy vertex and add intersection
+ // Clipped Unclipped Add intersection
+ // Clipped Clipped No action
+
+#define DOCLIP(cliptype, _sign_, cliparg) \
+ if (orflags & k##cliptype) { \
+ VDTriBltTransformedVertex **src = vxlastheap; \
+ VDTriBltTransformedVertex **dst = vxnextheap; \
+ \
+ while(*src) { \
+ VDTriBltTransformedVertex *cur = *src; \
+ VDTriBltTransformedVertex *next = src[1]; \
+ \
+ if (!next) \
+ next = vxlastheap[0]; \
+ \
+ if (!(cur->outcode & k##cliptype)) \
+ *dst++ = cur; \
+ \
+ if ((cur->outcode ^ next->outcode) & k##cliptype) { \
+ double alpha = (cur->w _sign_ cur->cliparg) / ((cur->w _sign_ cur->cliparg) - (next->w _sign_ next->cliparg)); \
+ \
+ if (alpha >= 0.0 && alpha <= 1.0) { \
+ vxheapnext->interp(cur, next, (float)alpha); \
+ vxheapnext->cliparg = -(_sign_ vxheapnext->w); \
+ *dst++ = vxheapnext++; \
+ } \
+ } \
+ ++src; \
+ } \
+ *dst = NULL; \
+ if (dst < vxnextheap+3) return; \
+ src = vxlastheap; vxlastheap = vxnextheap; vxnextheap = src; \
+ }
+
+
+ DOCLIP(Far, -, z);
+ DOCLIP(Near, +, z);
+ DOCLIP(Bottom, -, y);
+ DOCLIP(Top, +, y);
+ DOCLIP(Right, -, x);
+ DOCLIP(Left, +, x);
+
+#undef DOCLIP
+
+ VDTriBltTransformedVertex **src = vxlastheap+1;
+
+ while(src[1]) {
+ RenderTri(dst, pSources, nMipmaps, vxlastheap[0], src[0], src[1], filterMode, mipMapLODBias);
+ ++src;
+ }
+ }
+
+}
+
+bool VDPixmapTriFill(VDPixmap& dst, const uint32 c, const VDTriBltVertex *pVertices, int nVertices, const int *pIndices, int nIndices, const float pTransform[16]) {
+ if (dst.format != nsVDPixmap::kPixFormat_XRGB8888)
+ return false;
+
+ static const float xf_ident[16]={1.f,0.f,0.f,0.f,0.f,1.f,0.f,0.f,0.f,0.f,1.f,0.f,0.f,0.f,0.f,1.f};
+ vdfastvector<VDTriBltTransformedVertex> xverts(nVertices);
+
+ if (!pTransform)
+ pTransform = xf_ident;
+
+ TransformVerts(xverts.data(), pVertices, nVertices, pTransform);
+
+ const VDTriBltTransformedVertex *xsrc = xverts.data();
+
+ VDTriClipWorkspace clipws;
+
+ while(nIndices >= 3) {
+ const int idx0 = pIndices[0];
+ const int idx1 = pIndices[1];
+ const int idx2 = pIndices[2];
+ const VDTriBltTransformedVertex *xv0 = &xsrc[idx0];
+ const VDTriBltTransformedVertex *xv1 = &xsrc[idx1];
+ const VDTriBltTransformedVertex *xv2 = &xsrc[idx2];
+ const int kode0 = xv0->outcode;
+ const int kode1 = xv1->outcode;
+ const int kode2 = xv2->outcode;
+
+ if (!(kode0 & kode1 & kode2)) {
+ if (int orflags = kode0 | kode1 | kode2) {
+ VDTriBltTransformedVertex **src = VDClipTriangle(clipws, xv0, xv1, xv2, orflags);
+
+ if (src) {
+ VDTriBltTransformedVertex *src0 = *src++;
+
+ // fan out triangles
+ while(src[1]) {
+ FillTri(dst, c, src0, src[0], src[1]);
+ ++src;
+ }
+ }
+ } else
+ FillTri(dst, c, xv0, xv1, xv2);
+ }
+
+ pIndices += 3;
+ nIndices -= 3;
+ }
+
+ return true;
+}
+
+bool VDPixmapTriFill(VDPixmap& dst, const VDTriColorVertex *pVertices, int nVertices, const int *pIndices, int nIndices, const float pTransform[16]) {
+ VDPixmap pxY;
+ VDPixmap pxCb;
+ VDPixmap pxCr;
+ bool ycbcr = false;
+ float ycbcr_xoffset = 0;
+
+ switch(dst.format) {
+ case nsVDPixmap::kPixFormat_XRGB8888:
+ case nsVDPixmap::kPixFormat_Y8:
+ break;
+ case nsVDPixmap::kPixFormat_YUV444_Planar:
+ case nsVDPixmap::kPixFormat_YUV422_Planar:
+ case nsVDPixmap::kPixFormat_YUV420_Planar:
+ case nsVDPixmap::kPixFormat_YUV410_Planar:
+ pxY.format = nsVDPixmap::kPixFormat_Y8;
+ pxY.data = dst.data;
+ pxY.pitch = dst.pitch;
+ pxY.w = dst.w;
+ pxY.h = dst.h;
+
+ pxCb.format = nsVDPixmap::kPixFormat_Y8;
+ pxCb.data = dst.data2;
+ pxCb.pitch = dst.pitch2;
+ pxCb.h = dst.h;
+
+ pxCr.format = nsVDPixmap::kPixFormat_Y8;
+ pxCr.data = dst.data3;
+ pxCr.pitch = dst.pitch3;
+ pxCr.h = dst.h;
+
+ if (dst.format == nsVDPixmap::kPixFormat_YUV410_Planar) {
+ pxCr.w = pxCb.w = dst.w >> 2;
+ pxCr.h = pxCb.h = dst.h >> 2;
+ ycbcr_xoffset = 0.75f / (float)pxCr.w;
+ } else if (dst.format == nsVDPixmap::kPixFormat_YUV420_Planar) {
+ pxCr.w = pxCb.w = dst.w >> 1;
+ pxCr.h = pxCb.h = dst.h >> 1;
+ ycbcr_xoffset = 0.5f / (float)pxCr.w;
+ } else if (dst.format == nsVDPixmap::kPixFormat_YUV422_Planar) {
+ pxCr.w = pxCb.w = dst.w >> 1;
+ ycbcr_xoffset = 0.5f / (float)pxCr.w;
+ } else if (dst.format == nsVDPixmap::kPixFormat_YUV444_Planar) {
+ pxCr.w = pxCb.w = dst.w;
+ ycbcr_xoffset = 0.0f;
+ }
+
+ ycbcr = true;
+ break;
+ default:
+ return false;
+ }
+
+ VDTriBltTransformedVertex fastxverts[64];
+ vdfastvector<VDTriBltTransformedVertex> xverts;
+
+ VDTriBltTransformedVertex *xsrc;
+ if (nVertices <= 64) {
+ xsrc = fastxverts;
+ } else {
+ xverts.resize(nVertices);
+ xsrc = xverts.data();
+ }
+
+ static const float xf_ident[16]={1.f,0.f,0.f,0.f,0.f,1.f,0.f,0.f,0.f,0.f,1.f,0.f,0.f,0.f,0.f,1.f};
+ if (!pTransform)
+ pTransform = xf_ident;
+
+ VDTriClipWorkspace clipws;
+ for(int plane=0; plane<(ycbcr?3:1); ++plane) {
+ VDPixmap& pxPlane = ycbcr ? plane == 0 ? pxY : plane == 1 ? pxCb : pxCr : dst;
+
+ if (ycbcr && plane) {
+ float xf_ycbcr[16];
+ memcpy(xf_ycbcr, pTransform, sizeof(float) * 16);
+
+ // translate in x by ycbcr_xoffset
+ xf_ycbcr[0] += xf_ycbcr[12]*ycbcr_xoffset;
+ xf_ycbcr[1] += xf_ycbcr[13]*ycbcr_xoffset;
+ xf_ycbcr[2] += xf_ycbcr[14]*ycbcr_xoffset;
+ xf_ycbcr[3] += xf_ycbcr[15]*ycbcr_xoffset;
+
+ TransformVerts(xsrc, pVertices, nVertices, xf_ycbcr);
+
+ switch(plane) {
+ case 1:
+ for(int i=0; i<nVertices; ++i)
+ xsrc[i].g = xsrc[i].b;
+ break;
+ case 2:
+ for(int i=0; i<nVertices; ++i)
+ xsrc[i].g = xsrc[i].r;
+ break;
+ }
+ } else {
+ TransformVerts(xsrc, pVertices, nVertices, pTransform);
+ }
+
+ const int *nextIndex = pIndices;
+ int indicesLeft = nIndices;
+ while(indicesLeft >= 3) {
+ const int idx0 = nextIndex[0];
+ const int idx1 = nextIndex[1];
+ const int idx2 = nextIndex[2];
+ const VDTriBltTransformedVertex *xv0 = &xsrc[idx0];
+ const VDTriBltTransformedVertex *xv1 = &xsrc[idx1];
+ const VDTriBltTransformedVertex *xv2 = &xsrc[idx2];
+ const int kode0 = xv0->outcode;
+ const int kode1 = xv1->outcode;
+ const int kode2 = xv2->outcode;
+
+ if (!(kode0 & kode1 & kode2)) {
+ if (int orflags = kode0 | kode1 | kode2) {
+ VDTriBltTransformedVertex **src = VDClipTriangle(clipws, xv0, xv1, xv2, orflags);
+
+ if (src) {
+ VDTriBltTransformedVertex *src0 = *src++;
+
+ // fan out triangles
+ while(src[1]) {
+ FillTriGrad(pxPlane, src0, src[0], src[1]);
+ ++src;
+ }
+ }
+ } else {
+ FillTriGrad(pxPlane, xv0, xv1, xv2);
+ }
+ }
+
+ nextIndex += 3;
+ indicesLeft -= 3;
+ }
+ }
+
+ return true;
+}
+
+bool VDPixmapTriBlt(VDPixmap& dst, const VDPixmap *const *pSources, int nMipmaps,
+ const VDTriBltVertex *pVertices, int nVertices,
+ const int *pIndices, int nIndices,
+ VDTriBltFilterMode filterMode,
+ float mipMapLODBias,
+ const float pTransform[16])
+{
+ if (dst.format != nsVDPixmap::kPixFormat_XRGB8888)
+ return false;
+
+ static const float xf_ident[16]={1.f,0.f,0.f,0.f,0.f,1.f,0.f,0.f,0.f,0.f,1.f,0.f,0.f,0.f,0.f,1.f};
+ vdfastvector<VDTriBltTransformedVertex> xverts(nVertices);
+
+ if (!pTransform)
+ pTransform = xf_ident;
+
+ TransformVerts(xverts.data(), pVertices, nVertices, pTransform);
+
+ const VDTriBltTransformedVertex *xsrc = xverts.data();
+
+ VDTriClipWorkspace clipws;
+
+ while(nIndices >= 3) {
+ const int idx0 = pIndices[0];
+ const int idx1 = pIndices[1];
+ const int idx2 = pIndices[2];
+ const VDTriBltTransformedVertex *xv0 = &xsrc[idx0];
+ const VDTriBltTransformedVertex *xv1 = &xsrc[idx1];
+ const VDTriBltTransformedVertex *xv2 = &xsrc[idx2];
+ const int kode0 = xv0->outcode;
+ const int kode1 = xv1->outcode;
+ const int kode2 = xv2->outcode;
+
+ if (!(kode0 & kode1 & kode2)) {
+ if (int orflags = kode0 | kode1 | kode2) {
+ VDTriBltTransformedVertex **src = VDClipTriangle(clipws, xv0, xv1, xv2, orflags);
+
+ if (src) {
+ VDTriBltTransformedVertex *src0 = *src++;
+
+ // fan out triangles
+ while(src[1]) {
+ RenderTri(dst, pSources, nMipmaps, src0, src[0], src[1], filterMode, mipMapLODBias);
+ ++src;
+ }
+ }
+ } else
+ RenderTri(dst, pSources, nMipmaps, xv0, xv1, xv2, filterMode, mipMapLODBias);
+ }
+
+ pIndices += 3;
+ nIndices -= 3;
+ }
+
+ return true;
+}
+
+///////////////////////////////////////////////////////////////////////////
+
+void VDPixmapSetTextureBorders(VDPixmap& px, bool wrap) {
+ const int w = px.w;
+ const int h = px.h;
+
+ VDPixmapBlt(px, 0, 1, px, wrap ? w-2 : 1, 1, 1, h-2);
+ VDPixmapBlt(px, w-1, 1, px, wrap ? 1 : w-2, 1, 1, h-2);
+
+ VDPixmapBlt(px, 0, 0, px, 0, wrap ? h-2 : 1, w, 1);
+ VDPixmapBlt(px, 0, h-1, px, 0, wrap ? 1 : h-2, w, 1);
+}
+
+void VDPixmapSetTextureBordersCubic(VDPixmap& px) {
+ const int w = px.w;
+ const int h = px.h;
+
+ VDPixmapBlt(px, 0, 1, px, 2, 1, 1, h-2);
+ VDPixmapBlt(px, 1, 1, px, 2, 1, 1, h-2);
+ VDPixmapBlt(px, w-2, 1, px, w-3, 1, 1, h-2);
+ VDPixmapBlt(px, w-1, 1, px, w-3, 1, 1, h-2);
+
+ VDPixmapBlt(px, 0, 0, px, 0, 2, w, 1);
+ VDPixmapBlt(px, 0, 1, px, 0, 2, w, 1);
+ VDPixmapBlt(px, 0, h-2, px, 0, h-3, w, 1);
+ VDPixmapBlt(px, 0, h-1, px, 0, h-3, w, 1);
+}
+
+///////////////////////////////////////////////////////////////////////////
+
+VDPixmapTextureMipmapChain::VDPixmapTextureMipmapChain(const VDPixmap& src, bool wrap, bool cubic, int maxlevels) {
+ int w = src.w;
+ int h = src.h;
+ int mipcount = 0;
+
+ while((w>1 || h>1) && maxlevels--) {
+ ++mipcount;
+ w >>= 1;
+ h >>= 1;
+ }
+
+ mBuffers.resize(mipcount);
+ mMipMaps.resize(mipcount);
+
+ vdautoptr<IVDPixmapResampler> r(VDCreatePixmapResampler());
+ r->SetFilters(IVDPixmapResampler::kFilterLinear, IVDPixmapResampler::kFilterLinear, false);
+
+ float fw = (float)src.w;
+ float fh = (float)src.h;
+ for(int mip=0; mip<mipcount; ++mip) {
+ const int mipw = VDCeilToInt(fw);
+ const int miph = VDCeilToInt(fh);
+
+ mMipMaps[mip] = &mBuffers[mip];
+
+ if (cubic) {
+ mBuffers[mip].init(mipw+4, miph+4, nsVDPixmap::kPixFormat_XRGB8888);
+
+ if (!mip) {
+ VDPixmapBlt(mBuffers[0], 2, 2, src, 0, 0, src.w, src.h);
+ VDPixmapSetTextureBordersCubic(mBuffers[0]);
+ } else {
+ const VDPixmap& curmip = mBuffers[mip];
+ const VDPixmap& prevmip = mBuffers[mip-1];
+
+ vdrect32f rdst( 0.0f, 0.0f, (float)curmip.w , (float)curmip.h );
+ vdrect32f rsrc(-2.0f, -2.0f, 2.0f*(float)curmip.w - 2.0f, 2.0f*(float)curmip.h - 2.0f);
+ r->Init(rdst, curmip.w, curmip.h, curmip.format, rsrc, prevmip.w, prevmip.h, prevmip.format);
+ r->Process(curmip, prevmip);
+ }
+ } else {
+ mBuffers[mip].init(mipw+2, miph+2, nsVDPixmap::kPixFormat_XRGB8888);
+
+ if (!mip) {
+ VDPixmapBlt(mBuffers[0], 1, 1, src, 0, 0, src.w, src.h);
+ VDPixmapSetTextureBorders(mBuffers[0], wrap);
+ } else {
+ const VDPixmap& curmip = mBuffers[mip];
+ const VDPixmap& prevmip = mBuffers[mip-1];
+
+ vdrect32f rdst( 0.0f, 0.0f, (float)curmip.w , (float)curmip.h );
+ vdrect32f rsrc(-1.0f, -1.0f, 2.0f*(float)curmip.w - 1.0f, 2.0f*(float)curmip.h - 1.0f);
+ r->Init(rdst, curmip.w, curmip.h, curmip.format, rsrc, prevmip.w, prevmip.h, prevmip.format);
+ r->Process(curmip, prevmip);
+ }
+ }
+
+ fw *= 0.5f;
+ fh *= 0.5f;
+ }
+}
+
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/uberblit.cpp b/src/thirdparty/VirtualDub/Kasumi/source/uberblit.cpp
new file mode 100644
index 000000000..6dc1b4334
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/uberblit.cpp
@@ -0,0 +1,903 @@
+#include <vd2/Kasumi/pixmap.h>
+#include <vd2/Kasumi/pixmaputils.h>
+#include "uberblit.h"
+#include "uberblit_gen.h"
+
+uint32 VDPixmapGetFormatTokenFromFormat(int format) {
+ using namespace nsVDPixmap;
+ switch(format) {
+ case kPixFormat_Pal1: return kVDPixType_1 | kVDPixSamp_444 | kVDPixSpace_Pal;
+ case kPixFormat_Pal2: return kVDPixType_2 | kVDPixSamp_444 | kVDPixSpace_Pal;
+ case kPixFormat_Pal4: return kVDPixType_4 | kVDPixSamp_444 | kVDPixSpace_Pal;
+ case kPixFormat_Pal8: return kVDPixType_8 | kVDPixSamp_444 | kVDPixSpace_Pal;
+ case kPixFormat_XRGB1555: return kVDPixType_1555_LE | kVDPixSamp_444 | kVDPixSpace_BGR;
+ case kPixFormat_RGB565: return kVDPixType_565_LE | kVDPixSamp_444 | kVDPixSpace_BGR;
+ case kPixFormat_RGB888: return kVDPixType_888 | kVDPixSamp_444 | kVDPixSpace_BGR;
+ case kPixFormat_XRGB8888: return kVDPixType_8888 | kVDPixSamp_444 | kVDPixSpace_BGR;
+ case kPixFormat_Y8: return kVDPixType_8 | kVDPixSamp_444 | kVDPixSpace_Y_601;
+ case kPixFormat_YUV422_UYVY: return kVDPixType_B8G8_R8G8 | kVDPixSamp_422 | kVDPixSpace_YCC_601;
+ case kPixFormat_YUV422_YUYV: return kVDPixType_G8B8_G8R8 | kVDPixSamp_422 | kVDPixSpace_YCC_601;
+ case kPixFormat_YUV444_XVYU: return kVDPixType_8888 | kVDPixSamp_444 | kVDPixSpace_YCC_601;
+ case kPixFormat_YUV444_Planar: return kVDPixType_8_8_8 | kVDPixSamp_444 | kVDPixSpace_YCC_601;
+ case kPixFormat_YUV422_Planar: return kVDPixType_8_8_8 | kVDPixSamp_422 | kVDPixSpace_YCC_601;
+ case kPixFormat_YUV422_Planar_16F: return kVDPixType_16F_16F_16F_LE | kVDPixSamp_422 | kVDPixSpace_YCC_601;
+ case kPixFormat_YUV420_Planar: return kVDPixType_8_8_8 | kVDPixSamp_420_MPEG2 | kVDPixSpace_YCC_601;
+ case kPixFormat_YUV411_Planar: return kVDPixType_8_8_8 | kVDPixSamp_411 | kVDPixSpace_YCC_601;
+ case kPixFormat_YUV410_Planar: return kVDPixType_8_8_8 | kVDPixSamp_410 | kVDPixSpace_YCC_601;
+ case kPixFormat_YUV422_Planar_Centered: return kVDPixType_8_8_8 | kVDPixSamp_422_JPEG | kVDPixSpace_YCC_601;
+ case kPixFormat_YUV420_Planar_Centered: return kVDPixType_8_8_8 | kVDPixSamp_420_MPEG1 | kVDPixSpace_YCC_601;
+ case kPixFormat_YUV422_V210: return kVDPixType_V210 | kVDPixSamp_422 | kVDPixSpace_YCC_601;
+ case kPixFormat_YUV422_UYVY_709: return kVDPixType_B8G8_R8G8 | kVDPixSamp_422 | kVDPixSpace_YCC_709;
+ case kPixFormat_YUV420_NV12: return kVDPixType_8_B8R8 | kVDPixSamp_420_MPEG2 | kVDPixSpace_YCC_601;
+ default:
+ VDASSERT(false);
+ return 0;
+ }
+}
+
+const VDPixmapSamplingInfo& VDPixmapGetSamplingInfo(uint32 samplingToken) {
+ static const VDPixmapSamplingInfo kPixmapSamplingInfo[]={
+ /* Null */ { 0, 0, 0, 0, 0 },
+ /* 444 */ { 0, 0, 0, 0, 0 },
+ /* 422 */ { -4, 0, 0, 1, 0 },
+ /* 422_JPEG */ { 0, 0, 0, 1, 0 },
+ /* 420_MPEG2 */ { -4, 0, 0, 1, 1 },
+ /* 420_MPEG2INT */ { -4, 0, 0, 1, 1 },
+ /* 420_MPEG1 */ { 0, 0, 0, 1, 1 },
+ /* 420_DVPAL */ { -4, 0, 0, 1, 1 },
+ /* 411 */ { -6, 0, 0, 2, 0 },
+ /* 410 */ { -6, 0, 0, 2, 2 }
+ };
+
+ uint32 index = (samplingToken & kVDPixSamp_Mask) >> kVDPixSamp_Bits;
+
+ return index >= sizeof(kPixmapSamplingInfo)/sizeof(kPixmapSamplingInfo[0]) ? kPixmapSamplingInfo[0] : kPixmapSamplingInfo[index];
+}
+
+namespace {
+ uint32 BlitterConvertSampling(VDPixmapUberBlitterGenerator& gen, uint32 srcToken, uint32 dstSamplingToken, sint32 w, sint32 h) {
+ // if the source type is 16F, we have to convert to 32F
+ if ((srcToken & kVDPixType_Mask) == kVDPixType_16F_16F_16F_LE) {
+ // 0 1 2
+ gen.conv_16F_to_32F();
+ gen.swap(1);
+ // 1 0 2
+ gen.conv_16F_to_32F();
+ gen.swap(2);
+ // 2 0 1
+ gen.conv_16F_to_32F();
+ gen.swap(2);
+ gen.swap(1);
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_32F_32F_32F_LE;
+ }
+
+ // look up sampling info
+ const VDPixmapSamplingInfo& srcInfo = VDPixmapGetSamplingInfo(srcToken);
+ const VDPixmapSamplingInfo& dstInfo = VDPixmapGetSamplingInfo(dstSamplingToken);
+
+ // convert destination chroma origin to luma space
+ int c_x = ((8 + dstInfo.mCXOffset16) << dstInfo.mCXBits) - 8;
+ int cr_y = ((8 + dstInfo.mCrYOffset16) << dstInfo.mCYBits) - 8;
+ int cb_y = ((8 + dstInfo.mCbYOffset16) << dstInfo.mCYBits) - 8;
+
+ // convert luma chroma location to source chroma space
+ c_x = ((8 + c_x) >> srcInfo.mCXBits) - 8 - srcInfo.mCXOffset16;
+ cr_y = ((8 + cr_y) >> srcInfo.mCYBits) - 8 - srcInfo.mCrYOffset16;
+ cb_y = ((8 + cb_y) >> srcInfo.mCYBits) - 8 - srcInfo.mCbYOffset16;
+
+ float cxo = c_x / 16.0f + 0.5f;
+ float cxf = ((16 << dstInfo.mCXBits) >> srcInfo.mCXBits) / 16.0f;
+ float cyf = ((16 << dstInfo.mCYBits) >> srcInfo.mCYBits) / 16.0f;
+ sint32 cw = -(-w >> dstInfo.mCXBits);
+ sint32 ch = -(-h >> dstInfo.mCYBits);
+
+ gen.swap(2);
+ gen.linear(cxo, cxf, cw, cb_y / 16.0f + 0.5f, cyf, ch);
+ gen.swap(2);
+ gen.linear(cxo, cxf, cw, cr_y / 16.0f + 0.5f, cyf, ch);
+
+ return (srcToken & ~kVDPixSamp_Mask) | (dstSamplingToken & kVDPixSamp_Mask);
+ }
+
+ uint32 BlitterConvertType(VDPixmapUberBlitterGenerator& gen, uint32 srcToken, uint32 dstToken, sint32 w, sint32 h) {
+ uint32 dstType = dstToken & kVDPixType_Mask;
+
+ while((srcToken ^ dstToken) & kVDPixType_Mask) {
+ uint32 srcType = srcToken & kVDPixType_Mask;
+ uint32 targetType = dstType;
+
+ type_reconvert:
+ switch(targetType) {
+ case kVDPixType_1555_LE:
+ switch(srcType) {
+ case kVDPixType_565_LE:
+ gen.conv_565_to_555();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_1555_LE;
+ break;
+
+ case kVDPixType_8888:
+ gen.conv_8888_to_555();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_1555_LE;
+ break;
+ case kVDPixType_B8G8_R8G8:
+ case kVDPixType_G8B8_G8R8:
+ targetType = kVDPixType_8_8_8;
+ goto type_reconvert;
+ default:
+ targetType = kVDPixType_8888;
+ goto type_reconvert;
+ }
+ break;
+
+ case kVDPixType_565_LE:
+ switch(srcType) {
+ case kVDPixType_1555_LE:
+ gen.conv_555_to_565();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_565_LE;
+ break;
+ case kVDPixType_8888:
+ gen.conv_8888_to_565();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_565_LE;
+ break;
+ case kVDPixType_B8G8_R8G8:
+ case kVDPixType_G8B8_G8R8:
+ targetType = kVDPixType_8_8_8;
+ goto type_reconvert;
+ default:
+ targetType = kVDPixType_8888;
+ goto type_reconvert;
+ }
+ break;
+
+ case kVDPixType_888:
+ switch(srcType) {
+ case kVDPixType_8888:
+ gen.conv_8888_to_888();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_888;
+ break;
+ default:
+ targetType = kVDPixType_8888;
+ goto type_reconvert;
+ }
+ break;
+
+ case kVDPixType_8888:
+ switch(srcType) {
+ case kVDPixType_1555_LE:
+ gen.conv_555_to_8888();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8888;
+ break;
+ case kVDPixType_565_LE:
+ gen.conv_565_to_8888();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8888;
+ break;
+ case kVDPixType_888:
+ gen.conv_888_to_8888();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8888;
+ break;
+ case kVDPixType_32Fx4_LE:
+ gen.conv_X32F_to_8888();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8888;
+ break;
+ case kVDPixType_8_8_8:
+ if ((srcToken & kVDPixSamp_Mask) != kVDPixSamp_444)
+ srcToken = BlitterConvertSampling(gen, srcToken, kVDPixSamp_444, w, h);
+ gen.interleave_X8R8G8B8();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8888;
+ break;
+ default:
+ VDASSERT(false);
+ break;
+ }
+ break;
+
+ case kVDPixType_8:
+ switch(srcType) {
+ case kVDPixType_8_8_8:
+ gen.pop();
+ gen.swap(1);
+ gen.pop();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8;
+ break;
+
+ case kVDPixType_16F_LE:
+ targetType = kVDPixType_32F_LE;
+ goto type_reconvert;
+
+ case kVDPixType_32F_LE:
+ gen.conv_32F_to_8();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8;
+ break;
+
+ default:
+ targetType = kVDPixType_8_8_8;
+ goto type_reconvert;
+ }
+ break;
+
+ case kVDPixType_8_8_8:
+ switch(srcType) {
+ case kVDPixType_B8G8_R8G8:
+ gen.dup();
+ gen.dup();
+ gen.extract_8in32(2, (w + 1) >> 1, h);
+ gen.swap(2);
+ gen.extract_8in16(1, w, h);
+ gen.swap(1);
+ gen.extract_8in32(0, (w + 1) >> 1, h);
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSamp_Mask)) | kVDPixType_8_8_8 | kVDPixSamp_422;
+ break;
+ case kVDPixType_G8B8_G8R8:
+ gen.dup();
+ gen.dup();
+ gen.extract_8in32(3, (w + 1) >> 1, h);
+ gen.swap(2);
+ gen.extract_8in16(0, w, h);
+ gen.swap(1);
+ gen.extract_8in32(1, (w + 1) >> 1, h);
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSamp_Mask)) | kVDPixType_8_8_8 | kVDPixSamp_422;
+ break;
+ case kVDPixType_16F_16F_16F_LE:
+ case kVDPixType_V210:
+ targetType = kVDPixType_32F_32F_32F_LE;
+ goto type_reconvert;
+ case kVDPixType_32F_32F_32F_LE:
+ // 0 1 2
+ gen.conv_32F_to_8();
+ gen.swap(1);
+ // 1 0 2
+ gen.conv_32F_to_8();
+ gen.swap(2);
+ // 2 0 1
+ gen.conv_32F_to_8();
+ gen.swap(2);
+ gen.swap(1);
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8_8_8;
+ break;
+ case kVDPixType_8_B8R8:
+ {
+ const VDPixmapSamplingInfo& sampInfo = VDPixmapGetSamplingInfo(srcToken);
+ int cw = -(-w >> sampInfo.mCXBits);
+ int ch = -(-h >> sampInfo.mCYBits);
+
+ gen.dup();
+ gen.extract_8in16(1, cw, ch);
+ gen.swap(2);
+ gen.swap(1);
+ gen.extract_8in16(0, cw, ch);
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8_8_8;
+ }
+ break;
+ default:
+ VDASSERT(false);
+ break;
+ }
+ break;
+
+ case kVDPixType_B8G8_R8G8:
+ switch(srcType) {
+ case kVDPixType_8_8_8:
+ if ((srcToken ^ dstToken) & kVDPixSamp_Mask)
+ srcToken = BlitterConvertSampling(gen, srcToken, dstToken, w, h);
+
+ gen.interleave_B8G8_R8G8();
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSamp_Mask)) | kVDPixType_B8G8_R8G8;
+ break;
+ case kVDPixType_G8B8_G8R8:
+ gen.swap_8in16(w, h, w*2);
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSamp_Mask)) | kVDPixType_B8G8_R8G8;
+ break;
+ default:
+ targetType = kVDPixType_8_8_8;
+ goto type_reconvert;
+ }
+ break;
+
+ case kVDPixType_G8B8_G8R8:
+ switch(srcType) {
+ case kVDPixType_8_8_8:
+ if ((srcToken ^ dstToken) & kVDPixSamp_Mask)
+ srcToken = BlitterConvertSampling(gen, srcToken, dstToken, w, h);
+
+ gen.interleave_G8B8_G8R8();
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSamp_Mask)) | kVDPixType_G8B8_G8R8;
+ break;
+ case kVDPixType_B8G8_R8G8:
+ gen.swap_8in16(w, h, w*2);
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSamp_Mask)) | kVDPixType_G8B8_G8R8;
+ break;
+ default:
+ targetType = kVDPixType_8_8_8;
+ goto type_reconvert;
+ }
+ break;
+
+ case kVDPixType_16F_16F_16F_LE:
+ switch(srcType) {
+ case kVDPixType_32F_32F_32F_LE:
+ // 0 1 2
+ gen.conv_32F_to_16F();
+ gen.swap(1);
+ // 1 0 2
+ gen.conv_32F_to_16F();
+ gen.swap(2);
+ // 2 0 1
+ gen.conv_32F_to_16F();
+ gen.swap(2);
+ gen.swap(1);
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_16F_16F_16F_LE;
+ break;
+
+ default:
+ targetType = kVDPixType_32F_32F_32F_LE;
+ goto type_reconvert;
+ }
+ break;
+
+ case kVDPixType_32F_32F_32F_LE:
+ switch(srcType) {
+ case kVDPixType_8_8_8:
+ // 0 1 2
+ gen.conv_8_to_32F();
+ gen.swap(1);
+ // 1 0 2
+ gen.conv_8_to_32F();
+ gen.swap(2);
+ // 2 0 1
+ gen.conv_8_to_32F();
+ gen.swap(2);
+ gen.swap(1);
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_32F_32F_32F_LE;
+ break;
+
+ case kVDPixType_16F_16F_16F_LE:
+ // 0 1 2
+ gen.conv_16F_to_32F();
+ gen.swap(1);
+ // 1 0 2
+ gen.conv_16F_to_32F();
+ gen.swap(2);
+ // 2 0 1
+ gen.conv_16F_to_32F();
+ gen.swap(2);
+ gen.swap(1);
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_32F_32F_32F_LE;
+ break;
+
+ case kVDPixType_B8G8_R8G8:
+ case kVDPixType_G8B8_G8R8:
+ case kVDPixType_8_B8R8:
+ targetType = kVDPixType_8_8_8;
+ goto type_reconvert;
+
+ case kVDPixType_V210:
+ gen.conv_V210_to_32F();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_32F_32F_32F_LE;
+ break;
+
+ default:
+ VDASSERT(false);
+ }
+ break;
+
+ case kVDPixType_V210:
+ switch(srcType) {
+ case kVDPixType_32F_32F_32F_LE:
+ if ((srcToken & kVDPixSamp_Mask) != kVDPixSamp_422)
+ srcToken = BlitterConvertSampling(gen, srcToken, kVDPixSamp_422, w, h);
+
+ gen.conv_32F_to_V210();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_V210;
+ break;
+
+ case kVDPixType_16F_16F_16F_LE:
+ targetType = kVDPixType_32F_32F_32F_LE;
+ goto type_reconvert;
+
+ case kVDPixType_8_8_8:
+ if ((srcToken & kVDPixSamp_Mask) != kVDPixSamp_422)
+ srcToken = BlitterConvertSampling(gen, srcToken, kVDPixSamp_422, w, h);
+
+ targetType = kVDPixType_32F_32F_32F_LE;
+ goto type_reconvert;
+
+ case kVDPixType_B8G8_R8G8:
+ case kVDPixType_G8B8_G8R8:
+ case kVDPixType_8_B8R8:
+ targetType = kVDPixType_8_8_8;
+ goto type_reconvert;
+
+ default:
+ VDASSERT(false);
+ }
+ break;
+
+ case kVDPixType_32F_LE:
+ switch(srcType) {
+ case kVDPixType_8:
+ gen.conv_8_to_32F();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_32F_LE;
+ break;
+ case kVDPixType_16F_LE:
+ gen.conv_16F_to_32F();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_32F_LE;
+ break;
+ default:
+ VDASSERT(false);
+ }
+ break;
+
+ case kVDPixType_8_B8R8:
+ switch(srcType) {
+ case kVDPixType_8_8_8:
+ gen.swap(1);
+ gen.swap(2);
+ gen.interleave_B8R8();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8_B8R8;
+ break;
+ default:
+ VDASSERT(false);
+ break;
+ }
+ break;
+
+ default:
+ VDASSERT(false);
+ break;
+ }
+ }
+
+ return srcToken;
+ }
+}
+
+IVDPixmapBlitter *VDPixmapCreateBlitter(const VDPixmap& dst, const VDPixmap& src) {
+ const VDPixmapLayout& dstlayout = VDPixmapToLayoutFromBase(dst, dst.data);
+ const VDPixmapLayout& srclayout = VDPixmapToLayoutFromBase(src, src.data);
+
+ return VDPixmapCreateBlitter(dstlayout, srclayout);
+}
+
+IVDPixmapBlitter *VDPixmapCreateBlitter(const VDPixmapLayout& dst, const VDPixmapLayout& src) {
+ if (src.format == dst.format) {
+ return VDCreatePixmapUberBlitterDirectCopy(dst, src);
+ }
+
+ uint32 srcToken = VDPixmapGetFormatTokenFromFormat(src.format);
+ uint32 dstToken = VDPixmapGetFormatTokenFromFormat(dst.format);
+
+ VDPixmapUberBlitterGenerator gen;
+
+ // load source channels
+ int w = src.w;
+ int h = src.h;
+
+ switch(srcToken & kVDPixType_Mask) {
+ case kVDPixType_1:
+ gen.ldsrc(0, 0, 0, 0, w, h, srcToken, (w + 7) >> 3);
+ break;
+
+ case kVDPixType_2:
+ gen.ldsrc(0, 0, 0, 0, w, h, srcToken, (w + 3) >> 2);
+ break;
+
+ case kVDPixType_4:
+ gen.ldsrc(0, 0, 0, 0, w, h, srcToken, (w + 1) >> 1);
+ break;
+
+ case kVDPixType_8:
+ gen.ldsrc(0, 0, 0, 0, w, h, srcToken, w);
+ break;
+
+ case kVDPixType_555_LE:
+ case kVDPixType_565_LE:
+ case kVDPixType_1555_LE:
+ gen.ldsrc(0, 0, 0, 0, w, h, srcToken, w*2);
+ break;
+
+ case kVDPixType_888:
+ gen.ldsrc(0, 0, 0, 0, w, h, srcToken, w*3);
+ break;
+
+ case kVDPixType_8888:
+ case kVDPixType_32F_LE:
+ gen.ldsrc(0, 0, 0, 0, w, h, srcToken, w*4);
+ break;
+
+ case kVDPixType_32Fx4_LE:
+ gen.ldsrc(0, 0, 0, 0, w, h, srcToken, w*16);
+ break;
+
+ case kVDPixType_B8G8_R8G8:
+ case kVDPixType_G8B8_G8R8:
+ gen.ldsrc(0, 0, 0, 0, w, h, srcToken, ((w + 1) & ~1)*2);
+ break;
+
+ case kVDPixType_8_8_8:
+ {
+ uint32 ytoken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8;
+ uint32 cbtoken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8;
+ uint32 crtoken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8;
+
+ const VDPixmapSamplingInfo& sampInfo = VDPixmapGetSamplingInfo(srcToken);
+
+ int cxbits = sampInfo.mCXBits;
+ int cybits = sampInfo.mCYBits;
+ int w2 = -(-w >> cxbits);
+ int h2 = -(-h >> cybits);
+ gen.ldsrc(0, 2, 0, 0, w2, h2, cbtoken, w2);
+ gen.ldsrc(0, 0, 0, 0, w, h, srcToken, w);
+ gen.ldsrc(0, 1, 0, 0, w2, h2, crtoken, w2);
+ }
+ break;
+
+ case kVDPixType_16F_16F_16F_LE:
+ {
+ uint32 ytoken = (srcToken & ~kVDPixType_Mask) | kVDPixType_16F_LE;
+ uint32 cbtoken = (srcToken & ~kVDPixType_Mask) | kVDPixType_16F_LE;
+ uint32 crtoken = (srcToken & ~kVDPixType_Mask) | kVDPixType_16F_LE;
+
+ const VDPixmapSamplingInfo& sampInfo = VDPixmapGetSamplingInfo(srcToken);
+
+ int cxbits = sampInfo.mCXBits;
+ int cybits = sampInfo.mCYBits;
+ int w2 = -(-w >> cxbits);
+ int h2 = -(-h >> cybits);
+ gen.ldsrc(0, 2, 0, 0, w2, h2, cbtoken, w2 * 2);
+ gen.ldsrc(0, 0, 0, 0, w, h, srcToken, w*2);
+ gen.ldsrc(0, 1, 0, 0, w2, h2, crtoken, w2 * 2);
+ }
+ break;
+
+ case kVDPixType_32F_32F_32F_LE:
+ {
+ uint32 ytoken = (srcToken & ~kVDPixType_Mask) | kVDPixType_32F_LE;
+ uint32 cbtoken = (srcToken & ~kVDPixType_Mask) | kVDPixType_32F_LE;
+ uint32 crtoken = (srcToken & ~kVDPixType_Mask) | kVDPixType_32F_LE;
+
+ const VDPixmapSamplingInfo& sampInfo = VDPixmapGetSamplingInfo(srcToken);
+
+ int cxbits = sampInfo.mCXBits;
+ int cybits = sampInfo.mCYBits;
+ int w2 = -(-w >> cxbits);
+ int h2 = -(-h >> cybits);
+ gen.ldsrc(0, 2, 0, 0, w2, h2, cbtoken, w2 * 4);
+ gen.ldsrc(0, 0, 0, 0, w, h, srcToken, w*4);
+ gen.ldsrc(0, 1, 0, 0, w2, h2, crtoken, w2 * 4);
+ }
+ break;
+
+ case kVDPixType_V210:
+ gen.ldsrc(0, 0, 0, 0, w, h, srcToken, ((w + 5) / 6) * 4);
+ break;
+
+ case kVDPixType_8_B8R8:
+ {
+ uint32 ytoken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8;
+ uint32 ctoken = (srcToken & ~kVDPixType_Mask) | kVDPixType_B8R8;
+
+ const VDPixmapSamplingInfo& sampInfo = VDPixmapGetSamplingInfo(srcToken);
+
+ int cxbits = sampInfo.mCXBits;
+ int cybits = sampInfo.mCYBits;
+ int w2 = -(-w >> cxbits);
+ int h2 = -(-h >> cybits);
+ gen.ldsrc(0, 0, 0, 0, w, h, srcToken, w);
+ gen.ldsrc(0, 1, 0, 0, w2, h2, ctoken, w2*2);
+ }
+ break;
+
+ default:
+ VDASSERT(false);
+ }
+
+ // check if we need a color space change
+ if ((srcToken ^ dstToken) & kVDPixSpace_Mask) {
+ // first, if we're dealing with an interleaved format, deinterleave it
+ switch(srcToken & kVDPixType_Mask) {
+ case kVDPixType_B8G8_R8G8:
+ gen.dup();
+ gen.dup();
+ gen.extract_8in32(2, (w + 1) >> 1, h);
+ gen.swap(2);
+ gen.extract_8in16(1, w, h);
+ gen.swap(1);
+ gen.extract_8in32(0, (w + 1) >> 1, h);
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8_8_8;
+ break;
+
+ case kVDPixType_G8B8_G8R8:
+ gen.dup();
+ gen.dup();
+ gen.extract_8in32(3, (w + 1) >> 1, h);
+ gen.swap(2);
+ gen.extract_8in16(0, w, h);
+ gen.swap(1);
+ gen.extract_8in32(1, (w + 1) >> 1, h);
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8_8_8;
+ break;
+
+ case kVDPixType_8_B8R8:
+ gen.dup();
+ gen.extract_8in16(1, (w + 1) >> 1, (h + 1) >> 1);
+ gen.swap(2);
+ gen.swap(1);
+ gen.extract_8in16(0, (w + 1) >> 1, (h + 1) >> 1);
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8_8_8;
+ break;
+
+ case kVDPixType_V210:
+ gen.conv_V210_to_32F();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_32F_32F_32F_LE;
+ break;
+ }
+
+ // if the source is subsampled, converge on 4:4:4 subsampling, but only if we actually need
+ // the auxiliary channels
+ const VDPixmapSamplingInfo& sampInfo = VDPixmapGetSamplingInfo(srcToken);
+
+ if ((dstToken & kVDPixSpace_Mask) != kVDPixSpace_Y_601 && (dstToken & kVDPixSpace_Mask) != kVDPixSpace_Y_709) {
+ if (sampInfo.mCXBits | sampInfo.mCYBits | sampInfo.mCXOffset16 | sampInfo.mCbYOffset16 | sampInfo.mCrYOffset16)
+ srcToken = BlitterConvertSampling(gen, srcToken, kVDPixSamp_444, w, h);
+ }
+
+ // change color spaces
+ uint32 dstSpace = dstToken & kVDPixSpace_Mask;
+ while((srcToken ^ dstToken) & kVDPixSpace_Mask) {
+ uint32 srcSpace = srcToken & kVDPixSpace_Mask;
+ uint32 targetSpace = dstSpace;
+
+space_reconvert:
+ switch(targetSpace) {
+ case kVDPixSpace_BGR:
+ switch(srcSpace) {
+ case kVDPixSpace_YCC_709:
+ switch(srcToken & kVDPixType_Mask) {
+ case kVDPixType_8_8_8:
+ gen.ycbcr709_to_rgb32();
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_BGR | kVDPixType_8888;
+ break;
+
+ case kVDPixType_16F_16F_16F_LE:
+ srcToken = BlitterConvertType(gen, srcToken, kVDPixType_32F_32F_32F_LE, w, h);
+ gen.ycbcr709_to_rgb32_32f();
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_BGR | kVDPixType_32Fx4_LE;
+ break;
+
+ case kVDPixType_32F_32F_32F_LE:
+ gen.ycbcr709_to_rgb32_32f();
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_BGR | kVDPixType_32Fx4_LE;
+ break;
+
+ default:
+ VDASSERT(false);
+ break;
+ }
+ break;
+
+ case kVDPixSpace_YCC_601:
+ switch(srcToken & kVDPixType_Mask) {
+ case kVDPixType_8_8_8:
+ gen.ycbcr601_to_rgb32();
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_BGR | kVDPixType_8888;
+ break;
+
+ case kVDPixType_16F_16F_16F_LE:
+ srcToken = BlitterConvertType(gen, srcToken, kVDPixType_32F_32F_32F_LE, w, h);
+ gen.ycbcr601_to_rgb32_32f();
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_BGR | kVDPixType_32Fx4_LE;
+ break;
+
+ case kVDPixType_32F_32F_32F_LE:
+ gen.ycbcr601_to_rgb32_32f();
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_BGR | kVDPixType_32Fx4_LE;
+ break;
+
+ default:
+ VDASSERT(false);
+ break;
+ }
+ break;
+
+ case kVDPixSpace_Y_601:
+ targetSpace = kVDPixSpace_YCC_601;
+ goto space_reconvert;
+
+ case kVDPixSpace_Pal:
+ switch(srcToken & kVDPixType_Mask) {
+ case kVDPixType_1:
+ gen.conv_Pal1_to_8888(0);
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_BGR | kVDPixType_8888;
+ break;
+
+ case kVDPixType_2:
+ gen.conv_Pal2_to_8888(0);
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_BGR | kVDPixType_8888;
+ break;
+
+ case kVDPixType_4:
+ gen.conv_Pal4_to_8888(0);
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_BGR | kVDPixType_8888;
+ break;
+
+ case kVDPixType_8:
+ gen.conv_Pal8_to_8888(0);
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_BGR | kVDPixType_8888;
+ break;
+
+ default:
+ VDASSERT(false);
+ break;
+ }
+ break;
+
+ default:
+ VDASSERT(false);
+ break;
+ }
+ break;
+ case kVDPixSpace_Y_601:
+ if (srcSpace == kVDPixSpace_YCC_601) {
+ gen.pop();
+ gen.swap(1);
+ gen.pop();
+ switch(srcToken & kVDPixType_Mask) {
+ case kVDPixType_32F_32F_32F_LE:
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_Y_601 | kVDPixType_32F_LE;
+ break;
+ case kVDPixType_16F_16F_16F_LE:
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_Y_601 | kVDPixType_16F_LE;
+ break;
+ case kVDPixType_8_8_8:
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_Y_601 | kVDPixType_8;
+ break;
+
+ default:
+ VDASSERT(false);
+ }
+ srcToken = BlitterConvertType(gen, srcToken, kVDPixType_8, w, h);
+ break;
+ } else if (srcSpace == kVDPixSpace_YCC_709) {
+ gen.pop();
+ gen.swap(1);
+ gen.pop();
+ switch(srcToken & kVDPixType_Mask) {
+ case kVDPixType_32F_32F_32F_LE:
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_Y_709 | kVDPixType_32F_LE;
+ break;
+ case kVDPixType_16F_16F_16F_LE:
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_Y_709 | kVDPixType_16F_LE;
+ break;
+ case kVDPixType_8_8_8:
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_Y_709 | kVDPixType_8;
+ break;
+
+ default:
+ VDASSERT(false);
+ }
+ srcToken = BlitterConvertType(gen, srcToken, kVDPixType_8, w, h);
+ break;
+ }
+ // fall through
+ case kVDPixSpace_YCC_601:
+ switch(srcSpace) {
+ case kVDPixSpace_BGR:
+ srcToken = BlitterConvertType(gen, srcToken, kVDPixType_8888, w, h);
+ gen.rgb32_to_ycbcr601();
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_YCC_601 | kVDPixType_8_8_8;
+ break;
+ case kVDPixSpace_Y_601:
+ case kVDPixSpace_Y_709:
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_YCC_601 | kVDPixType_8;
+
+ {
+ const VDPixmapSamplingInfo& sinfo = VDPixmapGetSamplingInfo(dstToken);
+ int cw = ((w - 1) >> sinfo.mCXBits) + 1;
+ int ch = ((h - 1) >> sinfo.mCYBits) + 1;
+
+ gen.ldconst(0x80, cw, cw, ch, srcToken);
+ }
+
+ gen.dup();
+ gen.swap(2);
+ gen.swap(1);
+ srcToken = kVDPixSpace_YCC_601 | kVDPixType_8_8_8 | (dstToken & kVDPixSamp_Mask);
+ break;
+ case kVDPixSpace_YCC_709:
+ VDASSERT((srcToken & kVDPixType_Mask) == kVDPixType_8_8_8);
+ gen.ycbcr709_to_ycbcr601();
+ srcToken = (srcToken & ~kVDPixSpace_Mask) | kVDPixSpace_YCC_601;
+ break;
+
+ case kVDPixSpace_Pal:
+ targetSpace = kVDPixSpace_BGR;
+ goto space_reconvert;
+
+ default:
+ VDASSERT(false);
+ break;
+ }
+ break;
+ case kVDPixSpace_YCC_709:
+ switch(srcSpace) {
+ case kVDPixSpace_BGR:
+ srcToken = BlitterConvertType(gen, srcToken, kVDPixType_8888, w, h);
+ gen.rgb32_to_ycbcr709();
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_YCC_709 | kVDPixType_8_8_8;
+ break;
+ case kVDPixSpace_Y_709:
+ case kVDPixSpace_Y_601:
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_YCC_709 | kVDPixType_8;
+
+ {
+ const VDPixmapSamplingInfo& sinfo = VDPixmapGetSamplingInfo(dstToken);
+ int cw = ((w - 1) >> sinfo.mCXBits) + 1;
+ int ch = ((h - 1) >> sinfo.mCYBits) + 1;
+
+ gen.ldconst(0x80, cw, cw, ch, srcToken);
+ }
+
+ gen.dup();
+ gen.swap(2);
+ gen.swap(1);
+ srcToken = kVDPixSpace_YCC_709 | kVDPixType_8_8_8 | (dstToken & kVDPixSamp_Mask);
+ break;
+ case kVDPixSpace_YCC_601:
+ VDASSERT((srcToken & kVDPixType_Mask) == kVDPixType_8_8_8 || (srcToken & kVDPixType_Mask) == kVDPixType_32F_32F_32F_LE);
+ gen.ycbcr601_to_ycbcr709();
+ srcToken = (srcToken & ~kVDPixSpace_Mask) | kVDPixSpace_YCC_709;
+ break;
+ case kVDPixSpace_Pal:
+ targetSpace = kVDPixSpace_BGR;
+ goto space_reconvert;
+ default:
+ VDASSERT(false);
+ break;
+ }
+ break;
+
+ default:
+ VDASSERT(false);
+ break;
+ }
+ }
+ }
+
+ // check if we need a type change
+ //
+ // Note: If the sampling is also different, we have to be careful about what types we
+ // target. The type conversion may itself involve a sampling conversion, so things get
+ // VERY tricky here.
+ if ((srcToken ^ dstToken) & kVDPixType_Mask) {
+ bool samplingDifferent = 0 != ((srcToken ^ dstToken) & kVDPixSamp_Mask);
+ uint32 intermediateTypeToken = dstToken & kVDPixType_Mask;
+
+ if (samplingDifferent) {
+ switch(dstToken & kVDPixType_Mask) {
+ case kVDPixType_16F_16F_16F_LE:
+ intermediateTypeToken = kVDPixType_32F_32F_32F_LE;
+ break;
+ case kVDPixType_8_B8R8:
+ intermediateTypeToken = kVDPixType_8_8_8;
+ break;
+ }
+ }
+
+ srcToken = BlitterConvertType(gen, srcToken, (dstToken & ~kVDPixType_Mask) | intermediateTypeToken, w, h);
+ }
+
+ // convert subsampling if necessary
+ switch(srcToken & kVDPixType_Mask) {
+ case kVDPixType_8_8_8:
+ case kVDPixType_16F_16F_16F_LE:
+ case kVDPixType_32F_32F_32F_LE:
+ if ((srcToken ^ dstToken) & kVDPixSamp_Mask)
+ srcToken = BlitterConvertSampling(gen, srcToken, dstToken, w, h);
+ break;
+ }
+
+ // check if we need a type change (possible with 16F)
+ srcToken = BlitterConvertType(gen, srcToken, dstToken, w, h);
+
+ return gen.create();
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/uberblit_16f.cpp b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_16f.cpp
new file mode 100644
index 000000000..3e9af1a1b
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_16f.cpp
@@ -0,0 +1,40 @@
+#include <vd2/system/halffloat.h>
+#include "uberblit_16f.h"
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Row generator: converts one plane of 32-bit LE floats to 16-bit half-floats.
+void VDPixmapGen_32F_To_16F::Start() {
+	// Output row buffer holds one 16-bit half per pixel.
+	StartWindow(mWidth * sizeof(uint16));
+}
+
+uint32 VDPixmapGen_32F_To_16F::GetType(uint32 output) const {
+	// Same space/sampling as the source; only the type field is rewritten.
+	return (mpSrc->GetType(mSrcIndex) & ~kVDPixType_Mask) | kVDPixType_16F_LE;
+}
+
+void VDPixmapGen_32F_To_16F::Compute(void *dst0, sint32 y) {
+	uint16 *dst = (uint16 *)dst0;
+	const float *src = (const float *)mpSrc->GetRow(y, mSrcIndex);
+	uint32 w = mWidth;
+
+	// NOTE(review): src is passed un-dereferenced, so VDConvertFloatToHalf
+	// presumably takes a pointer to the source float — confirm against
+	// vd2/system/halffloat.h.
+	for(uint32 i=0; i<w; ++i)
+		*dst++ = VDConvertFloatToHalf(src++);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Row generator: converts one plane of 16-bit half-floats back to 32-bit floats.
+void VDPixmapGen_16F_To_32F::Start() {
+	StartWindow(mWidth * sizeof(float));
+}
+
+uint32 VDPixmapGen_16F_To_32F::GetType(uint32 output) const {
+	return (mpSrc->GetType(mSrcIndex) & ~kVDPixType_Mask) | kVDPixType_32F_LE;
+}
+
+void VDPixmapGen_16F_To_32F::Compute(void *dst0, sint32 y) {
+	float *dst = (float *)dst0;
+	const uint16 *src = (const uint16 *)mpSrc->GetRow(y, mSrcIndex);
+	uint32 w = mWidth;
+
+	// VDConvertHalfToFloat writes the expanded value through dst.
+	for(uint32 i=0; i<w; ++i)
+		VDConvertHalfToFloat(*src++, dst++);
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/uberblit_gen.cpp b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_gen.cpp
new file mode 100644
index 000000000..f93ca322e
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_gen.cpp
@@ -0,0 +1,1597 @@
+#include <vd2/system/vdalloc.h>
+#include <vd2/Kasumi/pixmaputils.h>
+#include "uberblit.h"
+#include "uberblit_gen.h"
+#include "uberblit_fill.h"
+#include "uberblit_input.h"
+#include "uberblit_resample.h"
+#include "uberblit_resample_special.h"
+#include "uberblit_ycbcr.h"
+#include "uberblit_rgb.h"
+#include "uberblit_swizzle.h"
+#include "uberblit_pal.h"
+#include "uberblit_16f.h"
+#include "uberblit_v210.h"
+
+#ifdef VD_CPU_X86
+ #include "uberblit_swizzle_x86.h"
+ #include "uberblit_ycbcr_x86.h"
+ #include "uberblit_rgb_x86.h"
+ #include "uberblit_resample_special_x86.h"
+#endif
+
+// Pulls 'height' rows from generator output 'genIndex' and copies 'bpr' bytes
+// of each into dst, stepping by 'pitch' (pitch may be negative for flipped
+// surfaces).
+void VDPixmapGenerate(void *dst, ptrdiff_t pitch, sint32 bpr, sint32 height, IVDPixmapGen *gen, int genIndex) {
+	for(sint32 y=0; y<height; ++y) {
+		memcpy(dst, gen->GetRow(y, genIndex), bpr);
+		vdptrstep(dst, pitch);
+	}
+	// Reset CPU extension state (e.g. MMX) that generator kernels may have dirtied.
+	VDCPUCleanupExtensions();
+}
+
+// Like VDPixmapGenerate, but lets the generator write each row directly into
+// the destination, skipping the intermediate row-cache copy.
+void VDPixmapGenerateFast(void *dst, ptrdiff_t pitch, sint32 height, IVDPixmapGen *gen) {
+	for(sint32 y=0; y<height; ++y) {
+		gen->ProcessRow(dst, y);
+		vdptrstep(dst, pitch);
+	}
+	VDCPUCleanupExtensions();
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Factory: same-format blits need no conversion pipeline, just a plane copy.
+// The dst/src arguments are accepted for interface symmetry and ignored.
+IVDPixmapBlitter *VDCreatePixmapUberBlitterDirectCopy(const VDPixmap& dst, const VDPixmap& src) {
+	return new VDPixmapUberBlitterDirectCopy;
+}
+
+// Layout-based overload of the factory above.
+IVDPixmapBlitter *VDCreatePixmapUberBlitterDirectCopy(const VDPixmapLayout& dst, const VDPixmapLayout& src) {
+	return new VDPixmapUberBlitterDirectCopy;
+}
+
+VDPixmapUberBlitterDirectCopy::VDPixmapUberBlitterDirectCopy() {
+}
+
+VDPixmapUberBlitterDirectCopy::~VDPixmapUberBlitterDirectCopy() {
+}
+
+// Full-surface blit: forwards with a null destination rectangle.
+void VDPixmapUberBlitterDirectCopy::Blit(const VDPixmap& dst, const VDPixmap& src) {
+	Blit(dst, NULL, src);
+}
+
+// Plane-by-plane memcpy blit between two pixmaps of identical format.
+// If rDst is given, only that destination subrectangle is written; the
+// rectangle must respect the format's chroma subsampling (asserted below).
+void VDPixmapUberBlitterDirectCopy::Blit(const VDPixmap& dst, const vdrect32 *rDst, const VDPixmap& src) {
+	VDASSERT(dst.format == src.format);
+
+	const VDPixmapFormatInfo& formatInfo = VDPixmapGetInfo(dst.format);
+
+	void *p = dst.data;
+	void *p2 = dst.data2;
+	void *p3 = dst.data3;
+	int w = dst.w;
+	int h = dst.h;
+
+	// Chunky formats (e.g. packed YUV) are measured in quantum units.
+	if (formatInfo.qchunky) {
+		w = (w + formatInfo.qw - 1) / formatInfo.qw;
+		h = -(-h >> formatInfo.qhbits);
+	}
+
+	// Auxiliary (chroma) plane size: ceil-divide by the subsampling factors.
+	int w2 = -(-dst.w >> formatInfo.auxwbits);
+	int h2 = -(-dst.h >> formatInfo.auxhbits);
+
+	if (rDst) {
+		int x1 = rDst->left;
+		int y1 = rDst->top;
+		int x2 = rDst->right;
+		int y2 = rDst->bottom;
+
+		VDASSERT(x1 >= 0 && y1 >= 0 && x2 <= w && y2 <= h && x2 >= x1 && y2 >= y1);
+
+		if (x2 < x1 || y2 < y1)
+			return;
+
+		// NOTE(review): src.data is used un-offset below, so the source is
+		// always read from its top-left corner even when rDst offsets the
+		// destination — confirm this is the intended semantic.
+		p = vdptroffset(dst.data, dst.pitch * y1 + x1 * formatInfo.qsize);
+		w = x2 - x1;
+		h = y2 - y1;
+
+		if (formatInfo.auxbufs >= 1) {
+			// Rectangle edges must fall on chroma-sample boundaries.
+			VDASSERT(!((x1|x2) & ((1 << formatInfo.auxwbits) - 1)));
+			VDASSERT(!((y1|y2) & ((1 << formatInfo.auxhbits) - 1)));
+
+			int ax1 = x1 >> formatInfo.auxwbits;
+			int ay1 = y1 >> formatInfo.auxhbits;
+			int ax2 = x2 >> formatInfo.auxwbits;
+			int ay2 = y2 >> formatInfo.auxhbits;
+
+			p2 = vdptroffset(dst.data2, dst.pitch2 * ay1 + ax1);
+			w2 = ax2 - ax1;
+			h2 = ay2 - ay1;
+
+			if (formatInfo.auxbufs >= 2)
+				p3 = vdptroffset(dst.data3, dst.pitch3 * ay1 + ax1);
+		}
+	}
+
+	uint32 bpr = formatInfo.qsize * w;
+
+	VDMemcpyRect(p, dst.pitch, src.data, src.pitch, bpr, h);
+
+	if (formatInfo.auxbufs >= 1) {
+		VDMemcpyRect(p2, dst.pitch2, src.data2, src.pitch2, w2 * formatInfo.auxsize, h2);
+
+		if (formatInfo.auxbufs >= 2)
+			VDMemcpyRect(p3, dst.pitch3, src.data3, src.pitch3, w2 * formatInfo.auxsize, h2);
+	}
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+VDPixmapUberBlitter::VDPixmapUberBlitter() {
+}
+
+VDPixmapUberBlitter::~VDPixmapUberBlitter() {
+	// The blitter owns every generator node in its pipeline.
+	while(!mGenerators.empty()) {
+		delete mGenerators.back();
+		mGenerators.pop_back();
+	}
+}
+
+// Full-surface blit: forwards with a null destination rectangle.
+void VDPixmapUberBlitter::Blit(const VDPixmap& dst, const VDPixmap& src) {
+	Blit(dst, NULL, src);
+}
+
+// Binds each registered source entry to the matching plane of 'src', then
+// dispatches to the 1/2/3-plane emission routine depending on how many
+// outputs the pipeline produced and whether the planes are independent.
+void VDPixmapUberBlitter::Blit(const VDPixmap& dst, const vdrect32 *rDst, const VDPixmap& src) {
+	for(Sources::const_iterator it(mSources.begin()), itEnd(mSources.end()); it!=itEnd; ++it) {
+		const SourceEntry& se = *it;
+		const void *p;
+		ptrdiff_t pitch;
+
+		switch(se.mSrcPlane) {
+		case 0:
+			p = src.data;
+			pitch = src.pitch;
+			break;
+		case 1:
+			p = src.data2;
+			pitch = src.pitch2;
+			break;
+		case 2:
+			p = src.data3;
+			pitch = src.pitch3;
+			break;
+		default:
+			// NOTE(review): in release builds VDASSERT compiles out and
+			// p/pitch would be used uninitialized below — confirm mSrcPlane
+			// is always 0-2 by construction.
+			VDASSERT(false);
+			break;
+		}
+
+		se.mpSrc->SetSource((const char *)p + pitch*se.mSrcY + se.mSrcX, pitch, src.palette);
+	}
+
+	if (mOutputs[2].mpSrc) {
+		if (mbIndependentPlanes)
+			Blit3Separated(dst, rDst);
+		else if (mbIndependentChromaPlanes)
+			Blit3Split(dst, rDst);
+		else
+			Blit3(dst, rDst);
+	} else if (mOutputs[1].mpSrc) {
+		if (mbIndependentPlanes)
+			Blit2Separated(dst, rDst);
+		else
+			Blit2(dst, rDst);
+	} else
+		Blit(dst, rDst);
+}
+
+// Single-plane emission: runs the pipeline's sole output into dst, optionally
+// restricted to rDst (scaled to quantum units for chunky formats).
+void VDPixmapUberBlitter::Blit(const VDPixmap& dst, const vdrect32 *rDst) {
+	const VDPixmapFormatInfo& formatInfo = VDPixmapGetInfo(dst.format);
+
+	mOutputs[0].mpSrc->AddWindowRequest(0, 0);
+	mOutputs[0].mpSrc->Start();
+
+	void *p = dst.data;
+	int w = dst.w;
+	int h = dst.h;
+
+	if (formatInfo.qchunky) {
+		w = (w + formatInfo.qw - 1) / formatInfo.qw;
+		h = -(-h >> formatInfo.qhbits);
+	}
+
+	if (rDst) {
+		int x1 = rDst->left;
+		int y1 = rDst->top;
+		int x2 = rDst->right;
+		int y2 = rDst->bottom;
+
+		// Round the rectangle outward to whole quanta for chunky formats.
+		if (formatInfo.qchunky) {
+			x1 = x1 / formatInfo.qw;
+			y1 = y1 / formatInfo.qh;
+			x2 = (x2 + formatInfo.qw - 1) / formatInfo.qw;
+			y2 = (y2 + formatInfo.qh - 1) / formatInfo.qh;
+		}
+
+		VDASSERT(x1 >= 0 && y1 >= 0 && x2 <= w && y2 <= h && x2 >= x1 && y2 >= y1);
+
+		if (x2 < x1 || y2 < y1)
+			return;
+
+		p = vdptroffset(dst.data, dst.pitch * y1 + x1 * formatInfo.qsize);
+		w = x2 - x1;
+		h = y2 - y1;
+	}
+
+	uint32 bpr = formatInfo.qsize * w;
+
+	// Output index 0 means the generator can write rows in place (fast path);
+	// otherwise rows come out of the generator's row cache and are copied.
+	if (mOutputs[0].mSrcIndex == 0)
+		VDPixmapGenerateFast(p, dst.pitch, h, mOutputs[0].mpSrc);
+	else
+		VDPixmapGenerate(p, dst.pitch, bpr, h, mOutputs[0].mpSrc, mOutputs[0].mSrcIndex);
+}
+
+// Three-plane emission with interlocked generators: all three outputs are
+// pulled in step, advancing the chroma rows only when the fixed-point
+// accumulator wraps. rDst is currently unused here.
+void VDPixmapUberBlitter::Blit3(const VDPixmap& px, const vdrect32 *rDst) {
+	const VDPixmapFormatInfo& formatInfo = VDPixmapGetInfo(px.format);
+	// NOTE(review): output-index mapping is 1 -> data, 2 -> data2, 0 -> data3;
+	// looks intentional but confirm against how the generator pushes outputs.
+	IVDPixmapGen *gen = mOutputs[1].mpSrc;
+	int idx = mOutputs[1].mSrcIndex;
+	IVDPixmapGen *gen1 = mOutputs[2].mpSrc;
+	int idx1 = mOutputs[2].mSrcIndex;
+	IVDPixmapGen *gen2 = mOutputs[0].mpSrc;
+	int idx2 = mOutputs[0].mSrcIndex;
+
+	gen->AddWindowRequest(0, 0);
+	gen->Start();
+	gen1->AddWindowRequest(0, 0);
+	gen1->Start();
+	gen2->AddWindowRequest(0, 0);
+	gen2->Start();
+
+	// auxaccum wraps to zero exactly once per (1 << auxhbits) rows; doubling
+	// the step makes auxhbits==0 degenerate to step 0, i.e. a chroma row is
+	// emitted for every luma row.
+	uint32 auxstep = 0x80000000UL >> formatInfo.auxhbits;
+	uint32 auxaccum = 0;
+
+	auxstep += auxstep;
+
+	int qw = px.w;
+	int qh = px.h;
+
+	if (formatInfo.qchunky) {
+		qw = (qw + formatInfo.qw - 1) / formatInfo.qw;
+		qh = -(-qh >> formatInfo.qhbits);
+	}
+
+	uint32 height = qh;
+	uint32 bpr = formatInfo.qsize * qw;
+	uint32 bpr2 = formatInfo.auxsize * -(-px.w >> formatInfo.auxwbits);
+	uint8 *dst = (uint8 *)px.data;
+	uint8 *dst2 = (uint8 *)px.data2;
+	uint8 *dst3 = (uint8 *)px.data3;
+	ptrdiff_t pitch = px.pitch;
+	ptrdiff_t pitch2 = px.pitch2;
+	ptrdiff_t pitch3 = px.pitch3;
+	uint32 y2 = 0;
+	for(uint32 y=0; y<height; ++y) {
+		memcpy(dst, gen->GetRow(y, idx), bpr);
+		vdptrstep(dst, pitch);
+
+		if (!auxaccum) {
+			memcpy(dst2, gen1->GetRow(y2, idx1), bpr2);
+			vdptrstep(dst2, pitch2);
+			memcpy(dst3, gen2->GetRow(y2, idx2), bpr2);
+			vdptrstep(dst3, pitch3);
+			++y2;
+		}
+
+		auxaccum += auxstep;
+	}
+
+	VDCPUCleanupExtensions();
+}
+
+// Three-plane emission where the primary plane is independent of the two
+// chroma planes: the primary plane is written in one pass, then the chroma
+// planes are pulled together using the auxaccum row stepper (see Blit3).
+void VDPixmapUberBlitter::Blit3Split(const VDPixmap& px, const vdrect32 *rDst) {
+	const VDPixmapFormatInfo& formatInfo = VDPixmapGetInfo(px.format);
+	IVDPixmapGen *gen = mOutputs[1].mpSrc;
+	int idx = mOutputs[1].mSrcIndex;
+	IVDPixmapGen *gen1 = mOutputs[2].mpSrc;
+	int idx1 = mOutputs[2].mSrcIndex;
+	IVDPixmapGen *gen2 = mOutputs[0].mpSrc;
+	int idx2 = mOutputs[0].mSrcIndex;
+
+	gen->AddWindowRequest(0, 0);
+	gen->Start();
+	gen1->AddWindowRequest(0, 0);
+	gen1->Start();
+	gen2->AddWindowRequest(0, 0);
+	gen2->Start();
+
+	uint32 auxstep = 0x80000000UL >> formatInfo.auxhbits;
+	uint32 auxaccum = 0;
+
+	auxstep += auxstep;
+
+	int qw = px.w;
+	int qh = px.h;
+
+	if (formatInfo.qchunky) {
+		qw = (qw + formatInfo.qw - 1) / formatInfo.qw;
+		qh = -(-qh >> formatInfo.qhbits);
+	}
+
+	uint32 height = qh;
+	uint32 bpr = formatInfo.qsize * qw;
+	uint8 *dst = (uint8 *)px.data;
+	ptrdiff_t pitch = px.pitch;
+
+	// Index 0 allows direct in-place row processing; otherwise copy from cache.
+	if (idx == 0) {
+		for(uint32 y=0; y<height; ++y) {
+			gen->ProcessRow(dst, y);
+			vdptrstep(dst, pitch);
+		}
+	} else {
+		for(uint32 y=0; y<height; ++y) {
+			memcpy(dst, gen->GetRow(y, idx), bpr);
+			vdptrstep(dst, pitch);
+		}
+	}
+
+	uint32 bpr2 = -(-px.w >> formatInfo.auxwbits) * formatInfo.auxsize;
+	uint8 *dst2 = (uint8 *)px.data2;
+	uint8 *dst3 = (uint8 *)px.data3;
+	ptrdiff_t pitch2 = px.pitch2;
+	ptrdiff_t pitch3 = px.pitch3;
+	uint32 y2 = 0;
+	for(uint32 y=0; y<height; ++y) {
+		if (!auxaccum) {
+			memcpy(dst2, gen1->GetRow(y2, idx1), bpr2);
+			vdptrstep(dst2, pitch2);
+			memcpy(dst3, gen2->GetRow(y2, idx2), bpr2);
+			vdptrstep(dst3, pitch3);
+			++y2;
+		}
+
+		auxaccum += auxstep;
+	}
+
+	VDCPUCleanupExtensions();
+}
+
+// Three-plane emission where all planes are fully independent: each plane is
+// emitted in its own pass, using ProcessRow when its output index is 0.
+void VDPixmapUberBlitter::Blit3Separated(const VDPixmap& px, const vdrect32 *rDst) {
+	const VDPixmapFormatInfo& formatInfo = VDPixmapGetInfo(px.format);
+	IVDPixmapGen *gen = mOutputs[1].mpSrc;
+	int idx = mOutputs[1].mSrcIndex;
+	IVDPixmapGen *gen1 = mOutputs[2].mpSrc;
+	int idx1 = mOutputs[2].mSrcIndex;
+	IVDPixmapGen *gen2 = mOutputs[0].mpSrc;
+	int idx2 = mOutputs[0].mSrcIndex;
+
+	gen->AddWindowRequest(0, 0);
+	gen->Start();
+	gen1->AddWindowRequest(0, 0);
+	gen1->Start();
+	gen2->AddWindowRequest(0, 0);
+	gen2->Start();
+
+	int qw = px.w;
+	int qh = px.h;
+
+	if (formatInfo.qchunky) {
+		qw = (qw + formatInfo.qw - 1) / formatInfo.qw;
+		qh = -(-qh >> formatInfo.qhbits);
+	}
+
+	uint32 height = qh;
+	uint32 bpr = formatInfo.qsize * qw;
+	uint8 *dst = (uint8 *)px.data;
+	ptrdiff_t pitch = px.pitch;
+
+	// Primary plane.
+	if (idx == 0) {
+		for(uint32 y=0; y<height; ++y) {
+			gen->ProcessRow(dst, y);
+			vdptrstep(dst, pitch);
+		}
+	} else {
+		for(uint32 y=0; y<height; ++y) {
+			memcpy(dst, gen->GetRow(y, idx), bpr);
+			vdptrstep(dst, pitch);
+		}
+	}
+
+	// Chroma planes: ceil-divided dimensions per the subsampling bits.
+	uint32 bpr2 = -(-px.w >> formatInfo.auxwbits) * formatInfo.auxsize;
+	uint32 h2 = -(-px.h >> formatInfo.auxhbits);
+	uint8 *dst2 = (uint8 *)px.data2;
+	ptrdiff_t pitch2 = px.pitch2;
+	if (idx1 == 0) {
+		for(uint32 y2=0; y2<h2; ++y2) {
+			gen1->ProcessRow(dst2, y2);
+			vdptrstep(dst2, pitch2);
+		}
+	} else {
+		for(uint32 y2=0; y2<h2; ++y2) {
+			memcpy(dst2, gen1->GetRow(y2, idx1), bpr2);
+			vdptrstep(dst2, pitch2);
+		}
+	}
+
+	uint8 *dst3 = (uint8 *)px.data3;
+	ptrdiff_t pitch3 = px.pitch3;
+	if (idx2 == 0) {
+		for(uint32 y2=0; y2<h2; ++y2) {
+			gen2->ProcessRow(dst3, y2);
+			vdptrstep(dst3, pitch3);
+		}
+	} else {
+		for(uint32 y2=0; y2<h2; ++y2) {
+			memcpy(dst3, gen2->GetRow(y2, idx2), bpr2);
+			vdptrstep(dst3, pitch3);
+		}
+	}
+
+	VDCPUCleanupExtensions();
+}
+
+// Two-plane emission with interlocked generators; the auxiliary plane row
+// advances once per (1 << auxhbits) primary rows via the auxaccum stepper
+// (see Blit3 for the stepping trick).
+void VDPixmapUberBlitter::Blit2(const VDPixmap& px, const vdrect32 *rDst) {
+	const VDPixmapFormatInfo& formatInfo = VDPixmapGetInfo(px.format);
+	IVDPixmapGen *gen = mOutputs[0].mpSrc;
+	int idx = mOutputs[0].mSrcIndex;
+	IVDPixmapGen *gen1 = mOutputs[1].mpSrc;
+	int idx1 = mOutputs[1].mSrcIndex;
+
+	gen->AddWindowRequest(0, 0);
+	gen->Start();
+	gen1->AddWindowRequest(0, 0);
+	gen1->Start();
+
+	uint32 auxstep = 0x80000000UL >> formatInfo.auxhbits;
+	uint32 auxaccum = 0;
+
+	auxstep += auxstep;
+
+	int qw = px.w;
+	int qh = px.h;
+
+	if (formatInfo.qchunky) {
+		qw = (qw + formatInfo.qw - 1) / formatInfo.qw;
+		qh = -(-qh >> formatInfo.qhbits);
+	}
+
+	uint32 height = qh;
+	uint32 bpr = formatInfo.qsize * qw;
+	uint32 bpr2 = formatInfo.auxsize * -(-px.w >> formatInfo.auxwbits);
+	uint8 *dst = (uint8 *)px.data;
+	uint8 *dst2 = (uint8 *)px.data2;
+	ptrdiff_t pitch = px.pitch;
+	ptrdiff_t pitch2 = px.pitch2;
+	uint32 y2 = 0;
+	for(uint32 y=0; y<height; ++y) {
+		memcpy(dst, gen->GetRow(y, idx), bpr);
+		vdptrstep(dst, pitch);
+
+		if (!auxaccum) {
+			memcpy(dst2, gen1->GetRow(y2, idx1), bpr2);
+			vdptrstep(dst2, pitch2);
+			++y2;
+		}
+
+		auxaccum += auxstep;
+	}
+
+	VDCPUCleanupExtensions();
+}
+
+// Two-plane emission where the planes are independent: each plane is written
+// in its own pass, using the in-place ProcessRow fast path when possible.
+void VDPixmapUberBlitter::Blit2Separated(const VDPixmap& px, const vdrect32 *rDst) {
+	const VDPixmapFormatInfo& formatInfo = VDPixmapGetInfo(px.format);
+	IVDPixmapGen *gen = mOutputs[0].mpSrc;
+	int idx = mOutputs[0].mSrcIndex;
+	IVDPixmapGen *gen1 = mOutputs[1].mpSrc;
+	int idx1 = mOutputs[1].mSrcIndex;
+
+	gen->AddWindowRequest(0, 0);
+	gen->Start();
+	gen1->AddWindowRequest(0, 0);
+	gen1->Start();
+
+	int qw = px.w;
+	int qh = px.h;
+
+	if (formatInfo.qchunky) {
+		qw = (qw + formatInfo.qw - 1) / formatInfo.qw;
+		qh = -(-qh >> formatInfo.qhbits);
+	}
+
+	uint32 height = qh;
+	uint32 bpr = formatInfo.qsize * qw;
+	uint8 *dst = (uint8 *)px.data;
+	ptrdiff_t pitch = px.pitch;
+
+	if (idx == 0) {
+		for(uint32 y=0; y<height; ++y) {
+			gen->ProcessRow(dst, y);
+			vdptrstep(dst, pitch);
+		}
+	} else {
+		for(uint32 y=0; y<height; ++y) {
+			memcpy(dst, gen->GetRow(y, idx), bpr);
+			vdptrstep(dst, pitch);
+		}
+	}
+
+	uint32 bpr2 = -(-px.w >> formatInfo.auxwbits) * formatInfo.auxsize;
+	uint32 h2 = -(-px.h >> formatInfo.auxhbits);
+	uint8 *dst2 = (uint8 *)px.data2;
+	ptrdiff_t pitch2 = px.pitch2;
+	if (idx1 == 0) {
+		for(uint32 y2=0; y2<h2; ++y2) {
+			gen1->ProcessRow(dst2, y2);
+			vdptrstep(dst2, pitch2);
+		}
+	} else {
+		for(uint32 y2=0; y2<h2; ++y2) {
+			memcpy(dst2, gen1->GetRow(y2, idx1), bpr2);
+			vdptrstep(dst2, pitch2);
+		}
+	}
+
+	VDCPUCleanupExtensions();
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+VDPixmapUberBlitterGenerator::VDPixmapUberBlitterGenerator() {
+}
+
+VDPixmapUberBlitterGenerator::~VDPixmapUberBlitterGenerator() {
+	// The generator owns every pipeline node it created until create() hands
+	// them off; any leftovers are destroyed here.
+	while(!mGenerators.empty()) {
+		delete mGenerators.back();
+		mGenerators.pop_back();
+	}
+}
+
+// Swaps the stack top with the entry 'index' slots below it (index 0 is a no-op).
+void VDPixmapUberBlitterGenerator::swap(int index) {
+	std::swap(mStack.back(), (&mStack.back())[-index]);
+}
+
+// Duplicates the stack top.
+void VDPixmapUberBlitterGenerator::dup() {
+	mStack.push_back(mStack.back());
+}
+
+// Discards the stack top.
+void VDPixmapUberBlitterGenerator::pop() {
+	mStack.pop_back();
+}
+
+// Pushes a raw source-plane reader and records a SourceEntry so the blitter
+// can bind the actual plane pointer at Blit() time.
+void VDPixmapUberBlitterGenerator::ldsrc(int srcIndex, int srcPlane, int x, int y, uint32 w, uint32 h, uint32 type, uint32 bpr) {
+	VDPixmapGenSrc *src = new VDPixmapGenSrc;
+
+	src->Init(w, h, type, bpr);
+
+	mGenerators.push_back(src);
+	mStack.push_back(StackEntry(src, 0));
+
+	SourceEntry se;
+	se.mpSrc = src;
+	se.mSrcIndex = srcIndex;
+	se.mSrcPlane = srcPlane;
+	se.mSrcX = x;
+	se.mSrcY = y;
+	mSources.push_back(se);
+}
+
+// Pushes a constant-fill plane (every byte set to 'fill').
+void VDPixmapUberBlitterGenerator::ldconst(uint8 fill, uint32 bpr, uint32 w, uint32 h, uint32 type) {
+	VDPixmapGenFill8 *src = new VDPixmapGenFill8;
+
+	src->Init(fill, bpr, w, h, type);
+
+	mGenerators.push_back(src);
+	mStack.push_back(StackEntry(src, 0));
+}
+
+// Replaces the stack top with a node extracting byte 'offset' of each 16-bit
+// unit (MMX-accelerated variants for offsets 0/1 where available).
+// NOTE(review): this file mixes `#if VD_CPU_X86` here with `#ifdef VD_CPU_X86`
+// at the top of the file; `#if` silently evaluates an undefined macro as 0 —
+// confirm VD_CPU_X86 is defined to a nonzero value.
+void VDPixmapUberBlitterGenerator::extract_8in16(int offset, uint32 w, uint32 h) {
+	StackEntry *args = &mStack.back();
+	VDPixmapGen_8In16 *src = NULL;
+
+#if VD_CPU_X86
+	if (MMX_enabled) {
+		if (offset == 0)
+			src = new VDPixmapGen_8In16_Even_MMX;
+		else if (offset == 1)
+			src = new VDPixmapGen_8In16_Odd_MMX;
+	}
+#endif
+	if (!src)
+		src = new VDPixmapGen_8In16;
+
+	src->Init(args[0].mpSrc, args[0].mSrcIndex, offset, w, h);
+
+	mGenerators.push_back(src);
+	MarkDependency(src, args[0].mpSrc);
+	args[0] = StackEntry(src, 0);
+}
+
+// Replaces the stack top with a node extracting byte 'offset' of each 32-bit
+// unit.
+void VDPixmapUberBlitterGenerator::extract_8in32(int offset, uint32 w, uint32 h) {
+	StackEntry *args = &mStack.back();
+	VDPixmapGen_8In32 *src = NULL;
+
+#if VD_CPU_X86
+	if (MMX_enabled) {
+		if ((unsigned)offset < 4)
+			src = new VDPixmapGen_8In32_MMX;
+	}
+#endif
+
+	if (!src)
+		src = new VDPixmapGen_8In32;
+
+	src->Init(args[0].mpSrc, args[0].mSrcIndex, offset, w, h);
+
+	mGenerators.push_back(src);
+	MarkDependency(src, args[0].mpSrc);
+	args[0] = StackEntry(src, 0);
+}
+
+// Replaces the stack top with a node that swaps the two bytes of every
+// 16-bit unit in the row.
+void VDPixmapUberBlitterGenerator::swap_8in16(uint32 w, uint32 h, uint32 bpr) {
+	StackEntry *args = &mStack.back();
+
+#if VD_CPU_X86
+	VDPixmapGen_Swap8In16 *src = MMX_enabled ? new VDPixmapGen_Swap8In16_MMX : new VDPixmapGen_Swap8In16;
+#else
+	VDPixmapGen_Swap8In16 *src = new VDPixmapGen_Swap8In16;
+#endif
+
+	src->Init(args[0].mpSrc, args[0].mSrcIndex, w, h, bpr);
+
+	mGenerators.push_back(src);
+	MarkDependency(src, args[0].mpSrc);
+	args[0] = StackEntry(src, 0);
+}
+
+// Replaces the stack top with a 1-bpp-palette -> X8R8G8B8 expander. The node
+// is also registered as a source entry (plane 0) — presumably so SetSource
+// can hand it the palette at Blit() time; confirm against SetSource usage.
+void VDPixmapUberBlitterGenerator::conv_Pal1_to_8888(int srcIndex) {
+	StackEntry *args = &mStack.back();
+	VDPixmapGen_Pal1_To_X8R8G8B8 *src = new VDPixmapGen_Pal1_To_X8R8G8B8;
+
+	src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+	mGenerators.push_back(src);
+	MarkDependency(src, args[0].mpSrc);
+	args[0] = StackEntry(src, 0);
+
+	SourceEntry se;
+	se.mpSrc = src;
+	se.mSrcIndex = srcIndex;
+	se.mSrcPlane = 0;
+	se.mSrcX = 0;
+	se.mSrcY = 0;
+	mSources.push_back(se);
+}
+
+// 2-bpp palette -> X8R8G8B8 (same pattern as conv_Pal1_to_8888).
+void VDPixmapUberBlitterGenerator::conv_Pal2_to_8888(int srcIndex) {
+	StackEntry *args = &mStack.back();
+	VDPixmapGen_Pal2_To_X8R8G8B8 *src = new VDPixmapGen_Pal2_To_X8R8G8B8;
+
+	src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+	mGenerators.push_back(src);
+	MarkDependency(src, args[0].mpSrc);
+	args[0] = StackEntry(src, 0);
+
+	SourceEntry se;
+	se.mpSrc = src;
+	se.mSrcIndex = srcIndex;
+	se.mSrcPlane = 0;
+	se.mSrcX = 0;
+	se.mSrcY = 0;
+	mSources.push_back(se);
+}
+
+// 4-bpp palette -> X8R8G8B8 (same pattern as conv_Pal1_to_8888).
+void VDPixmapUberBlitterGenerator::conv_Pal4_to_8888(int srcIndex) {
+	StackEntry *args = &mStack.back();
+	VDPixmapGen_Pal4_To_X8R8G8B8 *src = new VDPixmapGen_Pal4_To_X8R8G8B8;
+
+	src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+	mGenerators.push_back(src);
+	MarkDependency(src, args[0].mpSrc);
+	args[0] = StackEntry(src, 0);
+
+	SourceEntry se;
+	se.mpSrc = src;
+	se.mSrcIndex = srcIndex;
+	se.mSrcPlane = 0;
+	se.mSrcX = 0;
+	se.mSrcY = 0;
+	mSources.push_back(se);
+}
+
+// 8-bpp palette -> X8R8G8B8 (same pattern as conv_Pal1_to_8888).
+void VDPixmapUberBlitterGenerator::conv_Pal8_to_8888(int srcIndex) {
+	StackEntry *args = &mStack.back();
+	VDPixmapGen_Pal8_To_X8R8G8B8 *src = new VDPixmapGen_Pal8_To_X8R8G8B8;
+
+	src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+	mGenerators.push_back(src);
+	MarkDependency(src, args[0].mpSrc);
+	args[0] = StackEntry(src, 0);
+
+	SourceEntry se;
+	se.mpSrc = src;
+	se.mSrcIndex = srcIndex;
+	se.mSrcPlane = 0;
+	se.mSrcX = 0;
+	se.mSrcY = 0;
+	mSources.push_back(se);
+}
+
+// Horizontal point (nearest-neighbor) resample to width 'w'. The identity
+// mapping (offset 0.5, factor 1.0) is a no-op and adds no node.
+void VDPixmapUberBlitterGenerator::pointh(float xoffset, float xfactor, uint32 w) {
+	StackEntry *args = &mStack.back();
+
+	if (xoffset != 0.5f || xfactor != 1.0f) {
+		VDPixmapGenResampleRow *src = new VDPixmapGenResampleRow;
+
+		src->Init(args[0].mpSrc, args[0].mSrcIndex, w, xoffset, xfactor, nsVDPixmap::kFilterPoint, 0, false);
+
+		mGenerators.push_back(src);
+		MarkDependency(src, args[0].mpSrc);
+		args[0] = StackEntry(src, 0);
+	}
+}
+
+// Vertical point (nearest-neighbor) resample to height 'h'; identity is a no-op.
+void VDPixmapUberBlitterGenerator::pointv(float yoffset, float yfactor, uint32 h) {
+	StackEntry *args = &mStack.back();
+
+	if (yoffset != 0.5f || yfactor != 1.0f) {
+		VDPixmapGenResampleCol *src = new VDPixmapGenResampleCol;
+
+		src->Init(args[0].mpSrc, args[0].mSrcIndex, h, yoffset, yfactor, nsVDPixmap::kFilterPoint, 0, false);
+
+		mGenerators.push_back(src);
+		MarkDependency(src, args[0].mpSrc);
+		args[0] = StackEntry(src, 0);
+	}
+}
+
+// Horizontal linear resample to width 'w'. Identity is a no-op; for 8-bit
+// phase-0 sources, dedicated 2:1/4:1 decimation and x2/x4 interpolation
+// kernels (some ISSE/MMX-accelerated) are used instead of the generic path.
+void VDPixmapUberBlitterGenerator::linearh(float xoffset, float xfactor, uint32 w, bool interpOnly) {
+	StackEntry *args = &mStack.back();
+	IVDPixmapGen *src = args[0].mpSrc;
+	int srcIndex = args[0].mSrcIndex;
+
+	sint32 srcw = src->GetWidth(srcIndex);
+	if (xoffset == 0.5f && xfactor == 1.0f && srcw == w)
+		return;
+
+	if (xoffset == 0.5f && (src->GetType(srcIndex) & kVDPixType_Mask) == kVDPixType_8) {
+		// 2:1 decimation.
+		if (xfactor == 2.0f && w == ((srcw + 1) >> 1)) {
+			VDPixmapGenResampleRow_d2_p0_lin_u8 *out = new VDPixmapGenResampleRow_d2_p0_lin_u8;
+
+			out->Init(src, srcIndex);
+			mGenerators.push_back(out);
+			MarkDependency(out, src);
+			args[0] = StackEntry(out, 0);
+			return;
+		}
+
+		// 4:1 decimation.
+		if (xfactor == 4.0f && w == ((srcw + 3) >> 2)) {
+			VDPixmapGenResampleRow_d4_p0_lin_u8 *out = new VDPixmapGenResampleRow_d4_p0_lin_u8;
+
+			out->Init(src, srcIndex);
+			mGenerators.push_back(out);
+			MarkDependency(out, src);
+			args[0] = StackEntry(out, 0);
+			return;
+		}
+
+		// x2 interpolation.
+		if (xfactor == 0.5f && w == srcw*2) {
+#if VD_CPU_X86
+			VDPixmapGenResampleRow_x2_p0_lin_u8 *out = ISSE_enabled ? new VDPixmapGenResampleRow_x2_p0_lin_u8_ISSE : new VDPixmapGenResampleRow_x2_p0_lin_u8;
+#else
+			VDPixmapGenResampleRow_x2_p0_lin_u8 *out = new VDPixmapGenResampleRow_x2_p0_lin_u8;
+#endif
+
+			out->Init(src, srcIndex);
+			mGenerators.push_back(out);
+			MarkDependency(out, src);
+			args[0] = StackEntry(out, 0);
+			return;
+		}
+
+		// x4 interpolation.
+		if (xfactor == 0.25f && w == srcw*4) {
+#if VD_CPU_X86
+			VDPixmapGenResampleRow_x4_p0_lin_u8 *out = MMX_enabled ? new VDPixmapGenResampleRow_x4_p0_lin_u8_MMX : new VDPixmapGenResampleRow_x4_p0_lin_u8;
+#else
+			VDPixmapGenResampleRow_x4_p0_lin_u8 *out = new VDPixmapGenResampleRow_x4_p0_lin_u8;
+#endif
+
+			out->Init(src, srcIndex);
+			mGenerators.push_back(out);
+			MarkDependency(out, src);
+			args[0] = StackEntry(out, 0);
+			return;
+		}
+	}
+
+	// Generic linear row resampler.
+	VDPixmapGenResampleRow *out = new VDPixmapGenResampleRow;
+
+	out->Init(args[0].mpSrc, args[0].mSrcIndex, w, xoffset, xfactor, nsVDPixmap::kFilterLinear, 0, interpOnly);
+
+	mGenerators.push_back(out);
+	MarkDependency(out, src);
+	args[0] = StackEntry(out, 0);
+}
+
+// Vertical linear resample to height 'h'. Identity is a no-op; 8-bit sources
+// with the specific phase offsets below take dedicated decimation or
+// interpolation kernels instead of the generic column resampler.
+void VDPixmapUberBlitterGenerator::linearv(float yoffset, float yfactor, uint32 h, bool interpOnly) {
+	StackEntry *args = &mStack.back();
+	IVDPixmapGen *src = args[0].mpSrc;
+	int srcIndex = args[0].mSrcIndex;
+
+	sint32 srch = src->GetHeight(srcIndex);
+	if (yoffset == 0.5f && yfactor == 1.0f && srch == h)
+		return;
+
+	if ((src->GetType(srcIndex) & kVDPixType_Mask) == kVDPixType_8) {
+		// 2:1 vertical decimation, half-pel phase.
+		if (yoffset == 1.0f && yfactor == 2.0f && h == ((srch + 1) >> 1)) {
+			VDPixmapGenResampleCol_x2_phalf_lin_u8 *out = new VDPixmapGenResampleCol_x2_phalf_lin_u8;
+
+			out->Init(src, srcIndex);
+			mGenerators.push_back(out);
+			MarkDependency(out, src);
+			args[0] = StackEntry(out, 0);
+			return;
+		}
+
+		// 4:1 vertical decimation.
+		if (yoffset == 2.0f && yfactor == 4.0f && h == ((srch + 2) >> 2)) {
+			VDPixmapGenResampleCol_x4_p1half_lin_u8 *out = new VDPixmapGenResampleCol_x4_p1half_lin_u8;
+
+			out->Init(src, srcIndex);
+			mGenerators.push_back(out);
+			MarkDependency(out, src);
+			args[0] = StackEntry(out, 0);
+			return;
+		}
+
+		// x2 vertical interpolation.
+		if (yoffset == 0.25f && yfactor == 0.5f && h == srch*2) {
+#if VD_CPU_X86
+			VDPixmapGenResampleCol_d2_pnqrtr_lin_u8 *out = ISSE_enabled ? new VDPixmapGenResampleCol_d2_pnqrtr_lin_u8_ISSE : new VDPixmapGenResampleCol_d2_pnqrtr_lin_u8;
+#else
+			VDPixmapGenResampleCol_d2_pnqrtr_lin_u8 *out = new VDPixmapGenResampleCol_d2_pnqrtr_lin_u8;
+#endif
+
+			out->Init(src, srcIndex);
+			mGenerators.push_back(out);
+			MarkDependency(out, src);
+			args[0] = StackEntry(out, 0);
+			return;
+		}
+
+		// x4 vertical interpolation.
+		if (yoffset == 0.125f && yfactor == 0.25f && h == srch*4) {
+#if VD_CPU_X86
+			VDPixmapGenResampleCol_d4_pn38_lin_u8 *out = ISSE_enabled ? new VDPixmapGenResampleCol_d4_pn38_lin_u8_ISSE : new VDPixmapGenResampleCol_d4_pn38_lin_u8;
+#else
+			VDPixmapGenResampleCol_d4_pn38_lin_u8 *out = new VDPixmapGenResampleCol_d4_pn38_lin_u8;
+#endif
+
+			out->Init(src, srcIndex);
+			mGenerators.push_back(out);
+			MarkDependency(out, src);
+			args[0] = StackEntry(out, 0);
+			return;
+		}
+	}
+
+	// Generic linear column resampler.
+	VDPixmapGenResampleCol *out = new VDPixmapGenResampleCol;
+
+	out->Init(src, srcIndex, h, yoffset, yfactor, nsVDPixmap::kFilterLinear, 0, interpOnly);
+
+	mGenerators.push_back(out);
+	MarkDependency(out, src);
+	args[0] = StackEntry(out, 0);
+}
+
+// Combined 2-D bilinear resample: horizontal pass then vertical pass.
+void VDPixmapUberBlitterGenerator::linear(float xoffset, float xfactor, uint32 w, float yoffset, float yfactor, uint32 h) {
+	linearh(xoffset, xfactor, w, false);
+	linearv(yoffset, yfactor, h, false);
+}
+
+void VDPixmapUberBlitterGenerator::cubich(float xoffset, float xfactor, uint32 w, float splineFactor, bool interpOnly) {
+ StackEntry *args = &mStack.back();
+
+ if (xoffset != 0.5f || xfactor != 1.0f) {
+ VDPixmapGenResampleRow *src = new VDPixmapGenResampleRow;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex, w, xoffset, xfactor, nsVDPixmap::kFilterCubic, splineFactor, interpOnly);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+ }
+}
+
+void VDPixmapUberBlitterGenerator::cubicv(float yoffset, float yfactor, uint32 h, float splineFactor, bool interpOnly) {
+ StackEntry *args = &mStack.back();
+
+ if (yoffset != 0.5f || yfactor != 1.0f) {
+ VDPixmapGenResampleCol *src = new VDPixmapGenResampleCol;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex, h, yoffset, yfactor, nsVDPixmap::kFilterCubic, splineFactor, interpOnly);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+ }
+}
+
+void VDPixmapUberBlitterGenerator::cubic(float xoffset, float xfactor, uint32 w, float yoffset, float yfactor, uint32 h, float splineFactor) {
+ cubich(xoffset, xfactor, w, splineFactor, false);
+ cubicv(yoffset, yfactor, h, splineFactor, false);
+}
+
+void VDPixmapUberBlitterGenerator::lanczos3h(float xoffset, float xfactor, uint32 w) {
+ StackEntry *args = &mStack.back();
+
+ if (xoffset != 0.5f || xfactor != 1.0f) {
+ VDPixmapGenResampleRow *src = new VDPixmapGenResampleRow;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex, w, xoffset, xfactor, nsVDPixmap::kFilterLanczos3, 0, false);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+ }
+}
+
+void VDPixmapUberBlitterGenerator::lanczos3v(float yoffset, float yfactor, uint32 h) {
+ StackEntry *args = &mStack.back();
+
+ if (yoffset != 0.5f || yfactor != 1.0f) {
+ VDPixmapGenResampleCol *src = new VDPixmapGenResampleCol;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex, h, yoffset, yfactor, nsVDPixmap::kFilterLanczos3, 0, false);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+ }
+}
+
+void VDPixmapUberBlitterGenerator::lanczos3(float xoffset, float xfactor, uint32 w, float yoffset, float yfactor, uint32 h) {
+ lanczos3h(xoffset, xfactor, w);
+ lanczos3v(yoffset, yfactor, h);
+}
+
+void VDPixmapUberBlitterGenerator::conv_555_to_8888() {
+ StackEntry *args = &mStack.back();
+#ifdef VD_CPU_X86
+ VDPixmapGen_X1R5G5B5_To_X8R8G8B8 *src = MMX_enabled ? new VDPixmapGen_X1R5G5B5_To_X8R8G8B8_MMX : new VDPixmapGen_X1R5G5B5_To_X8R8G8B8;
+#else
+ VDPixmapGen_X1R5G5B5_To_X8R8G8B8 *src = new VDPixmapGen_X1R5G5B5_To_X8R8G8B8;
+#endif
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::conv_565_to_8888() {
+ StackEntry *args = &mStack.back();
+#ifdef VD_CPU_X86
+ VDPixmapGen_R5G6B5_To_X8R8G8B8 *src = MMX_enabled ? new VDPixmapGen_R5G6B5_To_X8R8G8B8_MMX : new VDPixmapGen_R5G6B5_To_X8R8G8B8;
+#else
+ VDPixmapGen_R5G6B5_To_X8R8G8B8 *src = new VDPixmapGen_R5G6B5_To_X8R8G8B8;
+#endif
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::conv_888_to_8888() {
+ StackEntry *args = &mStack.back();
+#ifdef VD_CPU_X86
+ VDPixmapGen_R8G8B8_To_A8R8G8B8 *src = MMX_enabled ? new VDPixmapGen_R8G8B8_To_X8R8G8B8_MMX : new VDPixmapGen_R8G8B8_To_A8R8G8B8;
+#else
+ VDPixmapGen_R8G8B8_To_A8R8G8B8 *src = new VDPixmapGen_R8G8B8_To_A8R8G8B8;
+#endif
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::conv_8_to_32F() {
+ StackEntry *args = &mStack.back();
+ VDPixmapGen_8_To_32F *src = new VDPixmapGen_8_To_32F;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::conv_16F_to_32F() {
+ StackEntry *args = &mStack.back();
+ VDPixmapGen_16F_To_32F *src = new VDPixmapGen_16F_To_32F;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::conv_V210_to_32F() {
+ StackEntry *args = &mStack.back();
+ VDPixmapGen_V210_To_32F *src = new VDPixmapGen_V210_To_32F;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+ mStack.push_back(StackEntry(src, 1));
+ mStack.push_back(StackEntry(src, 2));
+}
+
+void VDPixmapUberBlitterGenerator::conv_8888_to_X32F() {
+ StackEntry *args = &mStack.back();
+ VDPixmapGen_X8R8G8B8_To_X32B32G32R32F *src = new VDPixmapGen_X8R8G8B8_To_X32B32G32R32F;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::conv_8888_to_555() {
+ StackEntry *args = &mStack.back();
+#ifdef VD_CPU_X86
+ VDPixmapGen_X8R8G8B8_To_X1R5G5B5 *src = MMX_enabled ? new VDPixmapGen_X8R8G8B8_To_X1R5G5B5_MMX : new VDPixmapGen_X8R8G8B8_To_X1R5G5B5;
+#else
+ VDPixmapGen_X8R8G8B8_To_X1R5G5B5 *src = new VDPixmapGen_X8R8G8B8_To_X1R5G5B5;
+#endif
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::conv_555_to_565() {
+ StackEntry *args = &mStack.back();
+#ifdef VD_CPU_X86
+ VDPixmapGen_X1R5G5B5_To_R5G6B5 *src = MMX_enabled ? new VDPixmapGen_X1R5G5B5_To_R5G6B5_MMX : new VDPixmapGen_X1R5G5B5_To_R5G6B5;
+#else
+ VDPixmapGen_X1R5G5B5_To_R5G6B5 *src = new VDPixmapGen_X1R5G5B5_To_R5G6B5;
+#endif
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::conv_565_to_555() {
+ StackEntry *args = &mStack.back();
+#ifdef VD_CPU_X86
+ VDPixmapGen_R5G6B5_To_X1R5G5B5 *src = MMX_enabled ? new VDPixmapGen_R5G6B5_To_X1R5G5B5_MMX : new VDPixmapGen_R5G6B5_To_X1R5G5B5;
+#else
+ VDPixmapGen_R5G6B5_To_X1R5G5B5 *src = new VDPixmapGen_R5G6B5_To_X1R5G5B5;
+#endif
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::conv_8888_to_565() {
+ StackEntry *args = &mStack.back();
+#ifdef VD_CPU_X86
+ VDPixmapGen_X8R8G8B8_To_R5G6B5 *src = MMX_enabled ? new VDPixmapGen_X8R8G8B8_To_R5G6B5_MMX : new VDPixmapGen_X8R8G8B8_To_R5G6B5;
+#else
+ VDPixmapGen_X8R8G8B8_To_R5G6B5 *src = new VDPixmapGen_X8R8G8B8_To_R5G6B5;
+#endif
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::conv_8888_to_888() {
+ StackEntry *args = &mStack.back();
+#ifdef VD_CPU_X86
+ VDPixmapGen_X8R8G8B8_To_R8G8B8 *src = MMX_enabled ? new VDPixmapGen_X8R8G8B8_To_R8G8B8_MMX : new VDPixmapGen_X8R8G8B8_To_R8G8B8;
+#else
+ VDPixmapGen_X8R8G8B8_To_R8G8B8 *src = new VDPixmapGen_X8R8G8B8_To_R8G8B8;
+#endif
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::conv_32F_to_8() {
+ StackEntry *args = &mStack.back();
+ VDPixmapGen_32F_To_8 *src = new VDPixmapGen_32F_To_8;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::conv_X32F_to_8888() {
+ StackEntry *args = &mStack.back();
+ VDPixmapGen_X32B32G32R32F_To_X8R8G8B8 *src = new VDPixmapGen_X32B32G32R32F_To_X8R8G8B8;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::conv_32F_to_16F() {
+ StackEntry *args = &mStack.back();
+ VDPixmapGen_32F_To_16F *src = new VDPixmapGen_32F_To_16F;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::conv_32F_to_V210() {
+ StackEntry *args = &*(mStack.end() - 3);
+ VDPixmapGen_32F_To_V210 *src = new VDPixmapGen_32F_To_V210;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex, args[1].mpSrc, args[1].mSrcIndex, args[2].mpSrc, args[2].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ MarkDependency(src, args[1].mpSrc);
+ MarkDependency(src, args[2].mpSrc);
+ args[0] = StackEntry(src, 0);
+ mStack.pop_back();
+ mStack.pop_back();
+}
+
+void VDPixmapUberBlitterGenerator::convd_8888_to_555() {
+ StackEntry *args = &mStack.back();
+ VDPixmapGen_X8R8G8B8_To_X1R5G5B5_Dithered *src = new VDPixmapGen_X8R8G8B8_To_X1R5G5B5_Dithered;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::convd_8888_to_565() {
+ StackEntry *args = &mStack.back();
+ VDPixmapGen_X8R8G8B8_To_R5G6B5_Dithered *src = new VDPixmapGen_X8R8G8B8_To_R5G6B5_Dithered;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::convd_32F_to_8() {
+ StackEntry *args = &mStack.back();
+ VDPixmapGen_32F_To_8_Dithered *src = new VDPixmapGen_32F_To_8_Dithered;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::convd_X32F_to_8888() {
+ StackEntry *args = &mStack.back();
+ VDPixmapGen_X32B32G32R32F_To_X8R8G8B8_Dithered *src = new VDPixmapGen_X32B32G32R32F_To_X8R8G8B8_Dithered;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::interleave_B8G8_R8G8() {
+ StackEntry *args = &mStack.back() - 2;
+ VDPixmapGen_B8x3_To_B8G8_R8G8 *src = NULL;
+
+#if VD_CPU_X86
+ if (MMX_enabled)
+ src = new VDPixmapGen_B8x3_To_B8G8_R8G8_MMX;
+#endif
+
+ if (!src)
+ src = new VDPixmapGen_B8x3_To_B8G8_R8G8;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex, args[1].mpSrc, args[1].mSrcIndex, args[2].mpSrc, args[2].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ MarkDependency(src, args[1].mpSrc);
+ MarkDependency(src, args[2].mpSrc);
+ args[0] = StackEntry(src, 0);
+ mStack.pop_back();
+ mStack.pop_back();
+}
+
+void VDPixmapUberBlitterGenerator::interleave_G8B8_G8R8() {
+ StackEntry *args = &mStack.back() - 2;
+ VDPixmapGen_B8x3_To_G8B8_G8R8 *src = NULL;
+
+#if VD_CPU_X86
+ if (MMX_enabled)
+ src = new VDPixmapGen_B8x3_To_G8B8_G8R8_MMX;
+#endif
+
+ if (!src)
+ src = new VDPixmapGen_B8x3_To_G8B8_G8R8;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex, args[1].mpSrc, args[1].mSrcIndex, args[2].mpSrc, args[2].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ MarkDependency(src, args[1].mpSrc);
+ MarkDependency(src, args[2].mpSrc);
+ args[0] = StackEntry(src, 0);
+ mStack.pop_back();
+ mStack.pop_back();
+}
+
+void VDPixmapUberBlitterGenerator::interleave_X8R8G8B8() {
+ StackEntry *args = &mStack.back() - 2;
+ VDPixmapGen_B8x3_To_X8R8G8B8 *src = new VDPixmapGen_B8x3_To_X8R8G8B8;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex, args[1].mpSrc, args[1].mSrcIndex, args[2].mpSrc, args[2].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ MarkDependency(src, args[1].mpSrc);
+ MarkDependency(src, args[2].mpSrc);
+ args[0] = StackEntry(src, 0);
+ mStack.pop_back();
+ mStack.pop_back();
+}
+
+void VDPixmapUberBlitterGenerator::interleave_B8R8() {
+ StackEntry *args = &mStack.back() - 1;
+
+#if VD_CPU_X86
+ VDPixmapGen_B8x2_To_B8R8 *src = MMX_enabled ? new VDPixmapGen_B8x2_To_B8R8_MMX : new VDPixmapGen_B8x2_To_B8R8;
+#else
+ VDPixmapGen_B8x2_To_B8R8 *src = new VDPixmapGen_B8x2_To_B8R8;
+#endif
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex, args[1].mpSrc, args[1].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ MarkDependency(src, args[1].mpSrc);
+ args[0] = StackEntry(src, 0);
+ mStack.pop_back();
+}
+
+void VDPixmapUberBlitterGenerator::ycbcr601_to_rgb32() {
+ StackEntry *args = &mStack.back() - 2;
+
+#ifdef VD_CPU_X86
+ VDPixmapGenYCbCr601ToRGB32 *src = MMX_enabled ? new VDPixmapGenYCbCr601ToRGB32_MMX : new VDPixmapGenYCbCr601ToRGB32;
+#else
+ VDPixmapGenYCbCr601ToRGB32 *src = new VDPixmapGenYCbCr601ToRGB32;
+#endif
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex, args[1].mpSrc, args[1].mSrcIndex, args[2].mpSrc, args[2].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ MarkDependency(src, args[1].mpSrc);
+ MarkDependency(src, args[2].mpSrc);
+ args[0] = StackEntry(src, 0);
+ mStack.pop_back();
+ mStack.pop_back();
+}
+
+void VDPixmapUberBlitterGenerator::ycbcr709_to_rgb32() {
+ StackEntry *args = &mStack.back() - 2;
+
+ VDPixmapGenYCbCr709ToRGB32 *src = new VDPixmapGenYCbCr709ToRGB32;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex, args[1].mpSrc, args[1].mSrcIndex, args[2].mpSrc, args[2].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ MarkDependency(src, args[1].mpSrc);
+ MarkDependency(src, args[2].mpSrc);
+ args[0] = StackEntry(src, 0);
+ mStack.pop_back();
+ mStack.pop_back();
+}
+
+void VDPixmapUberBlitterGenerator::rgb32_to_ycbcr601() {
+ StackEntry *args = &mStack.back();
+#ifdef VD_CPU_X86
+ VDPixmapGenRGB32ToYCbCr601 *src = SSE2_enabled ? new VDPixmapGenRGB32ToYCbCr601_SSE2 : new VDPixmapGenRGB32ToYCbCr601;
+#else
+ VDPixmapGenRGB32ToYCbCr601 *src = new VDPixmapGenRGB32ToYCbCr601;
+#endif
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+ mStack.push_back(StackEntry(src, 1));
+ mStack.push_back(StackEntry(src, 2));
+}
+
+void VDPixmapUberBlitterGenerator::rgb32_to_ycbcr709() {
+ StackEntry *args = &mStack.back();
+ VDPixmapGenRGB32ToYCbCr709 *src = new VDPixmapGenRGB32ToYCbCr709;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+ mStack.push_back(StackEntry(src, 1));
+ mStack.push_back(StackEntry(src, 2));
+}
+
+void VDPixmapUberBlitterGenerator::ycbcr601_to_rgb32_32f() {
+ StackEntry *args = &mStack.back() - 2;
+
+ VDPixmapGenYCbCr601ToRGB32F *src = new VDPixmapGenYCbCr601ToRGB32F;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex, args[1].mpSrc, args[1].mSrcIndex, args[2].mpSrc, args[2].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ MarkDependency(src, args[1].mpSrc);
+ MarkDependency(src, args[2].mpSrc);
+ args[0] = StackEntry(src, 0);
+ mStack.pop_back();
+ mStack.pop_back();
+}
+
+void VDPixmapUberBlitterGenerator::ycbcr709_to_rgb32_32f() {
+ StackEntry *args = &mStack.back() - 2;
+
+ VDPixmapGenYCbCr709ToRGB32F *src = new VDPixmapGenYCbCr709ToRGB32F;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex, args[1].mpSrc, args[1].mSrcIndex, args[2].mpSrc, args[2].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ MarkDependency(src, args[1].mpSrc);
+ MarkDependency(src, args[2].mpSrc);
+ args[0] = StackEntry(src, 0);
+ mStack.pop_back();
+ mStack.pop_back();
+}
+
+void VDPixmapUberBlitterGenerator::rgb32_to_ycbcr601_32f() {
+ StackEntry *args = &mStack.back();
+ VDPixmapGenRGB32FToYCbCr601 *src = new VDPixmapGenRGB32FToYCbCr601;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+ mStack.push_back(StackEntry(src, 1));
+ mStack.push_back(StackEntry(src, 2));
+}
+
+void VDPixmapUberBlitterGenerator::rgb32_to_ycbcr709_32f() {
+ StackEntry *args = &mStack.back();
+ VDPixmapGenRGB32FToYCbCr709 *src = new VDPixmapGenRGB32FToYCbCr709;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+ mStack.push_back(StackEntry(src, 1));
+ mStack.push_back(StackEntry(src, 2));
+}
+
+void VDPixmapUberBlitterGenerator::ycbcr601_to_ycbcr709() {
+ StackEntry *args = &mStack.back() - 2;
+
+ IVDPixmapGen *src;
+ if ((args[0].mpSrc->GetType(args[0].mSrcIndex) & kVDPixType_Mask) == kVDPixType_32F_LE) {
+ VDPixmapGenYCbCr601ToYCbCr709_32F *src2 = new VDPixmapGenYCbCr601ToYCbCr709_32F;
+
+ src2->Init(args[0].mpSrc, args[0].mSrcIndex, args[1].mpSrc, args[1].mSrcIndex, args[2].mpSrc, args[2].mSrcIndex);
+ src = src2;
+ } else {
+ VDPixmapGenYCbCr601ToYCbCr709 *src2 = new VDPixmapGenYCbCr601ToYCbCr709;
+
+ src2->Init(args[0].mpSrc, args[0].mSrcIndex, args[1].mpSrc, args[1].mSrcIndex, args[2].mpSrc, args[2].mSrcIndex);
+ src = src2;
+ }
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ MarkDependency(src, args[1].mpSrc);
+ MarkDependency(src, args[2].mpSrc);
+ args[0] = StackEntry(src, 0);
+ args[1] = StackEntry(src, 1);
+ args[2] = StackEntry(src, 2);
+}
+
+void VDPixmapUberBlitterGenerator::ycbcr709_to_ycbcr601() {
+ StackEntry *args = &mStack.back() - 2;
+
+ IVDPixmapGen *src;
+ if ((args[0].mpSrc->GetType(args[0].mSrcIndex) & kVDPixType_Mask) == kVDPixType_32F_LE) {
+ VDPixmapGenYCbCr709ToYCbCr601_32F *src2 = new VDPixmapGenYCbCr709ToYCbCr601_32F;
+
+ src2->Init(args[0].mpSrc, args[0].mSrcIndex, args[1].mpSrc, args[1].mSrcIndex, args[2].mpSrc, args[2].mSrcIndex);
+ src = src2;
+ } else {
+ VDPixmapGenYCbCr709ToYCbCr601 *src2 = new VDPixmapGenYCbCr709ToYCbCr601;
+
+ src2->Init(args[0].mpSrc, args[0].mSrcIndex, args[1].mpSrc, args[1].mSrcIndex, args[2].mpSrc, args[2].mSrcIndex);
+ src = src2;
+ }
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ MarkDependency(src, args[1].mpSrc);
+ MarkDependency(src, args[2].mpSrc);
+ args[0] = StackEntry(src, 0);
+ args[1] = StackEntry(src, 1);
+ args[2] = StackEntry(src, 2);
+}
+
+IVDPixmapBlitter *VDPixmapUberBlitterGenerator::create() {
+ vdautoptr<VDPixmapUberBlitter> blitter(new VDPixmapUberBlitter);
+
+ int numStackEntries = (int)mStack.size();
+
+ for(int i=0; i<3; ++i) {
+ if (i < numStackEntries) {
+ blitter->mOutputs[i].mpSrc = mStack[i].mpSrc;
+ blitter->mOutputs[i].mSrcIndex = mStack[i].mSrcIndex;
+ } else {
+ blitter->mOutputs[i].mpSrc = NULL;
+ blitter->mOutputs[i].mSrcIndex = 0;
+ }
+ }
+
+ mStack.clear();
+
+ // If this blitter has three outputs, determine if outputs 1 and 2 are independent
+ // from output 0.
+ blitter->mbIndependentChromaPlanes = true;
+ blitter->mbIndependentPlanes = true;
+ if (numStackEntries >= 3) {
+ int numGens = mGenerators.size();
+ vdfastvector<uint8> genflags(numGens, 0);
+
+ enum {
+ kFlagStateful = 0x80,
+ kFlagY = 0x01,
+ kFlagCb = 0x02,
+ kFlagCr = 0x04,
+ kFlagYCbCr = 0x07
+ };
+
+ for(int i=0; i<3; ++i)
+ genflags[std::find(mGenerators.begin(), mGenerators.end(), blitter->mOutputs[i].mpSrc) - mGenerators.begin()] |= (1 << i);
+
+ for(int i=0; i<numGens; ++i) {
+ IVDPixmapGen *gen = mGenerators[i];
+
+ if (gen->IsStateful())
+ genflags[i] |= kFlagStateful;
+ }
+
+ while(!mDependencies.empty()) {
+ const Dependency& dep = mDependencies.back();
+
+ genflags[dep.mSrcIdx] |= (genflags[dep.mDstIdx] & ~kFlagStateful);
+
+ mDependencies.pop_back();
+ }
+
+ for(int i=0; i<numGens; ++i) {
+ uint8 flags = genflags[i];
+
+ if (!(flags & kFlagStateful))
+ continue;
+
+ switch(flags & kFlagYCbCr) {
+ case 0:
+ case kFlagY:
+ case kFlagCb:
+ case kFlagCr:
+ break;
+ case kFlagCr | kFlagCb:
+ blitter->mbIndependentPlanes = false;
+ break;
+ case kFlagCb | kFlagY:
+ case kFlagCr | kFlagY:
+ case kFlagCr | kFlagCb | kFlagY:
+ blitter->mbIndependentPlanes = false;
+ blitter->mbIndependentChromaPlanes = false;
+ break;
+ }
+ }
+ } else if (numStackEntries >= 2) {
+ int numGens = mGenerators.size();
+ vdfastvector<uint8> genflags(numGens, 0);
+
+ enum {
+ kFlagStateful = 0x80,
+ kFlagY = 0x01,
+ kFlagC = 0x02,
+ kFlagYC = 0x03
+ };
+
+ for(int i=0; i<2; ++i)
+ genflags[std::find(mGenerators.begin(), mGenerators.end(), blitter->mOutputs[i].mpSrc) - mGenerators.begin()] |= (1 << i);
+
+ for(int i=0; i<numGens; ++i) {
+ IVDPixmapGen *gen = mGenerators[i];
+
+ if (gen->IsStateful())
+ genflags[i] |= kFlagStateful;
+ }
+
+ while(!mDependencies.empty()) {
+ const Dependency& dep = mDependencies.back();
+
+ genflags[dep.mSrcIdx] |= (genflags[dep.mDstIdx] & ~kFlagStateful);
+
+ mDependencies.pop_back();
+ }
+
+ for(int i=0; i<numGens; ++i) {
+ uint8 flags = genflags[i];
+
+ if (!(flags & kFlagStateful))
+ continue;
+
+ switch(flags & kFlagYC) {
+ case kFlagYC:
+ blitter->mbIndependentPlanes = false;
+ blitter->mbIndependentChromaPlanes = false;
+ break;
+ }
+ }
+ }
+
+ blitter->mGenerators.swap(mGenerators);
+ blitter->mSources.swap(mSources);
+ return blitter.release();
+}
+
+void VDPixmapUberBlitterGenerator::MarkDependency(IVDPixmapGen *dst, IVDPixmapGen *src) {
+ Generators::const_iterator it1(std::find(mGenerators.begin(), mGenerators.end(), dst));
+ Generators::const_iterator it2(std::find(mGenerators.begin(), mGenerators.end(), src));
+
+ VDASSERT(it1 != mGenerators.end());
+ VDASSERT(it2 != mGenerators.end());
+
+ int idx1 = it1 - mGenerators.begin();
+ int idx2 = it2 - mGenerators.begin();
+
+ Dependency dep = { idx1, idx2 };
+
+ mDependencies.push_back(dep);
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/uberblit_resample.cpp b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_resample.cpp
new file mode 100644
index 000000000..1363fb730
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_resample.cpp
@@ -0,0 +1,623 @@
+#include <float.h>
+#include <math.h>
+#include <vd2/system/vdstl.h>
+#include <vd2/system/memory.h>
+#include <vd2/system/math.h>
+#include <vd2/system/cpuaccel.h>
+#include <vd2/Kasumi/pixmap.h>
+#include <vd2/Kasumi/pixmaputils.h>
+#include <vd2/Kasumi/resample.h>
+
+#include <vd2/Kasumi/resample_kernels.h>
+#include "resample_stages_x86.h"
+#include "uberblit_resample.h"
+
+namespace {
+ sint32 scale32x32_fp16(sint32 x, sint32 y) {
+ return (sint32)(((sint64)x * y + 0x8000) >> 16);
+ }
+
+ template<class T>
+ IVDResamplerSeparableRowStage *RowFactory(double cutoff, float filterFactor) {
+ return new T;
+ }
+
+ template<class T>
+ IVDResamplerSeparableRowStage *RowFactoryLinear(double cutoff, float filterFactor) {
+ return new T(VDResamplerLinearFilter(cutoff));
+ }
+
+ template<class T>
+ IVDResamplerSeparableRowStage *RowFactoryCubic(double cutoff, float filterFactor) {
+ return new T(VDResamplerCubicFilter(cutoff, filterFactor));
+ }
+
+ template<class T>
+ IVDResamplerSeparableRowStage *RowFactoryCubic2(double cutoff, float filterFactor) {
+ return new T(filterFactor);
+ }
+
+ template<class T>
+ IVDResamplerSeparableRowStage *RowFactoryLanczos3(double cutoff, float filterFactor) {
+ return new T(VDResamplerLanczos3Filter(cutoff));
+ }
+
+ template<class T>
+ IVDResamplerSeparableColStage *ColFactory(double cutoff, float filterFactor) {
+ return new T;
+ }
+
+ template<class T>
+ IVDResamplerSeparableColStage *ColFactoryLinear(double cutoff, float filterFactor) {
+ return new T(VDResamplerLinearFilter(cutoff));
+ }
+
+ template<class T>
+ IVDResamplerSeparableColStage *ColFactoryCubic(double cutoff, float filterFactor) {
+ return new T(VDResamplerCubicFilter(cutoff, filterFactor));
+ }
+
+ template<class T>
+ IVDResamplerSeparableColStage *ColFactoryCubic2(double cutoff, float filterFactor) {
+ return new T(filterFactor);
+ }
+
+ template<class T>
+ IVDResamplerSeparableColStage *ColFactoryLanczos3(double cutoff, float filterFactor) {
+ return new T(VDResamplerLanczos3Filter(cutoff));
+ }
+}
+
+///////////////////////////////////////////////////////////////////////////
+//
+// VDPixmapGenResampleRow
+//
+///////////////////////////////////////////////////////////////////////////
+
+VDPixmapGenResampleRow::VDPixmapGenResampleRow()
+ : mpRowStage(NULL)
+ , mpRowStage2(NULL)
+{
+}
+
+VDPixmapGenResampleRow::~VDPixmapGenResampleRow() {
+ if (mpRowStage)
+ delete mpRowStage;
+}
+
+void VDPixmapGenResampleRow::Init(IVDPixmapGen *src, uint32 srcIndex, uint32 width, float offset, float step, nsVDPixmap::FilterMode filterMode, float filterFactor, bool interpolationOnly) {
+ InitSource(src, srcIndex);
+
+ sint32 u0 = (sint32)(offset * 65536.0);
+ sint32 dudx = (sint32)(step * 65536.0);
+
+ mAxis.Init(dudx);
+
+ double x_2fc = 1.0;
+ if (!interpolationOnly && step > 1.0f)
+ x_2fc = 1.0 / step;
+
+ struct SpecialCaseSpanRoutine {
+ sint32 mPhase;
+ sint32 mStep;
+ uint32 mType;
+ nsVDPixmap::FilterMode mFilterMode;
+ uint32 mCPUFlags;
+ IVDResamplerSeparableRowStage *(*mpClassFactory)(double filterCutoff, float filterFactor);
+ };
+
+ static const SpecialCaseSpanRoutine kSpecialCaseSpanRoutines[]={
+ // Generic
+#if defined _M_IX86
+ { +0x0000, 0x008000, kVDPixType_8, nsVDPixmap::kFilterLinear, CPUF_SUPPORTS_INTEGER_SSE, RowFactory<VDResamplerRowStageSeparableLinear8_phaseZeroStepHalf_ISSE> },
+#endif
+
+ { +0x0000, 0x008000, kVDPixType_8, nsVDPixmap::kFilterLinear, 0, RowFactory<VDResamplerRowStageSeparableLinear8_phaseZeroStepHalf> },
+ };
+
+ long flags = CPUGetEnabledExtensions();
+ uint32 type = mpSrc->GetType(mSrcIndex) & kVDPixType_Mask;
+
+ for(int i=0; i<sizeof(kSpecialCaseSpanRoutines)/sizeof(kSpecialCaseSpanRoutines[0]); ++i) {
+ const SpecialCaseSpanRoutine& rout = kSpecialCaseSpanRoutines[i];
+
+ if (rout.mType != type)
+ continue;
+
+ if (x_2fc < 1.0)
+ continue;
+
+ if (rout.mStep != dudx)
+ continue;
+
+ if (rout.mPhase != u0)
+ continue;
+
+ if (rout.mFilterMode != filterMode)
+ continue;
+
+ if ((rout.mCPUFlags & flags) != rout.mCPUFlags)
+ continue;
+
+ mpRowStage = rout.mpClassFactory(x_2fc, filterFactor);
+ mpRowStage2 = mpRowStage->AsRowStage2();
+ break;
+ }
+
+ if (!mpRowStage) {
+ struct SpanRoutine {
+ uint32 mType;
+ bool mbInterpOnly;
+ nsVDPixmap::FilterMode mFilterMode;
+ uint32 mCPUFlags;
+ IVDResamplerSeparableRowStage *(*mpClassFactory)(double filterCutoff, float filterFactor);
+ };
+
+ static const SpanRoutine kSpanRoutines[]={
+#if defined _M_IX86
+ // X86
+ { kVDPixType_8888, false, nsVDPixmap::kFilterPoint, CPUF_SUPPORTS_MMX, RowFactory<VDResamplerSeparablePointRowStageMMX> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterPoint, 0, RowFactory<VDResamplerSeparablePointRowStageX86> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterLinear, CPUF_SUPPORTS_SSE41, RowFactoryLinear<VDResamplerSeparableTableRowStage8SSE41> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterLinear, CPUF_SUPPORTS_MMX, RowFactoryLinear<VDResamplerSeparableTableRowStage8MMX> },
+ { kVDPixType_8888, true, nsVDPixmap::kFilterLinear, CPUF_SUPPORTS_MMX, RowFactory<VDResamplerSeparableLinearRowStageMMX> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterLinear, CPUF_SUPPORTS_SSE2, RowFactoryLinear<VDResamplerSeparableTableRowStageSSE2> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterLinear, CPUF_SUPPORTS_MMX, RowFactoryLinear<VDResamplerSeparableTableRowStageMMX> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterCubic, CPUF_SUPPORTS_SSE41, RowFactoryCubic<VDResamplerSeparableTableRowStage8SSE41> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterCubic, CPUF_SUPPORTS_MMX, RowFactoryCubic<VDResamplerSeparableTableRowStage8MMX> },
+ { kVDPixType_8888, true, nsVDPixmap::kFilterCubic, CPUF_SUPPORTS_MMX, RowFactoryCubic2<VDResamplerSeparableCubicRowStageMMX> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterCubic, CPUF_SUPPORTS_SSE2, RowFactoryCubic<VDResamplerSeparableTableRowStageSSE2> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterCubic, CPUF_SUPPORTS_MMX, RowFactoryCubic<VDResamplerSeparableTableRowStageMMX> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterLanczos3, CPUF_SUPPORTS_SSE41, RowFactoryLanczos3<VDResamplerSeparableTableRowStage8SSE41> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterLanczos3, CPUF_SUPPORTS_MMX, RowFactoryLanczos3<VDResamplerSeparableTableRowStage8MMX> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterLanczos3, CPUF_SUPPORTS_SSE2, RowFactoryLanczos3<VDResamplerSeparableTableRowStageSSE2> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterLanczos3, CPUF_SUPPORTS_MMX, RowFactoryLanczos3<VDResamplerSeparableTableRowStageMMX> },
+#elif defined _M_AMD64
+ // AMD64
+ { kVDPixType_8888, false, nsVDPixmap::kFilterLinear, CPUF_SUPPORTS_SSE2, RowFactoryLinear<VDResamplerSeparableTableRowStageSSE2> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterCubic, CPUF_SUPPORTS_SSE2, RowFactoryCubic<VDResamplerSeparableTableRowStageSSE2> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterLanczos3, CPUF_SUPPORTS_SSE2, RowFactoryLanczos3<VDResamplerSeparableTableRowStageSSE2> },
+#endif
+ // Generic
+ { kVDPixType_8, false, nsVDPixmap::kFilterPoint, 0, RowFactory<VDResamplerRowStageSeparablePoint8> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterPoint, 0, RowFactory<VDResamplerRowStageSeparablePoint32> },
+ { kVDPixType_8, true, nsVDPixmap::kFilterLinear, 0, RowFactory<VDResamplerRowStageSeparableLinear8> },
+ { kVDPixType_8888, true, nsVDPixmap::kFilterLinear, 0, RowFactory<VDResamplerRowStageSeparableLinear32> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterLinear, 0, RowFactoryLinear<VDResamplerRowStageSeparableTable8> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterLinear, 0, RowFactoryLinear<VDResamplerRowStageSeparableTable32> },
+ { kVDPixType_32F_LE, false, nsVDPixmap::kFilterLinear, 0, RowFactoryLinear<VDResamplerRowStageSeparableTable32F> },
+ { kVDPixType_32Fx4_LE, false, nsVDPixmap::kFilterLinear, 0, RowFactoryLinear<VDResamplerRowStageSeparableTable32Fx4> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterCubic, 0, RowFactoryCubic<VDResamplerRowStageSeparableTable8> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterCubic, 0, RowFactoryCubic<VDResamplerRowStageSeparableTable32> },
+ { kVDPixType_32F_LE, false, nsVDPixmap::kFilterCubic, 0, RowFactoryCubic<VDResamplerRowStageSeparableTable32F> },
+ { kVDPixType_32Fx4_LE, false, nsVDPixmap::kFilterCubic, 0, RowFactoryCubic<VDResamplerRowStageSeparableTable32Fx4> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterLanczos3, 0, RowFactoryLanczos3<VDResamplerRowStageSeparableTable8> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterLanczos3, 0, RowFactoryLanczos3<VDResamplerRowStageSeparableTable32> },
+ { kVDPixType_32F_LE, false, nsVDPixmap::kFilterLanczos3, 0, RowFactoryLanczos3<VDResamplerRowStageSeparableTable32F> },
+ { kVDPixType_32Fx4_LE, false, nsVDPixmap::kFilterLanczos3, 0, RowFactoryLanczos3<VDResamplerRowStageSeparableTable32Fx4> },
+ };
+
+ for(int i=0; i<sizeof(kSpanRoutines)/sizeof(kSpanRoutines[0]); ++i) {
+ const SpanRoutine& rout = kSpanRoutines[i];
+
+ if (rout.mType != type)
+ continue;
+
+ if (rout.mbInterpOnly && x_2fc < 1.0)
+ continue;
+
+ if (rout.mFilterMode != filterMode)
+ continue;
+
+ if ((rout.mCPUFlags & flags) != rout.mCPUFlags)
+ continue;
+
+ mpRowStage = rout.mpClassFactory(x_2fc, filterFactor);
+ mpRowStage2 = mpRowStage->AsRowStage2();
+ break;
+ }
+ }
+
+ VDASSERT(mpRowStage);
+
+ mRowFiltW = mpRowStage->GetWindowSize();
+
+ mpSrc->AddWindowRequest(0, 0);
+
+ sint32 fsx1 = (sint32)(offset * 65536.0) - ((mRowFiltW-1) << 15);
+ mAxis.Compute(width, fsx1, mSrcWidth, mRowFiltW);
+ mWidth = width;
+
+ switch(type) {
+ case kVDPixType_8:
+ mBytesPerSample = 1;
+ break;
+ case kVDPixType_8888:
+ case kVDPixType_32F_LE:
+ mBytesPerSample = 4;
+ break;
+ case kVDPixType_32Fx4_LE:
+ mBytesPerSample = 16;
+ break;
+
+ default:
+ VDASSERT(false);
+ }
+}
+
+void VDPixmapGenResampleRow::Start() {
+ StartWindow(mWidth * mBytesPerSample);
+
+ uint32 clipSpace = ((mRowFiltW*3*mBytesPerSample + 15) >> 4) << 2;
+ mTempSpace.resize(clipSpace);
+
+ if (mpRowStage2)
+ mpRowStage2->Init(mAxis, mSrcWidth);
+}
+
+void VDPixmapGenResampleRow::Compute(void *dst0, sint32 y) {
+ switch(mBytesPerSample) {
+ case 1:
+ Compute8(dst0, y);
+ break;
+ case 4:
+ Compute32(dst0, y);
+ break;
+ case 16:
+ Compute128(dst0, y);
+ break;
+ }
+}
+
+void VDPixmapGenResampleRow::Compute8(void *dst0, sint32 y) {
+ const uint8 *src = (const uint8 *)mpSrc->GetRow(y, mSrcIndex);
+ uint8 *dst = (uint8 *)dst0;
+
+ // process pre-copy region
+ if (uint32 count = mAxis.dx_precopy) {
+ VDMemset8(dst, src[0], count);
+ dst += count;
+ }
+
+ uint8 *p = (uint8*)mTempSpace.data();
+ sint32 u = mAxis.u;
+ const sint32 dudx = mAxis.dudx;
+
+ // process dual-clip region
+ if (mpRowStage2) {
+ uint32 count = mAxis.dx_preclip + mAxis.dx_active + mAxis.dx_postclip + mAxis.dx_dualclip;
+ mpRowStage2->Process(dst, src, count);
+ dst += count;
+ } else if (uint32 count = mAxis.dx_dualclip) {
+ VDMemset8(p, src[0], mRowFiltW);
+ memcpy(p + mRowFiltW, src+1, (mSrcWidth-2));
+ VDMemset8(p + mRowFiltW + (mSrcWidth-2), src[mSrcWidth-1], mRowFiltW);
+
+ mpRowStage->Process(dst, p, count, u + ((mRowFiltW-1)<<16), dudx);
+ u += dudx*count;
+ dst += count;
+ } else {
+ // process pre-clip region
+ if (uint32 count = mAxis.dx_preclip) {
+ VDMemset8(p, src[0], mRowFiltW);
+ memcpy(p + mRowFiltW, src+1, (mRowFiltW-1));
+
+ mpRowStage->Process(dst, p, count, u + ((mRowFiltW-1)<<16), dudx);
+ u += dudx*count;
+ dst += count;
+ }
+
+ // process active region
+ if (uint32 count = mAxis.dx_active) {
+ mpRowStage->Process(dst, src, count, u, dudx);
+ u += dudx*count;
+ dst += count;
+ }
+
+ // process post-clip region
+ if (uint32 count = mAxis.dx_postclip) {
+ uint32 offset = mSrcWidth + 1 - mRowFiltW;
+
+ memcpy(p, src+offset, (mRowFiltW-1));
+ VDMemset8(p + (mRowFiltW-1), src[mSrcWidth-1], mRowFiltW);
+
+ mpRowStage->Process(dst, p, count, u - (offset<<16), dudx);
+ dst += count;
+ }
+ }
+
+ // process post-copy region
+ if (uint32 count = mAxis.dx_postcopy) {
+ VDMemset8(dst, src[mSrcWidth-1], count);
+ }
+}
+
+void VDPixmapGenResampleRow::Compute32(void *dst0, sint32 y) {
+ const uint32 *src = (const uint32 *)mpSrc->GetRow(y, mSrcIndex);
+ uint32 *dst = (uint32 *)dst0;
+
+ // process pre-copy region
+ if (uint32 count = mAxis.dx_precopy) {
+ VDMemset32(dst, src[0], count);
+ dst += count;
+ }
+
+ uint32 *p = mTempSpace.data();
+ sint32 u = mAxis.u;
+ const sint32 dudx = mAxis.dudx;
+
+ // process dual-clip region
+ if (uint32 count = mAxis.dx_dualclip) {
+ VDMemset32(p, src[0], mRowFiltW);
+ memcpy(p + mRowFiltW, src+1, (mSrcWidth-2)*sizeof(uint32));
+ VDMemset32(p + mRowFiltW + (mSrcWidth-2), src[mSrcWidth-1], mRowFiltW);
+
+ mpRowStage->Process(dst, p, count, u + ((mRowFiltW-1)<<16), dudx);
+ u += dudx*count;
+ dst += count;
+ } else if (mpRowStage2) {
+ mpRowStage2->Process(dst, p, mAxis.dx_preclip + mAxis.dx_active + mAxis.dx_postclip);
+ } else {
+ // process pre-clip region
+ if (uint32 count = mAxis.dx_preclip) {
+ VDMemset32(p, src[0], mRowFiltW);
+ memcpy(p + mRowFiltW, src+1, (mRowFiltW-1)*sizeof(uint32));
+
+ mpRowStage->Process(dst, p, count, u + ((mRowFiltW-1)<<16), dudx);
+ u += dudx*count;
+ dst += count;
+ }
+
+ // process active region
+ if (uint32 count = mAxis.dx_active) {
+ mpRowStage->Process(dst, src, count, u, dudx);
+ u += dudx*count;
+ dst += count;
+ }
+
+ // process post-clip region
+ if (uint32 count = mAxis.dx_postclip) {
+ uint32 offset = mSrcWidth + 1 - mRowFiltW;
+
+ memcpy(p, src+offset, (mRowFiltW-1)*sizeof(uint32));
+ VDMemset32(p + (mRowFiltW-1), src[mSrcWidth-1], mRowFiltW);
+
+ mpRowStage->Process(dst, p, count, u - (offset<<16), dudx);
+ dst += count;
+ }
+ }
+
+ // process post-copy region
+ if (uint32 count = mAxis.dx_postcopy) {
+ VDMemset32(dst, src[mSrcWidth-1], count);
+ }
+}
+
+void VDPixmapGenResampleRow::Compute128(void *dst0, sint32 y) {
+ const uint32 *src = (const uint32 *)mpSrc->GetRow(y, mSrcIndex);
+ uint32 *dst = (uint32 *)dst0;
+
+ // process pre-copy region
+ if (uint32 count = mAxis.dx_precopy) {
+ VDMemset128(dst, src, count);
+ dst += 4*count;
+ }
+
+ uint32 *p = mTempSpace.data();
+ sint32 u = mAxis.u;
+ const sint32 dudx = mAxis.dudx;
+
+ // process dual-clip region
+ if (uint32 count = mAxis.dx_dualclip) {
+ VDMemset128(p, src, mRowFiltW);
+ memcpy(p + 4*mRowFiltW, src+1, (mSrcWidth-2)*sizeof(uint32)*4);
+ VDMemset128(p + 4*(mRowFiltW + (mSrcWidth-2)), src + 4*(mSrcWidth-1), mRowFiltW);
+
+ mpRowStage->Process(dst, p, count, u + ((mRowFiltW-1)<<16), dudx);
+ u += dudx*count;
+ dst += count * 4;
+ } else if (mpRowStage2) {
+ mpRowStage2->Process(dst, p, mAxis.dx_preclip + mAxis.dx_active + mAxis.dx_postclip);
+ } else {
+ // process pre-clip region
+ if (uint32 count = mAxis.dx_preclip) {
+ VDMemset128(p, src, mRowFiltW);
+ memcpy(p + 4*mRowFiltW, src+1, (mRowFiltW-1)*sizeof(uint32)*4);
+
+ mpRowStage->Process(dst, p, count, u + ((mRowFiltW-1)<<16), dudx);
+ u += dudx*count;
+ dst += count*4;
+ }
+
+ // process active region
+ if (uint32 count = mAxis.dx_active) {
+ mpRowStage->Process(dst, src, count, u, dudx);
+ u += dudx*count;
+ dst += count*4;
+ }
+
+ // process post-clip region
+ if (uint32 count = mAxis.dx_postclip) {
+ uint32 offset = mSrcWidth + 1 - mRowFiltW;
+
+ memcpy(p, src+offset*4, (mRowFiltW-1)*sizeof(uint32)*4);
+ VDMemset128(p + 4*(mRowFiltW-1), src + 4*(mSrcWidth-1), mRowFiltW);
+
+ mpRowStage->Process(dst, p, count, u - (offset<<16), dudx);
+ dst += count*4;
+ }
+ }
+
+ // process post-copy region
+ if (uint32 count = mAxis.dx_postcopy) {
+ VDMemset128(dst, src + 4*(mSrcWidth-1), count);
+ }
+}
+
+///////////////////////////////////////////////////////////////////////////
+//
+// VDPixmapGenResampleCol
+//
+///////////////////////////////////////////////////////////////////////////
+
+VDPixmapGenResampleCol::VDPixmapGenResampleCol()
+ : mpColStage(NULL)
+{
+}
+
+VDPixmapGenResampleCol::~VDPixmapGenResampleCol() {
+ if (mpColStage)
+ delete mpColStage;
+}
+
+void VDPixmapGenResampleCol::Init(IVDPixmapGen *src, uint32 srcIndex, uint32 height, float offset, float step, nsVDPixmap::FilterMode filterMode, float filterFactor, bool interpolationOnly) {
+ InitSource(src, srcIndex);
+
+ sint32 dvdy = (sint32)(step * 65536.0);
+
+ mAxis.Init(dvdy);
+
+ // construct stages
+ double y_2fc = 1.0;
+ if (!interpolationOnly && step > 1.0f)
+ y_2fc = 1.0 / step;
+
+ struct SpanRoutine {
+ uint32 mType;
+ bool mbInterpOnly;
+ nsVDPixmap::FilterMode mFilterMode;
+ uint32 mCPUFlags;
+ IVDResamplerSeparableColStage *(*mpClassFactory)(double filterCutoff, float filterFactor);
+ };
+
+ static const SpanRoutine kSpanRoutines[]={
+#if defined _M_IX86
+ // X86
+ { kVDPixType_8, false, nsVDPixmap::kFilterLinear, CPUF_SUPPORTS_SSE41, ColFactoryLinear<VDResamplerSeparableTableColStage8SSE41> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterLinear, CPUF_SUPPORTS_MMX, ColFactoryLinear<VDResamplerSeparableTableColStage8MMX> },
+ { kVDPixType_8888, true, nsVDPixmap::kFilterLinear, CPUF_SUPPORTS_MMX, ColFactory<VDResamplerSeparableLinearColStageMMX> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterLinear, CPUF_SUPPORTS_SSE2, ColFactoryLinear<VDResamplerSeparableTableColStageSSE2> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterLinear, CPUF_SUPPORTS_MMX, ColFactoryLinear<VDResamplerSeparableTableColStageMMX> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterCubic, CPUF_SUPPORTS_SSE41, ColFactoryCubic<VDResamplerSeparableTableColStage8SSE41> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterCubic, CPUF_SUPPORTS_MMX, ColFactoryCubic<VDResamplerSeparableTableColStage8MMX> },
+ { kVDPixType_8888, true, nsVDPixmap::kFilterCubic, CPUF_SUPPORTS_SSE2, ColFactoryCubic2<VDResamplerSeparableCubicColStageSSE2> },
+ { kVDPixType_8888, true, nsVDPixmap::kFilterCubic, CPUF_SUPPORTS_MMX, ColFactoryCubic2<VDResamplerSeparableCubicColStageMMX> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterCubic, CPUF_SUPPORTS_SSE2, ColFactoryCubic<VDResamplerSeparableTableColStageSSE2> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterCubic, CPUF_SUPPORTS_MMX, ColFactoryCubic<VDResamplerSeparableTableColStageMMX> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterLanczos3, CPUF_SUPPORTS_SSE41, ColFactoryLanczos3<VDResamplerSeparableTableColStage8SSE41> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterLanczos3, CPUF_SUPPORTS_MMX, ColFactoryLanczos3<VDResamplerSeparableTableColStage8MMX> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterLanczos3, CPUF_SUPPORTS_SSE2, ColFactoryLanczos3<VDResamplerSeparableTableColStageSSE2> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterLanczos3, CPUF_SUPPORTS_MMX, ColFactoryLanczos3<VDResamplerSeparableTableColStageMMX> },
+#elif defined _M_AMD64
+ // AMD64
+ { kVDPixType_8888, false, nsVDPixmap::kFilterLinear, CPUF_SUPPORTS_SSE2, ColFactoryLinear<VDResamplerSeparableTableColStageSSE2> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterCubic, CPUF_SUPPORTS_SSE2, ColFactoryCubic<VDResamplerSeparableTableColStageSSE2> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterLanczos3, CPUF_SUPPORTS_SSE2, ColFactoryLanczos3<VDResamplerSeparableTableColStageSSE2> },
+#endif
+ // Generic
+ { kVDPixType_8, true, nsVDPixmap::kFilterLinear, 0, ColFactory<VDResamplerColStageSeparableLinear8> },
+ { kVDPixType_8888, true, nsVDPixmap::kFilterLinear, 0, ColFactory<VDResamplerColStageSeparableLinear32> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterLinear, 0, ColFactoryLinear<VDResamplerColStageSeparableTable8> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterLinear, 0, ColFactoryLinear<VDResamplerColStageSeparableTable32> },
+ { kVDPixType_32F_LE, false, nsVDPixmap::kFilterLinear, 0, ColFactoryLinear<VDResamplerColStageSeparableTable32F> },
+ { kVDPixType_32Fx4_LE, false, nsVDPixmap::kFilterLinear, 0, ColFactoryLinear<VDResamplerColStageSeparableTable32Fx4> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterCubic, 0, ColFactoryCubic<VDResamplerColStageSeparableTable8> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterCubic, 0, ColFactoryCubic<VDResamplerColStageSeparableTable32> },
+ { kVDPixType_32F_LE, false, nsVDPixmap::kFilterCubic, 0, ColFactoryCubic<VDResamplerColStageSeparableTable32F> },
+ { kVDPixType_32Fx4_LE, false, nsVDPixmap::kFilterCubic, 0, ColFactoryCubic<VDResamplerColStageSeparableTable32Fx4> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterLanczos3, 0, ColFactoryLanczos3<VDResamplerColStageSeparableTable8> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterLanczos3, 0, ColFactoryLanczos3<VDResamplerColStageSeparableTable32> },
+ { kVDPixType_32F_LE, false, nsVDPixmap::kFilterLanczos3, 0, ColFactoryLanczos3<VDResamplerColStageSeparableTable32F> },
+ { kVDPixType_32Fx4_LE, false, nsVDPixmap::kFilterLanczos3, 0, ColFactoryLanczos3<VDResamplerColStageSeparableTable32Fx4> },
+ };
+
+ long flags = CPUGetEnabledExtensions();
+ uint32 type = src->GetType(srcIndex) & kVDPixType_Mask;
+ for(int i=0; i<sizeof(kSpanRoutines)/sizeof(kSpanRoutines[0]); ++i) {
+ const SpanRoutine& rout = kSpanRoutines[i];
+
+ if (rout.mType != type)
+ continue;
+
+ if (rout.mbInterpOnly && y_2fc < 1.0)
+ continue;
+
+ if (rout.mFilterMode != filterMode)
+ continue;
+
+ if ((rout.mCPUFlags & flags) != rout.mCPUFlags)
+ continue;
+
+ mpColStage = rout.mpClassFactory(y_2fc, filterFactor);
+ break;
+ }
+
+ mWinSize = mpColStage ? mpColStage->GetWindowSize() : 1;
+ mWindow.resize(mWinSize);
+
+ int delta = (mWinSize + 1) >> 1;
+ mpSrc->AddWindowRequest(-delta, delta);
+
+ sint32 fsy1 = (sint32)(offset * 65536.0) - ((mWinSize-1)<<15);
+ mAxis.Compute(height, fsy1, mSrcHeight, mWinSize);
+ mHeight = height;
+
+ switch(type) {
+ case kVDPixType_8:
+ mBytesPerSample = 1;
+ break;
+ case kVDPixType_8888:
+ case kVDPixType_32F_LE:
+ mBytesPerSample = 4;
+ break;
+ case kVDPixType_32Fx4_LE:
+ mBytesPerSample = 16;
+ break;
+
+ default:
+ VDASSERT(false);
+ }
+}
+
+void VDPixmapGenResampleCol::Start() {
+ mBytesPerRow = mWidth * mBytesPerSample;
+ StartWindow(mBytesPerRow);
+}
+
+void VDPixmapGenResampleCol::Compute(void *dst0, sint32 y) {
+ const uint32 winsize = mWinSize;
+ const uint32 dx = mSrcWidth;
+
+ y -= (sint32)mAxis.dx_precopy;
+
+ if (y < 0) {
+ const void *srcrow0 = mpSrc->GetRow(0, mSrcIndex);
+ memcpy(dst0, srcrow0, mBytesPerRow);
+ return;
+ }
+
+ uint32 midrange = mAxis.dx_preclip + mAxis.dx_active + mAxis.dx_postclip + mAxis.dx_dualclip;
+
+ if (y < (sint32)midrange) {
+ sint32 v = mAxis.u + mAxis.dudx * y;
+
+ if (mpColStage) {
+ for(uint32 i=0; i<winsize; ++i) {
+ int sy = (v >> 16) + i;
+
+ if ((unsigned)sy >= (unsigned)mSrcHeight)
+ sy = (~sy >> 31) & (mSrcHeight - 1);
+
+ mWindow[i] = mpSrc->GetRow(sy, mSrcIndex);
+ }
+
+ mpColStage->Process(dst0, mWindow.data(), dx, v);
+ } else
+ memcpy(dst0, mpSrc->GetRow(v >> 16, mSrcIndex), mBytesPerRow);
+ return;
+ }
+
+ const void *p = mpSrc->GetRow(mSrcHeight - 1, mSrcIndex);
+
+ memcpy(dst0, p, mBytesPerRow);
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/uberblit_resample_special.cpp b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_resample_special.cpp
new file mode 100644
index 000000000..0c649dd5c
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_resample_special.cpp
@@ -0,0 +1,186 @@
+#include "uberblit_resample_special.h"
+#include "blt_spanutils.h"
+
+///////////////////////////////////////////////////////////////////////////////
+
+void VDPixmapGenResampleRow_d2_p0_lin_u8::Init(IVDPixmapGen *src, uint32 srcIndex) {
+ InitSource(src, srcIndex);
+ src->AddWindowRequest(0, 0);
+
+ mWidth = (mSrcWidth + 1) >> 1;
+}
+
+void VDPixmapGenResampleRow_d2_p0_lin_u8::Start() {
+ mpSrc->Start();
+ StartWindow(mWidth);
+}
+
+void VDPixmapGenResampleRow_d2_p0_lin_u8::Compute(void *dst0, sint32 y) {
+ const uint8 *src = (const uint8 *)mpSrc->GetRow(y, mSrcIndex);
+
+ nsVDPixmapSpanUtils::horiz_compress2x_coaligned((uint8 *)dst0, src, mSrcWidth);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+void VDPixmapGenResampleRow_d4_p0_lin_u8::Init(IVDPixmapGen *src, uint32 srcIndex) {
+ InitSource(src, srcIndex);
+ src->AddWindowRequest(0, 0);
+
+ mWidth = (mSrcWidth + 3) >> 2;
+}
+
+void VDPixmapGenResampleRow_d4_p0_lin_u8::Start() {
+ mpSrc->Start();
+ StartWindow(mWidth);
+}
+
+void VDPixmapGenResampleRow_d4_p0_lin_u8::Compute(void *dst0, sint32 y) {
+ const uint8 *src = (const uint8 *)mpSrc->GetRow(y, mSrcIndex);
+
+ nsVDPixmapSpanUtils::horiz_compress4x_coaligned((uint8 *)dst0, src, mSrcWidth);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+void VDPixmapGenResampleRow_x2_p0_lin_u8::Init(IVDPixmapGen *src, uint32 srcIndex) {
+ InitSource(src, srcIndex);
+ src->AddWindowRequest(0, 0);
+
+ mWidth = mSrcWidth * 2;
+}
+
+void VDPixmapGenResampleRow_x2_p0_lin_u8::Start() {
+ mpSrc->Start();
+ StartWindow(mWidth);
+}
+
+void VDPixmapGenResampleRow_x2_p0_lin_u8::Compute(void *dst0, sint32 y) {
+ const uint8 *src = (const uint8 *)mpSrc->GetRow(y, mSrcIndex);
+
+ nsVDPixmapSpanUtils::horiz_expand2x_coaligned((uint8 *)dst0, src, mWidth);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+void VDPixmapGenResampleRow_x4_p0_lin_u8::Init(IVDPixmapGen *src, uint32 srcIndex) {
+ InitSource(src, srcIndex);
+ src->AddWindowRequest(0, 0);
+
+ mWidth = mSrcWidth * 4;
+}
+
+void VDPixmapGenResampleRow_x4_p0_lin_u8::Start() {
+ mpSrc->Start();
+ StartWindow(mWidth);
+}
+
+void VDPixmapGenResampleRow_x4_p0_lin_u8::Compute(void *dst0, sint32 y) {
+ const uint8 *src = (const uint8 *)mpSrc->GetRow(y, mSrcIndex);
+
+ nsVDPixmapSpanUtils::horiz_expand4x_coaligned((uint8 *)dst0, src, mWidth);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+void VDPixmapGenResampleCol_x2_phalf_lin_u8::Init(IVDPixmapGen *src, uint32 srcIndex) {
+ InitSource(src, srcIndex);
+ src->AddWindowRequest(-2, 2);
+
+ mHeight = (mSrcHeight + 1) >> 1;
+}
+
+void VDPixmapGenResampleCol_x2_phalf_lin_u8::Start() {
+ mpSrc->Start();
+ StartWindow(mWidth);
+}
+
+void VDPixmapGenResampleCol_x2_phalf_lin_u8::Compute(void *dst0, sint32 y) {
+ sint32 y2 = y+y;
+ const uint8 *src[4] = {
+ (const uint8 *)mpSrc->GetRow(y2 > 0 ? y2-1 : 0, mSrcIndex),
+ (const uint8 *)mpSrc->GetRow(y2 , mSrcIndex),
+ (const uint8 *)mpSrc->GetRow(y2+1, mSrcIndex),
+ (const uint8 *)mpSrc->GetRow(y2+2, mSrcIndex)
+ };
+
+ nsVDPixmapSpanUtils::vert_compress2x_centered((uint8 *)dst0, src, mWidth, 0);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+void VDPixmapGenResampleCol_x4_p1half_lin_u8::Init(IVDPixmapGen *src, uint32 srcIndex) {
+ InitSource(src, srcIndex);
+ src->AddWindowRequest(-4, 4);
+
+ mHeight = (mSrcHeight + 2) >> 2;
+}
+
+void VDPixmapGenResampleCol_x4_p1half_lin_u8::Start() {
+ mpSrc->Start();
+ StartWindow(mWidth);
+}
+
+void VDPixmapGenResampleCol_x4_p1half_lin_u8::Compute(void *dst0, sint32 y) {
+ sint32 y4 = y*4;
+ const uint8 *src[8] = {
+ (const uint8 *)mpSrc->GetRow(y4 > 2 ? y4-2 : 0, mSrcIndex),
+ (const uint8 *)mpSrc->GetRow(y4 > 1 ? y4-1 : 0, mSrcIndex),
+ (const uint8 *)mpSrc->GetRow(y4 , mSrcIndex),
+ (const uint8 *)mpSrc->GetRow(y4+1, mSrcIndex),
+ (const uint8 *)mpSrc->GetRow(y4+2, mSrcIndex),
+ (const uint8 *)mpSrc->GetRow(y4+3, mSrcIndex),
+ (const uint8 *)mpSrc->GetRow(y4+4, mSrcIndex),
+ (const uint8 *)mpSrc->GetRow(y4+5, mSrcIndex)
+ };
+
+ nsVDPixmapSpanUtils::vert_compress4x_centered((uint8 *)dst0, src, mWidth, 0);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+void VDPixmapGenResampleCol_d2_pnqrtr_lin_u8::Init(IVDPixmapGen *src, uint32 srcIndex) {
+ InitSource(src, srcIndex);
+ src->AddWindowRequest(-1, 1);
+
+ mHeight = mSrcHeight * 2;
+}
+
+void VDPixmapGenResampleCol_d2_pnqrtr_lin_u8::Start() {
+ mpSrc->Start();
+ StartWindow(mWidth);
+}
+
+void VDPixmapGenResampleCol_d2_pnqrtr_lin_u8::Compute(void *dst0, sint32 y) {
+ sint32 y2 = (y - 1) >> 1;
+ const uint8 *src[2] = {
+ (const uint8 *)mpSrc->GetRow(y2, mSrcIndex),
+ (const uint8 *)mpSrc->GetRow(y2+1, mSrcIndex),
+ };
+
+ nsVDPixmapSpanUtils::vert_expand2x_centered((uint8 *)dst0, src, mWidth, ~y << 7);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+void VDPixmapGenResampleCol_d4_pn38_lin_u8::Init(IVDPixmapGen *src, uint32 srcIndex) {
+ InitSource(src, srcIndex);
+ src->AddWindowRequest(-1, 1);
+
+ mHeight = mSrcHeight * 4;
+}
+
+void VDPixmapGenResampleCol_d4_pn38_lin_u8::Start() {
+ mpSrc->Start();
+ StartWindow(mWidth);
+}
+
+void VDPixmapGenResampleCol_d4_pn38_lin_u8::Compute(void *dst0, sint32 y) {
+ sint32 y2 = (y - 2) >> 2;
+ const uint8 *src[2] = {
+ (const uint8 *)mpSrc->GetRow(y2, mSrcIndex),
+ (const uint8 *)mpSrc->GetRow(y2+1, mSrcIndex),
+ };
+
+ nsVDPixmapSpanUtils::vert_expand4x_centered((uint8 *)dst0, src, mWidth, (y - 2) << 6);
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/uberblit_resample_special_x86.cpp b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_resample_special_x86.cpp
new file mode 100644
index 000000000..b1828fcca
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_resample_special_x86.cpp
@@ -0,0 +1,35 @@
+#include "uberblit_resample_special_x86.h"
+#include "blt_spanutils.h"
+#include "blt_spanutils_x86.h"
+
+void VDPixmapGenResampleRow_x2_p0_lin_u8_ISSE::Compute(void *dst0, sint32 y) {
+ const uint8 *src = (const uint8 *)mpSrc->GetRow(y, mSrcIndex);
+
+ nsVDPixmapSpanUtils::horiz_expand2x_coaligned_ISSE((uint8 *)dst0, src, mWidth);
+}
+
+void VDPixmapGenResampleRow_x4_p0_lin_u8_MMX::Compute(void *dst0, sint32 y) {
+ const uint8 *src = (const uint8 *)mpSrc->GetRow(y, mSrcIndex);
+
+ nsVDPixmapSpanUtils::horiz_expand4x_coaligned_MMX((uint8 *)dst0, src, mWidth);
+}
+
+void VDPixmapGenResampleCol_d2_pnqrtr_lin_u8_ISSE::Compute(void *dst0, sint32 y) {
+ sint32 y2 = (y - 1) >> 1;
+ const uint8 *src[2] = {
+ (const uint8 *)mpSrc->GetRow(y2, mSrcIndex),
+ (const uint8 *)mpSrc->GetRow(y2+1, mSrcIndex),
+ };
+
+ nsVDPixmapSpanUtils::vert_expand2x_centered_ISSE((uint8 *)dst0, src, mWidth, ~y << 7);
+}
+
+void VDPixmapGenResampleCol_d4_pn38_lin_u8_ISSE::Compute(void *dst0, sint32 y) {
+ sint32 y2 = (y - 2) >> 2;
+ const uint8 *src[2] = {
+ (const uint8 *)mpSrc->GetRow(y2, mSrcIndex),
+ (const uint8 *)mpSrc->GetRow(y2+1, mSrcIndex),
+ };
+
+ nsVDPixmapSpanUtils::vert_expand4x_centered_ISSE((uint8 *)dst0, src, mWidth, (y - 2) << 6);
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/uberblit_swizzle.cpp b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_swizzle.cpp
new file mode 100644
index 000000000..4cb5e4409
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_swizzle.cpp
@@ -0,0 +1,89 @@
+#include "uberblit_swizzle.h"
+
+void VDPixmapGen_Swap8In16::Init(IVDPixmapGen *gen, int srcIndex, uint32 w, uint32 h, uint32 bpr) {
+ InitSource(gen, srcIndex);
+ mRowLength = bpr;
+ SetOutputSize(w, h);
+ gen->AddWindowRequest(0, 0);
+}
+
+void VDPixmapGen_Swap8In16::Start() {
+ StartWindow(mRowLength);
+}
+
+uint32 VDPixmapGen_Swap8In16::GetType(uint32 index) const {
+ return mpSrc->GetType(mSrcIndex);
+}
+
+void VDPixmapGen_Swap8In16::Compute(void *dst0, sint32 y) {
+ const uint8 *src = (const uint8 *)mpSrc->GetRow(y, mSrcIndex);
+ uint8 *dst = (uint8 *)dst0;
+ sint32 w = mRowLength;
+
+ uint32 n4 = w >> 2;
+
+ for(uint32 i=0; i<n4; ++i) {
+ uint32 p = *(uint32 *)src;
+ src += 4;
+
+ uint32 r = ((p & 0xff00ff00) >> 8) + ((p & 0x00ff00ff) << 8);
+
+ *(uint32 *)dst = r;
+ dst += 4;
+ }
+
+ if (w & 2) {
+ dst[0] = src[1];
+ dst[1] = src[0];
+ dst += 2;
+ src += 2;
+ }
+
+ if (w & 1) {
+ *dst = *src;
+ }
+}
+
+/////////////////////////////////////////////////////////////////////////////
+
+void VDPixmapGen_B8x2_To_B8R8::Init(IVDPixmapGen *srcCb, uint32 srcindexCb, IVDPixmapGen *srcCr, uint32 srcindexCr) {
+ mpSrcCb = srcCb;
+ mSrcIndexCb = srcindexCb;
+ mpSrcCr = srcCr;
+ mSrcIndexCr = srcindexCr;
+ mWidth = srcCb->GetWidth(srcindexCb);
+ mHeight = srcCb->GetHeight(srcindexCb);
+
+ srcCb->AddWindowRequest(0, 0);
+ srcCr->AddWindowRequest(0, 0);
+}
+
+void VDPixmapGen_B8x2_To_B8R8::Start() {
+ mpSrcCb->Start();
+ mpSrcCr->Start();
+
+ StartWindow(mWidth * 2);
+}
+
+uint32 VDPixmapGen_B8x2_To_B8R8::GetType(uint32 output) const {
+ return (mpSrcCb->GetType(mSrcIndexCb) & ~kVDPixType_Mask) | kVDPixType_B8R8;
+}
+
+void VDPixmapGen_B8x2_To_B8R8::Compute(void *dst0, sint32 y) {
+ uint8 *VDRESTRICT dst = (uint8 *)dst0;
+ const uint8 *VDRESTRICT srcCb = (const uint8 *)mpSrcCb->GetRow(y, mSrcIndexCb);
+ const uint8 *VDRESTRICT srcCr = (const uint8 *)mpSrcCr->GetRow(y, mSrcIndexCr);
+
+ sint32 w = mWidth;
+ for(sint32 x=0; x<w; ++x) {
+ uint8 cb = srcCb[0];
+ uint8 cr = srcCr[0];
+
+ dst[0] = cb;
+ dst[1] = cr;
+
+ ++srcCb;
+ ++srcCr;
+ dst += 2;
+ }
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/uberblit_swizzle_x86.cpp b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_swizzle_x86.cpp
new file mode 100644
index 000000000..3a87d5a68
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_swizzle_x86.cpp
@@ -0,0 +1,400 @@
+#include "uberblit_swizzle_x86.h"
+
+#ifdef VD_COMPILER_MSVC
+ #pragma warning(disable: 4799) // warning C4799: function 'vdasm_extract_8in16_even_MMX' has no EMMS instruction
+#endif
+
+// Copies the even (low) byte of each 16-bit unit of src[] to dst[], writing
+// 'count' bytes. __fastcall: ecx = dst, edx = src, count at [esp+4].
+// Deliberately leaves MMX state dirty (no EMMS) -- see the pragma above;
+// the caller is responsible for cleanup.
+void __declspec(naked) __fastcall vdasm_extract_8in16_even_MMX(void *dst, const void *src, uint32 count) {
+	__asm {
+		mov eax, [esp+4]	; eax = remaining output count
+		pcmpeqb mm2, mm2
+		psrlw mm2, 8		; mm2 = 0x00ff per word (low-byte mask)
+		sub eax, 8
+		jc xtra			; fewer than 8 outputs: scalar path only
+xloop:
+		; 16 source bytes -> 8 masked+packed destination bytes per pass
+		movq mm0, [edx]
+		movq mm1, [edx+8]
+		pand mm0, mm2
+		pand mm1, mm2
+		packuswb mm0, mm1
+		add edx, 16
+		movq [ecx], mm0
+		add ecx, 8
+		sub eax, 8
+		jns xloop
+xtra:
+		add eax, 8		; restore count of leftover bytes (0..7)
+		jz fin
+		push ebx
+xtraloop:
+		; scalar tail: copy every other source byte
+		mov bl, [edx]
+		add edx, 2
+		mov [ecx], bl
+		add ecx, 1
+		sub eax, 1
+		jnz xtraloop
+
+		pop ebx
+fin:
+		ret 4			; pop the single stack argument
+	}
+}
+
+// Copies the odd (high) byte of each 16-bit unit of src[] to dst[], writing
+// 'count' bytes. __fastcall: ecx = dst, edx = src, count at [esp+4].
+// No EMMS; MMX state is cleaned up by the caller.
+void __declspec(naked) __fastcall vdasm_extract_8in16_odd_MMX(void *dst, const void *src, uint32 count) {
+	__asm {
+		mov eax, [esp+4]	; eax = remaining output count
+		sub eax, 8
+		jc xtra			; fewer than 8 outputs: scalar path only
+xloop:
+		; shift the high byte of each word down, then pack 16 -> 8 bytes
+		movq mm0, [edx]
+		movq mm1, [edx+8]
+		psrlw mm0, 8
+		psrlw mm1, 8
+		add edx, 16
+		packuswb mm0, mm1
+		movq [ecx], mm0
+		add ecx, 8
+		sub eax, 8
+		jns xloop
+xtra:
+		add eax, 8		; restore count of leftover bytes (0..7)
+		jz fin
+		push ebx
+xtraloop:
+		; scalar tail: copy every other source byte, offset by one
+		mov bl, [edx+1]
+		add edx, 2
+		mov [ecx], bl
+		add ecx, 1
+		sub eax, 1
+		jnz xtraloop
+
+		pop ebx
+fin:
+		ret 4			; pop the single stack argument
+	}
+}
+
+// Extracts byte lane 'byteshift' (0..3) of each 32-bit unit of src[] into
+// dst[], writing 'count' bytes. __fastcall: ecx = dst, edx = src,
+// count at [esp+4], byteshift at [esp+8]. No EMMS; caller cleans up.
+void __declspec(naked) __fastcall vdasm_extract_8in32_MMX(void *dst, const void *src, uint32 count, int byteshift) {
+	__asm {
+		movd mm4, [esp+8]
+		pcmpeqb mm5, mm5
+		pslld mm4, 3		; mm4 = byteshift * 8 = bit shift per dword
+		mov eax, [esp+4]
+		psrld mm5, 24		; mm5 = 0x000000ff per dword (byte mask)
+		sub eax, 8
+		jc xtra
+xloop:
+		; 32 source bytes -> 8 destination bytes per pass:
+		; shift lane to bit 0, mask, then pack dwords->words->bytes
+		movq mm0, [edx]
+		movq mm1, [edx+8]
+		psrld mm0, mm4
+		movq mm2, [edx+16]
+		psrld mm1, mm4
+		pand mm0, mm5
+		movq mm3, [edx+24]
+		psrld mm2, mm4
+		pand mm1, mm5
+		packssdw mm0, mm1
+		psrld mm3, mm4
+		pand mm2, mm5
+		pand mm3, mm5
+		add edx, 32
+		packssdw mm2, mm3
+		packuswb mm0, mm2
+		movq [ecx], mm0
+		add ecx, 8
+		sub eax, 8
+		jns xloop
+xtra:
+		add eax, 8		; restore count of leftover bytes (0..7)
+		jz fin
+		add edx, dword ptr [esp+8]	; point at the selected byte lane
+		push ebx
+xtraloop:
+		mov bl, [edx]
+		add edx, 4
+		mov [ecx], bl
+		add ecx, 1
+		sub eax, 1
+		jnz xtraloop
+
+		pop ebx
+fin:
+		ret 8			; pop the two stack arguments
+	}
+}
+
+// Swaps the two bytes of each 16-bit unit: src[] -> dst[], 'count' bytes.
+// A trailing odd byte is copied unchanged. __fastcall: ecx = dst,
+// edx = src, count at [esp+4]. No EMMS; caller cleans up.
+void __declspec(naked) __fastcall vdasm_swap_8in16_MMX(void *dst, const void *src, uint32 count) {
+	__asm {
+		mov eax, [esp+4]	; eax = remaining byte count
+		sub eax, 8
+		js xtra
+xloop:
+		; swap within each word: (x << 8) + (x >> 8)
+		movq mm0, [edx]
+		add edx, 8
+		movq mm1, mm0
+		psllw mm0, 8
+		psrlw mm1, 8
+		paddb mm0, mm1
+		movq [ecx], mm0
+		add ecx, 8
+		sub eax, 8
+		jns xloop
+xtra:
+		add eax, 6		; eax = leftover count - 2
+		js nopairs
+		push ebx
+pairloop:
+		; scalar tail: swap one byte pair per iteration
+		mov bl, [edx]
+		mov bh, [edx+1]
+		add edx, 2
+		mov [ecx], bh
+		mov [ecx+1], bl
+		add ecx, 2
+		sub eax, 2
+		jns pairloop
+		pop ebx
+nopairs:
+		add eax, 2		; eax = 1 iff an odd byte remains
+		jz noodd
+		mov al, [edx]
+		mov [ecx], al
+noodd:
+		ret 4			; pop the single stack argument
+	}
+}
+
+// Interleaves three planes into B G R G groups (4 output bytes per count
+// unit); with Cr/Y/Cb passed as R/G/B this produces UYVY. The G plane is
+// consumed at twice the rate of R and B. __fastcall: ecx = dst, edx = srcR;
+// stack: srcG [esp+4], srcB [esp+8], count [esp+12]. No EMMS.
+void __declspec(naked) __fastcall vdasm_interleave_BGRG_MMX(void *dst, const void *srcR, const void *srcG, const void *srcB, uint32 count) {
+	__asm {
+		push edi
+		push esi
+		push ebx
+		mov esi, [esp+12+12]	; esi = count (groups)
+		mov edi, [esp+8+12]	; edi = srcB
+		mov ebx, [esp+4+12]	; ebx = srcG
+		sub esi, 4
+		jc xtra
+		; ecx = dst
+		; edx = srcR
+		; ebx = srcG
+		; edi = srcB
+xloop:
+		; build B,R pairs then interleave with G: 4 groups (16 bytes) per pass
+		movd mm0, [edi]
+		movd mm1, [edx]
+		punpcklbw mm0, mm1
+		movq mm1, [ebx]
+		movq mm2, mm0
+		punpcklbw mm0, mm1
+		add edx, 4
+		punpckhbw mm2, mm1
+		add edi, 4
+		movq [ecx], mm0
+		add ebx, 8
+		movq [ecx+8], mm2
+		add ecx, 16
+		sub esi, 4
+		jns xloop
+xtra:
+		add esi, 4		; restore leftover group count (0..3)
+		jz fin
+xtraloop:
+		; scalar tail: emit one B G R G group
+		mov al, [edi]
+		mov [ecx], al
+		mov al, [ebx]
+		mov [ecx+1], al
+		mov al, [edx]
+		mov [ecx+2], al
+		mov al, [ebx+1]
+		mov [ecx+3], al
+		add ebx, 2
+		add edx, 1
+		add edi, 1
+		add ecx, 4
+		sub esi, 1
+		jnz xtraloop
+fin:
+		pop ebx
+		pop esi
+		pop edi
+		ret 12			; pop the three stack arguments
+	}
+}
+
+// Interleaves three planes into G B G R groups (4 output bytes per count
+// unit); with Cr/Y/Cb passed as R/G/B this produces YUY2. The G plane is
+// consumed at twice the rate of R and B. __fastcall: ecx = dst, edx = srcR;
+// stack: srcG [esp+4], srcB [esp+8], count [esp+12]. No EMMS.
+void __declspec(naked) __fastcall vdasm_interleave_GBGR_MMX(void *dst, const void *srcR, const void *srcG, const void *srcB, uint32 count) {
+	__asm {
+		push edi
+		push esi
+		push ebx
+		mov esi, [esp+12+12]	; esi = count (groups)
+		mov edi, [esp+8+12]	; edi = srcB
+		mov ebx, [esp+4+12]	; ebx = srcG
+		sub esi, 4
+		jc xtra
+		; ecx = dst
+		; edx = srcR
+		; ebx = srcG
+		; edi = srcB
+xloop:
+		; build B,R pairs then interleave under G: 4 groups (16 bytes) per pass
+		movd mm0, [edi]
+		movd mm1, [edx]
+		punpcklbw mm0, mm1
+		movq mm2, [ebx]
+		movq mm1, mm2
+		punpcklbw mm2, mm0
+		add edx, 4
+		punpckhbw mm1, mm0
+		add edi, 4
+		movq [ecx], mm2
+		add ebx, 8
+		movq [ecx+8], mm1
+		add ecx, 16
+		sub esi, 4
+		jns xloop
+xtra:
+		add esi, 4		; restore leftover group count (0..3)
+		jz fin
+xtraloop:
+		; scalar tail: emit one G B G R group
+		mov al, [ebx]
+		mov [ecx], al
+		mov al, [edi]
+		mov [ecx+1], al
+		mov al, [ebx+1]
+		mov [ecx+2], al
+		mov al, [edx]
+		mov [ecx+3], al
+		add ebx, 2
+		add edx, 1
+		add edi, 1
+		add ecx, 4
+		sub esi, 1
+		jnz xtraloop
+fin:
+		pop ebx
+		pop esi
+		pop edi
+		ret 12			; pop the three stack arguments
+	}
+}
+
+// Interleaves two planes into B R byte pairs (2 output bytes per count
+// unit). __fastcall: ecx = dst, edx = srcB; stack: srcR [esp+4],
+// count [esp+8]. No EMMS; caller cleans up.
+// NOTE(review): the inline comment below labels ebx as "srcG", but it is
+// loaded from the srcR argument -- the label looks like a copy/paste from
+// the three-plane routines above.
+void __declspec(naked) __fastcall vdasm_interleave_BR_MMX(void *dst, const void *srcB, const void *srcR, uint32 count) {
+	__asm {
+		push edi		; edi is saved but unused in this routine
+		push esi
+		push ebx
+		mov esi, [esp+8+12]	; esi = count (pairs)
+		mov ebx, [esp+4+12]	; ebx = srcR
+		sub esi, 8
+		jc xtra
+		; ecx = dst
+		; edx = srcB
+		; ebx = srcG
+xloop:
+		; interleave 8 B bytes with 8 R bytes -> 16 output bytes per pass
+		movq mm0, [edx]
+		movq mm1, [ebx]
+		movq mm2, mm0
+		punpcklbw mm0, mm1
+		punpckhbw mm2, mm1
+		add edx, 8
+		movq [ecx], mm0
+		add ebx, 8
+		movq [ecx+8], mm2
+		add ecx, 16
+		sub esi, 8
+		jns xloop
+xtra:
+		add esi, 8		; restore leftover pair count (0..7)
+		jz fin
+xtraloop:
+		; scalar tail: emit one B R pair
+		mov al, [edx]
+		mov [ecx], al
+		mov al, [ebx]
+		mov [ecx+1], al
+		add ebx, 1
+		add edx, 1
+		add ecx, 2
+		sub esi, 1
+		jnz xtraloop
+fin:
+		pop ebx
+		pop esi
+		pop edi
+		ret 8			; pop the two stack arguments
+	}
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// MMX row kernel: extract the even (low) byte of each 16-bit source unit.
+void VDPixmapGen_8In16_Even_MMX::Compute(void *dst, sint32 y) {
+	const uint8 *srcp = (const uint8 *)mpSrc->GetRow(y, mSrcIndex);
+
+	vdasm_extract_8in16_even_MMX(dst, srcp, mWidth);
+}
+
+// MMX row kernel: extract the odd (high) byte of each 16-bit source unit.
+void VDPixmapGen_8In16_Odd_MMX::Compute(void *dst, sint32 y) {
+	const uint8 *srcp = (const uint8 *)mpSrc->GetRow(y, mSrcIndex);
+
+	vdasm_extract_8in16_odd_MMX(dst, srcp, mWidth);
+}
+
+// MMX row kernel: extract byte lane mOffset (0..3) of each 32-bit unit.
+void VDPixmapGen_8In32_MMX::Compute(void *dst, sint32 y) {
+	const uint8 *srcp = (const uint8 *)mpSrc->GetRow(y, mSrcIndex);
+
+	vdasm_extract_8in32_MMX(dst, srcp, mWidth, mOffset);
+}
+
+// MMX row kernel: swap the bytes of each 16-bit unit across mRowLength bytes.
+void VDPixmapGen_Swap8In16_MMX::Compute(void *dst, sint32 y) {
+	const uint8 *src = (const uint8 *)mpSrc->GetRow(y, mSrcIndex);
+
+	vdasm_swap_8in16_MMX(dst, src, mRowLength);
+}
+
+// MMX row kernel: interleave the Cb and Cr planes into CbCr byte pairs.
+void VDPixmapGen_B8x2_To_B8R8_MMX::Compute(void *dst0, sint32 y) {
+	uint8 *VDRESTRICT dst = (uint8 *VDRESTRICT)dst0;
+	const uint8 *VDRESTRICT srcCb = (const uint8 *VDRESTRICT)mpSrcCb->GetRow(y, mSrcIndexCb);
+	const uint8 *VDRESTRICT srcCr = (const uint8 *VDRESTRICT)mpSrcCr->GetRow(y, mSrcIndexCr);
+
+	vdasm_interleave_BR_MMX(dst, srcCb, srcCr, mWidth);
+}
+
+// Packs planar Y/Cb/Cr into YUY2-style Y Cb Y Cr groups. The MMX routine
+// handles mWidth >> 1 full pixel pairs; an odd trailing pixel is emitted
+// by hand as Y Cb 0 Cr.
+void VDPixmapGen_B8x3_To_G8B8_G8R8_MMX::Compute(void *VDRESTRICT dst0, sint32 y) {
+	uint8 *VDRESTRICT dst = (uint8 *VDRESTRICT)dst0;
+	const uint8 *VDRESTRICT srcY = (const uint8 *VDRESTRICT)mpSrcY->GetRow(y, mSrcIndexY);
+	const uint8 *VDRESTRICT srcCb = (const uint8 *VDRESTRICT)mpSrcCb->GetRow(y, mSrcIndexCb);
+	const uint8 *VDRESTRICT srcCr = (const uint8 *VDRESTRICT)mpSrcCr->GetRow(y, mSrcIndexCr);
+
+	vdasm_interleave_GBGR_MMX(dst, srcCr, srcY, srcCb, mWidth >> 1);
+
+	if (mWidth & 1) {
+		// Reposition from the original row pointers: the locals were not
+		// advanced by the asm call. dst lands two bytes past the asm output,
+		// so dst[-2..1] covers the final four-byte group.
+		int w2 = mWidth >> 1;
+		srcY += mWidth;
+		srcCb += w2;
+		srcCr += w2;
+		dst += mWidth * 2;
+
+		dst[-2] = srcY[-1];
+		dst[-1] = srcCb[0];
+		dst[ 0] = 0; // must be zero for QuickTime compatibility
+		dst[ 1] = srcCr[0];
+	}
+}
+
+// Packs planar Y/Cb/Cr into UYVY-style Cb Y Cr Y groups. The MMX routine
+// handles mWidth >> 1 full pixel pairs; an odd trailing pixel is emitted
+// by hand as Cb Y Cr 0.
+void VDPixmapGen_B8x3_To_B8G8_R8G8_MMX::Compute(void *VDRESTRICT dst0, sint32 y) {
+	uint8 *VDRESTRICT dst = (uint8 *VDRESTRICT)dst0;
+	const uint8 *VDRESTRICT srcY = (const uint8 * VDRESTRICT)mpSrcY->GetRow(y, mSrcIndexY);
+	const uint8 *VDRESTRICT srcCb = (const uint8 * VDRESTRICT)mpSrcCb->GetRow(y, mSrcIndexCb);
+	const uint8 *VDRESTRICT srcCr = (const uint8 * VDRESTRICT)mpSrcCr->GetRow(y, mSrcIndexCr);
+
+	vdasm_interleave_BGRG_MMX(dst, srcCr, srcY, srcCb, mWidth >> 1);
+
+	if (mWidth & 1) {
+		// Reposition from the original row pointers: the locals were not
+		// advanced by the asm call. dst lands two bytes past the asm output,
+		// so dst[-2..1] covers the final four-byte group.
+		int w2 = mWidth >> 1;
+		srcY += mWidth;
+		srcCb += w2;
+		srcCr += w2;
+		dst += mWidth * 2;
+
+		dst[-2] = srcCb[0];
+		dst[-1] = srcY[-1];
+		dst[ 0] = srcCr[0];
+		dst[ 1] = 0; // must be zero for QuickTime compatibility
+	}
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/uberblit_v210.cpp b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_v210.cpp
new file mode 100644
index 000000000..78793f477
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_v210.cpp
@@ -0,0 +1,199 @@
+#include <vd2/system/halffloat.h>
+#include <vd2/system/math.h>
+#include "uberblit_v210.h"
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Packs three planar 32F channels into v210 (10-bit 4:2:2, three samples per
+// dword). Channel naming is generic: per the dword layout comments below,
+// R maps to Cr, G to Y (double rate), B to Cb. Each sample is clamped to
+// [0,1] and scaled by 1024 -- NOTE(review): scale is 1024 here while the
+// v210 unpacker below divides by 1023; confirm this asymmetry is intended
+// upstream.
+void VDPixmapGen_32F_To_V210::Compute(void *dst0, sint32 y) {
+	uint32 *dst = (uint32 *)dst0;
+	const float *srcR = (const float *)mpSrcR->GetRow(y, mSrcIndexR);
+	const float *srcG = (const float *)mpSrcG->GetRow(y, mSrcIndexG);
+	const float *srcB = (const float *)mpSrcB->GetRow(y, mSrcIndexB);
+
+	// Clear MMX/x87 interaction before doing float math.
+	VDCPUCleanupExtensions();
+
+	// Main loop: each group of 6 pixels produces 4 output dwords and
+	// consumes 6 G (luma) and 3 R/B (chroma) samples.
+	int w6 = mWidth / 6;
+	for(sint32 i=0; i<w6; ++i) {
+		float r0 = srcR[0];
+		float r1 = srcR[1];
+		float r2 = srcR[2];
+		srcR += 3;
+
+		float b0 = srcB[0];
+		float b1 = srcB[1];
+		float b2 = srcB[2];
+		srcB += 3;
+
+		float g0 = srcG[0];
+		float g1 = srcG[1];
+		float g2 = srcG[2];
+		float g3 = srcG[3];
+		float g4 = srcG[4];
+		float g5 = srcG[5];
+		srcG += 6;
+
+		// Clamp all samples to [0, 1] before quantization.
+		if (r0 < 0.0f) r0 = 0.0f; else if (r0 > 1.0f) r0 = 1.0f;
+		if (r1 < 0.0f) r1 = 0.0f; else if (r1 > 1.0f) r1 = 1.0f;
+		if (r2 < 0.0f) r2 = 0.0f; else if (r2 > 1.0f) r2 = 1.0f;
+		if (g0 < 0.0f) g0 = 0.0f; else if (g0 > 1.0f) g0 = 1.0f;
+		if (g1 < 0.0f) g1 = 0.0f; else if (g1 > 1.0f) g1 = 1.0f;
+		if (g2 < 0.0f) g2 = 0.0f; else if (g2 > 1.0f) g2 = 1.0f;
+		if (g3 < 0.0f) g3 = 0.0f; else if (g3 > 1.0f) g3 = 1.0f;
+		if (g4 < 0.0f) g4 = 0.0f; else if (g4 > 1.0f) g4 = 1.0f;
+		if (g5 < 0.0f) g5 = 0.0f; else if (g5 > 1.0f) g5 = 1.0f;
+		if (b0 < 0.0f) b0 = 0.0f; else if (b0 > 1.0f) b0 = 1.0f;
+		if (b1 < 0.0f) b1 = 0.0f; else if (b1 > 1.0f) b1 = 1.0f;
+		if (b2 < 0.0f) b2 = 0.0f; else if (b2 > 1.0f) b2 = 1.0f;
+
+		uint32 ir0 = (uint32)VDRoundToIntFast(r0 * 1024.0f);
+		uint32 ir1 = (uint32)VDRoundToIntFast(r1 * 1024.0f);
+		uint32 ir2 = (uint32)VDRoundToIntFast(r2 * 1024.0f);
+		uint32 ib0 = (uint32)VDRoundToIntFast(b0 * 1024.0f);
+		uint32 ib1 = (uint32)VDRoundToIntFast(b1 * 1024.0f);
+		uint32 ib2 = (uint32)VDRoundToIntFast(b2 * 1024.0f);
+		uint32 ig0 = (uint32)VDRoundToIntFast(g0 * 1024.0f);
+		uint32 ig1 = (uint32)VDRoundToIntFast(g1 * 1024.0f);
+		uint32 ig2 = (uint32)VDRoundToIntFast(g2 * 1024.0f);
+		uint32 ig3 = (uint32)VDRoundToIntFast(g3 * 1024.0f);
+		uint32 ig4 = (uint32)VDRoundToIntFast(g4 * 1024.0f);
+		uint32 ig5 = (uint32)VDRoundToIntFast(g5 * 1024.0f);
+
+		// dword 0: XX Cr0 Y0 Cb0
+		// dword 1: XX Y2 Cb1 Y1
+		// dword 2: XX Cb2 Y3 Cr1
+		// dword 3: XX Y5 Cr2 Y4
+		dst[0] = (ir0 << 20) + (ig0 << 10) + ib0;
+		dst[1] = (ig2 << 20) + (ib1 << 10) + ig1;
+		dst[2] = (ib2 << 20) + (ig3 << 10) + ir1;
+		dst[3] = (ig5 << 20) + (ir2 << 10) + ig4;
+
+		dst += 4;
+	}
+
+	// Partial final group: missing samples are zero-filled.
+	int leftovers = mWidth - w6*6;
+	if (leftovers) {
+		float g0 = 0;
+		float g1 = 0;
+		float g2 = 0;
+		float g3 = 0;
+		float g4 = 0;
+		float r0 = 0;
+		float r1 = 0;
+		float r2 = 0;
+		float b0 = 0;
+		float b1 = 0;
+		float b2 = 0;
+
+		// Deliberate fallthrough: each case loads the samples that exist
+		// for that many leftover pixels.
+		switch(leftovers) {
+		case 5:	r2 = srcR[2];
+				b2 = srcB[2];
+				g4 = srcG[4];
+				// fall through
+		case 4:	g3 = srcG[3];
+				// fall through
+		case 3:	r1 = srcR[1];
+				b1 = srcB[1];
+				g2 = srcG[2];
+				// fall through
+		case 2:	g1 = srcG[1];
+				// fall through
+		case 1:	r0 = srcR[0];
+				b0 = srcB[0];
+				g0 = srcG[0];
+		}
+
+		if (r0 < 0.0f) r0 = 0.0f; else if (r0 > 1.0f) r0 = 1.0f;
+		if (r1 < 0.0f) r1 = 0.0f; else if (r1 > 1.0f) r1 = 1.0f;
+		if (r2 < 0.0f) r2 = 0.0f; else if (r2 > 1.0f) r2 = 1.0f;
+		if (g0 < 0.0f) g0 = 0.0f; else if (g0 > 1.0f) g0 = 1.0f;
+		if (g1 < 0.0f) g1 = 0.0f; else if (g1 > 1.0f) g1 = 1.0f;
+		if (g2 < 0.0f) g2 = 0.0f; else if (g2 > 1.0f) g2 = 1.0f;
+		if (g3 < 0.0f) g3 = 0.0f; else if (g3 > 1.0f) g3 = 1.0f;
+		if (g4 < 0.0f) g4 = 0.0f; else if (g4 > 1.0f) g4 = 1.0f;
+		if (b0 < 0.0f) b0 = 0.0f; else if (b0 > 1.0f) b0 = 1.0f;
+		if (b1 < 0.0f) b1 = 0.0f; else if (b1 > 1.0f) b1 = 1.0f;
+		if (b2 < 0.0f) b2 = 0.0f; else if (b2 > 1.0f) b2 = 1.0f;
+
+		uint32 ir0 = (uint32)VDRoundToIntFast(r0 * 1024.0f);
+		uint32 ir1 = (uint32)VDRoundToIntFast(r1 * 1024.0f);
+		uint32 ir2 = (uint32)VDRoundToIntFast(r2 * 1024.0f);
+		uint32 ib0 = (uint32)VDRoundToIntFast(b0 * 1024.0f);
+		uint32 ib1 = (uint32)VDRoundToIntFast(b1 * 1024.0f);
+		uint32 ib2 = (uint32)VDRoundToIntFast(b2 * 1024.0f);
+		uint32 ig0 = (uint32)VDRoundToIntFast(g0 * 1024.0f);
+		uint32 ig1 = (uint32)VDRoundToIntFast(g1 * 1024.0f);
+		uint32 ig2 = (uint32)VDRoundToIntFast(g2 * 1024.0f);
+		uint32 ig3 = (uint32)VDRoundToIntFast(g3 * 1024.0f);
+		uint32 ig4 = (uint32)VDRoundToIntFast(g4 * 1024.0f);
+
+		// dword 0: XX Cr0 Y0 Cb0
+		// dword 1: XX Y2 Cb1 Y1
+		// dword 2: XX Cb2 Y3 Cr1
+		// dword 3: XX Y5 Cr2 Y4
+		dst[0] = (ir0 << 20) + (ig0 << 10) + ib0;
+		dst[1] = (ig2 << 20) + (ib1 << 10) + ig1;
+		dst[2] = (ib2 << 20) + (ig3 << 10) + ir1;
+		dst[3] = (ir2 << 10) + ig4;		// g5 cannot exist when leftovers <= 5
+		dst += 4;
+	}
+
+	// QuickTime defines the v210 format and requires zero padding in all unused samples.
+	int w48up = (mWidth + 23) / 24;
+	int w6up = (mWidth + 5) / 6;
+	int zeropad = w48up * 16 - w6up * 4;
+	memset(dst, 0, zeropad * 4);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Allocates the output window: three stacked float planes, each padded to a
+// multiple of 6 samples so whole v210 groups can be decoded without bounds
+// checks.
+void VDPixmapGen_V210_To_32F::Start() {
+	StartWindow(((mWidth + 5) / 6) * 6 * sizeof(float), 3);
+}
+
+// The three output planes are stored consecutively within one window row;
+// select the requested plane by offsetting by the window pitch.
+const void *VDPixmapGen_V210_To_32F::GetRow(sint32 y, uint32 index) {
+	return (const uint8 *)VDPixmapGenWindowBasedOneSource::GetRow(y, index) + mWindowPitch * index;
+}
+
+// Output 1 is the full-rate (luma) plane; outputs 0 and 2 are the 4:2:2
+// half-rate chroma planes, rounded up for odd widths.
+sint32 VDPixmapGen_V210_To_32F::GetWidth(int index) const {
+	return index == 1 ? mWidth : (mWidth + 1) >> 1;
+}
+
+// All outputs are 32-bit little-endian float samples; other type flags pass
+// through from the v210 source.
+uint32 VDPixmapGen_V210_To_32F::GetType(uint32 output) const {
+	return (mpSrc->GetType(mSrcIndex) & ~kVDPixType_Mask) | kVDPixType_32F_LE;
+}
+
+// Unpacks a v210 row into three planar float rows (R = Cr, G = Y, B = Cb
+// per the dword layout below), normalizing 10-bit samples by 1023. Whole
+// 4-dword groups are decoded; the window was padded to a multiple of 6
+// samples in Start(), so the final partial group writes into padding.
+// NOTE(review): this assumes the source row also holds ceil(w/6)*4 readable
+// dwords, which the v210 format guarantees.
+void VDPixmapGen_V210_To_32F::Compute(void *dst0, sint32 y) {
+	float *dstR = (float *)dst0;
+	float *dstG = (float *)((char *)dstR + mWindowPitch);
+	float *dstB = (float *)((char *)dstG + mWindowPitch);
+	const uint32 *src = (const uint32 *)mpSrc->GetRow(y, mSrcIndex);
+	uint32 w = (mWidth + 5) / 6;
+
+	// Clear MMX/x87 interaction before doing float math.
+	VDCPUCleanupExtensions();
+
+	// dword 0: XX Cr0 Y0 Cb0
+	// dword 1: XX Y2 Cb1 Y1
+	// dword 2: XX Cb2 Y3 Cr1
+	// dword 3: XX Y5 Cr2 Y4
+
+	for(uint32 i=0; i<w; ++i) {
+		const uint32 w0 = src[0];
+		const uint32 w1 = src[1];
+		const uint32 w2 = src[2];
+		const uint32 w3 = src[3];
+		src += 4;
+
+		dstB[0] = (float)( w0        & 0x3ff) / 1023.0f;
+		dstG[0] = (float)((w0 >> 10) & 0x3ff) / 1023.0f;
+		dstR[0] = (float)((w0 >> 20) & 0x3ff) / 1023.0f;
+		dstG[1] = (float)( w1        & 0x3ff) / 1023.0f;
+		dstB[1] = (float)((w1 >> 10) & 0x3ff) / 1023.0f;
+		dstG[2] = (float)((w1 >> 20) & 0x3ff) / 1023.0f;
+		dstR[1] = (float)( w2        & 0x3ff) / 1023.0f;
+		dstG[3] = (float)((w2 >> 10) & 0x3ff) / 1023.0f;
+		dstB[2] = (float)((w2 >> 20) & 0x3ff) / 1023.0f;
+		dstG[4] = (float)( w3        & 0x3ff) / 1023.0f;
+		dstR[2] = (float)((w3 >> 10) & 0x3ff) / 1023.0f;
+		dstG[5] = (float)((w3 >> 20) & 0x3ff) / 1023.0f;
+
+		dstR += 3;
+		dstG += 6;
+		dstB += 3;
+	}
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/uberblit_ycbcr_x86.cpp b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_ycbcr_x86.cpp
new file mode 100644
index 000000000..d34f731f1
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_ycbcr_x86.cpp
@@ -0,0 +1,35 @@
+#include "uberblit_ycbcr_x86.h"
+
+extern "C" void vdasm_pixblt_XRGB8888_to_YUV444Planar_scan_SSE2(void *dstY, void *dstCb, void *dstCr, const void *srcRGB, uint32 count, const void *coeffs);
+
+// Converts one XRGB8888 row to planar Rec.601 YCbCr via the external SSE2
+// scan routine. The three output planes share one window row: Cb first,
+// then Y, then Cr, each mWindowPitch apart.
+void VDPixmapGenRGB32ToYCbCr601_SSE2::Compute(void *dst0, sint32 y) {
+	uint8 *dstCb = (uint8 *)dst0;
+	uint8 *dstY = dstCb + mWindowPitch;
+	uint8 *dstCr = dstY + mWindowPitch;
+	const uint8 *srcRGB = (const uint8 *)mpSrc->GetRow(y, mSrcIndex);
+
+	// 16-byte-aligned coefficient block consumed by the asm routine; the
+	// memory layout (declaration order) is what matters, not the names.
+	// NOTE(review): the field names below do not match the initializer
+	// comments from the third row on (e.g. 'rb_to_cr' holds the "g to y"
+	// values). The asm reads by byte offset, so behavior follows the
+	// initializer order; the field names look mislabeled -- verify against
+	// vdasm_pixblt_XRGB8888_to_YUV444Planar_scan_SSE2 before relying on them.
+	static const __declspec(align(16)) struct {
+		sint16 rb_to_y[8];
+		sint16 rb_to_cb[8];
+		sint16 rb_to_cr[8];
+		sint16 g_to_y[8];
+		sint16 g_to_cb[8];
+		sint16 g_to_cr[8];
+		sint32 y_bias[4];
+		sint32 c_bias[4];
+	} kCoeffs={
+		// Cb = (28784*r - 24103*g - 4681*b + 8388608 + 32768) >> 16;
+		// Y = (16829*r + 33039*g + 6416*b + 1048576 + 32768) >> 16;
+		// Cr = (-9714*r - 19071*g + 28784*b + 8388608 + 32768) >> 16;
+		{  3208,  8414,  3208,  8414,  3208,  8414,  3208,  8414, },	// rb to y
+		{ -2340, 14392, -2340, 14392, -2340, 14392, -2340, 14392, },	// rb to cb
+		{ 16519,     0, 16519,     0, 16519,     0, 16519,     0, },	// g to y
+		{ -12050,    0, -12050,    0, -12050,    0, -12050,    0, },	// g to cb
+		{ 14392, -4857, 14392, -4857, 14392, -4857, 14392, -4857, },	// rb to cr
+		{ -9535,     0, -9535,     0, -9535,     0, -9535,     0, },	// g to cr
+		{ 0x084000, 0x084000, 0x084000, 0x084000, },	// y bias
+		{ 0x404000, 0x404000, 0x404000, 0x404000, },	// c bias
+	};
+
+	vdasm_pixblt_XRGB8888_to_YUV444Planar_scan_SSE2(dstY, dstCb, dstCr, srcRGB, mWidth, &kCoeffs);
+}