Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/mpc-hc/mpc-hc.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorkinddragon <kinddragon@users.sourceforge.net>2010-05-21 04:53:52 +0400
committerkinddragon <kinddragon@users.sourceforge.net>2010-05-21 04:53:52 +0400
commit37f62abd654047d060c86d6c76cd2f6862f89b94 (patch)
tree83eb125bd86f8a685928e290e2ec929ce633bc53 /src/thirdparty/VirtualDub/Kasumi/source
parentdae6425e0c23576dac77c3afae1dc6de22f983d5 (diff)
DSUtil now use new VirtualDub libraries (SSE2 deinterlacing for MPEG2 decoder)
AudioSwitcher rare memory corruption fixed git-svn-id: https://mpc-hc.svn.sourceforge.net/svnroot/mpc-hc/trunk@1907 10f7b99b-c216-0410-bff0-8a66a9350fd8
Diffstat (limited to 'src/thirdparty/VirtualDub/Kasumi/source')
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/a64_resample.asm64620
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/a_bltrgb.asm812
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/a_bltrgb2yuv_mmx.asm652
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/a_bltrgb_mmx.asm806
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/a_bltyuv2rgb_sse2.asm161
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/a_resample_mmx.asm1559
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/a_resample_sse41.asm358
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/a_spanutils_isse.asm193
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/a_stretchrgb_mmx.asm326
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/a_stretchrgb_point.asm96
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/a_triblt.inc24
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/a_triblt_mmx.asm425
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/a_triblt_scalar.asm36
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/a_triblt_sse2.asm197
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/alphablt.cpp76
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/blt.cpp273
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/blt_reference.cpp259
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/blt_reference_pal.cpp545
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/blt_reference_rgb.cpp310
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/blt_reference_yuv.cpp1590
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/blt_reference_yuv2yuv.cpp260
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/blt_reference_yuvrev.cpp530
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/blt_setup.cpp17
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/blt_spanutils.cpp365
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/blt_spanutils_x86.cpp170
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/blt_uberblit.cpp19
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/blt_x86.cpp144
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/pixel.cpp667
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/pixmaputils.cpp519
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/region.cpp1334
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/resample.cpp348
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/resample_kernels.cpp255
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/resample_stages.cpp149
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/resample_stages_reference.cpp425
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/resample_stages_x64.cpp26
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/resample_stages_x86.cpp1277
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/stretchblt_reference.cpp816
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/tables.cpp204
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/triblt.cpp1717
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/uberblit.cpp903
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/uberblit_16f.cpp40
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/uberblit_gen.cpp1597
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/uberblit_resample.cpp623
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/uberblit_resample_special.cpp186
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/uberblit_resample_special_x86.cpp35
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/uberblit_swizzle.cpp89
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/uberblit_swizzle_x86.cpp400
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/uberblit_v210.cpp199
-rw-r--r--src/thirdparty/VirtualDub/Kasumi/source/uberblit_ycbcr_x86.cpp35
49 files changed, 22667 insertions, 0 deletions
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/a64_resample.asm64 b/src/thirdparty/VirtualDub/Kasumi/source/a64_resample.asm64
new file mode 100644
index 000000000..e6de1eabf
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/a64_resample.asm64
@@ -0,0 +1,620 @@
+; VirtualDub - Video processing and capture application
+; Graphics support library
+; Copyright (C) 1998-2004 Avery Lee
+;
+; This program is free software; you can redistribute it and/or modify
+; it under the terms of the GNU General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or
+; (at your option) any later version.
+;
+; This program is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; GNU General Public License for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with this program; if not, write to the Free Software
+; Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+;
+
+ default rel
+
+ segment .rdata, align=16
+
+ align 16
+roundval dq 0000200000002000h, 0000200000002000h
+
+
+ segment .text
+
+
+%macro VDSAVE 1-*
+
+ %rep %0
+ %rotate -1
+ push %1
+ [pushreg %1]
+ %endrep
+
+%endmacro
+
+%macro VDRESTORE 1-*
+
+ %rep %0
+ pop %1
+
+ %rotate 1
+ %endrep
+
+%endmacro
+
+%macro VDSAVEXMM128 2
+%assign %%count %2 + 1 - %1
+%assign %%stkoffset 0
+%assign %%reg %1
+
+ sub rsp, %%count*16+8
+ [allocstack %%count*16]
+
+ %rep %%count
+ movdqa oword [rsp+%%stkoffset], xmm %+ %%reg
+ [savexmm128 xmm %+ %%reg, %%stkoffset]
+
+ %assign %%stkoffset %%stkoffset + 16
+ %assign %%reg %%reg + 1
+ %endrep
+%endmacro
+
+%macro VDRESTOREXMM128 2
+%assign %%count %2+1-%1
+%assign %%stkoffset %%count*16
+%assign %%reg %2
+
+ %rep %%count
+ %assign %%stkoffset %%stkoffset-16
+ movdqa xmm %+ %%reg, oword [rsp+%%stkoffset]
+
+ %assign %%reg %%reg-1
+ %endrep
+
+ add rsp, %%count*16+8
+%endmacro
+
+;-------------------------------------------------------------------------
+;
+; long vdasm_resize_table_row_SSE2(
+; Pixel *out, // rcx
+; Pixel *in, // rdx
+; int *filter, // r8
+; int filter_width, // r9d
+; PixDim w, // [rsp+40]
+; long accum, // [rsp+48]
+; long frac); // [rsp+56]
+;
+ global vdasm_resize_table_row_SSE2
+proc_frame vdasm_resize_table_row_SSE2
+
+ VDSAVE rbx, rsi, rdi, rbp, r12, r13, r14, r15
+ VDSAVEXMM128 6, 15
+end_prolog
+
+ .parms equ rsp+168+64
+
+ mov r10d, dword [.parms+40]
+ shl r10, 2
+ add rcx, r10
+ neg r10
+ shl r9d, 2 ;filter_width <<= 2
+
+ movaps xmm6, oword [roundval]
+ pxor xmm5, xmm5
+ mov rsi, rdx
+ shr rsi, 2
+
+ mov edi, [.parms+48]
+ mov eax, edi
+ shl edi, 16
+ sar rax, 16
+ add rsi, rax
+ mov ebp, [.parms+56]
+ movsxd r11, ebp
+ shl ebp, 16
+ sar r11, 16
+
+ ;register map
+ ;
+ ;eax temp coefficient pair counter
+ ;rbx temp coefficient pointer
+ ;rcx destination
+ ;rdx temp source
+ ;rsi source/4
+ ;edi accumulator
+ ;ebp fractional increment
+ ;r8 filter
+ ;r9 filter_width*4
+ ;r10 -width*4
+ ;r11 integral increment
+ ;r12
+ ;r13
+ ;r14
+ ;r15
+
+ cmp r9d, 16
+ jz .accel_4coeff
+ cmp r9d, 24
+ jz .accel_6coeff
+
+ test r9d, 8
+ jz .pixelloop_even_pairs
+ cmp r9d, 8
+ jnz .pixelloop_odd_pairs
+
+.pixelloop_single_pairs:
+ mov eax, edi
+ shr eax, 24
+ imul eax, r9d
+
+ lea rdx, [rsi*4]
+
+ movd xmm0, dword [rdx] ;xmm0 = p0
+ movd xmm1, dword [rdx+4] ;xmm1 = p1
+ punpcklbw xmm0, xmm1
+ punpcklbw xmm0, xmm5
+ movq xmm1, qword [r8+rax]
+ pshufd xmm1, xmm1, 01000100b
+ pmaddwd xmm0, xmm1
+
+ movdqa xmm4, xmm6
+ paddd xmm4, xmm0
+
+ psrad xmm4, 14
+ packssdw xmm4, xmm4
+ packuswb xmm4, xmm4
+
+ add edi, ebp
+ adc rsi, r11
+
+ movd dword [rcx+r10], xmm4
+ add r10, 4
+ jnz .pixelloop_single_pairs
+ jmp .xit
+
+.pixelloop_odd_pairs:
+ movdqa xmm4, xmm6
+
+ mov eax, edi
+ shr eax, 24
+ imul eax, r9d
+ lea rbx, [r8+rax]
+
+ lea rdx, [rsi*4]
+ lea rax, [r9-8]
+.coeffloop_odd_pairs:
+ movd xmm0, dword [rdx] ;xmm0 = p0
+ movd xmm1, dword [rdx+4] ;xmm1 = p1
+ movd xmm2, dword [rdx+8] ;xmm2 = p2
+ movd xmm3, dword [rdx+12] ;xmm3 = p3
+ add rdx, 16
+ punpcklbw xmm0, xmm1
+ punpcklbw xmm2, xmm3
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm2, xmm5
+ movq xmm1, qword [rbx]
+ movq xmm3, qword [rbx+8]
+ add rbx, 16
+ pshufd xmm1, xmm1, 01000100b
+ pshufd xmm3, xmm3, 01000100b
+ pmaddwd xmm0, xmm1
+ pmaddwd xmm2, xmm3
+ paddd xmm0, xmm2
+ paddd xmm4, xmm0
+ sub eax, 16
+ jnz .coeffloop_odd_pairs
+
+ movd xmm0, dword [rdx] ;xmm0 = p0
+ movd xmm1, dword [rdx+4] ;xmm1 = p1
+ punpcklbw xmm0, xmm1
+ punpcklbw xmm0, xmm5
+ movq xmm1, qword [rbx]
+ pshufd xmm1, xmm1, 01000100b
+ pmaddwd xmm0, xmm1
+ paddd xmm4, xmm0
+
+ psrad xmm4, 14
+ packssdw xmm4, xmm4
+ packuswb xmm4, xmm4
+
+ add edi, ebp
+ adc rsi, r11
+
+ movd dword [rcx+r10], xmm4
+ add r10, 4
+ jnz .pixelloop_odd_pairs
+ jmp .xit
+
+.pixelloop_even_pairs:
+ movdqa xmm4, xmm6
+
+ mov eax, edi
+ shr eax, 24
+ imul eax, r9d
+ lea rbx, [r8+rax]
+
+ lea rdx, [rsi*4]
+ mov eax, r9d
+.coeffloop_even_pairs:
+ movd xmm0, dword [rdx] ;xmm0 = p0
+ movd xmm1, dword [rdx+4] ;xmm1 = p1
+ movd xmm2, dword [rdx+8] ;xmm2 = p2
+ movd xmm3, dword [rdx+12] ;xmm3 = p3
+ add rdx, 16
+ punpcklbw xmm0, xmm1
+ punpcklbw xmm2, xmm3
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm2, xmm5
+ movq xmm1, qword [rbx]
+ movq xmm3, qword [rbx+8]
+ add rbx, 16
+ pshufd xmm1, xmm1, 01000100b
+ pshufd xmm3, xmm3, 01000100b
+ pmaddwd xmm0, xmm1
+ pmaddwd xmm2, xmm3
+ paddd xmm0, xmm2
+ paddd xmm4, xmm0
+ sub eax, 16
+ jnz .coeffloop_even_pairs
+
+ psrad xmm4, 14
+ packssdw xmm4, xmm4
+ packuswb xmm4, xmm4
+
+ add edi, ebp
+ adc rsi, r11
+
+ movd dword [rcx+r10], xmm4
+ add r10, 4
+ jnz .pixelloop_even_pairs
+
+.xit:
+ VDRESTOREXMM128 6, 15
+ VDRESTORE rbx, rsi, rdi, rbp, r12, r13, r14, r15
+ ret
+
+.accel_4coeff:
+.pixelloop_4coeff:
+ pxor xmm5, xmm5
+ movdqa xmm4, xmm6
+
+ mov eax, 0ff000000h
+ lea rdx, [rsi*4]
+ and eax, edi
+ shr eax, 20
+ lea rbx, [r8+rax]
+
+ movd xmm0, dword [rdx] ;xmm0 = p0
+ movd xmm1, dword [rdx+4] ;xmm1 = p1
+ movd xmm2, dword [rdx+8] ;xmm2 = p2
+ movd xmm3, dword [rdx+12] ;xmm3 = p3
+ punpcklbw xmm0, xmm1
+ punpcklbw xmm2, xmm3
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm2, xmm5
+ movq xmm1, qword [rbx]
+ movq xmm3, qword [rbx+8]
+ pshufd xmm1, xmm1, 01000100b
+ pshufd xmm3, xmm3, 01000100b
+ pmaddwd xmm0, xmm1
+ pmaddwd xmm2, xmm3
+ paddd xmm0, xmm2
+ paddd xmm4, xmm0
+
+ psrad xmm4, 14
+ packssdw xmm4, xmm4
+ packuswb xmm4, xmm4
+
+ add edi, ebp
+ adc rsi, r11
+
+ movd dword [rcx+r10], xmm4
+ add r10, 4
+ jnz .pixelloop_4coeff
+ jmp .xit
+
+.accel_6coeff:
+.pixelloop_6coeff:
+ pxor xmm5, xmm5
+ movdqa xmm4, xmm6
+
+ lea rdx, [rsi*4]
+ mov eax, edi
+ shr eax, 24
+ lea rax, [rax+rax*2]
+ lea rbx, [r8+rax*8]
+
+ movd xmm0, dword [rdx] ;xmm0 = p0
+ movd xmm1, dword [rdx+4] ;xmm1 = p1
+ movd xmm2, dword [rdx+8] ;xmm2 = p2
+ movd xmm3, dword [rdx+12] ;xmm3 = p3
+	movd	xmm8, dword [rdx+16]	;xmm8 = p4
+	movd	xmm9, dword [rdx+20]	;xmm9 = p5
+ punpcklbw xmm0, xmm1
+ punpcklbw xmm2, xmm3
+ punpcklbw xmm8, xmm9
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm8, xmm5
+ movq xmm1, qword [rbx]
+ movq xmm3, qword [rbx+8]
+ movq xmm9, qword [rbx+16]
+ pshufd xmm1, xmm1, 01000100b
+ pshufd xmm3, xmm3, 01000100b
+ pshufd xmm9, xmm9, 01000100b
+ pmaddwd xmm0, xmm1
+ pmaddwd xmm2, xmm3
+ pmaddwd xmm8, xmm9
+ paddd xmm0, xmm2
+ paddd xmm4, xmm0
+ paddd xmm4, xmm8
+
+ psrad xmm4, 14
+ packssdw xmm4, xmm4
+ packuswb xmm4, xmm4
+
+ add edi, ebp
+ adc rsi, r11
+
+ movd dword [rcx+r10], xmm4
+ add r10, 4
+ jnz .pixelloop_6coeff
+ jmp .xit
+endproc_frame
+
+
+;--------------------------------------------------------------------------
+;
+; vdasm_resize_table_col_SSE2(
+; uint32 *dst, // rcx
+; const uint32 *const *srcs, // rdx
+; int *filter, // r8
+; int filter_width, // r9d
+; PixDim w, // [rsp+40] -> r10d
+; );
+;
+ global vdasm_resize_table_col_SSE2
+proc_frame vdasm_resize_table_col_SSE2
+ VDSAVE rbx, rsi, rdi, rbp, r12, r13, r14, r15
+ VDSAVEXMM128 6, 15
+end_prolog
+
+ .parms equ rsp+168+64
+
+ mov r10d, [.parms+40] ;r10d = w
+
+ pxor xmm5, xmm5
+ movdqa xmm4, oword [roundval]
+ xor rbx, rbx ;rbx = source offset
+
+ cmp r9d, 4
+ jz .accel_4coeff
+ cmp r9d, 6
+ jz .accel_6coeff
+
+ shr r9d, 1 ;r9d = filter pair count
+
+.pixelloop:
+ mov rax, rdx ;rax = row pointer table
+ mov rdi, r8 ;rdi = filter
+ mov r11d, r9d ;r11d = filter width counter
+ movdqa xmm2, xmm4
+.coeffloop:
+ mov rsi, [rax]
+
+ movd xmm0, dword [rsi+rbx]
+
+ mov rsi, [rax+8]
+ add rax, 16
+
+ movd xmm1, dword [rsi+rbx]
+ punpcklbw xmm0, xmm1
+
+ punpcklbw xmm0, xmm5
+
+ movq xmm1, qword [rdi]
+ pshufd xmm1, xmm1, 01000100b
+
+ pmaddwd xmm0, xmm1
+
+ paddd xmm2, xmm0
+
+ add rdi,8
+
+ sub r11d,1
+ jne .coeffloop
+
+ psrad xmm2,14
+ packssdw xmm2,xmm2
+ add rbx,4
+ packuswb xmm2,xmm2
+
+ movd dword [rcx],xmm2
+ add rcx,4
+ sub r10d,1
+ jne .pixelloop
+
+.xit:
+ VDRESTOREXMM128 6, 15
+ VDRESTORE rbx, rsi, rdi, rbp, r12, r13, r14, r15
+ ret
+
+.accel_4coeff:
+ mov r12, [rdx]
+ mov r13, [rdx+8]
+ mov r14, [rdx+16]
+ mov r15, [rdx+24]
+ movq xmm8, qword [r8]
+ punpcklqdq xmm8, xmm8
+ movq xmm9, qword [r8+8]
+ punpcklqdq xmm9, xmm9
+
+ sub r10d, 1
+ jc .oddpixel_4coeff
+.pixelloop_4coeff:
+ movq xmm0, qword [r12+rbx]
+ movq xmm1, qword [r13+rbx]
+ movq xmm2, qword [r14+rbx]
+ movq xmm3, qword [r15+rbx]
+
+ punpcklbw xmm0, xmm1
+ punpcklbw xmm2, xmm3
+
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+
+ punpcklbw xmm0, xmm5
+ punpckhbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ punpckhbw xmm3, xmm5
+
+ pmaddwd xmm0, xmm8
+ pmaddwd xmm1, xmm8
+ pmaddwd xmm2, xmm9
+ pmaddwd xmm3, xmm9
+
+ paddd xmm0, xmm4
+ paddd xmm1, xmm4
+ paddd xmm0, xmm2
+ paddd xmm1, xmm3
+
+ psrad xmm0, 14
+ psrad xmm1, 14
+ packssdw xmm0, xmm1
+ packuswb xmm0, xmm0
+
+ movq qword [rcx], xmm0
+ add rcx, 8
+ add rbx, 8
+ sub r10d, 2
+ ja .pixelloop_4coeff
+ jnz .xit
+.oddpixel_4coeff:
+ movd xmm0, dword [r12+rbx]
+ movd xmm1, dword [r13+rbx]
+ movd xmm2, dword [r14+rbx]
+ movd xmm3, dword [r15+rbx]
+
+ punpcklbw xmm0, xmm1
+ punpcklbw xmm2, xmm3
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm2, xmm5
+
+ pmaddwd xmm0, xmm8
+ pmaddwd xmm2, xmm9
+
+ paddd xmm0, xmm4
+ paddd xmm0, xmm2
+
+ psrad xmm0, 14
+ packssdw xmm0, xmm0
+ packuswb xmm0, xmm0
+
+ movd dword [rcx], xmm0
+
+ jmp .xit
+
+.accel_6coeff:
+ mov r12, [rdx]
+ mov r13, [rdx+8]
+ mov r14, [rdx+16]
+ mov r15, [rdx+24]
+ mov rsi, [rdx+32]
+ mov rdx, [rdx+40]
+ movq xmm10, qword [r8]
+ punpcklqdq xmm10, xmm10
+ movq xmm11, qword [r8+8]
+ punpcklqdq xmm11, xmm11
+ movq xmm12, qword [r8+16]
+ punpcklqdq xmm12, xmm12
+
+ sub r10d, 1
+ jc .oddpixel_6coeff
+.pixelloop_6coeff:
+ movq xmm0, qword [r12+rbx]
+ movq xmm1, qword [r13+rbx]
+ movq xmm2, qword [r14+rbx]
+ movq xmm3, qword [r15+rbx]
+ movq xmm8, qword [rsi+rbx]
+ movq xmm9, qword [rdx+rbx]
+
+ punpcklbw xmm0, xmm1
+ punpcklbw xmm2, xmm3
+ punpcklbw xmm8, xmm9
+
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ movdqa xmm9, xmm8
+
+ punpcklbw xmm0, xmm5
+ punpckhbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ punpckhbw xmm3, xmm5
+ punpcklbw xmm8, xmm5
+ punpckhbw xmm9, xmm5
+
+ pmaddwd xmm0, xmm10
+ pmaddwd xmm1, xmm10
+ pmaddwd xmm2, xmm11
+ pmaddwd xmm3, xmm11
+ pmaddwd xmm8, xmm12
+ pmaddwd xmm9, xmm12
+
+ paddd xmm0, xmm4
+ paddd xmm1, xmm4
+ paddd xmm2, xmm8
+ paddd xmm3, xmm9
+ paddd xmm0, xmm2
+ paddd xmm1, xmm3
+
+ psrad xmm0, 14
+ psrad xmm1, 14
+ packssdw xmm0, xmm1
+ packuswb xmm0, xmm0
+
+ movq qword [rcx], xmm0
+ add rcx, 8
+ add rbx, 8
+ sub r10d, 2
+ ja .pixelloop_6coeff
+ jnz .xit
+.oddpixel_6coeff:
+ movd xmm0, dword [r12+rbx]
+ movd xmm1, dword [r13+rbx]
+ movd xmm2, dword [r14+rbx]
+ movd xmm3, dword [r15+rbx]
+ movd xmm8, dword [rsi+rbx]
+ movd xmm9, dword [rdx+rbx]
+
+ punpcklbw xmm0, xmm1
+ punpcklbw xmm2, xmm3
+ punpcklbw xmm8, xmm9
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm8, xmm5
+
+ pmaddwd xmm0, xmm10
+ pmaddwd xmm2, xmm11
+ pmaddwd xmm8, xmm12
+
+ paddd xmm0, xmm4
+ paddd xmm2, xmm8
+ paddd xmm0, xmm2
+
+ psrad xmm0, 14
+ packssdw xmm0, xmm0
+ packuswb xmm0, xmm0
+
+ movd dword [rcx], xmm0
+
+ jmp .xit
+endproc_frame
+
+ end
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/a_bltrgb.asm b/src/thirdparty/VirtualDub/Kasumi/source/a_bltrgb.asm
new file mode 100644
index 000000000..f3503807e
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/a_bltrgb.asm
@@ -0,0 +1,812 @@
+ section .text
+
+ global _vdasm_pixblt_RGB565_to_XRGB1555
+_vdasm_pixblt_RGB565_to_XRGB1555:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov ebp, [esp+20+16]
+ mov edi, [esp+24+16]
+ add ebp, ebp
+ mov edx, [esp+4+16]
+ mov ecx, [esp+12+16]
+ lea edx, [edx+ebp-2]
+ lea ecx, [ecx+ebp-2]
+ neg ebp
+ mov [esp+20+16], ebp
+
+.yloop:
+ mov ebp, [esp+20+16]
+ add ebp, 2
+ jbe .odd
+
+.xloop:
+ mov eax, [ecx+ebp]
+ mov ebx, 0ffc0ffc0h
+
+ and ebx, eax
+ and eax, 0001f001fh
+
+ shr ebx, 1
+
+ add eax, ebx
+
+ mov [edx+ebp], eax
+ add ebp, 4
+
+ jnc .xloop
+ jnz .noodd
+.odd:
+ movzx eax, word [ecx]
+ mov ebx, 0ffc0ffc0h
+ and ebx, eax
+ and eax, 0001f001fh
+ shr ebx, 1
+ add eax, ebx
+ mov [edx], ax
+.noodd:
+ add ecx, [esp+16+16]
+ add edx, [esp+8+16]
+ dec edi
+ jne .yloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+
+ global _vdasm_pixblt_RGB888_to_XRGB1555
+_vdasm_pixblt_RGB888_to_XRGB1555:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov esi,[esp+12+16]
+ mov edi,[esp+4+16]
+
+ mov ebp,[esp+20+16]
+ lea eax,[ebp+ebp]
+ lea ebx,[ebp+eax]
+ sub [esp+8+16],eax
+ sub [esp+16+16],ebx
+
+ mov edx,[esp+24+16]
+.yloop:
+ mov ebp,[esp+20+16]
+ push ebp
+ push edx
+ shr ebp,1
+ jz .checkodd
+.xloop:
+ mov eax,[esi+2] ;u
+ add esi,6 ;v
+
+ mov ebx,eax ;u
+ mov ecx,eax ;v
+ shr ebx,11 ;u
+ and ecx,00f80000h ;v
+ shr eax,17 ;u
+ and ebx,0000001fh ;v
+ shr ecx,14 ;u
+ and eax,00007c00h ;v
+ or ebx,ecx ;u
+ add edi,4 ;v
+ or ebx,eax ;u
+
+ mov ecx,[esi-6] ;v
+ mov edx,ebx ;u
+ mov eax,ecx ;v
+
+ shl edx,16 ;u
+ mov ebx,ecx ;v
+ shr ebx,3 ;u
+ and ecx,0000f800h ;v
+ shr eax,9 ;u
+ and ebx,0000001fh ;v
+ shr ecx,6 ;u
+ and eax,00007c00h ;v
+ or eax,ecx ;u
+ or edx,ebx ;v
+ or edx,eax ;u
+ sub ebp,1 ;v
+ mov [edi-4],edx ;u
+ jne .xloop ;v
+.checkodd:
+ pop edx
+ pop ebp
+ and ebp,1
+ jz .noodd
+ movzx eax,word [esi]
+ movzx ebx,byte [esi+2]
+ shl ebx,16
+ add esi,3
+ add eax,ebx
+
+ mov ebx,eax
+ mov ecx,eax
+ shr ebx,3
+ and ecx,0000f800h
+ shr eax,9
+ and ebx,0000001fh
+ shr ecx,6
+ and eax,00007c00h
+ or ebx,ecx
+ or ebx,eax
+ mov [edi+0],bl
+ mov [edi+1],bh
+ add edi,2
+.noodd:
+
+ add esi,[esp+16+16]
+ add edi,[esp+ 8+16]
+
+ sub edx,1
+ jne .yloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+
+ ret
+
+ global _vdasm_pixblt_XRGB8888_to_XRGB1555
+_vdasm_pixblt_XRGB8888_to_XRGB1555:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov ebp, [esp+20+16]
+ mov edx, [esp+4+16]
+ add ebp, ebp
+ mov ecx, [esp+12+16]
+ lea edx, [edx+ebp-2]
+ lea ecx, [ecx+ebp*2-4]
+ neg ebp
+ mov [esp+20+16], ebp
+
+.yloop:
+ mov ebp, [esp+20+16]
+ add ebp, 2
+ jbe .odd
+
+.xloop:
+ mov eax, [ecx+ebp*2]
+ mov ebx, 00f80000h
+ and ebx, eax
+ mov esi, eax
+ shr ebx, 9
+ and esi, 0000f800h
+ shr esi, 6
+ and eax, 000000f8h
+ shr eax, 3
+ add ebx, esi
+ mov esi, [ecx+ebp*2+4]
+ add eax, ebx
+ mov ebx, esi
+ and esi, 00f80000h
+ shl esi, 7
+ mov edi, ebx
+ and edi, 0000f800h
+ add eax, esi
+ shl edi, 10
+ and ebx, 000000f8h
+ shl ebx, 13
+ add eax, edi
+ add eax, ebx
+ mov [edx+ebp], eax
+ add ebp, 4
+ jnc .xloop
+ jnz .noodd
+.odd:
+ mov eax, [ecx]
+ mov ebx, 00f80000h
+ and ebx, eax
+ mov esi, eax
+ shr ebx, 9
+ and esi, 0000f800h
+ shr esi, 6
+ and eax, 000000f8h
+ shr eax, 3
+ add ebx, esi
+ add eax, ebx
+ mov [edx], ax
+.noodd:
+ add ecx, [esp+16+16]
+ add edx, [esp+8+16]
+ dec dword [esp+24+16]
+ jne .yloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+ global _vdasm_pixblt_XRGB1555_to_RGB565
+_vdasm_pixblt_XRGB1555_to_RGB565:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov ebp, [esp+20+16]
+ mov edi, [esp+24+16]
+ add ebp, ebp
+ mov edx, [esp+4+16]
+ mov ecx, [esp+12+16]
+ lea edx, [edx+ebp-2]
+ lea ecx, [ecx+ebp-2]
+ neg ebp
+ mov [esp+20+16], ebp
+
+.yloop:
+ mov ebp, [esp+20+16]
+ add ebp, 2
+ jbe .odd
+
+.xloop:
+ mov eax, [ecx+ebp]
+ mov ebx, 02000200h
+
+ mov esi, eax
+ and ebx, eax
+
+ shr ebx, 4
+ and esi, 0ffe0ffe0h
+
+ add eax, esi
+
+ add eax, ebx
+
+ mov [edx+ebp], eax
+ add ebp, 4
+
+ jnc .xloop
+ jnz .noodd
+.odd:
+ movzx eax, word [ecx]
+ mov ebx, 02000200h
+ mov esi, eax
+ and ebx, eax
+ shr ebx, 4
+ and esi, 0ffe0ffe0h
+ add eax, esi
+ add eax, ebx
+ mov [edx], ax
+.noodd:
+ add ecx, [esp+16+16]
+ add edx, [esp+8+16]
+ dec edi
+ jne .yloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+ global _vdasm_pixblt_RGB888_to_RGB565
+_vdasm_pixblt_RGB888_to_RGB565:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov esi,[esp+12+16]
+ mov edi,[esp+4+16]
+
+ mov ebp,[esp+20+16]
+ lea eax,[ebp+ebp]
+ lea ebx,[ebp+eax]
+ sub [esp+8+16],eax
+ sub [esp+16+16],ebx
+
+ mov edx,[esp+24+16]
+.yloop:
+ mov ebp,[esp+20+16]
+ push ebp
+ push edx
+ shr ebp,1
+ jz .checkodd
+.xloop:
+ mov eax,[esi+2] ;u
+ add esi,6 ;v
+
+ mov ebx,eax ;u
+ mov ecx,eax ;v
+ shr ebx,11 ;u
+ and ecx,00fc0000h ;v
+ shr eax,16 ;u
+ and ebx,0000001fh ;v
+ shr ecx,13 ;u
+ and eax,0000f800h ;v
+ or ebx,ecx ;u
+ add edi,4 ;v
+ or ebx,eax ;u
+
+ mov ecx,[esi-6] ;v
+ mov edx,ebx ;u
+ mov eax,ecx ;v
+
+ shl edx,16 ;u
+ mov ebx,ecx ;v
+ shr ebx,3 ;u
+ and ecx,0000fc00h ;v
+ shr eax,8 ;u
+ and ebx,0000001fh ;v
+ shr ecx,5 ;u
+ and eax,0000f800h ;v
+ or eax,ecx ;u
+ or edx,ebx ;v
+ or edx,eax ;u
+ sub ebp,1 ;v
+ mov [edi-4],edx ;u
+ jne .xloop ;v
+.checkodd:
+ pop edx
+ pop ebp
+ and ebp,1
+ jz .noodd
+ movzx eax,word [esi]
+ movzx ebx,byte [esi+2]
+ shl ebx,16
+ add esi,3
+ add eax,ebx
+
+ mov ebx,eax
+ mov ecx,eax
+ shr ebx,3
+ and ecx,0000fc00h
+ shr eax,8
+ and ebx,0000001fh
+ shr ecx,5
+ and eax,0000f800h
+ or ebx,ecx
+ or ebx,eax
+ mov [edi+0],bl
+ mov [edi+1],bh
+ add edi,2
+.noodd:
+
+ add esi,[esp+16+16]
+ add edi,[esp+ 8+16]
+
+ sub edx,1
+ jne .yloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+
+ ret
+
+ global _vdasm_pixblt_XRGB8888_to_RGB565
+_vdasm_pixblt_XRGB8888_to_RGB565:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov ebp, [esp+20+16]
+ mov edx, [esp+4+16]
+ add ebp, ebp
+ mov ecx, [esp+12+16]
+ lea edx, [edx+ebp-2]
+ lea ecx, [ecx+ebp*2-4]
+ neg ebp
+ mov [esp+20+16], ebp
+
+.yloop:
+ mov ebp, [esp+20+16]
+ add ebp, 2
+ jbe .odd
+
+.xloop:
+ mov eax, [ecx+ebp*2]
+ mov ebx, 00f80000h
+ and ebx, eax
+ mov esi, eax
+ shr ebx, 8
+ and esi, 0000fc00h
+ shr esi, 5
+ and eax, 000000f8h
+ shr eax, 3
+ add ebx, esi
+ mov esi, [ecx+ebp*2+4]
+ add eax, ebx
+ mov ebx, esi
+ and esi, 00f80000h
+ shl esi, 8
+ mov edi, ebx
+ and edi, 0000fc00h
+ add eax, esi
+ shl edi, 11
+ and ebx, 000000f8h
+ shl ebx, 13
+ add eax, edi
+ add eax, ebx
+ mov [edx+ebp], eax
+ add ebp, 4
+ jnc .xloop
+ jnz .noodd
+.odd:
+ mov eax, [ecx]
+ mov ebx, 00f80000h
+ and ebx, eax
+ mov esi, eax
+ shr ebx, 8
+ and esi, 0000fc00h
+ shr esi, 5
+ and eax, 000000f8h
+ shr eax, 3
+ add ebx, esi
+ add eax, ebx
+ mov [edx], ax
+.noodd:
+ add ecx, [esp+16+16]
+ add edx, [esp+8+16]
+ dec dword [esp+24+16]
+ jne .yloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+
+ global _vdasm_pixblt_XRGB8888_to_RGB888
+_vdasm_pixblt_XRGB8888_to_RGB888:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov esi,[esp+12+16]
+ mov edi,[esp+4+16]
+
+ mov ecx,[esp+20+16]
+ lea eax,[ecx+ecx*2]
+ lea ebx,[ecx*4]
+ sub [esp+8+16],eax
+ sub [esp+16+16],ebx
+
+ mov edx,[esp+24+16]
+.yloop:
+ mov ecx,[esp+20+16]
+ push ecx
+ push edx
+ shr ecx,2
+ jz .checkodd
+.xloop:
+ mov eax,[esi] ;EAX = xxr0g0b0
+ mov ebx,[esi+4] ;EBX = xxr1g1b1
+ mov edx,ebx ;EDX = xxr1g1b1
+ mov ebp,[esi+8] ;EBP = xxr2g2b2
+ shl ebx,24 ;EBX = b1000000
+ and eax,00ffffffh ;EAX = 00r0g0b0
+ shr edx,8 ;EDX = 00xxr1g1
+ or eax,ebx ;EAX = b1r0g0b0
+ mov [edi],eax
+ mov ebx,ebp ;EBX = xxr2g2b2
+ shl ebp,16 ;EBP = g2b20000
+ and edx,0000ffffh ;EDX = 0000r1g1
+ or ebp,edx ;EBP = g2b2r1g1
+ mov eax,[esi+12] ;EAX = xxr3g3b3
+ shr ebx,16 ;EBX = 0000xxr2
+ add edi,12
+ shl eax,8 ;EAX = r3g3b300
+ and ebx,000000ffh ;EBX = 000000r2
+ or eax,ebx ;EAX = r3g3b3r2
+ mov [edi+4-12],ebp
+ add esi,16
+ mov [edi+8-12],eax
+ sub ecx,1
+ jne .xloop
+.checkodd:
+ pop edx
+ pop ecx
+ and ecx,3
+ jz .noodd
+.oddloop:
+ mov eax,[esi]
+ add esi,4
+ mov [edi],ax
+ shr eax,16
+ mov [edi+2],al
+ add edi,3
+ sub ecx,1
+ jnz .oddloop
+.noodd:
+ add esi,[esp+16+16]
+ add edi,[esp+ 8+16]
+
+ sub edx,1
+ jne .yloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+ global _vdasm_pixblt_XRGB1555_to_XRGB8888
+_vdasm_pixblt_XRGB1555_to_XRGB8888:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov ebp, [esp+20+16]
+ add ebp, ebp
+ mov edx, [esp+4+16]
+ mov ecx, [esp+12+16]
+ lea edx, [edx+ebp*2-4]
+ lea ecx, [ecx+ebp-2]
+ neg ebp
+ mov [esp+20+16], ebp
+
+.yloop:
+ mov ebp, [esp+20+16]
+ add ebp, 2
+ jbe .odd
+
+.xloop:
+ mov eax, [ecx+ebp]
+ mov ebx, 00007c00h
+ and ebx, eax
+ mov esi, eax
+ shl ebx, 9
+ and esi, 000003e0h
+ shl esi, 6
+ mov edi, eax
+ and eax, 0000001fh
+ add ebx, esi
+ shl eax, 3
+ mov esi, edi
+ shr edi, 7
+ add eax, ebx
+ and edi, 00f80000h
+ mov ebx, esi
+ shr esi, 13
+ and ebx, 03e00000h
+ shr ebx, 10
+ and esi, 000000f8h
+ add ebx, edi
+ add ebx, esi
+ mov edi, eax
+ and eax, 00e0e0e0h
+ shr eax, 5
+ mov esi, ebx
+ shr ebx, 5
+ add eax, edi
+ and ebx, 00070707h
+ add ebx, esi
+ mov [edx+ebp*2], eax
+ mov [edx+ebp*2+4], ebx
+ add ebp, 4
+ jnc .xloop
+ jnz .noodd
+.odd:
+ movzx eax, word [ecx]
+ mov ebx, 00007c00h
+ and ebx, eax
+ mov esi, eax
+ shl ebx, 9
+ and esi, 000003e0h
+ shl esi, 6
+ and eax, 0000001fh
+ shl eax, 3
+ add ebx, esi
+ add eax, ebx
+ mov ebx, 00e0e0e0h
+ and ebx, eax
+ shr ebx, 5
+ add eax, ebx
+ mov [edx], eax
+.noodd:
+ add ecx, [esp+16+16]
+ add edx, [esp+8+16]
+ dec dword [esp+24+16]
+ jne .yloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+
+ global _vdasm_pixblt_RGB565_to_XRGB8888
+_vdasm_pixblt_RGB565_to_XRGB8888:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov ebp, [esp+20+16]
+ add ebp, ebp
+ mov edx, [esp+4+16]
+ mov ecx, [esp+12+16]
+ lea edx, [edx+ebp*2-4]
+ lea ecx, [ecx+ebp-2]
+ neg ebp
+ mov [esp+20+16], ebp
+
+.yloop:
+ mov ebp, [esp+20+16]
+ add ebp, 2
+ jbe .odd
+
+.xloop:
+ movzx eax, word [ecx+ebp]
+ mov ebx, 0000f800h
+ and ebx, eax
+ mov esi, eax
+ shl ebx, 8
+ mov edi, eax
+ shl eax, 3
+ and esi, 000007e0h
+ and eax, 000000f8h
+ add ebx, eax
+ shl esi, 5
+ mov eax, ebx
+ shr ebx, 5
+ and edi, 00000600h
+ shr edi, 1
+ and ebx, 00070007h
+ add esi, edi
+ add eax, ebx
+ add eax, esi
+ mov [edx+ebp*2], eax
+
+ movzx eax, word [ecx+ebp+2]
+ mov ebx, 0000f800h
+ and ebx, eax
+ mov esi, eax
+ shl ebx, 8
+ mov edi, eax
+ shl eax, 3
+ and esi, 000007e0h
+ and eax, 000000f8h
+ add ebx, eax
+ shl esi, 5
+ mov eax, ebx
+ shr ebx, 5
+ and edi, 00000600h
+ shr edi, 1
+ and ebx, 00070007h
+ add esi, edi
+ add eax, ebx
+ add eax, esi
+ mov [edx+ebp*2+4], eax
+
+ add ebp, 4
+
+ jnc .xloop
+ jnz .noodd
+.odd:
+ movzx eax, word [ecx]
+ mov ebx, 0000f800h
+ and ebx, eax
+ mov esi, eax
+ shl ebx, 8
+ mov edi, eax
+ shl eax, 3
+ and esi, 000007e0h
+ and eax, 000000f8h
+ add ebx, eax
+ shl esi, 5
+ mov eax, ebx
+ shr ebx, 5
+ and edi, 00000600h
+ shr edi, 1
+ and ebx, 00070007h
+ add esi, edi
+ add eax, ebx
+ add eax, esi
+ mov [edx], eax
+.noodd:
+ add ecx, [esp+16+16]
+ add edx, [esp+8+16]
+ dec dword [esp+24+16]
+ jne .yloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+
+ global _vdasm_pixblt_RGB888_to_XRGB8888
+_vdasm_pixblt_RGB888_to_XRGB8888:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov esi,[esp+12+16]
+ mov edi,[esp+4+16]
+
+ mov ecx,[esp+20+16]
+ lea eax,[ecx+ecx*2]
+ lea ebx,[ecx*4]
+ sub [esp+8+16],ebx
+ sub [esp+16+16],eax
+
+ mov edx,[esp+24+16]
+.yloop:
+ mov ebp,[esp+20+16]
+ shr ebp,2
+ push edx
+ jz .checkodd
+.xloop:
+ mov eax,[esi] ;EAX: b1r0g0b0
+ mov ebx,[esi+4] ;EBX: g2b2r1g1
+
+ mov [edi],eax
+ mov ecx,ebx ;ECX: g2b2r1g1
+
+ shr eax,24 ;EAX: ------b1
+ mov edx,[esi+8] ;EDX: r3g3b3r2
+
+ shr ecx,16 ;ECX: ----g2b2
+ add edi,16
+
+ shl ebx,8 ;EBX: b2r1g1--
+ add esi,12
+
+ or eax,ebx ;EAX: b2r1g1b1
+ mov ebx,edx ;EBX: r3g3b3r2
+
+ shr ebx,8 ;EBX: --r3g3b3
+ mov [edi+4-16],eax
+
+ shl edx,16 ;EDX: b3r2----
+ mov [edi+12-16],ebx
+
+ or edx,ecx ;EDX: b3r2g2b2
+ sub ebp,1
+
+ mov [edi+8-16],edx
+ jne .xloop
+
+.checkodd:
+ pop edx
+ mov ebx,[esp+20+16]
+ and ebx,3
+ jz .noodd
+.oddloop:
+ mov ax,[esi]
+ mov cl,[esi+2]
+ mov [edi],ax
+ mov [edi+2],cl
+ add esi,3
+ add edi,4
+ sub ebx,1
+ jne .oddloop
+.noodd:
+
+ add esi,[esp+16+16]
+ add edi,[esp+ 8+16]
+
+ sub edx,1
+ jne .yloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+
+ ret
+
+ end
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/a_bltrgb2yuv_mmx.asm b/src/thirdparty/VirtualDub/Kasumi/source/a_bltrgb2yuv_mmx.asm
new file mode 100644
index 000000000..6a00d826f
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/a_bltrgb2yuv_mmx.asm
@@ -0,0 +1,652 @@
+ section .rdata, rdata
+
+y_co dq 0004a004a004a004ah
+cr_co_r dq 000cc00cc00cc00cch
+cb_co_b dq 00081008100810081h ;note: divided by two
+cr_co_g dq 0ff98ff98ff98ff98h
+cb_co_g dq 0ffceffceffceffceh
+y_bias dq 0fb7afb7afb7afb7ah
+c_bias dq 0ff80ff80ff80ff80h
+interp dq 06000400020000000h
+rb_mask_555 dq 07c1f7c1f7c1f7c1fh
+g_mask_555 dq 003e003e003e003e0h
+rb_mask_565 dq 0f81ff81ff81ff81fh
+g_mask_565 dq 007e007e007e007e0h
+
+cr_coeff dq 000003313e5fc0000h
+cb_coeff dq 000000000f377408dh
+rgb_bias dq 000007f2180887eebh
+
+msb_inv dq 08000800080008000h
+
+ section .text
+
+;============================================================================
+
+%macro YUV411PLANAR_TO_RGB_PROLOG 0 ;save regs and load args: eax=dst, ecx=Y, edx/ebx=chroma planes, ebp=count; mm7=0
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov eax, [esp+4+16]
+ mov ecx, [esp+8+16]
+ mov edx, [esp+12+16]
+ mov ebx, [esp+16+16]
+ mov ebp, [esp+20+16]
+
+ pxor mm7, mm7 ;mm7 = zero, used for byte->word unpack
+%endmacro
+
+%macro YUV411PLANAR_TO_RGB_CORE_MMX 0 ;decode 4 pixels: outputs mm1=red, mm2=blue, mm3=green (packed bytes)
+ movd mm0, dword [ecx] ;mm0 = Y3Y2Y1Y0
+ add ecx, 4
+ punpcklbw mm0, mm7 ;mm0 = Y3 | Y2 | Y1 | Y0
+ movq mm1, mm0
+ pmullw mm0, [y_co]
+ paddw mm1, [y_bias]
+ paddsw mm0, mm0 ;luma = 2*Y*y_co + (Y + y_bias)
+ paddsw mm0, mm1
+
+ movzx esi, word [ebx] ;fetch two chroma samples for interpolation
+ movzx edi, word [edx]
+ add ebx, 1
+ add edx, 1
+
+ movd mm1, esi
+ movd mm2, edi
+
+ punpcklbw mm1, mm7
+ paddw mm1, [c_bias]
+ punpcklwd mm1, mm1 ;broadcast the two chroma words
+ movq mm3, mm1
+ punpckldq mm1, mm1
+ punpckhdq mm3, mm3
+
+ punpcklbw mm2, mm7
+ paddw mm2, [c_bias]
+ punpcklwd mm2, mm2
+ movq mm4, mm2
+ punpckldq mm2, mm2
+ punpckhdq mm4, mm4
+
+ psubw mm3, mm1 ;delta between adjacent chroma samples
+ psubw mm4, mm2
+ paddw mm3, mm3
+ paddw mm4, mm4
+
+ pmulhw mm3, [interp] ;per-pixel fractions 0, 1/4, 2/4, 3/4
+ pmulhw mm4, [interp]
+
+ paddw mm1, mm3
+ paddw mm2, mm4
+
+ movq mm3, mm1
+ movq mm4, mm2
+
+ pmullw mm1, [cr_co_r]
+ pmullw mm2, [cb_co_b]
+ pmullw mm3, [cr_co_g]
+ pmullw mm4, [cb_co_g]
+
+ paddsw mm2, mm2 ;cb_co_b is stored halved (see table comment)
+ paddsw mm1, mm0
+ paddsw mm3, mm4
+ paddsw mm2, mm0
+ paddsw mm3, mm0
+
+ psraw mm1, 7
+ psraw mm2, 7
+ psraw mm3, 7
+
+ packuswb mm1, mm1 ;mm1=R, mm2=B, mm3=G as saturated bytes
+ packuswb mm2, mm2
+ packuswb mm3, mm3
+%endmacro
+
+%macro YUV411PLANAR_TO_RGB_CORE_ISSE 0 ;same contract as CORE_MMX, using pshufw for chroma broadcast
+ movd mm0, dword [ecx] ;mm0 = Y3Y2Y1Y0
+ add ecx, 4
+ punpcklbw mm0, mm7 ;mm0 = Y3 | Y2 | Y1 | Y0
+ movq mm1, mm0
+ pmullw mm0, [y_co]
+ paddw mm1, [y_bias]
+ paddsw mm0, mm0
+ paddsw mm0, mm1
+
+ movzx esi, word [ebx]
+ movzx edi, word [edx]
+ add ebx, 1
+ add edx, 1
+
+ movd mm1, esi
+ movd mm2, edi
+
+ punpcklbw mm1, mm7
+ paddw mm1, [c_bias]
+ pshufw mm3, mm1, 01010101b ;broadcast second chroma word
+ pshufw mm1, mm1, 00000000b ;broadcast first chroma word
+
+ punpcklbw mm2, mm7
+ paddw mm2, [c_bias]
+ pshufw mm4, mm2, 01010101b
+ pshufw mm2, mm2, 00000000b
+
+ psubw mm3, mm1
+ psubw mm4, mm2
+ paddw mm3, mm3
+ paddw mm4, mm4
+
+ pmulhw mm3, [interp]
+ pmulhw mm4, [interp]
+
+ paddw mm1, mm3
+ paddw mm2, mm4
+
+ psllw mm1, 3 ;pre-scale - differs from MMX core; coefficients consumed via pmullw below
+ psllw mm2, 3
+
+ movq mm3, [cr_co_g]
+ movq mm4, [cb_co_g]
+
+ pmullw mm3, mm1
+ pmullw mm4, mm2
+ pmullw mm1, [cr_co_r]
+ pmullw mm2, [cb_co_b]
+
+ paddsw mm2, mm2
+ paddsw mm1, mm0
+ paddsw mm3, mm4
+ paddsw mm2, mm0
+ paddsw mm3, mm0
+
+ psraw mm1, 7
+ psraw mm2, 7
+ psraw mm3, 7
+
+ packuswb mm1, mm1 ;mm1=R, mm2=B, mm3=G as saturated bytes
+ packuswb mm2, mm2
+ packuswb mm3, mm3
+%endmacro
+
+%macro YUV411PLANAR_TO_RGB_EPILOG 0 ;restore regs saved by PROLOG and return (no emms here)
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+%endmacro
+
+ global _vdasm_pixblt_YUV411Planar_to_XRGB1555_scan_MMX
+_vdasm_pixblt_YUV411Planar_to_XRGB1555_scan_MMX: ;one scanline: YUV 4:1:1 planar -> XRGB1555, 4 pixels/iteration
+ YUV411PLANAR_TO_RGB_PROLOG
+.xloop:
+ YUV411PLANAR_TO_RGB_CORE_MMX
+
+ psrlw mm1, 1 ;drop R to 5 bits (pre-pack positions)
+ psrlw mm2, 3 ;drop B to 5 bits
+ punpcklbw mm2, mm1 ;interleave R and B bytes
+ punpcklbw mm3, mm3
+ psllw mm3, 2 ;place G in bits 5-9
+ pand mm2, [rb_mask_555]
+ pand mm3, [g_mask_555]
+ por mm2, mm3
+
+ movq [eax], mm2
+ add eax, 8
+
+ sub ebp, 1
+ jne .xloop
+
+ YUV411PLANAR_TO_RGB_EPILOG
+
+;============================================================================
+
+ global _vdasm_pixblt_YUV411Planar_to_RGB565_scan_MMX
+_vdasm_pixblt_YUV411Planar_to_RGB565_scan_MMX: ;one scanline: YUV 4:1:1 planar -> RGB565, 4 pixels/iteration
+ YUV411PLANAR_TO_RGB_PROLOG
+.xloop:
+ YUV411PLANAR_TO_RGB_CORE_MMX
+
+ psrlw mm2, 3 ;drop B to 5 bits; R keeps all 5 bits in high byte
+ punpcklbw mm2, mm1
+ punpcklbw mm3, mm3
+ psllw mm3, 3 ;place G (6 bits) in bits 5-10
+ pand mm2, [rb_mask_565]
+ pand mm3, [g_mask_565]
+ por mm2, mm3
+
+ movq [eax], mm2
+ add eax, 8
+
+ sub ebp, 1
+ jne .xloop
+
+ YUV411PLANAR_TO_RGB_EPILOG
+
+;============================================================================
+
+ global _vdasm_pixblt_YUV411Planar_to_XRGB8888_scan_MMX
+_vdasm_pixblt_YUV411Planar_to_XRGB8888_scan_MMX: ;one scanline: YUV 4:1:1 planar -> XRGB8888, 4 pixels/iteration
+ YUV411PLANAR_TO_RGB_PROLOG
+.xloop:
+ YUV411PLANAR_TO_RGB_CORE_MMX ;BUGFIX: was PROLOG, which re-pushed regs every pass and left mm1-mm3 undefined
+
+ punpcklbw mm2, mm1 ;interleave B and R bytes
+ punpcklbw mm3, mm3 ;G duplicated into X slot
+ movq mm1, mm2
+ punpcklbw mm1, mm3 ;pixels 0-1: XRGB
+ punpckhbw mm2, mm3 ;pixels 2-3: XRGB
+
+ movq [eax], mm1
+ movq [eax+8], mm2
+ add eax, 16
+
+ sub ebp, 1
+ jne .xloop
+
+ YUV411PLANAR_TO_RGB_EPILOG
+
+;============================================================================
+
+ global _vdasm_pixblt_YUV411Planar_to_XRGB1555_scan_ISSE
+_vdasm_pixblt_YUV411Planar_to_XRGB1555_scan_ISSE: ;ISSE variant of the 411->1555 scanline converter
+ YUV411PLANAR_TO_RGB_PROLOG
+.xloop:
+ YUV411PLANAR_TO_RGB_CORE_ISSE
+
+ psrlw mm1, 1 ;R to 5 bits
+ psrlw mm2, 3 ;B to 5 bits
+ punpcklbw mm2, mm1
+ punpcklbw mm3, mm3
+ psllw mm3, 2 ;G into bits 5-9
+ pand mm2, [rb_mask_555]
+ pand mm3, [g_mask_555]
+ por mm2, mm3
+
+ movq [eax], mm2
+ add eax, 8
+
+ sub ebp, 1
+ jne .xloop
+
+ YUV411PLANAR_TO_RGB_EPILOG
+
+;============================================================================
+
+ global _vdasm_pixblt_YUV411Planar_to_RGB565_scan_ISSE
+_vdasm_pixblt_YUV411Planar_to_RGB565_scan_ISSE: ;ISSE variant of the 411->565 scanline converter
+ YUV411PLANAR_TO_RGB_PROLOG
+.xloop:
+ YUV411PLANAR_TO_RGB_CORE_ISSE
+
+ psrlw mm2, 3 ;B to 5 bits
+ punpcklbw mm2, mm1
+ punpcklbw mm3, mm3
+ psllw mm3, 3 ;G (6 bits) into bits 5-10
+ pand mm2, [rb_mask_565]
+ pand mm3, [g_mask_565]
+ por mm2, mm3
+
+ movq [eax], mm2
+ add eax, 8
+
+ sub ebp, 1
+ jne .xloop
+
+ YUV411PLANAR_TO_RGB_EPILOG
+
+;============================================================================
+
+ global _vdasm_pixblt_YUV411Planar_to_XRGB8888_scan_ISSE
+_vdasm_pixblt_YUV411Planar_to_XRGB8888_scan_ISSE: ;411->8888 with pavgw-based chroma interpolation; mm6 carries prior chroma term
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov eax, [esp+4+16] ;dst
+ mov ecx, [esp+8+16] ;Y plane
+ mov edx, [esp+12+16] ;chroma plane (multiplied by cb coeffs below)
+ mov ebx, [esp+16+16] ;chroma plane (multiplied by cr coeffs below)
+ mov ebp, [esp+20+16] ;count
+
+ pxor mm7, mm7
+
+ movzx esi, byte [ebx] ;prime mm6 with the first chroma RGB contribution
+ movzx edi, byte [edx]
+ add ebx, 1
+ add edx, 1
+
+ movd mm1, esi
+ movd mm2, edi
+
+ psllw mm1, 3
+ psllw mm2, 3
+
+ pshufw mm5, mm1, 0
+ pshufw mm6, mm2, 0
+
+ pmulhw mm5, [cr_coeff]
+ pmulhw mm6, [cb_coeff]
+ paddw mm6, mm5
+ paddw mm6, [rgb_bias]
+
+.xloop:
+ movd mm0, dword [ecx];mm0 = Y3Y2Y1Y0
+ add ecx, 4
+ punpcklbw mm0, mm7 ;mm0 = Y3 | Y2 | Y1 | Y0
+ psllw mm0, 3
+ pmulhw mm0, [y_co]
+ pxor mm0, [msb_inv] ;bias luma so later paddw wraps correctly with rgb_bias
+
+ movzx esi, byte [ebx]
+ movzx edi, byte [edx]
+ add ebx, 1
+ add edx, 1
+
+ movd mm1, esi
+ movd mm2, edi
+
+ psllw mm1, 3
+ psllw mm2, 3
+
+ pshufw mm1, mm1, 0
+ pshufw mm2, mm2, 0
+
+ pmulhw mm1, [cr_coeff]
+ pmulhw mm2, [cb_coeff]
+ paddw mm1, mm2 ;mm1 = next chroma contribution
+ paddw mm1, [rgb_bias]
+
+ movq mm2, mm1
+ pavgw mm2, mm6 ;mm2 = 1/2
+ pshufw mm3, mm0, 00000000b
+ paddw mm3, mm6 ;pixel 0: fraction 0
+ pavgw mm6, mm2 ;mm1 = 1/4
+ pshufw mm4, mm0, 01010101b
+ paddw mm4, mm6 ;pixel 1: fraction 1/4
+ packuswb mm3, mm4
+ movq [eax], mm3
+
+ pshufw mm3, mm0, 10101010b
+ paddw mm3, mm2 ;pixel 2: fraction 1/2
+ pshufw mm0, mm0, 11111111b
+ pavgw mm2, mm1 ;mm2 = 3/4
+ paddw mm2, mm0 ;pixel 3: fraction 3/4
+ packuswb mm3, mm2
+ movq [eax+8], mm3
+
+ movq mm6, mm1 ;carry chroma term to next group
+
+ add eax, 16
+
+ sub ebp, 1
+ jne .xloop
+
+ YUV411PLANAR_TO_RGB_EPILOG
+
+;==========================================================================
+
+%macro YUV444PLANAR_TO_RGB_PROLOG 0 ;load args: eax=dst, ecx=Y, edx/ebx=chroma planes, ebp=count (mm7 cleared by callers)
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov eax, [esp+4+16]
+ mov ecx, [esp+8+16]
+ mov edx, [esp+12+16]
+ mov ebx, [esp+16+16]
+ mov ebp, [esp+20+16]
+%endmacro
+
+%macro YUV444PLANAR_TO_RGB_CORE 0 ;in: mm0=Y words, mm1/mm2=chroma words; out: mm1=R, mm2=B, mm3=G packed bytes
+ movq mm3, mm0
+ pmullw mm0, [y_co]
+ paddw mm1, [c_bias]
+ paddw mm2, [c_bias]
+ paddw mm0, [y_bias] ;NOTE(review): bias added before the doubling here, unlike the 411 cores - confirm intended
+ paddsw mm0, mm0
+ paddsw mm0, mm3
+
+ movq mm3, [cr_co_g]
+ movq mm4, [cb_co_g]
+
+ pmullw mm3, mm1
+ pmullw mm4, mm2
+ pmullw mm1, [cr_co_r]
+ pmullw mm2, [cb_co_b]
+
+ paddsw mm2, mm2 ;cb_co_b is stored halved
+ paddsw mm1, mm0
+ paddsw mm3, mm4
+ paddsw mm2, mm0
+ paddsw mm3, mm0
+
+ psraw mm1, 7
+ psraw mm2, 7
+ psraw mm3, 7
+
+ packuswb mm1, mm1
+ packuswb mm2, mm2
+ packuswb mm3, mm3
+%endmacro
+
+%macro YUV444PLANAR_TO_RGB_EPILOG 0 ;restore regs saved by PROLOG and return (no emms here)
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+%endmacro
+
+;==========================================================================
+
+ global _vdasm_pixblt_YUV444Planar_to_XRGB1555_scan_MMX
+_vdasm_pixblt_YUV444Planar_to_XRGB1555_scan_MMX: ;YUV 4:4:4 planar -> XRGB1555; 4-wide loop plus scalar tail
+ YUV444PLANAR_TO_RGB_PROLOG
+
+ pxor mm7, mm7
+ movq mm5, [rb_mask_555]
+ movq mm6, [g_mask_555]
+
+ sub ebp, 3 ;bias count so "ja" keeps 4-pixel groups only
+ jbe .oddcheck
+.xloop4:
+ movd mm0, dword [ecx];mm0 = Y3Y2Y1Y0
+ movd mm1, dword [ebx]
+ movd mm2, dword [edx]
+ add ecx, 4
+ add ebx, 4
+ add edx, 4
+ punpcklbw mm0, mm7 ;mm0 = Y3 | Y2 | Y1 | Y0
+ punpcklbw mm1, mm7
+ punpcklbw mm2, mm7
+
+ YUV444PLANAR_TO_RGB_CORE
+
+ psrlw mm1, 1
+ psrlw mm2, 3
+ punpcklbw mm2, mm1
+ punpcklbw mm3, mm3
+ psllw mm3, 2
+ pand mm2, mm5
+ pand mm3, mm6
+ por mm2, mm3
+
+ movq [eax], mm2
+ add eax, 8
+
+ sub ebp, 4
+ ja .xloop4
+.oddcheck:
+ add ebp, 3 ;restore remainder count (0-3)
+ jz .noodd
+.xloop:
+ movzx edi, byte [ecx] ;single-pixel tail
+ movd mm0, edi
+ movzx edi, byte [ebx]
+ movd mm1, edi
+ movzx edi, byte [edx]
+ movd mm2, edi
+ add ecx, 1
+ add ebx, 1
+ add edx, 1
+
+ YUV444PLANAR_TO_RGB_CORE
+
+ psrlw mm1, 1
+ psrlw mm2, 3
+ punpcklbw mm2, mm1
+ punpcklbw mm3, mm3
+ psllw mm3, 2
+ pand mm2, mm5
+ pand mm3, mm6
+ por mm2, mm3
+
+ movd edi, mm2
+ mov [eax], di ;store one 16-bit pixel
+ add eax, 2
+
+ sub ebp, 1
+ jnz .xloop
+.noodd:
+ YUV444PLANAR_TO_RGB_EPILOG
+
+;==========================================================================
+
+ global _vdasm_pixblt_YUV444Planar_to_RGB565_scan_MMX
+_vdasm_pixblt_YUV444Planar_to_RGB565_scan_MMX: ;YUV 4:4:4 planar -> RGB565; 4-wide loop plus scalar tail
+ YUV444PLANAR_TO_RGB_PROLOG
+
+ pxor mm7, mm7
+ movq mm5, [rb_mask_565]
+ movq mm6, [g_mask_565]
+
+ sub ebp, 3 ;bias count so "ja" keeps 4-pixel groups only
+ jbe .oddcheck
+.xloop4:
+ movd mm0, dword [ecx];mm0 = Y3Y2Y1Y0
+ movd mm1, dword [ebx]
+ movd mm2, dword [edx]
+ add ecx, 4
+ add ebx, 4
+ add edx, 4
+ punpcklbw mm0, mm7 ;mm0 = Y3 | Y2 | Y1 | Y0
+ punpcklbw mm1, mm7
+ punpcklbw mm2, mm7
+
+ YUV444PLANAR_TO_RGB_CORE
+
+ psrlw mm2, 3
+ punpcklbw mm2, mm1
+ punpcklbw mm3, mm3
+ psllw mm3, 3
+ pand mm2, mm5
+ pand mm3, mm6
+ por mm2, mm3
+
+ movq [eax], mm2
+ add eax, 8
+
+ sub ebp, 4
+ ja .xloop4
+.oddcheck:
+ add ebp, 3 ;restore remainder count (0-3)
+ jz .noodd
+.xloop:
+ movzx edi, byte [ecx] ;single-pixel tail
+ movd mm0, edi
+ movzx edi, byte [ebx]
+ movd mm1, edi
+ movzx edi, byte [edx]
+ movd mm2, edi
+ add ecx, 1
+ add ebx, 1
+ add edx, 1
+
+ YUV444PLANAR_TO_RGB_CORE
+
+ psrlw mm2, 3
+ punpcklbw mm2, mm1
+ punpcklbw mm3, mm3
+ psllw mm3, 3
+ pand mm2, mm5
+ pand mm3, mm6
+ por mm2, mm3
+
+ movd edi, mm2
+ mov [eax], di
+ add eax, 2
+
+ sub ebp, 1
+ jnz .xloop
+.noodd:
+ YUV444PLANAR_TO_RGB_EPILOG
+
+;==========================================================================
+
+ global _vdasm_pixblt_YUV444Planar_to_XRGB8888_scan_MMX
+_vdasm_pixblt_YUV444Planar_to_XRGB8888_scan_MMX: ;YUV 4:4:4 planar -> XRGB8888; 4-wide loop plus scalar tail
+ YUV444PLANAR_TO_RGB_PROLOG
+
+ pxor mm7, mm7
+
+ sub ebp, 3 ;bias count so "ja" keeps 4-pixel groups only
+ jbe .oddcheck
+.xloop4:
+ movd mm0, dword [ecx];mm0 = Y3Y2Y1Y0
+ movd mm1, dword [ebx]
+ movd mm2, dword [edx]
+ add ecx, 4
+ add ebx, 4
+ add edx, 4
+ punpcklbw mm0, mm7 ;mm0 = Y3 | Y2 | Y1 | Y0
+ punpcklbw mm1, mm7
+ punpcklbw mm2, mm7
+
+ YUV444PLANAR_TO_RGB_CORE
+
+ punpcklbw mm2, mm1 ;interleave B and R
+ punpcklbw mm3, mm3 ;G doubled into X slot
+ movq mm1, mm2
+ punpcklbw mm1, mm3 ;pixels 0-1
+ punpckhbw mm2, mm3 ;pixels 2-3
+
+ movq [eax], mm1
+ movq [eax+8], mm2
+ add eax, 16
+
+ sub ebp, 4
+ ja .xloop4
+.oddcheck:
+ add ebp, 3 ;restore remainder count (0-3)
+ jz .noodd
+.xloop:
+ movzx edi, byte [ecx] ;single-pixel tail
+ movd mm0, edi
+ movzx edi, byte [ebx]
+ movd mm1, edi
+ movzx edi, byte [edx]
+ movd mm2, edi
+ add ecx, 1
+ add ebx, 1
+ add edx, 1
+ punpcklbw mm0, mm7 ;mm0 = Y3 | Y2 | Y1 | Y0
+
+ YUV444PLANAR_TO_RGB_CORE
+
+ punpcklbw mm2, mm1
+ punpcklbw mm3, mm3
+ punpcklbw mm2, mm3
+
+ movd dword [eax], mm2
+ add eax, 4
+
+ sub ebp, 1
+ jnz .xloop
+.noodd:
+ YUV444PLANAR_TO_RGB_EPILOG
+
+ end
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/a_bltrgb_mmx.asm b/src/thirdparty/VirtualDub/Kasumi/source/a_bltrgb_mmx.asm
new file mode 100644
index 000000000..aa0b99987
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/a_bltrgb_mmx.asm
@@ -0,0 +1,806 @@
+ section .rdata, rdata
+
+x07b dq 00707070707070707h
+x0200w dq 00200020002000200h
+x001fw dq 0001f001f001f001fh
+xffc0w dq 0ffc0ffc0ffc0ffc0h
+xffe0w dq 0ffe0ffe0ffe0ffe0h
+x2080w dq 02080208020802080h
+x4200w dq 04200420042004200h
+rb_mask5 dq 000f800f800f800f8h
+g_mask5 dq 00000f8000000f800h
+g_mask6 dq 00000fc000000fc00h
+rb_mul_565 dq 02000000420000004h
+rb_mul_555 dq 02000000820000008h
+r_mask_555 dq 07c007c007c007c00h
+g_mask_555 dq 003e003e003e003e0h
+b_mask_555 dq 0001f001f001f001fh
+r_mask_565 dq 0f800f800f800f800h
+g_mask_565 dq 007e007e007e007e0h
+b_mask_565 dq 0001f001f001f001fh
+
+%macro prologue 1 ;push callee-saved regs; arg was used for the (disabled) .fpo debug directive
+ push ebx
+ push esi
+ push edi
+ push ebp
+ ;.fpo (0,%1,4,4,1,0)
+%endmacro
+
+%macro epilogue 0 ;restore regs pushed by prologue (caller still issues ret)
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+%endmacro
+
+ section .text
+
+ global _vdasm_pixblt_RGB565_to_XRGB1555_MMX
+_vdasm_pixblt_RGB565_to_XRGB1555_MMX: ;565 -> 1555: keep low 5 bits of G, shift R and high G bit right by 1
+ prologue 6
+
+ mov ebp, [esp+20+16] ;width in pixels
+ mov edi, [esp+24+16] ;height
+ add ebp, ebp ;width in bytes
+ mov edx, [esp+4+16] ;dst
+ mov ecx, [esp+12+16] ;src
+ lea edx, [edx+ebp-6] ;point 6 bytes before row end; loop counts up through 0
+ lea ecx, [ecx+ebp-6]
+ neg ebp
+ mov [esp+20+16], ebp
+
+ movq mm5, [x001fw]
+ movq mm4, [xffc0w]
+
+.yloop:
+ mov ebp, [esp+20+16]
+ add ebp, 6
+ jbe .odd
+
+.xloop:
+ movq mm0, [ecx+ebp]
+ movq mm1, mm5
+ pand mm1, mm0 ;blue bits pass through
+ pand mm0, mm4 ;red + upper green
+ psrlq mm0, 1
+ paddw mm0, mm1
+ movq [edx+ebp], mm0
+ add ebp, 8
+ jnc .xloop
+
+ sub ebp, 6
+ jz .noodd
+.odd:
+ movzx eax, word [ecx+ebp+6] ;scalar tail, same transform per pixel
+ mov ebx, 0001f001fh
+ and ebx, eax
+ and eax, 0ffc0ffc0h
+ shr eax, 1
+ add eax, ebx
+ mov [edx+ebp+6], ax
+ add ebp, 2
+ jnz .odd
+.noodd:
+ add ecx, [esp+16+16] ;add pitches to advance rows
+ add edx, [esp+8+16]
+ dec edi
+ jne .yloop
+
+ emms
+ epilogue
+ ret
+
+ global _vdasm_pixblt_XRGB8888_to_XRGB1555_MMX
+_vdasm_pixblt_XRGB8888_to_XRGB1555_MMX: ;32-bit XRGB -> 1555 using the pmaddwd pack trick; software-pipelined loop
+ prologue 6
+
+ mov ebp, [esp+20+16] ;width in pixels
+ mov edi, [esp+24+16] ;height
+ add ebp, ebp ;width in dest bytes
+ mov edx, [esp+4+16]
+ mov ecx, [esp+12+16]
+ lea edx, [edx+ebp-14] ;bias pointers so ebp counts up to zero
+ lea ecx, [ecx+ebp*2-28]
+ neg ebp
+ mov [esp+20+16], ebp
+
+ movq mm5,[rb_mul_555]
+ movq mm6,[rb_mask5]
+ movq mm7,[g_mask5]
+
+.yloop:
+ mov ebp, [esp+20+16]
+ add ebp, 14
+ jbe .odd
+
+ ;This code uses the "pmaddwd" trick for 32->16 conversions from Intel's MMX
+ ;Application Notes.
+
+ movq mm0,[ecx+ebp*2] ;allocate 0 (0123)
+ movq mm2,mm0 ;allocate 2 (0 23)
+
+ movq mm1,[ecx+ebp*2+8] ;allocate 1 (0123)
+ movq mm3,mm1 ;allocate 3 (0123)
+ pand mm0,mm6
+ pmaddwd mm0,mm5
+ pand mm1,mm6
+ pmaddwd mm1,mm5
+ pand mm2,mm7
+ pand mm3,mm7
+ jmp .xloopstart ;enter pipelined loop after priming first group
+
+ align 16
+.xloop:
+ movq mm0,[ecx+ebp*2] ;allocate 0 (01234)
+ por mm4,mm2 ;free 2 (01 34)
+
+ por mm3,mm1 ;free 3 (01 34)
+ movq mm2,mm0 ;allocate 2 (0 234)
+
+ movq mm1,[ecx+ebp*2+8] ;allocate 1 (01234)
+ psrld mm4,6
+
+ psrld mm3,6
+ pand mm0,mm6
+
+ packssdw mm4,mm3 ;free 3 (012 4)
+ movq mm3,mm1 ;allocate 3 (01234)
+
+ pmaddwd mm0,mm5
+ pand mm1,mm6
+
+ pmaddwd mm1,mm5
+ pand mm2,mm7
+
+ movq [edx+ebp-8],mm4 ;free 4 (0123 )
+ pand mm3,mm7
+
+.xloopstart:
+ movq mm4,[ecx+ebp*2+16] ;allocate 4 (01234)
+ por mm0,mm2 ;free 2 (01 34)
+
+ por mm1,mm3 ;free 3 (01 4)
+ psrld mm0,6
+
+ movq mm3,[ecx+ebp*2+24] ;allocate 3 (01 34)
+ movq mm2,mm4 ;allocate 2 (01234)
+
+ psrld mm1,6
+ pand mm4,mm6
+
+ packssdw mm0,mm1 ;free 1 (0 234)
+ movq mm1,mm3 ;allocate 1 (01234)
+
+ movq [edx+ebp],mm0 ;free 0 ( 1234)
+ pand mm3,mm6
+
+ pmaddwd mm4,mm5
+ add ebp,16
+
+ pmaddwd mm3,mm5
+ pand mm2,mm7
+
+ pand mm1,mm7
+ jnc .xloop
+
+ por mm4,mm2 ;free 2 (01 34) - drain pipeline
+ por mm3,mm1 ;free 3 (01 34)
+ psrld mm4,6
+ psrld mm3,6
+ packssdw mm4,mm3 ;free 3 (012 4)
+ movq [edx+ebp-8],mm4 ;free 4 (0123 )
+
+.odd:
+ sub ebp, 14
+ jz .noodd
+.oddloop:
+ mov eax, [ecx+ebp*2+28] ;scalar tail: repack one 8888 pixel to 1555
+ mov ebx, 00f80000h
+ mov esi, eax
+ and ebx, eax
+ shr ebx, 9
+ and esi, 0000f800h
+ shr esi, 6
+ and eax, 000000f8h
+ shr eax, 3
+ add esi, ebx
+ add eax, esi
+ mov [edx+ebp+14], ax
+ add ebp, 2
+ jnz .oddloop
+.noodd:
+ add ecx, [esp+16+16]
+ add edx, [esp+8+16]
+ dec edi
+ jne .yloop
+
+ emms
+ epilogue
+ ret
+
+ global _vdasm_pixblt_XRGB1555_to_RGB565_MMX
+_vdasm_pixblt_XRGB1555_to_RGB565_MMX: ;1555 -> 565: shift R/G left one bit and replicate G's top bit into the new LSB
+ prologue 6
+
+ mov ebp, [esp+20+16] ;width in pixels
+ mov edi, [esp+24+16] ;height
+ add ebp, ebp
+ mov edx, [esp+4+16]
+ mov ecx, [esp+12+16]
+ lea edx, [edx+ebp-6] ;bias pointers so ebp counts up to zero
+ lea ecx, [ecx+ebp-6]
+ neg ebp
+ mov [esp+20+16], ebp
+
+ movq mm5, [x0200w]
+ movq mm4, [xffe0w]
+
+.yloop:
+ mov ebp, [esp+20+16]
+ add ebp, 6
+ jbe .odd
+
+.xloop:
+ movq mm0, [ecx+ebp]
+ movq mm1, mm4
+ movq mm2, mm0
+ pand mm1, mm0 ;R+G fields
+ pand mm0, mm5 ;G top bit
+ paddw mm1, mm2 ;doubles R+G in place, B unchanged
+ psrlq mm0, 4
+ paddw mm0, mm1 ;fill the new G LSB
+ movq [edx+ebp], mm0
+ add ebp, 8
+ jnc .xloop
+
+.odd:
+ sub ebp, 6
+ jz .noodd
+.oddloop:
+ movzx eax, word [ecx+ebp+6] ;scalar tail, same transform
+ mov ebx, 02000200h
+ mov esi, eax
+ and ebx, eax
+ shr ebx, 4
+ and esi, 0ffe0ffe0h
+ add eax, esi
+ add eax, ebx
+ mov [edx+ebp+6], ax
+ add ebp, 2
+ jnz .oddloop
+.noodd:
+ add ecx, [esp+16+16]
+ add edx, [esp+8+16]
+ dec edi
+ jne .yloop
+
+ emms
+ epilogue
+ ret
+
+
+ global _vdasm_pixblt_XRGB8888_to_RGB565_MMX
+_vdasm_pixblt_XRGB8888_to_RGB565_MMX: ;32-bit XRGB -> 565 using the pmaddwd pack trick; software-pipelined loop
+ prologue 6
+
+ mov ebp, [esp+20+16] ;width in pixels
+ mov edi, [esp+24+16] ;height
+ add ebp, ebp
+ mov edx, [esp+4+16]
+ mov ecx, [esp+12+16]
+ lea edx, [edx+ebp-14] ;bias pointers so ebp counts up to zero
+ lea ecx, [ecx+ebp*2-28]
+ neg ebp
+ mov [esp+20+16], ebp
+
+ movq mm5,[rb_mul_565]
+ movq mm6,[rb_mask5]
+ movq mm7,[g_mask6]
+
+.yloop:
+ mov ebp, [esp+20+16]
+ add ebp, 14
+ jbe .odd
+
+ ;This code uses the "pmaddwd" trick for 32->16 conversions from Intel's MMX
+ ;Application Notes.
+
+ movq mm0,[ecx+ebp*2] ;allocate 0 (0123)
+ movq mm2,mm0 ;allocate 2 (0 23)
+
+ movq mm1,[ecx+ebp*2+8] ;allocate 1 (0123)
+ movq mm3,mm1 ;allocate 3 (0123)
+ pand mm0,mm6
+ pmaddwd mm0,mm5
+ pand mm1,mm6
+ pmaddwd mm1,mm5
+ pand mm2,mm7
+ pand mm3,mm7
+ jmp .xloopstart ;enter pipelined loop after priming first group
+
+ align 16
+.xloop:
+ movq mm0,[ecx+ebp*2] ;allocate 0 (01234)
+ por mm4,mm2 ;free 2 (01 34)
+
+ por mm3,mm1 ;free 3 (01 34)
+ pslld mm4,16-5
+
+ pslld mm3,16-5
+ movq mm2,mm0 ;allocate 2 (0 234)
+
+ movq mm1,[ecx+ebp*2+8] ;allocate 1 (01234)
+ psrad mm4,16
+
+ psrad mm3,16
+ pand mm0,mm6
+
+ packssdw mm4,mm3 ;free 3 (012 4)
+ movq mm3,mm1 ;allocate 3 (01234)
+
+ pmaddwd mm0,mm5
+ pand mm1,mm6
+
+ pmaddwd mm1,mm5
+ pand mm2,mm7
+
+ movq [edx+ebp-8],mm4 ;free 4 (0123 )
+ pand mm3,mm7
+
+.xloopstart:
+ movq mm4,[ecx+ebp*2+16] ;allocate 4 (01234)
+ por mm0,mm2 ;free 2 (01 34)
+
+ por mm1,mm3 ;free 3 (01 4)
+ pslld mm0,16-5
+
+ movq mm3,[ecx+ebp*2+24] ;allocate 3 (01 34)
+ pslld mm1,16-5
+
+ psrad mm0,16
+ movq mm2,mm4 ;allocate 2 (01234)
+
+ psrad mm1,16
+ pand mm4,mm6
+
+ packssdw mm0,mm1 ;free 1 (0 234)
+ movq mm1,mm3 ;allocate 1 (01234)
+
+ movq [edx+ebp],mm0 ;free 0 ( 1234)
+ pand mm3,mm6
+
+ pmaddwd mm4,mm5
+ add ebp,16
+
+ pmaddwd mm3,mm5
+ pand mm2,mm7
+
+ pand mm1,mm7
+ jnc .xloop
+
+ por mm4,mm2 ;free 2 (01 34) - drain pipeline
+ por mm3,mm1 ;free 3 (01 34)
+ psllq mm4,16-5
+ psllq mm3,16-5
+ psrad mm4,16
+ psrad mm3,16
+ packssdw mm4,mm3 ;free 3 (012 4)
+ movq [edx+ebp-8],mm4 ;free 4 (0123 )
+
+.odd:
+ sub ebp, 14
+ jz .noodd
+.oddloop:
+ mov eax, [ecx+ebp*2+28] ;scalar tail: repack one 8888 pixel to 565
+ mov ebx, 00f80000h
+ mov esi, eax
+ and ebx, eax
+ and eax, 000000f8h
+ shr eax, 3
+ and esi, 0000fc00h
+ shr ebx, 8
+ shr esi, 5
+ add eax, ebx
+ add eax, esi
+ mov [edx+ebp+14], ax
+ add ebp, 2
+ jnz .oddloop
+.noodd:
+ add ecx, [esp+16+16]
+ add edx, [esp+8+16]
+ dec edi
+ jne .yloop
+
+ emms
+ epilogue
+ ret
+
+ global _vdasm_pixblt_XRGB8888_to_RGB888_MMX
+_vdasm_pixblt_XRGB8888_to_RGB888_MMX: ;strip X byte: 8 pixels (32 src bytes) -> 24 dest bytes per iteration
+ prologue 6
+
+ mov esi,[esp+12+16] ;src
+ mov edi,[esp+4+16] ;dst
+
+ mov ecx,[esp+20+16] ;width
+ lea eax,[ecx+ecx*2] ;3*width (dest row bytes)
+ lea ebx,[ecx*4] ;4*width (src row bytes)
+ sub [esp+8+16],eax ;convert pitches to modulos
+ sub [esp+16+16],ebx
+
+ pcmpeqb mm7,mm7
+ psrld mm7,8
+ movq mm6,mm7
+ psllq mm7,32 ;mm7 = high rgb mask
+ psrlq mm6,32 ;mm6 = low rgb mask
+
+ mov ebp,[esp+20+16]
+ mov edx,[esp+24+16] ;height
+ mov eax,[esp+16+16] ;src modulo (stashed in mm0 around the odd loop)
+ mov ebx,[esp+ 8+16] ;dst modulo
+.yloop:
+ mov ecx,ebp
+ shr ecx,3
+ jz .checkodd
+.xloop:
+ movq mm0,[esi] ;mm0 = a1r1g1b1a0r0g0b0
+ movq mm1,mm6
+
+ movq mm2,[esi+8] ;mm2 = a3r3g3b3a2r2g2b2
+ pand mm1,mm0 ;mm1 = ----------r0g0b0
+
+ movq mm3,mm6
+ pand mm0,mm7 ;mm0 = --r1g1b1--------
+
+ movq mm4,mm2
+ pand mm3,mm2 ;mm3 = ----------r2g2b2
+
+ psrlq mm0,8 ;mm0 = ----r1g1b1------
+ pand mm2,mm7 ;mm2 = --r3g3b3--------
+
+ movq mm5,[esi+16] ;mm5 = a5r5g5b5a4r4g4b4
+ psllq mm4,48 ;mm4 = g2b2------------
+
+ por mm0,mm1 ;mm0 = ----r1g1b1r0g0b0
+ psrlq mm3,16 ;mm3 = --------------r2
+
+ por mm0,mm4 ;mm0 = g2b2r1g1b1r0g0b0
+ movq mm1,mm6
+
+ pand mm1,mm5 ;mm1 = ----------r4g4b4
+ psrlq mm2,24 ;mm2 = --------r3g3b3--
+
+ movq [edi],mm0
+ pand mm5,mm7 ;mm5 = --r5g5b5--------
+
+ psllq mm1,32 ;mm1 = --r4g4b4--------
+ movq mm4,mm5 ;mm4 = --r5g5b5--------
+
+ por mm2,mm3 ;mm2 = --------r3g3b3r2
+ psllq mm5,24 ;mm5 = b5--------------
+
+ movq mm3,[esi+24] ;mm3 = a7r7g7b7a6r6g6b6
+ por mm2,mm1 ;mm2 = --r4g4b4r3g3b3r2
+
+ movq mm1,mm6
+ por mm2,mm5 ;mm2 = b5r4g4b4r3g3b3r2
+
+ psrlq mm4,40 ;mm4 = ------------r5g5
+ pand mm1,mm3 ;mm1 = ----------r6g6b6
+
+ psllq mm1,16 ;mm1 = ------r6g6b6----
+ pand mm3,mm7 ;mm3 = --r7g7b7--------
+
+ por mm4,mm1 ;mm4 = ------r6g6b6r5g5
+ psllq mm3,8 ;mm3 = r7g7b7----------
+
+ movq [edi+8],mm2
+ por mm4,mm3 ;mm4 = r7g7b7r6g6b6r5g5
+
+ add esi,32
+ sub ecx,1
+
+ movq [edi+16],mm4 ;mm3
+
+ lea edi,[edi+24]
+ jne .xloop
+
+.checkodd:
+ mov ecx,ebp
+ and ecx,7 ;leftover pixels (width mod 8)
+ jz .noodd
+ movd mm0,eax ;free eax for the byte copies below
+.oddloop:
+ mov eax,[esi]
+ add esi,4
+ mov [edi],ax
+ shr eax,16
+ mov [edi+2],al
+ add edi,3
+ sub ecx,1
+ jnz .oddloop
+ movd eax,mm0 ;restore src modulo
+.noodd:
+ add esi,eax
+ add edi,ebx
+
+ sub edx,1
+ jne .yloop
+
+ emms
+
+ epilogue
+ ret
+
+ global _vdasm_pixblt_XRGB1555_to_XRGB8888_MMX
+_vdasm_pixblt_XRGB1555_to_XRGB8888_MMX: ;1555 -> 8888 with bit-replication so 5-bit channels span full 0-255
+ prologue 6
+
+ mov ebp, [esp+20+16] ;width in pixels
+ mov edi, [esp+24+16] ;height
+ add ebp, ebp
+ mov edx, [esp+4+16]
+ mov ecx, [esp+12+16]
+ lea edx, [edx+ebp*2-12] ;bias pointers so ebp counts up to zero
+ lea ecx, [ecx+ebp-6]
+ neg ebp
+ mov [esp+20+16], ebp
+
+ movq mm5, [r_mask_555]
+ movq mm6, [g_mask_555]
+ movq mm7, [b_mask_555]
+
+.yloop:
+ mov ebp, [esp+20+16]
+ add ebp, 6
+ jbe .odd
+
+.xloop:
+ movq mm0, [ecx+ebp]
+ movq mm1, mm6
+ movq mm2, mm7
+ pand mm1, mm0 ;green
+ pand mm2, mm0 ;blue
+ pand mm0, mm5 ;red
+
+ paddw mm0, mm0
+ pmulhw mm1, [x4200w] ;expand green to 8 bits
+ psllq mm2, 3
+ paddw mm0, mm2 ;combine R and B into byte positions
+ movq mm2, mm0
+ psrlw mm0, 5
+ pand mm0, [x07b] ;replicate top 3 bits into low bits
+ paddw mm0, mm2
+ movq mm2, mm0
+ punpcklbw mm0, mm1 ;merge G to form two XRGB dwords
+ punpckhbw mm2, mm1
+
+ movq [edx+ebp*2], mm0
+ movq [edx+ebp*2+8], mm2
+ add ebp, 8
+ jnc .xloop
+.odd:
+ sub ebp, 6
+ jz .noodd
+.oddloop:
+ movzx eax, word [ecx+ebp+6] ;scalar tail, same expansion
+ mov ebx, 03e0h
+ mov esi, 001fh
+ and ebx, eax
+ and esi, eax
+ and eax, 07c00h
+ shl esi, 3
+ shl ebx, 6
+ shl eax, 9
+ add ebx, esi
+ add eax, ebx
+ mov ebx, eax
+ shr eax, 5
+ and eax, 070707h
+ add eax, ebx
+ mov [edx+ebp*2+12], eax
+ add ebp, 2
+ jnz .oddloop
+.noodd:
+ add ecx, [esp+16+16]
+ add edx, [esp+8+16]
+ dec edi
+ jne .yloop
+
+ emms
+ epilogue
+ ret
+
+
+ global _vdasm_pixblt_RGB565_to_XRGB8888_MMX
+_vdasm_pixblt_RGB565_to_XRGB8888_MMX: ;565 -> 8888 with bit-replication (6-bit green expanded via x2080w multiply)
+ prologue 6
+
+ mov ebp, [esp+20+16] ;width in pixels
+ mov edi, [esp+24+16] ;height
+ add ebp, ebp
+ mov edx, [esp+4+16]
+ mov ecx, [esp+12+16]
+ lea edx, [edx+ebp*2-12] ;bias pointers so ebp counts up to zero
+ lea ecx, [ecx+ebp-6]
+ neg ebp
+ mov [esp+20+16], ebp
+
+ movq mm5, [r_mask_565]
+ movq mm6, [g_mask_565]
+ movq mm7, [b_mask_565]
+
+.yloop:
+ mov ebp, [esp+20+16]
+ add ebp, 6
+ jbe .odd
+
+.xloop:
+ movq mm0, [ecx+ebp]
+ movq mm1, mm6
+ movq mm2, mm7
+ pand mm1, mm0 ;green
+ pand mm2, mm0 ;blue
+ pand mm0, mm5 ;red
+
+ pmulhw mm1, [x2080w] ;expand green to 8 bits
+ psllq mm2, 3
+ paddw mm0, mm2
+ movq mm2, mm0
+ psrlw mm0, 5
+ pand mm0, [x07b] ;replicate top 3 bits of R/B
+ paddw mm0, mm2
+ movq mm2, mm0
+ punpcklbw mm0, mm1
+ punpckhbw mm2, mm1
+
+ movq [edx+ebp*2], mm0
+ movq [edx+ebp*2+8], mm2
+ add ebp, 8
+ jnc .xloop
+
+.odd:
+ sub ebp, 6
+ jz .noodd
+ push edi ;edi holds row count; borrow it for the scalar tail
+.oddloop:
+ movzx eax, word [ecx+ebp+6]
+ mov ebx, 0000f800h
+ and ebx, eax
+ mov esi, eax
+ shl ebx, 8
+ mov edi, eax
+ shl eax, 3
+ and esi, 000007e0h
+ and eax, 000000f8h
+ add ebx, eax
+ shl esi, 5
+ mov eax, ebx
+ shr ebx, 5
+ and edi, 00000600h
+ shr edi, 1
+ and ebx, 00070007h
+ add esi, edi
+ add eax, ebx
+ add eax, esi
+ mov [edx+ebp*2+12], eax
+ add ebp, 2
+ jnz .oddloop
+ pop edi
+.noodd:
+ add ecx, [esp+16+16]
+ add edx, [esp+8+16]
+ dec edi
+ jne .yloop
+
+ emms
+ epilogue
+ ret
+
+
+ global _vdasm_pixblt_RGB888_to_XRGB8888_MMX
+_vdasm_pixblt_RGB888_to_XRGB8888_MMX: ;24-bit -> 32-bit: 8 pixels (24 src bytes -> 32 dest bytes) per iteration
+ prologue 6
+
+ mov esi,[esp+12+16] ;src
+ mov edi,[esp+4+16] ;dst
+
+ mov ecx,[esp+20+16] ;width
+ lea eax,[ecx+ecx*2] ;3*width (src row bytes)
+ lea ebx,[ecx*4] ;4*width (dest row bytes)
+ sub [esp+8+16],ebx ;convert pitches to modulos
+ sub [esp+16+16],eax
+
+ mov edx,[esp+24+16]
+ mov ebx,[esp+20+16]
+ mov ecx,[esp+16+16]
+ mov eax,[esp+ 8+16]
+
+ ;ebx horizontal count backup
+ ;ecx source modulo
+ ;edx vertical count
+ ;esi source
+ ;edi destination
+ ;ebp horizontal count
+
+.yloop:
+ mov ebp,ebx
+ shr ebp,3
+ jz .checkodd
+.xloop:
+ movq mm0,[esi] ;mm0: g2b2r1g1b1r0g0b0
+ movq mm1,mm0 ;
+
+ psrlq mm1,24 ;mm1: ------g2b2r1g1b1
+ movq mm2,mm0 ;
+
+ movq mm3,[esi+8] ;mm3: b5r4g4b4r3g3b3r2
+ punpckldq mm0,mm1 ;mm0: b2r1g1b1b1r0g0b0 [qword 0 ready]
+
+ movq mm4,mm3 ;mm4: b5r4g4b4r3g3b3r2
+ psllq mm3,48 ;mm3: b3r2------------
+
+ movq mm5,mm4 ;mm5: b5r4g4b4r3g3b3r2
+ psrlq mm2,16 ;mm2: ----g2b2--------
+
+ movq mm1,[esi+16] ;mm1: r7g7b7r6g6b6r5g5
+ por mm2,mm3 ;mm2: b3r2g2b2--------
+
+ movq [edi],mm0 ;
+ psllq mm4,24 ;mm4: b4r3g3b3r2------
+
+ movq mm3,mm5 ;mm3: b5r4g4b4r3g3b3r2
+ psrlq mm5,24 ;mm5: ------b5r4g4b4r3
+
+ movq mm0,mm1 ;mm0: r7g7b7r6g6b6r5g5
+ psllq mm1,40 ;mm1: b6r5g5----------
+
+ punpckhdq mm2,mm4 ;mm2: b4r3g3b3b3r2g2b2 [qword 1 ready]
+ por mm1,mm5 ;mm1: b6r5g5b5r4g4b4r3
+
+ movq mm4,mm0 ;mm4: r7g7b7r6g6b6r5g5
+ punpckhdq mm3,mm1 ;mm3: b6r5g5b5b5r4g4b4 [qword 2 ready]
+
+ movq [edi+8],mm2
+ psrlq mm0,16 ;mm0: ----r7g7b7r6g6b6
+
+ movq [edi+16],mm3
+ psrlq mm4,40 ;mm4: ----------r7g7b7
+
+ punpckldq mm0,mm4 ;mm0: --r7g7b7b7r6g6b6 [qword 3 ready]
+ add esi,24
+
+ movq [edi+24],mm0
+
+ add edi,32
+ sub ebp,1
+ jne .xloop
+
+.checkodd:
+ mov ebp,ebx
+ and ebp,7 ;leftover pixels (width mod 8)
+ jz .noodd
+ movd mm7,eax ;free eax for the byte copies below
+.oddloop:
+ mov ax,[esi]
+ mov [edi],ax
+ mov al,[esi+2]
+ mov [edi+2],al
+ add esi,3
+ add edi,4
+ sub ebp,1
+ jne .oddloop
+
+ movd eax,mm7 ;restore dst modulo
+.noodd:
+ add esi,ecx
+ add edi,eax
+
+ sub edx,1
+ jne .yloop
+ emms
+ epilogue
+ ret
+
+ end
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/a_bltyuv2rgb_sse2.asm b/src/thirdparty/VirtualDub/Kasumi/source/a_bltyuv2rgb_sse2.asm
new file mode 100644
index 000000000..87ff13b56
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/a_bltyuv2rgb_sse2.asm
@@ -0,0 +1,161 @@
+ section .rdata, rdata
+
+ align 16
+
+bytemasks dd 000000ffh, 0000ffffh, 00ffffffh
+
+ section .text
+
+;============================================================================
+
+ global _vdasm_pixblt_XRGB8888_to_YUV444Planar_scan_SSE2
+_vdasm_pixblt_XRGB8888_to_YUV444Planar_scan_SSE2: ;(ydst, cbdst, crdst, src, count, coeff_table): 4 pixels/iter, jump table for tail
+ push edi
+ push esi
+ push ebx
+
+ mov eax, [esp+4+12] ;Y output
+ mov ebx, [esp+8+12] ;Cb output
+ mov ecx, [esp+12+12] ;Cr output
+ mov edx, [esp+16+12] ;XRGB source
+ mov esi, [esp+20+12] ;pixel count
+ mov edi, [esp+24+12] ;coefficient/bias table
+
+ pcmpeqb xmm6, xmm6
+ psrlw xmm6, 8 ;xmm6 = 00FF x 8
+
+ sub esi, 4
+ js .postcheck
+.xloop:
+ movdqu xmm2, [edx] ;xmm0 = X3R3G3B3X2R2G2B2X1R1G1B1X0R0G0B0
+ add edx, 16
+ movdqa xmm5, xmm2
+ pand xmm2, xmm6 ;xmm0 = R3 B3 R2 B2 R1 B1 R0 B0
+ psrlw xmm5, 8 ;xmm1 = X3 G3 X2 G2 X1 G1 X0 G0
+ movdqa xmm0, [edi+0] ;coeff_rb_to_y
+ movdqa xmm1, [edi+16] ;coeff_rb_to_u
+ movdqa xmm3, [edi+32] ;coeff_g_to_y
+ movdqa xmm4, [edi+48] ;coeff_g_to_u
+ pmaddwd xmm0, xmm2
+ pmaddwd xmm1, xmm2
+ pmaddwd xmm2, [edi+64] ;coeff_rb_to_v
+ pmaddwd xmm3, xmm5
+ pmaddwd xmm4, xmm5
+ pmaddwd xmm5, [edi+80] ;coeff_g_to_v
+ paddd xmm0, xmm3
+ paddd xmm1, xmm4
+ paddd xmm2, xmm5
+ paddd xmm0, [edi+96] ;bias_y
+ paddd xmm1, [edi+112] ;bias_c
+ paddd xmm2, [edi+112] ;bias_c
+ psrad xmm0, 15 ;drop fixed-point fraction
+ psrad xmm1, 15
+ psrad xmm2, 15
+ packssdw xmm0, xmm0
+ packssdw xmm1, xmm1
+ packssdw xmm2, xmm2
+ packuswb xmm0, xmm0
+ packuswb xmm1, xmm1
+ packuswb xmm2, xmm2
+ movd [eax], xmm0 ;4 Y bytes
+ movd [ebx], xmm1 ;4 Cb bytes
+ movd [ecx], xmm2 ;4 Cr bytes
+ add eax, 4
+ add ebx, 4
+ add ecx, 4
+ sub esi, 4
+ jns .xloop
+.postcheck:
+ jmp dword [.finaltable + esi*4 + 16] ;esi in [-4,-1]: dispatch on remaining pixel count
+.complete:
+ pop ebx
+ pop esi
+ pop edi
+ ret
+
+.finaltable:
+ dd .complete
+ dd .do1
+ dd .do2
+ dd .do3
+
+.finaltable2:
+ dd .fin1
+ dd .fin2
+ dd .fin3
+
+.do1:
+ movd xmm2, [edx] ;1 remaining pixel
+ jmp short .dofinal
+.do2:
+ movq xmm2, [edx] ;2 remaining pixels
+ jmp short .dofinal
+.do3:
+ movq xmm2, [edx] ;3 remaining pixels: pixels 0-1...
+ movd xmm1, [edx+8] ;BUGFIX: was [edx], which re-read pixel 0 instead of pixel 2
+ movlhps xmm2, xmm1 ;...pixel 2 into the high half
+.dofinal:
+ movdqa xmm5, xmm2
+ pand xmm2, xmm6 ;xmm0 = R3 B3 R2 B2 R1 B1 R0 B0
+ psrlw xmm5, 8 ;xmm1 = X3 G3 X2 G2 X1 G1 X0 G0
+ movdqa xmm0, [edi+0] ;coeff_rb_to_y
+ movdqa xmm1, [edi+16] ;coeff_rb_to_u
+ movdqa xmm3, [edi+32] ;coeff_g_to_y
+ movdqa xmm4, [edi+48] ;coeff_g_to_u
+ pmaddwd xmm0, xmm2
+ pmaddwd xmm1, xmm2
+ pmaddwd xmm2, [edi+64] ;coeff_rb_to_v
+ pmaddwd xmm3, xmm5
+ pmaddwd xmm4, xmm5
+ pmaddwd xmm5, [edi+80] ;coeff_g_to_v
+ paddd xmm0, xmm3
+ paddd xmm1, xmm4
+ paddd xmm2, xmm5
+ paddd xmm0, [edi+96] ;bias_y
+ paddd xmm1, [edi+112] ;bias_c
+ paddd xmm2, [edi+112] ;bias_c
+ psrad xmm0, 15
+ psrad xmm1, 15
+ psrad xmm2, 15
+ packssdw xmm0, xmm0
+ packssdw xmm1, xmm1
+ packssdw xmm2, xmm2
+ packuswb xmm0, xmm0
+ packuswb xmm1, xmm1
+ movd xmm7, [bytemasks + esi*4 + 12] ;note: loaded but not used by .fin1-3 below
+ packuswb xmm2, xmm2
+
+ jmp dword [.finaltable2 + esi*4 + 12] ;esi in [-3,-1]
+
+.fin1:
+ movd edx, xmm0 ;store 1 byte per plane
+ mov [eax], dl
+ movd edx, xmm1
+ mov [ebx], dl
+ movd edx, xmm2
+ mov [ecx], dl
+ jmp .complete
+.fin2:
+ movd edx, xmm0 ;store 2 bytes per plane
+ mov [eax], dx
+ movd edx, xmm1
+ mov [ebx], dx
+ movd edx, xmm2
+ mov [ecx], dx
+ jmp .complete
+.fin3:
+ movd edx, xmm0 ;store 3 bytes per plane (word + byte)
+ mov [eax], dx
+ shr edx, 16
+ mov [eax+2], dl
+ movd edx, xmm1
+ mov [ebx], dx
+ shr edx, 16
+ mov [ebx+2], dl
+ movd edx, xmm2
+ mov [ecx], dx
+ shr edx, 16
+ mov [ecx+2], dl
+ jmp .complete
+
+ end
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/a_resample_mmx.asm b/src/thirdparty/VirtualDub/Kasumi/source/a_resample_mmx.asm
new file mode 100644
index 000000000..912c655ab
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/a_resample_mmx.asm
@@ -0,0 +1,1559 @@
+; VirtualDub - Video processing and capture application
+; Graphics support library
+; Copyright (C) 1998-2004 Avery Lee
+;
+; This program is free software; you can redistribute it and/or modify
+; it under the terms of the GNU General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or
+; (at your option) any later version.
+;
+; This program is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; GNU General Public License for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with this program; if not, write to the Free Software
+; Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+;
+ section .rdata, rdata, align=16
+
+; Constants are named after their own hex contents; each replicates a
+; 16- or 32-bit value across a 64-bit quadword for MMX use.
+x0002000200020002 dq 0002000200020002h
+x0004000400040004 dq 0004000400040004h
+x0008000800080008 dq 0008000800080008h
+x0000200000002000 dq 0000200000002000h
+
+ align 16
+; 0x2000 = half of 1<<14: rounding bias added before the arithmetic
+; right shift by 14 used throughout the filter kernels below.
+MMX_roundval dq 0000200000002000h, 0000200000002000h
+
+
+;**************************************************************************
+
+x0000FFFF0000FFFF dq 0000FFFF0000FFFFh
+x0000010100000101 dq 0000010100000101h
+x0100010001000100 dq 0100010001000100h
+
+ section .text
+
+;--------------------------------------------------------------------------
+;_vdasm_resize_interp_row_run_MMX(
+; [esp+ 4] void *dst,
+; [esp+ 8] void *src,
+; [esp+12] ulong width,
+; [esp+16] __int64 xaccum,
+; [esp+24] __int64 x_inc);
+;
+; Stretches one row of 32-bit pixels horizontally with 2-tap linear
+; interpolation: dst[i] = src[p]*(256-f) + src[p+1]*f >> 8, where the
+; source position p:f is 32:32 fixed point (xaccum), advanced by x_inc
+; per output pixel. mm4/mm6 carry the fractional dwords for the SIMD
+; blend while eax:esi + ebx:ecx track the same position in integer
+; registers (add/adc carries the integer step into the pixel index).
+;
+ global _vdasm_resize_interp_row_run_MMX
+_vdasm_resize_interp_row_run_MMX:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov esi, [esp+8+16]
+ mov edi, [esp+4+16]
+ mov ebp, [esp+12+16]
+
+ movd mm4, dword [esp+16+16]
+ pxor mm7, mm7
+ movd mm6, dword [esp+24+16]
+ punpckldq mm4, mm4
+ punpckldq mm6, mm6
+
+ shr esi, 2 ;esi = src/4 (pixel units); [esi*4] re-forms the address below
+
+ mov eax, [esp+16+16]
+ mov ebx, [esp+20+16]
+ add esi, ebx ;fold integer part of xaccum into the pixel index
+ mov ebx, [esp+24+16]
+ mov ecx, [esp+28+16]
+
+ shl ebp,2
+ add edi,ebp
+ neg ebp
+
+.colloop:
+ movd mm1, dword [esi*4+4]
+ movq mm5, mm4
+
+ movd mm0, dword [esi*4]
+ punpcklbw mm1, mm7
+
+ punpcklbw mm0, mm7
+ psrld mm5, 24 ;mm5 = blend fraction f (0..255)
+
+ movq mm3, [x0100010001000100]
+ packssdw mm5, mm5
+
+ pmullw mm1, mm5
+ psubw mm3, mm5 ;mm3 = 256 - f per word
+
+ pmullw mm0, mm3
+ paddd mm4, mm6
+
+ ;stall
+ ;stall
+
+ ;stall
+ ;stall
+
+ paddw mm0, mm1
+
+ psrlw mm0, 8
+ add eax, ebx
+
+ adc esi, ecx ;carry integer part of the x step into the source index
+ packuswb mm0, mm0
+
+ movd dword [edi+ebp],mm0
+
+ add ebp, 4
+ jnz .colloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+
+
+;**************************************************************************
+
+;vdasm_resize_interp_col_run_MMX(
+; [esp+ 4] void *dst,
+; [esp+ 8] void *src1,
+; [esp+12] void *src2,
+; [esp+16] ulong width,
+; [esp+20] ulong yaccum);
+
+; Blends two rows vertically: dst = src1*(256-f) + src2*f >> 8, with
+; f = yaccum>>8 (0..255). mm4 is prepared so each dword holds the word
+; pair (256-f, f); punpcklwd interleaves matching samples of the two
+; rows so one pmaddwd per register half performs the weighted sum.
+
+ global _vdasm_resize_interp_col_run_MMX
+_vdasm_resize_interp_col_run_MMX:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov esi, [esp+8+16]
+ mov edx, [esp+12+16]
+ mov edi, [esp+4+16]
+ mov ebp, [esp+16+16]
+
+ movd mm4, dword [esp+20+16]
+ pxor mm7, mm7
+ punpcklwd mm4, mm4
+ punpckldq mm4, mm4
+ psrlw mm4, 8 ;each word = f (0..255)
+ pxor mm4, [x0000FFFF0000FFFF] ;low word of each dword -> 0xFFFF - f
+ paddw mm4, [x0000010100000101] ;-> (256-f, f) word pairs
+
+ shl ebp, 2
+ add edi, ebp
+ add esi, ebp
+ add edx, ebp
+ neg ebp
+
+.colloop:
+ movd mm0, dword [esi+ebp]
+ movd mm2, dword [edx+ebp]
+
+ punpcklbw mm0, mm7
+ punpcklbw mm2, mm7
+
+ movq mm1, mm0
+ punpcklwd mm0, mm2
+ punpckhwd mm1, mm2
+
+ pmaddwd mm0, mm4
+ pmaddwd mm1, mm4
+
+ psrad mm0, 8
+ psrad mm1, 8
+
+ packssdw mm0, mm1
+ packuswb mm0, mm0
+
+ movd dword [edi+ebp],mm0
+
+ add ebp, 4
+ jnz .colloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+
+;--------------------------------------------------------------------------
+;vdasm_resize_ccint_row_MMX(dst, src, count, xaccum, xinc, tbl);
+;
+; Horizontal 4-tap ("ccint" = cubic interpolation) resampler. For each
+; output pixel the top 8 bits of the x fraction select a 16-byte entry
+; in tbl (two qwords of signed word coefficients); the pixel is the
+; pmaddwd dot product of 4 adjacent source pixels, biased by 0x2000 and
+; arithmetically shifted right by 14.
+;
+; Register budget is so tight that ESP is repurposed as the integer x
+; increment inside the loop: the real stack pointer is parked in a
+; temporary SEH frame at [fs:0] so the stack remains recoverable if a
+; fault occurs. No push/pop may be inserted between the frame setup
+; and teardown.
+
+ global _vdasm_resize_ccint_row_MMX
+_vdasm_resize_ccint_row_MMX:
+ push ebx
+ push esi
+ push edi
+ push ebp
+
+ mov ebx, [esp+4+16] ;ebx = dest addr
+ mov ecx, [esp+12+16] ;ecx = count
+
+ mov ebp, [esp+20+16] ;ebp = increment
+ mov edi, ebp ;edi = increment
+ shl ebp, 16 ;ebp = fractional increment
+ mov esi, [esp+16+16] ;esi = 16:16 position
+ sar edi, 16 ;edi = integer increment
+ mov [esp+20+16], ebp ;xinc = fractional increment
+ mov ebp, esi ;ebp = 16:16 position
+ shr esi, 16 ;esi = integer position
+ shl ebp, 16 ;ebp = fraction
+ mov [esp+16+16], ebp ;xaccum = fraction
+
+ mov eax, [esp+8+16]
+
+ shr ebp, 24 ;ebp = fraction (0...255)
+ mov [esp+8+16], edi
+ shl ebp, 4 ;ebp = fraction*16
+ mov edi, ebp
+ mov ebp, [esp+4+16] ;ebp = destination
+
+ shr eax, 2
+ add eax, esi
+ shl ecx, 2 ;ecx = count*4
+ lea ebp, [ebp+ecx-4]
+ neg ecx ;ecx = -count*4
+
+ movq mm6, [x0000200000002000]
+ pxor mm7, mm7
+
+ mov edx,[esp+16+16] ;edx = fractional accumulator
+ mov esi,[esp+20+16] ;esi = fractional increment
+
+ mov ebx,[esp+24+16] ;ebx = coefficient pointer
+
+ movd mm0,dword [eax*4]
+ movd mm1,dword [eax*4+4]
+ punpcklbw mm0,mm7 ;mm0 = [a1][r1][g1][b1]
+
+ ;borrow stack pointer
+ push 0 ;don't crash
+ push dword [fs:0] ;link a temporary SEH record so esp can be reused
+ mov dword [fs:0], esp
+ mov esp, [esp+8+24] ;esp = integer increment
+ jmp short ccint_loop_MMX_start
+
+ ;EAX source pointer / 4
+ ;EBX coefficient pointer
+ ;ECX count
+ ;EDX fractional accumulator
+ ;ESI fractional increment
+ ;EDI coefficient offset
+ ;ESP integer increment
+ ;EBP destination pointer
+
+ align 16
+ccint_loop_MMX:
+ movd mm0,dword [eax*4]
+ packuswb mm2,mm2 ;mm0 = [a][r][g][b][a][r][g][b]
+
+ movd mm1,dword [eax*4+4]
+ punpcklbw mm0,mm7 ;mm0 = [a1][r1][g1][b1]
+
+ movd dword [ebp+ecx],mm2
+ccint_loop_MMX_start:
+ movq mm4,mm0 ;mm0 = [a1][r1][g1][b1]
+
+ movd mm2,dword [eax*4+8]
+ punpcklbw mm1,mm7 ;mm1 = [a2][r2][g2][b2]
+
+ movd mm3,dword [eax*4+12]
+ punpcklbw mm2,mm7 ;mm2 = [a3][r3][g3][b3]
+
+ punpcklbw mm3,mm7 ;mm3 = [a4][r4][g4][b4]
+ movq mm5,mm2 ;mm2 = [a3][r3][g3][b3]
+
+ add edx,esi ;add fractional increment
+ punpcklwd mm0,mm1 ;mm0 = [g2][g1][b2][b1]
+
+ pmaddwd mm0,[ebx+edi]
+ punpcklwd mm2,mm3 ;mm2 = [g4][g3][b4][b3]
+
+ pmaddwd mm2,[ebx+edi+8]
+ punpckhwd mm4,mm1 ;mm4 = [a2][a1][r2][r1]
+
+ pmaddwd mm4,[ebx+edi]
+ punpckhwd mm5,mm3 ;mm5 = [a4][a3][b4][b3]
+
+ pmaddwd mm5,[ebx+edi+8]
+ paddd mm0,mm6
+
+ adc eax,esp ;add integer increment and fractional bump to offset
+ mov edi,0ff000000h
+
+ paddd mm2,mm0 ;mm0 = [ g ][ b ]
+ paddd mm4,mm6
+
+ psrad mm2,14
+ paddd mm4,mm5 ;mm4 = [ a ][ r ]
+
+ and edi,edx
+ psrad mm4,14
+
+ shr edi,20 ;edi = fraction (0...255)*16
+ add ecx,4
+
+ packssdw mm2,mm4 ;mm0 = [ a ][ r ][ g ][ b ]
+ jnc ccint_loop_MMX
+
+ packuswb mm2,mm2 ;mm0 = [a][r][g][b][a][r][g][b]
+ movd dword [ebp],mm2
+
+ mov esp, dword [fs:0] ;restore the real stack pointer from the SEH frame
+ pop dword [fs:0]
+ pop eax
+
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+
+;--------------------------------------------------------------------------
+;vdasm_resize_ccint_col_MMX(dst, src1, src2, src3, src4, count, tbl);
+
+; Vertical 4-tap filter: combines four source rows into one output row.
+; tbl supplies the coefficients: mm4 = taps for rows 1/2, mm5 = taps
+; for rows 3/4. 14-bit fixed point with 0x2000 rounding bias in mm6.
+; The loop is software-pipelined: the pack/store of each pixel overlaps
+; the loads of the next, hence the .entry jump into the loop middle.
+
+ global _vdasm_resize_ccint_col_MMX
+_vdasm_resize_ccint_col_MMX:
+ push ebx
+ push esi
+ push edi
+ push ebp
+
+ mov ebp, [esp+4+16] ;ebp = dest addr
+ mov esi, [esp+24+16] ;esi = count
+ add esi, esi
+ add esi, esi
+
+ mov eax, [esp+8+16] ;eax = row 1
+ mov ebx, [esp+12+16] ;ebx = row 2
+ mov ecx, [esp+16+16] ;ecx = row 3
+ mov edx, [esp+20+16] ;edx = row 4
+ mov edi, [esp+28+16] ;edi = coefficient ptr
+
+ add eax, esi
+ add ebx, esi
+ add ecx, esi
+ add edx, esi
+ add ebp, esi
+ neg esi
+
+ movq mm4,[edi]
+ movq mm5,[edi+8]
+ movq mm6,[x0000200000002000]
+ pxor mm7,mm7
+
+ movd mm2,dword [eax+esi]
+ movd mm1,dword [ebx+esi] ;mm1 = pixel1
+ punpcklbw mm2,mm7
+ jmp short ccint_col_loop_MMX.entry
+
+ align 16
+ccint_col_loop_MMX:
+ movd mm2,dword [eax+esi] ;mm2 = pixel0
+ packuswb mm0,mm0
+
+ movd mm1,dword [ebx+esi] ;mm1 = pixel1
+ pxor mm7,mm7 ;mm7 was clobbered as a temp below; rezero it
+
+ movd dword [ebp+esi-4],mm0
+ punpcklbw mm2,mm7
+
+ccint_col_loop_MMX.entry:
+ punpcklbw mm1,mm7
+ movq mm0,mm2
+
+ movd mm3,dword [edx+esi] ;mm3 = pixel3
+ punpcklwd mm0,mm1 ;mm0 = [g1][g0][b1][b0]
+
+ pmaddwd mm0,mm4
+ punpckhwd mm2,mm1 ;mm2 = [a1][a0][r1][r0]
+
+ movd mm1,dword [ecx+esi] ;mm1 = pixel2
+ punpcklbw mm3,mm7
+
+ pmaddwd mm2,mm4
+ punpcklbw mm1,mm7
+
+ movq mm7,mm1
+ punpcklwd mm1,mm3 ;mm1 = [g3][g2][b3][b2]
+
+ punpckhwd mm7,mm3 ;mm7 = [a3][a2][r3][r2]
+ pmaddwd mm1,mm5
+
+ pmaddwd mm7,mm5
+ paddd mm0,mm6
+
+ paddd mm2,mm6
+ paddd mm0,mm1
+
+ paddd mm2,mm7
+ psrad mm0,14
+
+ psrad mm2,14
+ add esi,4
+
+ packssdw mm0,mm2
+ jne ccint_col_loop_MMX
+
+ packuswb mm0,mm0
+ movd dword [ebp-4],mm0
+
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+
+;--------------------------------------------------------------------------
+;vdasm_resize_ccint_col_SSE2(dst, src1, src2, src3, src4, count, tbl);
+;
+; Vertical 4-tap filter, SSE2 version: processes two pixels per
+; iteration, with a single-pixel MMX tail for odd widths. tbl supplies
+; the coefficients (taps for rows 1/2 -> xmm4, rows 3/4 -> xmm5);
+; 14-bit fixed point with 0x2000 rounding bias in xmm6. Stores are
+; non-temporal (movntq/movnti); no emms/sfence is issued here --
+; presumably the caller handles that (verify at call sites).
+;
+; Fix: the coefficient/bias registers (xmm4..xmm7) are loaded BEFORE
+; the early branch to the odd-pixel tail. Previously a call with
+; count == 1 jumped straight to ccint_col_SSE2_odd and read
+; xmm4/xmm5/xmm6 uninitialized via movdq2q.
+
+ global _vdasm_resize_ccint_col_SSE2
+_vdasm_resize_ccint_col_SSE2:
+ push ebx
+ push esi
+ push edi
+ push ebp
+
+ mov ebp,[esp + 4 + 16] ;ebp = dest addr
+ mov esi,[esp + 24 + 16] ;esi = count
+ add esi,esi
+ add esi,esi
+
+ mov eax,[esp + 8 + 16] ;eax = row 1
+ mov ebx,[esp + 12 + 16] ;ebx = row 2
+ mov ecx,[esp + 16 + 16] ;ecx = row 3
+ mov edx,[esp + 20 + 16] ;edx = row 4
+ mov edi,[esp + 28 + 16] ;edi = coefficient ptr
+
+ neg esi
+
+ movq xmm4,qword [edi] ;taps for rows 1/2
+ movq xmm5,qword [edi+8] ;taps for rows 3/4
+ punpcklqdq xmm4,xmm4
+ punpcklqdq xmm5,xmm5
+ movq xmm6,[x0000200000002000] ;rounding bias
+ punpcklqdq xmm6,xmm6
+ pxor xmm7,xmm7
+
+ add esi,4
+ jz ccint_col_SSE2_odd ;count == 1: go straight to the scalar tail
+
+ccint_col_loop_SSE2:
+ movq xmm0, qword [eax]
+ add eax, 8
+ movq xmm1, qword [ebx]
+ add ebx, 8
+ movq xmm2, qword [ecx]
+ add ecx, 8
+ movq xmm3, qword [edx]
+ add edx, 8
+ punpcklbw xmm0,xmm1
+ punpcklbw xmm2,xmm3
+ movdqa xmm1,xmm0
+ movdqa xmm3,xmm2
+ punpcklbw xmm0,xmm7
+ punpckhbw xmm1,xmm7
+ punpcklbw xmm2,xmm7
+ punpckhbw xmm3,xmm7
+ pmaddwd xmm0,xmm4
+ pmaddwd xmm1,xmm4
+ pmaddwd xmm2,xmm5
+ pmaddwd xmm3,xmm5
+ paddd xmm0,xmm6
+ paddd xmm1,xmm6
+ paddd xmm0,xmm2
+ paddd xmm1,xmm3
+ psrad xmm0,14
+ psrad xmm1,14
+ packssdw xmm0,xmm1
+ packuswb xmm0,xmm0
+ movdq2q mm0,xmm0
+ movntq [ebp],mm0 ;non-temporal: output row is not re-read soon
+ add ebp,8
+ add esi,8
+ jnc ccint_col_loop_SSE2
+ jnz ccint_col_SSE2_noodd
+ccint_col_SSE2_odd:
+ ;single-pixel tail: same math in MMX registers
+ movd mm0, dword [eax]
+ pxor mm7,mm7
+ movd mm1, dword [ebx]
+ movdq2q mm4,xmm4
+ movd mm2, dword [ecx]
+ movdq2q mm5,xmm5
+ movd mm3, dword [edx]
+ movdq2q mm6,xmm6
+ punpcklbw mm0,mm1
+ punpcklbw mm2,mm3
+ movq mm1,mm0
+ movq mm3,mm2
+ punpcklbw mm0,mm7
+ punpckhbw mm1,mm7
+ punpcklbw mm2,mm7
+ punpckhbw mm3,mm7
+ pmaddwd mm0,mm4
+ pmaddwd mm1,mm4
+ pmaddwd mm2,mm5
+ pmaddwd mm3,mm5
+ paddd mm0,mm6
+ paddd mm2,mm6
+ paddd mm0,mm2
+ paddd mm1,mm3
+ psrad mm0,14
+ psrad mm1,14
+ packssdw mm0,mm1
+ packuswb mm0,mm0
+ movd eax,mm0
+ movnti [ebp],eax
+
+ccint_col_SSE2_noodd:
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+
+
+
+;-------------------------------------------------------------------------
+;
+; long resize_table_row_MMX(Pixel *out, Pixel *in, int *filter, int filter_width, PixDim w, long accum, long frac);
+
+; Horizontal table-driven filter. Per output pixel, the fixed-point
+; accumulator in eax supplies both the integer source position
+; (esi = (accum>>14) & ~3, a byte offset rounded to a pixel pair) and
+; the filter phase (bits of edx) used to select a coefficient row in
+; 'filter'. The pixel is an N-tap pmaddwd dot product, biased by
+; MMX_roundval and >>14. Dedicated fast paths exist for filter_width
+; 4, 6 and 8; the generic path iterates coefficient pairs two at a
+; time and finishes an odd pair count with a scalar final pair.
+
+ .code
+;NOTE(review): '.code' looks like a stray MASM-style directive in this
+;otherwise NASM-syntax file -- verify it assembles as intended.
+
+ global _vdasm_resize_table_row_MMX
+_vdasm_resize_table_row_MMX:
+ push ebp
+ push esi
+ push edi
+ push ebx
+
+ cmp dword [esp+16+16], 4
+ jz .accel_4coeff
+ cmp dword [esp+16+16], 6
+ jz .accel_6coeff
+ cmp dword [esp+16+16], 8
+ jz .accel_8coeff
+
+ mov eax,[esp + 24 + 16]
+ mov ebp,[esp + 20 + 16]
+ mov ebx,[esp + 8 + 16]
+ mov edi,[esp + 4 + 16]
+
+ mov esi,eax
+ mov edx,eax
+
+ pxor mm5,mm5
+
+ mov ecx,[esp + 16 + 16]
+ shr ecx,1 ;ecx = coefficient pair count
+ mov [esp+16+16],ecx
+ test ecx,1
+ jnz .pixelloop_odd_pairs
+
+.pixelloop_even_pairs:
+ shr esi,14
+ and edx,0000ff00h
+ and esi,byte -4
+
+ mov ecx,[esp + 16 + 16]
+ shr edx,5
+ add esi,ebx
+ imul edx,ecx
+ add eax,[esp + 28 + 16]
+ add edx,[esp + 12 + 16]
+
+ movq mm6,[MMX_roundval]
+ pxor mm3,mm3
+ movq mm7,mm6
+ pxor mm2,mm2
+
+.coeffloop_unaligned_even_pairs:
+ movd mm0,dword [esi+0]
+ paddd mm7,mm2 ;accumulate alpha/red (pixels 2/3)
+
+ punpcklbw mm0,[esi+4] ;mm1=[a0][a1][r0][r1][g0][g1][b0][b1]
+ paddd mm6,mm3 ;accumulate green/blue (pixels 2/3)
+
+ movd mm2,dword [esi+8]
+ movq mm1,mm0 ;mm0=[a0][a1][r0][r1][g0][g1][b0][b1]
+
+ punpcklbw mm2,[esi+12] ;mm2=[a2][a3][r2][r3][g2][g3][b2][b3]
+
+ punpckhbw mm0,mm5 ;mm0=[ a0 ][ a1 ][ r0 ][ r1 ]
+ movq mm3,mm2 ;mm3=[a2][a3][r2][r3][g2][g3][b2][b3]
+
+ pmaddwd mm0,[edx] ;mm0=[a0*f0+a1*f1][r0*f0+r1*f1]
+ punpcklbw mm1,mm5 ;mm1=[ g0 ][ g1 ][ b0 ][ b1 ]
+
+ pmaddwd mm1,[edx] ;mm1=[g0*f0+g1*f1][b0*f0+b1*f1]
+ punpckhbw mm2,mm5 ;mm2=[ a2 ][ a3 ][ r0 ][ r1 ]
+
+ pmaddwd mm2,[edx+8] ;mm2=[a2*f2+a3*f3][r2*f2+r3*f3]
+ punpcklbw mm3,mm5 ;mm3=[ g2 ][ g3 ][ b2 ][ b3 ]
+
+ pmaddwd mm3,[edx+8] ;mm3=[g2*f2+g3*f3][b2*f2+b3*f3]
+ paddd mm7,mm0 ;accumulate alpha/red (pixels 0/1)
+
+ paddd mm6,mm1 ;accumulate green/blue (pixels 0/1)
+ add edx,16
+
+ add esi,16
+ sub ecx,2
+
+ jne .coeffloop_unaligned_even_pairs
+
+ paddd mm7,mm2 ;accumulate alpha/red (pixels 2/3)
+ paddd mm6,mm3 ;accumulate green/blue (pixels 2/3)
+
+ psrad mm7,14
+ psrad mm6,14
+
+ packssdw mm6,mm7
+ add edi,4
+
+ packuswb mm6,mm6
+ sub ebp,1
+
+ mov esi,eax
+ mov edx,eax
+
+ movd dword [edi-4],mm6
+ jne .pixelloop_even_pairs
+
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+
+ ret
+
+;----------------------------------------------------------------
+
+.pixelloop_odd_pairs:
+ shr esi,14
+ and edx,0000ff00h
+ and esi,byte -4
+
+ mov ecx,[esp + 16 + 16]
+ shr edx,5
+ add esi,ebx
+ imul edx,ecx
+ add eax,[esp + 28 + 16]
+ sub ecx,1 ;loop below handles pairs in twos; last pair done after it
+ add edx,[esp + 12 + 16]
+
+ movq mm6,[MMX_roundval]
+ pxor mm3,mm3
+ pxor mm2,mm2
+ movq mm7,mm6
+
+.coeffloop_unaligned_odd_pairs:
+ movd mm0,dword [esi+0]
+ paddd mm7,mm2 ;accumulate alpha/red (pixels 2/3)
+
+ punpcklbw mm0,[esi+4] ;mm1=[a0][a1][r0][r1][g0][g1][b0][b1]
+ paddd mm6,mm3 ;accumulate green/blue (pixels 2/3)
+
+ movd mm2,dword [esi+8]
+ movq mm1,mm0 ;mm0=[a0][a1][r0][r1][g0][g1][b0][b1]
+
+ punpcklbw mm2,[esi+12] ;mm2=[a2][a3][r2][r3][g2][g3][b2][b3]
+
+ punpckhbw mm0,mm5 ;mm0=[ a0 ][ a1 ][ r0 ][ r1 ]
+ movq mm3,mm2 ;mm3=[a2][a3][r2][r3][g2][g3][b2][b3]
+
+ pmaddwd mm0,[edx] ;mm0=[a0*f0+a1*f1][r0*f0+r1*f1]
+ punpcklbw mm1,mm5 ;mm1=[ g0 ][ g1 ][ b0 ][ b1 ]
+
+ pmaddwd mm1,[edx] ;mm1=[g0*f0+g1*f1][b0*f0+b1*f1]
+ punpckhbw mm2,mm5 ;mm2=[ a2 ][ a3 ][ r0 ][ r1 ]
+
+ pmaddwd mm2,[edx+8] ;mm2=[a2*f2+a3*f3][r2*f2+r3*f3]
+ punpcklbw mm3,mm5 ;mm3=[ g2 ][ g3 ][ b2 ][ b3 ]
+
+ pmaddwd mm3,[edx+8] ;mm3=[g2*f2+g3*f3][b2*f2+b3*f3]
+ paddd mm7,mm0 ;accumulate alpha/red (pixels 0/1)
+
+ paddd mm6,mm1 ;accumulate green/blue (pixels 0/1)
+ add edx,16
+
+ add esi,16
+ sub ecx,2
+
+ jne .coeffloop_unaligned_odd_pairs
+
+ paddd mm7,mm2 ;accumulate alpha/red (pixels 2/3)
+ paddd mm6,mm3 ;accumulate green/blue (pixels 2/3)
+
+ ;finish up odd pair
+
+ movd mm0,dword [esi] ;mm0 = [x1][r1][g1][b1]
+ punpcklbw mm0,[esi+4] ;mm2 = [x0][x1][r0][r1][g0][g1][b0][b1]
+ movq mm1,mm0
+ punpcklbw mm0,mm5 ;mm0 = [g0][g1][b0][b1]
+ punpckhbw mm1,mm5 ;mm1 = [x0][x1][r0][r1]
+
+ pmaddwd mm0,[edx]
+ pmaddwd mm1,[edx]
+
+ paddd mm6,mm0
+ paddd mm7,mm1
+
+ ;combine into pixel
+
+ psrad mm6,14
+
+ psrad mm7,14
+
+ packssdw mm6,mm7
+ add edi,4
+
+ packuswb mm6,mm6
+ sub ebp,1
+
+ mov esi,eax
+ mov edx,eax
+
+ movd dword [edi-4],mm6
+ jne .pixelloop_odd_pairs
+
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+
+ ret
+
+;----------------------------------------------------------------
+
+.accel_4coeff:
+ mov eax,[esp + 24 + 16]
+ mov ebp,[esp + 20 + 16]
+ add ebp,ebp
+ add ebp,ebp
+ mov ebx,[esp + 8 + 16]
+ mov edi,[esp + 4 + 16]
+ add edi,ebp
+ neg ebp
+
+ mov esi,eax
+ mov edx,eax
+
+ movq mm4,[MMX_roundval]
+ pxor mm5,mm5
+
+ mov ecx,[esp+12+16]
+
+.pixelloop_4coeff:
+ shr esi,14
+ and edx,0000ff00h
+ and esi,byte -4
+
+ shr edx,4
+ add esi,ebx
+ add eax,[esp+28+16]
+ add edx,ecx
+
+ movd mm0,dword [esi+0]
+ movd mm2,dword [esi+8]
+ punpcklbw mm0,[esi+4] ;mm0=[a0][a1][r0][r1][g0][g1][b0][b1]
+
+ movq mm1,mm0 ;mm1=[a0][a1][r0][r1][g0][g1][b0][b1]
+
+ punpckhbw mm0,mm5 ;mm0=[ a0 ][ a1 ][ r0 ][ r1 ]
+
+ pmaddwd mm0,[edx] ;mm0=[a0*f0+a1*f1][r0*f0+r1*f1]
+ punpcklbw mm2,[esi+12] ;mm2=[a2][a3][r2][r3][g2][g3][b2][b3]
+
+ movq mm3,mm2 ;mm3=[a2][a3][r2][r3][g2][g3][b2][b3]
+ punpcklbw mm1,mm5 ;mm1=[ g0 ][ g1 ][ b0 ][ b1 ]
+
+ pmaddwd mm1,[edx] ;mm1=[g0*f0+g1*f1][b0*f0+b1*f1]
+ punpckhbw mm2,mm5 ;mm2=[ a2 ][ a3 ][ r0 ][ r1 ]
+
+ pmaddwd mm2,[edx+8] ;mm2=[a2*f2+a3*f3][r2*f2+r3*f3]
+ punpcklbw mm3,mm5 ;mm3=[ g2 ][ g3 ][ b2 ][ b3 ]
+
+ pmaddwd mm3,[edx+8] ;mm3=[g2*f2+g3*f3][b2*f2+b3*f3]
+ paddd mm0,mm4 ;accumulate alpha/red (pixels 0/1)
+
+ paddd mm1,mm4 ;accumulate green/blue (pixels 0/1)
+
+ paddd mm0,mm2 ;accumulate alpha/red (pixels 2/3)
+ paddd mm1,mm3 ;accumulate green/blue (pixels 2/3)
+
+ psrad mm0,14
+ psrad mm1,14
+
+ packssdw mm1,mm0
+ mov esi,eax
+
+ packuswb mm1,mm1
+ mov edx,eax
+
+ movd dword [edi+ebp],mm1
+ add ebp,4
+ jne .pixelloop_4coeff
+
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+
+ ret
+
+
+;----------------------------------------------------------------
+
+.accel_6coeff:
+ mov eax,[esp + 24 + 16]
+ mov ebp,[esp + 20 + 16]
+ add ebp,ebp
+ add ebp,ebp
+ mov ebx,[esp + 8 + 16]
+ mov edi,[esp + 4 + 16]
+ add edi,ebp
+ neg ebp
+
+ mov esi,eax
+ mov edx,eax
+
+ movq mm4,[MMX_roundval]
+ pxor mm5,mm5
+
+ mov ecx,[esp+12+16]
+
+.pixelloop_6coeff:
+ shr esi,14
+ and edx,0000ff00h
+ and esi,byte -4
+
+ shr edx,5
+ lea edx,[edx+edx*2] ;phase * 24 bytes per 6-coefficient row
+ add esi,ebx
+ add eax,[esp+28+16]
+ add edx,ecx
+
+ movd mm0,dword [esi+0]
+ movd mm2,dword [esi+8]
+ punpcklbw mm0,[esi+4] ;mm0=[a0][a1][r0][r1][g0][g1][b0][b1]
+
+ movq mm1,mm0 ;mm1=[a0][a1][r0][r1][g0][g1][b0][b1]
+
+ punpckhbw mm0,mm5 ;mm0=[ a0 ][ a1 ][ r0 ][ r1 ]
+
+ pmaddwd mm0,[edx] ;mm0=[a0*f0+a1*f1][r0*f0+r1*f1]
+ punpcklbw mm2,[esi+12] ;mm2=[a2][a3][r2][r3][g2][g3][b2][b3]
+
+ movq mm3,mm2 ;mm3=[a2][a3][r2][r3][g2][g3][b2][b3]
+ punpcklbw mm1,mm5 ;mm1=[ g0 ][ g1 ][ b0 ][ b1 ]
+
+ pmaddwd mm1,[edx] ;mm1=[g0*f0+g1*f1][b0*f0+b1*f1]
+ punpckhbw mm2,mm5 ;mm2=[ a2 ][ a3 ][ r0 ][ r1 ]
+
+ pmaddwd mm2,[edx+8] ;mm2=[a2*f2+a3*f3][r2*f2+r3*f3]
+ punpcklbw mm3,mm5 ;mm3=[ g2 ][ g3 ][ b2 ][ b3 ]
+
+ pmaddwd mm3,[edx+8] ;mm3=[g2*f2+g3*f3][b2*f2+b3*f3]
+ paddd mm0,mm4 ;accumulate alpha/red (pixels 0/1)
+
+ paddd mm1,mm4 ;accumulate green/blue (pixels 0/1)
+
+ paddd mm0,mm2 ;accumulate alpha/red (pixels 2/3)
+ paddd mm1,mm3 ;accumulate green/blue (pixels 2/3)
+
+ movd mm6,dword [esi+16]
+
+ punpcklbw mm6,[esi+20] ;mm1=[a0][a1][r0][r1][g0][g1][b0][b1]
+
+ movq mm7,mm6 ;mm0=[a0][a1][r0][r1][g0][g1][b0][b1]
+
+ punpckhbw mm6,mm5 ;mm0=[ a0 ][ a1 ][ r0 ][ r1 ]
+
+ pmaddwd mm6,[edx+16] ;mm0=[a0*f0+a1*f1][r0*f0+r1*f1]
+ punpcklbw mm7,mm5 ;mm1=[ g0 ][ g1 ][ b0 ][ b1 ]
+
+ pmaddwd mm7,[edx+16] ;mm1=[g0*f0+g1*f1][b0*f0+b1*f1]
+ paddd mm0,mm6 ;accumulate alpha/red (pixels 4/5)
+
+ paddd mm1,mm7 ;accumulate green/blue (pixels 4/5)
+
+
+ psrad mm0,14
+ psrad mm1,14
+
+ packssdw mm1,mm0
+ mov esi,eax
+
+ packuswb mm1,mm1
+ mov edx,eax
+
+ movd dword [edi+ebp],mm1
+ add ebp,4
+ jne .pixelloop_6coeff
+
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+
+ ret
+
+;----------------------------------------------------------------
+
+.accel_8coeff:
+ mov eax,[esp + 24 + 16]
+ mov ebp,[esp + 20 + 16]
+ add ebp,ebp
+ add ebp,ebp
+ mov ebx,[esp + 8 + 16]
+ mov edi,[esp + 4 + 16]
+ add edi,ebp
+ neg ebp
+
+ mov esi,eax
+ mov edx,eax
+
+ movq mm4,[MMX_roundval]
+ pxor mm5,mm5
+
+ mov ecx,[esp+12+16]
+
+.pixelloop_8coeff:
+ shr esi,14
+ and edx,0000ff00h
+ and esi,byte -4
+
+ shr edx,3
+ add esi,ebx
+ add eax,[esp+28+16]
+ add edx,ecx
+
+ movd mm0,dword [esi+0]
+ movd mm2,dword [esi+8]
+ punpcklbw mm0,[esi+4] ;mm0=[a0][a1][r0][r1][g0][g1][b0][b1]
+
+ movq mm1,mm0 ;mm1=[a0][a1][r0][r1][g0][g1][b0][b1]
+
+ punpckhbw mm0,mm5 ;mm0=[ a0 ][ a1 ][ r0 ][ r1 ]
+
+ pmaddwd mm0,[edx] ;mm0=[a0*f0+a1*f1][r0*f0+r1*f1]
+ punpcklbw mm2,[esi+12] ;mm2=[a2][a3][r2][r3][g2][g3][b2][b3]
+
+ movq mm3,mm2 ;mm3=[a2][a3][r2][r3][g2][g3][b2][b3]
+ punpcklbw mm1,mm5 ;mm1=[ g0 ][ g1 ][ b0 ][ b1 ]
+
+ pmaddwd mm1,[edx] ;mm1=[g0*f0+g1*f1][b0*f0+b1*f1]
+ punpckhbw mm2,mm5 ;mm2=[ a2 ][ a3 ][ r0 ][ r1 ]
+
+ pmaddwd mm2,[edx+8] ;mm2=[a2*f2+a3*f3][r2*f2+r3*f3]
+ punpcklbw mm3,mm5 ;mm3=[ g2 ][ g3 ][ b2 ][ b3 ]
+
+ pmaddwd mm3,[edx+8] ;mm3=[g2*f2+g3*f3][b2*f2+b3*f3]
+ paddd mm0,mm4 ;accumulate alpha/red (pixels 0/1)
+
+ paddd mm1,mm4 ;accumulate green/blue (pixels 0/1)
+
+ paddd mm0,mm2 ;accumulate alpha/red (pixels 2/3)
+ paddd mm1,mm3 ;accumulate green/blue (pixels 2/3)
+
+
+ movd mm6,dword [esi+16]
+
+ punpcklbw mm6,[esi+20] ;mm1=[a0][a1][r0][r1][g0][g1][b0][b1]
+
+ movd mm2,dword [esi+24]
+
+ punpcklbw mm2,[esi+28] ;mm2=[a2][a3][r2][r3][g2][g3][b2][b3]
+ movq mm7,mm6 ;mm0=[a0][a1][r0][r1][g0][g1][b0][b1]
+
+ punpckhbw mm6,mm5 ;mm0=[ a0 ][ a1 ][ r0 ][ r1 ]
+ movq mm3,mm2 ;mm3=[a2][a3][r2][r3][g2][g3][b2][b3]
+
+ pmaddwd mm6,[edx+16] ;mm0=[a0*f0+a1*f1][r0*f0+r1*f1]
+ punpcklbw mm7,mm5 ;mm1=[ g0 ][ g1 ][ b0 ][ b1 ]
+
+ pmaddwd mm7,[edx+16] ;mm1=[g0*f0+g1*f1][b0*f0+b1*f1]
+ punpckhbw mm2,mm5 ;mm2=[ a2 ][ a3 ][ r0 ][ r1 ]
+
+ pmaddwd mm2,[edx+24] ;mm2=[a2*f2+a3*f3][r2*f2+r3*f3]
+ punpcklbw mm3,mm5 ;mm3=[ g2 ][ g3 ][ b2 ][ b3 ]
+
+ pmaddwd mm3,[edx+24] ;mm3=[g2*f2+g3*f3][b2*f2+b3*f3]
+ paddd mm0,mm6 ;accumulate alpha/red (pixels 4/5)
+
+ paddd mm1,mm7 ;accumulate green/blue (pixels 4/5)
+ paddd mm0,mm2 ;accumulate alpha/red (pixels 6/7)
+
+ paddd mm1,mm3 ;accumulate green/blue (pixels 6/7)
+
+
+ psrad mm0,14
+ psrad mm1,14
+
+ packssdw mm1,mm0
+ mov esi,eax
+
+ packuswb mm1,mm1
+ mov edx,eax
+
+ movd dword [edi+ebp],mm1
+ add ebp,4
+ jne .pixelloop_8coeff
+
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+
+ ret
+
+
+
+
+
+
+
+;-------------------------------------------------------------------------
+;
+; long resize_table_col_MMX(Pixel *out, Pixel **in_table, int *filter, int filter_width, PixDim w, long frac);
+
+; Vertical table-driven filter: in_table holds filter_width row
+; pointers, and each output pixel is the N-tap dot product of the same
+; x offset across those rows. frac*4*filter_width selects the
+; coefficient row up front. Fast paths for 4 and 6 taps; the 6-tap
+; path needs a sixth row-pointer register, so ESP is borrowed via a
+; temporary SEH frame at [fs:0] (no push/pop inside that loop).
+
+ global _vdasm_resize_table_col_MMX
+_vdasm_resize_table_col_MMX:
+ push ebp
+ push esi
+ push edi
+ push ebx
+
+ mov edx,[esp + 12 + 16]
+ mov eax,[esp + 24 + 16]
+ shl eax,2
+ imul eax,[esp + 16 + 16]
+ add edx,eax
+ mov [esp + 12 + 16], edx ;[esp+12+28] = filter pointer
+
+ mov ebp,[esp + 20 + 16] ;ebp = pixel counter
+ mov edi,[esp + 4 + 16] ;edi = destination pointer
+
+ pxor mm5,mm5
+
+ cmp dword [esp+16+16], 4
+ jz .accel_4coeff
+ cmp dword [esp+16+16], 6
+ jz .accel_6coeff
+
+ mov ecx,[esp + 16 + 16]
+ shr ecx,1
+ mov [esp + 16 + 16],ecx ;ecx = filter pair count
+
+ xor ebx,ebx ;ebx = source offset
+
+ mov ecx,[esp + 16 + 16] ;ecx = filter width counter
+.pixelloop:
+ mov eax,[esp + 8 + 16] ;esi = row pointer table
+ movq mm6,[MMX_roundval]
+ movq mm7,mm6
+ pxor mm0,mm0
+ pxor mm1,mm1
+.coeffloop:
+ mov esi,[eax]
+ paddd mm6,mm0
+
+ movd mm0,dword [esi+ebx] ;mm0 = [0][0][0][0][x0][r0][g0][b0]
+ paddd mm7,mm1
+
+ mov esi,[eax+4]
+ add eax,8
+
+ movd mm1,dword [esi+ebx] ;mm1 = [0][0][0][0][x1][r1][g1][b1]
+ punpcklbw mm0,mm1 ;mm0 = [x0][x1][r0][r1][g0][g1][b0][b1]
+
+ movq mm1,mm0
+ punpcklbw mm0,mm5 ;mm0 = [g1][g0][b1][b0]
+
+ pmaddwd mm0,[edx]
+ punpckhbw mm1,mm5 ;mm1 = [x1][x0][r1][r0]
+
+ pmaddwd mm1,[edx]
+ add edx,8
+
+ sub ecx,1
+ jne .coeffloop
+
+ paddd mm6,mm0
+ paddd mm7,mm1
+
+ psrad mm6,14
+ psrad mm7,14
+ add edi,4
+ packssdw mm6,mm7
+ add ebx,4
+ packuswb mm6,mm6
+ sub ebp,1
+
+ mov ecx,[esp + 16 + 16] ;ecx = filter width counter
+ mov edx,[esp + 12 + 16] ;edx = filter bank pointer
+
+ movd dword [edi-4],mm6
+ jne .pixelloop
+
+.xit:
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+ ret
+
+
+
+.accel_4coeff:
+ movq mm2,[edx]
+ movq mm3,[edx+8]
+
+ mov esi,[esp+8+16] ;esi = row pointer table
+ mov eax,[esi]
+ add ebp,ebp
+ mov ebx,[esi+4]
+ add ebp,ebp
+ mov ecx,[esi+8]
+ mov esi,[esi+12]
+ add eax,ebp
+ add ebx,ebp
+ add ecx,ebp
+ add esi,ebp
+ add edi,ebp
+ neg ebp
+
+ ;EAX source 0
+ ;EBX source 1
+ ;ECX source 2
+ ;ESI source 3
+ ;EDI destination
+ ;EBP counter
+
+ movq mm4,[MMX_roundval]
+
+.pixelloop4:
+ movd mm6,dword [eax+ebp] ;mm0 = [0][0][0][0][x0][r0][g0][b0]
+
+ punpcklbw mm6,[ebx+ebp] ;mm0 = [x0][x1][r0][r1][g0][g1][b0][b1]
+
+ movq mm7,mm6
+ punpcklbw mm6,mm5 ;mm0 = [g1][g0][b1][b0]
+
+ pmaddwd mm6,mm2
+ punpckhbw mm7,mm5 ;mm1 = [x1][x0][r1][r0]
+
+ movd mm0,dword [ecx+ebp] ;mm0 = [0][0][0][0][x0][r0][g0][b0]
+ pmaddwd mm7,mm2
+
+ punpcklbw mm0,[esi+ebp] ;mm0 = [x0][x1][r0][r1][g0][g1][b0][b1]
+ paddd mm6,mm4
+
+ movq mm1,mm0
+ punpcklbw mm0,mm5 ;mm0 = [g1][g0][b1][b0]
+
+ pmaddwd mm0,mm3
+ punpckhbw mm1,mm5 ;mm1 = [x1][x0][r1][r0]
+
+ pmaddwd mm1,mm3
+ paddd mm7,mm4
+
+ paddd mm6,mm0
+ paddd mm7,mm1
+
+ psrad mm6,14
+ psrad mm7,14
+ packssdw mm6,mm7
+ packuswb mm6,mm6
+
+ movd dword [edi+ebp],mm6
+
+ add ebp,4
+ jne .pixelloop4
+ jmp .xit
+
+.accel_6coeff:
+ movq mm2,[edx]
+ movq mm3,[edx+8]
+ movq mm4,[edx+16]
+
+ ;borrow ESP for a sixth row pointer; park the real stack pointer in a
+ ;temporary SEH frame at [fs:0] so faults remain recoverable
+ push 0
+ push dword [fs:0]
+ mov dword [fs:0],esp
+
+ mov esp,[esp+8+24] ;esp = row pointer table
+ mov eax,[esp]
+ add ebp,ebp
+ mov ebx,[esp+4]
+ add ebp,ebp
+ mov ecx,[esp+8]
+ mov edx,[esp+12]
+ mov esi,[esp+16]
+ mov esp,[esp+20]
+ add eax,ebp
+ add ebx,ebp
+ add ecx,ebp
+ add edx,ebp
+ add esi,ebp
+ add edi,ebp
+ add esp,ebp
+ neg ebp
+
+ ;EAX source 0
+ ;EBX source 1
+ ;ECX source 2
+ ;EDX source 3
+ ;ESI source 4
+ ;EDI destination
+ ;ESP source 5
+ ;EBP counter
+
+.pixelloop6:
+ movd mm6,dword [eax+ebp] ;mm0 = [0][0][0][0][x0][r0][g0][b0]
+
+ punpcklbw mm6,[ebx+ebp] ;mm0 = [x0][x1][r0][r1][g0][g1][b0][b1]
+
+ movq mm7,mm6
+ punpcklbw mm6,mm5 ;mm0 = [g1][g0][b1][b0]
+
+ movd mm0,dword [ecx+ebp] ;mm0 = [0][0][0][0][x0][r0][g0][b0]
+ punpckhbw mm7,mm5 ;mm1 = [x1][x0][r1][r0]
+
+ punpcklbw mm0,[edx+ebp] ;mm0 = [x0][x1][r0][r1][g0][g1][b0][b1]
+ pmaddwd mm6,mm2
+
+ movq mm1,mm0
+ punpcklbw mm0,mm5 ;mm0 = [g1][g0][b1][b0]
+
+ pmaddwd mm7,mm2
+ punpckhbw mm1,mm5 ;mm1 = [x1][x0][r1][r0]
+
+ paddd mm6,[MMX_roundval]
+ pmaddwd mm0,mm3
+
+ paddd mm7,[MMX_roundval]
+ pmaddwd mm1,mm3
+
+ paddd mm6,mm0
+
+ movd mm0,dword [esi+ebp] ;mm0 = [0][0][0][0][x0][r0][g0][b0]
+ paddd mm7,mm1
+
+ punpcklbw mm0,[esp+ebp] ;mm0 = [x0][x1][r0][r1][g0][g1][b0][b1]
+ movq mm1,mm0
+ punpcklbw mm0,mm5 ;mm0 = [g1][g0][b1][b0]
+ punpckhbw mm1,mm5 ;mm1 = [x1][x0][r1][r0]
+ pmaddwd mm0,mm4
+ pmaddwd mm1,mm4
+ paddd mm6,mm0
+ paddd mm7,mm1
+
+ psrad mm6,14
+ psrad mm7,14
+ packssdw mm6,mm7
+ packuswb mm6,mm6
+
+ movd dword [edi+ebp],mm6
+
+ add ebp,4
+ jne .pixelloop6
+
+ mov esp, dword [fs:0] ;restore the real stack pointer
+ pop dword [fs:0]
+ pop eax
+
+ jmp .xit
+
+
+; SSE2 vertical table filter: same arguments as the MMX version
+; (out, in_table, filter, filter_width, w, frac). The generic path
+; does one pixel per iteration with qword coefficient broadcasts; the
+; 4- and 6-tap fast paths do two pixels per iteration with a
+; single-pixel tail. The 6-tap path borrows ESP for a sixth row
+; pointer via a temporary SEH frame at [fs:0].
+ global _vdasm_resize_table_col_SSE2
+_vdasm_resize_table_col_SSE2:
+ push ebp
+ push esi
+ push edi
+ push ebx
+
+ mov edx,[esp+12+16]
+ mov eax,[esp+24+16]
+ shl eax,2
+ imul eax,[esp+16+16]
+ add edx,eax
+ mov [esp+12+16], edx ;[esp+12+16] = filter pointer
+
+ mov ebp,[esp+20+16] ;ebp = pixel counter
+ mov edi,[esp+4+16] ;edi = destination pointer
+
+ pxor xmm7, xmm7
+ movdqa xmm6, [MMX_roundval]
+
+ cmp dword [esp+16+16], 4
+ jz .accel_4coeff
+ cmp dword [esp+16+16], 6
+ jz .accel_6coeff
+
+ mov ecx,[esp+16+16]
+ shr ecx,1
+ mov [esp+16+16],ecx ;ecx = filter pair count
+
+ xor ebx,ebx ;ebx = source offset
+
+ mov ecx,[esp+16+16] ;ecx = filter width counter
+.pixelloop:
+ mov eax, [esp+8+16] ;esi = row pointer table
+ movdqa xmm4, xmm6
+.coeffloop:
+ mov esi,[eax]
+
+ movd xmm0, dword [esi+ebx]
+
+ mov esi,[eax+4]
+ add eax,8
+
+ movd xmm1, dword [esi+ebx]
+ punpcklbw xmm0, xmm1
+
+ punpcklbw xmm0, xmm7
+
+ movq xmm2, qword [edx]
+ pshufd xmm2, xmm2, 01000100b ;broadcast the coefficient pair
+
+ pmaddwd xmm0, xmm2
+
+ paddd xmm4, xmm0
+
+ add edx,8
+
+ sub ecx,1
+ jne .coeffloop
+
+ psrad xmm4,14
+ add edi,4
+ packssdw xmm4,xmm4
+ add ebx,4
+ packuswb xmm4,xmm4
+ sub ebp,1
+
+ mov ecx,[esp+16+16] ;ecx = filter width counter
+ mov edx,[esp+12+16] ;edx = filter bank pointer
+
+ movd dword [edi-4],xmm4
+ jne .pixelloop
+
+.xit:
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+ ret
+
+.accel_4coeff:
+ shl ebp, 2
+ mov eax, [esp+8+16] ;eax = row pointer table
+ mov esi, [eax+12]
+ mov ecx, [eax+8]
+ mov ebx, [eax+4]
+ mov eax, [eax]
+ lea edi, [edi+ebp-4]
+ neg ebp
+
+ ;registers:
+ ;
+ ;EAX source 0
+ ;EBX source 1
+ ;ECX source 2
+ ;ESI source 3
+ ;EDI destination
+ ;EBP counter
+ ;
+ movq xmm4, qword [edx] ;xmm4 = coeff 0/1
+ movq xmm5, qword [edx+8] ;xmm5 = coeff 2/3
+ punpcklqdq xmm4, xmm4
+ punpcklqdq xmm5, xmm5
+
+ add ebp, 4
+ jz .oddpixel_4coeff
+
+.pixelloop_4coeff_dualpel:
+ movq xmm0, qword [eax]
+ movq xmm1, qword [ebx]
+ movq xmm2, qword [ecx]
+ movq xmm3, qword [esi]
+ add eax,8
+ add ebx,8
+ add ecx,8
+ add esi,8
+ punpcklbw xmm0, xmm1
+ punpcklbw xmm2, xmm3
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ punpcklbw xmm0, xmm7
+ punpckhbw xmm1, xmm7
+ punpcklbw xmm2, xmm7
+ punpckhbw xmm3, xmm7
+ pmaddwd xmm0, xmm4
+ pmaddwd xmm1, xmm4
+ pmaddwd xmm2, xmm5
+ pmaddwd xmm3, xmm5
+ paddd xmm0, xmm2
+ paddd xmm1, xmm3
+ paddd xmm0, xmm6
+ paddd xmm1, xmm6
+ psrad xmm0, 14
+ psrad xmm1, 14
+ packssdw xmm0, xmm1
+ packuswb xmm0, xmm0
+ movq qword [edi+ebp],xmm0
+ add ebp, 8
+ jae .pixelloop_4coeff_dualpel
+ jnz .xit
+
+.oddpixel_4coeff:
+ movd xmm0, dword [eax]
+ movd xmm1, dword [ebx]
+ movd xmm2, dword [ecx]
+ movd xmm3, dword [esi]
+ punpcklbw xmm0, xmm1
+ punpcklbw xmm2, xmm3
+ punpcklbw xmm0, xmm7
+ punpcklbw xmm2, xmm7
+ pmaddwd xmm0, xmm4
+ pmaddwd xmm2, xmm5
+ paddd xmm0, xmm2
+ paddd xmm0, xmm6
+ psrad xmm0, 14
+ packssdw xmm0, xmm0
+ packuswb xmm0, xmm0
+ movd dword [edi],xmm0
+ jmp .xit
+
+
+.accel_6coeff:
+ movq xmm4, qword [edx] ;xmm4 = coeff 0/1
+ movq xmm5, qword [edx+8] ;xmm5 = coeff 2/3
+ movq xmm6, qword [edx+16] ;xmm6 = coeff 4/5
+ punpcklqdq xmm4, xmm4
+ punpcklqdq xmm5, xmm5
+ punpcklqdq xmm6, xmm6
+
+ ;borrow ESP for a sixth row pointer; real stack pointer parked in a
+ ;temporary SEH frame at [fs:0]
+ push 0
+ push dword [fs:0]
+ mov dword [fs:0],esp
+
+ shl ebp, 2
+ mov eax, [esp+8+24] ;eax = row pointer table
+ mov esp, [eax+20]
+ mov esi, [eax+16]
+ mov edx, [eax+12]
+ mov ecx, [eax+8]
+ mov ebx, [eax+4]
+ mov eax, [eax]
+ lea edi, [edi+ebp-4]
+ neg ebp
+
+ ;registers:
+ ;
+ ;EAX source 0
+ ;EBX source 1
+ ;ECX source 2
+ ;EDX source 3
+ ;ESI source 4
+ ;EDI destination
+ ;ESP source 5
+ ;EBP counter
+ ;
+
+ add ebp, 4
+ jz .oddpixel_6coeff
+
+.pixelloop_6coeff_dualpel:
+ movq xmm0, qword [eax]
+ movq xmm1, qword [ebx]
+ movq xmm2, qword [ecx]
+ movq xmm3, qword [edx]
+ add eax,8
+ add ebx,8
+ add ecx,8
+ add edx,8
+ punpcklbw xmm0, xmm1
+ punpcklbw xmm2, xmm3
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ punpcklbw xmm0, xmm7
+ punpckhbw xmm1, xmm7
+ punpcklbw xmm2, xmm7
+ punpckhbw xmm3, xmm7
+ pmaddwd xmm0, xmm4
+ pmaddwd xmm1, xmm4
+ pmaddwd xmm2, xmm5
+ pmaddwd xmm3, xmm5
+ paddd xmm0, xmm2
+ paddd xmm1, xmm3
+
+ movq xmm2, qword [esi]
+ movq xmm3, qword [esp]
+ add esi, 8
+ add esp, 8
+ punpcklbw xmm2, xmm3
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm7
+ punpckhbw xmm3, xmm7
+ pmaddwd xmm2, xmm6
+ pmaddwd xmm3, xmm6
+ paddd xmm0, xmm2
+ paddd xmm1, xmm3
+ paddd xmm0, [MMX_roundval]
+ paddd xmm1, [MMX_roundval]
+ psrad xmm0, 14
+ psrad xmm1, 14
+ packssdw xmm0, xmm1
+ packuswb xmm0, xmm0
+ movq qword [edi+ebp],xmm0
+ add ebp, 8
+ jae .pixelloop_6coeff_dualpel
+ jnz .xit_6coeff
+
+.oddpixel_6coeff:
+ movd xmm0, dword [eax]
+ movd xmm1, dword [ebx]
+ movd xmm2, dword [ecx]
+ movd xmm3, dword [edx]
+ punpcklbw xmm0, xmm1
+ punpcklbw xmm2, xmm3
+ movd xmm1, dword [esi]
+ movd xmm3, dword [esp]
+ punpcklbw xmm0, xmm7
+ punpcklbw xmm2, xmm7
+ pmaddwd xmm0, xmm4
+ punpcklbw xmm1, xmm3
+ pmaddwd xmm2, xmm5
+ punpcklbw xmm1, xmm7
+ pmaddwd xmm1, xmm6
+ paddd xmm0, xmm2
+ paddd xmm1, [MMX_roundval]
+ paddd xmm0, xmm1
+ psrad xmm0, 14
+ packssdw xmm0, xmm0
+ packuswb xmm0, xmm0
+ movd dword [edi],xmm0
+
+.xit_6coeff:
+ mov esp, dword [fs:0] ;restore the real stack pointer
+ pop dword [fs:0]
+ pop eax
+ jmp .xit
+
+
+ end
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/a_resample_sse41.asm b/src/thirdparty/VirtualDub/Kasumi/source/a_resample_sse41.asm
new file mode 100644
index 000000000..cf7332cb2
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/a_resample_sse41.asm
@@ -0,0 +1,358 @@
+ segment .rdata, align=16
+
+round dq 0000000000002000h
+colround dq 0000200000002000h
+
+ segment .text
+
+ global _vdasm_resize_table_row_8_k8_4x_SSE41
+_vdasm_resize_table_row_8_k8_4x_SSE41:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ movq xmm6, [round]
+ pshufd xmm6, xmm6, 0
+
+ mov ebp, [esp + 4 + 16] ;ebp = dst
+ mov esi, [esp + 12 + 16] ;esi = width
+ mov edi, [esp + 16 + 16] ;edi = kernel
+.yloop:
+ ;eax = temp
+ ;ebx = temp
+ ;ecx = temp
+ ;edx = temp
+ ;esi = horiz counter
+ ;edi = filter list
+ ;ebp = destination
+
+ mov eax, [edi+0]
+ mov ebx, [edi+4]
+ mov ecx, [edi+8]
+ mov edx, [esp+8+16]
+ add eax, edx
+ add ebx, edx
+ add ecx, edx
+ add edx, [edi+12]
+
+ pmovzxbw xmm0, [eax]
+ pmaddwd xmm0, [edi+10h]
+ pmovzxbw xmm1, [ebx]
+ pmaddwd xmm1, [edi+20h]
+ pmovzxbw xmm2, [ecx]
+ pmaddwd xmm2, [edi+30h]
+ pmovzxbw xmm3, [edx]
+ pmaddwd xmm3, [edi+40h]
+ add edi, 50h
+ phaddd xmm0, xmm1
+ phaddd xmm2, xmm3
+ phaddd xmm0, xmm2
+ paddd xmm0, xmm6
+ psrad xmm0, 14
+ packssdw xmm0, xmm0
+ packuswb xmm0, xmm0
+ movd [ebp], xmm0
+
+ add ebp, 4
+ sub esi, 1
+ jne .yloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+ global _vdasm_resize_table_row_8_k16_4x_SSE41
+_vdasm_resize_table_row_8_k16_4x_SSE41:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ movq xmm6, [round]
+ pshufd xmm6, xmm6, 0
+
+ mov ebp, [esp + 4 + 16] ;ebp = dst
+ mov esi, [esp + 12 + 16] ;esi = width
+ mov edi, [esp + 16 + 16] ;edi = kernel
+.yloop:
+ ;eax = temp
+ ;ebx = temp
+ ;ecx = temp
+ ;edx = temp
+ ;esi = horiz counter
+ ;edi = filter list
+ ;ebp = destination
+
+ mov eax, [edi+0]
+ mov ebx, [edi+4]
+ mov ecx, [edi+8]
+ mov edx, [esp+8+16]
+ add eax, edx
+ add ebx, edx
+ add ecx, edx
+ add edx, [edi+12]
+
+ pmovzxbw xmm0, [eax]
+ pmaddwd xmm0, [edi+10h]
+ pmovzxbw xmm1, [ebx]
+ pmaddwd xmm1, [edi+20h]
+ pmovzxbw xmm2, [ecx]
+ pmaddwd xmm2, [edi+30h]
+ pmovzxbw xmm3, [edx]
+ pmaddwd xmm3, [edi+40h]
+ pmovzxbw xmm4, [eax+8]
+ pmaddwd xmm4, [edi+50h]
+ pmovzxbw xmm5, [ebx+8]
+ pmaddwd xmm5, [edi+60h]
+ paddd xmm0, xmm4
+ pmovzxbw xmm4, [ecx+8]
+ pmaddwd xmm4, [edi+70h]
+ paddd xmm1, xmm5
+ pmovzxbw xmm5, [edx+8]
+ pmaddwd xmm5, [edi+80h]
+ paddd xmm2, xmm4
+ paddd xmm3, xmm5
+ add edi, 90h
+ phaddd xmm0, xmm1
+ phaddd xmm2, xmm3
+ phaddd xmm0, xmm2
+ paddd xmm0, xmm6
+ psrad xmm0, 14
+ packssdw xmm0, xmm0
+ packuswb xmm0, xmm0
+ movd [ebp], xmm0
+
+ add ebp, 4
+ sub esi, 1
+ jne .yloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+ global _vdasm_resize_table_row_8_SSE41
+_vdasm_resize_table_row_8_SSE41:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ pxor xmm7, xmm7
+ movq xmm6, [round]
+
+ mov edi, [esp + 4 + 16] ;edi = dst
+ mov ebx, [esp + 8 + 16] ;ebx = src
+ mov ebp, [esp + 12 + 16] ;ebp = width
+ mov edx, [esp + 16 + 16] ;edx = kernel
+.yloop:
+ ;eax = temp
+ ;ebx = source base address
+ ;ecx = (temp) source
+ ;edx = filter list
+ ;esi = (temp) kernel width
+ ;edi = destination
+ ;ebp = horiz counter
+
+ mov eax, [edx]
+ add edx, 16
+ lea ecx, [ebx + eax]
+ mov esi, [esp + 20 + 16] ;esi = kernel width
+
+ movq xmm2, xmm6
+.xloop:
+ pmovzxbw xmm0, [ecx]
+ add ecx, 8
+ pmaddwd xmm0, [edx]
+ paddd xmm2, xmm0
+ add edx, 16
+ sub esi, 8
+ jne .xloop
+
+ phaddd xmm2, xmm2
+ phaddd xmm2, xmm2
+ psrad xmm2, 14
+ packssdw xmm2, xmm2
+ packuswb xmm2, xmm2
+ movd eax, xmm2
+ mov [edi], al
+ add edi, 1
+ sub ebp, 1
+ jne .yloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+
+ global _vdasm_resize_table_col_8_k2_SSE41
+_vdasm_resize_table_col_8_k2_SSE41:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ movq xmm6, [colround]
+ pshufd xmm6, xmm6, 0
+
+ mov esi, [esp + 4 + 16] ;esi = dst
+ mov edi, [esp + 16 + 16] ;edi = kernel
+ mov ebp, [esp + 12 + 16] ;ebp = width
+
+ movq xmm7, [edi]
+ pshufd xmm7, xmm7, 0
+
+ mov edx, [esp + 8 + 16] ;ebx = srcs
+ mov eax, [edx+0]
+ mov ebx, [edx+4]
+ add eax, ebp
+ add ebx, ebp
+ neg ebp
+
+.yloop:
+ ;eax = row0
+ ;ebx = row1
+ ;ecx =
+ ;edx =
+ ;edi = kernel
+ ;esi = dest
+ ;ebp = width counter
+
+ movd xmm0, [eax+ebp]
+ movd xmm2, [ebx+ebp]
+ punpcklbw xmm0, xmm2
+ pmovzxbw xmm0, xmm0
+ pmaddwd xmm0, xmm7
+
+ paddd xmm0, xmm6
+
+ psrad xmm0, 14
+ packssdw xmm0, xmm0
+ packuswb xmm0, xmm0
+ movd [esi], xmm0
+ add esi, 4
+ add ebp, 4
+ jnz .yloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+ global _vdasm_resize_table_col_8_k4_SSE41
+_vdasm_resize_table_col_8_k4_SSE41:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ movq xmm7, [colround]
+ pshufd xmm7, xmm7, 0
+
+ mov esi, [esp + 4 + 16] ;esi = dst
+ mov edi, [esp + 16 + 16] ;edi = kernel
+
+ movdqu xmm6, [edi]
+ pshufd xmm5, xmm6, 0
+ pshufd xmm6, xmm6, 0aah
+
+ mov edx, [esp + 8 + 16] ;ebx = srcs
+ mov ebp, [esp + 12 + 16]
+ mov eax, [edx+0]
+ mov ebx, [edx+4]
+ mov ecx, [edx+8]
+ mov edx, [edx+12]
+ lea eax, [eax+ebp-4]
+ lea ebx, [ebx+ebp-4]
+ lea ecx, [ecx+ebp-4]
+ lea edx, [edx+ebp-4]
+ lea esi, [esi+ebp-4]
+ neg ebp
+ add ebp,4
+ jz .odd
+.yloop:
+ ;eax = row0
+ ;ebx = row1
+ ;ecx = row2
+ ;edx = row3
+ ;edi = kernel
+ ;esi = dest
+ ;ebp = width counter
+
+ movd xmm0, [eax+ebp]
+ movd xmm1, [ebx+ebp]
+ punpcklbw xmm0, xmm1
+
+ movd xmm1, [ecx+ebp]
+ movd xmm2, [edx+ebp]
+ punpcklbw xmm1, xmm2
+
+ movd xmm2, [eax+ebp+4]
+ movd xmm3, [ebx+ebp+4]
+ punpcklbw xmm2, xmm3
+
+ movd xmm3, [ecx+ebp+4]
+ movd xmm4, [edx+ebp+4]
+ punpcklbw xmm3, xmm4
+
+ pmovzxbw xmm0, xmm0
+ pmaddwd xmm0, xmm5
+
+ pmovzxbw xmm1, xmm1
+ pmaddwd xmm1, xmm6
+
+ pmovzxbw xmm2, xmm2
+ pmaddwd xmm2, xmm5
+
+ pmovzxbw xmm3, xmm3
+ pmaddwd xmm3, xmm6
+
+ paddd xmm0, xmm1
+ paddd xmm2, xmm3
+
+ paddd xmm0, xmm7
+ paddd xmm2, xmm7
+
+ psrad xmm0, 14
+ psrad xmm2, 14
+
+ packssdw xmm0, xmm2
+ packuswb xmm0, xmm0
+ movq [esi+ebp], xmm0
+ add ebp, 8
+ js .yloop
+ jnz .noodd
+
+.odd:
+ movd xmm0, [eax]
+ movd xmm1, [ebx]
+ movd xmm2, [ecx]
+ movd xmm3, [edx]
+ punpcklbw xmm0, xmm1
+ punpcklbw xmm2, xmm3
+ pmovzxbw xmm0, xmm0
+ pmovzxbw xmm2, xmm2
+ pmaddwd xmm0, xmm5
+ pmaddwd xmm2, xmm6
+ paddd xmm0, xmm2
+ paddd xmm0, xmm7
+ psrad xmm0, 14
+ packssdw xmm0, xmm0
+ packuswb xmm0, xmm0
+ movd [esi], xmm0
+.noodd:
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+ end
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/a_spanutils_isse.asm b/src/thirdparty/VirtualDub/Kasumi/source/a_spanutils_isse.asm
new file mode 100644
index 000000000..3fe7cedbc
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/a_spanutils_isse.asm
@@ -0,0 +1,193 @@
+ section .rdata, rdata, align=16
+
+xfefefefefefefefe dq 0fefefefefefefefeh
+xe0e0e0e0e0e0e0e0 dq 0e0e0e0e0e0e0e0e0h
+x0002000200020002 dq 00002000200020002h
+
+ section .text
+
+;==============================================================================
+ global _vdasm_horiz_expand2x_coaligned_ISSE
+_vdasm_horiz_expand2x_coaligned_ISSE:
+ mov ecx, [esp+8]
+ mov edx, [esp+4]
+ mov eax, [esp+12]
+.xloop:
+ movq mm0, [ecx]
+ movq mm1, mm0
+ pavgb mm0, [ecx+1]
+ movq mm2, mm1
+ punpcklbw mm1, mm0
+ punpckhbw mm2, mm0
+
+ movq [edx], mm1
+ movq [edx+8], mm2
+ add edx, 16
+ add ecx, 8
+
+ sub eax, 16
+ jne .xloop
+ ret
+
+;==============================================================================
+ global _vdasm_vert_average_13_ISSE
+_vdasm_vert_average_13_ISSE:
+ push ebx
+ mov ebx, [esp+12+4]
+ mov ecx, [esp+8+4]
+ mov edx, [esp+4+4]
+ mov eax, [esp+16+4]
+
+ add ebx, eax
+ add ecx, eax
+ add edx, eax
+ neg eax
+
+ pcmpeqb mm7, mm7
+.xloop:
+ movq mm0, [ebx+eax]
+ movq mm1, [ecx+eax]
+ movq mm2, mm0
+
+ movq mm3, [ebx+eax+8]
+ pxor mm0, mm7
+ pxor mm1, mm7
+
+ movq mm4, [ecx+eax+8]
+ movq mm5, mm3
+ pxor mm3, mm7
+
+ pxor mm4, mm7
+ pavgb mm0, mm1
+ pavgb mm3, mm4
+
+ pxor mm0, mm7
+ pxor mm3, mm7
+ pavgb mm0, mm2
+
+ movq [edx+eax], mm0
+ pavgb mm3, mm5
+
+ movq [edx+eax+8], mm3
+ add eax, 16
+ jne .xloop
+
+ pop ebx
+ ret
+
+;==============================================================================
+ global _vdasm_vert_average_17_ISSE
+_vdasm_vert_average_17_ISSE:
+ push ebx
+ mov ebx, [esp+12+4]
+ mov ecx, [esp+8+4]
+ mov edx, [esp+4+4]
+ mov eax, [esp+16+4]
+
+ add ebx, eax
+ add ecx, eax
+ add edx, eax
+ neg eax
+
+ ;r = avgup(avgdown(avgdown(a, b), a), a)
+ ; = pavgb(~pavgb(pavgb(~a, ~b), ~a), a)
+
+ pcmpeqb mm7, mm7
+.xloop:
+ movq mm0, [ecx+eax]
+ movq mm1, [ebx+eax]
+ movq mm2, mm0
+ pxor mm0, mm7 ;~a
+ pxor mm1, mm7 ;~b
+ pavgb mm1, mm0 ;pavgb(~a, ~b) = ~avgdown(a, b)
+ pavgb mm1, mm0 ;pavgb(~avgdown(a, b), ~a) = ~avgdown(avgdown(a, b), a)
+ pxor mm1, mm7 ;avgdown(avgdown(a, b), a)
+ pavgb mm1, mm2 ;pavgb(avgdown(avgdown(a, b), a), a) = round((7*a + b)/8)
+ movq [edx+eax], mm1
+
+ add eax, 8
+ jne .xloop
+
+ pop ebx
+ ret
+
+;==============================================================================
+ global _vdasm_vert_average_35_ISSE
+_vdasm_vert_average_35_ISSE:
+ push ebx
+ mov ebx, [esp+12+4]
+ mov ecx, [esp+8+4]
+ mov edx, [esp+4+4]
+ mov eax, [esp+16+4]
+
+ add ebx, eax
+ add ecx, eax
+ add edx, eax
+ neg eax
+
+ ;r = avgup(avgdown(avgdown(a, b), b), a)
+ ; = pavgb(~pavgb(pavgb(~a, ~b), ~b), a)
+
+ pcmpeqb mm7, mm7
+.xloop:
+ movq mm0, [ecx+eax]
+ movq mm1, [ebx+eax]
+ movq mm2, mm0
+ pxor mm0, mm7 ;~a
+ pxor mm1, mm7 ;~b
+ pavgb mm0, mm1 ;avgup(~a, ~b) = ~avgdown(a, b)
+ pavgb mm0, mm1 ;avgup(~avgdown(a, b), ~b) = ~avgdown(avgdown(a, b), b)
+ pxor mm0, mm7 ;avgdown(avgdown(a, b), b)
+ pavgb mm0, mm2 ;avgup(avgdown(avgdown(a, b), b), a) = round((5*a + 3*b) / 8)
+ movq [edx+eax], mm0
+
+ add eax, 8
+ jne .xloop
+
+ pop ebx
+ ret
+
+;==============================================================================
+ global _vdasm_horiz_expand4x_coaligned_MMX
+_vdasm_horiz_expand4x_coaligned_MMX:
+ mov edx, [esp+4]
+ mov ecx, [esp+8]
+ mov eax, [esp+12]
+ movq mm6, qword [x0002000200020002]
+ pxor mm7, mm7
+.xloop:
+ movd mm0, [ecx]
+ movd mm1, [ecx+1]
+ add ecx, 4
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm1, mm0 ;x1
+ movq mm2, mm1
+ paddw mm1, mm6 ;x1 + 2
+ movq mm3, mm1
+ paddw mm2, mm2 ;x2
+ paddw mm3, mm2 ;x3 + 2
+ paddw mm2, mm6 ;x2 + 2
+ psraw mm1, 2 ;x1/4
+ psraw mm2, 2 ;x2/4
+ psraw mm3, 2 ;x3/4
+ paddw mm1, mm0
+ paddw mm2, mm0
+ paddw mm3, mm0
+ movd mm0, [ecx-4]
+ packuswb mm1, mm1
+ packuswb mm2, mm2
+ packuswb mm3, mm3
+ punpcklbw mm0, mm1
+ punpcklbw mm2, mm3
+ movq mm1, mm0
+ punpcklwd mm0, mm2
+ punpckhwd mm1, mm2
+
+ movq [edx], mm0
+ movq [edx+8], mm1
+ add edx, 16
+ sub eax, 1
+ jne .xloop
+
+ ret
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/a_stretchrgb_mmx.asm b/src/thirdparty/VirtualDub/Kasumi/source/a_stretchrgb_mmx.asm
new file mode 100644
index 000000000..3db442fa2
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/a_stretchrgb_mmx.asm
@@ -0,0 +1,326 @@
+ segment .rdata, align=16
+
+x0020w dq 00020002000200020h
+rb_mask_555 dq 07c1f7c1f7c1f7c1fh
+g_mask_555 dq 003e003e003e003e0h
+rb_mask_888 dq 000ff00ff00ff00ffh
+g_mask_888 dq 00000ff000000ff00h
+
+ segment .text
+
+ struc VDPixmapReferenceStretchBltBilinearParameters
+.dst resd 1
+.src resd 1
+.u resd 1
+.uinc resd 1
+.dudx resd 1
+
+.xprepos resd 1
+.xpostpos resd 1
+.xprecopy resd 1
+.xpostcopy resd 1
+.xmidsize resd 1
+ endstruc
+
+
+
+ global _vdasm_stretchbltV_XRGB1555_to_XRGB1555_MMX
+_vdasm_stretchbltV_XRGB1555_to_XRGB1555_MMX:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov eax, [esp+20+16]
+ and eax, 0f8000000h
+ mov ebx, [esp+8+16]
+ mov ecx, [esp+12+16]
+ jz .noreverse
+ xchg ebx, ecx
+ js .noreverse
+ neg eax
+ xchg ebx, ecx
+.noreverse:
+ shr eax, 16
+ mov [esp+20+16], eax
+ mov edx, [esp+4+16]
+ mov eax, [esp+16+16]
+ add eax, eax
+ lea ebx, [ebx+eax-6]
+ lea ecx, [ecx+eax-6]
+ lea edx, [edx+eax-6]
+ neg eax
+
+ movd mm4, dword [esp+20+16]
+ punpcklwd mm4, mm4
+ punpckldq mm4, mm4
+
+ movq mm6, [rb_mask_555]
+ movq mm7, [g_mask_555]
+
+.xstart:
+ add eax, 6
+ jbe .doodd
+.xloop:
+ movq mm0, [ebx+eax]
+ movq mm1, [ecx+eax]
+ movq mm2, mm7
+ movq mm3, mm7
+
+ pand mm2, mm0
+ pand mm3, mm1
+ pand mm0, mm6
+ pand mm1, mm6
+
+ psubw mm3, mm2
+ psubw mm1, mm0
+
+ pmulhw mm3, mm4
+ pmulhw mm1, mm4
+
+ psubw mm0, mm1
+ psubw mm2, mm3
+
+ pand mm0, mm6
+ pand mm2, mm7
+
+ paddw mm0, mm2
+
+ movq [edx+eax], mm0
+ add eax, 8
+ jnc .xloop
+
+.doodd:
+ sub eax, 6
+ jz .noodd
+.odd:
+ movzx esi, word [ebx+eax+6]
+ movd mm0, esi
+ movzx esi, word [ecx+eax+6]
+ movd mm1, esi
+ movq mm2, mm7
+ movq mm3, mm7
+
+ pand mm2, mm0
+ pand mm3, mm1
+ pand mm0, mm6
+ pand mm1, mm6
+
+ psubw mm3, mm2
+ psubw mm1, mm0
+
+ pmulhw mm3, mm4
+ pmulhw mm1, mm4
+
+ psubw mm0, mm1
+ psubw mm2, mm3
+
+ pand mm0, mm6
+ pand mm2, mm7
+
+ paddw mm0, mm2
+
+ movd esi, mm0
+ mov [edx+eax+6], si
+ add eax,2
+ jne .odd
+
+.noodd:
+ emms
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+
+ global _vdasm_stretchbltH_XRGB8888_to_XRGB8888_MMX
+_vdasm_stretchbltH_XRGB8888_to_XRGB8888_MMX:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov edx, [esp+4+16]
+
+ mov ebx, [edx+VDPixmapReferenceStretchBltBilinearParameters.src]
+ mov edi, [edx+VDPixmapReferenceStretchBltBilinearParameters.dst]
+
+ mov ecx, [edx+VDPixmapReferenceStretchBltBilinearParameters.xprecopy]
+ or ecx, ecx
+ jz .noprecopy
+ mov eax, [edx+VDPixmapReferenceStretchBltBilinearParameters.xprepos]
+ mov eax, [ebx+eax]
+ lea ebp, [ecx*4]
+ sub edi, ebp
+ rep stosd
+.noprecopy:
+ mov ebp, [edx+VDPixmapReferenceStretchBltBilinearParameters.xmidsize]
+ add ebp, ebp
+ add ebp, ebp
+ add edi, ebp
+ neg ebp
+
+ mov esi, [edx+VDPixmapReferenceStretchBltBilinearParameters.u]
+ mov eax, [edx+VDPixmapReferenceStretchBltBilinearParameters.dudx]
+ mov edx, [edx+VDPixmapReferenceStretchBltBilinearParameters.uinc]
+ movd mm2, esi
+ movd mm3, eax
+ shr ebx, 2
+
+ movq mm5, mm2
+ punpcklwd mm5, mm5
+ punpckhdq mm5, mm5
+ movq mm4, mm5
+ psraw mm4, 15
+
+.xloop:
+ movd mm0, dword [ebx*4]
+ pxor mm7, mm7
+ movd mm1, dword [ebx*4+4]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm1, mm0
+ pand mm4, mm1
+ pmulhw mm1, mm5
+ paddw mm1, mm4
+ paddw mm0, mm1
+ packuswb mm0, mm0
+ movd dword [edi+ebp], mm0
+
+ add esi, eax
+ adc ebx, edx
+
+ paddd mm2, mm3
+ movq mm5, mm2
+ punpcklwd mm5, mm5
+ punpckhdq mm5, mm5
+ movq mm4, mm5
+ psraw mm4, 15
+ add ebp, 4
+ jnz .xloop
+
+ mov edx, [esp+4+16]
+ mov ecx, [edx+VDPixmapReferenceStretchBltBilinearParameters.xpostcopy]
+ or ecx, ecx
+ jz .nopostcopy
+ mov eax, [edx+VDPixmapReferenceStretchBltBilinearParameters.xpostpos]
+ add eax, [edx+VDPixmapReferenceStretchBltBilinearParameters.src]
+ mov eax, [eax]
+ rep stosd
+.nopostcopy:
+
+ emms
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+ global _vdasm_stretchbltV_XRGB8888_to_XRGB8888_MMX
+_vdasm_stretchbltV_XRGB8888_to_XRGB8888_MMX:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov eax, [esp+20+16]
+ and eax, 0ff000000h
+ mov ebx, [esp+8+16]
+ mov ecx, [esp+12+16]
+ jz .noreverse
+ xchg ebx, ecx
+ js .noreverse
+ neg eax
+ xchg ebx, ecx
+.noreverse:
+ shr eax, 16
+ mov [esp+20+16], eax
+ mov edx, [esp+4+16]
+ mov eax, [esp+16+16]
+ add eax, eax
+ add eax, eax
+ lea ebx, [ebx+eax-4]
+ lea ecx, [ecx+eax-4]
+ lea edx, [edx+eax-4]
+ neg eax
+
+ movd mm4, dword [esp+20+16]
+ punpcklwd mm4, mm4
+ punpckldq mm4, mm4
+
+ movq mm6, [rb_mask_888]
+ movq mm7, [g_mask_888]
+
+.xstart:
+ add eax, 4
+ jbe .doodd
+.xloop:
+ movq mm0, [ebx+eax]
+ movq mm1, [ecx+eax]
+ movq mm2, mm0
+ movq mm3, mm1
+ psrlw mm2, 8
+ psrlw mm3, 8
+ pand mm0, mm6
+ pand mm1, mm6
+
+ psubw mm3, mm2
+ psubw mm1, mm0
+
+ pmulhw mm3, mm4
+ pmulhw mm1, mm4
+
+ psubw mm0, mm1
+ psubw mm2, mm3
+
+ pand mm0, mm6
+
+ psllw mm2, 8
+
+ paddw mm0, mm2
+
+ movq qword [edx+eax], mm0
+ add eax, 8
+ jnc .xloop
+
+.doodd:
+ sub eax, 4
+ jz .noodd
+.odd:
+ movd mm0, dword [ebx]
+ movd mm1, dword [ecx]
+ movq mm2, mm0
+ movq mm3, mm1
+ psrlw mm2, 8
+ psrlw mm3, 8
+ pand mm0, mm6
+ pand mm1, mm6
+
+ psubw mm3, mm2
+ psubw mm1, mm0
+
+ pmulhw mm3, mm4
+ pmulhw mm1, mm4
+
+ psubw mm0, mm1
+ psubw mm2, mm3
+
+ pand mm0, mm6
+
+ psllw mm2, 8
+
+ paddw mm0, mm2
+
+ movd dword [edx], mm0
+
+.noodd:
+ emms
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+
+ end
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/a_stretchrgb_point.asm b/src/thirdparty/VirtualDub/Kasumi/source/a_stretchrgb_point.asm
new file mode 100644
index 000000000..dca765b92
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/a_stretchrgb_point.asm
@@ -0,0 +1,96 @@
+ segment .text
+
+ struc scaleinfo
+.dst resd 1
+.src resd 1
+.xaccum resd 1
+.xfracinc resd 1
+.xintinc resd 1
+.count resd 1
+ endstruc
+
+ global _vdasm_resize_point32
+_vdasm_resize_point32:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov eax, [esp+4+16]
+
+ mov ebx, [eax+scaleinfo.xaccum]
+ mov ecx, [eax+scaleinfo.xfracinc]
+ mov edx, [eax+scaleinfo.src]
+ mov esi, [eax+scaleinfo.xintinc]
+ mov edi, [eax+scaleinfo.dst]
+ mov ebp, [eax+scaleinfo.count]
+.xloop:
+ mov eax,[edx*4]
+ add ebx,ecx
+ adc edx,esi
+ mov [edi+ebp],eax
+ add ebp,4
+ jne .xloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+ global _vdasm_resize_point32_MMX
+_vdasm_resize_point32_MMX:
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ mov eax, [esp+4+16]
+
+ push 0
+ push dword [fs:0]
+ mov dword [fs:0], esp
+
+ mov ebx, [eax+scaleinfo.xaccum]
+ mov esp, [eax+scaleinfo.xfracinc]
+ mov edx, [eax+scaleinfo.src]
+ mov esi, [eax+scaleinfo.xintinc]
+ mov edi, [eax+scaleinfo.dst]
+ mov ebp, [eax+scaleinfo.count]
+
+ mov eax, ebx
+ mov ecx, edx
+ add ebx, esp
+ adc edx, esi
+ add esp, esp
+ adc esi, esi
+
+ add ebp, 4
+ jz .odd
+.dualloop:
+ movd mm0, dword [ecx*4]
+ punpckldq mm0,[edx*4]
+ add eax,esp
+ adc ecx,esi
+ add ebx,esp
+ adc edx,esi
+ movq [edi+ebp-4],mm0
+
+ add ebp,8
+ jnc .dualloop
+ jnz .noodd
+.odd:
+ mov eax, [ecx*4]
+ mov [edi-4], eax
+.noodd:
+ mov esp, dword [fs:0]
+ pop eax
+ pop eax
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+ end
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/a_triblt.inc b/src/thirdparty/VirtualDub/Kasumi/source/a_triblt.inc
new file mode 100644
index 000000000..fb969c56f
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/a_triblt.inc
@@ -0,0 +1,24 @@
+ struc span
+.u resd 1
+.v resd 1
+ endstruc
+
+ struc mipspan
+.u resd 1
+.v resd 1
+.lambda resd 1
+ endstruc
+
+ struc mipmap
+.bits resd 1
+.pitch resd 1
+.uvmul resd 1
+ resd 1
+ endstruc
+
+ struc texinfo
+.mips resd 16*4
+.dst resd 1
+.src resd 1
+.w resd 1
+ endstruc
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/a_triblt_mmx.asm b/src/thirdparty/VirtualDub/Kasumi/source/a_triblt_mmx.asm
new file mode 100644
index 000000000..3836488aa
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/a_triblt_mmx.asm
@@ -0,0 +1,425 @@
+ segment .rdata, align=16
+
+correct dq 0000800000008000h
+round dq 0000200000002000h
+round1 dq 0000020000000200h
+round2 dq 0002000000020000h
+
+ segment .text
+
+ %include "a_triblt.inc"
+
+ extern _kVDCubicInterpTableFX14_075_MMX
+
+;--------------------------------------------------------------------------
+ global _vdasm_triblt_span_bilinear_mmx
+_vdasm_triblt_span_bilinear_mmx:
+ push ebp
+ push edi
+ push esi
+ push ebx
+ mov edi,[esp+4+16]
+ mov edx,[edi+texinfo.dst]
+ mov ebp,[edi+texinfo.w]
+ shl ebp,2
+ mov ebx,[edi+texinfo.mips+mipmap.bits]
+ add edx,ebp
+ mov esi,[edi+texinfo.mips+mipmap.pitch]
+ neg ebp
+ movd mm6,[edi+texinfo.mips+mipmap.uvmul]
+ pxor mm7,mm7
+ mov edi,[edi+texinfo.src]
+.xloop:
+ movq mm4,[edi]
+ movq mm0,mm4
+ psrld mm0,16
+ movq mm5,mm4
+ packssdw mm0,mm0
+ pmaddwd mm0,mm6
+ add edi,8
+ punpcklwd mm4,mm4
+ punpckldq mm4,mm4
+ movd ecx,mm0
+ add ecx,ebx
+ psrlw mm4,1
+ movd mm0,dword [ecx]
+ movd mm1,dword [ecx+4]
+ punpcklbw mm0,mm7
+ movd mm2,dword [ecx+esi]
+ punpcklbw mm1,mm7
+ movd mm3,dword [ecx+esi+4]
+ punpcklbw mm2,mm7
+ punpcklbw mm3,mm7
+ psubw mm1,mm0
+ psubw mm3,mm2
+ paddw mm1,mm1
+ paddw mm3,mm3
+ pmulhw mm1,mm4
+ pmulhw mm3,mm4
+ punpckhwd mm5,mm5
+ punpckldq mm5,mm5
+ paddw mm0,mm1
+ psrlw mm5,1
+ paddw mm2,mm3
+ psubw mm2,mm0
+ paddw mm2,mm2
+ pmulhw mm2,mm5
+ paddw mm0,mm2
+ packuswb mm0,mm0
+ movd dword [edx+ebp],mm0
+ add ebp,4
+ jnc .xloop
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ emms
+ ret
+
+;--------------------------------------------------------------------------
+ global _vdasm_triblt_span_trilinear_mmx
+_vdasm_triblt_span_trilinear_mmx:
+ push ebp
+ push edi
+ push esi
+ push ebx
+ mov esi,[esp+4+16]
+ mov edx,[esi+texinfo.dst]
+ mov ebp,[esi+texinfo.w]
+ shl ebp,2
+ add edx,ebp
+ neg ebp
+ mov edi,[esi+texinfo.src]
+ pxor mm7,mm7
+.xloop:
+ movd mm6,[edi+mipspan.u]
+ punpckldq mm6,[edi+mipspan.v]
+ mov eax,[edi+mipspan.lambda]
+ shr eax,4
+ and eax,byte -16
+ movd mm2,eax
+ psrlq mm2,4
+ psrld mm6,mm2
+ paddd mm6,[correct]
+
+ ;fetch mipmap 1
+ mov ebx,[esi+eax+mipmap.pitch]
+ movd mm1,[esi+eax+mipmap.uvmul]
+ movq mm4,mm6
+ movq mm0,mm6
+ psrld mm0,16
+ packssdw mm0,mm0
+ pmaddwd mm0,mm1
+ movq mm5,mm4
+ punpcklwd mm4,mm4
+ punpckldq mm4,mm4
+ punpckhwd mm5,mm5
+ punpckldq mm5,mm5
+ movd ecx,mm0
+ add ecx,[esi+eax+mipmap.bits]
+ psrlw mm4,1
+ movd mm0,dword [ecx]
+ movd mm1,dword [ecx+4]
+ punpcklbw mm0,mm7
+ movd mm2,dword [ecx+ebx]
+ punpcklbw mm1,mm7
+ movd mm3,dword [ecx+ebx+4]
+ punpcklbw mm2,mm7
+ punpcklbw mm3,mm7
+ psubw mm1,mm0
+ psubw mm3,mm2
+ paddw mm1,mm1
+ paddw mm3,mm3
+ pmulhw mm1,mm4
+ pmulhw mm3,mm4
+ paddw mm0,mm1
+ psrlw mm5,1
+ paddw mm2,mm3
+ psubw mm2,mm0
+ paddw mm2,mm2
+ pmulhw mm2,mm5
+ paddw mm0,mm2
+
+ ;fetch mipmap 2
+ mov ebx,[esi+eax+16+mipmap.pitch]
+ movd mm1,[esi+eax+16+mipmap.uvmul]
+ paddd mm6,[correct]
+ psrld mm6,1
+ movq mm4,mm6
+ psrld mm6,16
+ packssdw mm6,mm6
+ pmaddwd mm6,mm1
+ movq mm5,mm4
+ punpcklwd mm4,mm4
+ punpckldq mm4,mm4
+ punpckhwd mm5,mm5
+ punpckldq mm5,mm5
+ movd ecx,mm6
+ add ecx,[esi+eax+16+mipmap.bits]
+ psrlw mm4,1
+ movd mm6,dword [ecx]
+ movd mm1,dword [ecx+4]
+ punpcklbw mm6,mm7
+ movd mm2,dword [ecx+ebx]
+ punpcklbw mm1,mm7
+ movd mm3,dword [ecx+ebx+4]
+ punpcklbw mm2,mm7
+ punpcklbw mm3,mm7
+ psubw mm1,mm6
+ psubw mm3,mm2
+ paddw mm1,mm1
+ paddw mm3,mm3
+ pmulhw mm1,mm4
+ pmulhw mm3,mm4
+ paddw mm6,mm1
+ psrlw mm5,1
+ paddw mm2,mm3
+ psubw mm2,mm6
+ paddw mm2,mm2
+ pmulhw mm2,mm5
+ paddw mm6,mm2
+
+ ;blend mips
+ movd mm1,[edi+mipspan.lambda]
+ punpcklwd mm1,mm1
+ punpckldq mm1,mm1
+ psllw mm1,8
+ psrlq mm1,1
+ psubw mm6,mm0
+ paddw mm6,mm6
+ pmulhw mm6,mm1
+ paddw mm0,mm6
+ packuswb mm0,mm0
+
+ movd dword [edx+ebp],mm0
+ add edi, mipspan_size
+ add ebp,4
+ jnc .xloop
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ emms
+ ret
+
+;--------------------------------------------------------------------------
+%macro .SETUPADDR 1
+ ;compute mipmap index and UV
+ movd mm0, [edi + mipspan.u]
+ punpckldq mm0, [edi + mipspan.v]
+ mov ebx, [edi + mipspan.lambda]
+ shr ebx, 4
+ and ebx, byte -16
+
+ add ebx, mipmap_size*%1
+ movd mm2, ebx
+ add ebx, [esp + .af_mipbase]
+ psrlq mm2, 4
+ psrad mm0, mm2
+ paddd mm0, [correct]
+ movq mm1, mm0
+ psrlq mm1, 32
+
+ ;compute horizontal filters
+ movd ecx, mm0
+ shr ecx, 4
+ and ecx, 0ff0h
+ add ecx, _kVDCubicInterpTableFX14_075_MMX
+
+ ;compute vertical filter
+ movd edx, mm1
+ and edx, 0ff00h
+ shr edx, 4
+ add edx, _kVDCubicInterpTableFX14_075_MMX
+
+ ;compute texel address
+ movd mm1, [ebx + mipmap.uvmul]
+ psrld mm0, 16
+ packssdw mm0, mm0
+ pmaddwd mm0, mm1
+ movd eax, mm0
+ add eax, [ebx + mipmap.bits]
+%endmacro
+
+%macro .HCUBIC 4
+ movd %1, dword [eax]
+ punpcklbw %1, qword [eax+4]
+ movd %3, dword [eax+8]
+ punpcklbw %3, qword [eax+12]
+ movq %2, %1
+ movq %4, %3
+ punpcklbw %1, mm7
+ pmaddwd %1, [ecx]
+ punpcklbw %3, mm7
+ pmaddwd %3, [ecx+8]
+ punpckhbw %2, mm7
+ pmaddwd %2, [ecx]
+ punpckhbw %4, mm7
+ pmaddwd %4, [ecx+8]
+ paddd %1, %3
+ paddd %2, %4
+%endmacro
+
+%macro .VCUBIC 1
+ .HCUBIC mm0, mm1, mm2, mm3
+ add eax, %1
+
+ .HCUBIC mm4, mm5, mm2, mm3
+ add eax, %1
+
+ movq mm2, [round1]
+
+ paddd mm0, mm2
+ paddd mm1, mm2
+ paddd mm4, mm2
+ paddd mm5, mm2
+
+ psrad mm0, 10
+ psrad mm1, 10
+ psrad mm4, 10
+ psrad mm5, 10
+
+ packssdw mm0, mm0
+ packssdw mm1, mm1
+ packssdw mm4, mm4
+ packssdw mm5, mm5
+
+ punpcklwd mm0, mm4
+ punpcklwd mm1, mm5
+
+ movq mm3, [edx]
+
+ pmaddwd mm0, mm3
+ pmaddwd mm1, mm3
+
+ movq [esp + .af_htemp0], mm0
+ movq [esp + .af_htemp1], mm1
+
+ .HCUBIC mm0, mm1, mm2, mm3
+ add eax, %1
+ .HCUBIC mm4, mm5, mm2, mm3
+
+ movq mm2, [round1]
+
+ paddd mm0, mm2
+ paddd mm1, mm2
+ paddd mm4, mm2
+ paddd mm5, mm2
+
+ psrad mm0, 10
+ psrad mm1, 10
+ psrad mm4, 10
+ psrad mm5, 10
+
+ packssdw mm0, mm0
+ packssdw mm1, mm1
+ packssdw mm4, mm4
+ packssdw mm5, mm5
+
+ punpcklwd mm0, mm4
+ punpcklwd mm1, mm5
+
+ movq mm2, [round2]
+ movq mm3, [edx + 8]
+
+ pmaddwd mm0, mm3
+ pmaddwd mm1, mm3
+
+ paddd mm0, [esp + .af_htemp0]
+ paddd mm1, [esp + .af_htemp1]
+
+ paddd mm0, mm2
+ paddd mm1, mm2
+
+ psrad mm0, 18
+ psrad mm1, 18
+ packssdw mm0, mm1
+%endmacro
+
+ global _vdasm_triblt_span_bicubic_mip_linear_mmx
+_vdasm_triblt_span_bicubic_mip_linear_mmx:
+
+;parameters
+%define .p_texinfo 20
+
+;aligned frame
+%define .af_htemp0 0
+%define .af_htemp1 8
+%define .af_vtemp0 16
+%define .af_mipbase 24
+%define .af_prevesp 28
+%define .afsize 32
+
+ push ebp
+ lea ebp, [esp-12]
+ push edi
+ push esi
+ push ebx
+
+ sub esp, .afsize
+ and esp, -8
+
+ mov [esp + .af_prevesp], ebp
+
+ mov ebx, [ebp + .p_texinfo]
+ mov ebp, [ebx + texinfo.dst]
+ mov esi, [ebx + texinfo.w]
+ shl esi, 2
+ add ebp,esi
+ neg esi
+
+ mov edi, [ebx + texinfo.src]
+ mov [esp + .af_mipbase], ebx
+ pxor mm7, mm7
+
+.xloop:
+
+ ;registers:
+ ; eax base texel address
+ ; ebx first mip info
+ ; ecx horizontal filter
+ ; edx vertical filter
+ ; esi horizontal count
+ ; edi mipspan
+ ; ebp destination
+
+ ;fetch mipmap 1
+ .SETUPADDR 0
+ .VCUBIC [ebx+mipmap.pitch]
+
+ movq [esp + .af_vtemp0], mm0
+
+ ;fetch mipmap 2
+ .SETUPADDR 1
+ .VCUBIC [ebx+mipmap.pitch]
+
+ ;blend mips
+ movq mm1, [esp + .af_vtemp0]
+
+ psubw mm0, mm1
+
+ movd mm3,[edi+mipspan.lambda]
+ punpcklwd mm3,mm3
+ punpckldq mm3,mm3
+ psllw mm3,8
+ psrlq mm3,1
+
+ paddw mm0,mm0
+ pmulhw mm0,mm3
+ paddw mm0,mm1
+ packuswb mm0,mm0
+
+ movd dword [ebp+esi],mm0
+ add edi, mipspan_size
+ add esi,4
+ jnc .xloop
+
+ mov esp, [esp + .af_prevesp]
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ emms
+ ret
+
+ end
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/a_triblt_scalar.asm b/src/thirdparty/VirtualDub/Kasumi/source/a_triblt_scalar.asm
new file mode 100644
index 000000000..c550634f3
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/a_triblt_scalar.asm
@@ -0,0 +1,36 @@
+ segment .text
+
+ %include "a_triblt.inc"
+
+ global _vdasm_triblt_span_point
+_vdasm_triblt_span_point:
+ push ebp
+ push edi
+ push esi
+ push ebx
+ mov eax,[esp+4+16]
+ mov ebp,[eax+texinfo.w]
+ mov ebx,[eax+texinfo.mips+mipmap.pitch]
+ shl ebp,2
+ mov edi,[eax+texinfo.src]
+ mov edx,[eax+texinfo.dst]
+ mov ecx,[eax+texinfo.mips+mipmap.bits]
+ sar ebx,2
+ add edx,ebp
+ neg ebp
+.xloop:
+ mov eax,[edi+span.v]
+ imul eax,ebx
+ add eax,[edi+span.u]
+ add edi,8
+ mov eax,[ecx+eax*4]
+ mov [edx+ebp],eax
+ add ebp,4
+ jnc .xloop
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+ end
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/a_triblt_sse2.asm b/src/thirdparty/VirtualDub/Kasumi/source/a_triblt_sse2.asm
new file mode 100644
index 000000000..54514b317
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/a_triblt_sse2.asm
@@ -0,0 +1,197 @@
+ segment .rdata, align=16
+
+correct dq 0000800000008000h, 0000800000008000h
+round dq 0000200000002000h, 0000200000002000h
+round1 dq 0000020000000200h, 0000020000000200h
+round2 dq 0002000000020000h, 0002000000020000h
+
+ segment .text
+
+ %include "a_triblt.inc"
+
+ extern _kVDCubicInterpTableFX14_075_MMX
+
+;--------------------------------------------------------------------------
+ global _vdasm_triblt_span_bicubic_mip_linear_sse2
+_vdasm_triblt_span_bicubic_mip_linear_sse2:
+
+;parameters
+%define .p_texinfo 20
+
+;aligned frame
+%define .af_vtemp0 0
+%define .af_mipbase 16
+%define .af_prevesp 20
+%define .afsize 24
+
+ push ebp
+ lea ebp, [esp-12]
+ push edi
+ push esi
+ push ebx
+
+ sub esp, .afsize
+ and esp, -16
+
+ mov [esp + .af_prevesp], ebp
+
+ mov ebx, [ebp + .p_texinfo]
+ mov ebp, [ebx + texinfo.dst]
+ mov esi, [ebx + texinfo.w]
+ shl esi, 2
+ add ebp,esi
+ neg esi
+
+ mov edi, [ebx + texinfo.src]
+ mov [esp + .af_mipbase], ebx
+ pxor xmm7, xmm7
+
+.xloop:
+
+ ;registers:
+ ; eax base texel address
+ ; ebx first mip info
+ ; ecx horizontal filter
+ ; edx vertical filter
+ ; esi horizontal count
+ ; edi mipspan
+ ; ebp destination
+
+%macro .SETUPADDR 1
+ ;compute mipmap index and UV
+ movd xmm0, [edi + mipspan.u]
+ movd xmm1, [edi + mipspan.v]
+ punpckldq xmm0, xmm1
+ mov ebx, [edi + mipspan.lambda]
+ shr ebx, 4
+ and ebx, byte -16
+
+ add ebx, mipmap_size*%1
+ movd xmm2, ebx
+ add ebx, [esp + .af_mipbase]
+ psrlq xmm2, 4
+ psrad xmm0, xmm2
+ paddd xmm0, [correct]
+ pshufd xmm1, xmm0, 01010101b
+
+ ;compute horizontal filters
+ movd ecx, xmm0
+ shr ecx, 4
+ and ecx, 0ff0h
+ add ecx, _kVDCubicInterpTableFX14_075_MMX
+
+ ;compute vertical filter
+ movd edx, xmm1
+ and edx, 0ff00h
+ shr edx, 4
+ add edx, _kVDCubicInterpTableFX14_075_MMX
+
+ ;compute texel address
+ movd xmm1, [ebx + mipmap.uvmul]
+ psrld xmm0, 16
+ packssdw xmm0, xmm0
+ pmaddwd xmm0, xmm1
+ movd eax, xmm0
+ add eax, [ebx + mipmap.bits]
+%endmacro
+
+%macro .HCUBIC 4
+ movd %1, dword [eax]
+ movd %3, dword [eax+4]
+ movd %2, dword [eax+8]
+ movd %4, dword [eax+12]
+ punpcklbw %1, %3
+ punpcklbw %2, %4
+ punpcklbw %1, xmm7
+ punpcklbw %2, xmm7
+ movdqa %3, [ecx]
+ pshufd %4, %3, 11101110b
+ pshufd %3, %3, 01000100b
+ pmaddwd %1, %3
+ pmaddwd %2, %4
+ paddd %1, %2
+%endmacro
+
+%macro .VCUBIC 1
+ .HCUBIC xmm0, xmm4, xmm5, xmm6
+ add eax, %1
+ .HCUBIC xmm1, xmm4, xmm5, xmm6
+ add eax, %1
+ .HCUBIC xmm2, xmm4, xmm5, xmm6
+ add eax, %1
+ .HCUBIC xmm3, xmm4, xmm5, xmm6
+
+ movq xmm4, [round1]
+
+ paddd xmm0, xmm4
+
+ paddd xmm1, xmm4
+ psrad xmm0, 10
+
+ paddd xmm2, xmm4
+ psrad xmm1, 10
+ packssdw xmm0, xmm0
+
+ paddd xmm3, xmm4
+ psrad xmm2, 10
+ packssdw xmm1, xmm1
+
+ movdqa xmm5, [edx]
+ psrad xmm3, 10
+ punpcklwd xmm0, xmm1
+
+ packssdw xmm2, xmm2
+ packssdw xmm3, xmm3
+ pshufd xmm4, xmm5, 01000100b
+
+ pmaddwd xmm0, xmm4
+ punpcklwd xmm2, xmm3
+
+ pshufd xmm5, xmm5, 11101110b
+
+ pmaddwd xmm2, xmm5
+ paddd xmm0, xmm2
+ paddd xmm0, [round2]
+ psrad xmm0, 18
+
+ packssdw xmm0, xmm0
+%endmacro
+
+ ;fetch mipmap 1
+ .SETUPADDR 0
+ .VCUBIC [ebx+mipmap.pitch]
+
+ movq [esp + .af_vtemp0], xmm0
+
+ ;fetch mipmap 2
+ .SETUPADDR 1
+ .VCUBIC [ebx+mipmap.pitch]
+
+ ;blend mips
+ movq xmm1, [esp + .af_vtemp0]
+
+ psubw xmm0, xmm1
+
+ movd xmm3, [edi+mipspan.lambda]
+ pshuflw xmm3, xmm3, 0
+ psllw xmm3, 8
+ psrlq xmm3, 1
+
+ paddw xmm0, xmm0
+ pmulhw xmm0, xmm3
+ paddw xmm0, xmm1
+ packuswb xmm0, xmm0
+
+ movd dword [ebp+esi], xmm0
+ add edi, mipspan_size
+ add esi,4
+ jnc .xloop
+
+ mov esp, [esp + .af_prevesp]
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+ end
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/alphablt.cpp b/src/thirdparty/VirtualDub/Kasumi/source/alphablt.cpp
new file mode 100644
index 000000000..a292ca2bd
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/alphablt.cpp
@@ -0,0 +1,76 @@
+#include <vd2/system/math.h>
+#include <vd2/system/cpuaccel.h>
+#include <vd2/Kasumi/pixmap.h>
+#include <vd2/Kasumi/pixmaputils.h>
+#include <vd2/Kasumi/pixmapops.h>
+
+void VDPixmapBltAlphaConst8(uint8 *dst, ptrdiff_t dstpitch, const uint8 *src, ptrdiff_t srcpitch, uint32 w, uint32 h, uint32 ialpha);
+
+bool VDPixmapBltAlphaConst(const VDPixmap& dst, const VDPixmap& src, float alpha) {
+ if (!(alpha >= 0.0f))
+ alpha = 0.0f;
+ else if (!(alpha <= 1.0f))
+ alpha = 1.0f;
+
+ uint32 ialpha = VDRoundToInt32(alpha * 256.0f);
+
+ // format check
+ if (dst.format != src.format || !src.format)
+ return false;
+
+ // degenerate case check
+ if (!dst.w || !dst.h)
+ return false;
+
+ // size check
+ if (src.w != dst.w || src.h != dst.h)
+ return false;
+
+ // check for formats that are not 8bpp
+ switch(src.format) {
+ case nsVDPixmap::kPixFormat_Pal1:
+ case nsVDPixmap::kPixFormat_Pal2:
+ case nsVDPixmap::kPixFormat_Pal4:
+ case nsVDPixmap::kPixFormat_Pal8:
+ case nsVDPixmap::kPixFormat_RGB565:
+ case nsVDPixmap::kPixFormat_XRGB1555:
+ return false;
+ }
+
+ const VDPixmapFormatInfo& formatInfo = VDPixmapGetInfo(src.format);
+
+ const int qw = -(-dst.w >> formatInfo.qwbits);
+ const int qh = -(-dst.h >> formatInfo.qhbits);
+ const int auxw = -(-dst.w >> formatInfo.auxwbits);
+ const int auxh = -(-dst.h >> formatInfo.auxhbits);
+
+ switch(formatInfo.auxbufs) {
+ case 2:
+ VDPixmapBltAlphaConst8((uint8 *)dst.data3, dst.pitch3, (const uint8 *)src.data3, src.pitch3, auxw, auxh, ialpha);
+ case 1:
+ VDPixmapBltAlphaConst8((uint8 *)dst.data2, dst.pitch2, (const uint8 *)src.data2, src.pitch2, auxw, auxh, ialpha);
+ case 0:
+ VDPixmapBltAlphaConst8((uint8 *)dst.data, dst.pitch, (const uint8 *)src.data, src.pitch, formatInfo.qsize * qw, qh, ialpha);
+ }
+
+ return true;
+}
+
+void VDPixmapBltAlphaConst8(uint8 *dst, ptrdiff_t dstpitch, const uint8 *src, ptrdiff_t srcpitch, uint32 w, uint32 h, uint32 ialpha) {
+ dstpitch -= w;
+ srcpitch -= w;
+ do {
+ uint32 w2 = w;
+ do {
+ sint32 sc = *src;
+ sint32 dc = *dst;
+
+ *dst = dc + (((sc-dc)*ialpha + 128) >> 8);
+ ++src;
+ ++dst;
+ } while(--w2);
+
+ src += srcpitch;
+ dst += dstpitch;
+ } while(--h);
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/blt.cpp b/src/thirdparty/VirtualDub/Kasumi/source/blt.cpp
new file mode 100644
index 000000000..75e5542a9
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/blt.cpp
@@ -0,0 +1,273 @@
+#include <vector>
+#include <vd2/system/memory.h>
+#include <vd2/system/cpuaccel.h>
+#include <vd2/system/vdstl.h>
+#include <vd2/Kasumi/pixmap.h>
+#include <vd2/Kasumi/pixmaputils.h>
+#include <vd2/Kasumi/pixmapops.h>
+
+#if _MSC_VER >= 1300
+ #define VDNOINLINE __declspec(noinline)
+#else
+ #define VDNOINLINE
+#endif
+
+using namespace nsVDPixmap;
+
+namespace {
+ typedef void (*tpPalettedBlitter)(void *dst, ptrdiff_t dstpitch, const void *src, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h, const void *pal);
+ typedef void (*tpChunkyBlitter)(void *dst, ptrdiff_t dstpitch, const void *src, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h);
+ typedef void (*tpPlanarBlitter)(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h);
+}
+
+bool VDPixmapBltDirect(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h);
+
+void VDPixmapBltDirectPalettedConversion(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h, tpPalettedBlitter pBlitter) {
+ uint8 palbytes[256 * 3];
+
+ int palsize;
+
+ switch(src.format) {
+ case kPixFormat_Pal1:
+ palsize = 2;
+ break;
+ case kPixFormat_Pal2:
+ palsize = 4;
+ break;
+ case kPixFormat_Pal4:
+ palsize = 16;
+ break;
+ case kPixFormat_Pal8:
+ palsize = 256;
+ break;
+ default:
+ VDNEVERHERE;
+ }
+
+ VDASSERT(src.palette);
+
+ VDPixmap srcpal = { (void *)src.palette, NULL, palsize, 1, 0, kPixFormat_XRGB8888 };
+ VDPixmap dstpal = { palbytes, NULL, palsize, 1, 0, dst.format };
+
+ VDVERIFY(VDPixmapBltDirect(dstpal, srcpal, palsize, 1));
+
+ pBlitter(dst.data, dst.pitch, src.data, src.pitch, w, h, palbytes);
+}
+
+tpVDPixBltTable VDPixmapGetBlitterTable() {
+#if defined(_WIN32) && defined(_M_IX86)
+ static tpVDPixBltTable pBltTable;
+
+ if (CPUGetEnabledExtensions() & CPUF_SUPPORTS_MMX) {
+ return VDGetPixBltTableX86MMX();
+ } else {
+ return VDGetPixBltTableX86Scalar();
+ }
+#else
+ static tpVDPixBltTable pBltTable = VDGetPixBltTableReference();
+ return pBltTable;
+#endif
+}
+
+bool VDPixmapBltDirect(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h) {
+ if ((unsigned)src.format >= kPixFormat_Max_Standard) {
+ VDASSERT(false);
+ return false;
+ }
+
+ if ((unsigned)dst.format >= kPixFormat_Max_Standard) {
+ VDASSERT(false);
+ return false;
+ }
+
+ const VDPixmapFormatInfo& srcinfo = VDPixmapGetInfo(src.format);
+
+ if (src.format == dst.format) {
+ int qw = w;
+ int qh = h;
+
+ if (srcinfo.qchunky) {
+ qw = (qw + srcinfo.qw - 1) / srcinfo.qw;
+ qh = -(-h >> srcinfo.qhbits);
+ }
+
+ const int auxw = -(-w >> srcinfo.auxwbits);
+ const int auxh = -(-h >> srcinfo.auxhbits);
+
+ switch(srcinfo.auxbufs) {
+ case 2:
+ VDMemcpyRect(dst.data3, dst.pitch3, src.data3, src.pitch3, srcinfo.auxsize * auxw, auxh);
+ case 1:
+ VDMemcpyRect(dst.data2, dst.pitch2, src.data2, src.pitch2, srcinfo.auxsize * auxw, auxh);
+ case 0:
+ VDMemcpyRect(dst.data, dst.pitch, src.data, src.pitch, srcinfo.qsize * qw, qh);
+ }
+
+ return true;
+ }
+
+ VDPixmapBlitterFn pBlitter = VDPixmapGetBlitterTable()[src.format][dst.format];
+
+ if (!pBlitter)
+ return false;
+
+ pBlitter(dst, src, w, h);
+ return true;
+}
+
+bool VDPixmapIsBltPossible(int dst_format, int src_format) {
+ if (src_format == dst_format)
+ return true;
+
+ tpVDPixBltTable tab(VDPixmapGetBlitterTable());
+
+ if (tab[src_format][dst_format])
+ return true;
+
+ const VDPixmapFormatInfo& srcinfo = VDPixmapGetInfo(src_format);
+ const VDPixmapFormatInfo& dstinfo = VDPixmapGetInfo(dst_format);
+
+ if (srcinfo.auxbufs > 0 || dstinfo.auxbufs > 0)
+ return false; // fail, planar buffers involved (can't do scanlines independently)
+
+ return (tab[src_format][kPixFormat_YUV444_XVYU] && tab[kPixFormat_YUV444_XVYU][dst_format])
+ ||(tab[src_format][kPixFormat_XRGB8888] && tab[kPixFormat_XRGB8888][dst_format]);
+}
+
+bool VDNOINLINE VDPixmapBltTwoStage(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h) {
+ const VDPixmapFormatInfo& srcinfo = VDPixmapGetInfo(src.format);
+ const VDPixmapFormatInfo& dstinfo = VDPixmapGetInfo(dst.format);
+
+ if (srcinfo.auxbufs > 0 || dstinfo.auxbufs > 0)
+ return false; // fail, planar buffers involved
+
+ if (srcinfo.qh > 1)
+ return false; // fail, vertically packed formats involved
+
+ if (srcinfo.palsize)
+ return false; // fail, paletted formats involved
+
+	// Allocate a 4xW buffer and try round-tripping through either
+	// RGB32 or XVYU.
+ vdblock<uint32> tempBuf;
+
+ tpVDPixBltTable tab(VDPixmapGetBlitterTable());
+
+ VDPixmap linesrc(src);
+ VDPixmap linedst(dst);
+ VDPixmap linetmp = {};
+
+ if (w < 1024) {
+ linetmp.data = _alloca(sizeof(uint32) * w);
+ } else {
+ tempBuf.resize(w + 1);
+ linetmp.data = tempBuf.data();
+ }
+ linetmp.pitch = 0;
+ linetmp.format = kPixFormat_YUV444_XVYU;
+ linetmp.w = w;
+ linetmp.h = 1;
+
+ VDPixmapBlitterFn pb1 = tab[src.format][kPixFormat_YUV444_XVYU];
+ VDPixmapBlitterFn pb2 = tab[kPixFormat_YUV444_XVYU][dst.format];
+ if (!pb1 || !pb2) {
+ pb1 = tab[src.format][kPixFormat_XRGB8888];
+ pb2 = tab[kPixFormat_XRGB8888][dst.format];
+ if (!pb1 || !pb2)
+ return false;
+
+ linetmp.format = kPixFormat_XRGB8888;
+ }
+
+ do {
+ pb1(linetmp, linesrc, w, 1);
+ pb2(linedst, linetmp, w, 1);
+ vdptrstep(linesrc.data, linesrc.pitch);
+ vdptrstep(linedst.data, linedst.pitch);
+ } while(--h);
+ return true;
+}
+
+bool VDPixmapBltFast(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h) {
+ if (VDPixmapBltDirect(dst, src, w, h))
+ return true;
+
+ // Oro... let's see if we can do a two-stage conversion.
+ return VDPixmapBltTwoStage(dst, src, w, h);
+}
+
+bool VDPixmapBlt(const VDPixmap& dst, const VDPixmap& src) {
+ vdpixsize w = std::min<vdpixsize>(src.w, dst.w);
+ vdpixsize h = std::min<vdpixsize>(src.h, dst.h);
+
+ if (!w || !h)
+ return true;
+
+ return VDPixmapBltFast(dst, src, w, h);
+}
+
+bool VDPixmapBlt(const VDPixmap& dst, vdpixpos x1, vdpixpos y1, const VDPixmap& src, vdpixpos x2, vdpixpos y2, vdpixsize w, vdpixsize h) {
+ if (x1 < 0) {
+ x2 -= x1;
+ w -= x1;
+ x1 = 0;
+ }
+
+ if (y1 < 0) {
+ y2 -= y1;
+ h -= y1;
+ y1 = 0;
+ }
+
+ if (x2 < 0) {
+ x1 -= x2;
+ w -= x2;
+ x2 = 0;
+ }
+
+ if (y2 < 0) {
+ y1 -= y2;
+ h -= y2;
+ y2 = 0;
+ }
+
+ if (w > dst.w - x1)
+ w = dst.w - x1;
+
+ if (h > dst.h - y1)
+ h = dst.h - y1;
+
+ if (w > src.w - x2)
+ w = src.w - x2;
+
+ if (h > src.h - y2)
+ h = src.h - y2;
+
+ if (w>=0 && h >= 0) {
+ VDPixmap dst2(VDPixmapOffset(dst, x1, y1));
+ VDPixmap src2(VDPixmapOffset(src, x2, y2));
+
+ return VDPixmapBltFast(dst2, src2, w, h);
+ }
+
+ return true;
+}
+
+extern bool VDPixmapStretchBltNearest_reference(const VDPixmap& dst, sint32 x1, sint32 y1, sint32 x2, sint32 y2, const VDPixmap& src, sint32 u1, sint32 v1, sint32 u2, sint32 v2);
+extern bool VDPixmapStretchBltBilinear_reference(const VDPixmap& dst, sint32 x1, sint32 y1, sint32 x2, sint32 y2, const VDPixmap& src, sint32 u1, sint32 v1, sint32 u2, sint32 v2);
+
+bool VDPixmapStretchBltNearest(const VDPixmap& dst, const VDPixmap& src) {
+ return VDPixmapStretchBltNearest(dst, 0, 0, dst.w<<16, dst.h<<16, src, 0, 0, src.w<<16, src.h<<16);
+}
+
+bool VDPixmapStretchBltNearest(const VDPixmap& dst, sint32 x1, sint32 y1, sint32 x2, sint32 y2, const VDPixmap& src, sint32 u1, sint32 v1, sint32 u2, sint32 v2) {
+ return VDPixmapStretchBltNearest_reference(dst, x1, y1, x2, y2, src, u1, v1, u2, v2);
+}
+
+bool VDPixmapStretchBltBilinear(const VDPixmap& dst, const VDPixmap& src) {
+ return VDPixmapStretchBltBilinear(dst, 0, 0, dst.w<<16, dst.h<<16, src, 0, 0, src.w<<16, src.h<<16);
+}
+
+bool VDPixmapStretchBltBilinear(const VDPixmap& dst, sint32 x1, sint32 y1, sint32 x2, sint32 y2, const VDPixmap& src, sint32 u1, sint32 v1, sint32 u2, sint32 v2) {
+ return VDPixmapStretchBltBilinear_reference(dst, x1, y1, x2, y2, src, u1, v1, u2, v2);
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/blt_reference.cpp b/src/thirdparty/VirtualDub/Kasumi/source/blt_reference.cpp
new file mode 100644
index 000000000..c4dccce9f
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/blt_reference.cpp
@@ -0,0 +1,259 @@
+#include <vd2/system/vdtypes.h>
+#include <vd2/Kasumi/pixmap.h>
+#include <vd2/Kasumi/pixmaputils.h>
+#include "blt_setup.h"
+
+#define DECLARE_PALETTED(x, y) extern void VDPixmapBlt_##x##_to_##y##_reference(void *dst0, ptrdiff_t dstpitch, const void *src0, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h, const void *pal0)
+#define DECLARE_RGB(x, y) extern void VDPixmapBlt_##x##_to_##y##_reference(void *dst0, ptrdiff_t dstpitch, const void *src0, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h)
+#define DECLARE_YUV(x, y) extern void VDPixmapBlt_##x##_to_##y##_reference(void *dst0, ptrdiff_t dstpitch, const void *src0, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h)
+#define DECLARE_YUV_REV(x, y) void VDPixmapBlt_##x##_to_##y##_reference(void *dst0, ptrdiff_t dstpitch, const void *src0, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h)
+#define DECLARE_YUV_PLANAR(x, y) extern void VDPixmapBlt_##x##_to_##y##_reference(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h)
+
+DECLARE_RGB(RGB565, XRGB1555);
+DECLARE_RGB(RGB888, XRGB1555);
+DECLARE_RGB(XRGB8888, XRGB1555);
+DECLARE_RGB(XRGB1555, RGB565);
+DECLARE_RGB(RGB888, RGB565);
+DECLARE_RGB(XRGB8888, RGB565);
+DECLARE_RGB(XRGB1555, RGB888);
+DECLARE_RGB(RGB565, RGB888);
+DECLARE_RGB(XRGB8888, RGB888);
+DECLARE_RGB(XRGB1555, XRGB8888);
+DECLARE_RGB(RGB565, XRGB8888);
+DECLARE_RGB(RGB888, XRGB8888);
+
+DECLARE_PALETTED(Pal1, Any8);
+DECLARE_PALETTED(Pal1, Any16);
+DECLARE_PALETTED(Pal1, Any24);
+DECLARE_PALETTED(Pal1, Any32);
+DECLARE_PALETTED(Pal2, Any8);
+DECLARE_PALETTED(Pal2, Any16);
+DECLARE_PALETTED(Pal2, Any24);
+DECLARE_PALETTED(Pal2, Any32);
+DECLARE_PALETTED(Pal4, Any8);
+DECLARE_PALETTED(Pal4, Any16);
+DECLARE_PALETTED(Pal4, Any24);
+DECLARE_PALETTED(Pal4, Any32);
+DECLARE_PALETTED(Pal8, Any8);
+DECLARE_PALETTED(Pal8, Any16);
+DECLARE_PALETTED(Pal8, Any24);
+DECLARE_PALETTED(Pal8, Any32);
+
+DECLARE_YUV(XVYU, UYVY);
+DECLARE_YUV(XVYU, YUYV);
+DECLARE_YUV(Y8, UYVY);
+DECLARE_YUV(Y8, YUYV);
+DECLARE_YUV(UYVY, Y8);
+DECLARE_YUV(YUYV, Y8);
+DECLARE_YUV(UYVY, YUYV);
+DECLARE_YUV_PLANAR(YUV411, YV12);
+
+DECLARE_YUV(UYVY, XRGB1555);
+DECLARE_YUV(UYVY, RGB565);
+DECLARE_YUV(UYVY, RGB888);
+DECLARE_YUV(UYVY, XRGB8888);
+DECLARE_YUV(YUYV, XRGB1555);
+DECLARE_YUV(YUYV, RGB565);
+DECLARE_YUV(YUYV, RGB888);
+DECLARE_YUV(YUYV, XRGB8888);
+DECLARE_YUV(Y8, XRGB1555);
+DECLARE_YUV(Y8, RGB565);
+DECLARE_YUV(Y8, RGB888);
+DECLARE_YUV(Y8, XRGB8888);
+
+DECLARE_YUV_REV(XRGB1555, Y8);
+DECLARE_YUV_REV(RGB565, Y8);
+DECLARE_YUV_REV(RGB888, Y8);
+DECLARE_YUV_REV(XRGB8888, Y8);
+
+DECLARE_YUV_REV(XRGB1555, XVYU);
+DECLARE_YUV_REV(RGB565, XVYU);
+DECLARE_YUV_REV(RGB888, XVYU);
+DECLARE_YUV_REV(XRGB8888, XVYU);
+
+DECLARE_YUV_PLANAR(YV12, XRGB1555);
+DECLARE_YUV_PLANAR(YV12, RGB565);
+DECLARE_YUV_PLANAR(YV12, RGB888);
+DECLARE_YUV_PLANAR(YV12, XRGB8888);
+
+DECLARE_YUV_PLANAR(YUV411, XRGB1555);
+DECLARE_YUV_PLANAR(YUV411, RGB565);
+DECLARE_YUV_PLANAR(YUV411, RGB888);
+DECLARE_YUV_PLANAR(YUV411, XRGB8888);
+
+extern void VDPixmapBlt_YUVPlanar_decode_reference(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h);
+extern void VDPixmapBlt_YUVPlanar_encode_reference(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h);
+extern void VDPixmapBlt_YUVPlanar_convert_reference(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h);
+extern void VDPixmapBlt_UberblitAdapter(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h);
+
+using namespace nsVDPixmap;
+
+void VDPixmapInitBlittersReference(VDPixmapBlitterTable& table) {
+ // use uberblit as the baseline
+ VDPixmapFormatSubset uberblitSrcFormats;
+ VDPixmapFormatSubset uberblitDstFormats;
+
+ uberblitSrcFormats =
+ kPixFormat_Pal1,
+ kPixFormat_Pal2,
+ kPixFormat_Pal4,
+ kPixFormat_Pal8,
+ kPixFormat_XRGB1555,
+ kPixFormat_RGB565,
+ kPixFormat_RGB888,
+ kPixFormat_XRGB8888,
+ kPixFormat_Y8,
+ kPixFormat_YUV422_UYVY,
+ kPixFormat_YUV422_YUYV,
+ kPixFormat_YUV444_XVYU,
+ kPixFormat_YUV444_Planar,
+ kPixFormat_YUV422_Planar,
+ kPixFormat_YUV422_Planar_16F,
+ kPixFormat_YUV420_Planar,
+ kPixFormat_YUV411_Planar,
+ kPixFormat_YUV410_Planar,
+ kPixFormat_YUV422_Planar_Centered,
+ kPixFormat_YUV420_Planar_Centered,
+ kPixFormat_YUV422_V210,
+ kPixFormat_YUV422_UYVY_709,
+ kPixFormat_YUV420_NV12;
+
+ uberblitDstFormats =
+ kPixFormat_XRGB1555,
+ kPixFormat_RGB565,
+ kPixFormat_RGB888,
+ kPixFormat_XRGB8888,
+ kPixFormat_Y8,
+ kPixFormat_YUV422_UYVY,
+ kPixFormat_YUV422_YUYV,
+ kPixFormat_YUV444_XVYU,
+ kPixFormat_YUV444_Planar,
+ kPixFormat_YUV422_Planar,
+ kPixFormat_YUV422_Planar_16F,
+ kPixFormat_YUV420_Planar,
+ kPixFormat_YUV411_Planar,
+ kPixFormat_YUV410_Planar,
+ kPixFormat_YUV422_Planar_Centered,
+ kPixFormat_YUV420_Planar_Centered,
+ kPixFormat_YUV422_V210,
+ kPixFormat_YUV422_UYVY_709,
+ kPixFormat_YUV420_NV12;
+
+ table.AddBlitter(uberblitSrcFormats, uberblitDstFormats, VDPixmapBlt_UberblitAdapter);
+
+ // standard formats
+
+ table.AddBlitter(kPixFormat_Pal1, kPixFormat_Y8, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal1_to_Any8_reference>);
+ table.AddBlitter(kPixFormat_Pal1, kPixFormat_XRGB1555, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal1_to_Any16_reference>);
+ table.AddBlitter(kPixFormat_Pal1, kPixFormat_RGB565, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal1_to_Any16_reference>);
+ table.AddBlitter(kPixFormat_Pal1, kPixFormat_RGB888, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal1_to_Any24_reference>);
+ table.AddBlitter(kPixFormat_Pal1, kPixFormat_XRGB8888, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal1_to_Any32_reference>);
+ table.AddBlitter(kPixFormat_Pal2, kPixFormat_Y8, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal2_to_Any8_reference>);
+ table.AddBlitter(kPixFormat_Pal2, kPixFormat_XRGB1555, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal2_to_Any16_reference>);
+ table.AddBlitter(kPixFormat_Pal2, kPixFormat_RGB565, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal2_to_Any16_reference>);
+ table.AddBlitter(kPixFormat_Pal2, kPixFormat_RGB888, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal2_to_Any24_reference>);
+ table.AddBlitter(kPixFormat_Pal2, kPixFormat_XRGB8888, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal2_to_Any32_reference>);
+ table.AddBlitter(kPixFormat_Pal4, kPixFormat_Y8, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal4_to_Any8_reference>);
+ table.AddBlitter(kPixFormat_Pal4, kPixFormat_XRGB1555, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal4_to_Any16_reference>);
+ table.AddBlitter(kPixFormat_Pal4, kPixFormat_RGB565, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal4_to_Any16_reference>);
+ table.AddBlitter(kPixFormat_Pal4, kPixFormat_RGB888, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal4_to_Any24_reference>);
+ table.AddBlitter(kPixFormat_Pal4, kPixFormat_XRGB8888, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal4_to_Any32_reference>);
+ table.AddBlitter(kPixFormat_Pal8, kPixFormat_Y8, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal8_to_Any8_reference>);
+ table.AddBlitter(kPixFormat_Pal8, kPixFormat_XRGB1555, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal8_to_Any16_reference>);
+ table.AddBlitter(kPixFormat_Pal8, kPixFormat_RGB565, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal8_to_Any16_reference>);
+ table.AddBlitter(kPixFormat_Pal8, kPixFormat_RGB888, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal8_to_Any24_reference>);
+ table.AddBlitter(kPixFormat_Pal8, kPixFormat_XRGB8888, VDPixmapBlitterPalettedAdapter<VDPixmapBlt_Pal8_to_Any32_reference>);
+
+ table.AddBlitter(kPixFormat_XRGB1555, kPixFormat_RGB565, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_XRGB1555_to_RGB565_reference>);
+ table.AddBlitter(kPixFormat_XRGB1555, kPixFormat_RGB888, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_XRGB1555_to_RGB888_reference>);
+ table.AddBlitter(kPixFormat_XRGB1555, kPixFormat_XRGB8888, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_XRGB1555_to_XRGB8888_reference>);
+ table.AddBlitter(kPixFormat_RGB565, kPixFormat_XRGB1555, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_RGB565_to_XRGB1555_reference>);
+ table.AddBlitter(kPixFormat_RGB565, kPixFormat_RGB888, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_RGB565_to_RGB888_reference>);
+ table.AddBlitter(kPixFormat_RGB565, kPixFormat_XRGB8888, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_RGB565_to_XRGB8888_reference>);
+ table.AddBlitter(kPixFormat_RGB888, kPixFormat_XRGB1555, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_RGB888_to_XRGB1555_reference>);
+ table.AddBlitter(kPixFormat_RGB888, kPixFormat_RGB565, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_RGB888_to_RGB565_reference>);
+ table.AddBlitter(kPixFormat_RGB888, kPixFormat_XRGB8888, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_RGB888_to_XRGB8888_reference>);
+ table.AddBlitter(kPixFormat_XRGB8888, kPixFormat_XRGB1555, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_XRGB8888_to_XRGB1555_reference>);
+ table.AddBlitter(kPixFormat_XRGB8888, kPixFormat_RGB565, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_XRGB8888_to_RGB565_reference>);
+ table.AddBlitter(kPixFormat_XRGB8888, kPixFormat_RGB888, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_XRGB8888_to_RGB888_reference>);
+
+ table.AddBlitter(kPixFormat_YUV444_XVYU, kPixFormat_YUV422_UYVY, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_XVYU_to_UYVY_reference>);
+ table.AddBlitter(kPixFormat_YUV444_XVYU, kPixFormat_YUV422_YUYV, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_XVYU_to_YUYV_reference>);
+ table.AddBlitter(kPixFormat_Y8, kPixFormat_YUV422_UYVY, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_Y8_to_UYVY_reference>);
+ table.AddBlitter(kPixFormat_Y8, kPixFormat_YUV422_YUYV, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_Y8_to_YUYV_reference>);
+ table.AddBlitter(kPixFormat_YUV422_UYVY, kPixFormat_Y8, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_UYVY_to_Y8_reference>);
+ table.AddBlitter(kPixFormat_YUV422_YUYV, kPixFormat_Y8, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_YUYV_to_Y8_reference>);
+
+ table.AddBlitter(kPixFormat_YUV422_UYVY, kPixFormat_XRGB1555, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_UYVY_to_XRGB1555_reference>);
+ table.AddBlitter(kPixFormat_YUV422_UYVY, kPixFormat_RGB565, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_UYVY_to_RGB565_reference>);
+ table.AddBlitter(kPixFormat_YUV422_UYVY, kPixFormat_RGB888, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_UYVY_to_RGB888_reference>);
+ table.AddBlitter(kPixFormat_YUV422_UYVY, kPixFormat_XRGB8888, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_UYVY_to_XRGB8888_reference>);
+ table.AddBlitter(kPixFormat_YUV422_YUYV, kPixFormat_XRGB1555, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_YUYV_to_XRGB1555_reference>);
+ table.AddBlitter(kPixFormat_YUV422_YUYV, kPixFormat_RGB565, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_YUYV_to_RGB565_reference>);
+ table.AddBlitter(kPixFormat_YUV422_YUYV, kPixFormat_RGB888, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_YUYV_to_RGB888_reference>);
+ table.AddBlitter(kPixFormat_YUV422_YUYV, kPixFormat_XRGB8888, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_YUYV_to_XRGB8888_reference>);
+ table.AddBlitter(kPixFormat_Y8, kPixFormat_XRGB1555, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_Y8_to_XRGB1555_reference>);
+ table.AddBlitter(kPixFormat_Y8, kPixFormat_RGB565, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_Y8_to_RGB565_reference>);
+ table.AddBlitter(kPixFormat_Y8, kPixFormat_RGB888, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_Y8_to_RGB888_reference>);
+ table.AddBlitter(kPixFormat_Y8, kPixFormat_XRGB8888, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_Y8_to_XRGB8888_reference>);
+
+ table.AddBlitter(kPixFormat_XRGB1555, kPixFormat_YUV444_XVYU, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_XRGB1555_to_XVYU_reference>);
+ table.AddBlitter(kPixFormat_RGB565, kPixFormat_YUV444_XVYU, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_RGB565_to_XVYU_reference>);
+ table.AddBlitter(kPixFormat_RGB888, kPixFormat_YUV444_XVYU, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_RGB888_to_XVYU_reference>);
+ table.AddBlitter(kPixFormat_XRGB8888, kPixFormat_YUV444_XVYU, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_XRGB8888_to_XVYU_reference>);
+
+ table.AddBlitter(kPixFormat_XRGB1555, kPixFormat_Y8, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_XRGB1555_to_Y8_reference>);
+ table.AddBlitter(kPixFormat_RGB565, kPixFormat_Y8, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_RGB565_to_Y8_reference>);
+ table.AddBlitter(kPixFormat_RGB888, kPixFormat_Y8, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_RGB888_to_Y8_reference>);
+ table.AddBlitter(kPixFormat_XRGB8888, kPixFormat_Y8, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_XRGB8888_to_Y8_reference>);
+
+ table.AddBlitter(kPixFormat_YUV411_Planar, kPixFormat_YUV420_Planar, VDPixmapBlt_YUV411_to_YV12_reference);
+
+ table.AddBlitter(kPixFormat_YUV422_UYVY, kPixFormat_YUV422_YUYV, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_UYVY_to_YUYV_reference>);
+ table.AddBlitter(kPixFormat_YUV422_YUYV, kPixFormat_YUV422_UYVY, VDPixmapBlitterChunkyAdapter<VDPixmapBlt_UYVY_to_YUYV_reference>); // not an error -- same routine
+
+ //////////////////////////////////////////////////////////
+
+ VDPixmapFormatSubset srcFormats;
+ VDPixmapFormatSubset dstFormats;
+
+ srcFormats = kPixFormat_YUV444_Planar,
+ kPixFormat_YUV422_Planar,
+ kPixFormat_YUV420_Planar,
+ kPixFormat_YUV411_Planar,
+ kPixFormat_YUV410_Planar,
+ kPixFormat_YUV422_Planar_Centered,
+ kPixFormat_YUV420_Planar_Centered;
+
+ dstFormats = kPixFormat_XRGB1555, kPixFormat_RGB565, kPixFormat_RGB888, kPixFormat_XRGB8888, kPixFormat_YUV422_UYVY, kPixFormat_YUV422_YUYV;
+
+ table.AddBlitter(srcFormats, dstFormats, VDPixmapBlt_YUVPlanar_decode_reference);
+
+ //////////////////////////////////////////////////////////
+
+ dstFormats = kPixFormat_YUV444_Planar, kPixFormat_YUV422_Planar, kPixFormat_YUV420_Planar, kPixFormat_YUV411_Planar, kPixFormat_YUV410_Planar, kPixFormat_YUV422_Planar_Centered, kPixFormat_YUV420_Planar_Centered;
+ srcFormats = kPixFormat_XRGB1555, kPixFormat_RGB565, kPixFormat_RGB888, kPixFormat_XRGB8888, kPixFormat_YUV422_UYVY, kPixFormat_YUV422_YUYV;
+
+ table.AddBlitter(srcFormats, dstFormats, VDPixmapBlt_YUVPlanar_encode_reference);
+
+ //////////////////////////////////////////////////////////
+
+ srcFormats = kPixFormat_YUV444_Planar, kPixFormat_YUV422_Planar, kPixFormat_YUV420_Planar, kPixFormat_YUV411_Planar, kPixFormat_YUV410_Planar, kPixFormat_Y8, kPixFormat_YUV422_Planar_Centered, kPixFormat_YUV420_Planar_Centered;
+ dstFormats = kPixFormat_YUV444_Planar, kPixFormat_YUV422_Planar, kPixFormat_YUV420_Planar, kPixFormat_YUV411_Planar, kPixFormat_YUV410_Planar, kPixFormat_Y8, kPixFormat_YUV422_Planar_Centered, kPixFormat_YUV420_Planar_Centered;
+
+ table.AddBlitter(srcFormats, dstFormats, VDPixmapBlt_YUVPlanar_convert_reference);
+}
+
+tpVDPixBltTable VDGetPixBltTableReferenceInternal() {
+ static VDPixmapBlitterTable sReferenceTable;
+
+ VDPixmapInitBlittersReference(sReferenceTable);
+
+ return sReferenceTable.mTable;
+}
+
+tpVDPixBltTable VDGetPixBltTableReference() {
+ static tpVDPixBltTable spTable = VDGetPixBltTableReferenceInternal();
+
+ return spTable;
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/blt_reference_pal.cpp b/src/thirdparty/VirtualDub/Kasumi/source/blt_reference_pal.cpp
new file mode 100644
index 000000000..4a103de3b
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/blt_reference_pal.cpp
@@ -0,0 +1,545 @@
+#include <vd2/system/vdtypes.h>
+#include <vd2/Kasumi/pixmap.h>
+#include <vd2/Kasumi/pixmaputils.h>
+
+#define DECLARE_PALETTED(x, y) void VDPixmapBlt_##x##_to_##y##_reference(void *dst0, ptrdiff_t dstpitch, const void *src0, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h, const void *pal0)
+
+///////////////////////////////////////////////////////////////////////////
+//
+// RGB blitters: Pal1 ->
+//
+///////////////////////////////////////////////////////////////////////////
+
+DECLARE_PALETTED(Pal1, Any8) {
+ const uint8 *src = (const uint8 *)src0;
+ uint8 *dst = (uint8 *)dst0;
+ const uint8 *pal = (const uint8 *)pal0;
+
+ src += (w-1) >> 3;
+ dst += (w-1) & ~7;
+
+ srcpitch += (w+7) >> 3;
+ dstpitch += (w+7) & ~7;
+
+ do {
+ int wt = w;
+
+ uint8 v = src[0] >> ((-wt) & 7);
+
+ switch(wt & 7) {
+ do {
+ v = src[0];
+
+ case 0: dst[7] = pal[v&1]; v >>= 1;
+ case 7: dst[6] = pal[v&1]; v >>= 1;
+ case 6: dst[5] = pal[v&1]; v >>= 1;
+ case 5: dst[4] = pal[v&1]; v >>= 1;
+ case 4: dst[3] = pal[v&1]; v >>= 1;
+ case 3: dst[2] = pal[v&1]; v >>= 1;
+ case 2: dst[1] = pal[v&1]; v >>= 1;
+ case 1: dst[0] = pal[v&1]; v >>= 1;
+
+ dst -= 8;
+ --src;
+ } while((wt -= 8) > 0);
+ }
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_PALETTED(Pal1, Any16) {
+ const uint8 *src = (const uint8 *)src0;
+ uint16 *dst = (uint16 *)dst0;
+ const uint16 *pal = (const uint16 *)pal0;
+
+ src += (w-1) >> 3;
+ dst += (w-1) & ~7;
+
+ srcpitch += (w+7) >> 3;
+ dstpitch += ((w+7) & ~7) * 2;
+
+ do {
+ int wt = w;
+
+ uint8 v = src[0] >> ((-wt) & 7);
+
+ switch(wt & 7) {
+ do {
+ v = src[0];
+
+ case 0: dst[7] = pal[v&1]; v >>= 1;
+ case 7: dst[6] = pal[v&1]; v >>= 1;
+ case 6: dst[5] = pal[v&1]; v >>= 1;
+ case 5: dst[4] = pal[v&1]; v >>= 1;
+ case 4: dst[3] = pal[v&1]; v >>= 1;
+ case 3: dst[2] = pal[v&1]; v >>= 1;
+ case 2: dst[1] = pal[v&1]; v >>= 1;
+ case 1: dst[0] = pal[v&1]; v >>= 1;
+
+ dst -= 8;
+ --src;
+ } while((wt -= 8) > 0);
+ }
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_PALETTED(Pal1, Any24) {
+ const uint8 *src = (const uint8 *)src0;
+ uint8 *dst = (uint8 *)dst0;
+ const uint8 *pal = (const uint8 *)pal0;
+
+ src += (w-1) >> 3;
+ dst += ((w-1) & ~7) * 3;
+
+ srcpitch += (w+7) >> 3;
+ dstpitch += ((w+7) & ~7) * 3;
+
+ do {
+ int wt = w;
+
+ uint8 v = src[0] >> ((-wt) & 7);
+ const uint8 *pe;
+
+ switch(wt & 7) {
+ do {
+ v = src[0];
+
+ case 0: pe = &pal[3*(v&1)]; dst[7*3+0] = pe[0]; dst[7*3+1] = pe[1]; dst[7*3+2] = pe[2]; v >>= 1;
+ case 7: pe = &pal[3*(v&1)]; dst[6*3+0] = pe[0]; dst[6*3+1] = pe[1]; dst[6*3+2] = pe[2]; v >>= 1;
+ case 6: pe = &pal[3*(v&1)]; dst[5*3+0] = pe[0]; dst[5*3+1] = pe[1]; dst[5*3+2] = pe[2]; v >>= 1;
+ case 5: pe = &pal[3*(v&1)]; dst[4*3+0] = pe[0]; dst[4*3+1] = pe[1]; dst[4*3+2] = pe[2]; v >>= 1;
+ case 4: pe = &pal[3*(v&1)]; dst[3*3+0] = pe[0]; dst[3*3+1] = pe[1]; dst[3*3+2] = pe[2]; v >>= 1;
+ case 3: pe = &pal[3*(v&1)]; dst[2*3+0] = pe[0]; dst[2*3+1] = pe[1]; dst[2*3+2] = pe[2]; v >>= 1;
+ case 2: pe = &pal[3*(v&1)]; dst[1*3+0] = pe[0]; dst[1*3+1] = pe[1]; dst[1*3+2] = pe[2]; v >>= 1;
+ case 1: pe = &pal[3*(v&1)]; dst[0*3+0] = pe[0]; dst[0*3+1] = pe[1]; dst[0*3+2] = pe[2]; v >>= 1;
+
+ dst -= 24;
+ --src;
+ } while((wt -= 8) > 0);
+ }
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_PALETTED(Pal1, Any32) {
+ const uint8 *src = (const uint8 *)src0;
+ uint32 *dst = (uint32 *)dst0;
+ const uint32 *pal = (const uint32 *)pal0;
+
+ src += (w-1) >> 3;
+ dst += (w-1) & ~7;
+
+ srcpitch += (w+7) >> 3;
+ dstpitch += ((w+7) & ~7) * 4;
+
+ do {
+ int wt = w;
+
+ uint8 v = src[0] >> ((-wt) & 7);
+
+ switch(wt & 7) {
+ do {
+ v = src[0];
+
+ case 0: dst[7] = pal[v&1]; v >>= 1;
+ case 7: dst[6] = pal[v&1]; v >>= 1;
+ case 6: dst[5] = pal[v&1]; v >>= 1;
+ case 5: dst[4] = pal[v&1]; v >>= 1;
+ case 4: dst[3] = pal[v&1]; v >>= 1;
+ case 3: dst[2] = pal[v&1]; v >>= 1;
+ case 2: dst[1] = pal[v&1]; v >>= 1;
+ case 1: dst[0] = pal[v&1]; v >>= 1;
+
+ dst -= 8;
+ --src;
+ } while((wt -= 8) > 0);
+ }
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+///////////////////////////////////////////////////////////////////////////
+//
+// RGB blitters: Pal2 ->
+//
+///////////////////////////////////////////////////////////////////////////
+
+DECLARE_PALETTED(Pal2, Any8) {
+ const uint8 *src = (const uint8 *)src0;
+ uint8 *dst = (uint8 *)dst0;
+ const uint8 *pal = (const uint8 *)pal0;
+
+ src += (w-1) >> 2;
+ dst += (w-1) & ~3;
+
+ srcpitch += (w+3) >> 2;
+ dstpitch += (w+3) & ~3;
+
+ do {
+ int wt = w;
+
+ uint8 v = src[0] >> (((-wt) & 3)*2);
+
+ switch(wt & 3) {
+ do {
+ v = src[0];
+
+ case 0: dst[3] = pal[v&3]; v >>= 2;
+ case 3: dst[2] = pal[v&3]; v >>= 2;
+ case 2: dst[1] = pal[v&3]; v >>= 2;
+ case 1: dst[0] = pal[v&3]; v >>= 2;
+
+ dst -= 4;
+ --src;
+ } while((wt -= 4) > 0);
+ }
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_PALETTED(Pal2, Any16) {
+ const uint8 *src = (const uint8 *)src0;
+ uint16 *dst = (uint16 *)dst0;
+ const uint16 *pal = (const uint16 *)pal0;
+
+ src += (w-1) >> 2;
+ dst += (w-1) & ~3;
+
+ srcpitch += (w+3) >> 2;
+ dstpitch += ((w+3) & ~3) * 2;
+
+ do {
+ int wt = w;
+
+ uint8 v = src[0] >> (((-wt) & 3)*2);
+
+ switch(wt & 3) {
+ do {
+ v = src[0];
+
+ case 0: dst[3] = pal[v&3]; v >>= 2;
+ case 3: dst[2] = pal[v&3]; v >>= 2;
+ case 2: dst[1] = pal[v&3]; v >>= 2;
+ case 1: dst[0] = pal[v&3]; v >>= 2;
+
+ dst -= 4;
+ --src;
+ } while((wt -= 4) > 0);
+ }
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_PALETTED(Pal2, Any24) {
+ const uint8 *src = (const uint8 *)src0;
+ uint8 *dst = (uint8 *)dst0;
+ const uint8 *pal = (const uint8 *)pal0;
+
+ src += (w-1) >> 2;
+ dst += ((w-1) & ~3) * 3;
+
+ srcpitch += (w+3) >> 2;
+ dstpitch += ((w+3) & ~3) * 3;
+
+ do {
+ int wt = w;
+
+ uint8 v = src[0] >> (((-wt) & 3)*2);
+ const uint8 *pe;
+
+ switch(wt & 3) {
+ do {
+ v = src[0];
+
+ case 0: pe = &pal[3*(v&3)]; dst[3*3+0] = pe[0]; dst[3*3+1] = pe[1]; dst[3*3+2] = pe[2]; v >>= 2;
+ case 3: pe = &pal[3*(v&3)]; dst[2*3+0] = pe[0]; dst[2*3+1] = pe[1]; dst[2*3+2] = pe[2]; v >>= 2;
+ case 2: pe = &pal[3*(v&3)]; dst[1*3+0] = pe[0]; dst[1*3+1] = pe[1]; dst[1*3+2] = pe[2]; v >>= 2;
+ case 1: pe = &pal[3*(v&3)]; dst[0*3+0] = pe[0]; dst[0*3+1] = pe[1]; dst[0*3+2] = pe[2]; v >>= 2;
+
+ dst -= 12;
+ --src;
+ } while((wt -= 4) > 0);
+ }
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_PALETTED(Pal2, Any32) {
+ const uint8 *src = (const uint8 *)src0;
+ uint32 *dst = (uint32 *)dst0;
+ const uint32 *pal = (const uint32 *)pal0;
+
+ src += (w-1) >> 2;
+ dst += (w-1) & ~3;
+
+ srcpitch += (w+3) >> 2;
+ dstpitch += ((w+3) & ~3) * 4;
+
+ do {
+ int wt = w;
+
+ uint8 v = src[0] >> (((-wt) & 3)*2);
+
+ switch(wt & 3) {
+ do {
+ v = src[0];
+
+ case 0: dst[3] = pal[v&3]; v >>= 2;
+ case 3: dst[2] = pal[v&3]; v >>= 2;
+ case 2: dst[1] = pal[v&3]; v >>= 2;
+ case 1: dst[0] = pal[v&3]; v >>= 2;
+
+ dst -= 4;
+ --src;
+ } while((wt -= 4) > 0);
+ }
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+///////////////////////////////////////////////////////////////////////////
+//
+// RGB blitters: Pal4 ->
+//
+///////////////////////////////////////////////////////////////////////////
+
+DECLARE_PALETTED(Pal4, Any8) {
+ const uint8 *src = (const uint8 *)src0;
+ uint8 *dst = (uint8 *)dst0;
+ const uint8 *pal = (const uint8 *)pal0;
+
+ src += (w-1) >> 1;
+ dst += ((w-1) & ~1);
+
+ srcpitch += (w+1) >> 1;
+ dstpitch += (w+1) & ~1;
+
+ do {
+ int wt = w;
+
+ uint8 v = src[0] >> (((-wt) & 1)*4);
+
+ switch(wt & 1) {
+ do {
+ v = src[0];
+
+ case 0: dst[1] = pal[v&15]; v >>= 4;
+ case 1: dst[0] = pal[v&15]; v >>= 4;
+
+ dst -= 2;
+ --src;
+ } while((wt -= 2) > 0);
+ }
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_PALETTED(Pal4, Any16) {
+ const uint8 *src = (const uint8 *)src0;
+ uint16 *dst = (uint16 *)dst0;
+ const uint16 *pal = (const uint16 *)pal0;
+
+ src += (w-1) >> 1;
+ dst += ((w-1) & ~1);
+
+ srcpitch += (w+1) >> 1;
+ dstpitch += ((w+1) & ~1) * 2;
+
+ do {
+ int wt = w;
+
+ uint8 v = src[0] >> (((-wt) & 1)*4);
+
+ switch(wt & 1) {
+ do {
+ v = src[0];
+
+ case 0: dst[1] = pal[v&15]; v >>= 4;
+ case 1: dst[0] = pal[v&15]; v >>= 4;
+
+ dst -= 2;
+ --src;
+ } while((wt -= 2) > 0);
+ }
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_PALETTED(Pal4, Any24) {
+ const uint8 *src = (const uint8 *)src0;
+ uint8 *dst = (uint8 *)dst0;
+ const uint8 *pal = (const uint8 *)pal0;
+
+ src += (w-1) >> 1;
+ dst += ((w-1) & ~1) * 3;
+
+ srcpitch += (w+1) >> 1;
+ dstpitch += ((w+1) & ~1) * 3;
+
+ do {
+ int wt = w;
+
+ uint8 v = src[0] >> (((-wt) & 1)*4);
+ const uint8 *pe;
+
+ switch(wt & 1) {
+ do {
+ v = src[0];
+
+ case 0: pe = &pal[3*(v&15)]; dst[1*3+0] = pe[0]; dst[1*3+1] = pe[1]; dst[1*3+2] = pe[2]; v >>= 4;
+ case 1: pe = &pal[3*(v&15)]; dst[0*3+0] = pe[0]; dst[0*3+1] = pe[1]; dst[0*3+2] = pe[2]; v >>= 4;
+
+ dst -= 6;
+ --src;
+ } while((wt -= 2) > 0);
+ }
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_PALETTED(Pal4, Any32) {
+ const uint8 *src = (const uint8 *)src0;
+ uint32 *dst = (uint32 *)dst0;
+ const uint32 *pal = (const uint32 *)pal0;
+
+ src += (w-1) >> 1;
+ dst += ((w-1) & ~1);
+
+ srcpitch += (w+1) >> 1;
+ dstpitch += ((w+1) & ~1) * 4;
+
+ do {
+ int wt = w;
+
+ uint8 v = src[0] >> (((-wt) & 1)*4);
+
+ switch(wt & 1) {
+ do {
+ v = src[0];
+
+ case 0: dst[1] = pal[v&15]; v >>= 4;
+ case 1: dst[0] = pal[v&15]; v >>= 4;
+
+ dst -= 2;
+ --src;
+ } while((wt -= 2) > 0);
+ }
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+///////////////////////////////////////////////////////////////////////////
+//
+// RGB blitters: Pal8 ->
+//
+///////////////////////////////////////////////////////////////////////////
+
+DECLARE_PALETTED(Pal8, Any8) {
+ const uint8 *src = (const uint8 *)src0;
+ uint8 *dst = (uint8 *)dst0;
+ const uint8 *pal = (const uint8 *)pal0;
+
+ srcpitch -= w;
+ dstpitch -= w;
+
+ do {
+ int wt = w;
+
+ do {
+ *dst++ = pal[*src++];
+ } while(--wt);
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_PALETTED(Pal8, Any16) {
+ const uint8 *src = (const uint8 *)src0;
+ uint16 *dst = (uint16 *)dst0;
+ const uint16 *pal = (const uint16 *)pal0;
+
+ srcpitch -= w;
+ dstpitch -= w*2;
+
+ do {
+ int wt = w;
+
+ do {
+ *dst++ = pal[*src++];
+ } while(--wt);
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_PALETTED(Pal8, Any24) {
+ const uint8 *src = (const uint8 *)src0;
+ uint8 *dst = (uint8 *)dst0;
+ const uint8 *pal = (const uint8 *)pal0;
+
+ srcpitch -= w;
+ dstpitch -= w*3;
+
+ do {
+ int wt = w;
+ do {
+ const uint8 *pe = &pal[3**src++];
+
+ dst[0] = pe[0];
+ dst[1] = pe[1];
+ dst[2] = pe[2];
+ dst += 3;
+ } while(--wt);
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_PALETTED(Pal8, Any32) {
+ const uint8 *src = (const uint8 *)src0;
+ uint32 *dst = (uint32 *)dst0;
+ const uint32 *pal = (const uint32 *)pal0;
+
+ srcpitch -= w;
+ dstpitch -= w*4;
+
+ do {
+ int wt = w;
+
+ do {
+ *dst++ = pal[*src++];
+ } while(--wt);
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/blt_reference_rgb.cpp b/src/thirdparty/VirtualDub/Kasumi/source/blt_reference_rgb.cpp
new file mode 100644
index 000000000..ea49f260d
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/blt_reference_rgb.cpp
@@ -0,0 +1,310 @@
+#include <vd2/system/vdtypes.h>
+#include <vd2/Kasumi/pixmap.h>
+#include <vd2/Kasumi/pixmaputils.h>
+
+#define DECLARE_RGB(x, y) void VDPixmapBlt_##x##_to_##y##_reference(void *dst0, ptrdiff_t dstpitch, const void *src0, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h)
+
+///////////////////////////////////////////////////////////////////////////
+//
+// RGB blitters: -> XRGB1555
+//
+///////////////////////////////////////////////////////////////////////////
+
+DECLARE_RGB(RGB565, XRGB1555) {
+ const uint16 *src = (const uint16 *)src0;
+ uint16 *dst = (uint16 *)dst0;
+
+ srcpitch -= 2*w;
+ dstpitch -= 2*w;
+
+ do {
+ int wt = w;
+
+ do {
+ const uint32 px = *src++;
+ *dst++ = (px&0x001f) + ((px&0xffc0)>>1);
+ } while(--wt);
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_RGB(RGB888, XRGB1555) {
+ const uint8 *src = (const uint8 *)src0;
+ uint16 *dst = (uint16 *)dst0;
+
+ srcpitch -= 3*w;
+ dstpitch -= 2*w;
+
+ do {
+ int wt = w;
+
+ do {
+ const uint32 r = ((uint32)src[2] & 0xf8) << 7;
+ const uint32 g = ((uint32)src[1] & 0xf8) << 2;
+ const uint32 b = (uint32)src[0] >> 3;
+ src += 3;
+
+ *dst++ = (uint16)(r + g + b);
+ } while(--wt);
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_RGB(XRGB8888, XRGB1555) {
+ const uint8 *src = (const uint8 *)src0;
+ uint16 *dst = (uint16 *)dst0;
+
+ srcpitch -= 4*w;
+ dstpitch -= 2*w;
+
+ do {
+ int wt = w;
+
+ do {
+ const uint32 r = ((uint32)src[2] & 0xf8) << 7;
+ const uint32 g = ((uint32)src[1] & 0xf8) << 2;
+ const uint32 b = (uint32)src[0] >> 3;
+ src += 4;
+
+ *dst++ = (uint16)(r + g + b);
+ } while(--wt);
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+///////////////////////////////////////////////////////////////////////////
+//
+// RGB blitters: -> RGB565
+//
+///////////////////////////////////////////////////////////////////////////
+
+DECLARE_RGB(XRGB1555, RGB565) {
+ const uint16 *src = (const uint16 *)src0;
+ uint16 *dst = (uint16 *)dst0;
+
+ srcpitch -= 2*w;
+ dstpitch -= 2*w;
+
+ do {
+ int wt = w;
+
+ do {
+ const uint32 px = *src++;
+ *dst++ = (uint16)(px + (px&0xffe0) + ((px&0x0200)>>4));
+ } while(--wt);
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_RGB(RGB888, RGB565) {
+ const uint8 *src = (const uint8 *)src0;
+ uint16 *dst = (uint16 *)dst0;
+
+ srcpitch -= 3*w;
+ dstpitch -= 2*w;
+
+ do {
+ int wt = w;
+
+ do {
+ const uint32 r = ((uint32)src[2] & 0xf8) << 8;
+ const uint32 g = ((uint32)src[1] & 0xfc) << 3;
+ const uint32 b = (uint32)src[0] >> 3;
+ src += 3;
+
+ *dst++ = (uint16)(r + g + b);
+ } while(--wt);
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_RGB(XRGB8888, RGB565) {
+ const uint8 *src = (const uint8 *)src0;
+ uint16 *dst = (uint16 *)dst0;
+
+ srcpitch -= 4*w;
+ dstpitch -= 2*w;
+
+ do {
+ int wt = w;
+
+ do {
+ const uint32 r = ((uint32)src[2] & 0xf8) << 8;
+ const uint32 g = ((uint32)src[1] & 0xfc) << 3;
+ const uint32 b = (uint32)src[0] >> 3;
+ src += 4;
+
+ *dst++ = (uint16)(r + g + b);
+ } while(--wt);
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+///////////////////////////////////////////////////////////////////////////
+//
+// RGB blitters: -> RGB888
+//
+///////////////////////////////////////////////////////////////////////////
+
+DECLARE_RGB(XRGB1555, RGB888) {
+ const uint16 *src = (const uint16 *)src0;
+ uint8 *dst = (uint8 *)dst0;
+
+ srcpitch -= 2*w;
+ dstpitch -= 3*w;
+
+ do {
+ int wt = w;
+
+ do {
+ const uint32 px = *src++;
+ uint32 rb = px & 0x7c1f;
+ uint32 g = px & 0x03e0;
+
+ rb += rb<<5;
+ g += g<<5;
+
+ dst[0] = (uint8)(rb>>2);
+ dst[1] = (uint8)(g>>7);
+ dst[2] = (uint8)(rb>>12);
+ dst += 3;
+ } while(--wt);
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_RGB(RGB565, RGB888) {
+ const uint16 *src = (const uint16 *)src0;
+ uint8 *dst = (uint8 *)dst0;
+
+ srcpitch -= 2*w;
+ dstpitch -= 3*w;
+
+ do {
+ int wt = w;
+
+ do {
+ const uint32 px = *src++;
+ uint32 rb = px & 0xf81f;
+ uint32 g = px & 0x07e0;
+
+ rb += rb<<5;
+ g += g<<6;
+
+ dst[0] = (uint8)(rb>>2);
+ dst[1] = (uint8)(g>>9);
+ dst[2] = (uint8)(rb>>13);
+ dst += 3;
+ } while(--wt);
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_RGB(XRGB8888, RGB888) {
+ const uint8 *src = (const uint8 *)src0;
+ uint8 *dst = (uint8 *)dst0;
+
+ srcpitch -= 4*w;
+ dstpitch -= 3*w;
+
+ do {
+ int wt = w;
+
+ do {
+ dst[0] = src[0];
+ dst[1] = src[1];
+ dst[2] = src[2];
+ dst += 3;
+ src += 4;
+ } while(--wt);
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+///////////////////////////////////////////////////////////////////////////
+//
+// RGB blitters: -> XRGB8888
+//
+///////////////////////////////////////////////////////////////////////////
+
+DECLARE_RGB(XRGB1555, XRGB8888) {
+ const uint16 *src = (const uint16 *)src0;
+ uint32 *dst = (uint32 *)dst0;
+
+ srcpitch -= 2*w;
+ dstpitch -= 4*w;
+
+ do {
+ int wt = w;
+
+ do {
+ const uint32 px = *src++;
+ const uint32 rgb = ((px & 0x7c00) << 9) + ((px & 0x03e0) << 6) + ((px & 0x001f) << 3);
+
+ *dst++ = rgb + ((rgb & 0xe0e0e0)>>5);
+ } while(--wt);
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_RGB(RGB565, XRGB8888) {
+ const uint16 *src = (const uint16 *)src0;
+ uint32 *dst = (uint32 *)dst0;
+
+ srcpitch -= 2*w;
+ dstpitch -= 4*w;
+
+ do {
+ int wt = w;
+
+ do {
+ const uint32 px = *src++;
+ const uint32 rb = ((px & 0xf800) << 8) + ((px & 0x001f) << 3);
+ const uint32 g = ((px & 0x07e0) << 5) + (px & 0x0300);
+
+ *dst++ = rb + ((rb & 0xe000e0)>>5) + g;
+ } while(--wt);
+
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
+
+DECLARE_RGB(RGB888, XRGB8888) {
+ const uint8 *src = (const uint8 *)src0;
+ uint32 *dst = (uint32 *)dst0;
+
+ srcpitch -= 3*w;
+ dstpitch -= 4*w;
+
+ do {
+ int wt = w;
+
+ do {
+ *dst++ = (uint32)src[0] + ((uint32)src[1]<<8) + ((uint32)src[2]<<16);
+ src += 3;
+ } while(--wt);
+ vdptrstep(src, srcpitch);
+ vdptrstep(dst, dstpitch);
+ } while(--h);
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/blt_reference_yuv.cpp b/src/thirdparty/VirtualDub/Kasumi/source/blt_reference_yuv.cpp
new file mode 100644
index 000000000..6f40eeaa0
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/blt_reference_yuv.cpp
@@ -0,0 +1,1590 @@
+#include <vd2/system/vdtypes.h>
+#include <vd2/system/vdstl.h>
+#include <vd2/system/cpuaccel.h>
+#include <vd2/system/memory.h>
+#include <vd2/Kasumi/pixmap.h>
+#include <vd2/Kasumi/pixmaputils.h>
+
+#include "blt_spanutils.h"
+
+#ifdef _M_IX86
+ #include "blt_spanutils_x86.h"
+#endif
+
+using namespace nsVDPixmapSpanUtils;
+
+namespace {
+ struct YCbCrToRGB {
+ sint16 y_tab[256];
+ sint16 r_cr_tab[256];
+ sint16 b_cb_tab[256];
+ sint16 g_cr_tab[256];
+ sint16 g_cb_tab[256];
+ uint8 cliptab[277+256+279];
+ uint16 cliptab15[277+256+279];
+ uint16 cliptab16[277+256+279];
+
+ YCbCrToRGB() {
+ int i;
+
+ memset(cliptab, 0, 277);
+ memset(cliptab+277+256, 255, 279);
+
+ memset(cliptab15, 0, sizeof cliptab15[0] * 277);
+ memset(cliptab16, 0, sizeof cliptab16[0] * 277);
+ memset(cliptab15+277+256, 0xff, sizeof cliptab15[0] * 279);
+ memset(cliptab16+277+256, 0xff, sizeof cliptab16[0] * 279);
+
+ for(i=0; i<256; ++i) {
+ y_tab[i] = (sint16)(((i-16) * 76309 + 32768) >> 16);
+ r_cr_tab[i] = (sint16)(((i-128) * 104597 + 32768) >> 16);
+ b_cb_tab[i] = (sint16)(((i-128) * 132201 + 32768) >> 16);
+ g_cr_tab[i] = (sint16)(((i-128) * -53279 + 32768) >> 16);
+ g_cb_tab[i] = (sint16)(((i-128) * -25674 + 32768) >> 16);
+ cliptab[i+277] = (uint8)i;
+ cliptab15[i+277] = 0x421 * ((unsigned)i>>3);
+ cliptab16[i+277] = 0x801 * ((unsigned)i>>3) + 0x20 * ((unsigned)i>>2);
+ }
+ }
+ } colorconv;
+
+ struct YCbCrFormatInfo {
+ ptrdiff_t ystep;
+ ptrdiff_t cstep;
+ ptrdiff_t yinc[4];
+ ptrdiff_t cinc[4];
+ sint8 ypos[4];
+ sint8 cbpos[4];
+ sint8 crpos[4];
+ };
+
+ YCbCrFormatInfo g_formatInfo_YUV444_Planar = { -4, -4, {-1,-1,-1,-1}, {-1,-1,-1,-1}, {0,1,2,3}, {0,1,2,3}, {0,1,2,3}};
+ YCbCrFormatInfo g_formatInfo_YUV422_YUYV = { -8, -8, {-1,-1,-1,-1}, {-1,-1,-1,-1}, {0,2,4,6}, {1,1,5,5}, {3,3,7,7}};
+ YCbCrFormatInfo g_formatInfo_YUV422_UYVY = { -8, -8, {-1,-1,-1,-1}, {-1,-1,-1,-1}, {1,3,5,7}, {0,0,4,4}, {2,2,6,6}};
+ YCbCrFormatInfo g_formatInfo_YUV420_YV12 = { -4, -2, {-1,-1,-1,-1}, { 0,-1, 0,-1}, {0,1,2,3}, {0,0,1,1}, {0,0,1,1}};
+ YCbCrFormatInfo g_formatInfo_YUV411_YV12 = { -4, -1, {-1,-1,-1,-1}, {-1,-1,-1,-1}, {0,1,2,3}, {0,0,0,0}, {0,0,0,0}};
+
+ inline uint16 ycbcr_to_1555(uint8 y, uint8 cb0, uint8 cr0) {
+ const uint16 *p = &colorconv.cliptab15[277 + colorconv.y_tab[y]];
+ uint32 r = 0x7c00 & p[colorconv.r_cr_tab[cr0]];
+ uint32 g = 0x03e0 & p[colorconv.g_cr_tab[cr0] + colorconv.g_cb_tab[cb0]];
+ uint32 b = 0x001f & p[colorconv.b_cb_tab[cb0]];
+
+ return r + g + b;
+ }
+
+ inline uint16 ycbcr_to_565(uint8 y, uint8 cb0, uint8 cr0) {
+ const uint16 *p = &colorconv.cliptab16[277 + colorconv.y_tab[y]];
+ uint32 r = 0xf800 & p[colorconv.r_cr_tab[cr0]];
+ uint32 g = 0x07e0 & p[colorconv.g_cr_tab[cr0] + colorconv.g_cb_tab[cb0]];
+ uint32 b = 0x001f & p[colorconv.b_cb_tab[cb0]];
+
+ return r + g + b;
+ }
+
+ inline void ycbcr_to_888(uint8 *dst, uint8 y, uint8 cb0, uint8 cr0) {
+ const uint8 *p = &colorconv.cliptab[277 + colorconv.y_tab[y]];
+ uint8 r = p[colorconv.r_cr_tab[cr0]];
+ uint8 g = p[colorconv.g_cr_tab[cr0] + colorconv.g_cb_tab[cb0]];
+ uint8 b = p[colorconv.b_cb_tab[cb0]];
+
+ dst[0] = b;
+ dst[1] = g;
+ dst[2] = r;
+ }
+
+ inline uint32 ycbcr_to_8888(uint8 y, uint8 cb0, uint8 cr0) {
+ const uint8 *p = &colorconv.cliptab[277 + colorconv.y_tab[y]];
+ uint8 r = p[colorconv.r_cr_tab[cr0]];
+ uint8 g = p[colorconv.g_cr_tab[cr0] + colorconv.g_cb_tab[cb0]];
+ uint8 b = p[colorconv.b_cb_tab[cb0]];
+
+ return (r << 16) + (g << 8) + b;
+ }
+
+ void VDYCbCrToXRGB1555Span(void *dst0, const uint8 *y, const uint8 *cb, const uint8 *cr, uint32 w) {
+ uint16 *dst = (uint16 *)dst0;
+
+ do {
+ *dst++ = ycbcr_to_1555(*y++, *cb++, *cr++);
+ } while(--w);
+ }
+
+ void VDYCbCrToRGB565Span(void *dst0, const uint8 *y, const uint8 *cb, const uint8 *cr, uint32 w) {
+ uint16 *dst = (uint16 *)dst0;
+
+ do {
+ *dst++ = ycbcr_to_565(*y++, *cb++, *cr++);
+ } while(--w);
+ }
+
+ void VDYCbCrToRGB888Span(void *dst0, const uint8 *y, const uint8 *cb, const uint8 *cr, uint32 w) {
+ uint8 *dst = (uint8 *)dst0;
+
+ do {
+ ycbcr_to_888(dst, *y++, *cb++, *cr++);
+ dst += 3;
+ } while(--w);
+ }
+
+ void VDYCbCrToXRGB8888Span(void *dst0, const uint8 *y, const uint8 *cb, const uint8 *cr, uint32 w) {
+ uint32 *dst = (uint32 *)dst0;
+
+ do {
+ *dst++ = ycbcr_to_8888(*y++, *cb++, *cr++);
+ } while(--w);
+ }
+
+ void VDYCbCrToUYVYSpan(void *dst0, const uint8 *y, const uint8 *cb, const uint8 *cr, uint32 w) {
+ uint32 *dst = (uint32 *)dst0;
+
+ if (--w) {
+ do {
+ *dst++ = (uint32)*cb++ + ((uint32)y[0] << 8) + ((uint32)*cr++ << 16) + ((uint32)y[1] << 24);
+ y += 2;
+ } while((sint32)(w-=2)>0);
+ }
+
+ if (!(w & 1))
+ *dst++ = (uint32)*cb + ((uint32)y[0] << 8) + ((uint32)*cr << 16) + ((uint32)y[0] << 24);
+ }
+
+ void VDYCbCrToYUYVSpan(void *dst0, const uint8 *y, const uint8 *cb, const uint8 *cr, uint32 w) {
+ uint32 *dst = (uint32 *)dst0;
+
+ if (--w) {
+ do {
+ *dst++ = (uint32)y[0] + ((uint32)*cb++ << 8) + ((uint32)y[1] << 16) + ((uint32)*cr++ << 24);
+ y += 2;
+ } while((sint32)(w-=2)>0);
+ }
+
+ if (!(w & 1))
+ *dst++ = (uint32)y[0] + ((uint32)*cb << 8) + ((uint32)y[0] << 16) + ((uint32)*cr << 24);
+ }
+
+ void VDYCbCrToRGB1555Generic(void *dst, ptrdiff_t dststride, const void *yrow, ptrdiff_t ystride, const void *cbrow, ptrdiff_t cbstride, const void *crrow, ptrdiff_t crstride, int w, int h, const YCbCrFormatInfo& formatinfo) {
+ const ptrdiff_t ystep = formatinfo.ystep;
+ const ptrdiff_t cstep = formatinfo.cstep;
+ const ptrdiff_t ypos0 = formatinfo.ypos[0];
+ const ptrdiff_t ypos1 = formatinfo.ypos[1];
+ const ptrdiff_t ypos2 = formatinfo.ypos[2];
+ const ptrdiff_t ypos3 = formatinfo.ypos[3];
+ const ptrdiff_t crpos0 = formatinfo.crpos[0];
+ const ptrdiff_t crpos1 = formatinfo.crpos[1];
+ const ptrdiff_t crpos2 = formatinfo.crpos[2];
+ const ptrdiff_t crpos3 = formatinfo.crpos[3];
+ const ptrdiff_t cbpos0 = formatinfo.cbpos[0];
+ const ptrdiff_t cbpos1 = formatinfo.cbpos[1];
+ const ptrdiff_t cbpos2 = formatinfo.cbpos[2];
+ const ptrdiff_t cbpos3 = formatinfo.cbpos[3];
+
+ yrow = (char *)yrow - ystep * ((w-1) >> 2);
+ crrow = (char *)crrow - cstep * ((w-1) >> 2);
+ cbrow = (char *)cbrow - cstep * ((w-1) >> 2);
+ dst = (char *)dst + 2*((w-1) & ~3);
+
+ int y = 0;
+ do {
+ const uint8 *ysrc = (const uint8 *)yrow;
+ const uint8 *crsrc = (const uint8 *)crrow;
+ const uint8 *cbsrc = (const uint8 *)cbrow;
+ uint16 *out = (uint16 *)dst;
+ int w2 = -w;
+
+ switch(w2 & 3) {
+ do {
+ case 0: out[3] = ycbcr_to_1555(ysrc[ypos3], cbsrc[cbpos3], crsrc[crpos3]);
+ case 1: out[2] = ycbcr_to_1555(ysrc[ypos2], cbsrc[cbpos2], crsrc[crpos2]);
+ case 2: out[1] = ycbcr_to_1555(ysrc[ypos1], cbsrc[cbpos1], crsrc[crpos1]);
+ case 3: out[0] = ycbcr_to_1555(ysrc[ypos0], cbsrc[cbpos0], crsrc[crpos0]);
+ out -= 4;
+ ysrc += ystep;
+ crsrc += cstep;
+ cbsrc += cstep;
+ } while((w2 += 4) < 0);
+ }
+
+ dst = (char *)dst + dststride;
+ yrow = (const char *)yrow + (ystride & formatinfo.yinc[y & 3]);
+ cbrow = (const char *)cbrow + (cbstride & formatinfo.cinc[y & 3]);
+ crrow = (const char *)crrow + (crstride & formatinfo.cinc[y & 3]);
+ } while(++y < h);
+ }
+
+ void VDYCbCrToRGB565Generic(void *dst, ptrdiff_t dststride, const void *yrow, ptrdiff_t ystride, const void *cbrow, ptrdiff_t cbstride, const void *crrow, ptrdiff_t crstride, int w, int h, const YCbCrFormatInfo& formatinfo) {
+ const ptrdiff_t ystep = formatinfo.ystep;
+ const ptrdiff_t cstep = formatinfo.cstep;
+ const ptrdiff_t ypos0 = formatinfo.ypos[0];
+ const ptrdiff_t ypos1 = formatinfo.ypos[1];
+ const ptrdiff_t ypos2 = formatinfo.ypos[2];
+ const ptrdiff_t ypos3 = formatinfo.ypos[3];
+ const ptrdiff_t crpos0 = formatinfo.crpos[0];
+ const ptrdiff_t crpos1 = formatinfo.crpos[1];
+ const ptrdiff_t crpos2 = formatinfo.crpos[2];
+ const ptrdiff_t crpos3 = formatinfo.crpos[3];
+ const ptrdiff_t cbpos0 = formatinfo.cbpos[0];
+ const ptrdiff_t cbpos1 = formatinfo.cbpos[1];
+ const ptrdiff_t cbpos2 = formatinfo.cbpos[2];
+ const ptrdiff_t cbpos3 = formatinfo.cbpos[3];
+
+ yrow = (char *)yrow - ystep * ((w-1) >> 2);
+ crrow = (char *)crrow - cstep * ((w-1) >> 2);
+ cbrow = (char *)cbrow - cstep * ((w-1) >> 2);
+ dst = (char *)dst + 2*((w-1) & ~3);
+
+ int y = 0;
+ do {
+ const uint8 *ysrc = (const uint8 *)yrow;
+ const uint8 *crsrc = (const uint8 *)crrow;
+ const uint8 *cbsrc = (const uint8 *)cbrow;
+ uint16 *out = (uint16 *)dst;
+ int w2 = -w;
+
+ switch(w2 & 3) {
+ do {
+ case 0: out[3] = ycbcr_to_565(ysrc[ypos3], cbsrc[cbpos3], crsrc[crpos3]);
+ case 1: out[2] = ycbcr_to_565(ysrc[ypos2], cbsrc[cbpos2], crsrc[crpos2]);
+ case 2: out[1] = ycbcr_to_565(ysrc[ypos1], cbsrc[cbpos1], crsrc[crpos1]);
+ case 3: out[0] = ycbcr_to_565(ysrc[ypos0], cbsrc[cbpos0], crsrc[crpos0]);
+ out -= 4;
+ ysrc += ystep;
+ crsrc += cstep;
+ cbsrc += cstep;
+ } while((w2 += 4) < 0);
+ }
+
+ dst = (char *)dst + dststride;
+ yrow = (const char *)yrow + (ystride & formatinfo.yinc[y & 3]);
+ cbrow = (const char *)cbrow + (cbstride & formatinfo.cinc[y & 3]);
+ crrow = (const char *)crrow + (crstride & formatinfo.cinc[y & 3]);
+ } while(++y < h);
+ }
+
+ void VDYCbCrToRGB888Generic(void *dst, ptrdiff_t dststride, const void *yrow, ptrdiff_t ystride, const void *cbrow, ptrdiff_t cbstride, const void *crrow, ptrdiff_t crstride, int w, int h, const YCbCrFormatInfo& formatinfo) {
+ const ptrdiff_t ystep = formatinfo.ystep;
+ const ptrdiff_t cstep = formatinfo.cstep;
+ const ptrdiff_t ypos0 = formatinfo.ypos[0];
+ const ptrdiff_t ypos1 = formatinfo.ypos[1];
+ const ptrdiff_t ypos2 = formatinfo.ypos[2];
+ const ptrdiff_t ypos3 = formatinfo.ypos[3];
+ const ptrdiff_t crpos0 = formatinfo.crpos[0];
+ const ptrdiff_t crpos1 = formatinfo.crpos[1];
+ const ptrdiff_t crpos2 = formatinfo.crpos[2];
+ const ptrdiff_t crpos3 = formatinfo.crpos[3];
+ const ptrdiff_t cbpos0 = formatinfo.cbpos[0];
+ const ptrdiff_t cbpos1 = formatinfo.cbpos[1];
+ const ptrdiff_t cbpos2 = formatinfo.cbpos[2];
+ const ptrdiff_t cbpos3 = formatinfo.cbpos[3];
+
+ yrow = (char *)yrow - ystep * ((w-1) >> 2);
+ crrow = (char *)crrow - cstep * ((w-1) >> 2);
+ cbrow = (char *)cbrow - cstep * ((w-1) >> 2);
+ dst = (char *)dst + 3*((w-1) & ~3);
+
+ int y = 0;
+ do {
+ const uint8 *ysrc = (const uint8 *)yrow;
+ const uint8 *crsrc = (const uint8 *)crrow;
+ const uint8 *cbsrc = (const uint8 *)cbrow;
+ uint8 *out = (uint8 *)dst;
+ int w2 = -w;
+
+ switch(w2 & 3) {
+ do {
+ case 0: ycbcr_to_888(out+9, ysrc[ypos3], cbsrc[cbpos3], crsrc[crpos3]);
+ case 1: ycbcr_to_888(out+6, ysrc[ypos2], cbsrc[cbpos2], crsrc[crpos2]);
+ case 2: ycbcr_to_888(out+3, ysrc[ypos1], cbsrc[cbpos1], crsrc[crpos1]);
+ case 3: ycbcr_to_888(out, ysrc[ypos0], cbsrc[cbpos0], crsrc[crpos0]);
+ out -= 12;
+ ysrc += ystep;
+ crsrc += cstep;
+ cbsrc += cstep;
+ } while((w2 += 4) < 0);
+ }
+
+ dst = (char *)dst + dststride;
+ yrow = (const char *)yrow + (ystride & formatinfo.yinc[y & 3]);
+ cbrow = (const char *)cbrow + (cbstride & formatinfo.cinc[y & 3]);
+ crrow = (const char *)crrow + (crstride & formatinfo.cinc[y & 3]);
+ } while(++y < h);
+ }
+
+ void VDYCbCrToRGB8888Generic(void *dst, ptrdiff_t dststride, const void *yrow, ptrdiff_t ystride, const void *cbrow, ptrdiff_t cbstride, const void *crrow, ptrdiff_t crstride, int w, int h, const YCbCrFormatInfo& formatinfo) {
+ const ptrdiff_t ystep = formatinfo.ystep;
+ const ptrdiff_t cstep = formatinfo.cstep;
+ const ptrdiff_t ypos0 = formatinfo.ypos[0];
+ const ptrdiff_t ypos1 = formatinfo.ypos[1];
+ const ptrdiff_t ypos2 = formatinfo.ypos[2];
+ const ptrdiff_t ypos3 = formatinfo.ypos[3];
+ const ptrdiff_t crpos0 = formatinfo.crpos[0];
+ const ptrdiff_t crpos1 = formatinfo.crpos[1];
+ const ptrdiff_t crpos2 = formatinfo.crpos[2];
+ const ptrdiff_t crpos3 = formatinfo.crpos[3];
+ const ptrdiff_t cbpos0 = formatinfo.cbpos[0];
+ const ptrdiff_t cbpos1 = formatinfo.cbpos[1];
+ const ptrdiff_t cbpos2 = formatinfo.cbpos[2];
+ const ptrdiff_t cbpos3 = formatinfo.cbpos[3];
+
+ yrow = (char *)yrow - ystep * ((w-1) >> 2);
+ crrow = (char *)crrow - cstep * ((w-1) >> 2);
+ cbrow = (char *)cbrow - cstep * ((w-1) >> 2);
+ dst = (char *)dst + 4*((w-1) & ~3);
+
+ int y = 0;
+ do {
+ const uint8 *ysrc = (const uint8 *)yrow;
+ const uint8 *crsrc = (const uint8 *)crrow;
+ const uint8 *cbsrc = (const uint8 *)cbrow;
+ uint32 *out = (uint32 *)dst;
+ int w2 = -w;
+
+ switch(w2 & 3) {
+ do {
+ case 0: out[3] = ycbcr_to_8888(ysrc[ypos3], cbsrc[cbpos3], crsrc[crpos3]);
+ case 1: out[2] = ycbcr_to_8888(ysrc[ypos2], cbsrc[cbpos2], crsrc[crpos2]);
+ case 2: out[1] = ycbcr_to_8888(ysrc[ypos1], cbsrc[cbpos1], crsrc[crpos1]);
+ case 3: out[0] = ycbcr_to_8888(ysrc[ypos0], cbsrc[cbpos0], crsrc[crpos0]);
+ out -= 4;
+ ysrc += ystep;
+ crsrc += cstep;
+ cbsrc += cstep;
+ } while((w2 += 4) < 0);
+ }
+
+ dst = (char *)dst + dststride;
+ yrow = (const char *)yrow + (ystride & formatinfo.yinc[y & 3]);
+ cbrow = (const char *)cbrow + (cbstride & formatinfo.cinc[y & 3]);
+ crrow = (const char *)crrow + (crstride & formatinfo.cinc[y & 3]);
+ } while(++y < h);
+ }
+}
+
+#define DECLARE_YUV(x, y) void VDPixmapBlt_##x##_to_##y##_reference(void *dst0, ptrdiff_t dstpitch, const void *src0, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h)
+
+DECLARE_YUV(UYVY, XRGB1555) {
+ do {
+ const uint8 *src = (const uint8 *)src0;
+ uint16 *dst = (uint16 *)dst0;
+
+ // convert first pixel
+ int cb, cr;
+ int rc0, gc0, bc0, rc1, gc1, bc1;
+ const uint16 *y;
+
+ cb = src[0];
+ cr = src[2];
+ rc1 = colorconv.r_cr_tab[cr];
+ gc1 = colorconv.g_cr_tab[cr] + colorconv.g_cb_tab[cb];
+ bc1 = colorconv.b_cb_tab[cb];
+
+ y = &colorconv.cliptab15[277 + colorconv.y_tab[src[1]]];
+ *dst++ = (y[rc1] & 0x7c00) + (y[gc1] & 0x3e0) + (y[bc1] & 0x001f);
+
+ // convert pairs of pixels
+ int w2 = w;
+
+ if ((w2 -= 2) > 0) {
+ do {
+ rc0 = rc1;
+ gc0 = gc1;
+ bc0 = bc1;
+
+ cb = src[4];
+ cr = src[6];
+ rc1 = colorconv.r_cr_tab[cr];
+ gc1 = colorconv.g_cr_tab[cr] + colorconv.g_cb_tab[cb];
+ bc1 = colorconv.b_cb_tab[cb];
+
+ y = &colorconv.cliptab15[277 + colorconv.y_tab[src[3]]];
+ dst[0] = (y[(rc0+rc1+1)>>1] & 0x7c00) + (y[(gc0+gc1+1)>>1] & 0x3e0) + (y[(bc0+bc1+1)>>1] & 0x001f);
+
+ y = &colorconv.cliptab15[277 + colorconv.y_tab[src[5]]];
+ dst[1] = (y[rc1] & 0x7c00) + (y[gc1] & 0x3e0) + (y[bc1] & 0x001f);
+
+ dst += 2;
+ src += 4;
+ } while((w2 -= 2) > 0);
+ }
+
+ // handle oddballs
+ if (!(w2 & 1)) {
+ y = &colorconv.cliptab15[277 + colorconv.y_tab[src[3]]];
+ *dst = (y[rc1] & 0x7c00) + (y[gc1] & 0x3e0) + (y[bc1] & 0x001f);
+ }
+
+ vdptrstep(src0, srcpitch);
+ vdptrstep(dst0, dstpitch);
+ } while(--h);
+}
+
+DECLARE_YUV(UYVY, RGB565) {
+ do {
+ const uint8 *src = (const uint8 *)src0;
+ uint16 *dst = (uint16 *)dst0;
+
+ // convert first pixel
+ int cb, cr;
+ int rc0, gc0, bc0, rc1, gc1, bc1;
+ const uint16 *y;
+
+ cb = src[0];
+ cr = src[2];
+ rc1 = colorconv.r_cr_tab[cr];
+ gc1 = colorconv.g_cr_tab[cr] + colorconv.g_cb_tab[cb];
+ bc1 = colorconv.b_cb_tab[cb];
+
+ y = &colorconv.cliptab16[277 + colorconv.y_tab[src[1]]];
+ *dst++ = (y[rc1] & 0xf800) + (y[gc1] & 0x7e0) + (y[bc1] & 0x001f);
+
+ // convert pairs of pixels
+ int w2 = w;
+
+ if ((w2 -= 2) > 0) {
+ do {
+ rc0 = rc1;
+ gc0 = gc1;
+ bc0 = bc1;
+
+ cb = src[4];
+ cr = src[6];
+ rc1 = colorconv.r_cr_tab[cr];
+ gc1 = colorconv.g_cr_tab[cr] + colorconv.g_cb_tab[cb];
+ bc1 = colorconv.b_cb_tab[cb];
+
+ y = &colorconv.cliptab16[277 + colorconv.y_tab[src[3]]];
+ dst[0] = (y[(rc0+rc1+1)>>1] & 0xf800) + (y[(gc0+gc1+1)>>1] & 0x7e0) + (y[(bc0+bc1+1)>>1] & 0x001f);
+
+ y = &colorconv.cliptab16[277 + colorconv.y_tab[src[5]]];
+ dst[1] = (y[rc1] & 0xf800) + (y[gc1] & 0x7e0) + (y[bc1] & 0x001f);
+
+ dst += 2;
+ src += 4;
+ } while((w2 -= 2) > 0);
+ }
+
+ // handle oddballs
+ if (!(w2 & 1)) {
+ y = &colorconv.cliptab16[277 + colorconv.y_tab[src[3]]];
+ *dst = (y[rc1] & 0xf800) + (y[gc1] & 0x7e0) + (y[bc1] & 0x001f);
+ }
+
+ vdptrstep(src0, srcpitch);
+ vdptrstep(dst0, dstpitch);
+ } while(--h);
+}
+
+DECLARE_YUV(UYVY, RGB888) {
+ do {
+ const uint8 *src = (const uint8 *)src0;
+ uint8 *dst = (uint8 *)dst0;
+
+ // convert first pixel
+ int cb, cr;
+ int rc0, gc0, bc0, rc1, gc1, bc1;
+ const uint8 *y;
+
+ cb = src[0];
+ cr = src[2];
+ rc1 = colorconv.r_cr_tab[cr];
+ gc1 = colorconv.g_cr_tab[cr] + colorconv.g_cb_tab[cb];
+ bc1 = colorconv.b_cb_tab[cb];
+
+ y = &colorconv.cliptab[277 + colorconv.y_tab[src[1]]];
+ dst[0] = y[bc1];
+ dst[1] = y[gc1];
+ dst[2] = y[rc1];
+ dst += 3;
+
+ // convert pairs of pixels
+ int w2 = w;
+
+ if ((w2 -= 2) > 0) {
+ do {
+ rc0 = rc1;
+ gc0 = gc1;
+ bc0 = bc1;
+
+ cb = src[4];
+ cr = src[6];
+ rc1 = colorconv.r_cr_tab[cr];
+ gc1 = colorconv.g_cr_tab[cr] + colorconv.g_cb_tab[cb];
+ bc1 = colorconv.b_cb_tab[cb];
+
+ y = &colorconv.cliptab[277 + colorconv.y_tab[src[3]]];
+ dst[0] = y[(bc0+bc1+1)>>1];
+ dst[1] = y[(gc0+gc1+1)>>1];
+ dst[2] = y[(rc0+rc1+1)>>1];
+
+ y = &colorconv.cliptab[277 + colorconv.y_tab[src[5]]];
+ dst[3] = y[bc1];
+ dst[4] = y[gc1];
+ dst[5] = y[rc1];
+
+ dst += 6;
+ src += 4;
+ } while((w2 -= 2) > 0);
+ }
+
+ // handle oddballs
+ if (!(w2 & 1)) {
+ y = &colorconv.cliptab[277 + colorconv.y_tab[src[3]]];
+ dst[0] = y[bc1];
+ dst[1] = y[gc1];
+ dst[2] = y[rc1];
+ }
+
+ vdptrstep(src0, srcpitch);
+ vdptrstep(dst0, dstpitch);
+ } while(--h);
+}
+
+DECLARE_YUV(UYVY, XRGB8888) {
+ do {
+ const uint8 *src = (const uint8 *)src0;
+ uint8 *dst = (uint8 *)dst0;
+
+ // convert first pixel
+ int cb, cr;
+ int rc0, gc0, bc0, rc1, gc1, bc1;
+ const uint8 *y;
+
+ cb = src[0];
+ cr = src[2];
+ rc1 = colorconv.r_cr_tab[cr];
+ gc1 = colorconv.g_cr_tab[cr] + colorconv.g_cb_tab[cb];
+ bc1 = colorconv.b_cb_tab[cb];
+
+ y = &colorconv.cliptab[277 + colorconv.y_tab[src[1]]];
+ dst[0] = y[bc1];
+ dst[1] = y[gc1];
+ dst[2] = y[rc1];
+ dst += 4;
+
+ // convert pairs of pixels
+ int w2 = w;
+
+ if ((w2 -= 2) > 0) {
+ do {
+ rc0 = rc1;
+ gc0 = gc1;
+ bc0 = bc1;
+
+ cb = src[4];
+ cr = src[6];
+ rc1 = colorconv.r_cr_tab[cr];
+ gc1 = colorconv.g_cr_tab[cr] + colorconv.g_cb_tab[cb];
+ bc1 = colorconv.b_cb_tab[cb];
+
+ y = &colorconv.cliptab[277 + colorconv.y_tab[src[3]]];
+ dst[0] = y[(bc0+bc1+1)>>1];
+ dst[1] = y[(gc0+gc1+1)>>1];
+ dst[2] = y[(rc0+rc1+1)>>1];
+
+ y = &colorconv.cliptab[277 + colorconv.y_tab[src[5]]];
+ dst[4] = y[bc1];
+ dst[5] = y[gc1];
+ dst[6] = y[rc1];
+
+ dst += 8;
+ src += 4;
+ } while((w2 -= 2) > 0);
+ }
+
+ // handle oddballs
+ if (!(w2 & 1)) {
+ y = &colorconv.cliptab[277 + colorconv.y_tab[src[3]]];
+ dst[0] = y[bc1];
+ dst[1] = y[gc1];
+ dst[2] = y[rc1];
+ }
+
+ vdptrstep(src0, srcpitch);
+ vdptrstep(dst0, dstpitch);
+ } while(--h);
+}
+
+DECLARE_YUV(YUYV, XRGB1555) {
+ do {
+ const uint8 *src = (const uint8 *)src0;
+ uint16 *dst = (uint16 *)dst0;
+
+ // convert first pixel
+ int cb, cr;
+ int rc0, gc0, bc0, rc1, gc1, bc1;
+ const uint16 *y;
+
+ cb = src[1];
+ cr = src[3];
+ rc1 = colorconv.r_cr_tab[cr];
+ gc1 = colorconv.g_cr_tab[cr] + colorconv.g_cb_tab[cb];
+ bc1 = colorconv.b_cb_tab[cb];
+
+ y = &colorconv.cliptab15[277 + colorconv.y_tab[src[0]]];
+ *dst++ = (y[rc1] & 0x7c00) + (y[gc1] & 0x3e0) + (y[bc1] & 0x001f);
+
+ // convert pairs of pixels
+ int w2 = w;
+
+ if ((w2 -= 2) > 0) {
+ do {
+ rc0 = rc1;
+ gc0 = gc1;
+ bc0 = bc1;
+
+ cb = src[5];
+ cr = src[7];
+ rc1 = colorconv.r_cr_tab[cr];
+ gc1 = colorconv.g_cr_tab[cr] + colorconv.g_cb_tab[cb];
+ bc1 = colorconv.b_cb_tab[cb];
+
+ y = &colorconv.cliptab15[277 + colorconv.y_tab[src[2]]];
+ dst[0] = (y[(rc0+rc1+1)>>1] & 0x7c00) + (y[(gc0+gc1+1)>>1] & 0x3e0) + (y[(bc0+bc1+1)>>1] & 0x001f);
+
+ y = &colorconv.cliptab15[277 + colorconv.y_tab[src[4]]];
+ dst[1] = (y[rc1] & 0x7c00) + (y[gc1] & 0x3e0) + (y[bc1] & 0x001f);
+
+ dst += 2;
+ src += 4;
+ } while((w2 -= 2) > 0);
+ }
+
+ // handle oddballs
+ if (!(w2 & 1)) {
+ y = &colorconv.cliptab15[277 + colorconv.y_tab[src[2]]];
+ *dst = (y[rc1] & 0x7c00) + (y[gc1] & 0x3e0) + (y[bc1] & 0x001f);
+ }
+
+ vdptrstep(src0, srcpitch);
+ vdptrstep(dst0, dstpitch);
+ } while(--h);
+}
+
+DECLARE_YUV(YUYV, RGB565) {
+	// Convert a YUYV (YUY2) 4:2:2 scanline image to 16-bit RGB565.
+	// Identical structure to the XRGB1555 variant; only the clip table
+	// (cliptab16) and the channel masks differ.
+	do {
+		const uint8 *src = (const uint8 *)src0;
+		uint16 *dst = (uint16 *)dst0;
+
+		// convert first pixel (co-sited with the first chroma sample)
+		int cb, cr;
+		int rc0, gc0, bc0, rc1, gc1, bc1;
+		const uint16 *y;
+
+		cb = src[1];
+		cr = src[3];
+		rc1 = colorconv.r_cr_tab[cr];
+		gc1 = colorconv.g_cr_tab[cr] + colorconv.g_cb_tab[cb];
+		bc1 = colorconv.b_cb_tab[cb];
+
+		y = &colorconv.cliptab16[277 + colorconv.y_tab[src[0]]];
+		*dst++ = (y[rc1] & 0xf800) + (y[gc1] & 0x7e0) + (y[bc1] & 0x001f);
+
+		// convert pairs of pixels
+		int w2 = w;
+
+		if ((w2 -= 2) > 0) {
+			do {
+				rc0 = rc1;
+				gc0 = gc1;
+				bc0 = bc1;
+
+				cb = src[5];
+				cr = src[7];
+				rc1 = colorconv.r_cr_tab[cr];
+				gc1 = colorconv.g_cr_tab[cr] + colorconv.g_cb_tab[cb];
+				bc1 = colorconv.b_cb_tab[cb];
+
+				// odd pixel: interpolated chroma; even pixel: co-sited chroma
+				y = &colorconv.cliptab16[277 + colorconv.y_tab[src[2]]];
+				dst[0] = (y[(rc0+rc1+1)>>1] & 0xf800) + (y[(gc0+gc1+1)>>1] & 0x7e0) + (y[(bc0+bc1+1)>>1] & 0x001f);
+
+				y = &colorconv.cliptab16[277 + colorconv.y_tab[src[4]]];
+				dst[1] = (y[rc1] & 0xf800) + (y[gc1] & 0x7e0) + (y[bc1] & 0x001f);
+
+				dst += 2;
+				src += 4;
+			} while((w2 -= 2) > 0);
+		}
+
+		// handle oddballs (trailing pixel for even widths; see 1555 variant)
+		if (!(w2 & 1)) {
+			y = &colorconv.cliptab16[277 + colorconv.y_tab[src[2]]];
+			*dst = (y[rc1] & 0xf800) + (y[gc1] & 0x7e0) + (y[bc1] & 0x001f);
+		}
+
+		vdptrstep(src0, srcpitch);
+		vdptrstep(dst0, dstpitch);
+	} while(--h);
+}
+
+DECLARE_YUV(YUYV, RGB888) {
+	// Convert a YUYV (YUY2) 4:2:2 scanline image to 24-bit BGR (RGB888).
+	// Output byte order is B, G, R (3 bytes/pixel); chroma for odd pixels
+	// is averaged from the neighboring chroma pairs.
+	do {
+		const uint8 *src = (const uint8 *)src0;
+		uint8 *dst = (uint8 *)dst0;
+
+		// convert first pixel
+		int cb, cr;
+		int rc0, gc0, bc0, rc1, gc1, bc1;
+		const uint8 *y;
+
+		cb = src[1];
+		cr = src[3];
+		rc1 = colorconv.r_cr_tab[cr];
+		gc1 = colorconv.g_cr_tab[cr] + colorconv.g_cb_tab[cb];
+		bc1 = colorconv.b_cb_tab[cb];
+
+		y = &colorconv.cliptab[277 + colorconv.y_tab[src[0]]];
+		dst[0] = y[bc1];
+		dst[1] = y[gc1];
+		dst[2] = y[rc1];
+		dst += 3;
+
+		// convert pairs of pixels
+		int w2 = w;
+
+		if ((w2 -= 2) > 0) {
+			do {
+				rc0 = rc1;
+				gc0 = gc1;
+				bc0 = bc1;
+
+				cb = src[5];
+				cr = src[7];
+				rc1 = colorconv.r_cr_tab[cr];
+				gc1 = colorconv.g_cr_tab[cr] + colorconv.g_cb_tab[cb];
+				bc1 = colorconv.b_cb_tab[cb];
+
+				// odd pixel: interpolated chroma
+				y = &colorconv.cliptab[277 + colorconv.y_tab[src[2]]];
+				dst[0] = y[(bc0+bc1+1)>>1];
+				dst[1] = y[(gc0+gc1+1)>>1];
+				dst[2] = y[(rc0+rc1+1)>>1];
+
+				// even pixel: co-sited chroma
+				y = &colorconv.cliptab[277 + colorconv.y_tab[src[4]]];
+				dst[3] = y[bc1];
+				dst[4] = y[gc1];
+				dst[5] = y[rc1];
+
+				dst += 6;
+				src += 4;
+			} while((w2 -= 2) > 0);
+		}
+
+		// handle oddballs (trailing pixel exists only for even widths)
+		if (!(w2 & 1)) {
+			y = &colorconv.cliptab[277 + colorconv.y_tab[src[2]]];
+			dst[0] = y[bc1];
+			dst[1] = y[gc1];
+			dst[2] = y[rc1];
+		}
+
+		vdptrstep(src0, srcpitch);
+		vdptrstep(dst0, dstpitch);
+	} while(--h);
+}
+
+DECLARE_YUV(YUYV, XRGB8888) {
+	// Convert a YUYV (YUY2) 4:2:2 scanline image to 32-bit XRGB8888.
+	// Same scheme as the RGB888 variant, but 4 bytes/pixel; the X (alpha)
+	// byte at offset 3 is left unwritten.
+	do {
+		const uint8 *src = (const uint8 *)src0;
+		uint8 *dst = (uint8 *)dst0;
+
+		// convert first pixel
+		int cb, cr;
+		int rc0, gc0, bc0, rc1, gc1, bc1;
+		const uint8 *y;
+
+		cb = src[1];
+		cr = src[3];
+		rc1 = colorconv.r_cr_tab[cr];
+		gc1 = colorconv.g_cr_tab[cr] + colorconv.g_cb_tab[cb];
+		bc1 = colorconv.b_cb_tab[cb];
+
+		y = &colorconv.cliptab[277 + colorconv.y_tab[src[0]]];
+		dst[0] = y[bc1];
+		dst[1] = y[gc1];
+		dst[2] = y[rc1];
+		dst += 4;
+
+		// convert pairs of pixels
+		int w2 = w;
+
+		if ((w2 -= 2) > 0) {
+			do {
+				rc0 = rc1;
+				gc0 = gc1;
+				bc0 = bc1;
+
+				cb = src[5];
+				cr = src[7];
+				rc1 = colorconv.r_cr_tab[cr];
+				gc1 = colorconv.g_cr_tab[cr] + colorconv.g_cb_tab[cb];
+				bc1 = colorconv.b_cb_tab[cb];
+
+				// odd pixel: interpolated chroma
+				y = &colorconv.cliptab[277 + colorconv.y_tab[src[2]]];
+				dst[0] = y[(bc0+bc1+1)>>1];
+				dst[1] = y[(gc0+gc1+1)>>1];
+				dst[2] = y[(rc0+rc1+1)>>1];
+
+				// even pixel: co-sited chroma (second 4-byte slot)
+				y = &colorconv.cliptab[277 + colorconv.y_tab[src[4]]];
+				dst[4] = y[bc1];
+				dst[5] = y[gc1];
+				dst[6] = y[rc1];
+
+				dst += 8;
+				src += 4;
+			} while((w2 -= 2) > 0);
+		}
+
+		// handle oddballs (trailing pixel exists only for even widths)
+		if (!(w2 & 1)) {
+			y = &colorconv.cliptab[277 + colorconv.y_tab[src[2]]];
+			dst[0] = y[bc1];
+			dst[1] = y[gc1];
+			dst[2] = y[rc1];
+		}
+
+		vdptrstep(src0, srcpitch);
+		vdptrstep(dst0, dstpitch);
+	} while(--h);
+}
+
+DECLARE_YUV(Y8, XRGB1555) {
+	// Expand a grayscale (Y-only) plane to 15-bit XRGB1555: each output
+	// pixel is the scaled, clipped luma replicated into R/G/B via cliptab15.
+	const uint8 *srcrow = (const uint8 *)src0;
+	uint16 *dstrow = (uint16 *)dst0;
+
+	do {
+		const uint8 *s = srcrow;
+		uint16 *d = dstrow;
+
+		for(vdpixsize x = w; x; --x)
+			*d++ = colorconv.cliptab15[colorconv.y_tab[*s++] + 277];
+
+		vdptrstep(srcrow, srcpitch);
+		vdptrstep(dstrow, dstpitch);
+	} while(--h);
+}
+
+DECLARE_YUV(Y8, RGB565) {
+	// Expand a grayscale (Y-only) plane to 16-bit RGB565: each output
+	// pixel is the scaled, clipped luma replicated into R/G/B via cliptab16.
+	const uint8 *srcrow = (const uint8 *)src0;
+	uint16 *dstrow = (uint16 *)dst0;
+
+	do {
+		const uint8 *s = srcrow;
+		uint16 *d = dstrow;
+
+		for(vdpixsize x = w; x; --x)
+			*d++ = colorconv.cliptab16[colorconv.y_tab[*s++] + 277];
+
+		vdptrstep(srcrow, srcpitch);
+		vdptrstep(dstrow, dstpitch);
+	} while(--h);
+}
+
+DECLARE_YUV(Y8, RGB888) {
+	// Expand a grayscale (Y-only) plane to 24-bit RGB888: the converted
+	// luma value is written to all three channel bytes of each pixel.
+	const uint8 *srcrow = (const uint8 *)src0;
+	uint8 *dstrow = (uint8 *)dst0;
+
+	do {
+		uint8 *d = dstrow;
+
+		for(vdpixsize x = 0; x < w; ++x) {
+			const uint8 v = colorconv.cliptab[colorconv.y_tab[srcrow[x]] + 277];
+			d[0] = v;
+			d[1] = v;
+			d[2] = v;
+			d += 3;
+		}
+
+		vdptrstep(srcrow, srcpitch);
+		vdptrstep(dstrow, dstpitch);
+	} while(--h);
+}
+
+DECLARE_YUV(Y8, XRGB8888) {
+	// Expand a grayscale (Y-only) plane to 32-bit XRGB8888: multiplying the
+	// converted luma by 0x010101 replicates it into the B, G and R bytes.
+	const uint8 *srcrow = (const uint8 *)src0;
+	uint32 *dstrow = (uint32 *)dst0;
+
+	do {
+		for(vdpixsize x = 0; x < w; ++x)
+			dstrow[x] = colorconv.cliptab[colorconv.y_tab[srcrow[x]] + 277] * 0x010101;
+
+		vdptrstep(srcrow, srcpitch);
+		vdptrstep(dstrow, dstpitch);
+	} while(--h);
+}
+
+#define DECLARE_YUV_PLANAR(x, y) void VDPixmapBlt_##x##_to_##y##_reference(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h)
+
+
+namespace {
+ typedef void (*tpYUVPlanarFinalDecoder)(void *, const uint8 *, const uint8 *, const uint8 *, uint32);
+ typedef void (*tpYUVPlanarHorizDecoder)(uint8 *dst, const uint8 *src, sint32 w);
+ typedef void (*tpYUVPlanarVertDecoder)(uint8 *dst, const uint8 *const *srcs, sint32 w, uint8 phase);
+}
+
+#ifdef _M_IX86
+ extern "C" void __cdecl vdasm_pixblt_YUV444Planar_to_XRGB1555_scan_MMX(void *dst, const uint8 *y, const uint8 *cb, const uint8 *cr, uint32 count);
+ extern "C" void __cdecl vdasm_pixblt_YUV444Planar_to_RGB565_scan_MMX(void *dst, const uint8 *y, const uint8 *cb, const uint8 *cr, uint32 count);
+ extern "C" void __cdecl vdasm_pixblt_YUV444Planar_to_XRGB8888_scan_MMX(void *dst, const uint8 *y, const uint8 *cb, const uint8 *cr, uint32 count);
+#endif
+
+
+void VDPixmapBlt_YUVPlanar_decode_reference(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h) {
+	// Generic planar-YCbCr -> packed-RGB/422 decoder.  Per output scanline,
+	// the chroma planes are optionally upsampled vertically (vfunc) and
+	// resampled horizontally (hfunc) into scratch buffers, then a per-format
+	// span converter (dfunc) emits the final pixels.
+	const VDPixmapFormatInfo& srcinfo = VDPixmapGetInfo(src.format);
+	int hbits = srcinfo.auxwbits;
+	int vbits = srcinfo.auxhbits;
+
+	// packed 4:2:2 sources behave like half-width chroma planes
+	if (src.format == nsVDPixmap::kPixFormat_YUV422_UYVY || src.format == nsVDPixmap::kPixFormat_YUV422_YUYV)
+		hbits = 1;
+
+	bool h_coaligned = true;
+	bool v_coaligned = false;
+
+	if (src.format == nsVDPixmap::kPixFormat_YUV422_Planar_Centered ||
+		src.format == nsVDPixmap::kPixFormat_YUV420_Planar_Centered) {
+		h_coaligned = false;
+	}
+
+	tpYUVPlanarVertDecoder vfunc = NULL;
+	tpYUVPlanarHorizDecoder hfunc = NULL;
+	uint32 horiz_buffer_size = 0;
+	uint32 vert_buffer_size = 0;
+	uint32 horiz_count = 0;
+	// yaccum/yinc: fixed-point vertical phase in 1/8ths of a chroma row;
+	// reaching 8 advances the chroma row window (cb0/cb1, cr0/cr1).
+	sint32 yaccum = 8;
+	sint32 yinc = 8;
+	uint32 yleft = h;
+
+	switch(vbits*2+v_coaligned) {
+	case 0:		// 4:4:4, 4:2:2
+	case 1:
+		break;
+	case 2:		// 4:2:0 (centered)
+		vfunc = vert_expand2x_centered;
+		// NOTE(review): chroma_srcwidth below is ceil(w>>1); w>>1 may be one
+		// byte short for odd widths -- TODO confirm against table rounding
+		vert_buffer_size = w>>1;
+		yaccum = 6;
+		yinc = 4;
+		yleft >>= 1;
+		break;
+	case 4:		// 4:1:0 (centered)
+		vfunc = vert_expand4x_centered;
+		vert_buffer_size = w>>2;
+		yaccum = 5;
+		yinc = 2;
+		yleft >>= 2;
+		break;
+	default:
+		VDNEVERHERE;
+		return;
+	}
+
+	--yleft;
+
+	tpYUVPlanarFinalDecoder dfunc = NULL;
+
+#ifdef _M_IX86
+	uint32 cpuflags = CPUGetEnabledExtensions();
+
+	if (cpuflags & CPUF_SUPPORTS_MMX) {
+		if (cpuflags & CPUF_SUPPORTS_INTEGER_SSE) {
+			if (vfunc == vert_expand2x_centered)
+				vfunc = vert_expand2x_centered_ISSE;
+		}
+
+		// MMX fast paths for the RGB targets
+		switch(dst.format) {
+		case nsVDPixmap::kPixFormat_XRGB1555:	dfunc = vdasm_pixblt_YUV444Planar_to_XRGB1555_scan_MMX;	break;
+		case nsVDPixmap::kPixFormat_RGB565:		dfunc = vdasm_pixblt_YUV444Planar_to_RGB565_scan_MMX;	break;
+		case nsVDPixmap::kPixFormat_XRGB8888:	dfunc = vdasm_pixblt_YUV444Planar_to_XRGB8888_scan_MMX;	break;
+		}
+	}
+#endif
+
+	// for packed 4:2:2 targets the chroma only needs to reach half width
+	bool halfchroma = false;
+
+	if (!dfunc) {
+		switch(dst.format) {
+		case nsVDPixmap::kPixFormat_XRGB1555:		dfunc = VDYCbCrToXRGB1555Span;	break;
+		case nsVDPixmap::kPixFormat_RGB565:			dfunc = VDYCbCrToRGB565Span;	break;
+		case nsVDPixmap::kPixFormat_RGB888:			dfunc = VDYCbCrToRGB888Span;	break;
+		case nsVDPixmap::kPixFormat_XRGB8888:		dfunc = VDYCbCrToXRGB8888Span;	break;
+		case nsVDPixmap::kPixFormat_YUV422_UYVY:	dfunc = VDYCbCrToUYVYSpan;		halfchroma = true;	break;
+		case nsVDPixmap::kPixFormat_YUV422_YUYV:	dfunc = VDYCbCrToYUYVSpan;		halfchroma = true;	break;
+		default:
+			VDNEVERHERE;
+			return;
+		}
+	}
+
+	// select the horizontal chroma resampler from source subsampling,
+	// alignment, and whether the target wants half-width chroma
+	switch(hbits*2+h_coaligned) {
+	case 0:		// 4:4:4
+	case 1:
+		if (halfchroma) {
+			hfunc = horiz_compress2x_coaligned;
+			horiz_buffer_size = (w + 1) >> 1;
+			horiz_count = w;
+		}
+		break;
+	case 2:		// 4:2:0 MPEG-1 (centered)
+		if (halfchroma) {
+			hfunc = horiz_realign_to_coaligned;
+			horiz_buffer_size = (w + 1) >> 1;
+			horiz_count = (w + 1) >> 1;
+		} else {
+			hfunc = horiz_expand2x_centered;
+			horiz_buffer_size = w;
+			horiz_count = w;
+		}
+		break;
+	case 3:		// 4:2:0/4:2:2 MPEG-2 (coaligned)
+		if (!halfchroma) {
+			hfunc = horiz_expand2x_coaligned;
+			horiz_buffer_size = w;
+			horiz_count = w;
+		}
+		break;
+	case 5:		// 4:1:1 (coaligned)
+		if (halfchroma) {
+			hfunc = horiz_expand2x_coaligned;
+			horiz_buffer_size = (w + 1) >> 1;
+			horiz_count = (w + 1) >> 1;
+		} else {
+			hfunc = horiz_expand4x_coaligned;
+			horiz_buffer_size = w;
+			horiz_count = w;
+		}
+		break;
+
+	default:
+		VDNEVERHERE;
+		return;
+	}
+
+#ifdef _M_IX86
+	if (cpuflags & CPUF_SUPPORTS_INTEGER_SSE) {
+		if (hfunc == horiz_expand2x_coaligned)
+			hfunc = horiz_expand2x_coaligned_ISSE;
+	}
+#endif
+
+	uint32 chroma_srcwidth = -(-w >> srcinfo.auxwbits);	// ceil division
+	horiz_buffer_size = (horiz_buffer_size + 15) & ~15;	// pad to 16 bytes
+	vert_buffer_size = (vert_buffer_size + 15) & ~15;
+
+	// allocate buffers: [cr horiz][cr vert][cb horiz][cb vert]
+
+	vdblock<uint8> tempbuf((horiz_buffer_size + vert_buffer_size)*2 + 1);
+
+	uint8 *const crbufh = tempbuf.data();
+	uint8 *const crbufv = crbufh + horiz_buffer_size;
+	uint8 *const cbbufh = crbufv + vert_buffer_size;
+	uint8 *const cbbufv = cbbufh + horiz_buffer_size;
+
+	// cb0/cb1 (and cr0/cr1) bracket the current vertical interpolation window
+	const uint8 *cb0 = (const uint8*)src.data2;
+	const uint8 *cr0 = (const uint8*)src.data3;
+	const uint8 *cb1 = cb0;
+	const uint8 *cr1 = cr0;
+	const uint8 *y = (const uint8 *)src.data;
+	const ptrdiff_t ypitch = src.pitch;
+	const ptrdiff_t cbpitch = src.pitch2;
+	const ptrdiff_t crpitch = src.pitch3;
+
+	void *out = dst.data;
+	ptrdiff_t outpitch = dst.pitch;
+
+	for(;;) {
+		// advance the chroma row window when a full chroma row is crossed;
+		// the last row is clamped (cb1/cr1 stop advancing when yleft hits 0)
+		if (yaccum >= 8) {
+			yaccum &= 7;
+
+			cb0 = cb1;
+			cr0 = cr1;
+
+			if (yleft > 0) {
+				--yleft;
+				vdptrstep(cb1, cbpitch);
+				vdptrstep(cr1, crpitch);
+			}
+		}
+
+		const uint8 *cr = cr0;
+		const uint8 *cb = cb0;
+
+		// vertical interpolation: cr (phase scaled from 1/8ths to 0..255)
+		if(yaccum & 7) {
+			const uint8 *const srcs[2]={cr0, cr1};
+			vfunc(crbufv, srcs, chroma_srcwidth, (yaccum & 7) << 5);
+			cr = crbufv;
+		}
+
+		// horizontal interpolation: cr
+		if (hfunc) {
+			hfunc(crbufh, cr, horiz_count);
+			cr = crbufh;
+		}
+
+		// vertical interpolation: cb
+		if(yaccum & 7) {
+			const uint8 *const srcs[2]={cb0, cb1};
+			vfunc(cbbufv, srcs, chroma_srcwidth, (yaccum & 7) << 5);
+			cb = cbbufv;
+		}
+
+		// horizontal interpolation: cb
+		if (hfunc) {
+			hfunc(cbbufh, cb, horiz_count);
+			cb = cbbufh;
+		}
+
+		dfunc(out, y, cb, cr, w);
+		vdptrstep(out, outpitch);
+		vdptrstep(y, ypitch);
+
+		if (!--h)
+			break;
+
+		yaccum += yinc;
+	}
+
+#ifdef _M_IX86
+	// leave the FPU usable again after any MMX span routines
+	if (cpuflags & CPUF_SUPPORTS_MMX) {
+		__asm emms
+	}
+#endif
+}
+
+namespace {
+	typedef void (*tpUVBltHorizDecoder)(uint8 *dst, const uint8 *src, sint32 w);
+	typedef void (*tpUVBltVertDecoder)(uint8 *dst, const uint8 *const *srcs, sint32 w, uint8 phase);
+
+	// Rescale one chroma plane between two subsampling factors.  xshift and
+	// yshift are the log2 ratios between source and destination subsampling;
+	// a sliding window of up to 8 filtered source rows feeds the vertical
+	// resampler.  Row positions are tracked in 8.8 fixed point (winposnext).
+	void uvplaneblt(uint8 *dst, ptrdiff_t dstpitch, int dstformat, const uint8 *src, ptrdiff_t srcpitch, int srcformat, vdpixsize w, vdpixsize h) {
+		const VDPixmapFormatInfo& srcinfo = VDPixmapGetInfo(srcformat);
+		const VDPixmapFormatInfo& dstinfo = VDPixmapGetInfo(dstformat);
+
+		int xshift = srcinfo.auxwbits - dstinfo.auxwbits;
+		int yshift = srcinfo.auxhbits - dstinfo.auxhbits;
+
+		tpUVBltHorizDecoder		hfunc = NULL;
+		tpUVBltVertDecoder		vfunc = NULL;
+
+		// horizontal: expand or compress by the width ratio
+		switch(xshift) {
+		case +2:
+			hfunc = horiz_expand4x_coaligned;
+			break;
+		case +1:
+			hfunc = horiz_expand2x_coaligned;
+			break;
+		case 0:
+			break;
+		case -1:
+			hfunc = horiz_compress2x_coaligned;
+			break;
+		case -2:
+			hfunc = horiz_compress4x_coaligned;
+			break;
+		default:
+			VDNEVERHERE;
+			return;
+		}
+
+#ifdef _M_IX86
+		uint32 cpuflags = CPUGetEnabledExtensions();
+
+		if (cpuflags & CPUF_SUPPORTS_INTEGER_SSE) {
+			if (hfunc == horiz_expand2x_coaligned)
+				hfunc = horiz_expand2x_coaligned_ISSE;
+		}
+#endif
+
+		// winsize: source rows the vertical filter reads at once;
+		// winposnext/winstep: initial position and per-output-row advance (8.8)
+		int winsize, winposnext, winstep;
+
+		switch(yshift) {
+		case +2:
+			vfunc = vert_expand4x_centered;
+			winsize = 2;
+			winposnext = 0xa0;
+			winstep = 0x40;
+			break;
+		case +1:
+			vfunc = vert_expand2x_centered;
+			winsize = 2;
+			winposnext = 0xc0;
+			winstep = 0x80;
+			break;
+		case 0:
+			winsize = 1;
+			winposnext = 0;
+			winstep = 0x100;
+			break;
+		case -1:
+			vfunc = vert_compress2x_centered;
+			winsize = 4;
+			winposnext = 0x200;
+			winstep = 0x200;
+			break;
+		case -2:
+			vfunc = vert_compress4x_centered;
+			winsize = 8;
+			winposnext = 0x500;
+			winstep = 0x400;
+			break;
+		default:
+			VDNEVERHERE;
+			return;
+		}
+
+#ifdef _M_IX86
+		if (cpuflags & CPUF_SUPPORTS_INTEGER_SSE) {
+			if (vfunc == vert_expand2x_centered)
+				vfunc = vert_expand2x_centered_ISSE;
+		}
+#endif
+
+		// plane dimensions via ceil division
+		int dsth = -(-h >> dstinfo.auxhbits);
+		int srch = -(-h >> srcinfo.auxhbits);
+		int dstw = -(-w >> dstinfo.auxwbits);
+		int w2 = -(-w >> std::min<int>(dstinfo.auxwbits, srcinfo.auxwbits));
+
+		int winpos = (winposnext>>8) - winsize;
+
+		// ring buffer of row pointers; entries are mirrored at +winsize so a
+		// contiguous slice of winsize pointers can always be taken
+		const uint8 *window[16];
+
+		vdblock<uint8> tmpbuf;
+		ptrdiff_t tmppitch = (w+15) & ~15;
+
+		if (vfunc && hfunc)
+			tmpbuf.resize(tmppitch * winsize);
+
+		do {
+			int desiredpos = winposnext >> 8;
+
+			// pull source rows into the window, horizontally resampling each;
+			// rows are clamped to [0, srch-1] at the plane edges
+			while(winpos < desiredpos) {
+				const uint8 *srcrow = vdptroffset(src, srcpitch * std::max<int>(0, std::min<int>(srch-1, ++winpos)));
+				int winoffset = (winpos-1) & (winsize-1);
+
+				if (hfunc) {
+					uint8 *dstrow = vfunc ? tmpbuf.data() + tmppitch * winoffset : dst;
+					hfunc(dstrow, srcrow, w2);
+					srcrow = dstrow;
+				}
+
+				window[winoffset] = window[winoffset + winsize] = srcrow;
+			}
+
+			if (vfunc)
+				vfunc(dst, window + (winpos & (winsize-1)), dstw, winposnext & 255);
+			else if (!hfunc)
+				memcpy(dst, window[winpos & (winsize-1)], dstw);
+
+			winposnext += winstep;
+			vdptrstep(dst, dstpitch);
+		} while(--dsth);
+
+#ifdef _M_IX86
+		// flush MMX state possibly left by the ISSE helpers
+		if (cpuflags & CPUF_SUPPORTS_MMX) {
+			__asm emms
+		}
+#endif
+	}
+}
+
+void VDPixmapBlt_YUVPlanar_convert_reference(const VDPixmap& dstpm, const VDPixmap& srcpm, vdpixsize w, vdpixsize h) {
+	// Copy the luma plane unchanged, then rescale or synthesize chroma.
+	VDMemcpyRect(dstpm.data, dstpm.pitch, srcpm.data, srcpm.pitch, dstpm.w, dstpm.h);
+
+	const bool srcHasChroma = srcpm.format != nsVDPixmap::kPixFormat_Y8;
+	const bool dstHasChroma = dstpm.format != nsVDPixmap::kPixFormat_Y8;
+
+	if (!dstHasChroma)
+		return;		// grayscale target: source chroma (if any) is discarded
+
+	if (srcHasChroma) {
+		// YCbCr -> YCbCr: resample both chroma planes to the new subsampling
+		uvplaneblt((uint8 *)dstpm.data2, dstpm.pitch2, dstpm.format, (uint8 *)srcpm.data2, srcpm.pitch2, srcpm.format, w, h);
+		uvplaneblt((uint8 *)dstpm.data3, dstpm.pitch3, dstpm.format, (uint8 *)srcpm.data3, srcpm.pitch3, srcpm.format, w, h);
+	} else {
+		// grayscale source: fill chroma planes with the neutral value 0x80
+		const VDPixmapFormatInfo& info = VDPixmapGetInfo(dstpm.format);
+		const vdpixsize cw = -(-w >> info.auxwbits);
+		const vdpixsize ch = -(-h >> info.auxhbits);
+		VDMemset8Rect(dstpm.data2, dstpm.pitch2, 0x80, cw, ch);
+		VDMemset8Rect(dstpm.data3, dstpm.pitch3, 0x80, cw, ch);
+	}
+}
+
+extern "C" void vdasm_pixblt_YUV411Planar_to_XRGB1555_scan_MMX(void *dst, const void *y, const void *cb, const void *cr, unsigned count);
+extern "C" void vdasm_pixblt_YUV411Planar_to_RGB565_scan_MMX(void *dst, const void *y, const void *cb, const void *cr, unsigned count);
+extern "C" void vdasm_pixblt_YUV411Planar_to_XRGB8888_scan_MMX(void *dst, const void *y, const void *cb, const void *cr, unsigned count);
+extern "C" void vdasm_pixblt_YUV411Planar_to_XRGB1555_scan_ISSE(void *dst, const void *y, const void *cb, const void *cr, unsigned count);
+extern "C" void vdasm_pixblt_YUV411Planar_to_RGB565_scan_ISSE(void *dst, const void *y, const void *cb, const void *cr, unsigned count);
+extern "C" void vdasm_pixblt_YUV411Planar_to_XRGB8888_scan_ISSE(void *dst, const void *y, const void *cb, const void *cr, unsigned count);
+
+DECLARE_YUV_PLANAR(YUV411, XRGB1555) {
+	// 4:1:1 planar YCbCr -> 15-bit XRGB1555.  Each group of four luma
+	// samples shares one chroma pair; intermediate pixels use 1/4-step
+	// linear chroma interpolation.  Right-edge leftovers (1-4 pixels) reuse
+	// the last chroma sample unfiltered.
+	uint16 *out = (uint16 *)dst.data;
+	const ptrdiff_t opitch = dst.pitch;
+	const uint8 *yrow = (const uint8 *)src.data;
+	const uint8 *cbrow = (const uint8 *)src.data2;
+	const uint8 *crrow = (const uint8 *)src.data3;
+	const ptrdiff_t ypitch = src.pitch;
+	const ptrdiff_t cbpitch = src.pitch2;
+	const ptrdiff_t crpitch = src.pitch3;
+
+	vdpixsize wpairs = (w-1)>>2;	// full 4-pixel groups with a right neighbor
+	vdpixsize wleft = w - (wpairs<<2);
+
+	do {
+		uint16 *p = out;
+		const uint8 *y = yrow;
+		const uint8 *cb = cbrow;
+		const uint8 *cr = crrow;
+		vdpixsize wt;
+
+		if (wpairs > 0) {
+#ifdef _M_AMD64
+			wt = wpairs;
+
+			do {
+				const unsigned cb0 = cb[0];
+				const unsigned cb1 = cb[1];
+				const unsigned cr0 = cr[0];
+				const unsigned cr1 = cr[1];
+
+				// chroma weights 4:0, 3:1, 2:2, 1:3 across the group
+				p[0] = ycbcr_to_1555(y[0], cb0, cr0);
+				p[1] = ycbcr_to_1555(y[1], (3*cb0+cb1+2)>>2, (3*cr0+cr1+2)>>2);
+				p[2] = ycbcr_to_1555(y[2], (cb0+cb1+1)>>1, (cr0+cr1+1)>>1);
+				p[3] = ycbcr_to_1555(y[3], (cb0+3*cb1+2)>>2, (cr0+3*cr1+2)>>2);
+
+				y += 4;
+				p += 4;
+				++cb;
+				++cr;
+			} while(--wt);
+#else
+			// NOTE(review): calls the ISSE variant unconditionally --
+			// presumably the dispatcher guarantees ISSE here; confirm
+			vdasm_pixblt_YUV411Planar_to_XRGB1555_scan_ISSE(p, y, cb, cr, wpairs);
+			// skip past the pixels the assembly routine consumed
+			y += 4*wpairs;
+			cr += wpairs;
+			cb += wpairs;
+			p += 4*wpairs;
+#endif
+		}
+
+		if (wleft > 0) {
+			wt = wleft;
+
+			const uint8 cr0 = *cr;
+			const uint8 cb0 = *cb;
+
+			do {
+				*p++ = ycbcr_to_1555(*y++, cb0, cr0);
+			} while(--wt);
+		}
+
+		vdptrstep(out, opitch);
+		vdptrstep(yrow, ypitch);
+		vdptrstep(cbrow, cbpitch);
+		vdptrstep(crrow, crpitch);
+	} while(--h);
+
+#ifndef _M_AMD64
+	__asm emms
+#endif
+}
+
+DECLARE_YUV_PLANAR(YUV411, RGB565) {
+	// 4:1:1 planar YCbCr -> 16-bit RGB565.  Each group of four luma samples
+	// shares one chroma pair; intermediate pixels use 1/4-step linear chroma
+	// interpolation.  Right-edge leftovers (1-4 pixels) reuse the last
+	// chroma sample unfiltered.
+	uint16 *out = (uint16 *)dst.data;
+	const ptrdiff_t opitch = dst.pitch;
+	const uint8 *yrow = (const uint8 *)src.data;
+	const uint8 *cbrow = (const uint8 *)src.data2;
+	const uint8 *crrow = (const uint8 *)src.data3;
+	const ptrdiff_t ypitch = src.pitch;
+	const ptrdiff_t cbpitch = src.pitch2;
+	const ptrdiff_t crpitch = src.pitch3;
+
+	vdpixsize wpairs = (w-1)>>2;	// full 4-pixel groups with a right neighbor
+	vdpixsize wleft = w - (wpairs<<2);
+
+	do {
+		uint16 *p = out;
+		const uint8 *y = yrow;
+		const uint8 *cb = cbrow;
+		const uint8 *cr = crrow;
+		vdpixsize wt;
+
+		if (wpairs > 0) {
+#ifdef _M_AMD64
+			wt = wpairs;
+
+			do {
+				const unsigned cb0 = cb[0];
+				const unsigned cb1 = cb[1];
+				const unsigned cr0 = cr[0];
+				const unsigned cr1 = cr[1];
+
+				// chroma weights 4:0, 3:1, 2:2, 1:3 across the group
+				p[0] = ycbcr_to_565(y[0], cb0, cr0);
+				p[1] = ycbcr_to_565(y[1], (3*cb0+cb1+2)>>2, (3*cr0+cr1+2)>>2);
+				p[2] = ycbcr_to_565(y[2], (cb0+cb1+1)>>1, (cr0+cr1+1)>>1);
+				p[3] = ycbcr_to_565(y[3], (cb0+3*cb1+2)>>2, (cr0+3*cr1+2)>>2);
+
+				y += 4;
+				p += 4;
+				++cb;
+				++cr;
+			} while(--wt);
+#else
+			vdasm_pixblt_YUV411Planar_to_RGB565_scan_ISSE(p, y, cb, cr, wpairs);
+			// FIX: advance past the pixels the assembly routine consumed so
+			// the leftover-pixel loop below starts at the correct position
+			// (matches the XRGB1555/XRGB8888 variants, which already do this)
+			y += 4*wpairs;
+			cr += wpairs;
+			cb += wpairs;
+			p += 4*wpairs;
+#endif
+		}
+
+		if (wleft > 0) {
+			wt = wleft;
+
+			const uint8 cr0 = *cr;
+			const uint8 cb0 = *cb;
+
+			do {
+				*p++ = ycbcr_to_565(*y++, cb0, cr0);
+			} while(--wt);
+		}
+
+		vdptrstep(out, opitch);
+		vdptrstep(yrow, ypitch);
+		vdptrstep(cbrow, cbpitch);
+		vdptrstep(crrow, crpitch);
+	} while(--h);
+
+#ifndef _M_AMD64
+	__asm emms
+#endif
+}
+
+DECLARE_YUV_PLANAR(YUV411, RGB888) {
+	// 4:1:1 planar YCbCr -> 24-bit RGB888 (3 bytes per pixel).  Each group
+	// of four luma samples shares one chroma pair; intermediate pixels use
+	// 1/4-step linear chroma interpolation.  Right-edge leftovers reuse the
+	// last chroma sample unfiltered.
+	uint8 *out = (uint8 *)dst.data;
+	const ptrdiff_t opitch = dst.pitch;
+	const uint8 *yrow = (const uint8 *)src.data;
+	const uint8 *cbrow = (const uint8 *)src.data2;
+	const uint8 *crrow = (const uint8 *)src.data3;
+	const ptrdiff_t ypitch = src.pitch;
+	const ptrdiff_t cbpitch = src.pitch2;
+	const ptrdiff_t crpitch = src.pitch3;
+
+	vdpixsize wpairs = (w-1)>>2;	// full 4-pixel groups with a right neighbor
+	vdpixsize wleft = w - (wpairs<<2);
+
+	do {
+		uint8 *p = out;
+		const uint8 *y = yrow;
+		const uint8 *cb = cbrow;
+		const uint8 *cr = crrow;
+		vdpixsize wt;
+
+		if (wpairs > 0) {
+			wt = wpairs;
+
+			do {
+				const unsigned cb0 = cb[0];
+				const unsigned cb1 = cb[1];
+				const unsigned cr0 = cr[0];
+				const unsigned cr1 = cr[1];
+
+				// chroma weights 4:0, 3:1, 2:2, 1:3 across the group
+				ycbcr_to_888(p+0, y[0], cb0, cr0);
+				ycbcr_to_888(p+3, y[1], (3*cb0+cb1+2)>>2, (3*cr0+cr1+2)>>2);
+				ycbcr_to_888(p+6, y[2], (cb0+cb1+1)>>1, (cr0+cr1+1)>>1);
+				ycbcr_to_888(p+9, y[3], (cb0+3*cb1+2)>>2, (cr0+3*cr1+2)>>2);
+
+				y += 4;
+				p += 12;
+				++cb;
+				++cr;
+			} while(--wt);
+		}
+
+		if (wleft > 0) {
+			wt = wleft;
+
+			const uint8 cr0 = *cr;
+			const uint8 cb0 = *cb;
+
+			do {
+				ycbcr_to_888(p, *y++, cb0, cr0);
+				// FIX: RGB888 pixels are 3 bytes wide (the group loop above
+				// advances 12 bytes per 4 pixels); advancing by 4 here left
+				// gap bytes and overran the row for leftover pixels
+				p += 3;
+			} while(--wt);
+		}
+
+		vdptrstep(out, opitch);
+		vdptrstep(yrow, ypitch);
+		vdptrstep(cbrow, cbpitch);
+		vdptrstep(crrow, crpitch);
+	} while(--h);
+}
+
+DECLARE_YUV_PLANAR(YUV411, XRGB8888) {
+	// 4:1:1 planar YCbCr -> 32-bit XRGB8888.  Each group of four luma
+	// samples shares one chroma pair; intermediate pixels use 1/4-step
+	// linear chroma interpolation.  Right-edge leftovers reuse the last
+	// chroma sample unfiltered.
+	uint32 *out = (uint32 *)dst.data;
+	const ptrdiff_t opitch = dst.pitch;
+	const uint8 *yrow = (const uint8 *)src.data;
+	const uint8 *cbrow = (const uint8 *)src.data2;
+	const uint8 *crrow = (const uint8 *)src.data3;
+	const ptrdiff_t ypitch = src.pitch;
+	const ptrdiff_t cbpitch = src.pitch2;
+	const ptrdiff_t crpitch = src.pitch3;
+
+	vdpixsize wpairs = (w-1)>>2;	// full 4-pixel groups with a right neighbor
+	vdpixsize wleft = w - (wpairs<<2);
+
+	do {
+		uint32 *p = out;
+		const uint8 *y = yrow;
+		const uint8 *cb = cbrow;
+		const uint8 *cr = crrow;
+		vdpixsize wt;
+
+		if (wpairs > 0) {
+#ifdef _M_AMD64
+			wt = wpairs;
+
+			do {
+				const unsigned cb0 = cb[0];
+				const unsigned cb1 = cb[1];
+				const unsigned cr0 = cr[0];
+				const unsigned cr1 = cr[1];
+
+				// chroma weights 4:0, 3:1, 2:2, 1:3 across the group
+				p[0] = ycbcr_to_8888(y[0], cb0, cr0);
+				p[1] = ycbcr_to_8888(y[1], (3*cb0+cb1+2)>>2, (3*cr0+cr1+2)>>2);
+				p[2] = ycbcr_to_8888(y[2], (cb0+cb1+1)>>1, (cr0+cr1+1)>>1);
+				p[3] = ycbcr_to_8888(y[3], (cb0+3*cb1+2)>>2, (cr0+3*cr1+2)>>2);
+
+				y += 4;
+				p += 4;
+				++cb;
+				++cr;
+			} while(--wt);
+#else
+			// NOTE(review): uses the MMX scan routine here while the 1555/565
+			// variants call the ISSE ones -- looks intentional, but confirm
+			vdasm_pixblt_YUV411Planar_to_XRGB8888_scan_MMX(p, y, cb, cr, wpairs);
+			// skip past the pixels the assembly routine consumed
+			y += 4*wpairs;
+			cr += wpairs;
+			cb += wpairs;
+			p += 4*wpairs;
+#endif
+		}
+
+		if (wleft > 0) {
+			wt = wleft;
+
+			const uint8 cr0 = *cr;
+			const uint8 cb0 = *cb;
+
+			do {
+				*p++ = ycbcr_to_8888(*y++, cb0, cr0);
+			} while(--wt);
+		}
+
+		vdptrstep(out, opitch);
+		vdptrstep(yrow, ypitch);
+		vdptrstep(cbrow, cbpitch);
+		vdptrstep(crrow, crpitch);
+	} while(--h);
+
+#ifndef _M_AMD64
+	__asm emms
+#endif
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/blt_reference_yuv2yuv.cpp b/src/thirdparty/VirtualDub/Kasumi/source/blt_reference_yuv2yuv.cpp
new file mode 100644
index 000000000..b581e9bf7
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/blt_reference_yuv2yuv.cpp
@@ -0,0 +1,260 @@
+#include <vd2/system/vdtypes.h>
+#include <vd2/system/memory.h>
+#include <vd2/system/vdstl.h>
+#include <vd2/Kasumi/pixmap.h>
+#include <vd2/Kasumi/pixmaputils.h>
+
+#include "bitutils.h"
+#include "blt_spanutils.h"
+
+#define DECLARE_YUV(x, y) void VDPixmapBlt_##x##_to_##y##_reference(void *dst0, ptrdiff_t dstpitch, const void *src0, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h)
+#define DECLARE_YUV_PLANAR(x, y) void VDPixmapBlt_##x##_to_##y##_reference(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h)
+
+using namespace nsVDPixmapBitUtils;
+using namespace nsVDPixmapSpanUtils;
+
+DECLARE_YUV(XVYU, UYVY) {
+	// Convert packed 4:4:4 XVYU (32bpp) to packed 4:2:2 UYVY.  Luma is kept
+	// per pixel; chroma is downsampled with a 1-2-1 filter over neighboring
+	// pixel pairs.  wt counts upward from -w so loop tests compare to zero.
+	uint32 *dst = (uint32 *)dst0;
+	const uint32 *src = (const uint32 *)src0;
+
+	srcpitch -= (w&~1)*4;
+	dstpitch -= (w&~1)*2;
+
+	do {
+		vdpixsize wt = w;
+
+		wt = -wt;
+
+		if (++wt) {
+			uint32 a, b, c;
+
+			// first output word: left chroma tap replicated (edge clamp)
+			a = src[0];
+			b = src[1];
+			*dst++ = (avg_8888_121(a, a, b) & 0xff00ff) + (a & 0xff00) + ((b & 0xff00)<<16);
+			src += 2;
+
+			if ((wt+=2) < 0) {
+				do {
+					// interior: 1-2-1 filtered chroma, two luma samples
+					a = src[-1];
+					b = src[0];
+					c = src[1];
+
+					*dst++ = (avg_8888_121(a, b, c) & 0xff00ff) + (b & 0xff00) + ((c & 0xff00)<<16);
+					src += 2;
+				} while((wt+=2) < 0);
+			}
+		}
+
+		// even widths leave one trailing source pixel: copy it through
+		if (!(wt&1))
+			*dst = *src;
+
+		vdptrstep(src, srcpitch);
+		vdptrstep(dst, dstpitch);
+	} while(--h);
+}
+
+DECLARE_YUV(XVYU, YUYV) {
+	// Convert packed 4:4:4 XVYU (32bpp) to packed 4:2:2 YUYV.  Same 1-2-1
+	// chroma downsampling as the UYVY variant, but with luma/chroma byte
+	// lanes swapped for the YUYV ordering.
+	uint32 *dst = (uint32 *)dst0;
+	const uint32 *src = (const uint32 *)src0;
+
+	srcpitch -= (w&~1)*4;
+	dstpitch -= (w&~1)*2;
+
+	do {
+		vdpixsize wt = w;
+
+		wt = -wt;
+
+		if (++wt) {
+			uint32 a, b, c;
+
+			// first output word: left chroma tap replicated (edge clamp)
+			a = src[0];
+			b = src[1];
+			*dst++ = ((avg_8888_121(a, a, b) & 0xff00ff)<<8) + ((a & 0xff00)>>8) + ((b & 0xff00)<<8);
+			src += 2;
+
+			if ((wt+=2)<0) {
+				do {
+					a = src[-1];
+					b = src[0];
+					c = src[1];
+
+					*dst++ = ((avg_8888_121(a, b, c) & 0xff00ff)<<8) + ((b & 0xff00)>>8) + ((c & 0xff00)<<8);
+					src += 2;
+				} while((wt+=2) < 0);
+			}
+		}
+
+		// even widths leave one trailing source pixel: byte-swap it through
+		if (!(wt&1)) {
+			uint32 v = *src;
+			*dst = ((v&0xff00ff)<<8) + ((v&0xff00ff00)>>8);
+		}
+
+		vdptrstep(src, srcpitch);
+		vdptrstep(dst, dstpitch);
+	} while(--h);
+}
+
+DECLARE_YUV(UYVY, YUYV) {	// also YUYV->UYVY
+	// UYVY <-> YUYV is a plain byte swap within each 16-bit word, so the
+	// same routine converts in both directions.  Processes the row as
+	// 32-bit words (two pixels each), rounding odd widths up.
+	const uint32 *srcp = (const uint32 *)src0;
+	uint32 *dstp = (uint32 *)dst0;
+	const vdpixsize words = (w + 1) >> 1;
+
+	do {
+		for(vdpixsize i = 0; i < words; ++i) {
+			const uint32 v = srcp[i];
+			dstp[i] = ((v >> 8) & 0x00ff00ff) | ((v << 8) & 0xff00ff00);
+		}
+
+		vdptrstep(srcp, srcpitch);
+		vdptrstep(dstp, dstpitch);
+	} while(--h);
+}
+
+DECLARE_YUV(UYVY, Y8) {
+	// Extract the luma plane from UYVY: Y samples sit at odd byte offsets.
+	const uint8 *srcrow = (const uint8 *)src0;
+	uint8 *dstrow = (uint8 *)dst0;
+
+	do {
+		for(vdpixsize x = 0; x < w; ++x)
+			dstrow[x] = srcrow[2*x + 1];
+
+		vdptrstep(srcrow, srcpitch);
+		vdptrstep(dstrow, dstpitch);
+	} while(--h);
+}
+
+DECLARE_YUV(YUYV, Y8) {
+	// Extract the luma plane from YUYV: Y samples sit at even byte offsets.
+	const uint8 *srcrow = (const uint8 *)src0;
+	uint8 *dstrow = (uint8 *)dst0;
+
+	do {
+		for(vdpixsize x = 0; x < w; ++x)
+			dstrow[x] = srcrow[2*x];
+
+		vdptrstep(srcrow, srcpitch);
+		vdptrstep(dstrow, dstpitch);
+	} while(--h);
+}
+
+DECLARE_YUV(Y8, UYVY) {
+	// Expand grayscale to UYVY with neutral (0x80) chroma.  For odd widths
+	// the trailing half-pair is completed by replicating the last luma
+	// sample (this writes into the pitch padding past 2*w bytes).
+	const uint8 *srcrow = (const uint8 *)src0;
+	uint8 *dstrow = (uint8 *)dst0;
+
+	do {
+		uint8 *d = dstrow;
+
+		for(vdpixsize x = 0; x < w; ++x) {
+			d[0] = 0x80;
+			d[1] = srcrow[x];
+			d += 2;
+		}
+
+		if (w & 1) {
+			d[0] = 0x80;
+			d[1] = d[-1];
+		}
+
+		vdptrstep(srcrow, srcpitch);
+		vdptrstep(dstrow, dstpitch);
+	} while(--h);
+}
+
+DECLARE_YUV(Y8, YUYV) {
+	// Expand grayscale to YUYV with neutral (0x80) chroma.  For odd widths
+	// the trailing half-pair is completed by replicating the last luma
+	// sample (this writes into the pitch padding past 2*w bytes).
+	const uint8 *srcrow = (const uint8 *)src0;
+	uint8 *dstrow = (uint8 *)dst0;
+
+	do {
+		uint8 *d = dstrow;
+
+		for(vdpixsize x = 0; x < w; ++x) {
+			d[0] = srcrow[x];
+			d[1] = 0x80;
+			d += 2;
+		}
+
+		if (w & 1) {
+			d[0] = d[-1];
+			d[1] = 0x80;
+		}
+
+		vdptrstep(srcrow, srcpitch);
+		vdptrstep(dstrow, dstpitch);
+	} while(--h);
+}
+
+DECLARE_YUV_PLANAR(YUV411, YV12) {
+	// 4:1:1 -> 4:2:0 (YV12): luma is copied unchanged; each chroma plane is
+	// compressed 2x vertically and expanded 2x horizontally.  The same loop
+	// runs twice, once per chroma plane.
+	VDMemcpyRect(dst.data, dst.pitch, src.data, src.pitch, w, h);
+
+	// NOTE(review): both helpers are handed the full w here, while the
+	// chroma planes are only w/4 (src) and w/2 (dst) wide -- verify the
+	// helpers' count convention before relying on edge behavior
+	vdblock<uint8> tmprow(w);
+	const uint8 *srcp = (const uint8 *)src.data2;
+	ptrdiff_t srcpitch = src.pitch2;
+	uint8 *dstp = (uint8 *)dst.data2;
+	ptrdiff_t dstpitch = dst.pitch2;
+	const uint8 *src1, *src2;
+
+	// Cb plane: average vertical row pairs (last row duplicated for odd h),
+	// then widen horizontally
+	vdpixsize h2;
+	for(h2 = h; h2 > 0; h2 -= 2) {
+		src1 = srcp;
+		vdptrstep(srcp, srcpitch);
+		if (h2 > 1)
+			src2 = srcp;
+		else
+			src2 = src1;
+		vdptrstep(srcp, srcpitch);
+
+		const uint8 *sources[2] = {src1, src2};
+
+		vert_compress2x_centered_fast(tmprow.data(), sources, w, 0);
+		horiz_expand2x_coaligned(dstp, tmprow.data(), w);
+
+		vdptrstep(dstp, dstpitch);
+	}
+
+	// Cr plane: identical processing
+	srcp = (const uint8 *)src.data3;
+	srcpitch = src.pitch3;
+	dstp = (uint8 *)dst.data3;
+	dstpitch = dst.pitch3;
+	for(h2 = h; h2 > 0; h2 -= 2) {
+		src1 = srcp;
+		vdptrstep(srcp, srcpitch);
+		if (h2 > 1)
+			src2 = srcp;
+		else
+			src2 = src1;
+		vdptrstep(srcp, srcpitch);
+
+		const uint8 *sources[2] = {src1, src2};
+		vert_compress2x_centered_fast(tmprow.data(), sources, w, 0);
+		horiz_expand2x_coaligned(dstp, tmprow.data(), w);
+
+		vdptrstep(dstp, dstpitch);
+	}
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/blt_reference_yuvrev.cpp b/src/thirdparty/VirtualDub/Kasumi/source/blt_reference_yuvrev.cpp
new file mode 100644
index 000000000..d6f38bf65
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/blt_reference_yuvrev.cpp
@@ -0,0 +1,530 @@
+#include <vd2/system/cpuaccel.h>
+#include <vd2/system/vdtypes.h>
+#include <vd2/system/vdstl.h>
+#include <vd2/Kasumi/pixmap.h>
+#include <vd2/Kasumi/pixmaputils.h>
+#include "blt_spanutils.h"
+
+#ifdef _M_IX86
+ #include "blt_spanutils_x86.h"
+#endif
+
+using namespace nsVDPixmapSpanUtils;
+
+namespace {
+ // From Jim Blinn's "Dirty Pixels":
+ //
+ // Y = .299R + .587G + .114B
+ // Cr = 0.713(R-Y)
+ // Cb = 0.564(B-Y)
+ //
+ // IY = 219Y + 16 = ((yt = 1052IR + 2065IG + 401IB) + 67584) >> 12
+ // ICr = 224Cr + 128 = (yt*2987 - 10507932IR + 2155872256) >> 24
+ // ICb = 224Cb + 128 = (yt*2363 - 8312025IB + 2155872256) >> 24
+
+	// XRGB8888 -> packed XVYU. Fixed-point BT.601-style weights (see the
+	// derivation comment above); shifts are reduced from >>12/>>24 so Y
+	// lands in bits 8-15 and Cr in bits 16-23 before masking.
+	void ConvertRGB32ToXVYU32(uint32 *dst, const uint8 *src, sint32 count) {
+		do {
+			const sint32 r = src[2];
+			const sint32 g = src[1];
+			const sint32 b = src[0];
+			const sint32 yt = 1052*r + 2065*g + 401*b;
+			const sint32 y = (yt + 67584) >> 4;		// <<8 alignment shift
+			const sint32 cr = (10507932*r - yt*2987 + 2155872256U) >> 8;	// <<16 alignment shift
+			const sint32 cb = ( 8312025*b - yt*2363 + 2155872256U) >> 24;
+
+			*dst++ = (y&0xff00) + cb + (cr&0xff0000);	// VYU order
+			src += 4;
+		} while(--count);
+	}
+
+	// RGB888 -> packed XVYU; same math as the 32-bit path with a 3-byte
+	// source stride.
+	void ConvertRGB24ToXVYU32(uint32 *dst, const uint8 *src, sint32 count) {
+		do {
+			const sint32 r = src[2];
+			const sint32 g = src[1];
+			const sint32 b = src[0];
+			const sint32 yt = 1052*r + 2065*g + 401*b;
+			const sint32 y = (yt + 67584) >> 4;		// <<8 alignment shift
+			const sint32 cr = (10507932*r - yt*2987 + 2155872256U) >> 8;	// <<16 alignment shift
+			const sint32 cb = ( 8312025*b - yt*2363 + 2155872256U) >> 24;
+
+			*dst++ = (y&0xff00) + cb + (cr&0xff0000);	// VYU order
+			src += 3;
+		} while(--count);
+	}
+
+	// RGB565 -> packed XVYU; weights are rescaled for 5/6/5-bit channels
+	// (e.g. 2065*255/63 ~= 8358 for the 6-bit green).
+	void ConvertRGB16ToXVYU32(uint32 *dst, const uint16 *src, sint32 count) {
+		do {
+			const sint16 px = *src++;
+			const sint32 r = (px & 0xf800) >> 11;
+			const sint32 g = (px & 0x07e0) >> 5;
+			const sint32 b = (px & 0x001f);
+			const sint32 yt = 8652*r + 8358*g + 3299*b;
+			const sint32 y = (yt + 67584) >> 4;		// <<8 alignment shift
+			const sint32 cr = (86436217*r - yt*2987 + 2155872256U) >> 8;	// <<16 alignment shift
+			const sint32 cb = (68373108*b - yt*2363 + 2155872256U) >> 24;
+
+			*dst++ = (y&0xff00) + cb + (cr&0xff0000);	// VYU order
+		} while(--count);
+	}
+
+	// XRGB1555 -> packed XVYU; weights rescaled for 5/5/5-bit channels
+	// (2065*255/31 ~= 16986 for the 5-bit green).
+	void ConvertRGB15ToXVYU32(uint32 *dst, const uint16 *src, sint32 count) {
+		do {
+			const sint16 px = *src++;
+			const sint32 r = (px & 0x7c00) >> 10;
+			const sint32 g = (px & 0x03e0) >> 5;
+			const sint32 b = (px & 0x001f);
+			const sint32 yt = 8652*r + 16986*g + 3299*b;
+			const sint32 y = (yt + 67584) >> 4;		// <<8 alignment shift
+			const sint32 cr = (86436217*r - yt*2987 + 2155872256U) >> 8;	// <<16 alignment shift
+			const sint32 cb = (68373108*b - yt*2363 + 2155872256U) >> 24;
+
+			*dst++ = (y&0xff00) + cb + (cr&0xff0000);	// VYU order
+		} while(--count);
+	}
+
+	// XRGB8888 -> luma-only (Y8), 16-235 range via the +67584 (16.5<<12) bias.
+	void ConvertRGB32ToY8(uint8 *dst, const uint8 *src, sint32 count) {
+		do {
+			const sint32 r = src[2];
+			const sint32 g = src[1];
+			const sint32 b = src[0];
+			*dst++ = (uint8)((1052*r + 2065*g + 401*b + 67584) >> 12);
+			src += 4;
+		} while(--count);
+	}
+
+	// RGB888 -> luma-only (Y8); same math as the 32-bit path, 3-byte stride.
+	void ConvertRGB24ToY8(uint8 *dst, const uint8 *src, sint32 count) {
+		do {
+			const sint32 r = src[2];
+			const sint32 g = src[1];
+			const sint32 b = src[0];
+			*dst++ = (uint8)((1052*r + 2065*g + 401*b + 67584) >> 12);
+			src += 3;
+		} while(--count);
+	}
+
+	// RGB565 -> luma-only (Y8); weights rescaled for 5/6/5-bit channels.
+	void ConvertRGB16ToY8(uint8 *dst, const uint16 *src, sint32 count) {
+		do {
+			const sint16 px = *src++;
+			const sint32 r = (px & 0xf800) >> 11;
+			const sint32 g = (px & 0x07e0) >> 5;
+			const sint32 b = (px & 0x001f);
+			*dst++ = (uint8)((8652*r + 8358*g + 3299*b + 67584) >> 12);
+		} while(--count);
+	}
+
+	// XRGB1555 -> luma-only (Y8); weights rescaled for 5/5/5-bit channels.
+	void ConvertRGB15ToY8(uint8 *dst, const uint16 *src, sint32 count) {
+		do {
+			const sint16 px = *src++;
+			const sint32 r = (px & 0x7c00) >> 10;
+			const sint32 g = (px & 0x03e0) >> 5;
+			const sint32 b = (px & 0x001f);
+			*dst++ = (uint8)((8652*r + 16986*g + 3299*b + 67584) >> 12);
+		} while(--count);
+	}
+}
+
+#define DECLARE_YUV_REV(x, y) void VDPixmapBlt_##x##_to_##y##_reference(void *dst0, ptrdiff_t dstpitch, const void *src0, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h)
+
+// Row loop: XRGB1555 -> packed XVYU, one converter call per scanline.
+DECLARE_YUV_REV(XRGB1555, XVYU) {
+	do {
+		ConvertRGB15ToXVYU32((uint32 *)dst0, (const uint16 *)src0, w);
+
+		vdptrstep(dst0, dstpitch);
+		vdptrstep(src0, srcpitch);
+	} while(--h);
+}
+
+// Row loop: RGB565 -> packed XVYU, one converter call per scanline.
+DECLARE_YUV_REV(RGB565, XVYU) {
+	do {
+		ConvertRGB16ToXVYU32((uint32 *)dst0, (const uint16 *)src0, w);
+
+		vdptrstep(dst0, dstpitch);
+		vdptrstep(src0, srcpitch);
+	} while(--h);
+}
+
+// Row loop: RGB888 -> packed XVYU, one converter call per scanline.
+DECLARE_YUV_REV(RGB888, XVYU) {
+	do {
+		ConvertRGB24ToXVYU32((uint32 *)dst0, (const uint8 *)src0, w);
+
+		vdptrstep(dst0, dstpitch);
+		vdptrstep(src0, srcpitch);
+	} while(--h);
+}
+
+// Row loop: XRGB8888 -> packed XVYU, one converter call per scanline.
+DECLARE_YUV_REV(XRGB8888, XVYU) {
+	do {
+		ConvertRGB32ToXVYU32((uint32 *)dst0, (const uint8 *)src0, w);
+
+		vdptrstep(dst0, dstpitch);
+		vdptrstep(src0, srcpitch);
+	} while(--h);
+}
+
+// Row loop: XRGB1555 -> Y8 (luma only), one converter call per scanline.
+DECLARE_YUV_REV(XRGB1555, Y8) {
+	do {
+		ConvertRGB15ToY8((uint8 *)dst0, (const uint16 *)src0, w);
+
+		vdptrstep(dst0, dstpitch);
+		vdptrstep(src0, srcpitch);
+	} while(--h);
+}
+
+// Row loop: RGB565 -> Y8 (luma only), one converter call per scanline.
+DECLARE_YUV_REV(RGB565, Y8) {
+	do {
+		ConvertRGB16ToY8((uint8 *)dst0, (const uint16 *)src0, w);
+
+		vdptrstep(dst0, dstpitch);
+		vdptrstep(src0, srcpitch);
+	} while(--h);
+}
+
+// Row loop: RGB888 -> Y8 (luma only), one converter call per scanline.
+DECLARE_YUV_REV(RGB888, Y8) {
+	do {
+		ConvertRGB24ToY8((uint8 *)dst0, (const uint8 *)src0, w);
+
+		vdptrstep(dst0, dstpitch);
+		vdptrstep(src0, srcpitch);
+	} while(--h);
+}
+
+// Row loop: XRGB8888 -> Y8 (luma only), one converter call per scanline.
+DECLARE_YUV_REV(XRGB8888, Y8) {
+	do {
+		ConvertRGB32ToY8((uint8 *)dst0, (const uint8 *)src0, w);
+
+		vdptrstep(dst0, dstpitch);
+		vdptrstep(src0, srcpitch);
+	} while(--h);
+}
+
+
+
+
+
+namespace {
+	// XRGB8888 -> planar Y/Cb/Cr, one byte per plane per pixel.
+	// No chroma subsampling here; the caller filters afterward.
+	void ConvertRGB32ToYUVPlanar(uint8 *ydst, uint8 *cbdst, uint8 *crdst, const void *src0, sint32 count) {
+		const uint8 *src = (const uint8 *)src0;
+
+		do {
+			const sint32 r = src[2];
+			const sint32 g = src[1];
+			const sint32 b = src[0];
+			const sint32 yt = 1052*r + 2065*g + 401*b;
+			*ydst++ = (yt + 67584) >> 12;
+			*crdst++ = (10507932*r - yt*2987 + 2155872256U) >> 24;
+			*cbdst++ = ( 8312025*b - yt*2363 + 2155872256U) >> 24;
+			src += 4;
+		} while(--count);
+	}
+
+	// RGB888 -> planar Y/Cb/Cr; same math as the 32-bit path, 3-byte stride.
+	void ConvertRGB24ToYUVPlanar(uint8 *ydst, uint8 *cbdst, uint8 *crdst, const void *src0, sint32 count) {
+		const uint8 *src = (const uint8 *)src0;
+
+		do {
+			const sint32 r = src[2];
+			const sint32 g = src[1];
+			const sint32 b = src[0];
+			const sint32 yt = 1052*r + 2065*g + 401*b;
+			*ydst++ = (yt + 67584) >> 12;
+			*crdst++ = (10507932*r - yt*2987 + 2155872256U) >> 24;
+			*cbdst++ = ( 8312025*b - yt*2363 + 2155872256U) >> 24;
+			src += 3;
+		} while(--count);
+	}
+
+	// RGB565 -> planar Y/Cb/Cr; weights rescaled for 5/6/5-bit channels.
+	void ConvertRGB16ToYUVPlanar(uint8 *ydst, uint8 *cbdst, uint8 *crdst, const void *src0, sint32 count) {
+		const uint16 *src = (const uint16 *)src0;
+
+		do {
+			const sint16 px = *src++;
+			const sint32 r = (px & 0xf800) >> 11;
+			const sint32 g = (px & 0x07e0) >> 5;
+			const sint32 b = (px & 0x001f);
+			const sint32 yt = 8652*r + 8358*g + 3299*b;
+			*ydst++ = (yt + 67584) >> 12;
+			*crdst++ = (86436217*r - yt*2987 + 2155872256U) >> 24;
+			*cbdst++ = (68373108*b - yt*2363 + 2155872256U) >> 24;
+		} while(--count);
+	}
+
+	// XRGB1555 -> planar Y/Cb/Cr; weights rescaled for 5/5/5-bit channels.
+	void ConvertRGB15ToYUVPlanar(uint8 *ydst, uint8 *cbdst, uint8 *crdst, const void *src0, sint32 count) {
+		const uint16 *src = (const uint16 *)src0;
+
+		do {
+			const sint16 px = *src++;
+			const sint32 r = (px & 0x7c00) >> 10;
+			const sint32 g = (px & 0x03e0) >> 5;
+			const sint32 b = (px & 0x001f);
+			const sint32 yt = 8652*r + 16986*g + 3299*b;
+			*ydst++ = (yt + 67584) >> 12;
+			*crdst++ = (86436217*r - yt*2987 + 2155872256U) >> 24;
+			*cbdst++ = (68373108*b - yt*2363 + 2155872256U) >> 24;
+		} while(--count);
+	}
+
+	// UYVY -> planar: one luma byte per pixel, one Cb/Cr pair per two
+	// pixels. count is in luma samples; the mid-loop break handles odd
+	// counts (last macropixel only half used).
+	void ConvertUYVYToYUVPlanar(uint8 *ydst, uint8 *cbdst, uint8 *crdst, const void *src0, sint32 count) {
+		const uint8 *src = (const uint8 *)src0;
+
+		do {
+			*cbdst++ = src[0];
+			*ydst++ = src[1];
+			*crdst++ = src[2];
+			if (!--count)
+				break;
+			*ydst++ = src[3];
+			src += 4;
+		} while(--count);
+	}
+
+	// YUYV -> planar: same as the UYVY path with byte positions swapped
+	// (Y first, then Cb, second Y, Cr).
+	void ConvertYUYVToYUVPlanar(uint8 *ydst, uint8 *cbdst, uint8 *crdst, const void *src0, sint32 count) {
+		const uint8 *src = (const uint8 *)src0;
+
+		do {
+			*cbdst++ = src[1];
+			*ydst++ = src[0];
+			*crdst++ = src[3];
+			if (!--count)
+				break;
+			*ydst++ = src[2];
+			src += 4;
+		} while(--count);
+	}
+}
+
+// Encode a packed RGB/YUV source into a planar YUV destination.
+//
+// A per-row color converter (cfunc), an optional horizontal chroma filter
+// (hfunc) and an optional vertical chroma filter (vfunc) are selected from
+// the source/destination formats; rows are then streamed through scratch
+// buffers. The vertical path runs a ring of `winsize` chroma rows, with
+// edge rows duplicated at top/bottom.
+//
+// Fix: the Y8 fast path allocated only `tmpsize` bytes for its throwaway
+// chroma buffer but pointed crdst at cbdst + tmpsize, so every cfunc call
+// wrote up to w bytes past the end of the allocation (heap overflow). The
+// sibling branch above correctly allocates tmpsize * 2; this one now does
+// too.
+void VDPixmapBlt_YUVPlanar_encode_reference(const VDPixmap& dstbm, const VDPixmap& srcbm, vdpixsize w, vdpixsize h) {
+	void (*cfunc)(uint8 *ydst, uint8 *cbdst, uint8 *crdst, const void *src, sint32 w) = NULL;
+	void (*hfunc)(uint8 *dst, const uint8 *src, sint32 w) = NULL;
+	void (*vfunc)(uint8 *dst, const uint8 *const *sources, sint32 w, uint8 phase) = NULL;
+
+	bool halfchroma = false;
+
+	switch(srcbm.format) {
+	case nsVDPixmap::kPixFormat_XRGB1555:
+		cfunc = ConvertRGB15ToYUVPlanar;
+		break;
+	case nsVDPixmap::kPixFormat_RGB565:
+		cfunc = ConvertRGB16ToYUVPlanar;
+		break;
+	case nsVDPixmap::kPixFormat_RGB888:
+		cfunc = ConvertRGB24ToYUVPlanar;
+		break;
+	case nsVDPixmap::kPixFormat_XRGB8888:
+		cfunc = ConvertRGB32ToYUVPlanar;
+		break;
+	case nsVDPixmap::kPixFormat_YUV422_UYVY:
+		cfunc = ConvertUYVYToYUVPlanar;
+		halfchroma = true;
+		break;
+	case nsVDPixmap::kPixFormat_YUV422_YUYV:
+		cfunc = ConvertYUYVToYUVPlanar;
+		halfchroma = true;
+		break;
+	default:
+		VDNEVERHERE;
+		return;
+	}
+
+	// w2/h2: chroma plane dimensions; winsize/winstep/winposnext control
+	// the vertical filter window ring below.
+	vdpixsize w2 = w;
+	vdpixsize h2 = h;
+	int winstep = 1;
+	int winsize = 1;
+	int winposnext = 0;
+	vdpixsize chroma_srcw = w;
+
+	switch(dstbm.format) {
+
+	case nsVDPixmap::kPixFormat_YUV444_Planar:
+		if (halfchroma)
+			hfunc = horiz_expand2x_coaligned;
+		break;
+
+	case nsVDPixmap::kPixFormat_YUV422_Planar:
+		if (halfchroma)
+			chroma_srcw = (chroma_srcw + 1) >> 1;
+		else
+			hfunc = horiz_compress2x_coaligned;
+
+		w2 = (w2+1) >> 1;
+		break;
+
+	case nsVDPixmap::kPixFormat_YUV422_Planar_Centered:
+		if (halfchroma) {
+			chroma_srcw = (chroma_srcw + 1) >> 1;
+			hfunc = horiz_realign_to_centered;
+		} else
+			hfunc = horiz_compress2x_centered;
+
+		w2 = (w2+1) >> 1;
+		break;
+
+	case nsVDPixmap::kPixFormat_YUV420_Planar:
+		if (halfchroma)
+			chroma_srcw = (chroma_srcw + 1) >> 1;
+		else
+			hfunc = horiz_compress2x_coaligned;
+
+		vfunc = vert_compress2x_centered;
+		winstep = 2;
+		winposnext = 2;
+		winsize = 4;
+		h2 = (h+1) >> 1;
+		w2 = (w2+1) >> 1;
+		break;
+
+	case nsVDPixmap::kPixFormat_YUV420_Planar_Centered:
+		if (halfchroma) {
+			chroma_srcw = (chroma_srcw + 1) >> 1;
+			hfunc = horiz_realign_to_centered;
+		} else
+			hfunc = horiz_compress2x_centered;
+
+		vfunc = vert_compress2x_centered;
+		winstep = 2;
+		winposnext = 2;
+		winsize = 4;
+		h2 = (h+1) >> 1;
+		w2 = (w2+1) >> 1;
+		break;
+
+	case nsVDPixmap::kPixFormat_YUV411_Planar:
+		if (halfchroma) {
+			chroma_srcw = (chroma_srcw + 1) >> 1;
+			hfunc = horiz_compress2x_coaligned;
+		} else
+			hfunc = horiz_compress4x_coaligned;
+		w2 = (w2+1) >> 2;
+		break;
+
+	case nsVDPixmap::kPixFormat_YUV410_Planar:
+		if (halfchroma) {
+			chroma_srcw = (chroma_srcw + 1) >> 1;
+			hfunc = horiz_compress2x_coaligned;
+		} else
+			hfunc = horiz_compress4x_coaligned;
+		vfunc = vert_compress4x_centered;
+		winsize = 8;
+		winposnext = 5;
+		winstep = 4;
+		h2 = (h+3) >> 2;
+		w2 = (w2+3) >> 2;
+		break;
+	}
+
+#ifdef _M_IX86
+	uint32 cpuflags = CPUGetEnabledExtensions();
+
+	if (cpuflags & CPUF_SUPPORTS_INTEGER_SSE) {
+		if (hfunc == horiz_expand2x_coaligned)
+			hfunc = horiz_expand2x_coaligned_ISSE;
+	}
+#endif
+
+	const uint8 *src = (const uint8 *)srcbm.data;
+	const ptrdiff_t srcpitch = srcbm.pitch;
+
+	uint8 *ydst = (uint8 *)dstbm.data;
+	uint8 *cbdst = (uint8 *)dstbm.data2;
+	uint8 *crdst = (uint8 *)dstbm.data3;
+	const ptrdiff_t ydstpitch = dstbm.pitch;
+	const ptrdiff_t cbdstpitch = dstbm.pitch2;
+	const ptrdiff_t crdstpitch = dstbm.pitch3;
+
+	if (!vfunc) {
+		if (hfunc) {
+			// Horizontal-only chroma filtering: convert into scratch rows,
+			// then filter into the destination planes.
+			uint32 tmpsize = (w + 15) & ~15;
+
+			vdblock<uint8> tmp(tmpsize * 2);
+			uint8 *const cbtmp = tmp.data();
+			uint8 *const crtmp = cbtmp + tmpsize;
+
+			do {
+				cfunc(ydst, cbtmp, crtmp, src, w);
+				src += srcpitch;
+				ydst += ydstpitch;
+				hfunc(cbdst, cbtmp, chroma_srcw);
+				hfunc(crdst, crtmp, chroma_srcw);
+				cbdst += cbdstpitch;
+				crdst += crdstpitch;
+			} while(--h);
+		} else if (dstbm.format == nsVDPixmap::kPixFormat_Y8) {
+			// wasteful, but oh well
+			uint32 tmpsize = (w2+15)&~15;
+			vdblock<uint8> tmp(tmpsize * 2);	// x2: scratch holds BOTH Cb and Cr rows (crdst = cbdst + tmpsize); the original tmpsize-only allocation overflowed
+
+			cbdst = tmp.data();
+			crdst = cbdst + tmpsize;
+
+			do {
+				cfunc(ydst, cbdst, crdst, src, w);
+				src += srcpitch;
+				ydst += ydstpitch;
+			} while(--h2);
+		} else {
+			// Direct 4:4:4 conversion into the destination planes.
+			do {
+				cfunc(ydst, cbdst, crdst, src, w);
+				src += srcpitch;
+				ydst += ydstpitch;
+				cbdst += cbdstpitch;
+				crdst += crdstpitch;
+			} while(--h2);
+		}
+	} else {
+		// Vertical (possibly plus horizontal) chroma filtering via a ring
+		// of winsize chroma rows, mirrored so vfunc can index a window.
+		const uint32 tmpsize = w2;
+
+		vdblock<uint8> tmpbuf(tmpsize * (winsize + 1) * 2 + 2 * w);
+
+		uint8 *cbwindow[16];
+		uint8 *crwindow[16];
+
+		uint8 *p = tmpbuf.data();
+		for(int i=0; i<winsize; ++i) {
+			cbwindow[i] = cbwindow[winsize+i] = p;
+			p += tmpsize;
+			crwindow[i] = crwindow[winsize+i] = p;
+			p += tmpsize;
+		}
+
+		uint8 *cbtmp = p;
+		uint8 *crtmp = p + w;
+
+		int winoffset;
+		int winpos = winposnext - winsize;
+		bool firstline = true;
+
+		do {
+			while(winpos < winposnext) {
+				winoffset = ++winpos & (winsize - 1);
+
+				bool valid = (unsigned)(winpos-1) < (unsigned)(h-1);	// -1 because we generate line 0 as the first window line
+				if (valid || firstline) {
+					if (hfunc) {
+						cfunc(ydst, cbtmp, crtmp, src, w);
+						hfunc(cbwindow[winoffset + winsize - 1], cbtmp, chroma_srcw);
+						hfunc(crwindow[winoffset + winsize - 1], crtmp, chroma_srcw);
+					} else {
+						cfunc(ydst, cbwindow[winoffset + winsize - 1], crwindow[winoffset + winsize - 1], src, w);
+					}
+					src += srcpitch;
+					ydst += ydstpitch;
+					firstline = false;
+				} else {
+					// dupe last generated line -- could be done by pointer swabbing, but I'm lazy
+					memcpy(cbwindow[winoffset + winsize - 1], cbwindow[winoffset + winsize - 2], w2);
+					memcpy(crwindow[winoffset + winsize - 1], crwindow[winoffset + winsize - 2], w2);
+				}
+			}
+			winposnext += winstep;
+
+			vfunc(cbdst, cbwindow + winoffset, w2, 0);
+			vfunc(crdst, crwindow + winoffset, w2, 0);
+			cbdst += cbdstpitch;
+			crdst += crdstpitch;
+		} while(--h2);
+	}
+
+#ifdef _M_IX86
+	if (cpuflags & CPUF_SUPPORTS_MMX) {
+		__asm emms
+	}
+#endif
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/blt_setup.cpp b/src/thirdparty/VirtualDub/Kasumi/source/blt_setup.cpp
new file mode 100644
index 000000000..ce999221a
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/blt_setup.cpp
@@ -0,0 +1,17 @@
+#include "blt_setup.h"
+
+// Reset the dispatch table: every (src, dst) entry becomes a null blitter.
+void VDPixmapBlitterTable::Clear() {
+	memset(mTable, 0, sizeof mTable);
+}
+
+// Register one blitter for every (src, dst) pair in the cross product of
+// the two subsets; identical-format pairs are skipped (identity copy).
+// NOTE(review): dstFormats is taken by non-const reference though it is
+// only read — consider const for symmetry with srcFormats.
+void VDPixmapBlitterTable::AddBlitter(const VDPixmapFormatSubset& srcFormats, VDPixmapFormatSubset& dstFormats, VDPixmapBlitterFn blitter) {
+	for(int i=0; i<srcFormats.mFormatCount; ++i) {
+		int srcFormat = srcFormats.mFormats[i];
+		for(int j=0; j<dstFormats.mFormatCount; ++j) {
+			int dstFormat = dstFormats.mFormats[j];
+
+			if (srcFormat != dstFormat)
+				mTable[srcFormat][dstFormat] = blitter;
+		}
+	}
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/blt_spanutils.cpp b/src/thirdparty/VirtualDub/Kasumi/source/blt_spanutils.cpp
new file mode 100644
index 000000000..6baeeca36
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/blt_spanutils.cpp
@@ -0,0 +1,365 @@
+#include "blt_spanutils.h"
+#include "bitutils.h"
+
+using namespace nsVDPixmapBitUtils;
+
+namespace nsVDPixmapSpanUtils {
+	// 2x horizontal upsample, centered phase: interior samples use the
+	// [3 1]/4 and [1 3]/4 interpolation pair; w counts output samples.
+	// The negated-counter loop style is shared by all span routines here.
+	void horiz_expand2x_centered(uint8 *dst, const uint8 *src, sint32 w) {
+		w = -w;
+
+		*dst++ = *src;
+
+		if (++w) {
+			if (++w) {
+				do {
+					dst[0] = (uint8)((3*src[0] + src[1] + 2)>>2);
+					dst[1] = (uint8)((src[0] + 3*src[1] + 2)>>2);
+					dst += 2;
+					++src;
+				} while((w+=2)<0);
+			}
+
+			if (!(w & 1)) {
+				*dst = src[0];	// replicate the final sample
+			}
+		}
+	}
+
+	// 2x horizontal upsample, coaligned phase: even outputs copy the
+	// source, odd outputs are the [1 1]/2 average; trailing outputs
+	// replicate the last sample. w counts output samples.
+	void horiz_expand2x_coaligned(uint8 *dst, const uint8 *src, sint32 w) {
+		w = -w;
+
+		if ((w+=2) < 0) {
+			do {
+				dst[0] = src[0];
+				dst[1] = (uint8)((src[0] + src[1] + 1)>>1);
+				dst += 2;
+				++src;
+			} while((w+=2)<0);
+		}
+
+		w -= 2;
+		while(w < 0) {
+			++w;
+			*dst++ = src[0];
+		}
+	}
+
+	// 4x horizontal upsample, coaligned phase: per source pair, outputs at
+	// weights 1, 3/4+1/4, 1/2+1/2, 1/4+3/4; tail replicates the last
+	// sample. w counts output samples.
+	void horiz_expand4x_coaligned(uint8 *dst, const uint8 *src, sint32 w) {
+		w = -w;
+
+		if ((w+=4) < 0) {
+			do {
+				dst[0] = src[0];
+				dst[1] = (uint8)((3*src[0] + src[1] + 2)>>2);
+				dst[2] = (uint8)((src[0] + src[1] + 1)>>1);
+				dst[3] = (uint8)((src[0] + 3*src[1] + 2)>>2);
+				dst += 4;
+				++src;
+			} while((w+=4)<0);
+		}
+
+		w -= 4;
+		while(w < 0) {
+			++w;
+			*dst++ = src[0];
+		}
+	}
+
+	// 2:1 horizontal downsample, coaligned phase: [1 2 1]/4 kernel with
+	// renormalized edge kernels ([3 1]/4 first, [1 3]/4 last). w is the
+	// source width.
+	void horiz_compress2x_coaligned(uint8 *dst, const uint8 *src, sint32 w) {
+		if (w == 1) {
+			*dst = *src;
+			return;
+		}
+
+		*dst++ = (uint8)((3*src[0] + src[1] + 2) >> 2);
+		++src;
+		--w;
+
+		while(w >= 3) {
+			w -= 2;
+			*dst++ = (uint8)((src[0] + 2*src[1] + src[2] + 2) >> 2);
+			src += 2;
+		}
+
+		if (w >= 2)
+			*dst++ = (uint8)((src[0] + 3*src[1] + 2) >> 2);
+	}
+
+	// 2:1 horizontal downsample, centered phase: [1 3 3 1]/8 kernel with
+	// renormalized edge kernels; all kernels sum to 8. w is the source
+	// width.
+	void horiz_compress2x_centered(uint8 *dst, const uint8 *src, sint32 w) {
+		if (w == 1) {
+			*dst = *src;
+			return;
+		}
+
+		if (w == 2) {
+			*dst = (uint8)((src[0] + src[1] + 1) >> 1);
+			return;
+		}
+
+		*dst++ = (uint8)((4*src[0] + 3*src[1] + src[2] + 4) >> 3);
+		--w;
+		++src;
+
+		while(w >= 4) {
+			w -= 2;
+			*dst++ = (uint8)(((src[0] + src[3]) + 3*(src[1] + src[2]) + 4) >> 3);
+			src += 2;
+		}
+
+		switch(w) {
+			case 3:
+				*dst++ = (uint8)((src[0] + 3*src[1] + 4*src[2] + 4) >> 3);
+				break;
+			case 2:
+				*dst++ = (uint8)((src[0] + 7*src[1] + 4) >> 3);
+				break;
+		}
+	}
+
+	// 4:1 horizontal downsample, coaligned phase: [1 4 6 4 1]/16 kernel
+	// with renormalized edge kernels (each sums to 16). w is the source
+	// width.
+	void horiz_compress4x_coaligned(uint8 *dst, const uint8 *src, sint32 w) {
+		if (w == 1) {
+			*dst = *src;
+			return;
+		}
+
+		if (w == 2) {
+			*dst++ = (uint8)((11*src[0] + 5*src[1] + 8) >> 4);
+			return;
+		}
+
+		*dst++ = (uint8)((11*src[0] + 4*src[1] + src[2] + 8) >> 4);
+		src += 2;
+		w -= 2;
+
+		while(w >= 5) {
+			w -= 4;
+			*dst++ = (uint8)(((src[0] + src[4]) + 4*(src[1] + src[3]) + 6*src[2] + 8) >> 4);
+			src += 4;
+		}
+
+		switch(w) {
+			case 4:
+				*dst = (uint8)((src[0] + 4*src[1] + 6*src[2] + 5*src[3] + 8) >> 4);
+				break;
+			case 3:
+				*dst = (uint8)((src[0] + 4*src[1] + 11*src[2] + 8) >> 4);
+				break;
+		}
+	}
+
+	// 4:1 horizontal downsample, centered phase: [1 7 21 35 35 21 7 1]/128
+	// interior kernel with renormalized edge kernels — each edge kernel's
+	// weights (shown in the comments) sum to 128. w is the source width.
+	//
+	// Fix: the w==3 case read src[1] twice (29+99+0 weighting, i.e. the
+	// w==2 kernel) instead of applying the documented 29/35/64 taps to
+	// src[0..2]; the last tap now reads src[2].
+	void horiz_compress4x_centered(uint8 *dst, const uint8 *src, sint32 w) {
+
+		switch(w) {
+			case 1:
+				*dst = *src;
+				return;
+			case 2:		// 29 99
+				*dst = (uint8)((29*src[0] + 99*src[1] + 64) >> 7);
+				return;
+			case 3:		// 29 35 64
+				*dst = (uint8)((29*src[0] + 35*src[1] + 64*src[2] + 64) >> 7);
+				return;
+			case 4:		// 29 35 35 29
+				*dst = (uint8)((29*src[0] + 35*(src[1] + src[2]) + 29*src[3] + 64) >> 7);
+				return;
+			case 5:		// 29 35 35 21 8
+					// 1 7 120
+				dst[0] = (uint8)((29*src[0] + 35*(src[1] + src[2]) + 21*src[3] + 8*src[4] + 64) >> 7);
+				dst[1] = (uint8)((src[2] + 7*src[3] + 120*src[4] + 64) >> 7);
+				return;
+		}
+
+		*dst++ = (uint8)((29*src[0] + 35*(src[1] + src[2]) + 21*src[3] + 7*src[4] + src[5] + 64) >> 7);
+		src += 2;
+		w -= 2;
+
+		while(w >= 8) {
+			w -= 4;
+			*dst++ = (uint8)(((src[0] + src[7]) + 7*(src[1] + src[6]) + 21*(src[2] + src[5]) + 35*(src[3] + src[4]) + 64) >> 7);
+			src += 4;
+		}
+
+		switch(w) {
+			case 4:		// 1 7 21 99
+				*dst = (uint8)((src[0] + 7*src[1] + 21*src[2] + 99*src[3] + 64) >> 7);
+				break;
+			case 5:		// 1 7 21 35 64
+				*dst = (uint8)((src[0] + 7*src[1] + 21*src[2] + 35*src[3] + 64*src[4] + 64) >> 7);
+				break;
+			case 6:		// 1 7 21 35 35 29
+				*dst = (uint8)((src[0] + 7*src[1] + 21*src[2] + 29*src[5] + 35*(src[3] + src[4]) + 64) >> 7);
+				break;
+			case 7:		// 1 7 21 35 35 21 8
+					// 1 7 120
+				dst[0] = (uint8)((src[0] + 7*src[1] + 8*src[6] + 21*(src[2] + src[5]) + 35*(src[3] + src[4]) + 64) >> 7);
+				dst[1] = (uint8)((src[4] + 7*src[5] + 120*src[6] + 64) >> 7);
+				break;
+		}
+	}
+
+	// Shift coaligned chroma samples right by a quarter luma sample so
+	// they land on the centered grid; see the phase diagram below.
+	void horiz_realign_to_centered(uint8 *dst, const uint8 *src, sint32 w) {
+		// luma samples: Y Y Y Y Y
+		// coaligned: C C C
+		// centered: C C
+		//
+		// To realign coaligned samples to centered, we need to shift them
+		// right by a quarter sample in chroma space. This can be done via
+		// a [3 1]/4 filter.
+
+		for(sint32 i=1; i<w; ++i) {
+			dst[0] = (uint8)((3*(uint32)src[0] + (uint32)src[1] + 2) >> 2);
+			++dst;
+			++src;
+		}
+
+		*dst++ = *src++;	// last sample has no right neighbor: copy
+	}
+
+	// Inverse of horiz_realign_to_centered: shift centered chroma left a
+	// quarter luma sample back onto the coaligned grid.
+	void horiz_realign_to_coaligned(uint8 *dst, const uint8 *src, sint32 w) {
+		// luma samples: Y Y Y Y Y
+		// coaligned: C C C
+		// centered: C C
+		//
+		// To realign centered samples to coaligned, we need to shift them
+		// left by a quarter sample in chroma space. This can be done via
+		// a [1 3]/4 filter.
+
+		*dst++ = *src++;	// first sample has no left neighbor: copy
+
+		for(sint32 i=1; i<w; ++i) {
+			dst[0] = (uint8)(((uint32)src[-1] + 3*(uint32)src[0] + 2) >> 2);
+			++dst;
+			++src;
+		}
+	}
+
+	// Vertical 2x expand, centered: output = (3*near + far + 2)/4. phase
+	// selects which source line is "near". The fast path computes the
+	// weighted average 4 bytes at a time with the classic SWAR trick:
+	// ab = floor-avg(a,b), result = ceil-avg(a,ab) ~= (3a+b+2)/4.
+	// NOTE(review): the uint32 path assumes rows are safely readable and
+	// writable in 4-byte units — confirm row padding at call sites.
+	void vert_expand2x_centered(uint8 *dst, const uint8 *const *srcs, sint32 w, uint8 phase) {
+		const uint8 *src3 = srcs[0];
+		const uint8 *src1 = srcs[1];
+
+		if (phase >= 128)
+			std::swap(src1, src3);
+
+		sint32 w4 = w>>2;
+		w &= 3;
+
+		if (w4) {
+			const uint32 *src34 = (const uint32 *)src3;
+			const uint32 *src14 = (const uint32 *)src1;
+			uint32 *dst4 = ( uint32 *)dst;
+
+			do {
+				const uint32 a = *src34++;
+				const uint32 b = *src14++;
+				const uint32 ab = (a&b) + (((a^b)&0xfefefefe)>>1);	// floor avg per byte
+
+				*dst4++ = (a|ab) - (((a^ab)&0xfefefefe)>>1);		// ceil avg(a, ab)
+			} while(--w4);
+
+			src3 = (const uint8 *)src34;
+			src1 = (const uint8 *)src14;
+			dst = ( uint8 *)dst4;
+		}
+
+		if (w) {
+			do {
+				*dst++ = (uint8)((*src1++ + 3**src3++ + 2) >> 2);
+			} while(--w);
+		}
+	}
+
+	// Vertical 4x expand, centered: the top two phase bits select one of
+	// the four interpolation weights 1/8, 3/8, 5/8, 7/8 between the two
+	// source lines.
+	void vert_expand4x_centered(uint8 *dst, const uint8 *const *srcs, sint32 w, uint8 phase) {
+		const uint8 *src3 = srcs[0];
+		const uint8 *src1 = srcs[1];
+
+		switch(phase & 0xc0) {
+			case 0x00:
+				do {
+					*dst++ = (uint8)((1**src1++ + 7**src3++ + 4) >> 3);
+				} while(--w);
+				break;
+			case 0x40:
+				do {
+					*dst++ = (uint8)((3**src1++ + 5**src3++ + 4) >> 3);
+				} while(--w);
+				break;
+			case 0x80:
+				do {
+					*dst++ = (uint8)((5**src1++ + 3**src3++ + 4) >> 3);
+				} while(--w);
+				break;
+			case 0xc0:
+				do {
+					*dst++ = (uint8)((7**src1++ + 1**src3++ + 4) >> 3);
+				} while(--w);
+				break;
+			default:
+				VDNEVERHERE;
+		}
+	}
+
+	// Fast vertical 2:1 compress: plain [1 1]/2 average of two lines
+	// ("fast" variant: ignores phase and the wider centered kernel).
+	// Bulk path averages 4 bytes at a time via avg_8888_11.
+	// NOTE(review): the uint32 path reads/writes up to 3 bytes past w when
+	// w is not a multiple of 4? No — it stops 3 early (w+=3 pre-bias) and
+	// the byte loop finishes the tail.
+	void vert_compress2x_centered_fast(uint8 *dst, const uint8 *const *srcarray, sint32 w, uint8 phase) {
+		const uint8 *src1 = srcarray[0];
+		const uint8 *src2 = srcarray[1];
+
+		w = -w;
+		w += 3;
+
+		while(w < 0) {
+			*(uint32 *)dst = avg_8888_11(*(uint32 *)src1, *(uint32 *)src2);
+			dst += 4;
+			src1 += 4;
+			src2 += 4;
+			w += 4;
+		}
+
+		w -= 3;
+
+		while(w < 0) {
+			*dst = (uint8)((*src1 + *src2 + 1)>>1);
+			++dst;
+			++src1;
+			++src2;
+			++w;
+		}
+	}
+
+	// Vertical 2:1 compress, centered: [1 3 3 1]/8 kernel over a 4-line
+	// window (phase unused).
+	void vert_compress2x_centered(uint8 *dst, const uint8 *const *srcarray, sint32 w, uint8 phase) {
+		const uint8 *src1 = srcarray[0];
+		const uint8 *src2 = srcarray[1];
+		const uint8 *src3 = srcarray[2];
+		const uint8 *src4 = srcarray[3];
+
+		w = -w;
+
+		while(w < 0) {
+			*dst++ = (uint8)(((*src1++ + *src4++) + 3*(*src2++ + *src3++) + 4)>>3);
+			++w;
+		}
+	}
+
+	// Vertical 4:1 compress, centered: [1 7 21 35 35 21 7 1]/128 kernel
+	// over an 8-line window (phase unused).
+	void vert_compress4x_centered(uint8 *dst, const uint8 *const *srcarray, sint32 w, uint8 phase) {
+		const uint8 *src1 = srcarray[0];
+		const uint8 *src2 = srcarray[1];
+		const uint8 *src3 = srcarray[2];
+		const uint8 *src4 = srcarray[3];
+		const uint8 *src5 = srcarray[4];
+		const uint8 *src6 = srcarray[5];
+		const uint8 *src7 = srcarray[6];
+		const uint8 *src8 = srcarray[7];
+
+		w = -w;
+
+		while(w < 0) {
+			int sum18 = *src1++ + *src8++;
+			int sum27 = *src2++ + *src7++;
+			int sum36 = *src3++ + *src6++;
+			int sum45 = *src4++ + *src5++;
+
+			*dst++ = (uint8)((sum18 + 7*sum27 + 21*sum36 + 35*sum45 + 64) >> 7);
+
+			++w;
+		}
+	}
+}
+
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/blt_spanutils_x86.cpp b/src/thirdparty/VirtualDub/Kasumi/source/blt_spanutils_x86.cpp
new file mode 100644
index 000000000..ea9e0599a
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/blt_spanutils_x86.cpp
@@ -0,0 +1,170 @@
+// VirtualDub - Video processing and capture application
+// Graphics support library
+// Copyright (C) 1998-2007 Avery Lee
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+#include "blt_spanutils_x86.h"
+
+#ifdef _MSC_VER
+ #pragma warning(disable: 4799) // warning C4799: function 'nsVDPixmapSpanUtils::vdasm_horiz_expand2x_coaligned_ISSE' has no EMMS instruction
+#endif
+
+extern "C" void __cdecl vdasm_horiz_expand2x_coaligned_ISSE(void *dst, const void *src, uint32 count);
+extern "C" void __cdecl vdasm_horiz_expand4x_coaligned_MMX(void *dst, const void *src, uint32 count);
+extern "C" void __cdecl vdasm_vert_average_13_ISSE(void *dst, const void *src1, const void *src3, uint32 count);
+extern "C" void __cdecl vdasm_vert_average_17_ISSE(void *dst, const void *src1, const void *src3, uint32 count);
+extern "C" void __cdecl vdasm_vert_average_35_ISSE(void *dst, const void *src1, const void *src3, uint32 count);
+
+namespace nsVDPixmapSpanUtils {
+
+	// ISSE-accelerated 2x coaligned expand: bulk of the row is handled by
+	// the asm kernel in 16-output chunks; the scalar tail below is
+	// identical to the reference horiz_expand2x_coaligned.
+	void horiz_expand2x_coaligned_ISSE(uint8 *dst, const uint8 *src, sint32 w) {
+		if (w >= 17) {
+			uint32 fastcount = (w - 1) & ~15;
+
+			vdasm_horiz_expand2x_coaligned_ISSE(dst, src, fastcount);
+			dst += fastcount;
+			src += fastcount >> 1;
+			w -= fastcount;
+		}
+
+		w = -w;
+		if ((w+=2) < 0) {
+			do {
+				dst[0] = src[0];
+				dst[1] = (uint8)((src[0] + src[1] + 1)>>1);
+				dst += 2;
+				++src;
+			} while((w+=2)<0);
+		}
+
+		w -= 2;
+		while(w < 0) {
+			++w;
+			*dst++ = src[0];
+		}
+	}
+
+	// MMX-accelerated 4x coaligned expand: asm kernel processes
+	// fastcount*16 outputs (count passed in 16-output units); scalar tail
+	// matches the reference horiz_expand4x_coaligned.
+	void horiz_expand4x_coaligned_MMX(uint8 *dst, const uint8 *src, sint32 w) {
+		if (w >= 17) {
+			uint32 fastcount = (w - 1) >> 4;
+
+			vdasm_horiz_expand4x_coaligned_MMX(dst, src, fastcount);
+			dst += fastcount << 4;
+			src += fastcount << 2;
+			w -= fastcount << 4;
+		}
+
+		w = -w;
+		if ((w+=4) < 0) {
+			do {
+				dst[0] = src[0];
+				dst[1] = (uint8)((3*src[0] + src[1] + 2)>>2);
+				dst[2] = (uint8)((src[0] + src[1] + 1)>>1);
+				dst[3] = (uint8)((src[0] + 3*src[1] + 2)>>2);
+				dst += 4;
+				++src;
+			} while((w+=4)<0);
+		}
+
+		w -= 4;
+		while(w < 0) {
+			++w;
+			*dst++ = src[0];
+		}
+	}
+
+	// ISSE vertical 2x centered expand: (3*near + far)/4 via the asm 1:3
+	// averaging kernel for the 16-byte-aligned bulk, scalar tail after.
+	void vert_expand2x_centered_ISSE(uint8 *dst, const uint8 *const *srcs, sint32 w, uint8 phase) {
+		const uint8 *src3 = srcs[0];
+		const uint8 *src1 = srcs[1];
+
+		if (phase >= 128)
+			std::swap(src1, src3);
+
+		uint32 fastcount = w & ~15;
+
+		if (fastcount) {
+			vdasm_vert_average_13_ISSE(dst, src1, src3, fastcount);
+			dst += fastcount;
+			src1 += fastcount;
+			src3 += fastcount;
+			w -= fastcount;
+		}
+
+		if (w) {
+			do {
+				*dst++ = (uint8)((*src1++ + 3**src3++ + 2) >> 2);
+			} while(--w);
+		}
+	}
+
+	// Weighted line blend (src1 + 7*src7)/8: asm kernel for the 8-byte
+	// bulk, scalar tail after.
+	void vert_average_1_7_ISSE(uint8 *dst, const uint8 *src7, const uint8 *src1, sint32 w) {
+		uint32 fastcount = w & ~7;
+
+		if (fastcount) {
+			vdasm_vert_average_17_ISSE(dst, src1, src7, fastcount);
+			dst += fastcount;
+			src1 += fastcount;
+			src7 += fastcount;
+			w -= fastcount;
+		}
+
+		if (w) {
+			do {
+				*dst++ = (uint8)((*src1++ + 7**src7++ + 4) >> 3);
+			} while(--w);
+		}
+	}
+
+	// Weighted line blend (3*src1 + 5*src7)/8: asm kernel for the 8-byte
+	// bulk, scalar tail after.
+	void vert_average_3_5_ISSE(uint8 *dst, const uint8 *src7, const uint8 *src1, sint32 w) {
+		uint32 fastcount = w & ~7;
+
+		if (fastcount) {
+			vdasm_vert_average_35_ISSE(dst, src1, src7, fastcount);
+			dst += fastcount;
+			src1 += fastcount;
+			src7 += fastcount;
+			w -= fastcount;
+		}
+
+		if (w) {
+			do {
+				*dst++ = (uint8)((3**src1++ + 5**src7++ + 4) >> 3);
+			} while(--w);
+		}
+	}
+
+	// ISSE vertical 4x centered expand: the top two phase bits pick the
+	// 1/8, 3/8, 5/8 or 7/8 blend, dispatched to the 1:7 and 3:5 helpers
+	// with operands swapped for the mirrored phases.
+	void vert_expand4x_centered_ISSE(uint8 *dst, const uint8 *const *srcs, sint32 w, uint8 phase) {
+		const uint8 *src1 = srcs[0];
+		const uint8 *src2 = srcs[1];
+
+		switch(phase & 0xc0) {
+			case 0x00:
+				vert_average_1_7_ISSE(dst, src2, src1, w);
+				break;
+			case 0x40:
+				vert_average_3_5_ISSE(dst, src2, src1, w);
+				break;
+			case 0x80:
+				vert_average_3_5_ISSE(dst, src1, src2, w);
+				break;
+			case 0xc0:
+				vert_average_1_7_ISSE(dst, src1, src2, w);
+				break;
+			default:
+				VDNEVERHERE;
+		}
+	}
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/blt_uberblit.cpp b/src/thirdparty/VirtualDub/Kasumi/source/blt_uberblit.cpp
new file mode 100644
index 000000000..dcaa20907
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/blt_uberblit.cpp
@@ -0,0 +1,19 @@
+#include <vd2/system/vdalloc.h>
+#include <vd2/Kasumi/pixmap.h>
+#include "uberblit.h"
+
+// Fallback blit path: clamps the requested size to both surfaces and
+// delegates to a dynamically constructed uberblit pipeline; the blitter
+// is owned (and destroyed) via vdautoptr.
+void VDPixmapBlt_UberblitAdapter(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h) {
+	vdautoptr<IVDPixmapBlitter> blitter(VDPixmapCreateBlitter(dst, src));
+
+	if (w > src.w)
+		w = src.w;
+	if (w > dst.w)
+		w = dst.w;
+	if (h > src.h)
+		h = src.h;
+	if (h > dst.h)
+		h = dst.h;
+
+	vdrect32 r(0, 0, w, h);
+	blitter->Blit(dst, &r, src);
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/blt_x86.cpp b/src/thirdparty/VirtualDub/Kasumi/source/blt_x86.cpp
new file mode 100644
index 000000000..af1519c5b
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/blt_x86.cpp
@@ -0,0 +1,144 @@
+#include <vd2/system/vdtypes.h>
+#include <vd2/Kasumi/pixmap.h>
+#include <vd2/Kasumi/pixmaputils.h>
+#include "blt_setup.h"
+
+void VDPixmapInitBlittersReference(VDPixmapBlitterTable& table);
+
+#define DECLARE_PALETTED(x, y) extern void VDPixmapBlt_##x##_to_##y##_reference(void *dst0, ptrdiff_t dstpitch, const void *src0, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h, const void *pal0);
+#define DECLARE_RGB(x, y) extern void VDPixmapBlt_##x##_to_##y##_reference(void *dst0, ptrdiff_t dstpitch, const void *src0, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h);
+#define DECLARE_RGB_ASM(x, y) extern "C" void vdasm_pixblt_##x##_to_##y(void *dst0, ptrdiff_t dstpitch, const void *src0, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h);
+#define DECLARE_RGB_ASM_MMX(x, y) extern "C" void vdasm_pixblt_##x##_to_##y##_MMX(void *dst0, ptrdiff_t dstpitch, const void *src0, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h);
+#define DECLARE_YUV(x, y) extern void VDPixmapBlt_##x##_to_##y##_reference(void *dst0, ptrdiff_t dstpitch, const void *src0, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h);
+#define DECLARE_YUV_REV(x, y) void VDPixmapBlt_##x##_to_##y##_reference(void *dst0, ptrdiff_t dstpitch, const void *src0, ptrdiff_t srcpitch, vdpixsize w, vdpixsize h)
+#define DECLARE_YUV_PLANAR(x, y) extern void VDPixmapBlt_##x##_to_##y##_reference(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h);
+
+ DECLARE_RGB_ASM(RGB565, XRGB1555); DECLARE_RGB_ASM_MMX(RGB565, XRGB1555);
+ DECLARE_RGB_ASM(RGB888, XRGB1555);
+ DECLARE_RGB_ASM(XRGB8888, XRGB1555); DECLARE_RGB_ASM_MMX(XRGB8888, XRGB1555);
+ DECLARE_RGB_ASM(XRGB1555, RGB565); DECLARE_RGB_ASM_MMX(XRGB1555, RGB565);
+ DECLARE_RGB_ASM(RGB888, RGB565);
+ DECLARE_RGB_ASM(XRGB8888, RGB565); DECLARE_RGB_ASM_MMX(XRGB8888, RGB565);
+DECLARE_RGB(XRGB1555, RGB888);
+DECLARE_RGB(RGB565, RGB888);
+ DECLARE_RGB_ASM(XRGB8888, RGB888); DECLARE_RGB_ASM_MMX(XRGB8888, RGB888);
+ DECLARE_RGB_ASM(XRGB1555, XRGB8888); DECLARE_RGB_ASM_MMX(XRGB1555, XRGB8888);
+ DECLARE_RGB_ASM(RGB565, XRGB8888); DECLARE_RGB_ASM_MMX(RGB565, XRGB8888);
+ DECLARE_RGB_ASM(RGB888, XRGB8888); DECLARE_RGB_ASM_MMX(RGB888, XRGB8888);
+
+DECLARE_PALETTED(Pal1, Any8);
+DECLARE_PALETTED(Pal1, Any16);
+DECLARE_PALETTED(Pal1, Any24);
+DECLARE_PALETTED(Pal1, Any32);
+DECLARE_PALETTED(Pal2, Any8);
+DECLARE_PALETTED(Pal2, Any16);
+DECLARE_PALETTED(Pal2, Any24);
+DECLARE_PALETTED(Pal2, Any32);
+DECLARE_PALETTED(Pal4, Any8);
+DECLARE_PALETTED(Pal4, Any16);
+DECLARE_PALETTED(Pal4, Any24);
+DECLARE_PALETTED(Pal4, Any32);
+DECLARE_PALETTED(Pal8, Any8);
+DECLARE_PALETTED(Pal8, Any16);
+DECLARE_PALETTED(Pal8, Any24);
+DECLARE_PALETTED(Pal8, Any32);
+
+DECLARE_YUV(XVYU, UYVY);
+DECLARE_YUV(XVYU, YUYV);
+DECLARE_YUV(Y8, UYVY);
+DECLARE_YUV(Y8, YUYV);
+DECLARE_YUV(UYVY, Y8);
+DECLARE_YUV(YUYV, Y8);
+DECLARE_YUV(UYVY, YUYV);
+DECLARE_YUV_PLANAR(YUV411, YV12);
+
+DECLARE_YUV(UYVY, XRGB1555);
+DECLARE_YUV(UYVY, RGB565);
+DECLARE_YUV(UYVY, RGB888);
+DECLARE_YUV(UYVY, XRGB8888);
+DECLARE_YUV(YUYV, XRGB1555);
+DECLARE_YUV(YUYV, RGB565);
+DECLARE_YUV(YUYV, RGB888);
+DECLARE_YUV(YUYV, XRGB8888);
+DECLARE_YUV(Y8, XRGB1555);
+DECLARE_YUV(Y8, RGB565);
+DECLARE_YUV(Y8, RGB888);
+DECLARE_YUV(Y8, XRGB8888);
+
+DECLARE_YUV_REV(XRGB1555, Y8);
+DECLARE_YUV_REV(RGB565, Y8);
+DECLARE_YUV_REV(RGB888, Y8);
+DECLARE_YUV_REV(XRGB8888, Y8);
+
+DECLARE_YUV_REV(XRGB1555, XVYU);
+DECLARE_YUV_REV(RGB565, XVYU);
+DECLARE_YUV_REV(RGB888, XVYU);
+DECLARE_YUV_REV(XRGB8888, XVYU);
+
+DECLARE_YUV_PLANAR(YV12, XRGB1555);
+DECLARE_YUV_PLANAR(YV12, RGB565);
+DECLARE_YUV_PLANAR(YV12, RGB888);
+DECLARE_YUV_PLANAR(YV12, XRGB8888);
+
+DECLARE_YUV_PLANAR(YUV411, XRGB1555);
+DECLARE_YUV_PLANAR(YUV411, RGB565);
+DECLARE_YUV_PLANAR(YUV411, RGB888);
+DECLARE_YUV_PLANAR(YUV411, XRGB8888);
+
+extern void VDPixmapBlt_YUVPlanar_decode_reference(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h);
+extern void VDPixmapBlt_YUVPlanar_encode_reference(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h);
+extern void VDPixmapBlt_YUVPlanar_convert_reference(const VDPixmap& dst, const VDPixmap& src, vdpixsize w, vdpixsize h);
+
+using namespace nsVDPixmap;
+
+// Populates 'table' for x86 scalar operation: starts from the portable
+// reference blitters, then overrides the RGB<->RGB conversions with the
+// scalar assembler versions declared above.
+void VDPixmapInitBlittersX86(VDPixmapBlitterTable& table) {
+	VDPixmapInitBlittersReference(table);
+
+	table.AddBlitter(kPixFormat_XRGB1555, kPixFormat_RGB565, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_XRGB1555_to_RGB565>);
+	table.AddBlitter(kPixFormat_XRGB1555, kPixFormat_XRGB8888, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_XRGB1555_to_XRGB8888>);
+	table.AddBlitter(kPixFormat_RGB565, kPixFormat_XRGB1555, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_RGB565_to_XRGB1555>);
+	table.AddBlitter(kPixFormat_RGB565, kPixFormat_XRGB8888, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_RGB565_to_XRGB8888>);
+	table.AddBlitter(kPixFormat_RGB888, kPixFormat_XRGB1555, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_RGB888_to_XRGB1555>);
+	table.AddBlitter(kPixFormat_RGB888, kPixFormat_RGB565, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_RGB888_to_RGB565>);
+	table.AddBlitter(kPixFormat_RGB888, kPixFormat_XRGB8888, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_RGB888_to_XRGB8888>);
+	table.AddBlitter(kPixFormat_XRGB8888, kPixFormat_XRGB1555, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_XRGB8888_to_XRGB1555>);
+	table.AddBlitter(kPixFormat_XRGB8888, kPixFormat_RGB565, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_XRGB8888_to_RGB565>);
+	table.AddBlitter(kPixFormat_XRGB8888, kPixFormat_RGB888, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_XRGB8888_to_RGB888>);
+}
+
+// Builds (once per call) and returns the scalar x86 blitter table.
+// NOTE(review): the static table is re-initialized on every invocation;
+// callers cache the result behind a function-local static, so this is only
+// an issue if called directly and concurrently — confirm single-threaded use.
+tpVDPixBltTable VDGetPixBltTableX86ScalarInternal() {
+	static VDPixmapBlitterTable sReferenceTable;
+
+	VDPixmapInitBlittersX86(sReferenceTable);
+
+	return sReferenceTable.mTable;
+}
+
+// Builds the MMX blitter table: scalar x86 entries first, then the
+// RGB<->RGB conversions overridden with MMX assembler versions.
+// (RGB888->XRGB1555 and RGB888->RGB565 have no MMX variant and keep
+// the scalar assembler entries.)
+tpVDPixBltTable VDGetPixBltTableX86MMXInternal() {
+	static VDPixmapBlitterTable sReferenceTable;
+
+	VDPixmapInitBlittersX86(sReferenceTable);
+
+	sReferenceTable.AddBlitter(kPixFormat_XRGB1555, kPixFormat_RGB565, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_XRGB1555_to_RGB565_MMX>);
+	sReferenceTable.AddBlitter(kPixFormat_XRGB1555, kPixFormat_XRGB8888, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_XRGB1555_to_XRGB8888_MMX>);
+	sReferenceTable.AddBlitter(kPixFormat_RGB565, kPixFormat_XRGB1555, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_RGB565_to_XRGB1555_MMX>);
+	sReferenceTable.AddBlitter(kPixFormat_RGB565, kPixFormat_XRGB8888, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_RGB565_to_XRGB8888_MMX>);
+	sReferenceTable.AddBlitter(kPixFormat_RGB888, kPixFormat_XRGB8888, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_RGB888_to_XRGB8888_MMX>);
+	sReferenceTable.AddBlitter(kPixFormat_XRGB8888, kPixFormat_XRGB1555, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_XRGB8888_to_XRGB1555_MMX>);
+	sReferenceTable.AddBlitter(kPixFormat_XRGB8888, kPixFormat_RGB565, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_XRGB8888_to_RGB565_MMX>);
+	sReferenceTable.AddBlitter(kPixFormat_XRGB8888, kPixFormat_RGB888, VDPixmapBlitterChunkyAdapter<vdasm_pixblt_XRGB8888_to_RGB888_MMX>);
+
+	return sReferenceTable.mTable;
+}
+
+// Returns the lazily-initialized scalar x86 blit table. Initialization
+// happens on first call via the function-local static (pre-C++11: not
+// guaranteed thread-safe).
+tpVDPixBltTable VDGetPixBltTableX86Scalar() {
+	static tpVDPixBltTable spTable = VDGetPixBltTableX86ScalarInternal();
+
+	return spTable;
+}
+
+// Returns the lazily-initialized MMX blit table. Initialization happens on
+// first call via the function-local static (pre-C++11: not guaranteed
+// thread-safe).
+tpVDPixBltTable VDGetPixBltTableX86MMX() {
+	static tpVDPixBltTable spTable = VDGetPixBltTableX86MMXInternal();
+
+	return spTable;
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/pixel.cpp b/src/thirdparty/VirtualDub/Kasumi/source/pixel.cpp
new file mode 100644
index 000000000..45797ca4b
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/pixel.cpp
@@ -0,0 +1,667 @@
+// VirtualDub - Video processing and capture application
+// Graphics support library
+// Copyright (C) 1998-2007 Avery Lee
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+#include <vd2/system/math.h>
+#include <vd2/system/halffloat.h>
+#include <vd2/Kasumi/pixmap.h>
+#include <vd2/Kasumi/pixel.h>
+
+// Point-samples one pixel from a pixmap, clamping (x, y) to the image
+// bounds, and returns it as packed 0x00RRGGBB. Palettized formats are
+// looked up in px.palette; planar YCbCr formats convert the interpolated
+// chroma via VDConvertYCbCrToRGB; all remaining formats fall back to the
+// bilinear RGB24 sampler at this pixel's center.
+uint32 VDPixmapSample(const VDPixmap& px, sint32 x, sint32 y) {
+	if (x >= px.w)
+		x = px.w - 1;
+	if (y >= px.h)
+		y = px.h - 1;
+	if (x < 0)
+		x = 0;
+	if (y < 0)
+		y = 0;
+
+	switch(px.format) {
+	case nsVDPixmap::kPixFormat_Pal1:
+		{
+			uint8 idx = ((const uint8 *)px.data + px.pitch*y)[x >> 3];
+
+			// bits are packed MSB-first within each byte
+			return px.palette[(idx >> (7 - (x & 7))) & 1];
+		}
+
+	case nsVDPixmap::kPixFormat_Pal2:
+		{
+			uint8 idx = ((const uint8 *)px.data + px.pitch*y)[x >> 2];
+
+			return px.palette[(idx >> (6 - (x & 3)*2)) & 3];
+		}
+
+	case nsVDPixmap::kPixFormat_Pal4:
+		{
+			uint8 idx = ((const uint8 *)px.data + px.pitch*y)[x >> 1];
+
+			// even pixel sits in the high nibble
+			if (!(x & 1))
+				idx >>= 4;
+
+			return px.palette[idx & 15];
+		}
+
+	case nsVDPixmap::kPixFormat_Pal8:
+		{
+			uint8 idx = ((const uint8 *)px.data + px.pitch*y)[x];
+
+			return px.palette[idx];
+		}
+
+	case nsVDPixmap::kPixFormat_XRGB1555:
+		{
+			uint16 c = ((const uint16 *)((const uint8 *)px.data + px.pitch*y))[x];
+			uint32 r = c & 0x7c00;
+			uint32 g = c & 0x03e0;
+			uint32 b = c & 0x001f;
+			uint32 rgb = (r << 9) + (g << 6) + (b << 3);
+
+			// replicate top 3 bits into the bottom 3 to span the full 8-bit range
+			return rgb + ((rgb >> 5) & 0x070707);
+		}
+		break;
+
+	case nsVDPixmap::kPixFormat_RGB565:
+		{
+			uint16 c = ((const uint16 *)((const uint8 *)px.data + px.pitch*y))[x];
+			uint32 r = c & 0xf800;
+			uint32 g = c & 0x07e0;
+			uint32 b = c & 0x001f;
+			uint32 rb = (r << 8) + (b << 3);
+
+			// red/blue replicate 3 bits, green replicates 2 (it has 6 source bits)
+			return rb + ((rb >> 5) & 0x070007) + (g << 5) + ((g >> 1) & 0x0300);
+		}
+		break;
+
+	case nsVDPixmap::kPixFormat_RGB888:
+		{
+			// byte order in memory is B, G, R
+			const uint8 *src = (const uint8 *)px.data + px.pitch*y + 3*x;
+			uint32 b = src[0];
+			uint32 g = src[1];
+			uint32 r = src[2];
+
+			return (r << 16) + (g << 8) + b;
+		}
+		break;
+
+	case nsVDPixmap::kPixFormat_XRGB8888:
+		return ((const uint32 *)((const uint8 *)px.data + px.pitch*y))[x];
+
+	case nsVDPixmap::kPixFormat_Y8:
+		{
+			uint8 luma = ((const uint8 *)px.data + px.pitch*y)[x];
+
+			// expand studio swing [16, 235] to full range and splat to gray
+			// NOTE(review): luma outside [16, 235] under/overflows the 8-bit
+			// channel here — confirm inputs are always studio swing
+			return ((luma - 16)*255/219) * 0x010101;
+		}
+		break;
+
+	case nsVDPixmap::kPixFormat_YUV444_Planar:
+		// chroma is full resolution: sample all three planes directly
+		return VDConvertYCbCrToRGB(VDPixmapSample8(px.data, px.pitch, x, y), VDPixmapSample8(px.data2, px.pitch2, x, y), VDPixmapSample8(px.data3, px.pitch3, x, y));
+
+	case nsVDPixmap::kPixFormat_YUV422_Planar:
+		{
+			// chroma: half horizontal resolution, full vertical (coords in 24.8)
+			// NOTE(review): v has no +128 center bias, unlike u — confirm intended
+			sint32 u = (x << 7) + 128;
+			sint32 v = (y << 8);
+			uint32 w2 = px.w >> 1;
+			uint32 h2 = px.h;
+
+			return VDConvertYCbCrToRGB(
+				VDPixmapSample8(px.data, px.pitch, x, y),
+				VDPixmapInterpolateSample8(px.data2, px.pitch2, w2, h2, u, v),
+				VDPixmapInterpolateSample8(px.data3, px.pitch3, w2, h2, u, v));
+		}
+
+	case nsVDPixmap::kPixFormat_YUV420_Planar:
+		{
+			// chroma: half resolution in both axes
+			sint32 u = (x << 7) + 128;
+			sint32 v = (y << 7);
+			uint32 w2 = px.w >> 1;
+			uint32 h2 = px.h >> 1;
+
+			return VDConvertYCbCrToRGB(
+				VDPixmapSample8(px.data, px.pitch, x, y),
+				VDPixmapInterpolateSample8(px.data2, px.pitch2, w2, h2, u, v),
+				VDPixmapInterpolateSample8(px.data3, px.pitch3, w2, h2, u, v));
+		}
+
+	case nsVDPixmap::kPixFormat_YUV411_Planar:
+		{
+			// chroma: quarter horizontal resolution, full vertical
+			sint32 u = (x << 6) + 128;
+			sint32 v = (y << 8);
+			uint32 w2 = px.w >> 2;
+			uint32 h2 = px.h;
+
+			return VDConvertYCbCrToRGB(
+				VDPixmapSample8(px.data, px.pitch, x, y),
+				VDPixmapInterpolateSample8(px.data2, px.pitch2, w2, h2, u, v),
+				VDPixmapInterpolateSample8(px.data3, px.pitch3, w2, h2, u, v));
+		}
+
+	case nsVDPixmap::kPixFormat_YUV410_Planar:
+		{
+			// chroma: quarter resolution in both axes
+			sint32 u = (x << 6) + 128;
+			sint32 v = (y << 6);
+			uint32 w2 = px.w >> 2;
+			uint32 h2 = px.h >> 2;
+
+			return VDConvertYCbCrToRGB(
+				VDPixmapSample8(px.data, px.pitch, x, y),
+				VDPixmapInterpolateSample8(px.data2, px.pitch2, w2, h2, u, v),
+				VDPixmapInterpolateSample8(px.data3, px.pitch3, w2, h2, u, v));
+		}
+
+	default:
+		// other formats: bilinear-sample at the center of pixel (x, y)
+		return VDPixmapInterpolateSampleRGB24(px, (x << 8) + 128, (y << 8) + 128);
+	}
+}
+
+// Bilinearly samples an 8-bit plane at 24.8 fixed-point coordinates
+// (128 = center of pixel 0), clamping to the plane edges, and returns the
+// rounded 8-bit result.
+uint8 VDPixmapInterpolateSample8(const void *data, ptrdiff_t pitch, uint32 w, uint32 h, sint32 x_256, sint32 y_256) {
+	// bias coordinates to integer
+	x_256 -= 128;
+	y_256 -= 128;
+
+	// clamp coordinates (branchless clamp to [0, (w-1)<<8] x [0, (h-1)<<8])
+	x_256 &= ~(x_256 >> 31);
+	y_256 &= ~(y_256 >> 31);
+
+	uint32 w_256 = (w - 1) << 8;
+	uint32 h_256 = (h - 1) << 8;
+	x_256 ^= (x_256 ^ w_256) & ((x_256 - w_256) >> 31);
+	y_256 ^= (y_256 ^ h_256) & ((y_256 - h_256) >> 31);
+
+	// Fix: the horizontal integer offset (x_256 >> 8) was missing here, so
+	// the sample was always taken from columns 0/1 regardless of x_256
+	// (compare the VDPixmapInterpolateSample8*To24 variants below).
+	const uint8 *row0 = (const uint8 *)data + pitch * (y_256 >> 8) + (x_256 >> 8);
+	const uint8 *row1 = row0;
+
+	if ((uint32)y_256 < h_256)
+		row1 += pitch;
+
+	// step is 0 on the last column so the right edge is not overread
+	ptrdiff_t xstep = (uint32)x_256 < w_256 ? 1 : 0;
+	sint32 xoffset = x_256 & 255;
+	sint32 yoffset = y_256 & 255;
+	sint32 p00 = row0[0];
+	sint32 p10 = row0[xstep];
+	sint32 p01 = row1[0];
+	sint32 p11 = row1[xstep];
+	sint32 p0 = (p00 << 8) + (p10 - p00)*xoffset;
+	sint32 p1 = (p01 << 8) + (p11 - p01)*xoffset;
+	sint32 p = ((p0 << 8) + (p1 - p0)*yoffset + 0x8000) >> 16;	// +0x8000 rounds 16.16 -> int
+
+	return (uint8)p;
+}
+
+// Bilinearly samples an 8-bit plane at 24.8 fixed-point coordinates
+// (128 = pixel center), clamping to the edges, and returns the sample in
+// 16.16 fixed point (0..255 << 16) without rounding.
+uint32 VDPixmapInterpolateSample8To24(const void *data, ptrdiff_t pitch, uint32 w, uint32 h, sint32 x_256, sint32 y_256) {
+	// bias coordinates to integer
+	x_256 -= 128;
+	y_256 -= 128;
+
+	// clamp coordinates (branchless clamp to [0, (w-1)<<8] x [0, (h-1)<<8])
+	x_256 &= ~(x_256 >> 31);
+	y_256 &= ~(y_256 >> 31);
+
+	uint32 w_256 = (w - 1) << 8;
+	uint32 h_256 = (h - 1) << 8;
+	x_256 ^= (x_256 ^ w_256) & ((x_256 - w_256) >> 31);
+	y_256 ^= (y_256 ^ h_256) & ((y_256 - h_256) >> 31);
+
+	const uint8 *row0 = (const uint8 *)data + pitch * (y_256 >> 8) + (x_256 >> 8);
+	const uint8 *row1 = row0;
+
+	if ((uint32)y_256 < h_256)
+		row1 += pitch;
+
+	// step is 0 on the last column/row so the edges are not overread
+	ptrdiff_t xstep = (uint32)x_256 < w_256 ? 1 : 0;
+	sint32 xoffset = x_256 & 255;
+	sint32 yoffset = y_256 & 255;
+	sint32 p00 = row0[0];
+	sint32 p10 = row0[xstep];
+	sint32 p01 = row1[0];
+	sint32 p11 = row1[xstep];
+	sint32 p0 = (p00 << 8) + (p10 - p00)*xoffset;
+	sint32 p1 = (p01 << 8) + (p11 - p01)*xoffset;
+	sint32 p = (p0 << 8) + (p1 - p0)*yoffset;
+
+	return p;
+}
+
+// Same as VDPixmapInterpolateSample8To24, but for 8-bit samples interleaved
+// two bytes apart (e.g. the luma bytes of a YUYV/UYVY surface). Returns the
+// sample in 16.16 fixed point without rounding.
+uint32 VDPixmapInterpolateSample8x2To24(const void *data, ptrdiff_t pitch, uint32 w, uint32 h, sint32 x_256, sint32 y_256) {
+	// bias coordinates to integer
+	x_256 -= 128;
+	y_256 -= 128;
+
+	// clamp coordinates (branchless clamp to [0, (w-1)<<8] x [0, (h-1)<<8])
+	x_256 &= ~(x_256 >> 31);
+	y_256 &= ~(y_256 >> 31);
+
+	uint32 w_256 = (w - 1) << 8;
+	uint32 h_256 = (h - 1) << 8;
+	x_256 ^= (x_256 ^ w_256) & ((x_256 - w_256) >> 31);
+	y_256 ^= (y_256 ^ h_256) & ((y_256 - h_256) >> 31);
+
+	// *2: two bytes per horizontal sample
+	const uint8 *row0 = (const uint8 *)data + pitch * (y_256 >> 8) + (x_256 >> 8)*2;
+	const uint8 *row1 = row0;
+
+	if ((uint32)y_256 < h_256)
+		row1 += pitch;
+
+	ptrdiff_t xstep = (uint32)x_256 < w_256 ? 2 : 0;
+	sint32 xoffset = x_256 & 255;
+	sint32 yoffset = y_256 & 255;
+	sint32 p00 = row0[0];
+	sint32 p10 = row0[xstep];
+	sint32 p01 = row1[0];
+	sint32 p11 = row1[xstep];
+	sint32 p0 = (p00 << 8) + (p10 - p00)*xoffset;
+	sint32 p1 = (p01 << 8) + (p11 - p01)*xoffset;
+	sint32 p = (p0 << 8) + (p1 - p0)*yoffset;
+
+	return p;
+}
+
+// Same as VDPixmapInterpolateSample8To24, but for 8-bit samples interleaved
+// four bytes apart (e.g. one channel of an XRGB8888/XVYU surface, or the
+// chroma bytes of a YUYV/UYVY surface). Returns 16.16 fixed point.
+uint32 VDPixmapInterpolateSample8x4To24(const void *data, ptrdiff_t pitch, uint32 w, uint32 h, sint32 x_256, sint32 y_256) {
+	// bias coordinates to integer
+	x_256 -= 128;
+	y_256 -= 128;
+
+	// clamp coordinates (branchless clamp to [0, (w-1)<<8] x [0, (h-1)<<8])
+	x_256 &= ~(x_256 >> 31);
+	y_256 &= ~(y_256 >> 31);
+
+	uint32 w_256 = (w - 1) << 8;
+	uint32 h_256 = (h - 1) << 8;
+	x_256 ^= (x_256 ^ w_256) & ((x_256 - w_256) >> 31);
+	y_256 ^= (y_256 ^ h_256) & ((y_256 - h_256) >> 31);
+
+	// *4: four bytes per horizontal sample
+	const uint8 *row0 = (const uint8 *)data + pitch * (y_256 >> 8) + (x_256 >> 8)*4;
+	const uint8 *row1 = row0;
+
+	if ((uint32)y_256 < h_256)
+		row1 += pitch;
+
+	ptrdiff_t xstep = (uint32)x_256 < w_256 ? 4 : 0;
+	sint32 xoffset = x_256 & 255;
+	sint32 yoffset = y_256 & 255;
+	sint32 p00 = row0[0];
+	sint32 p10 = row0[xstep];
+	sint32 p01 = row1[0];
+	sint32 p11 = row1[xstep];
+	sint32 p0 = (p00 << 8) + (p10 - p00)*xoffset;
+	sint32 p1 = (p01 << 8) + (p11 - p01)*xoffset;
+	sint32 p = (p0 << 8) + (p1 - p0)*yoffset;
+
+	return p;
+}
+
+// Bilinearly samples a plane of 16-bit half-float values at 24.8 fixed-point
+// coordinates, clamping to the edges, and returns the result as float.
+// NOTE(review): the fractional weight is scaled by 1/255 rather than 1/256,
+// giving a slight upward bias versus the integer paths — confirm intended.
+float VDPixmapInterpolateSample16F(const void *data, ptrdiff_t pitch, uint32 w, uint32 h, sint32 x_256, sint32 y_256) {
+	// bias coordinates to integer
+	x_256 -= 128;
+	y_256 -= 128;
+
+	// clamp coordinates (branchless clamp to [0, (w-1)<<8] x [0, (h-1)<<8])
+	x_256 &= ~(x_256 >> 31);
+	y_256 &= ~(y_256 >> 31);
+
+	uint32 w_256 = (w - 1) << 8;
+	uint32 h_256 = (h - 1) << 8;
+	x_256 ^= (x_256 ^ w_256) & ((x_256 - w_256) >> 31);
+	y_256 ^= (y_256 ^ h_256) & ((y_256 - h_256) >> 31);
+
+	// *2: two bytes per half-float sample
+	const uint16 *row0 = (const uint16 *)((const uint8 *)data + pitch * (y_256 >> 8) + (x_256 >> 8)*2);
+	const uint16 *row1 = row0;
+
+	if ((uint32)y_256 < h_256)
+		row1 = (const uint16 *)((const char *)row1 + pitch);
+
+	ptrdiff_t xstep = (uint32)x_256 < w_256 ? 1 : 0;
+	float xoffset = (float)(x_256 & 255) * (1.0f / 255.0f);
+	float yoffset = (float)(y_256 & 255) * (1.0f / 255.0f);
+
+	float p00;
+	float p10;
+	float p01;
+	float p11;
+	VDConvertHalfToFloat(row0[0], &p00);
+	VDConvertHalfToFloat(row0[xstep], &p10);
+	VDConvertHalfToFloat(row1[0], &p01);
+	VDConvertHalfToFloat(row1[xstep], &p11);
+
+	float p0 = p00 + (p10 - p00)*xoffset;
+	float p1 = p01 + (p11 - p01)*xoffset;
+
+	return p0 + (p1 - p0)*yoffset;
+}
+
+// Internal helpers for VDPixmapInterpolateSampleRGB24.
+namespace {
+	// Bilinearly blends four packed 8888 pixels with 8-bit fractions xf/yf,
+	// processing the red/blue and alpha/green channel pairs in parallel.
+	// p0/p1 are the top row, p2/p3 the bottom row.
+	uint32 Lerp8888(uint32 p0, uint32 p1, uint32 p2, uint32 p3, uint32 xf, uint32 yf) {
+		uint32 rb0 = p0 & 0x00ff00ff;
+		uint32 ag0 = p0 & 0xff00ff00;
+		uint32 rb1 = p1 & 0x00ff00ff;
+		uint32 ag1 = p1 & 0xff00ff00;
+		uint32 rb2 = p2 & 0x00ff00ff;
+		uint32 ag2 = p2 & 0xff00ff00;
+		uint32 rb3 = p3 & 0x00ff00ff;
+		uint32 ag3 = p3 & 0xff00ff00;
+
+		// horizontal lerp on each row, then vertical lerp between rows;
+		// +0x00800080 rounds each channel
+		uint32 rbt = (rb0 + ((( rb1 - rb0 )*xf + 0x00800080) >> 8)) & 0x00ff00ff;
+		uint32 agt = (ag0 + ((((ag1 >> 8) - (ag0 >> 8))*xf + 0x00800080) )) & 0xff00ff00;
+		uint32 rbb = (rb2 + ((( rb3 - rb2 )*xf + 0x00800080) >> 8)) & 0x00ff00ff;
+		uint32 agb = (ag2 + ((((ag3 >> 8) - (ag2 >> 8))*xf + 0x00800080) )) & 0xff00ff00;
+		uint32 rb = (rbt + ((( rbb - rbt )*yf + 0x00800080) >> 8)) & 0x00ff00ff;
+		uint32 ag = (agt + ((((agb >> 8) - (agt >> 8))*yf + 0x00800080) )) & 0xff00ff00;
+
+		return rb + ag;
+	}
+
+	// Bilinearly samples a Y8 plane at (x1, y1) in 24.8, rescales studio
+	// swing to full range (0x100000 = 16<<16), and splats to gray 0xRRGGBB.
+	uint32 InterpPlanarY8(const VDPixmap& px, sint32 x1, sint32 y1) {
+		sint32 y = VDPixmapInterpolateSample8To24(px.data, px.pitch, px.w, px.h, x1, y1);
+
+		return VDClampedRoundFixedToUint8Fast((float)(y-0x100000) * (1.1643836f/65536.0f/255.0f))*0x010101;
+	}
+
+	// Bilinearly samples a planar YCbCr surface (luma at x1/y1, chroma at
+	// x23/y23 over a w23 x h23 plane) and converts to 0xRRGGBB via the
+	// Rec.601 matrix below (coefficients in the comment).
+	uint32 InterpPlanarYCC888(const VDPixmap& px, sint32 x1, sint32 y1, sint32 x23, sint32 y23, uint32 w23, uint32 h23) {
+		float y = (float)(sint32)VDPixmapInterpolateSample8To24(px.data, px.pitch, px.w, px.h, x1, y1);
+		float cb = (float)(sint32)VDPixmapInterpolateSample8To24(px.data2, px.pitch2, w23, h23, x23, y23);
+		float cr = (float)(sint32)VDPixmapInterpolateSample8To24(px.data3, px.pitch3, w23, h23, x23, y23);
+
+		// ! 1.1643836 - 5.599D-17 1.5960268 - 222.92157 !
+		// ! 1.1643836 - 0.3917623 - 0.8129676 135.57529 !
+		// ! 1.1643836 2.0172321 - 1.110D-16 - 276.83585 !
+		uint32 ir = VDClampedRoundFixedToUint8Fast((1.1643836f/65536.0f/255.0f)*y + (1.5960268f/65536.0f/255.0f)*cr - (222.92157f / 255.0f));
+		uint32 ig = VDClampedRoundFixedToUint8Fast((1.1643836f/65536.0f/255.0f)*y - (0.3917623f/65536.0f/255.0f)*cb - (0.8129676f/65536.0f/255.0f)*cr + (135.57529f / 255.0f));
+		uint32 ib = VDClampedRoundFixedToUint8Fast((1.1643836f/65536.0f/255.0f)*y + (2.0172321f/65536.0f/255.0f)*cb - (276.83585f / 255.0f));
+
+		return (ir << 16) + (ig << 8) + ib;
+	}
+
+	// Converts a YCbCr triplet in 16.16-scaled 8-bit range (as returned by
+	// the *To24 samplers) to packed 0xRRGGBB using the Rec.601 matrix.
+	uint32 ConvertYCC72ToRGB24(sint32 iy, sint32 icb, sint32 icr) {
+		float y = (float)iy;
+		float cb = (float)icb;
+		float cr = (float)icr;
+
+		// ! 1.1643836 - 5.599D-17 1.5960268 - 222.92157 !
+		// ! 1.1643836 - 0.3917623 - 0.8129676 135.57529 !
+		// ! 1.1643836 2.0172321 - 1.110D-16 - 276.83585 !
+		uint32 ir = VDClampedRoundFixedToUint8Fast((1.1643836f/65536.0f/255.0f)*y + (1.5960268f/65536.0f/255.0f)*cr - (222.92157f / 255.0f));
+		uint32 ig = VDClampedRoundFixedToUint8Fast((1.1643836f/65536.0f/255.0f)*y - (0.3917623f/65536.0f/255.0f)*cb - (0.8129676f/65536.0f/255.0f)*cr + (135.57529f / 255.0f));
+		uint32 ib = VDClampedRoundFixedToUint8Fast((1.1643836f/65536.0f/255.0f)*y + (2.0172321f/65536.0f/255.0f)*cb - (276.83585f / 255.0f));
+
+		return (ir << 16) + (ig << 8) + ib;
+	}
+
+	// Same as ConvertYCC72ToRGB24, but with the Rec.709 matrix.
+	uint32 ConvertYCC72ToRGB24_709(sint32 iy, sint32 icb, sint32 icr) {
+		float y = (float)iy;
+		float cb = (float)icb;
+		float cr = (float)icr;
+
+		// ! 1.1643836 - 2.932D-17 1.7927411 - 248.10099 !
+		// ! 1.1643836 - 0.2132486 - 0.5329093 76.87808 !
+		// ! 1.1643836 2.1124018 - 5.551D-17 - 289.01757 !
+		uint32 ir = VDClampedRoundFixedToUint8Fast((1.1643836f/65536.0f/255.0f)*y + (1.7927411f/65536.0f/255.0f)*cr - (248.10099f / 255.0f));
+		uint32 ig = VDClampedRoundFixedToUint8Fast((1.1643836f/65536.0f/255.0f)*y - (0.2132486f/65536.0f/255.0f)*cb - (0.5329093f/65536.0f/255.0f)*cr + (76.87808f / 255.0f));
+		uint32 ib = VDClampedRoundFixedToUint8Fast((1.1643836f/65536.0f/255.0f)*y + (2.1124018f/65536.0f/255.0f)*cb - (289.01757f / 255.0f));
+
+		return (ir << 16) + (ig << 8) + ib;
+	}
+
+	// Extracts one 10-bit luma sample from a v210 surface (6 luma samples
+	// packed into each group of four dwords). w/h are unused; x/y must be
+	// pre-clamped by the caller.
+	uint32 SampleV210_Y(const void *src, ptrdiff_t srcpitch, sint32 x, sint32 y, uint32 w, uint32 h) {
+		const uint32 *p = (const uint32 *)((const char *)src + srcpitch*y) + (x / 6)*4;
+
+		switch((uint32)x % 6) {
+		default:
+		case 0:	return (p[0] >> 10) & 0x3ff;
+		case 1:	return (p[1] >>  0) & 0x3ff;
+		case 2:	return (p[1] >> 20) & 0x3ff;
+		case 3:	return (p[2] >> 10) & 0x3ff;
+		case 4:	return (p[3] >>  0) & 0x3ff;
+		case 5:	return (p[3] >> 20) & 0x3ff;
+		}
+	}
+
+	// Extracts one 10-bit Cb sample from a v210 surface (3 chroma pairs per
+	// four-dword group). x is in chroma samples (half luma resolution).
+	uint32 SampleV210_Cb(const void *src, ptrdiff_t srcpitch, sint32 x, sint32 y, uint32 w, uint32 h) {
+		const uint32 *p = (const uint32 *)((const char *)src + srcpitch*y) + (x / 3)*4;
+
+		switch((uint32)x % 3) {
+		default:
+		case 0:	return (p[0] >>  0) & 0x3ff;
+		case 1:	return (p[1] >> 10) & 0x3ff;
+		case 2:	return (p[2] >> 20) & 0x3ff;
+		}
+	}
+
+	// Extracts one 10-bit Cr sample from a v210 surface.
+	uint32 SampleV210_Cr(const void *src, ptrdiff_t srcpitch, sint32 x, sint32 y, uint32 w, uint32 h) {
+		const uint32 *p = (const uint32 *)((const char *)src + srcpitch*y) + (x / 3)*4;
+
+		switch((uint32)x % 3) {
+		default:
+		case 0:	return (p[0] >> 20) & 0x3ff;
+		case 1:	return (p[2] >>  0) & 0x3ff;
+		case 2:	return (p[3] >> 10) & 0x3ff;
+		}
+	}
+}
+
+// Bilinearly samples any supported pixmap format at 24.8 fixed-point
+// coordinates (128 = center of pixel 0) and returns packed 0x00RRGGBB.
+// RGB/paletted formats interpolate four point samples; YCbCr formats
+// interpolate luma and chroma at their native resolutions and convert
+// via the Rec.601 (or Rec.709, where named) matrix. Unsupported formats
+// return 0 (black).
+uint32 VDPixmapInterpolateSampleRGB24(const VDPixmap& px, sint32 x_256, sint32 y_256) {
+	switch(px.format) {
+	case nsVDPixmap::kPixFormat_Pal1:
+	case nsVDPixmap::kPixFormat_Pal2:
+	case nsVDPixmap::kPixFormat_Pal4:
+	case nsVDPixmap::kPixFormat_Pal8:
+	case nsVDPixmap::kPixFormat_RGB565:
+	case nsVDPixmap::kPixFormat_RGB888:
+	case nsVDPixmap::kPixFormat_XRGB1555:
+	case nsVDPixmap::kPixFormat_XRGB8888:
+		{
+			// take four clamped point samples and blend them in 8888 space
+			x_256 -= 128;
+			y_256 -= 128;
+			int ix = x_256 >> 8;
+			int iy = y_256 >> 8;
+			uint32 p0 = VDPixmapSample(px, ix, iy);
+			uint32 p1 = VDPixmapSample(px, ix+1, iy);
+			uint32 p2 = VDPixmapSample(px, ix, iy+1);
+			uint32 p3 = VDPixmapSample(px, ix+1, iy+1);
+
+			return Lerp8888(p0, p1, p2, p3, x_256 & 255, y_256 & 255);
+		}
+		break;
+
+	case nsVDPixmap::kPixFormat_Y8:
+		return InterpPlanarY8(px, x_256, y_256);
+
+	case nsVDPixmap::kPixFormat_YUV422_UYVY:
+		// UYVY byte order: U0 Y0 V0 Y1 — luma every 2 bytes at +1,
+		// chroma every 4 bytes at +0/+2, half horizontal resolution
+		return ConvertYCC72ToRGB24(
+			VDPixmapInterpolateSample8x2To24((const char *)px.data + 1, px.pitch, px.w, px.h, x_256, y_256),
+			VDPixmapInterpolateSample8x4To24((const char *)px.data + 0, px.pitch, (px.w + 1) >> 1, px.h, (x_256 >> 1) + 128, y_256),
+			VDPixmapInterpolateSample8x4To24((const char *)px.data + 2, px.pitch, (px.w + 1) >> 1, px.h, (x_256 >> 1) + 128, y_256)
+			);
+
+	case nsVDPixmap::kPixFormat_YUV422_YUYV:
+		// YUYV byte order: Y0 U0 Y1 V0
+		return ConvertYCC72ToRGB24(
+			VDPixmapInterpolateSample8x2To24((const char *)px.data + 0, px.pitch, px.w, px.h, x_256, y_256),
+			VDPixmapInterpolateSample8x4To24((const char *)px.data + 1, px.pitch, (px.w + 1) >> 1, px.h, (x_256 >> 1) + 128, y_256),
+			VDPixmapInterpolateSample8x4To24((const char *)px.data + 3, px.pitch, (px.w + 1) >> 1, px.h, (x_256 >> 1) + 128, y_256)
+			);
+
+	case nsVDPixmap::kPixFormat_YUV444_XVYU:
+		// packed 4:4:4 — all channels at full resolution, 4 bytes/pixel
+		return ConvertYCC72ToRGB24(
+			VDPixmapInterpolateSample8x4To24((const char *)px.data + 1, px.pitch, px.w, px.h, x_256, y_256),
+			VDPixmapInterpolateSample8x4To24((const char *)px.data + 0, px.pitch, px.w, px.h, x_256, y_256),
+			VDPixmapInterpolateSample8x4To24((const char *)px.data + 2, px.pitch, px.w, px.h, x_256, y_256)
+			);
+
+	case nsVDPixmap::kPixFormat_YUV422_UYVY_709:
+		return ConvertYCC72ToRGB24_709(
+			VDPixmapInterpolateSample8x2To24((const char *)px.data + 1, px.pitch, px.w, px.h, x_256, y_256),
+			VDPixmapInterpolateSample8x4To24((const char *)px.data + 0, px.pitch, (px.w + 1) >> 1, px.h, (x_256 >> 1) + 128, y_256),
+			VDPixmapInterpolateSample8x4To24((const char *)px.data + 2, px.pitch, (px.w + 1) >> 1, px.h, (x_256 >> 1) + 128, y_256)
+			);
+
+	case nsVDPixmap::kPixFormat_YUV420_NV12:
+		// NV12: full-res luma plane + interleaved CbCr plane at half res
+		return ConvertYCC72ToRGB24(
+			VDPixmapInterpolateSample8To24(px.data, px.pitch, px.w, px.h, x_256, y_256),
+			VDPixmapInterpolateSample8x2To24((const char *)px.data2 + 0, px.pitch2, (px.w + 1) >> 1, (px.h + 1) >> 1, (x_256 >> 1) + 128, y_256 >> 1),
+			VDPixmapInterpolateSample8x2To24((const char *)px.data2 + 1, px.pitch2, (px.w + 1) >> 1, (px.h + 1) >> 1, (x_256 >> 1) + 128, y_256 >> 1)
+			);
+
+	case nsVDPixmap::kPixFormat_YUV444_Planar:
+		return InterpPlanarYCC888(px, x_256, y_256, x_256, y_256, px.w, px.h);
+
+	case nsVDPixmap::kPixFormat_YUV422_Planar:
+		return InterpPlanarYCC888(px, x_256, y_256, (x_256 >> 1) + 128, y_256, (px.w + 1) >> 1, px.h);
+
+	case nsVDPixmap::kPixFormat_YUV411_Planar:
+		return InterpPlanarYCC888(px, x_256, y_256, (x_256 >> 2) + 128, y_256, (px.w + 3) >> 2, px.h);
+
+	case nsVDPixmap::kPixFormat_YUV420_Planar:
+		return InterpPlanarYCC888(px, x_256, y_256, (x_256 >> 1) + 128, y_256 >> 1, (px.w + 1) >> 1, (px.h + 1) >> 1);
+
+	case nsVDPixmap::kPixFormat_YUV410_Planar:
+		return InterpPlanarYCC888(px, x_256, y_256, (x_256 >> 2) + 128, y_256 >> 2, (px.w + 3) >> 2, (px.h + 3) >> 2);
+
+	case nsVDPixmap::kPixFormat_YUV420_Planar_Centered:
+		// centered chroma siting: no +128 horizontal bias
+		return InterpPlanarYCC888(px, x_256, y_256, x_256 >> 1, y_256 >> 1, (px.w + 1) >> 1, (px.h + 1) >> 1);
+
+	case nsVDPixmap::kPixFormat_YUV422_Planar_Centered:
+		return InterpPlanarYCC888(px, x_256, y_256, x_256 >> 1, y_256, (px.w + 1) >> 1, px.h);
+
+	case nsVDPixmap::kPixFormat_YUV422_Planar_16F:
+		{
+			// half-float planes; samples are already normalized floats
+			float y = VDPixmapInterpolateSample16F(px.data, px.pitch, px.w, px.h, x_256, y_256);
+			float cb = VDPixmapInterpolateSample16F(px.data2, px.pitch2, (px.w + 1) >> 1, px.h, (x_256 >> 1) + 128, y_256);
+			float cr = VDPixmapInterpolateSample16F(px.data3, px.pitch3, (px.w + 1) >> 1, px.h, (x_256 >> 1) + 128, y_256);
+
+			uint32 ir = VDClampedRoundFixedToUint8Fast(1.1643836f*y + 1.5960268f*cr - (222.92157f / 255.0f));
+			uint32 ig = VDClampedRoundFixedToUint8Fast(1.1643836f*y - 0.3917623f*cb - 0.8129676f*cr + (135.57529f / 255.0f));
+			uint32 ib = VDClampedRoundFixedToUint8Fast(1.1643836f*y + 2.0172321f*cb - (276.83585f / 255.0f));
+
+			return (ir << 16) + (ig << 8) + ib;
+		}
+
+	case nsVDPixmap::kPixFormat_YUV422_V210:
+		{
+			// 10-bit 4:2:2; no generic byte-offset sampler fits the packing,
+			// so bilinear filtering is done manually in float.
+			sint32 luma_x = x_256 - 128;
+			sint32 luma_y = y_256 - 128;
+
+			if (luma_x < 0)
+				luma_x = 0;
+
+			if (luma_y < 0)
+				luma_y = 0;
+
+			if (luma_x > (sint32)((px.w - 1) << 8))
+				luma_x = (sint32)((px.w - 1) << 8);
+
+			if (luma_y > (sint32)((px.h - 1) << 8))
+				luma_y = (sint32)((px.h - 1) << 8);
+
+			sint32 luma_ix = luma_x >> 8;
+			sint32 luma_iy = luma_y >> 8;
+			float luma_fx = (float)(luma_x & 255) * (1.0f / 255.0f);
+			float luma_fy = (float)(luma_y & 255) * (1.0f / 255.0f);
+
+			float y0 = SampleV210_Y(px.data, px.pitch, luma_ix+0, luma_iy+0, px.w, px.h) * (1.0f / 1023.0f);
+			float y1 = SampleV210_Y(px.data, px.pitch, luma_ix+1, luma_iy+0, px.w, px.h) * (1.0f / 1023.0f);
+			float y2 = SampleV210_Y(px.data, px.pitch, luma_ix+0, luma_iy+1, px.w, px.h) * (1.0f / 1023.0f);
+			float y3 = SampleV210_Y(px.data, px.pitch, luma_ix+1, luma_iy+1, px.w, px.h) * (1.0f / 1023.0f);
+			float yt = y0 + (y1 - y0)*luma_fx;
+			float yb = y2 + (y3 - y2)*luma_fx;
+			float yr = yt + (yb - yt)*luma_fy;
+
+			// chroma grid: half horizontal resolution, full vertical
+			uint32 chroma_w = (px.w + 1) >> 1;
+			uint32 chroma_h = px.h;
+			sint32 chroma_x = x_256 >> 1;
+			sint32 chroma_y = y_256 - 128;
+
+			if (chroma_x < 0)
+				chroma_x = 0;
+
+			if (chroma_y < 0)
+				chroma_y = 0;
+
+			if (chroma_x > (sint32)((chroma_w - 1) << 8))
+				chroma_x = (sint32)((chroma_w - 1) << 8);
+
+			if (chroma_y > (sint32)((chroma_h - 1) << 8))
+				chroma_y = (sint32)((chroma_h - 1) << 8);
+
+			sint32 chroma_ix = chroma_x >> 8;
+			sint32 chroma_iy = chroma_y >> 8;
+			float chroma_fx = (float)(chroma_x & 255) * (1.0f / 255.0f);
+			float chroma_fy = (float)(chroma_y & 255) * (1.0f / 255.0f);
+
+			float cb0 = SampleV210_Cb(px.data, px.pitch, chroma_ix+0, chroma_iy+0, px.w, px.h) * (1.0f / 1023.0f);
+			float cb1 = SampleV210_Cb(px.data, px.pitch, chroma_ix+1, chroma_iy+0, px.w, px.h) * (1.0f / 1023.0f);
+			float cb2 = SampleV210_Cb(px.data, px.pitch, chroma_ix+0, chroma_iy+1, px.w, px.h) * (1.0f / 1023.0f);
+			float cb3 = SampleV210_Cb(px.data, px.pitch, chroma_ix+1, chroma_iy+1, px.w, px.h) * (1.0f / 1023.0f);
+			float cbt = cb0 + (cb1 - cb0)*chroma_fx;
+			float cbb = cb2 + (cb3 - cb2)*chroma_fx;
+			float cbr = cbt + (cbb - cbt)*chroma_fy;
+
+			float cr0 = SampleV210_Cr(px.data, px.pitch, chroma_ix+0, chroma_iy+0, px.w, px.h) * (1.0f / 1023.0f);
+			float cr1 = SampleV210_Cr(px.data, px.pitch, chroma_ix+1, chroma_iy+0, px.w, px.h) * (1.0f / 1023.0f);
+			float cr2 = SampleV210_Cr(px.data, px.pitch, chroma_ix+0, chroma_iy+1, px.w, px.h) * (1.0f / 1023.0f);
+			float cr3 = SampleV210_Cr(px.data, px.pitch, chroma_ix+1, chroma_iy+1, px.w, px.h) * (1.0f / 1023.0f);
+			float crt = cr0 + (cr1 - cr0)*chroma_fx;
+			float crb = cr2 + (cr3 - cr2)*chroma_fx;
+			float crr = crt + (crb - crt)*chroma_fy;
+
+			uint32 ir = VDClampedRoundFixedToUint8Fast(1.1643836f*yr + 1.5960268f*crr - (222.92157f / 255.0f));
+			uint32 ig = VDClampedRoundFixedToUint8Fast(1.1643836f*yr - 0.3917623f*cbr - 0.8129676f*crr + (135.57529f / 255.0f));
+			uint32 ib = VDClampedRoundFixedToUint8Fast(1.1643836f*yr + 2.0172321f*cbr - (276.83585f / 255.0f));
+
+			return (ir << 16) + (ig << 8) + ib;
+		}
+		break;
+
+	default:
+		// unsupported formats sample as black
+		return 0;
+	}
+}
+
+// Converts one studio-swing YCbCr triplet to packed 0x00RRGGBB using 16.16
+// fixed-point Rec.601 coefficients (76309/65536 = 1.164, 104597 = 1.596,
+// 53279 = 0.813, 25674 = 0.392, 132201 = 2.017) with branchless clamping
+// of each channel to [0, 255].
+uint32 VDConvertYCbCrToRGB(uint8 y0, uint8 cb0, uint8 cr0) {
+	sint32 y = y0 - 16;
+	sint32 cb = cb0 - 128;
+	sint32 cr = cr0 - 128;
+
+	sint32 y2 = y * 76309 + 0x8000;		// +0x8000 pre-rounds the 16.16 result
+	sint32 r = y2 + cr * 104597;
+	sint32 g = y2 + cr * -53279 + cb * -25674;
+	sint32 b = y2 + cb * 132201;
+
+	// branchless clamp of each 16.16 value to [0, 0xffffff]
+	r &= ~(r >> 31);
+	g &= ~(g >> 31);
+	b &= ~(b >> 31);
+	r += (0xffffff - r) & ((0xffffff - r) >> 31);
+	g += (0xffffff - g) & ((0xffffff - g) >> 31);
+	b += (0xffffff - b) & ((0xffffff - b) >> 31);
+
+	// keep the integer byte of each channel and pack
+	return (r & 0xff0000) + ((g & 0xff0000) >> 8) + (b >> 16);
+}
+
+// Convenience overload: unpacks 0x00RRGGBB and forwards to the three-channel
+// converter below.
+uint32 VDConvertRGBToYCbCr(uint32 c) {
+	return VDConvertRGBToYCbCr((uint8)(c >> 16), (uint8)(c >> 8), (uint8)c);
+}
+
+// Converts one full-range RGB triplet to packed YCbCr: Cb in bits 0-7,
+// Y in bits 8-15, Cr in bits 16-23. Coefficients 1052/2065/401 are the
+// luma weights in 4.12 fixed point (presumably Rec.601 studio swing —
+// they match 0.257/0.504/0.098 * 4096; confirm against the decoder side).
+uint32 VDConvertRGBToYCbCr(uint8 r8, uint8 g8, uint8 b8) {
+	sint32 r = r8;
+	sint32 g = g8;
+	sint32 b = b8;
+	sint32 yt = 1052*r + 2065*g + 401*b;
+	sint32 y = (yt + 0x10800) >> 4;		// add offset; Y byte lands in bits 8-15
+	// Fix: the Cr intermediates (10507932*r and yt*2987 reach ~2.68e9) exceed
+	// INT_MAX and overflowed signed 32-bit arithmetic (undefined behavior).
+	// Computing in unsigned arithmetic is well-defined modulo 2^32 and yields
+	// the identical bit pattern; 0x80800000 supplies the +128 chroma offset
+	// plus rounding before the shift.
+	sint32 cr = (sint32)((10507932U*r - 2987U*yt + 0x80800000U) >> 8);
+	sint32 cb = (sint32)(( 8312025U*b - 2363U*yt + 0x80800000U) >> 24);
+
+	return (uint8)cb + (y & 0xff00) + (cr&0xff0000);
+} \ No newline at end of file
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/pixmaputils.cpp b/src/thirdparty/VirtualDub/Kasumi/source/pixmaputils.cpp
new file mode 100644
index 000000000..635cbf3c0
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/pixmaputils.cpp
@@ -0,0 +1,519 @@
+#include <vd2/Kasumi/pixmaputils.h>
+#include <vd2/system/memory.h>
+
+extern VDPixmapFormatInfo g_vdPixmapFormats[] = {
+ // name qchnk qw qh qwb qhb qs ab aw ah as ps
+ /* Null */ { "null", false, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 },
+ /* Pal1 */ { "Pal1", true, 8, 1, 3, 0, 1, 0, 0, 0, 0, 2 },
+ /* Pal2 */ { "Pal2", true, 4, 1, 2, 0, 1, 0, 0, 0, 0, 4 },
+ /* Pal4 */ { "Pal4", true, 2, 1, 1, 0, 1, 0, 0, 0, 0, 16 },
+ /* Pal8 */ { "Pal8", false, 1, 1, 0, 0, 1, 0, 0, 0, 0, 256 },
+ /* RGB16_555 */ { "XRGB1555", false, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0 },
+ /* RGB16_565 */ { "RGB565", false, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0 },
+ /* RGB24 */ { "RGB888", false, 1, 1, 0, 0, 3, 0, 0, 0, 0, 0 },
+ /* RGB32 */ { "XRGB8888", false, 1, 1, 0, 0, 4, 0, 0, 0, 0, 0 },
+ /* Y8 */ { "Y8", false, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0 },
+ /* YUV422_UYVY */ { "UYVY", true, 2, 1, 1, 0, 4, 0, 0, 0, 0, 0 },
+ /* YUV422_YUYV */ { "YUYV", true, 2, 1, 1, 0, 4, 0, 0, 0, 0, 0 },
+ /* YUV444_XVYU */ { "XVYU", false, 1, 1, 0, 0, 4, 0, 0, 0, 0, 0 },
+ /* YUV444_Planar */ { "YUV444", false, 1, 1, 0, 0, 1, 2, 0, 0, 1, 0 },
+ /* YUV422_Planar */ { "YUV422", false, 1, 1, 0, 0, 1, 2, 1, 0, 1, 0 },
+ /* YUV420_Planar */ { "YUV420", false, 1, 1, 0, 0, 1, 2, 1, 1, 1, 0 },
+ /* YUV411_Planar */ { "YUV411", false, 1, 1, 0, 0, 1, 2, 2, 0, 1, 0 },
+ /* YUV410_Planar */ { "YUV410", false, 1, 1, 0, 0, 1, 2, 2, 2, 1, 0 },
+ /* YUV422_Planar_Centered */ { "YUV422C", false, 1, 1, 0, 0, 1, 2, 1, 0, 1, 0 },
+ /* YUV420_Planar_Centered */ { "YUV420C", false, 1, 1, 0, 0, 1, 2, 1, 1, 1, 0 },
+ /* YUV422_Planar_16F */ { "YUV422_16F", false, 1, 1, 0, 0, 2, 2, 1, 0, 2, 0 },
+ /* V210 */ { "v210", true,24, 1, 2, 0, 64, 0, 0, 0, 1, 0 },
+ /* YUV422_UYVY_709 */ { "UYVY-709", true, 2, 1, 1, 0, 4, 0, 0, 0, 0, 0 },
+ /* NV12 */ { "NV12", false, 1, 1, 0, 0, 1, 1, 1, 1, 2, 0 },
+};
+
+#ifdef _DEBUG
+ bool VDIsValidPixmapPlane(const void *p, ptrdiff_t pitch, vdpixsize w, vdpixsize h) {
+ bool isvalid;
+
+ if (pitch < 0)
+ isvalid = VDIsValidReadRegion((const char *)p + pitch*(h-1), (-pitch)*(h-1)+w);
+ else
+ isvalid = VDIsValidReadRegion(p, pitch*(h-1)+w);
+
+ if (!isvalid) {
+ VDDEBUG("Kasumi: Invalid pixmap plane detected.\n"
+ " Base=%p, pitch=%d, size=%dx%d (bytes)\n", p, (int)pitch, w, h);
+ }
+
+ return isvalid;
+ }
+
+ bool VDAssertValidPixmap(const VDPixmap& px) {
+ const VDPixmapFormatInfo& info = VDPixmapGetInfo(px.format);
+
+ if (px.format) {
+ if (!VDIsValidPixmapPlane(px.data, px.pitch, -(-px.w / info.qw)*info.qsize, -(-px.h >> info.qhbits))) {
+ VDDEBUG("Kasumi: Invalid primary plane detected in pixmap.\n"
+ " Pixmap info: format=%d (%s), dimensions=%dx%d\n", px.format, info.name, px.w, px.h);
+ VDASSERT(!"Kasumi: Invalid primary plane detected in pixmap.\n");
+ return false;
+ }
+
+ if (info.palsize)
+ if (!VDIsValidReadRegion(px.palette, sizeof(uint32) * info.palsize)) {
+ VDDEBUG("Kasumi: Invalid palette detected in pixmap.\n"
+ " Pixmap info: format=%d (%s), dimensions=%dx%d\n", px.format, info.name, px.w, px.h);
+ VDASSERT(!"Kasumi: Invalid palette detected in pixmap.\n");
+ return false;
+ }
+
+ if (info.auxbufs) {
+ const vdpixsize auxw = -(-px.w >> info.auxwbits);
+ const vdpixsize auxh = -(-px.h >> info.auxhbits);
+
+ if (!VDIsValidPixmapPlane(px.data2, px.pitch2, auxw * info.auxsize, auxh)) {
+ VDDEBUG("Kasumi: Invalid Cb plane detected in pixmap.\n"
+ " Pixmap info: format=%d (%s), dimensions=%dx%d\n", px.format, info.name, px.w, px.h);
+ VDASSERT(!"Kasumi: Invalid Cb plane detected in pixmap.\n");
+ return false;
+ }
+
+ if (info.auxbufs > 2) {
+ if (!VDIsValidPixmapPlane(px.data3, px.pitch3, auxw * info.auxsize, auxh)) {
+ VDDEBUG("Kasumi: Invalid Cr plane detected in pixmap.\n"
+ " Pixmap info: format=%d, dimensions=%dx%d\n", px.format, px.w, px.h);
+ VDASSERT(!"Kasumi: Invalid Cr plane detected in pixmap.\n");
+ return false;
+ }
+ }
+ }
+ }
+
+ return true;
+ }
+#endif
+
+VDPixmap VDPixmapOffset(const VDPixmap& src, vdpixpos x, vdpixpos y) {
+ VDPixmap temp(src);
+ const VDPixmapFormatInfo& info = VDPixmapGetInfo(temp.format);
+
+ if (info.qchunky) {
+ x = (x + info.qw - 1) / info.qw;
+ y >>= info.qhbits;
+ }
+
+ switch(info.auxbufs) {
+ case 2:
+ temp.data3 = (char *)temp.data3 + (x >> info.auxwbits)*info.auxsize + (y >> info.auxhbits)*temp.pitch3;
+ case 1:
+ temp.data2 = (char *)temp.data2 + (x >> info.auxwbits)*info.auxsize + (y >> info.auxhbits)*temp.pitch2;
+ case 0:
+ temp.data = (char *)temp.data + x*info.qsize + y*temp.pitch;
+ }
+
+ return temp;
+}
+
+VDPixmapLayout VDPixmapLayoutOffset(const VDPixmapLayout& src, vdpixpos x, vdpixpos y) {
+ VDPixmapLayout temp(src);
+ const VDPixmapFormatInfo& info = VDPixmapGetInfo(temp.format);
+
+ if (info.qchunky) {
+ x = (x + info.qw - 1) / info.qw;
+ y = -(-y >> info.qhbits);
+ }
+
+ switch(info.auxbufs) {
+ case 2:
+ temp.data3 += -(-x >> info.auxwbits)*info.auxsize + -(-y >> info.auxhbits)*temp.pitch3;
+ case 1:
+ temp.data2 += -(-x >> info.auxwbits)*info.auxsize + -(-y >> info.auxhbits)*temp.pitch2;
+ case 0:
+ temp.data += x*info.qsize + y*temp.pitch;
+ }
+
+ return temp;
+}
+
+uint32 VDPixmapCreateLinearLayout(VDPixmapLayout& layout, int format, vdpixsize w, vdpixsize h, int alignment) {
+ const ptrdiff_t alignmask = alignment - 1;
+
+ const VDPixmapFormatInfo& srcinfo = VDPixmapGetInfo(format);
+ sint32 qw = (w + srcinfo.qw - 1) / srcinfo.qw;
+ sint32 qh = -(-h >> srcinfo.qhbits);
+ sint32 subw = -(-w >> srcinfo.auxwbits);
+ sint32 subh = -(-h >> srcinfo.auxhbits);
+ sint32 auxsize = srcinfo.auxsize;
+
+ ptrdiff_t mainpitch = (srcinfo.qsize * qw + alignmask) & ~alignmask;
+ size_t mainsize = mainpitch * qh;
+
+ layout.data = 0;
+ layout.pitch = mainpitch;
+ layout.palette = NULL;
+ layout.data2 = 0;
+ layout.pitch2 = 0;
+ layout.data3 = 0;
+ layout.pitch3 = 0;
+ layout.w = w;
+ layout.h = h;
+ layout.format = format;
+
+ if (srcinfo.auxbufs >= 1) {
+ ptrdiff_t subpitch = (subw * auxsize + alignmask) & ~alignmask;
+ size_t subsize = subpitch * subh;
+
+ layout.data2 = mainsize;
+ layout.pitch2 = subpitch;
+ mainsize += subsize;
+
+ if (srcinfo.auxbufs >= 2) {
+ layout.data3 = mainsize;
+ layout.pitch3 = subpitch;
+ mainsize += subsize;
+ }
+ }
+
+ return mainsize;
+}
+
+void VDPixmapFlipV(VDPixmap& px) {
+ const VDPixmapFormatInfo& srcinfo = VDPixmapGetInfo(px.format);
+ sint32 w = px.w;
+ sint32 h = px.h;
+ sint32 qw = (w + srcinfo.qw - 1) / srcinfo.qw;
+ sint32 qh = -(-h >> srcinfo.qhbits);
+ sint32 subh = -(-h >> srcinfo.auxhbits);
+
+ vdptrstep(px.data, px.pitch * (qh - 1));
+ px.pitch = -px.pitch;
+
+ if (srcinfo.auxbufs >= 1) {
+ vdptrstep(px.data2, px.pitch2 * (subh - 1));
+ px.pitch2 = -px.pitch2;
+
+ if (srcinfo.auxbufs >= 2) {
+ vdptrstep(px.data3, px.pitch3 * (subh - 1));
+ px.pitch3 = -px.pitch3;
+ }
+ }
+}
+
+void VDPixmapLayoutFlipV(VDPixmapLayout& layout) {
+ const VDPixmapFormatInfo& srcinfo = VDPixmapGetInfo(layout.format);
+ sint32 w = layout.w;
+ sint32 h = layout.h;
+ sint32 qw = (w + srcinfo.qw - 1) / srcinfo.qw;
+ sint32 qh = -(-h >> srcinfo.qhbits);
+ sint32 subh = -(-h >> srcinfo.auxhbits);
+
+ layout.data += layout.pitch * (qh - 1);
+ layout.pitch = -layout.pitch;
+
+ if (srcinfo.auxbufs >= 1) {
+ layout.data2 += layout.pitch2 * (subh - 1);
+ layout.pitch2 = -layout.pitch2;
+
+ if (srcinfo.auxbufs >= 2) {
+ layout.data3 += layout.pitch3 * (subh - 1);
+ layout.pitch3 = -layout.pitch3;
+ }
+ }
+}
+
+uint32 VDPixmapLayoutGetMinSize(const VDPixmapLayout& layout) {
+ const VDPixmapFormatInfo& srcinfo = VDPixmapGetInfo(layout.format);
+ sint32 w = layout.w;
+ sint32 h = layout.h;
+ sint32 qw = (w + srcinfo.qw - 1) / srcinfo.qw;
+ sint32 qh = -(-h >> srcinfo.qhbits);
+ sint32 subh = -(-h >> srcinfo.auxhbits);
+
+ uint32 limit = layout.data;
+ if (layout.pitch >= 0)
+ limit += layout.pitch * qh;
+ else
+ limit -= layout.pitch;
+
+ if (srcinfo.auxbufs >= 1) {
+ uint32 limit2 = layout.data2;
+
+ if (layout.pitch2 >= 0)
+ limit2 += layout.pitch2 * subh;
+ else
+ limit2 -= layout.pitch2;
+
+ if (limit < limit2)
+ limit = limit2;
+
+ if (srcinfo.auxbufs >= 2) {
+ uint32 limit3 = layout.data3;
+
+ if (layout.pitch3 >= 0)
+ limit3 += layout.pitch3 * subh;
+ else
+ limit3 -= layout.pitch3;
+
+ if (limit < limit3)
+ limit = limit3;
+ }
+ }
+
+ return limit;
+}
+
+VDPixmap VDPixmapExtractField(const VDPixmap& src, bool field2) {
+ VDPixmap px(src);
+
+ if (field2) {
+ const VDPixmapFormatInfo& info = VDPixmapGetInfo(px.format);
+
+ if (px.data) {
+ if (info.qh == 1)
+ vdptrstep(px.data, px.pitch);
+
+ if (!info.auxhbits) {
+ vdptrstep(px.data2, px.pitch2);
+ vdptrstep(px.data3, px.pitch3);
+ }
+ }
+ }
+
+ px.h >>= 1;
+ px.pitch += px.pitch;
+ px.pitch2 += px.pitch2;
+ px.pitch3 += px.pitch3;
+ return px;
+}
+
+///////////////////////////////////////////////////////////////////////////
+
+VDPixmapBuffer::VDPixmapBuffer(const VDPixmap& src)
+ : mpBuffer(NULL)
+ , mLinearSize(0)
+{
+ assign(src);
+}
+
+VDPixmapBuffer::VDPixmapBuffer(const VDPixmapBuffer& src)
+ : mpBuffer(NULL)
+ , mLinearSize(0)
+{
+ assign(src);
+}
+
+VDPixmapBuffer::VDPixmapBuffer(const VDPixmapLayout& layout) {
+ init(layout);
+}
+
+VDPixmapBuffer::~VDPixmapBuffer() {
+#ifdef _DEBUG
+ validate();
+#endif
+
+ delete[] mpBuffer;
+}
+
+void VDPixmapBuffer::init(sint32 width, sint32 height, int f) {
+	const VDPixmapFormatInfo& srcinfo = VDPixmapGetInfo(f);
+	sint32 qw = (width + srcinfo.qw - 1) / srcinfo.qw;			// quantum-chunk counts (ceil division)
+	sint32 qh = -(-height >> srcinfo.qhbits);
+	sint32 subw = -(-width >> srcinfo.auxwbits);				// chroma/aux plane dimensions
+	sint32 subh = -(-height >> srcinfo.auxhbits);
+	ptrdiff_t mainpitch = (srcinfo.qsize * qw + 15) & ~15;		// rows padded to 16-byte alignment
+	ptrdiff_t subpitch = (srcinfo.auxsize * subw + 15) & ~15;
+	size_t mainsize = mainpitch * qh;
+	size_t subsize = subpitch * subh;
+	size_t totalsize = mainsize + subsize*srcinfo.auxbufs + 4 * srcinfo.palsize;
+
+#ifdef _DEBUG
+	totalsize += 28;		// 16-byte guard header + 12-byte tail sentinel (checked in validate())
+#endif
+
+	if (mLinearSize != totalsize) {
+		clear();
+		mpBuffer = new char[totalsize + 15];					// +15 so we can align the base to 16
+		mLinearSize = totalsize;
+	}
+
+	char *p = mpBuffer + (-(int)(uintptr)mpBuffer & 15);
+
+#ifdef _DEBUG
+	*(uint32 *)p = totalsize;
+	for(int i=0; i<12; ++i)
+		p[4+i] = (char)(0xa0 + i);								// underflow canary bytes
+
+	p += 16;
+#endif
+
+	data = p;
+	pitch = mainpitch;
+	p += mainsize;
+
+	palette = NULL;
+	data2 = NULL;
+	pitch2 = 0;		// FIX: pitches are ptrdiff_t, not pointers — NULL was a type error
+	data3 = NULL;
+	pitch3 = 0;		// FIX: ditto
+	w = width;
+	h = height;
+	format = f;
+
+	if (srcinfo.auxbufs >= 1) {
+		data2 = p;
+		pitch2 = subpitch;
+		p += subsize;
+	}
+
+	if (srcinfo.auxbufs >= 2) {
+		data3 = p;
+		pitch3 = subpitch;
+		p += subsize;
+	}
+
+	if (srcinfo.palsize) {
+		palette = (const uint32 *)p;
+		p += srcinfo.palsize * 4;
+	}
+
+#ifdef _DEBUG
+	for(int j=0; j<12; ++j)
+		p[j] = (char)(0xb0 + j);								// overflow canary bytes
+#endif
+}
+
+void VDPixmapBuffer::init(const VDPixmapLayout& layout) {
+ const VDPixmapFormatInfo& srcinfo = VDPixmapGetInfo(layout.format);
+ sint32 qw = (layout.w + srcinfo.qw - 1) / srcinfo.qw;
+ sint32 qh = -(-layout.h >> srcinfo.qhbits);
+ sint32 subw = -(-layout.w >> srcinfo.auxwbits);
+ sint32 subh = -(-layout.h >> srcinfo.auxhbits);
+
+ ptrdiff_t mino=0, maxo=0;
+
+ if (layout.pitch < 0) {
+ mino = std::min<ptrdiff_t>(mino, layout.data + layout.pitch * (qh-1));
+ maxo = std::max<ptrdiff_t>(maxo, layout.data - layout.pitch);
+ } else {
+ mino = std::min<ptrdiff_t>(mino, layout.data);
+ maxo = std::max<ptrdiff_t>(maxo, layout.data + layout.pitch*qh);
+ }
+
+ if (srcinfo.auxbufs >= 1) {
+ if (layout.pitch2 < 0) {
+ mino = std::min<ptrdiff_t>(mino, layout.data2 + layout.pitch2 * (subh-1));
+ maxo = std::max<ptrdiff_t>(maxo, layout.data2 - layout.pitch2);
+ } else {
+ mino = std::min<ptrdiff_t>(mino, layout.data2);
+ maxo = std::max<ptrdiff_t>(maxo, layout.data2 + layout.pitch2*subh);
+ }
+
+ if (srcinfo.auxbufs >= 2) {
+ if (layout.pitch3 < 0) {
+ mino = std::min<ptrdiff_t>(mino, layout.data3 + layout.pitch3 * (subh-1));
+ maxo = std::max<ptrdiff_t>(maxo, layout.data3 - layout.pitch3);
+ } else {
+ mino = std::min<ptrdiff_t>(mino, layout.data3);
+ maxo = std::max<ptrdiff_t>(maxo, layout.data3 + layout.pitch3*subh);
+ }
+ }
+ }
+
+ ptrdiff_t linsize = ((maxo - mino + 3) & ~(uintptr)3);
+
+ ptrdiff_t totalsize = linsize + 4*srcinfo.palsize;
+
+#ifdef _DEBUG
+ totalsize += 28;
+#endif
+
+ if (mLinearSize != totalsize) {
+ clear();
+ mpBuffer = new char[totalsize + 15];
+ mLinearSize = totalsize;
+ }
+
+ char *p = mpBuffer + (-(int)(uintptr)mpBuffer & 15);
+
+#ifdef _DEBUG
+ *(uint32 *)p = totalsize - 28;
+ for(int i=0; i<12; ++i)
+ p[4+i] = (char)(0xa0 + i);
+
+ p += 16;
+#endif
+
+ w = layout.w;
+ h = layout.h;
+ format = layout.format;
+ data = p + layout.data - mino;
+ data2 = p + layout.data2 - mino;
+ data3 = p + layout.data3 - mino;
+ pitch = layout.pitch;
+ pitch2 = layout.pitch2;
+ pitch3 = layout.pitch3;
+ palette = NULL;
+
+ if (srcinfo.palsize) {
+ palette = (const uint32 *)(p + linsize);
+ memcpy((void *)palette, layout.palette, 4*srcinfo.palsize);
+ }
+
+#ifdef _DEBUG
+ for(int j=0; j<12; ++j)
+ p[totalsize + j - 28] = (char)(0xb0 + j);
+#endif
+
+ VDAssertValidPixmap(*this);
+}
+
+void VDPixmapBuffer::assign(const VDPixmap& src) {
+ if (!src.format) {
+ delete[] mpBuffer;
+ mpBuffer = NULL;
+ data = NULL;
+ format = 0;
+ } else {
+ init(src.w, src.h, src.format);
+
+ const VDPixmapFormatInfo& srcinfo = VDPixmapGetInfo(src.format);
+ int qw = (src.w + srcinfo.qw - 1) / srcinfo.qw;
+ int qh = -(-src.h >> srcinfo.qhbits);
+ int subw = -(-src.w >> srcinfo.auxwbits);
+ int subh = -(-src.h >> srcinfo.auxhbits);
+
+ if (srcinfo.palsize)
+ memcpy((void *)palette, src.palette, 4 * srcinfo.palsize);
+
+ switch(srcinfo.auxbufs) {
+ case 2:
+ VDMemcpyRect(data3, pitch3, src.data3, src.pitch3, subw, subh);
+ case 1:
+ VDMemcpyRect(data2, pitch2, src.data2, src.pitch2, subw, subh);
+ case 0:
+ VDMemcpyRect(data, pitch, src.data, src.pitch, qw * srcinfo.qsize, qh);
+ }
+ }
+}
+
+void VDPixmapBuffer::swap(VDPixmapBuffer& dst) {
+ std::swap(mpBuffer, dst.mpBuffer);
+ std::swap(mLinearSize, dst.mLinearSize);
+ std::swap(static_cast<VDPixmap&>(*this), static_cast<VDPixmap&>(dst));
+}
+
+#ifdef _DEBUG
+void VDPixmapBuffer::validate() {
+ if (mpBuffer) {
+ char *p = (char *)(((uintptr)mpBuffer + 15) & ~(uintptr)15);
+
+ // verify head bytes
+ for(int i=0; i<12; ++i)
+ if (p[i+4] != (char)(0xa0 + i))
+ VDASSERT(!"VDPixmapBuffer: Buffer underflow detected.\n");
+
+ // verify tail bytes
+ for(int j=0; j<12; ++j)
+ if (p[mLinearSize - 12 + j] != (char)(0xb0 + j))
+ VDASSERT(!"VDPixmapBuffer: Buffer overflow detected.\n");
+ }
+}
+#endif \ No newline at end of file
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/region.cpp b/src/thirdparty/VirtualDub/Kasumi/source/region.cpp
new file mode 100644
index 000000000..283f43cf8
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/region.cpp
@@ -0,0 +1,1334 @@
+// VirtualDub - Video processing and capture application
+// Graphics support library
+// Copyright (C) 1998-2007 Avery Lee
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+#include <vd2/Kasumi/pixmap.h>
+#include <vd2/Kasumi/pixmaputils.h>
+#include <vd2/Kasumi/region.h>
+#include <vd2/system/math.h>
+#include <vd2/system/vdstl.h>
+
+void VDPixmapRegion::swap(VDPixmapRegion& x) {
+ mSpans.swap(x.mSpans);
+ std::swap(mBounds, x.mBounds);
+}
+
+VDPixmapPathRasterizer::VDPixmapPathRasterizer()
+ : mpEdgeBlocks(NULL)
+ , mpFreeEdgeBlocks(NULL)
+ , mEdgeBlockIdx(kEdgeBlockMax)
+ , mpScanBuffer(NULL)
+{
+ ClearScanBuffer();
+}
+
+VDPixmapPathRasterizer::VDPixmapPathRasterizer(const VDPixmapPathRasterizer&)
+ : mpEdgeBlocks(NULL)
+ , mpFreeEdgeBlocks(NULL)
+ , mEdgeBlockIdx(kEdgeBlockMax)
+ , mpScanBuffer(NULL)
+{
+ ClearScanBuffer();
+}
+
+VDPixmapPathRasterizer::~VDPixmapPathRasterizer() {
+ Clear();
+ FreeEdgeLists();
+}
+
+VDPixmapPathRasterizer& VDPixmapPathRasterizer::operator=(const VDPixmapPathRasterizer&) {
+ return *this;
+}
+
+void VDPixmapPathRasterizer::Clear() {
+ ClearEdgeList();
+ ClearScanBuffer();
+}
+
+void VDPixmapPathRasterizer::QuadraticBezier(const vdint2 *pts) {
+ int x0 = pts[0].x;
+ int x1 = pts[1].x;
+ int x2 = pts[2].x;
+ int y0 = pts[0].y;
+ int y1 = pts[1].y;
+ int y2 = pts[2].y;
+
+ // P = (1-t)^2*P0 + 2t(1-t)*P1 + t^2*P2
+ // P = (1-2t+t^2)P0 + 2(t-t^2)P1 + t^2*P2
+ // P = (P0-2P1+P2)t^2 + 2(P1-P0)t + P0
+
+ int cx2 = x0-2*x1+x2;
+ int cx1 = -2*x0+2*x1;
+ int cx0 = x0;
+
+ int cy2 = y0-2*y1+y2;
+ int cy1 = -2*y0+2*y1;
+ int cy0 = y0;
+
+ // This equation is from Graphics Gems I.
+ //
+ // The idea is that since we're approximating a cubic curve with lines,
+ // any error we incur is due to the curvature of the line, which we can
+ // estimate by calculating the maximum acceleration of the curve. For
+ // a cubic, the acceleration (second derivative) is a line, meaning that
+ // the absolute maximum acceleration must occur at either the beginning
+ // (|c2|) or the end (|c2+c3|). Our bounds here are a little more
+ // conservative than that, but that's okay.
+ //
+ // If the acceleration of the parametric formula is zero (c2 = c3 = 0),
+ // that component of the curve is linear and does not incur any error.
+ // If a=0 for both X and Y, the curve is a line segment and we can
+ // use a step size of 1.
+
+ int maxaccel1 = abs(cy2);
+ int maxaccel2 = abs(cx2);
+
+ int maxaccel = maxaccel1 > maxaccel2 ? maxaccel1 : maxaccel2;
+ int h = 1;
+
+ while(maxaccel > 8 && h < 1024) {
+ maxaccel >>= 2;
+ h += h;
+ }
+
+ int lastx = x0;
+ int lasty = y0;
+
+ // compute forward differences
+ sint64 h1 = (sint64)(0x40000000 / h) << 2;
+ sint64 h2 = h1/h;
+
+ sint64 ax0 = (sint64)cx0 << 32;
+ sint64 ax1 = h1*(sint64)cx1 + h2*(sint64)cx2;
+ sint64 ax2 = 2*h2*(sint64)cx2;
+
+ sint64 ay0 = (sint64)cy0 << 32;
+ sint64 ay1 = h1*(sint64)cy1 + h2*(sint64)cy2;
+ sint64 ay2 = 2*h2*(sint64)cy2;
+
+ // round, not truncate
+ ax0 += 0x80000000;
+ ay0 += 0x80000000;
+
+ do {
+ ax0 += ax1;
+ ax1 += ax2;
+ ay0 += ay1;
+ ay1 += ay2;
+
+ int xi = (int)((uint64)ax0 >> 32);
+ int yi = (int)((uint64)ay0 >> 32);
+
+ FastLine(lastx, lasty, xi, yi);
+ lastx = xi;
+ lasty = yi;
+ } while(--h);
+}
+
+void VDPixmapPathRasterizer::CubicBezier(const vdint2 *pts) {
+ int x0 = pts[0].x;
+ int x1 = pts[1].x;
+ int x2 = pts[2].x;
+ int x3 = pts[3].x;
+ int y0 = pts[0].y;
+ int y1 = pts[1].y;
+ int y2 = pts[2].y;
+ int y3 = pts[3].y;
+
+ int cx3 = - x0+3*x1-3*x2+x3;
+ int cx2 = 3*x0-6*x1+3*x2;
+ int cx1 = -3*x0+3*x1;
+ int cx0 = x0;
+
+ int cy3 = - y0+3*y1-3*y2+y3;
+ int cy2 = 3*y0-6*y1+3*y2;
+ int cy1 = -3*y0+3*y1;
+ int cy0 = y0;
+
+ // This equation is from Graphics Gems I.
+ //
+ // The idea is that since we're approximating a cubic curve with lines,
+ // any error we incur is due to the curvature of the line, which we can
+ // estimate by calculating the maximum acceleration of the curve. For
+ // a cubic, the acceleration (second derivative) is a line, meaning that
+ // the absolute maximum acceleration must occur at either the beginning
+ // (|c2|) or the end (|c2+c3|). Our bounds here are a little more
+ // conservative than that, but that's okay.
+ //
+ // If the acceleration of the parametric formula is zero (c2 = c3 = 0),
+ // that component of the curve is linear and does not incur any error.
+ // If a=0 for both X and Y, the curve is a line segment and we can
+ // use a step size of 1.
+
+ int maxaccel1 = abs(2*cy2) + abs(6*cy3);
+ int maxaccel2 = abs(2*cx2) + abs(6*cx3);
+
+ int maxaccel = maxaccel1 > maxaccel2 ? maxaccel1 : maxaccel2;
+ int h = 1;
+
+ while(maxaccel > 8 && h < 1024) {
+ maxaccel >>= 2;
+ h += h;
+ }
+
+ int lastx = x0;
+ int lasty = y0;
+
+ // compute forward differences
+ sint64 h1 = (sint64)(0x40000000 / h) << 2;
+ sint64 h2 = h1/h;
+ sint64 h3 = h2/h;
+
+ sint64 ax0 = (sint64)cx0 << 32;
+ sint64 ax1 = h1*(sint64)cx1 + h2*(sint64)cx2 + h3*(sint64)cx3;
+ sint64 ax2 = 2*h2*(sint64)cx2 + 6*h3*(sint64)cx3;
+ sint64 ax3 = 6*h3*(sint64)cx3;
+
+ sint64 ay0 = (sint64)cy0 << 32;
+ sint64 ay1 = h1*(sint64)cy1 + h2*(sint64)cy2 + h3*(sint64)cy3;
+ sint64 ay2 = 2*h2*(sint64)cy2 + 6*h3*(sint64)cy3;
+ sint64 ay3 = 6*h3*(sint64)cy3;
+
+ // round, not truncate
+ ax0 += 0x80000000;
+ ay0 += 0x80000000;
+
+ do {
+ ax0 += ax1;
+ ax1 += ax2;
+ ax2 += ax3;
+ ay0 += ay1;
+ ay1 += ay2;
+ ay2 += ay3;
+
+ int xi = (int)((uint64)ax0 >> 32);
+ int yi = (int)((uint64)ay0 >> 32);
+
+ FastLine(lastx, lasty, xi, yi);
+ lastx = xi;
+ lasty = yi;
+ } while(--h);
+}
+
+void VDPixmapPathRasterizer::Line(const vdint2& pt1, const vdint2& pt2) {
+ FastLine(pt1.x, pt1.y, pt2.x, pt2.y);
+}
+
+void VDPixmapPathRasterizer::FastLine(int x0, int y0, int x1, int y1) {
+ int flag = 1;
+
+ if (y1 == y0)
+ return;
+
+ if (y1 < y0) {
+ int t;
+
+ t=x0; x0=x1; x1=t;
+ t=y0; y0=y1; y1=t;
+ flag = 0;
+ }
+
+ int dy = y1-y0;
+ int xacc = x0<<13;
+
+ // prestep y0 down
+ int iy0 = (y0+3) >> 3;
+ int iy1 = (y1+3) >> 3;
+
+ if (iy0 < iy1) {
+ int invslope = (x1-x0)*65536/dy;
+
+ int prestep = (4-y0) & 7;
+ xacc += (invslope * prestep)>>3;
+
+ if (iy0 < mScanYMin || iy1 > mScanYMax) {
+ ReallocateScanBuffer(iy0, iy1);
+ VDASSERT(iy0 >= mScanYMin && iy1 <= mScanYMax);
+ }
+
+ while(iy0 < iy1) {
+ int ix = (xacc+32767)>>16;
+
+ if (mEdgeBlockIdx >= kEdgeBlockMax) {
+ if (mpFreeEdgeBlocks) {
+ EdgeBlock *newBlock = mpFreeEdgeBlocks;
+ mpFreeEdgeBlocks = mpFreeEdgeBlocks->next;
+ newBlock->next = mpEdgeBlocks;
+ mpEdgeBlocks = newBlock;
+ } else {
+ mpEdgeBlocks = new EdgeBlock(mpEdgeBlocks);
+ }
+
+ mEdgeBlockIdx = 0;
+ }
+
+ Edge& e = mpEdgeBlocks->edges[mEdgeBlockIdx];
+ Scan& s = mpScanBufferBiased[iy0];
+ VDASSERT(iy0 >= mScanYMin && iy0 < mScanYMax);
+ ++mEdgeBlockIdx;
+
+ e.posandflag = ix*2+flag;
+ e.next = s.chain;
+ s.chain = &e;
+ ++s.count;
+
+ ++iy0;
+ xacc += invslope;
+ }
+ }
+}
+
+void VDPixmapPathRasterizer::ScanConvert(VDPixmapRegion& region) {
+ // Convert the edges to spans. We couldn't do this before because some of
+ // the regions may have winding numbers >+1 and it would have been a pain
+ // to try to adjust the spans on the fly. We use one heap to detangle
+ // a scanline's worth of edges from the singly-linked lists, and another
+ // to collect the actual scans.
+ vdfastvector<int> heap;
+
+ region.mSpans.clear();
+ int xmin = INT_MAX;
+ int xmax = INT_MIN;
+ int ymin = INT_MAX;
+ int ymax = INT_MIN;
+
+ for(int y=mScanYMin; y<mScanYMax; ++y) {
+ uint32 flipcount = mpScanBufferBiased[y].count;
+
+ if (!flipcount)
+ continue;
+
+ // Keep the edge heap from doing lots of stupid little reallocates.
+ if (heap.capacity() < flipcount)
+ heap.resize((flipcount + 63)&~63);
+
+ // Detangle scanline into edge heap.
+ int *heap0 = heap.data();
+ int *heap1 = heap0;
+ for(const Edge *ptr = mpScanBufferBiased[y].chain; ptr; ptr = ptr->next)
+ *heap1++ = ptr->posandflag;
+
+ VDASSERT(heap1 - heap0 == flipcount);
+
+ // Sort edge heap. Note that we conveniently made the opening edges
+ // one more than closing edges at the same spot, so we won't have any
+ // problems with abutting spans.
+
+ std::sort(heap0, heap1);
+
+#if 0
+ while(heap0 != heap1) {
+ int x = *heap0++ >> 1;
+ region.mSpans.push_back((y<<16) + x + 0x80008000);
+ region.mSpans.push_back((y<<16) + x + 0x80008001);
+ }
+ continue;
+#endif
+
+ // Trim any odd edges off, since we can never close on one.
+ if (flipcount & 1)
+ --heap1;
+
+ // Process edges and add spans. Since we only check for a non-zero
+ // winding number, it doesn't matter which way the outlines go. Also, since
+ // the parity always flips after each edge regardless of direction, we can
+ // process the edges in pairs.
+
+ size_t spanstart = region.mSpans.size();
+
+ int x_left;
+ int count = 0;
+ while(heap0 != heap1) {
+ int x = *heap0++;
+
+ if (!count)
+ x_left = (x>>1);
+
+ count += (x&1);
+
+ x = *heap0++;
+
+ count += (x&1);
+
+ if (!--count) {
+ int x_right = (x>>1);
+
+ if (x_right > x_left) {
+ region.mSpans.push_back((y<<16) + x_left + 0x80008000);
+ region.mSpans.push_back((y<<16) + x_right + 0x80008000);
+
+ }
+ }
+ }
+
+ size_t spanend = region.mSpans.size();
+
+ if (spanend > spanstart) {
+ if (ymin > y)
+ ymin = y;
+
+ if (ymax < y)
+ ymax = y;
+
+ int x1 = (region.mSpans[spanstart] & 0xffff) - 0x8000;
+ int x2 = (region.mSpans[spanend-1] & 0xffff) - 0x8000;
+
+ if (xmin > x1)
+ xmin = x1;
+
+ if (xmax < x2)
+ xmax = x2;
+ }
+ }
+
+ if (xmax > xmin) {
+ region.mBounds.set(xmin, ymin, xmax, ymax);
+ } else {
+ region.mBounds.set(0, 0, 0, 0);
+ }
+
+ // Dump the edge and scan buffers, since we no longer need them.
+ ClearEdgeList();
+ ClearScanBuffer();
+}
+
+void VDPixmapPathRasterizer::ClearEdgeList() {
+ if (mpEdgeBlocks) {
+ EdgeBlock *block = mpEdgeBlocks;
+
+ while(EdgeBlock *next = block->next)
+ block = next;
+
+ block->next = mpFreeEdgeBlocks;
+ mpFreeEdgeBlocks = mpEdgeBlocks;
+ mpEdgeBlocks = NULL;
+ }
+
+ mEdgeBlockIdx = kEdgeBlockMax;
+}
+
+void VDPixmapPathRasterizer::FreeEdgeLists() {
+ ClearEdgeList();
+
+ while(EdgeBlock *block = mpFreeEdgeBlocks) {
+ mpFreeEdgeBlocks = block->next;
+
+ delete block;
+ }
+}
+
+void VDPixmapPathRasterizer::ClearScanBuffer() {
+ delete[] mpScanBuffer;
+ mpScanBuffer = mpScanBufferBiased = NULL;
+ mScanYMin = 0;
+ mScanYMax = 0;
+}
+
+void VDPixmapPathRasterizer::ReallocateScanBuffer(int ymin, int ymax) {
+ //
+ // check if there actually is a scan buffer to avoid unintentionally pinning at zero
+ if (mpScanBuffer) {
+ int nicedelta = (mScanYMax - mScanYMin);
+
+ if (ymin < mScanYMin) {
+ int yminnice = mScanYMin - nicedelta;
+ if (ymin > yminnice)
+ ymin = yminnice;
+
+ ymin &= ~31;
+ } else
+ ymin = mScanYMin;
+
+ if (ymax > mScanYMax) {
+ int ymaxnice = mScanYMax + nicedelta;
+ if (ymax < ymaxnice)
+ ymax = ymaxnice;
+
+ ymax = (ymax + 31) & ~31;
+ } else
+ ymax = mScanYMax;
+
+ VDASSERT(ymin <= mScanYMin && ymax >= mScanYMax);
+ }
+
+ // reallocate scan buffer
+ Scan *pNewBuffer = new Scan[ymax - ymin];
+ Scan *pNewBufferBiased = pNewBuffer - ymin;
+
+ if (mpScanBuffer) {
+ memcpy(pNewBufferBiased + mScanYMin, mpScanBufferBiased + mScanYMin, (mScanYMax - mScanYMin) * sizeof(Scan));
+ delete[] mpScanBuffer;
+
+ // zero new areas of scan buffer
+ for(int y=ymin; y<mScanYMin; ++y) {
+ pNewBufferBiased[y].chain = NULL;
+ pNewBufferBiased[y].count = 0;
+ }
+
+ for(int y=mScanYMax; y<ymax; ++y) {
+ pNewBufferBiased[y].chain = NULL;
+ pNewBufferBiased[y].count = 0;
+ }
+ } else {
+ for(int y=ymin; y<ymax; ++y) {
+ pNewBufferBiased[y].chain = NULL;
+ pNewBufferBiased[y].count = 0;
+ }
+ }
+
+ mpScanBuffer = pNewBuffer;
+ mpScanBufferBiased = pNewBufferBiased;
+ mScanYMin = ymin;
+ mScanYMax = ymax;
+}
+
+bool VDPixmapFillRegion(const VDPixmap& dst, const VDPixmapRegion& region, int x, int y, uint32 color) {
+ if (dst.format != nsVDPixmap::kPixFormat_XRGB8888)
+ return false;
+
+ // fast out
+ if (region.mSpans.empty())
+ return true;
+
+ // check if vertical clipping is required
+ const size_t n = region.mSpans.size();
+ uint32 start = 0;
+ uint32 end = n;
+
+ uint32 spanmin = (-x) + ((-y) << 16) + 0x80008000;
+
+ if (region.mSpans.front() < spanmin) {
+ uint32 lo = 0, hi = n;
+
+ // compute top clip
+ while(lo < hi) {
+ int mid = ((lo + hi) >> 1) & ~1;
+
+ if (region.mSpans[mid + 1] < spanmin)
+ lo = mid + 2;
+ else
+ hi = mid;
+ }
+
+ start = lo;
+
+ // check for total top clip
+ if (start >= n)
+ return true;
+ }
+
+ uint32 spanlimit = (dst.w - x) + ((dst.h - y - 1) << 16) + 0x80008000;
+
+ if (region.mSpans.back() > spanlimit) {
+ // compute bottom clip
+ int lo = start;
+ int hi = n;
+
+ while(lo < hi) {
+ int mid = ((lo + hi) >> 1) & ~1;
+
+ if (region.mSpans[mid] >= spanlimit)
+ hi = mid;
+ else
+ lo = mid+2;
+ }
+
+ end = lo;
+
+ // check for total bottom clip
+ if (start >= end)
+ return true;
+ }
+
+ // fill region
+ const uint32 *pSpan = &region.mSpans[start];
+ const uint32 *pEnd = &region.mSpans[0] + end;
+ int lasty = -1;
+ uint32 *dstp;
+
+ for(; pSpan != pEnd; pSpan += 2) {
+ uint32 span0 = pSpan[0];
+ uint32 span1 = pSpan[1];
+
+ uint32 py = (span0 >> 16) - 0x8000 + y;
+ uint32 px = (span0 & 0xffff) - 0x8000 + x;
+ uint32 w = span1-span0;
+
+ VDASSERT(py < (uint32)dst.h);
+ VDASSERT(px < (uint32)dst.w);
+ VDASSERT(dst.w - (int)px >= (int)w);
+
+ if (lasty != py)
+ dstp = (uint32 *)vdptroffset(dst.data, dst.pitch * py);
+
+ uint32 *p = dstp + px;
+ do {
+ *p++ = color;
+ } while(--w);
+ }
+
+ return true;
+}
+
+namespace {
+ void RenderABuffer32(const VDPixmap& dst, int y, const uint8 *alpha, uint32 w, uint32 color) {
+ if (!w)
+ return;
+
+ // update dest pointer
+ uint32 *dstp = (uint32 *)vdptroffset(dst.data, dst.pitch * y);
+
+ const uint32 color_rb = color & 0x00FF00FF;
+ const uint32 color_g = color & 0x0000FF00;
+ do {
+ const uint32 px = *dstp;
+ const uint32 px_rb = px & 0x00FF00FF;
+ const uint32 px_g = px & 0x0000FF00;
+ const sint32 a = *alpha++;
+
+ const uint32 result_rb = (((px_rb << 6) + ((sint32)(color_rb - px_rb)*a + 0x00200020)) & 0x3FC03FC0);
+ const uint32 result_g = (((px_g << 6) + ((sint32)(color_g - px_g )*a + 0x00002000)) & 0x003FC000);
+
+ *dstp++ = (result_rb + result_g) >> 6;
+ } while(--w);
+ }
+
+ void RenderABuffer8(const VDPixmap& dst, int y, const uint8 *alpha, uint32 w, uint32 color) {
+ if (!w)
+ return;
+
+ // update dest pointer
+ uint8 *dstp = (uint8 *)vdptroffset(dst.data, dst.pitch * y);
+
+ do {
+ const uint8 px = *dstp;
+ const sint8 a = *alpha++;
+
+ *dstp++ = px + (((sint32)(color - px) * a + 32) >> 6);
+ } while(--w);
+ }
+
+ void RenderABuffer8_128(const VDPixmap& dst, int y, const uint8 *alpha, uint32 w, uint32 color) {
+ if (!w)
+ return;
+
+ // update dest pointer
+ uint8 *dstp = (uint8 *)vdptroffset(dst.data, dst.pitch * y);
+
+ do {
+ const uint8 px = *dstp;
+ const sint16 a = *alpha++;
+
+ *dstp++ = px + (((sint32)(color - px) * a + 64) >> 7);
+ } while(--w);
+ }
+
+ void RenderABuffer8_256(const VDPixmap& dst, int y, const uint16 *alpha, uint32 w, uint32 color) {
+ if (!w)
+ return;
+
+ // update dest pointer
+ uint8 *dstp = (uint8 *)vdptroffset(dst.data, dst.pitch * y);
+
+ do {
+ const uint8 px = *dstp;
+ const sint32 a = *alpha++;
+
+ *dstp++ = px + (((sint32)(color - px) * a + 128) >> 8);
+ } while(--w);
+ }
+
+ void RenderABuffer8_1024(const VDPixmap& dst, int y, const uint16 *alpha, uint32 w, uint32 color) {
+ if (!w)
+ return;
+
+ // update dest pointer
+ uint8 *dstp = (uint8 *)vdptroffset(dst.data, dst.pitch * y);
+
+ do {
+ const uint8 px = *dstp;
+ const sint32 a = *alpha++;
+
+ *dstp++ = px + (((sint32)(color - px) * a + 512) >> 10);
+ } while(--w);
+ }
+}
+
+bool VDPixmapFillRegionAntialiased_32x_32x(const VDPixmap& dst, const VDPixmapRegion& region, int x, int y, uint32 color) {
+	if (dst.format != nsVDPixmap::kPixFormat_Y8)
+		return false;
+
+	// fast out
+	if (region.mSpans.empty())
+		return true;
+
+	// check if vertical clipping is required
+	const size_t n = region.mSpans.size();
+	uint32 start = 0;
+	uint32 end = n;
+
+	uint32 spanmin = -x + (-y << 16) + 0x80008000;				// spans are (y+0x8000)<<16 | (x+0x8000)
+
+	if (region.mSpans.front() < spanmin) {
+		// find first span : x2 > spanmin
+		start = std::upper_bound(region.mSpans.begin(), region.mSpans.end(), spanmin) - region.mSpans.begin();
+		start &= ~1;											// spans come in (left,right) pairs — keep alignment
+
+		// check for total top clip
+		if (start >= n)
+			return true;
+	}
+
+	uint32 spanlimit = (dst.w*32 - x) + (((dst.h*32 - y) - 1) << 16) + 0x80008000;
+
+	if (region.mSpans.back() > spanlimit) {
+		// find last span : x1 < spanlimit
+		end = std::lower_bound(region.mSpans.begin(), region.mSpans.end(), spanlimit) - region.mSpans.begin();
+
+		end = (end + 1) & ~1;
+
+		// check for total bottom clip
+		if (start >= end)
+			return true;
+	}
+
+	// allocate A-buffer (coverage accumulator, one uint16 per output pixel)
+	vdfastvector<uint16> abuffer(dst.w, 0);
+
+	// fill region
+	const uint32 *pSpan = &region.mSpans[start];
+	const uint32 *pEnd = &region.mSpans[0] + end;
+	int lasty = -1;
+
+	sint32 dstw32 = dst.w * 32;									// region coords are 32x supersampled
+	sint32 dsth32 = dst.h * 32;
+
+	for(; pSpan != pEnd; pSpan += 2) {
+		uint32 span0 = pSpan[0];
+		uint32 span1 = pSpan[1];
+
+		sint32 py = (span0 >> 16) - 0x8000 + y;
+
+		if ((uint32)py >= (uint32)dsth32)
+			continue;
+
+		sint32 px1 = (span0 & 0xffff) - 0x8000 + x;
+		sint32 px2 = (span1 & 0xffff) - 0x8000 + x;
+		sint32 w = span1-span0;									// NOTE(review): unused in this routine
+
+		if (lasty != py) {
+			if (((lasty ^ py) & 0xFFFFFFE0)) {					// crossed a 32-subline boundary: new output row
+				if (lasty >= 0) {
+					// flush scanline
+
+					RenderABuffer8_1024(dst, lasty >> 5, abuffer.data(), dst.w, color);
+				}
+
+				memset(abuffer.data(), 0, abuffer.size() * sizeof(abuffer[0]));
+			}
+			lasty = py;
+		}
+
+		if (px1 < 0)
+			px1 = 0;
+		if (px2 > dstw32)
+			px2 = dstw32;
+
+		if (px1 >= px2)
+			continue;
+
+		uint32 ix1 = px1 >> 5;
+		uint32 ix2 = px2 >> 5;
+		uint16 *p1 = abuffer.data() + ix1;
+		uint16 *p2 = abuffer.data() + ix2;
+
+		if (p1 == p2) {
+			p1[0] += (px2 - px1);								// span lives inside a single pixel
+		} else {
+			if (px1 & 31) {
+				p1[0] += 32 - (px1 & 31);						// partial left pixel
+				++p1;
+			}
+
+			while(p1 != p2) {
+				p1[0] += 32;									// fully covered pixels
+				++p1;
+			}
+
+			if (px2 & 31)
+				p1[0] += px2 & 31;	// FIX: was (px2 & 32) — that added 0 or 32 instead of the 1..31 partial coverage of the right pixel
+		}
+	}
+
+	if (lasty >= 0)
+		RenderABuffer8_1024(dst, lasty >> 5, abuffer.data(), dst.w, color);
+
+	return true;
+}
+
+// Fill a Y8 plane with an antialiased region rasterized at 16x16 subpixel
+// resolution (x/y in 1/16-pixel units).  Same algorithm as the 32x32
+// variant: coverage is accumulated into a 16-bit A-buffer (max 16*16 = 256
+// per pixel) and flushed per output scanline via RenderABuffer8_256.
+// Returns false only for non-Y8 destinations.
+bool VDPixmapFillRegionAntialiased_16x_16x(const VDPixmap& dst, const VDPixmapRegion& region, int x, int y, uint32 color) {
+	if (dst.format != nsVDPixmap::kPixFormat_Y8)
+		return false;
+
+	// fast out
+	if (region.mSpans.empty())
+		return true;
+
+	// check if vertical clipping is required
+	const size_t n = region.mSpans.size();
+	uint32 start = 0;
+	uint32 end = n;
+
+	uint32 spanmin = -x + (-y << 16) + 0x80008000;
+
+	if (region.mSpans.front() < spanmin) {
+		// find first span : x2 > spanmin (round down to an x1/x2 pair boundary)
+		start = std::upper_bound(region.mSpans.begin(), region.mSpans.end(), spanmin) - region.mSpans.begin();
+		start &= ~1;
+
+		// check for total top clip
+		if (start >= n)
+			return true;
+	}
+
+	uint32 spanlimit = (dst.w*16 - x) + (((dst.h*16 - y) - 1) << 16) + 0x80008000;
+
+	if (region.mSpans.back() > spanlimit) {
+		// find last span : x1 < spanlimit
+		end = std::lower_bound(region.mSpans.begin(), region.mSpans.end(), spanlimit) - region.mSpans.begin();
+
+		end = (end + 1) & ~1;
+
+		// check for total bottom clip
+		if (start >= end)
+			return true;
+	}
+
+	// allocate A-buffer (per-output-pixel coverage accumulators)
+	vdfastvector<uint16> abuffer(dst.w, 0);
+
+	// fill region
+	const uint32 *pSpan = &region.mSpans[start];
+	const uint32 *pEnd = &region.mSpans[0] + end;
+	int lasty = -1;
+
+	sint32 dstw16 = dst.w * 16;
+	sint32 dsth16 = dst.h * 16;
+
+	for(; pSpan != pEnd; pSpan += 2) {
+		uint32 span0 = pSpan[0];
+		uint32 span1 = pSpan[1];
+
+		sint32 py = (span0 >> 16) - 0x8000 + y;
+
+		// unsigned compare rejects py < 0 and py >= dsth16 in one test
+		if ((uint32)py >= (uint32)dsth16)
+			continue;
+
+		sint32 px1 = (span0 & 0xffff) - 0x8000 + x;
+		sint32 px2 = (span1 & 0xffff) - 0x8000 + x;
+		sint32 w = span1-span0;
+
+		if (lasty != py) {
+			// flush only when crossing into a new output row (py / 16)
+			if (((lasty ^ py) & 0xFFFFFFF0)) {
+				if (lasty >= 0) {
+					// flush scanline
+
+					RenderABuffer8_256(dst, lasty >> 4, abuffer.data(), dst.w, color);
+				}
+
+				memset(abuffer.data(), 0, abuffer.size() * sizeof(abuffer[0]));
+			}
+			lasty = py;
+		}
+
+		// horizontal clip in subpixel units
+		if (px1 < 0)
+			px1 = 0;
+		if (px2 > dstw16)
+			px2 = dstw16;
+
+		if (px1 >= px2)
+			continue;
+
+		uint32 ix1 = px1 >> 4;
+		uint32 ix2 = px2 >> 4;
+		uint16 *p1 = abuffer.data() + ix1;
+		uint16 *p2 = abuffer.data() + ix2;
+
+		if (p1 == p2) {
+			// span contained within a single output pixel
+			p1[0] += (px2 - px1);
+		} else {
+			// left partial pixel
+			if (px1 & 15) {
+				p1[0] += 16 - (px1 & 15);
+				++p1;
+			}
+
+			// fully covered interior pixels
+			while(p1 != p2) {
+				p1[0] += 16;
+				++p1;
+			}
+
+			// right partial pixel
+			if (px2 & 15)
+				p1[0] += px2 & 15;
+		}
+	}
+
+	// flush the final scanline
+	if (lasty >= 0)
+		RenderABuffer8_256(dst, lasty >> 4, abuffer.data(), dst.w, color);
+
+	return true;
+}
+
+// Fill a plane with an antialiased region rasterized at 16x horizontal by
+// 8x vertical subpixel resolution (x in 1/16 px, y in 1/8 px).  Coverage
+// fits 16*8 = 128 per pixel, so an 8-bit A-buffer suffices; scanlines are
+// composited via RenderABuffer8_128 (defined earlier in this file).
+// Returns false only for unsupported destination formats.
+bool VDPixmapFillRegionAntialiased_16x_8x(const VDPixmap& dst, const VDPixmapRegion& region, int x, int y, uint32 color) {
+	if (dst.format != nsVDPixmap::kPixFormat_XRGB8888 && dst.format != nsVDPixmap::kPixFormat_Y8)
+		return false;
+
+	// fast out
+	if (region.mSpans.empty())
+		return true;
+
+	// check if vertical clipping is required
+	const size_t n = region.mSpans.size();
+	uint32 start = 0;
+	uint32 end = n;
+
+	uint32 spanmin = -x + (-y << 16) + 0x80008000;
+
+	if (region.mSpans.front() < spanmin) {
+		// find first span : x2 > spanmin (round down to an x1/x2 pair boundary)
+		start = std::upper_bound(region.mSpans.begin(), region.mSpans.end(), spanmin) - region.mSpans.begin();
+		start &= ~1;
+
+		// check for total top clip
+		if (start >= n)
+			return true;
+	}
+
+	uint32 spanlimit = (dst.w*16 - x) + (((dst.h*8 - y) - 1) << 16) + 0x80008000;
+
+	if (region.mSpans.back() > spanlimit) {
+		// find last span : x1 < spanlimit
+		end = std::lower_bound(region.mSpans.begin(), region.mSpans.end(), spanlimit) - region.mSpans.begin();
+
+		end = (end + 1) & ~1;
+
+		// check for total bottom clip
+		if (start >= end)
+			return true;
+	}
+
+	// allocate A-buffer (8-bit accumulators; max coverage 128 cannot overflow)
+	vdfastvector<uint8> abuffer(dst.w, 0);
+
+	// fill region
+	const uint32 *pSpan = &region.mSpans[start];
+	const uint32 *pEnd = &region.mSpans[0] + end;
+	int lasty = -1;
+
+	sint32 dstw16 = dst.w * 16;
+	sint32 dsth8 = dst.h * 8;
+
+	for(; pSpan != pEnd; pSpan += 2) {
+		uint32 span0 = pSpan[0];
+		uint32 span1 = pSpan[1];
+
+		sint32 py = (span0 >> 16) - 0x8000 + y;
+
+		// unsigned compare rejects py < 0 and py >= dsth8 in one test
+		if ((uint32)py >= (uint32)dsth8)
+			continue;
+
+		sint32 px1 = (span0 & 0xffff) - 0x8000 + x;
+		sint32 px2 = (span1 & 0xffff) - 0x8000 + x;
+		sint32 w = span1-span0;
+
+		if (lasty != py) {
+			// flush only when crossing into a new output row (py / 8)
+			if (((lasty ^ py) & 0xFFFFFFF8)) {
+				if (lasty >= 0) {
+					// flush scanline
+
+					RenderABuffer8_128(dst, lasty >> 3, abuffer.data(), dst.w, color);
+				}
+
+				memset(abuffer.data(), 0, abuffer.size());
+			}
+			lasty = py;
+		}
+
+		// horizontal clip in subpixel units
+		if (px1 < 0)
+			px1 = 0;
+		if (px2 > dstw16)
+			px2 = dstw16;
+
+		if (px1 >= px2)
+			continue;
+
+		uint32 ix1 = px1 >> 4;
+		uint32 ix2 = px2 >> 4;
+		uint8 *p1 = abuffer.data() + ix1;
+		uint8 *p2 = abuffer.data() + ix2;
+
+		if (p1 == p2) {
+			// span contained within a single output pixel
+			p1[0] += (px2 - px1);
+		} else {
+			// left partial pixel
+			if (px1 & 15) {
+				p1[0] += 16 - (px1 & 15);
+				++p1;
+			}
+
+			// fully covered interior pixels
+			while(p1 != p2) {
+				p1[0] += 16;
+				++p1;
+			}
+
+			// right partial pixel
+			if (px2 & 15)
+				p1[0] += px2 & 15;
+		}
+	}
+
+	// flush the final scanline
+	if (lasty >= 0)
+		RenderABuffer8_128(dst, lasty >> 3, abuffer.data(), dst.w, color);
+
+	return true;
+}
+
+// Fill a pixmap with an antialiased region rasterized at 8x8 subpixel
+// resolution (x/y in 1/8-pixel units).  For planar YUV formats the call is
+// decomposed: the Y plane is filled at 8x8, and the chroma planes are
+// filled through the higher-subsampling variants (32x32 for 4:1:0, 16x16
+// for 4:2:0, 16x8 for 4:2:2) so the *same* region can be reused at reduced
+// plane resolution.  Direct rasterization supports XRGB8888 and Y8 only;
+// other formats return false.
+bool VDPixmapFillRegionAntialiased8x(const VDPixmap& dst, const VDPixmapRegion& region, int x, int y, uint32 color) {
+	if (dst.format == nsVDPixmap::kPixFormat_YUV444_Planar ||
+		dst.format == nsVDPixmap::kPixFormat_YUV422_Planar ||
+		dst.format == nsVDPixmap::kPixFormat_YUV420_Planar ||
+		dst.format == nsVDPixmap::kPixFormat_YUV410_Planar) {
+		// wrap each plane as a standalone Y8 pixmap
+		VDPixmap pxY;
+		VDPixmap pxCb;
+		VDPixmap pxCr;
+
+		pxY.format = nsVDPixmap::kPixFormat_Y8;
+		pxY.data = dst.data;
+		pxY.pitch = dst.pitch;
+		pxY.w = dst.w;
+		pxY.h = dst.h;
+
+		pxCb.format = nsVDPixmap::kPixFormat_Y8;
+		pxCb.data = dst.data2;
+		pxCb.pitch = dst.pitch2;
+		pxCb.w = dst.w;
+		pxCb.h = dst.h;
+
+		pxCr.format = nsVDPixmap::kPixFormat_Y8;
+		pxCr.data = dst.data3;
+		pxCr.pitch = dst.pitch3;
+		pxCr.w = dst.w;
+		pxCr.h = dst.h;
+
+		// color is packed as (Cr << 16) | (Y << 8) | Cb
+		uint32 colorY = (color >> 8) & 0xff;
+		uint32 colorCb = (color >> 0) & 0xff;
+		uint32 colorCr = (color >> 16) & 0xff;
+
+		VDPixmapFillRegionAntialiased8x(pxY, region, x, y, colorY);
+
+		switch(dst.format) {
+		case nsVDPixmap::kPixFormat_YUV410_Planar:
+			// chroma plane is 1/4 size; 8x subpixels become 32x in chroma space
+			pxCr.w = pxCb.w = dst.w >> 2;
+			pxCr.h = pxCb.h = dst.h >> 2;
+			x >>= 2;
+			y >>= 2;
+			VDPixmapFillRegionAntialiased_32x_32x(pxCb, region, x, y, colorCb);
+			VDPixmapFillRegionAntialiased_32x_32x(pxCr, region, x, y, colorCr);
+			return true;
+		case nsVDPixmap::kPixFormat_YUV420_Planar:
+			// chroma plane is 1/2 size both ways; 8x subpixels become 16x
+			pxCr.w = pxCb.w = dst.w >> 1;
+			pxCr.h = pxCb.h = dst.h >> 1;
+			x >>= 1;
+			y >>= 1;
+			// NOTE(review): +2 in 1/16-pel units = 1/8 chroma pixel shift --
+			// presumably compensates horizontal chroma siting; confirm against
+			// upstream VirtualDub before changing.
+			x += 2;
+			VDPixmapFillRegionAntialiased_16x_16x(pxCb, region, x, y, colorCb);
+			VDPixmapFillRegionAntialiased_16x_16x(pxCr, region, x, y, colorCr);
+			return true;
+		case nsVDPixmap::kPixFormat_YUV422_Planar:
+			// chroma plane is 1/2 width only; vertical stays at 8x
+			pxCr.w = pxCb.w = dst.w >> 1;
+			x >>= 1;
+			// NOTE(review): same chroma-siting shift as the 4:2:0 case above
+			x += 2;
+			VDPixmapFillRegionAntialiased_16x_8x(pxCb, region, x, y, colorCb);
+			VDPixmapFillRegionAntialiased_16x_8x(pxCr, region, x, y, colorCr);
+			return true;
+		case nsVDPixmap::kPixFormat_YUV444_Planar:
+			// full-resolution chroma: recurse at the same 8x scale
+			VDPixmapFillRegionAntialiased8x(pxCb, region, x, y, colorCb);
+			VDPixmapFillRegionAntialiased8x(pxCr, region, x, y, colorCr);
+			return true;
+		}
+	}
+
+	if (dst.format != nsVDPixmap::kPixFormat_XRGB8888 && dst.format != nsVDPixmap::kPixFormat_Y8)
+		return false;
+
+	// fast out
+	if (region.mSpans.empty())
+		return true;
+
+	// check if vertical clipping is required
+	const size_t n = region.mSpans.size();
+	uint32 start = 0;
+	uint32 end = n;
+
+	uint32 spanmin = -x + (-y << 16) + 0x80008000;
+
+	if (region.mSpans.front() < spanmin) {
+		// find first span : x2 > spanmin (round down to an x1/x2 pair boundary)
+		start = std::upper_bound(region.mSpans.begin(), region.mSpans.end(), spanmin) - region.mSpans.begin();
+		start &= ~1;
+
+		// check for total top clip
+		if (start >= n)
+			return true;
+	}
+
+	uint32 spanlimit = (dst.w*8 - x) + (((dst.h*8 - y) - 1) << 16) + 0x80008000;
+
+	if (region.mSpans.back() > spanlimit) {
+		// find last span : x1 < spanlimit
+		end = std::lower_bound(region.mSpans.begin(), region.mSpans.end(), spanlimit) - region.mSpans.begin();
+
+		end = (end + 1) & ~1;
+
+		// check for total bottom clip
+		if (start >= end)
+			return true;
+	}
+
+	// allocate A-buffer (8-bit accumulators; max coverage 8*8 = 64)
+	vdfastvector<uint8> abuffer(dst.w, 0);
+
+	// fill region
+	const uint32 *pSpan = &region.mSpans[start];
+	const uint32 *pEnd = &region.mSpans[0] + end;
+	int lasty = -1;
+
+	sint32 dstw8 = dst.w * 8;
+	sint32 dsth8 = dst.h * 8;
+
+	for(; pSpan != pEnd; pSpan += 2) {
+		uint32 span0 = pSpan[0];
+		uint32 span1 = pSpan[1];
+
+		sint32 py = (span0 >> 16) - 0x8000 + y;
+
+		// unsigned compare rejects py < 0 and py >= dsth8 in one test
+		if ((uint32)py >= (uint32)dsth8)
+			continue;
+
+		sint32 px1 = (span0 & 0xffff) - 0x8000 + x;
+		sint32 px2 = (span1 & 0xffff) - 0x8000 + x;
+		sint32 w = span1-span0;
+
+		if (lasty != py) {
+			// flush only when crossing into a new output row (py / 8)
+			if (((lasty ^ py) & 0xFFFFFFF8)) {
+				if (lasty >= 0) {
+					// flush scanline
+
+					if (dst.format == nsVDPixmap::kPixFormat_XRGB8888)
+						RenderABuffer32(dst, lasty >> 3, abuffer.data(), dst.w, color);
+					else
+						RenderABuffer8(dst, lasty >> 3, abuffer.data(), dst.w, color);
+				}
+
+				memset(abuffer.data(), 0, abuffer.size());
+			}
+			lasty = py;
+		}
+
+		// horizontal clip in subpixel units
+		if (px1 < 0)
+			px1 = 0;
+		if (px2 > dstw8)
+			px2 = dstw8;
+
+		if (px1 >= px2)
+			continue;
+
+		uint32 ix1 = px1 >> 3;
+		uint32 ix2 = px2 >> 3;
+		uint8 *p1 = abuffer.data() + ix1;
+		uint8 *p2 = abuffer.data() + ix2;
+
+		if (p1 == p2) {
+			// span contained within a single output pixel
+			p1[0] += (px2 - px1);
+		} else {
+			// left partial pixel
+			if (px1 & 7) {
+				p1[0] += 8 - (px1 & 7);
+				++p1;
+			}
+
+			// fully covered interior pixels
+			while(p1 != p2) {
+				p1[0] += 8;
+				++p1;
+			}
+
+			// right partial pixel
+			if (px2 & 7)
+				p1[0] += px2 & 7;
+		}
+	}
+
+	// flush the final scanline
+	if (lasty >= 0) {
+		if (dst.format == nsVDPixmap::kPixFormat_XRGB8888)
+			RenderABuffer32(dst, lasty >> 3, abuffer.data(), dst.w, color);
+		else
+			RenderABuffer8(dst, lasty >> 3, abuffer.data(), dst.w, color);
+	}
+
+	return true;
+}
+
+// Build a filled disc of radius 'r' (in span units) centered at the origin.
+// One span pair is emitted per row y in [-ir, ir]; spans are packed as
+// 0x80008000-biased (y << 16) + x values, with x2 stored exclusive (+1).
+void VDPixmapCreateRoundRegion(VDPixmapRegion& dst, float r) {
+	int ir = VDCeilToInt(r);
+	float r2 = r*r;
+
+	dst.mSpans.clear();
+	// left/right bounds are known up front; top/bottom are refined below
+	dst.mBounds.set(-ir, 0, ir+1, 0);
+
+	for(int y = -ir; y <= ir; ++y) {
+		// half-width of the disc at this row
+		int dx = VDCeilToInt(sqrtf(r2 - y*y));
+
+		if (dx > 0) {
+			dst.mSpans.push_back(0x80008000 + (y << 16) - dx);
+			dst.mSpans.push_back(0x80008001 + (y << 16) + dx);
+			if (dst.mBounds.top > y)
+				dst.mBounds.top = y;
+			if (dst.mBounds.bottom < y)
+				dst.mBounds.bottom = y;
+		}
+	}
+}
+
+// Merge region r1 with a translated/widened copy of r2 into dst:
+// each r2 span [x1,x2] becomes [x1+dx1, x2+dx2] shifted down by dy, and the
+// result is the union of that with r1, written as non-overlapping sorted
+// spans.  r1's spans are assumed non-overlapping; r2's widened spans may
+// overlap each other and are coalesced on the fly.  The output size is
+// bounded by |r1| + |r2|, which the initial resize reserves.
+void VDPixmapConvolveRegion(VDPixmapRegion& dst, const VDPixmapRegion& r1, const VDPixmapRegion& r2, int dx1, int dx2, int dy) {
+	dst.mSpans.clear();
+	dst.mSpans.resize(r1.mSpans.size()+r2.mSpans.size());
+
+	const uint32 *itA = r1.mSpans.data();
+	const uint32 *itAE = itA + r1.mSpans.size();
+	const uint32 *itB = r2.mSpans.data();
+	const uint32 *itBE = itB + r2.mSpans.size();
+	uint32 *dstp0 = dst.mSpans.data();
+	uint32 *dstp = dst.mSpans.data();
+
+	// offsets applied to r2's span starts (offset1) and ends (offset2)
+	uint32 offset1 = (dy<<16) + dx1;
+	uint32 offset2 = (dy<<16) + dx2;
+
+	while(itA != itAE && itB != itBE) {
+		uint32 x1;
+		uint32 x2;
+
+		if (itB[0] + offset1 < itA[0]) {
+			// B span is earlier. Use it.
+			x1 = itB[0] + offset1;
+			x2 = itB[1] + offset2;
+			itB += 2;
+
+			// B spans *can* overlap, due to the widening.
+			while(itB != itBE && itB[0]+offset1 <= x2) {
+				uint32 bx2 = itB[1] + offset2;
+				if (x2 < bx2)
+					x2 = bx2;
+
+				itB += 2;
+			}
+
+			// jump into the merge loop at the A-side check
+			goto a_start;
+		} else {
+			// A span is earlier. Use it.
+			x1 = itA[0];
+			x2 = itA[1];
+			itA += 2;
+
+			// A spans don't overlap, so begin merge loop with B first.
+		}
+
+		// alternately absorb B and A spans that overlap the growing [x1,x2]
+		for(;;) {
+			// If we run out of B spans or the B span doesn't overlap,
+			// then the next A span can't either (because A spans don't
+			// overlap) and we exit.
+
+			if (itB == itBE || itB[0]+offset1 > x2)
+				break;
+
+			do {
+				uint32 bx2 = itB[1] + offset2;
+				if (x2 < bx2)
+					x2 = bx2;
+
+				itB += 2;
+			} while(itB != itBE && itB[0]+offset1 <= x2);
+
+			// If we run out of A spans or the A span doesn't overlap,
+			// then the next B span can't either, because we would have
+			// consumed all overlapping B spans in the above loop.
+a_start:
+			if (itA == itAE || itA[0] > x2)
+				break;
+
+			do {
+				uint32 ax2 = itA[1];
+				if (x2 < ax2)
+					x2 = ax2;
+
+				itA += 2;
+			} while(itA != itAE && itA[0] <= x2);
+		}
+
+		// Flush span.
+		dstp[0] = x1;
+		dstp[1] = x2;
+		dstp += 2;
+	}
+
+	// Copy over leftover spans.  Remaining A spans are already coalesced.
+	memcpy(dstp, itA, sizeof(uint32)*(itAE - itA));
+	dstp += itAE - itA;
+
+	while(itB != itBE) {
+		// B span is earlier. Use it.
+		uint32 x1 = itB[0] + offset1;
+		uint32 x2 = itB[1] + offset2;
+		itB += 2;
+
+		// B spans *can* overlap, due to the widening.
+		while(itB != itBE && itB[0]+offset1 <= x2) {
+			uint32 bx2 = itB[1] + offset2;
+			if (x2 < bx2)
+				x2 = bx2;
+
+			itB += 2;
+		}
+
+		dstp[0] = x1;
+		dstp[1] = x2;
+		dstp += 2;
+	}
+
+	// shrink to the number of spans actually produced
+	dst.mSpans.resize(dstp - dst.mSpans.data());
+}
+
+// Morphological dilation: convolve region r1 by structuring element r2.
+// Each span of r2 contributes one translated/widened union pass; dst is
+// built incrementally by ping-ponging with a temporary region, so cost is
+// O(|r2| spans) passes over the growing result.
+void VDPixmapConvolveRegion(VDPixmapRegion& dst, const VDPixmapRegion& r1, const VDPixmapRegion& r2) {
+	VDPixmapRegion temp;
+
+	const uint32 *src1 = r2.mSpans.data();
+	const uint32 *src2 = src1 + r2.mSpans.size();
+
+	dst.mSpans.clear();
+	while(src1 != src2) {
+		// unpack one structuring-element span (0x80008000-biased)
+		uint32 p1 = src1[0];
+		uint32 p2 = src1[1];
+		src1 += 2;
+
+		// accumulate: dst = (previous dst) union (r1 shifted by this span)
+		temp.mSpans.swap(dst.mSpans);
+		VDPixmapConvolveRegion(dst, temp, r1, (p1 & 0xffff) - 0x8000, (p2 & 0xffff) - 0x8000, (p1 >> 16) - 0x8000);
+	}
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/resample.cpp b/src/thirdparty/VirtualDub/Kasumi/source/resample.cpp
new file mode 100644
index 000000000..4d1aef5f5
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/resample.cpp
@@ -0,0 +1,348 @@
+// VirtualDub - Video processing and capture application
+// Graphics support library
+// Copyright (C) 1998-2004 Avery Lee
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+#include <float.h>
+#include <math.h>
+#include <vd2/system/vdalloc.h>
+#include <vd2/system/vdstl.h>
+#include <vd2/system/memory.h>
+#include <vd2/system/math.h>
+#include <vd2/system/cpuaccel.h>
+#include <vd2/Kasumi/pixmap.h>
+#include <vd2/Kasumi/pixmaputils.h>
+#include <vd2/Kasumi/resample.h>
+#include "uberblit_gen.h"
+
+///////////////////////////////////////////////////////////////////////////
+//
+// the resampler (finally)
+//
+///////////////////////////////////////////////////////////////////////////
+
+// Concrete IVDPixmapResampler.  Init() compiles uberblit pipelines -- one
+// for the primary plane and, for planar YUV formats, a second shared by
+// both chroma planes -- which Process() then replays for each frame.
+class VDPixmapResampler : public IVDPixmapResampler {
+public:
+	VDPixmapResampler();
+	~VDPixmapResampler();
+
+	// A = bicubic spline coefficient (default -0.6); takes effect at next Init()
+	void SetSplineFactor(double A) { mSplineFactor = A; }
+	void SetFilters(FilterMode h, FilterMode v, bool interpolationOnly);
+	bool Init(uint32 dw, uint32 dh, int dstformat, uint32 sw, uint32 sh, int srcformat);
+	bool Init(const vdrect32f& dstrect, uint32 dw, uint32 dh, int dstformat, const vdrect32f& srcrect, uint32 sw, uint32 sh, int srcformat);
+	void Shutdown();
+
+	void Process(const VDPixmap& dst, const VDPixmap& src);
+
+protected:
+	void ApplyFilters(VDPixmapUberBlitterGenerator& gen, uint32 dw, uint32 dh, float xoffset, float yoffset, float xfactor, float yfactor);
+
+	vdautoptr<IVDPixmapBlitter> mpBlitter;	// plane 0 (luma / packed) blitter
+	vdautoptr<IVDPixmapBlitter> mpBlitter2;	// chroma-plane blitter (planar formats only)
+	double mSplineFactor;
+	FilterMode mFilterH;
+	FilterMode mFilterV;
+	bool mbInterpOnly;
+
+	vdrect32 mDstRectPlane0;	// integral dest rect for plane 0
+	vdrect32 mDstRectPlane12;	// integral dest rect for planes 1/2
+};
+
+// Factory: caller owns the returned resampler.
+IVDPixmapResampler *VDCreatePixmapResampler() { return new VDPixmapResampler; }
+
+// Defaults: bicubic in both axes with A = -0.6, full filtering mode.
+VDPixmapResampler::VDPixmapResampler()
+	: mSplineFactor(-0.6)
+	, mFilterH(kFilterCubic)
+	, mFilterV(kFilterCubic)
+	, mbInterpOnly(false)
+{
+}
+
+VDPixmapResampler::~VDPixmapResampler() {
+	Shutdown();
+}
+
+// Select horizontal/vertical filter modes; takes effect at next Init().
+// interpolationOnly disables the decimation-aware filter scaling.
+void VDPixmapResampler::SetFilters(FilterMode h, FilterMode v, bool interpolationOnly) {
+	mFilterH = h;
+	mFilterV = v;
+	mbInterpOnly = interpolationOnly;
+}
+
+// Convenience overload: resample the full source surface onto the full
+// destination surface (no sub-rect cropping).
+bool VDPixmapResampler::Init(uint32 dw, uint32 dh, int dstformat, uint32 sw, uint32 sh, int srcformat) {
+	const vdrect32f srcRect(0.0f, 0.0f, (float)sw, (float)sh);
+	const vdrect32f dstRect(0.0f, 0.0f, (float)dw, (float)dh);
+
+	return Init(dstRect, dw, dh, dstformat, srcRect, sw, sh, srcformat);
+}
+
+// Full Init: map srcrect onto dstrect within a dw x dh destination of
+// 'dstformat'.  Handles flipped rects, clips the destination rect to the
+// surface, derives half-pel-correct sampling offsets, and compiles the
+// uberblit pipeline(s).  Formats must match and be one of the supported
+// set; returns false on unsupported formats or pipeline build failure.
+bool VDPixmapResampler::Init(const vdrect32f& dstrect0, uint32 dw, uint32 dh, int dstformat, const vdrect32f& srcrect0, uint32 sw, uint32 sh, int srcformat) {
+	Shutdown();
+
+	if (dstformat != srcformat || (
+		srcformat != nsVDPixmap::kPixFormat_XRGB8888 &&
+		srcformat != nsVDPixmap::kPixFormat_Y8 &&
+		srcformat != nsVDPixmap::kPixFormat_YUV444_Planar &&
+		srcformat != nsVDPixmap::kPixFormat_YUV422_Planar &&
+		srcformat != nsVDPixmap::kPixFormat_YUV420_Planar &&
+		srcformat != nsVDPixmap::kPixFormat_YUV411_Planar &&
+		srcformat != nsVDPixmap::kPixFormat_YUV410_Planar
+		))
+		return false;
+
+	// convert destination flips to source flips
+	vdrect32f dstrect(dstrect0);
+	vdrect32f srcrect(srcrect0);
+
+	if (dstrect.left > dstrect.right) {
+		std::swap(dstrect.left, dstrect.right);
+		std::swap(srcrect.left, srcrect.right);
+	}
+
+	if (dstrect.top > dstrect.bottom) {
+		std::swap(dstrect.top, dstrect.bottom);
+		std::swap(srcrect.top, srcrect.bottom);
+	}
+
+	// compute source step factors
+	// NOTE(review): assumes dstrect has nonzero extent -- a degenerate rect
+	// divides by zero here; confirm callers guarantee this.
+	float xfactor = (float)srcrect.width() / (float)dstrect.width();
+	float yfactor = (float)srcrect.height() / (float)dstrect.height();
+
+	// clip destination rect (adjusting the source rect proportionally)
+	if (dstrect.left < 0) {
+		float clipx1 = -dstrect.left;
+		srcrect.left += xfactor * clipx1;
+		dstrect.left = 0.0f;
+	}
+
+	if (dstrect.top < 0) {
+		float clipy1 = -dstrect.top;
+		srcrect.top += yfactor * clipy1;
+		dstrect.top = 0.0f;
+	}
+
+	float clipx2 = dstrect.right - (float)dw;
+	if (clipx2 > 0) {
+		srcrect.right -= xfactor * clipx2;
+		dstrect.right = (float)dw;
+	}
+
+	float clipy2 = dstrect.bottom - (float)dh;
+	if (clipy2 > 0) {
+		srcrect.bottom -= yfactor * clipy2;
+		dstrect.bottom = (float)dh;
+	}
+
+	// compute plane 0 dest rect in integral quanta (round to pixel centers)
+	const VDPixmapFormatInfo& formatInfo = VDPixmapGetInfo(dstformat);
+	mDstRectPlane0.left = VDCeilToInt(dstrect.left - 0.5f);
+	mDstRectPlane0.top = VDCeilToInt(dstrect.top - 0.5f);
+	mDstRectPlane0.right = VDCeilToInt(dstrect.right - 0.5f);
+	mDstRectPlane0.bottom = VDCeilToInt(dstrect.bottom - 0.5f);
+
+	// compute plane 0 stepping parameters (source position of the first
+	// destination pixel center)
+	float xoffset = (((float)mDstRectPlane0.left + 0.5f) - dstrect.left) * xfactor + srcrect.left;
+	float yoffset = (((float)mDstRectPlane0.top + 0.5f) - dstrect.top ) * yfactor + srcrect.top;
+
+	// compute plane 1/2 dest rect and stepping parameters
+	float xoffset2 = 0.0f;
+	float yoffset2 = 0.0f;
+
+	if (formatInfo.auxbufs > 0) {
+		float xf2 = (float)(1 << formatInfo.auxwbits);
+		float yf2 = (float)(1 << formatInfo.auxhbits);
+		float invxf2 = 1.0f / xf2;
+		float invyf2 = 1.0f / yf2;
+
+		// convert source and dest rects to plane 1/2 space
+		vdrect32f srcrect2(srcrect);
+		vdrect32f dstrect2(dstrect);
+
+		srcrect2.scale(invxf2, invyf2);
+		dstrect2.scale(invxf2, invyf2);
+
+		// per-format chroma siting adjustments (horizontal co-siting)
+		switch(srcformat) {
+			case nsVDPixmap::kPixFormat_YUV444_Planar:
+				break;
+			case nsVDPixmap::kPixFormat_YUV422_Planar:
+				srcrect2.translate(0.25f, 0.0f);
+				dstrect2.translate(0.25f, 0.0f);
+				break;
+			case nsVDPixmap::kPixFormat_YUV420_Planar:
+				srcrect2.translate(0.25f, 0.0f);
+				dstrect2.translate(0.25f, 0.0f);
+				break;
+			case nsVDPixmap::kPixFormat_YUV411_Planar:
+				srcrect2.translate(0.375f, 0.0f);
+				dstrect2.translate(0.375f, 0.0f);
+				break;
+			case nsVDPixmap::kPixFormat_YUV410_Planar:
+				break;
+			default:
+				VDASSERT(false);
+		}
+
+		mDstRectPlane12.left = VDCeilToInt(dstrect2.left - 0.5f);
+		mDstRectPlane12.top = VDCeilToInt(dstrect2.top - 0.5f);
+		mDstRectPlane12.right = VDCeilToInt(dstrect2.right - 0.5f);
+		mDstRectPlane12.bottom = VDCeilToInt(dstrect2.bottom - 0.5f);
+
+		// xfactor/yfactor are unchanged: both rects scaled by the same amount
+		xoffset2 = (((float)mDstRectPlane12.left + 0.5f) - dstrect2.left) * xfactor + srcrect2.left;
+		yoffset2 = (((float)mDstRectPlane12.top + 0.5f) - dstrect2.top ) * yfactor + srcrect2.top;
+	}
+
+	// build the plane-0 pipeline, plus a shared chroma pipeline for planar YUV
+	VDPixmapUberBlitterGenerator gen;
+
+	switch(srcformat) {
+		case nsVDPixmap::kPixFormat_XRGB8888:
+			gen.ldsrc(0, 0, 0, 0, sw, sh, VDPixmapGetFormatTokenFromFormat(srcformat), sw*4);
+			ApplyFilters(gen, mDstRectPlane0.width(), mDstRectPlane0.height(), xoffset, yoffset, xfactor, yfactor);
+			break;
+
+		case nsVDPixmap::kPixFormat_Y8:
+			gen.ldsrc(0, 0, 0, 0, sw, sh, kVDPixType_8, sw);
+			ApplyFilters(gen, mDstRectPlane0.width(), mDstRectPlane0.height(), xoffset, yoffset, xfactor, yfactor);
+			break;
+
+		case nsVDPixmap::kPixFormat_YUV444_Planar:
+		case nsVDPixmap::kPixFormat_YUV422_Planar:
+		case nsVDPixmap::kPixFormat_YUV420_Planar:
+		case nsVDPixmap::kPixFormat_YUV411_Planar:
+		case nsVDPixmap::kPixFormat_YUV410_Planar:
+			gen.ldsrc(0, 0, 0, 0, sw, sh, kVDPixType_8, sw);
+			ApplyFilters(gen, mDstRectPlane0.width(), mDstRectPlane0.height(), xoffset, yoffset, xfactor, yfactor);
+
+			{
+				// subsampled chroma plane dimensions (round up)
+				const VDPixmapFormatInfo& info = VDPixmapGetInfo(dstformat);
+				uint32 subsw = -(-(sint32)sw >> info.auxwbits);
+				uint32 subsh = -(-(sint32)sh >> info.auxhbits);
+
+				VDPixmapUberBlitterGenerator gen2;
+				gen2.ldsrc(0, 0, 0, 0, subsw, subsh, kVDPixType_8, subsw);
+				ApplyFilters(gen2, mDstRectPlane12.width(), mDstRectPlane12.height(), xoffset2, yoffset2, xfactor, yfactor);
+				mpBlitter2 = gen2.create();
+				if (!mpBlitter2)
+					return false;
+			}
+			break;
+	}
+
+	mpBlitter = gen.create();
+	if (!mpBlitter)
+		return false;
+
+	return true;
+}
+
+// Release both compiled blitters (vdautoptr frees on reassignment).
+void VDPixmapResampler::Shutdown() {
+	mpBlitter = NULL;
+	mpBlitter2 = NULL;
+}
+
+// Execute the pipelines compiled by Init().  dst/src must have the format
+// and dimensions Init() was given; a no-op if Init() was never called or
+// failed.  For planar YUV, the chroma planes are wrapped as Y8 pixmaps and
+// run through the shared chroma blitter.
+void VDPixmapResampler::Process(const VDPixmap& dst, const VDPixmap& src) {
+	if (!mpBlitter)
+		return;
+
+	switch(dst.format) {
+		case nsVDPixmap::kPixFormat_XRGB8888:
+		case nsVDPixmap::kPixFormat_Y8:
+			mpBlitter->Blit(dst, &mDstRectPlane0, src);
+			break;
+
+		case nsVDPixmap::kPixFormat_YUV444_Planar:
+		case nsVDPixmap::kPixFormat_YUV422_Planar:
+		case nsVDPixmap::kPixFormat_YUV420_Planar:
+		case nsVDPixmap::kPixFormat_YUV411_Planar:
+		case nsVDPixmap::kPixFormat_YUV410_Planar:
+			// blit primary plane
+			mpBlitter->Blit(dst, &mDstRectPlane0, src);
+
+			// slice and blit secondary planes
+			{
+				const VDPixmapFormatInfo& formatInfo = VDPixmapGetInfo(dst.format);
+				VDPixmap pxdst;
+				pxdst.format = nsVDPixmap::kPixFormat_Y8;
+				// -(-w >> bits) == ceil(w / 2^bits)
+				pxdst.w = -(-dst.w >> formatInfo.auxwbits);
+				pxdst.h = -(-dst.h >> formatInfo.auxhbits);
+				pxdst.pitch = dst.pitch2;
+				pxdst.data = dst.data2;
+
+				VDPixmap pxsrc;
+				pxsrc.format = nsVDPixmap::kPixFormat_Y8;
+				pxsrc.w = -(-src.w >> formatInfo.auxwbits);
+				pxsrc.h = -(-src.h >> formatInfo.auxhbits);
+				pxsrc.pitch = src.pitch2;
+				pxsrc.data = src.data2;
+
+				// Cb plane
+				mpBlitter2->Blit(pxdst, &mDstRectPlane12, pxsrc);
+
+				// Cr plane: same geometry, third data/pitch pair
+				pxdst.pitch = dst.pitch3;
+				pxdst.data = dst.data3;
+				pxsrc.pitch = src.pitch3;
+				pxsrc.data = src.data3;
+				mpBlitter2->Blit(pxdst, &mDstRectPlane12, pxsrc);
+			}
+			break;
+	}
+}
+
+// Append the configured horizontal then vertical filter stages to the
+// pipeline generator for a dw x dh output, with the given source offsets
+// (source position of the first output pixel center) and step factors.
+void VDPixmapResampler::ApplyFilters(VDPixmapUberBlitterGenerator& gen, uint32 dw, uint32 dh, float xoffset, float yoffset, float xfactor, float yfactor) {
+	switch(mFilterH) {
+		case kFilterPoint:
+			gen.pointh(xoffset, xfactor, dw);
+			break;
+
+		case kFilterLinear:
+			gen.linearh(xoffset, xfactor, dw, mbInterpOnly);
+			break;
+
+		case kFilterCubic:
+			gen.cubich(xoffset, xfactor, dw, (float)mSplineFactor, mbInterpOnly);
+			break;
+
+		case kFilterLanczos3:
+			gen.lanczos3h(xoffset, xfactor, dw);
+			break;
+	}
+
+	switch(mFilterV) {
+		case kFilterPoint:
+			gen.pointv(yoffset, yfactor, dh);
+			break;
+
+		case kFilterLinear:
+			gen.linearv(yoffset, yfactor, dh, mbInterpOnly);
+			break;
+
+		case kFilterCubic:
+			gen.cubicv(yoffset, yfactor, dh, (float)mSplineFactor, mbInterpOnly);
+			break;
+
+		case kFilterLanczos3:
+			gen.lanczos3v(yoffset, yfactor, dh);
+			break;
+	}
+}
+
+// Convenience wrapper: resample src into dst using the same filter on both
+// axes.  Returns false if the format pair is unsupported or the pipeline
+// could not be built.
+bool VDPixmapResample(const VDPixmap& dst, const VDPixmap& src, IVDPixmapResampler::FilterMode filter) {
+	VDPixmapResampler resampler;
+
+	resampler.SetFilters(filter, filter, false);
+
+	if (resampler.Init(dst.w, dst.h, dst.format, src.w, src.h, src.format)) {
+		resampler.Process(dst, src);
+		return true;
+	}
+
+	return false;
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/resample_kernels.cpp b/src/thirdparty/VirtualDub/Kasumi/source/resample_kernels.cpp
new file mode 100644
index 000000000..010364e1a
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/resample_kernels.cpp
@@ -0,0 +1,255 @@
+#include <math.h>
+#include <vd2/Kasumi/resample_kernels.h>
+
+///////////////////////////////////////////////////////////////////////////
+//
+// utility functions
+//
+///////////////////////////////////////////////////////////////////////////
+
+namespace {
+	// 16.16 fixed-point multiply with round-to-nearest, via 64-bit intermediate.
+	inline sint32 scale32x32_fp16(sint32 x, sint32 y) {
+		return (sint32)(((sint64)x * y + 0x8000) >> 16);
+	}
+
+	// sin(x)/x with the removable singularity at 0 handled explicitly.
+	inline double sinc(double x) {
+		return fabs(x) < 1e-9 ? 1.0 : sin(x) / x;
+	}
+}
+
+///////////////////////////////////////////////////////////////////////////
+//
+// VDResamplerAxis
+//
+///////////////////////////////////////////////////////////////////////////
+
+// Set the per-pixel source step (16.16 fixed point) for this axis.
+void VDResamplerAxis::Init(sint32 dudx) {
+	this->dudx = dudx;
+}
+
+// Partition 'count' output pixels into five consecutive regions based on
+// where the filter kernel falls relative to the source extent [0, w):
+//   precopy  - kernel entirely left of source (replicate edge)
+//   preclip  - kernel straddles the left edge (clamped taps)
+//   active   - kernel fully inside the source (fast path)
+//   postclip - kernel straddles the right edge
+//   postcopy - kernel entirely right of source
+// u0 is the 16.16 source coordinate of the first output pixel; kernel_width
+// is the tap count.  If the kernel is wider than the source, the pre/post
+// clip regions merge into dx_dualclip.
+void VDResamplerAxis::Compute(sint32 count, sint32 u0, sint32 w, sint32 kernel_width) {
+	u = u0;
+	dx = count;
+
+	sint32 du_kern	= (kernel_width-1) << 16;
+	sint32 u2 = u + dudx*(dx-1);
+	sint32 u_limit	= w << 16;
+
+	dx_precopy	= 0;
+	dx_preclip	= 0;
+	dx_active	= 0;
+	dx_postclip = 0;
+	dx_postcopy = 0;
+	dx_dualclip	= 0;
+
+	sint32 dx_temp = dx;
+	sint32 u_start = u;
+
+	// (desired - u0 + (dudx-1)) / dudx : first pixel >= desired
+
+	sint32 dudx_m1_mu0	= dudx - 1 - u;
+	sint32 first_preclip	= (dudx_m1_mu0 + 0x10000 - du_kern) / dudx;
+	sint32 first_active		= (dudx_m1_mu0                    ) / dudx;
+	sint32 first_postclip	= (dudx_m1_mu0 + u_limit - du_kern) / dudx;
+	sint32 first_postcopy	= (dudx_m1_mu0 + u_limit - 0x10000) / dudx;
+
+	// clamp: boundaries must be monotonic and within [0, dx]
+	if (first_preclip < 0)
+		first_preclip = 0;
+	if (first_active < first_preclip)
+		first_active = first_preclip;
+	if (first_postclip < first_active)
+		first_postclip = first_active;
+	if (first_postcopy < first_postclip)
+		first_postcopy = first_postclip;
+	if (first_preclip > dx)
+		first_preclip = dx;
+	if (first_active > dx)
+		first_active = dx;
+	if (first_postclip > dx)
+		first_postclip = dx;
+	if (first_postcopy > dx)
+		first_postcopy = dx;
+
+	// determine widths
+
+	dx_precopy	= first_preclip;
+	dx_preclip	= first_active - first_preclip;
+	dx_active	= first_postclip - first_active;
+	dx_postclip	= first_postcopy - first_postclip;
+	dx_postcopy = dx - first_postcopy;
+
+	// sanity checks (debug builds only)
+	sint32 pos0 = dx_precopy;
+	sint32 pos1 = pos0 + dx_preclip;
+	sint32 pos2 = pos1 + dx_active;
+	sint32 pos3 = pos2 + dx_postclip;
+
+	VDASSERT(!((dx_precopy|dx_preclip|dx_active|dx_postcopy|dx_postclip) & 0x80000000));
+	VDASSERT(dx_precopy + dx_preclip + dx_active + dx_postcopy + dx_postclip == dx);
+
+	VDASSERT(!pos0			|| u_start + dudx*(pos0 - 1) <  0x10000 - du_kern);	// precopy -> preclip
+	VDASSERT( pos0 >= pos1	|| u_start + dudx*(pos0    ) >= 0x10000 - du_kern);
+	VDASSERT( pos1 <= pos0	|| u_start + dudx*(pos1 - 1) <  0);					// preclip -> active
+	VDASSERT( pos1 >= pos2	|| u_start + dudx*(pos1    ) >= 0 || !dx_active);
+	VDASSERT( pos2 <= pos1	|| u_start + dudx*(pos2 - 1) <  u_limit - du_kern || !dx_active);	// active -> postclip
+	VDASSERT( pos2 >= pos3	|| u_start + dudx*(pos2    ) >= u_limit - du_kern);
+	VDASSERT( pos3 <= pos2	|| u_start + dudx*(pos3 - 1) <  u_limit - 0x10000);	// postclip -> postcopy
+	VDASSERT( pos3 >= dx	|| u_start + dudx*(pos3    ) >= u_limit - 0x10000);
+
+	// advance u past the precopy region so it points at the first real sample
+	u += dx_precopy * dudx;
+
+	// test for overlapping clipping regions
+	if (!dx_active && kernel_width > w) {
+		dx_dualclip = dx_preclip + dx_postclip;
+		dx_preclip = dx_postclip = 0;
+	}
+}
+
+///////////////////////////////////////////////////////////////////////////
+//
+// VDResamplerLinearFilter
+//
+///////////////////////////////////////////////////////////////////////////
+
+// Triangle (bilinear) filter.  twofc is twice the cutoff frequency; tap
+// count scales with 1/twofc so decimation widens the kernel.
+VDResamplerLinearFilter::VDResamplerLinearFilter(double twofc)
+	: mScale(twofc)
+	, mTaps((int)ceil(1.0 / twofc) * 2)
+{
+}
+
+int VDResamplerLinearFilter::GetFilterWidth() const {
+	return mTaps;
+}
+
+// Triangle function: peaks at 1 for t=0, reaches 0 at |t| = 1/mScale.
+// The t + fabs(t) trick yields 2*t when positive and 0 when negative
+// (note: result is therefore twice the unit triangle).
+double VDResamplerLinearFilter::EvaluateFilter(double t) const {
+	t = 1.0f - fabs(t)*mScale;
+
+	return t + fabs(t);
+}
+
+// Emit mTaps coefficients for a kernel centered at fractional 'offset'
+// (in output-pixel units, 0..1).
+void VDResamplerLinearFilter::GenerateFilter(float *dst, double offset) const {
+	double pos = -((double)((mTaps>>1)-1) + offset) * mScale;
+
+	for(unsigned i=0; i<mTaps; ++i) {
+		double t = 1.0 - fabs(pos);
+
+		*dst++ = (float)(t+fabs(t));
+		pos += mScale;
+	}
+}
+
+// Emit 256 phase-shifted kernels (offset = k/256), mTaps floats each.
+void VDResamplerLinearFilter::GenerateFilterBank(float *dst) const {
+	for(int offset=0; offset<256; ++offset) {
+		GenerateFilter(dst, offset * (1.0f / 256.0f));
+		dst += mTaps;
+	}
+}
+
+///////////////////////////////////////////////////////////////////////////
+//
+// VDResamplerCubicFilter
+//
+///////////////////////////////////////////////////////////////////////////
+
+// Keys bicubic filter with free parameter A (commonly -0.5 to -1.0).
+// mA* are the |t| < 1 polynomial coefficients, mB* the 1 <= |t| < 2 ones;
+// support is 4 source pixels, widened by 1/twofc for decimation.
+VDResamplerCubicFilter::VDResamplerCubicFilter(double twofc, double A)
+	: mScale(twofc)
+	, mA0( 1.0  )
+	, mA2(-3.0-A)
+	, mA3( 2.0+A)
+	, mB0(-4.0*A)
+	, mB1( 8.0*A)
+	, mB2(-5.0*A)
+	, mB3(     A)
+	, mTaps((int)ceil(2.0 / twofc)*2)
+{
+}
+
+int VDResamplerCubicFilter::GetFilterWidth() const { return mTaps; }
+
+// Piecewise cubic: near segment for |t| < 1, far segment for 1 <= |t| < 2,
+// zero outside the support.
+double VDResamplerCubicFilter::EvaluateFilter(double t) const {
+	t = fabs(t)*mScale;
+
+	if (t < 1.0)
+		return mA0 + (t*t)*(mA2 + t*mA3);
+	else if (t < 2.0)
+		return mB0 + t*(mB1 + t*(mB2 + t*mB3));
+	else
+		return 0;
+}
+
+// Emit mTaps coefficients for a kernel centered at fractional 'offset'.
+void VDResamplerCubicFilter::GenerateFilter(float *dst, double offset) const {
+	double pos = -((double)((mTaps>>1)-1) + offset) * mScale;
+
+	for(unsigned i=0; i<mTaps; ++i) {
+		double t = fabs(pos);
+		double v = 0;
+
+		if (t < 1.0)
+			v = mA0 + (t*t)*(mA2 + t*mA3);
+		else if (t < 2.0)
+			v = mB0 + t*(mB1 + t*(mB2 + t*mB3));
+
+		*dst++ = (float)v;
+		pos += mScale;
+	}
+}
+
+// Emit 256 phase-shifted kernels (offset = k/256), mTaps floats each.
+void VDResamplerCubicFilter::GenerateFilterBank(float *dst) const {
+	for(int offset=0; offset<256; ++offset) {
+		GenerateFilter(dst, offset * (1.0f / 256.0f));
+		dst += mTaps;
+	}
+}
+
+///////////////////////////////////////////////////////////////////////////
+//
+// VDResamplerLanczos3Filter
+//
+///////////////////////////////////////////////////////////////////////////
+
+// Lanczos-3 windowed-sinc filter: sinc(pi t) * sinc(pi t / 3) over a
+// 6-pixel support, widened by 1/twofc for decimation.
+VDResamplerLanczos3Filter::VDResamplerLanczos3Filter(double twofc)
+	: mScale(twofc)
+	, mTaps((int)ceil(3.0 / twofc)*2)
+{
+}
+
+int VDResamplerLanczos3Filter::GetFilterWidth() const {
+	return mTaps;
+}
+
+double VDResamplerLanczos3Filter::EvaluateFilter(double t) const {
+	static const double pi  = 3.1415926535897932384626433832795;	// pi
+	static const double pi3 = 1.0471975511965977461542144610932;	// pi/3
+
+	t *= mScale;
+
+	if (fabs(t) < 3.0)
+		return sinc(pi*t) * sinc(pi3*t);
+	else
+		return 0.0;
+}
+
+// Emit mTaps coefficients for a kernel centered at fractional 'offset'.
+void VDResamplerLanczos3Filter::GenerateFilter(float *dst, double offset) const {
+	static const double pi  = 3.1415926535897932384626433832795;	// pi
+	static const double pi3 = 1.0471975511965977461542144610932;	// pi/3
+
+	double t = -(((double)((mTaps>>1)-1) + offset) * mScale);
+
+	for(unsigned i=0; i<mTaps; ++i) {
+		double v = 0;
+
+		if (fabs(t) < 3.0)
+			v = sinc(pi*t) * sinc(pi3*t);
+
+		*dst++ = (float)v;
+		t += mScale;
+	}
+}
+
+// Emit 256 phase-shifted kernels (offset = k/256), mTaps floats each.
+void VDResamplerLanczos3Filter::GenerateFilterBank(float *dst) const {
+	for(int offset=0; offset<256; ++offset) {
+		GenerateFilter(dst, offset * (1.0f / 256.0f));
+		dst += mTaps;
+	}
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/resample_stages.cpp b/src/thirdparty/VirtualDub/Kasumi/source/resample_stages.cpp
new file mode 100644
index 000000000..fcea6c669
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/resample_stages.cpp
@@ -0,0 +1,149 @@
+#include <vd2/system/math.h>
+#include <vd2/system/vdstl.h>
+#include <vd2/Kasumi/resample_kernels.h>
+#include "resample_stages.h"
+
+VDSteppedAllocator::VDSteppedAllocator(size_t initialSize)
+ : mpHead(NULL)
+ , mpAllocNext(NULL)
+ , mAllocLeft(0)
+ , mAllocNext(initialSize)
+ , mAllocInit(initialSize)
+{
+}
+
+VDSteppedAllocator::~VDSteppedAllocator() {
+ clear();
+}
+
+void VDSteppedAllocator::clear() {
+ while(Block *p = mpHead) {
+ mpHead = mpHead->next;
+ free(p);
+ }
+ mAllocLeft = 0;
+ mAllocNext = mAllocInit;
+}
+
+void *VDSteppedAllocator::allocate(size_type n) {
+ n = (n+15) & ~15;
+ if (mAllocLeft < n) {
+ mAllocLeft = mAllocNext;
+ mAllocNext += (mAllocNext >> 1);
+ if (mAllocLeft < n)
+ mAllocLeft = n;
+
+ Block *t = (Block *)malloc(sizeof(Block) + mAllocLeft);
+
+ if (mpHead)
+ mpHead->next = t;
+
+ mpHead = t;
+ mpHead->next = NULL;
+
+ mpAllocNext = (char *)(mpHead + 1);
+ }
+
+ void *p = mpAllocNext;
+ mpAllocNext += n;
+ mAllocLeft -= n;
+ return p;
+}
+
+void VDResamplerGenerateTable(sint32 *dst, const IVDResamplerFilter& filter) {
+ const unsigned width = filter.GetFilterWidth();
+ vdblock<float> filters(width * 256);
+ float *src = filters.data();
+
+ filter.GenerateFilterBank(src);
+
+ for(unsigned phase=0; phase < 256; ++phase) {
+ float sum = 0;
+
+ for(unsigned i=0; i<width; ++i)
+ sum += src[i];
+
+ float scalefac = 16384.0f / sum;
+
+ for(unsigned j=0; j<width; j += 2) {
+ int v0 = VDRoundToIntFast(src[j+0] * scalefac);
+ int v1 = VDRoundToIntFast(src[j+1] * scalefac);
+
+ dst[j+0] = v0;
+ dst[j+1] = v1;
+ }
+
+ src += width;
+ dst += width;
+ }
+}
+
+void VDResamplerGenerateTableF(float *dst, const IVDResamplerFilter& filter) {
+ const unsigned width = filter.GetFilterWidth();
+ filter.GenerateFilterBank(dst);
+
+ for(unsigned phase=0; phase < 256; ++phase) {
+ float sum = 0;
+
+ for(unsigned i=0; i<width; ++i)
+ sum += dst[i];
+
+ float scalefac = 1.0f / sum;
+
+ for(unsigned j=0; j<width; ++j)
+ *dst++ *= scalefac;
+ }
+}
+
+void VDResamplerGenerateTable2(sint32 *dst, const IVDResamplerFilter& filter, sint32 count, sint32 u0, sint32 dudx) {
+ const unsigned width = filter.GetFilterWidth();
+ vdblock<float> filters(width);
+ float *src = filters.data();
+
+ filter.GenerateFilterBank(src);
+
+ for(sint32 i=0; i<count; ++i) {
+ sint32 u = u0 + dudx*i;
+
+ *dst++ = u >> 16;
+ filter.GenerateFilter(src, (double)(u & 0xffff) / 65536.0);
+
+ float sum = 0;
+ for(uint32 j=0; j<width; ++j)
+ sum += src[j];
+
+ float scalefac = 16384.0f / sum;
+
+ sint32 isum = 0;
+ for(uint32 j=0; j<width; ++j) {
+ sint32 v = VDRoundToIntFast(src[j] * scalefac);
+
+ dst[j] = v;
+ isum += v;
+ }
+
+ sint32 ierr = 16384 - isum;
+ sint32 idelta = 2*(ierr >> 31) - 1;
+ while(ierr) {
+ for(uint32 j=0; j<width && ierr; ++j) {
+ if (!dst[j])
+ continue;
+
+ dst[j] += idelta;
+ ierr -= idelta;
+ }
+ }
+
+ dst += width;
+ }
+}
+
+void VDResamplerSwizzleTable(sint32 *dst, unsigned pairs) {
+ do {
+ sint32 v0 = dst[0];
+ sint32 v1 = dst[1];
+
+ dst[0] = dst[1] = (v0 & 0xffff) + (v1<<16);
+ dst += 2;
+ } while(--pairs);
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/resample_stages_reference.cpp b/src/thirdparty/VirtualDub/Kasumi/source/resample_stages_reference.cpp
new file mode 100644
index 000000000..94bee7c9e
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/resample_stages_reference.cpp
@@ -0,0 +1,425 @@
+#include <vd2/system/memory.h>
+#include <vd2/system/cpuaccel.h>
+#include <vd2/Kasumi/pixmaputils.h>
+#include "resample_stages_reference.h"
+#include <vd2/Kasumi/resample_kernels.h>
+#include "blt_spanutils.h"
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+int VDResamplerRowStageSeparablePoint8::GetWindowSize() const {
+ return 1;
+}
+
+void VDResamplerRowStageSeparablePoint8::Process(void *dst0, const void *src0, uint32 w, uint32 u, uint32 dudx) {
+ uint8 *dst = (uint8 *)dst0;
+ const uint8 *src = (const uint8 *)src0;
+
+ do {
+ *dst++ = src[u>>16];
+ u += dudx;
+ } while(--w);
+}
+
+int VDResamplerRowStageSeparablePoint16::GetWindowSize() const {
+ return 1;
+}
+
+void VDResamplerRowStageSeparablePoint16::Process(void *dst0, const void *src0, uint32 w, uint32 u, uint32 dudx) {
+ uint16 *dst = (uint16 *)dst0;
+ const uint16 *src = (const uint16 *)src0;
+
+ do {
+ *dst++ = src[u>>16];
+ u += dudx;
+ } while(--w);
+}
+
+int VDResamplerRowStageSeparablePoint32::GetWindowSize() const {
+ return 1;
+}
+
+void VDResamplerRowStageSeparablePoint32::Process(void *dst0, const void *src0, uint32 w, uint32 u, uint32 dudx) {
+ uint32 *dst = (uint32 *)dst0;
+ const uint32 *src = (const uint32 *)src0;
+
+ do {
+ *dst++ = src[u>>16];
+ u += dudx;
+ } while(--w);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+int VDResamplerRowStageSeparableLinear8::GetWindowSize() const {return 2;}
+void VDResamplerRowStageSeparableLinear8::Process(void *dst0, const void *src0, uint32 w, uint32 u, uint32 dudx) {
+ uint8 *dst = (uint8 *)dst0;
+ const uint8 *src = (const uint8 *)src0;
+
+ do {
+ const sint32 iu = u>>16;
+ const uint32 p0 = src[iu];
+ const uint32 p1 = src[iu+1];
+ const uint32 f = (u >> 8) & 0xff;
+
+ *dst++ = (uint8)(p0 + (((sint32)(p1 - p0)*f + 0x80)>>8));
+ u += dudx;
+ } while(--w);
+}
+
+void VDResamplerRowStageSeparableLinear8_phaseZeroStepHalf::Process(void *dst0, const void *src0, uint32 w, uint32 u, uint32 dudx) {
+ uint8 *dst = (uint8 *)dst0;
+ const uint8 *src = (const uint8 *)src0;
+
+ VDASSERT(!u && dudx == 0x8000);
+
+ nsVDPixmapSpanUtils::horiz_expand2x_coaligned(dst, src, w);
+}
+
+int VDResamplerRowStageSeparableLinear32::GetWindowSize() const {return 2;}
+void VDResamplerRowStageSeparableLinear32::Process(void *dst0, const void *src0, uint32 w, uint32 u, uint32 dudx) {
+ uint32 *dst = (uint32 *)dst0;
+ const uint32 *src = (const uint32 *)src0;
+
+ do {
+ const sint32 iu = u>>16;
+ const uint32 p0 = src[iu];
+ const uint32 p1 = src[iu+1];
+ const uint32 f = (u >> 8) & 0xff;
+
+ const uint32 p0_rb = p0 & 0xff00ff;
+ const uint32 p1_rb = p1 & 0xff00ff;
+ const uint32 p0_g = p0 & 0xff00;
+ const uint32 p1_g = p1 & 0xff00;
+
+ *dst++ = ((p0_rb + (((p1_rb - p0_rb)*f + 0x800080)>>8)) & 0xff00ff)
+ + ((p0_g + (((p1_g - p0_g )*f + 0x008000)>>8)) & 0x00ff00);
+ u += dudx;
+ } while(--w);
+}
+
+int VDResamplerColStageSeparableLinear8::GetWindowSize() const {return 2;}
+void VDResamplerColStageSeparableLinear8::Process(void *dst0, const void *const *srcarray, uint32 w, sint32 phase) {
+ uint8 *dst = (uint8 *)dst0;
+ const uint8 *src0 = (const uint8 *)srcarray[0];
+ const uint8 *src1 = (const uint8 *)srcarray[1];
+ const uint32 f = (phase >> 8) & 0xff;
+
+ do {
+ const uint32 p0 = *src0++;
+ const uint32 p1 = *src1++;
+
+ *dst++ = (uint8)(p0 + (((p1 - p0)*f + 0x80)>>8));
+ } while(--w);
+}
+
+int VDResamplerColStageSeparableLinear32::GetWindowSize() const {return 2;}
+void VDResamplerColStageSeparableLinear32::Process(void *dst0, const void *const *srcarray, uint32 w, sint32 phase) {
+ uint32 *dst = (uint32 *)dst0;
+ const uint32 *src0 = (const uint32 *)srcarray[0];
+ const uint32 *src1 = (const uint32 *)srcarray[1];
+ const uint32 f = (phase >> 8) & 0xff;
+
+ do {
+ const uint32 p0 = *src0++;
+ const uint32 p1 = *src1++;
+
+ const uint32 p0_rb = p0 & 0xff00ff;
+ const uint32 p1_rb = p1 & 0xff00ff;
+ const uint32 p0_g = p0 & 0xff00;
+ const uint32 p1_g = p1 & 0xff00;
+
+ *dst++ = ((p0_rb + (((p1_rb - p0_rb)*f + 0x800080)>>8)) & 0xff00ff)
+ + ((p0_g + (((p1_g - p0_g )*f + 0x008000)>>8)) & 0x00ff00);
+ } while(--w);
+}
+
+VDResamplerRowStageSeparableTable8::VDResamplerRowStageSeparableTable8(const IVDResamplerFilter& filter) {
+ mFilterBank.resize(filter.GetFilterWidth() * 256);
+ VDResamplerGenerateTable(mFilterBank.data(), filter);
+}
+
+int VDResamplerRowStageSeparableTable8::GetWindowSize() const {return (int)mFilterBank.size() >> 8;}
+
+void VDResamplerRowStageSeparableTable8::Process(void *dst0, const void *src0, uint32 w, uint32 u, uint32 dudx) {
+ uint8 *dst = (uint8 *)dst0;
+ const uint8 *src = (const uint8 *)src0;
+ const unsigned ksize = (int)mFilterBank.size() >> 8;
+ const sint32 *filterBase = mFilterBank.data();
+
+ do {
+ const uint8 *src2 = src + (u>>16);
+ const sint32 *filter = filterBase + ksize*((u>>8)&0xff);
+ u += dudx;
+
+ int b = 0x2000;
+ for(unsigned i = ksize; i; --i) {
+ uint8 p = *src2++;
+ sint32 coeff = *filter++;
+
+ b += (sint32)p*coeff;
+ }
+
+ b >>= 14;
+
+ if ((uint32)b >= 0x00000100)
+ b = ~b >> 31;
+
+ *dst++ = (uint8)b;
+ } while(--w);
+}
+
+VDResamplerRowStageSeparableTable32::VDResamplerRowStageSeparableTable32(const IVDResamplerFilter& filter) {
+ mFilterBank.resize(filter.GetFilterWidth() * 256);
+ VDResamplerGenerateTable(mFilterBank.data(), filter);
+}
+
+int VDResamplerRowStageSeparableTable32::GetWindowSize() const {return (int)mFilterBank.size() >> 8;}
+
+void VDResamplerRowStageSeparableTable32::Process(void *dst0, const void *src0, uint32 w, uint32 u, uint32 dudx) {
+ uint32 *dst = (uint32 *)dst0;
+ const uint32 *src = (const uint32 *)src0;
+ const unsigned ksize = (int)mFilterBank.size() >> 8;
+ const sint32 *filterBase = mFilterBank.data();
+
+ do {
+ const uint32 *src2 = src + (u>>16);
+ const sint32 *filter = filterBase + ksize*((u>>8)&0xff);
+ u += dudx;
+
+ int r = 0x2000, g = 0x2000, b = 0x2000;
+ for(unsigned i = ksize; i; --i) {
+ uint32 p = *src2++;
+ sint32 coeff = *filter++;
+
+ r += ((p>>16)&0xff)*coeff;
+ g += ((p>> 8)&0xff)*coeff;
+ b += ((p )&0xff)*coeff;
+ }
+
+ r <<= 2;
+ g >>= 6;
+ b >>= 14;
+
+ if ((uint32)r >= 0x01000000)
+ r = ~r >> 31;
+ if ((uint32)g >= 0x00010000)
+ g = ~g >> 31;
+ if ((uint32)b >= 0x00000100)
+ b = ~b >> 31;
+
+ *dst++ = (r & 0xff0000) + (g & 0xff00) + (b & 0xff);
+ } while(--w);
+}
+
+VDResamplerRowStageSeparableTable32Fx4::VDResamplerRowStageSeparableTable32Fx4(const IVDResamplerFilter& filter) {
+ mFilterBank.resize(filter.GetFilterWidth() * 256);
+ VDResamplerGenerateTableF(mFilterBank.data(), filter);
+}
+
+int VDResamplerRowStageSeparableTable32Fx4::GetWindowSize() const {return (int)mFilterBank.size() >> 8;}
+
+void VDResamplerRowStageSeparableTable32Fx4::Process(void *dst0, const void *src0, uint32 w, uint32 u, uint32 dudx) {
+ float *dst = (float *)dst0;
+ const float *src = (const float *)src0;
+ const unsigned ksize = (int)mFilterBank.size() >> 8;
+ const float *filterBase = mFilterBank.data();
+
+ do {
+ const float *src2 = src + (u>>16)*4;
+ const float *filter = filterBase + ksize*((u>>8)&0xff);
+ u += dudx;
+
+ float r = 0, g = 0, b = 0, a = 0;
+ for(unsigned i = ksize; i; --i) {
+ float coeff = *filter++;
+
+ r += coeff * src2[0];
+ g += coeff * src2[1];
+ b += coeff * src2[2];
+ a += coeff * src2[3];
+ src2 += 4;
+ }
+
+ dst[0] = r;
+ dst[1] = g;
+ dst[2] = b;
+ dst[3] = a;
+ dst += 4;
+ } while(--w);
+}
+
+VDResamplerRowStageSeparableTable32F::VDResamplerRowStageSeparableTable32F(const IVDResamplerFilter& filter) {
+ mFilterBank.resize(filter.GetFilterWidth() * 256);
+ VDResamplerGenerateTableF(mFilterBank.data(), filter);
+}
+
+int VDResamplerRowStageSeparableTable32F::GetWindowSize() const {return (int)mFilterBank.size() >> 8;}
+
+void VDResamplerRowStageSeparableTable32F::Process(void *dst0, const void *src0, uint32 w, uint32 u, uint32 dudx) {
+ float *dst = (float *)dst0;
+ const float *src = (const float *)src0;
+ const unsigned ksize = (int)mFilterBank.size() >> 8;
+ const float *filterBase = mFilterBank.data();
+
+ VDCPUCleanupExtensions();
+
+ do {
+ const float *src2 = src + (u>>16);
+ const float *filter = filterBase + ksize*((u>>8)&0xff);
+ u += dudx;
+
+ float r = 0;
+ for(unsigned i = ksize; i; --i) {
+ float coeff = *filter++;
+
+ r += coeff * src2[0];
+ ++src2;
+ }
+
+ dst[0] = r;
+ ++dst;
+ } while(--w);
+}
+
+VDResamplerColStageSeparableTable8::VDResamplerColStageSeparableTable8(const IVDResamplerFilter& filter) {
+ mFilterBank.resize(filter.GetFilterWidth() * 256);
+ VDResamplerGenerateTable(mFilterBank.data(), filter);
+}
+
+int VDResamplerColStageSeparableTable8::GetWindowSize() const {return (int)mFilterBank.size() >> 8;}
+
+void VDResamplerColStageSeparableTable8::Process(void *dst0, const void *const *src0, uint32 w, sint32 phase) {
+ uint8 *dst = (uint8 *)dst0;
+ const uint8 *const *src = (const uint8 *const *)src0;
+ const unsigned ksize = (unsigned)mFilterBank.size() >> 8;
+ const sint32 *filter = &mFilterBank[((phase>>8)&0xff) * ksize];
+
+ for(uint32 i=0; i<w; ++i) {
+ int b = 0x2000;
+ const sint32 *filter2 = filter;
+ const uint8 *const *src2 = src;
+
+ for(unsigned j = ksize; j; --j) {
+ sint32 p = (*src2++)[i];
+ sint32 coeff = *filter2++;
+
+ b += p*coeff;
+ }
+
+ b >>= 14;
+
+ if ((uint32)b >= 0x00000100)
+ b = ~b >> 31;
+
+ *dst++ = (uint8)b;
+ }
+}
+
+VDResamplerColStageSeparableTable32::VDResamplerColStageSeparableTable32(const IVDResamplerFilter& filter) {
+ mFilterBank.resize(filter.GetFilterWidth() * 256);
+ VDResamplerGenerateTable(mFilterBank.data(), filter);
+}
+
+int VDResamplerColStageSeparableTable32::GetWindowSize() const {return (int)mFilterBank.size() >> 8;}
+
+void VDResamplerColStageSeparableTable32::Process(void *dst0, const void *const *src0, uint32 w, sint32 phase) {
+ uint32 *dst = (uint32 *)dst0;
+ const uint32 *const *src = (const uint32 *const *)src0;
+ const unsigned ksize = (unsigned)mFilterBank.size() >> 8;
+ const sint32 *filter = &mFilterBank[((phase>>8)&0xff) * ksize];
+
+ for(uint32 i=0; i<w; ++i) {
+ int r = 0x2000, g = 0x2000, b = 0x2000;
+ const sint32 *filter2 = filter;
+ const uint32 *const *src2 = src;
+
+ for(unsigned j = ksize; j; --j) {
+ uint32 p = (*src2++)[i];
+ sint32 coeff = *filter2++;
+
+ r += ((p>>16)&0xff)*coeff;
+ g += ((p>> 8)&0xff)*coeff;
+ b += ((p )&0xff)*coeff;
+ }
+
+ r <<= 2;
+ g >>= 6;
+ b >>= 14;
+
+ if ((uint32)r >= 0x01000000)
+ r = ~r >> 31;
+ if ((uint32)g >= 0x00010000)
+ g = ~g >> 31;
+ if ((uint32)b >= 0x00000100)
+ b = ~b >> 31;
+
+ *dst++ = (r & 0xff0000) + (g & 0xff00) + (b & 0xff);
+ }
+}
+
+VDResamplerColStageSeparableTable32F::VDResamplerColStageSeparableTable32F(const IVDResamplerFilter& filter) {
+ mFilterBank.resize(filter.GetFilterWidth() * 256);
+ VDResamplerGenerateTableF(mFilterBank.data(), filter);
+}
+
+int VDResamplerColStageSeparableTable32F::GetWindowSize() const {return (int)mFilterBank.size() >> 8;}
+
+void VDResamplerColStageSeparableTable32F::Process(void *dst0, const void *const *src0, uint32 w, sint32 phase) {
+ float *dst = (float *)dst0;
+ const float *const *src = (const float *const *)src0;
+ const unsigned ksize = (unsigned)mFilterBank.size() >> 8;
+ const float *filter = &mFilterBank[((phase>>8)&0xff) * ksize];
+
+ for(uint32 i=0; i<w; ++i) {
+ float r = 0;
+ const float *filter2 = filter;
+ const float *const *src2 = src;
+
+ for(unsigned j = ksize; j; --j) {
+ const float *p = (*src2++) + i;
+ float coeff = *filter2++;
+
+ r += p[0]*coeff;
+ }
+
+ dst[0] = r;
+ ++dst;
+ }
+}
+
+VDResamplerColStageSeparableTable32Fx4::VDResamplerColStageSeparableTable32Fx4(const IVDResamplerFilter& filter) {
+ mFilterBank.resize(filter.GetFilterWidth() * 256);
+ VDResamplerGenerateTableF(mFilterBank.data(), filter);
+}
+
+int VDResamplerColStageSeparableTable32Fx4::GetWindowSize() const {return (int)mFilterBank.size() >> 8;}
+
+void VDResamplerColStageSeparableTable32Fx4::Process(void *dst0, const void *const *src0, uint32 w, sint32 phase) {
+ float *dst = (float *)dst0;
+ const float *const *src = (const float *const *)src0;
+ const unsigned ksize = (unsigned)mFilterBank.size() >> 8;
+ const float *filter = &mFilterBank[((phase>>8)&0xff) * ksize];
+
+ for(uint32 i=0; i<w; ++i) {
+ float r = 0, g = 0, b = 0, a = 0;
+ const float *filter2 = filter;
+ const float *const *src2 = src;
+
+ for(unsigned j = ksize; j; --j) {
+ const float *p = (*src2++) + i*4;
+ float coeff = *filter2++;
+
+ r += p[0]*coeff;
+ g += p[1]*coeff;
+ b += p[2]*coeff;
+ a += p[3]*coeff;
+ }
+
+ dst[0] = r;
+ dst[1] = g;
+ dst[2] = b;
+ dst[3] = a;
+ dst += 4;
+ }
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/resample_stages_x64.cpp b/src/thirdparty/VirtualDub/Kasumi/source/resample_stages_x64.cpp
new file mode 100644
index 000000000..a206d37d8
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/resample_stages_x64.cpp
@@ -0,0 +1,26 @@
+#include "resample_stages_x64.h"
+
+extern "C" long vdasm_resize_table_col_SSE2(uint32 *out, const uint32 *const*in_table, const int *filter, int filter_width, uint32 w);
+extern "C" long vdasm_resize_table_row_SSE2(uint32 *out, const uint32 *in, const int *filter, int filter_width, uint32 w, long accum, long frac);
+
+VDResamplerSeparableTableRowStageSSE2::VDResamplerSeparableTableRowStageSSE2(const IVDResamplerFilter& filter)
+ : VDResamplerRowStageSeparableTable32(filter)
+{
+ VDResamplerSwizzleTable(mFilterBank.data(), (uint32)mFilterBank.size() >> 1);
+}
+
+void VDResamplerSeparableTableRowStageSSE2::Process(void *dst, const void *src, uint32 w, uint32 u, uint32 dudx) {
+ vdasm_resize_table_row_SSE2((uint32 *)dst, (const uint32 *)src, (const int *)mFilterBank.data(), (int)mFilterBank.size() >> 8, w, u, dudx);
+}
+
+VDResamplerSeparableTableColStageSSE2::VDResamplerSeparableTableColStageSSE2(const IVDResamplerFilter& filter)
+ : VDResamplerColStageSeparableTable32(filter)
+{
+ VDResamplerSwizzleTable(mFilterBank.data(), (uint32)mFilterBank.size() >> 1);
+}
+
+void VDResamplerSeparableTableColStageSSE2::Process(void *dst, const void *const *src, uint32 w, sint32 phase) {
+ const unsigned filtSize = (unsigned)mFilterBank.size() >> 8;
+
+ vdasm_resize_table_col_SSE2((uint32*)dst, (const uint32 *const *)src, (const int *)mFilterBank.data() + filtSize*((phase >> 8) & 0xff), filtSize, w);
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/resample_stages_x86.cpp b/src/thirdparty/VirtualDub/Kasumi/source/resample_stages_x86.cpp
new file mode 100644
index 000000000..bc4db574f
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/resample_stages_x86.cpp
@@ -0,0 +1,1277 @@
+#include <numeric>
+#include "blt_spanutils_x86.h"
+#include "resample_stages_x86.h"
+#include <vd2/Kasumi/resample_kernels.h>
+
+#ifdef _MSC_VER
+ #pragma warning(disable: 4799) // warning C4799: function 'vdasm_resize_table_row_8_k8_4x_MMX' has no EMMS instruction
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+
+extern "C" void vdasm_resize_table_row_8_k8_4x_SSE41(void *dst, const void *src, uint32 width, const void *kernel);
+extern "C" void vdasm_resize_table_row_8_k16_4x_SSE41(void *dst, const void *src, uint32 width, const void *kernel);
+extern "C" void vdasm_resize_table_row_8_SSE41(void *dst, const void *src, uint32 width, const void *kernel, uint32 kwidth);
+extern "C" void vdasm_resize_table_col_8_k2_SSE41(void *dst, const void *const *srcs, uint32 width, const void *kernel);
+extern "C" void vdasm_resize_table_col_8_k4_SSE41(void *dst, const void *const *srcs, uint32 width, const void *kernel);
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace {
+ struct ScaleInfo {
+ void *dst;
+ uintptr src;
+ uint32 accum;
+ uint32 fracinc;
+ sint32 intinc;
+ uint32 count;
+ };
+
+ extern "C" void vdasm_resize_point32(const ScaleInfo *);
+}
+
+int VDResamplerSeparablePointRowStageX86::GetWindowSize() const {return 1;}
+void VDResamplerSeparablePointRowStageX86::Process(void *dst, const void *src, uint32 w, uint32 u, uint32 dudx) {
+ ScaleInfo info;
+
+ info.dst = (uint32 *)dst + w;
+ info.src = ((uintptr)src >> 2) + (u>>16);
+ info.accum = u<<16;
+ info.fracinc = dudx << 16;
+ info.intinc = (sint32)dudx >> 16;
+ info.count = -(sint32)w*4;
+
+ vdasm_resize_point32(&info);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+void VDResamplerRowStageSeparableLinear8_phaseZeroStepHalf_ISSE::Process(void *dst0, const void *src0, uint32 w, uint32 u, uint32 dudx) {
+ uint8 *dst = (uint8 *)dst0;
+ const uint8 *src = (const uint8 *)src0;
+
+ nsVDPixmapSpanUtils::horiz_expand2x_coaligned_ISSE(dst, src, w);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+extern "C" void vdasm_resize_point32_MMX(const ScaleInfo *);
+extern "C" void vdasm_resize_interp_row_run_MMX(void *dst, const void *src, uint32 width, sint64 xaccum, sint64 x_inc);
+extern "C" void vdasm_resize_interp_col_run_MMX(void *dst, const void *src1, const void *src2, uint32 width, uint32 yaccum);
+extern "C" void vdasm_resize_ccint_row_MMX(void *dst, const void *src, uint32 count, uint32 xaccum, sint32 xinc, const void *tbl);
+extern "C" void vdasm_resize_ccint_col_MMX(void *dst, const void *src1, const void *src2, const void *src3, const void *src4, uint32 count, const void *tbl);
+extern "C" long vdasm_resize_table_col_MMX(uint32 *out, const uint32 *const*in_table, const int *filter, int filter_width, uint32 w, long frac);
+extern "C" long vdasm_resize_table_row_MMX(uint32 *out, const uint32 *in, const int *filter, int filter_width, uint32 w, long accum, long frac);
+
+int VDResamplerSeparablePointRowStageMMX::GetWindowSize() const {return 1;}
+void VDResamplerSeparablePointRowStageMMX::Process(void *dst, const void *src, uint32 w, uint32 u, uint32 dudx) {
+ ScaleInfo info;
+
+ info.dst = (uint32 *)dst + w;
+ info.src = ((uintptr)src >> 2) + (u>>16);
+ info.accum = u<<16;
+ info.fracinc = dudx << 16;
+ info.intinc = (sint32)dudx >> 16;
+ info.count = -(sint32)w*4;
+
+ vdasm_resize_point32_MMX(&info);
+}
+
+int VDResamplerSeparableLinearRowStageMMX::GetWindowSize() const {return 2;}
+void VDResamplerSeparableLinearRowStageMMX::Process(void *dst0, const void *src0, uint32 w, uint32 u, uint32 dudx) {
+ vdasm_resize_interp_row_run_MMX(dst0, src0, w, (sint64)u << 16, (sint64)dudx << 16);
+}
+
+int VDResamplerSeparableLinearColStageMMX::GetWindowSize() const {return 2;}
+void VDResamplerSeparableLinearColStageMMX::Process(void *dst0, const void *const *srcarray, uint32 w, sint32 phase) {
+ vdasm_resize_interp_col_run_MMX(dst0, srcarray[0], srcarray[1], w, phase);
+}
+
+VDResamplerSeparableCubicRowStageMMX::VDResamplerSeparableCubicRowStageMMX(double A)
+ : mFilterBank(1024)
+{
+ sint32 *p = mFilterBank.data();
+ VDResamplerGenerateTable(p, VDResamplerCubicFilter(1.0, A));
+ VDResamplerSwizzleTable(p, 512);
+}
+
+int VDResamplerSeparableCubicRowStageMMX::GetWindowSize() const {return 4;}
+void VDResamplerSeparableCubicRowStageMMX::Process(void *dst0, const void *src0, uint32 w, uint32 u, uint32 dudx) {
+ vdasm_resize_ccint_row_MMX(dst0, src0, w, u, dudx, mFilterBank.data());
+}
+
+VDResamplerSeparableCubicColStageMMX::VDResamplerSeparableCubicColStageMMX(double A)
+ : mFilterBank(1024)
+{
+ sint32 *p = mFilterBank.data();
+ VDResamplerGenerateTable(p, VDResamplerCubicFilter(1.0, A));
+ VDResamplerSwizzleTable(p, 512);
+}
+
+int VDResamplerSeparableCubicColStageMMX::GetWindowSize() const {return 4;}
+void VDResamplerSeparableCubicColStageMMX::Process(void *dst0, const void *const *srcarray, uint32 w, sint32 phase) {
+ vdasm_resize_ccint_col_MMX(dst0, srcarray[0], srcarray[1], srcarray[2], srcarray[3], w, mFilterBank.data() + ((phase>>6)&0x3fc));
+}
+
+VDResamplerSeparableTableRowStage8MMX::VDResamplerSeparableTableRowStage8MMX(const IVDResamplerFilter& filter)
+ : VDResamplerRowStageSeparableTable32(filter)
+ , mLastSrcWidth(0)
+ , mLastDstWidth(0)
+ , mLastU(0)
+ , mLastDUDX(0)
+{
+ mAlignedKernelWidth = (GetWindowSize() + 6) & ~3;
+ mAlignedKernelSize = mAlignedKernelWidth + 4;
+}
+
+void VDResamplerSeparableTableRowStage8MMX::Init(const VDResamplerAxis& axis, uint32 srcw) {
+ uint32 w = axis.dx_preclip + axis.dx_active + axis.dx_postclip + axis.dx_dualclip;
+
+ if (mLastSrcWidth != srcw || mLastDstWidth != w || mLastU != axis.u || mLastDUDX != axis.dudx) {
+ mLastSrcWidth = srcw;
+ mLastDstWidth = w;
+ mLastU = axis.u;
+ mLastDUDX = axis.dudx;
+
+ RedoRowFilters(axis, w, srcw);
+ }
+}
+
+void VDResamplerSeparableTableRowStage8MMX::RedoRowFilters(const VDResamplerAxis& axis, uint32 w, uint32 srcw) {
+ int kstride = mFilterBank.size() >> 8;
+ int ksize = mAlignedKernelWidth;
+ int kesize = mAlignedKernelSize;
+
+ mRowKernels.clear();
+ mRowKernelSize = w * kesize;
+
+ mRowKernels.resize(mRowKernelSize * 4, 0);
+
+ for(int byteOffset = 0; byteOffset < 4; ++byteOffset) {
+ sint16 *dst = mRowKernels.data() + mRowKernelSize * byteOffset;
+ int ksizeThisOffset = std::min<int>(ksize, (byteOffset + srcw + 3) & ~3);
+
+ mKernelSizeByOffset[byteOffset] = ksizeThisOffset;
+
+ sint32 u = axis.u;
+ sint32 uoffmin = -byteOffset;
+ sint32 uoffmax = ((srcw + byteOffset + 3) & ~3) - byteOffset - ksizeThisOffset;
+ for(uint32 i=0; i<w; ++i) {
+ sint32 uoffset = u >> 16;
+ sint32 uoffset2 = ((uoffset + byteOffset) & ~3) - byteOffset;
+
+ if (uoffset2 < uoffmin)
+ uoffset2 = uoffmin;
+
+ if (uoffset2 > uoffmax)
+ uoffset2 = uoffmax;
+
+ VDASSERT(uoffset2 + ksizeThisOffset <= (((sint32)srcw + byteOffset + 3) & ~3));
+
+ *(sint32 *)dst = uoffset2;
+ dst += 2;
+ *dst++ = 0;
+ *dst++ = 0;
+
+ uint32 phase = (u >> 8) & 255;
+ const sint32 *src = &mFilterBank[kstride * phase];
+
+ sint32 start = 0;
+ sint32 end = kstride;
+
+ int dstoffset = uoffset - uoffset2;
+
+ // check for filter kernel overlapping left source boundary
+ if (uoffset < 0)
+ start = -uoffset;
+
+ // check for filter kernel overlapping right source boundary
+ if (uoffset + end > (sint32)srcw)
+ end = srcw - uoffset;
+
+ VDASSERT(dstoffset + start >= 0);
+ VDASSERT(dstoffset + end <= ksizeThisOffset);
+
+ sint16 *dst2 = dst + dstoffset;
+ dst += ksizeThisOffset;
+
+ for(int j=start; j<end; ++j)
+ dst2[j] = src[j];
+
+ if (start > 0)
+ dst2[start] = std::accumulate(src, src+start, dst2[start]);
+
+ if (end < kstride)
+ dst2[end - 1] = std::accumulate(src+end, src+kstride, dst2[end - 1]);
+
+ u += axis.dudx;
+ }
+ }
+
+ // swizzle rows where optimization is possible
+ vdfastvector<sint16> temp;
+
+ int quads = w >> 2;
+ int quadRemainder = w & 3;
+
+ for(int byteOffset = 0; byteOffset < 4; ++byteOffset) {
+ int ksizeThisOffset = mKernelSizeByOffset[byteOffset];
+ int kpairs = ksizeThisOffset >> 2;
+
+ if (ksizeThisOffset < 8 || ksizeThisOffset > 12) {
+ mbQuadOptimizationEnabled[byteOffset] = false;
+ } else {
+ ptrdiff_t unswizzledStride = (ksizeThisOffset >> 1) + 2;
+
+ mbQuadOptimizationEnabled[byteOffset] = true;
+ mTailOffset[byteOffset] = quads * (8 + ksizeThisOffset*4);
+
+ uint32 *dst = (uint32 *)&mRowKernels[mRowKernelSize * byteOffset];
+ temp.resize(mRowKernelSize);
+ memcpy(temp.data(), dst, mRowKernelSize*2);
+
+ const uint32 *src0 = (const uint32 *)temp.data();
+ const uint32 *src1 = src0 + unswizzledStride;
+ const uint32 *src2 = src1 + unswizzledStride;
+ const uint32 *src3 = src2 + unswizzledStride;
+ ptrdiff_t srcskip = unswizzledStride * 3;
+
+ for(int q = 0; q < quads; ++q) {
+ dst[0] = src0[0];
+ dst[1] = src1[0];
+ dst[2] = src2[0];
+ dst[3] = src3[0];
+ src0 += 2;
+ src1 += 2;
+ src2 += 2;
+ src3 += 2;
+ dst += 4;
+
+ for(int p = 0; p < kpairs; ++p) {
+ dst[0] = src0[0];
+ dst[1] = src0[1];
+ dst[2] = src1[0];
+ dst[3] = src1[1];
+ dst[4] = src2[0];
+ dst[5] = src2[1];
+ dst[6] = src3[0];
+ dst[7] = src3[1];
+ dst += 8;
+ src0 += 2;
+ src1 += 2;
+ src2 += 2;
+ src3 += 2;
+ }
+
+ src0 += srcskip;
+ src1 += srcskip;
+ src2 += srcskip;
+ src3 += srcskip;
+ }
+
+ memcpy(dst, src0, unswizzledStride * 4 * quadRemainder);
+
+ VDASSERT(dst + unswizzledStride * quadRemainder <= (void *)(mRowKernels.data() + (mRowKernelSize * (byteOffset + 1))));
+ }
+ }
+}
+
+void __declspec(naked) vdasm_resize_table_row_8_k8_4x_MMX(void *dst, const void *src, uint32 width, const void *kernel) {
+ static const __declspec(align(8)) __int64 kRound = 0x0000000000002000;
+ __asm {
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ pxor mm7, mm7
+ movq mm6, kRound
+
+ mov ebp, [esp + 4 + 16] ;ebp = dst
+ mov esi, [esp + 12 + 16] ;esi = width
+ mov edi, [esp + 16 + 16] ;edi = kernel
+yloop:
+ ;eax = temp
+ ;ebx = temp
+ ;ecx = temp
+ ;edx = temp
+ ;esi = horiz counter
+ ;edi = filter list
+ ;ebp = destination
+
+ mov eax, [edi+0]
+ mov ebx, [edi+4]
+ mov ecx, [edi+8]
+ mov edx, [esp+8+16]
+ add eax, edx
+ add ebx, edx
+ add ecx, edx
+ add edx, [edi+12]
+
+ movd mm0, [eax]
+ punpcklbw mm0, mm7
+
+ pmaddwd mm0, [edi+16]
+ movd mm1, [ebx]
+ punpcklbw mm1, mm7
+
+ pmaddwd mm1, [edi+24]
+ movd mm2, [ecx]
+ punpcklbw mm2, mm7
+
+ pmaddwd mm2, [edi+32]
+ movd mm3, [edx]
+ punpcklbw mm3, mm7
+
+ pmaddwd mm3, [edi+40]
+ movd mm4, [eax+4]
+ paddd mm0, mm6
+
+ movd mm5, [ebx+4]
+ punpcklbw mm4, mm7
+ paddd mm1, mm6
+
+ pmaddwd mm4, [edi+48]
+ punpcklbw mm5, mm7
+ paddd mm2, mm6
+
+ pmaddwd mm5, [edi+56]
+ paddd mm3, mm6
+ paddd mm0, mm4
+
+ paddd mm1, mm5
+ movd mm4, [ecx+4]
+ punpcklbw mm4, mm7
+
+ pmaddwd mm4, [edi+64]
+ movd mm5, [edx+4]
+ punpcklbw mm5, mm7
+
+ pmaddwd mm5, [edi+72]
+ paddd mm2, mm4
+ paddd mm3, mm5
+
+ movq mm4, mm0
+ punpckldq mm0, mm1
+ movq mm5, mm2
+ punpckldq mm2, mm3
+ punpckhdq mm4, mm1
+ punpckhdq mm5, mm3
+ paddd mm0, mm4
+ paddd mm2, mm5
+ psrad mm0, 14
+ psrad mm2, 14
+
+ packssdw mm0, mm2
+ packuswb mm0, mm0
+
+ add edi, 80
+
+ movd [ebp], mm0
+ add ebp, 4
+ sub esi, 1
+ jne yloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+ }
+}
+
+void __declspec(naked) vdasm_resize_table_row_8_k12_4x_MMX(void *dst, const void *src, uint32 width, const void *kernel) {
+ static const __declspec(align(8)) __int64 kRound = 0x0000200000002000;
+ __asm {
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+ pxor mm7, mm7
+ movq mm6, kRound
+
+ mov ebp, [esp + 4 + 16] ;ebp = dst
+ mov esi, [esp + 12 + 16] ;esi = width
+ mov edi, [esp + 16 + 16] ;edi = kernel
+yloop:
+ ;eax = temp
+ ;ebx = temp
+ ;ecx = temp
+ ;edx = temp
+ ;esi = horiz counter
+ ;edi = filter list
+ ;ebp = destination
+
+ mov eax, [edi+0]
+ mov ebx, [edi+4]
+ mov ecx, [edi+8]
+ mov edx, [esp+8+16]
+ add eax, edx
+ add ebx, edx
+ add ecx, edx
+ add edx, [edi+12]
+
+ movd mm0, [eax]
+ punpcklbw mm0, mm7
+
+ pmaddwd mm0, [edi+16]
+ movd mm1, [ebx]
+ punpcklbw mm1, mm7
+
+ pmaddwd mm1, [edi+24]
+ movd mm2, [ecx]
+ punpcklbw mm2, mm7
+
+ pmaddwd mm2, [edi+32]
+ movd mm3, [edx]
+ punpcklbw mm3, mm7
+
+ pmaddwd mm3, [edi+40]
+ movd mm4, [eax+4]
+ punpcklbw mm4, mm7
+
+ pmaddwd mm4, [edi+48]
+ movd mm5, [ebx+4]
+ punpcklbw mm5, mm7
+
+ pmaddwd mm5, [edi+56]
+ paddd mm0, mm4
+ paddd mm1, mm5
+
+ movd mm4, [ecx+4]
+ punpcklbw mm4, mm7
+ movd mm5, [edx+4]
+
+ pmaddwd mm4, [edi+64]
+ punpcklbw mm5, mm7
+ paddd mm2, mm4
+
+ pmaddwd mm5, [edi+72]
+ movd mm4, [eax+8]
+ punpcklbw mm4, mm7
+
+ paddd mm3, mm5
+ movd mm5, [ebx+8]
+ punpcklbw mm5, mm7
+
+ pmaddwd mm4, [edi+80]
+ paddd mm0, mm4
+ movd mm4, [ecx+8]
+
+ pmaddwd mm5, [edi+88]
+ paddd mm1, mm5
+ punpcklbw mm4, mm7
+
+ pmaddwd mm4, [edi+96]
+ movd mm5, [edx+8]
+ punpcklbw mm5, mm7
+
+ pmaddwd mm5, [edi+104]
+ paddd mm2, mm4
+ paddd mm3, mm5
+
+ movq mm4, mm0
+ punpckldq mm0, mm1
+ movq mm5, mm2
+ punpckldq mm2, mm3
+ punpckhdq mm4, mm1
+ punpckhdq mm5, mm3
+ paddd mm0, mm4
+ paddd mm2, mm5
+ paddd mm0, mm6
+ paddd mm2, mm6
+ psrad mm0, 14
+ psrad mm2, 14
+
+ packssdw mm0, mm2
+ packuswb mm0, mm0
+
+ add edi, 112
+
+ movd [ebp], mm0
+ add ebp, 4
+ sub esi, 1
+ jne yloop
+
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+ }
+}
+
+// Generic 8-bpp horizontal table resampler (MMX, one output byte per pass).
+// The kernel stream holds, per output pixel, a 32-bit source byte offset
+// followed by 'kwidth' 16-bit coefficients (kwidth must be a multiple of 4).
+// Each output byte is the rounded 1.14 fixed-point dot product, clamped to
+// [0,255] by the packssdw/packuswb pair.
+// NOTE(review): no emms before ret -- presumably the caller clears MMX state;
+// confirm against the call sites.
+void __declspec(naked) vdasm_resize_table_row_8_MMX(void *dst, const void *src, uint32 width, const void *kernel, uint32 kwidth) {
+	static const __declspec(align(8)) __int64 kRound = 0x0000000000002000;	// 0.5 in 1.14 fixed point (low dword only)
+	__asm {
+		push ebp
+		push edi
+		push esi
+		push ebx
+
+		pxor mm7, mm7		;mm7 = zero, for byte->word unpacking
+		movq mm6, kRound
+
+		mov edi, [esp + 4 + 16]		;edi = dst
+		mov ebx, [esp + 8 + 16]		;ebx = src
+		mov ebp, [esp + 12 + 16]	;ebp = width
+		mov edx, [esp + 16 + 16]	;edx = kernel
+yloop:
+		;eax = temp
+		;ebx = source base address
+		;ecx = (temp) source
+		;edx = filter list
+		;esi = (temp) kernel width
+		;edi = destination
+		;ebp = horiz counter
+
+		mov eax, [edx]			;fetch this pixel's source byte offset
+		add edx, 8			;skip offset dword + padding to coefficients
+		lea ecx, [ebx + eax]
+		mov esi, [esp + 20 + 16]	;esi = kernel width
+
+		movq mm2, mm6			;seed the accumulator with the rounding bias
+xloop:
+		movd mm0, [ecx]
+		punpcklbw mm0, mm7
+		add ecx, 4
+		pmaddwd mm0, [edx]
+		paddd mm2, mm0
+		add edx, 8
+		sub esi, 4
+		jne xloop
+
+		;fold the two partial dword sums in mm2 -- the total lands in the
+		;high dword of mm0 (its low dword is a don't-care)
+		punpckldq mm0, mm2
+		paddd mm0, mm2
+		psrad mm0, 14
+		psrlq mm0, 32
+		packssdw mm0, mm0
+		packuswb mm0, mm0
+		movd eax, mm0
+		mov [edi], al
+		add edi, 1
+		sub ebp, 1
+		jne yloop
+
+		pop ebx
+		pop esi
+		pop edi
+		pop ebp
+		ret
+	}
+}
+
+// Horizontal resample of one 8-bpp row (MMX fast paths).
+// mRowKernels holds four swizzled kernel sets, one per dword misalignment of
+// the source pointer, so the set is selected by the low 2 bits of 'src'.
+// When the quad-optimized layout exists for this offset, four output pixels
+// are produced per iteration by the k8/k12 routines, and the remaining 1-3
+// pixels fall back to the generic routine using the tail portion of the
+// kernel stream (mTailOffset).
+void VDResamplerSeparableTableRowStage8MMX::Process(void *dst, const void *src, uint32 w) {
+	int byteOffset = (int)(ptrdiff_t)src & 3;
+	const sint16 *ksrc = &mRowKernels[mRowKernelSize * byteOffset];
+#if 0
+	// Disabled scalar reference: per pixel, fetch offset, accumulate the dot
+	// product with 0x8000 bias, >>14, then branchless clamp to [0,255].
+	int kwidth = mAlignedKernelWidth;
+	uint8 *dst2 = (uint8 *)dst;
+
+	do {
+		int offset = ksrc[0];
+		ksrc += 4;
+
+		const uint8 *src2 = (const uint8 *)src + offset;
+		sint32 accum = 0x8000;
+		for(int i=0; i<kwidth; ++i) {
+			accum += (sint32)src2[i] * (*ksrc++);
+		}
+
+		accum >>= 14;
+
+		accum &= ~(accum >> 31);
+		accum |= (255 - accum) >> 31;
+
+		*dst2++ = (uint8)accum;
+
+	} while(--w);
+#else
+	int ksize = mKernelSizeByOffset[byteOffset];
+	if (mbQuadOptimizationEnabled[byteOffset]) {
+		if (w >= 4) {
+			if (ksize == 12) {
+				// 12-tap kernel: 4 pixels per iteration.
+				vdasm_resize_table_row_8_k12_4x_MMX(dst, src, w >> 2, ksrc);
+
+#if 0
+				// Disabled scalar model of the k12 4x swizzled layout:
+				// 8 words of offsets, then coefficient rows interleaved
+				// across the four output pixels (hence the index pattern).
+				int w4 = w >> 2;
+				uint8 *dst2 = (uint8 *)dst;
+				const uint8 *src2 = (const uint8 *)src;
+				const sint16 *ksrc2 = ksrc;
+
+				do {
+					int off0 = ksrc2[0];
+					int off1 = ksrc2[2];
+					int off2 = ksrc2[4];
+					int off3 = ksrc2[6];
+					const uint8 *d0 = src2 + off0;
+					const uint8 *d1 = src2 + off1;
+					const uint8 *d2 = src2 + off2;
+					const uint8 *d3 = src2 + off3;
+
+					int acc0 = 0;
+					int acc1 = 0;
+					int acc2 = 0;
+					int acc3 = 0;
+
+					acc0	+= d0[ 0]*ksrc2[ 8]
+						+  d0[ 1]*ksrc2[ 9]
+						+  d0[ 2]*ksrc2[ 10]
+						+  d0[ 3]*ksrc2[ 11]
+						+  d0[ 4]*ksrc2[ 24]
+						+  d0[ 5]*ksrc2[ 25]
+						+  d0[ 6]*ksrc2[ 26]
+						+  d0[ 7]*ksrc2[ 27]
+						+  d0[ 8]*ksrc2[ 40]
+						+  d0[ 9]*ksrc2[ 41]
+						+  d0[10]*ksrc2[ 42]
+						+  d0[11]*ksrc2[ 43];
+
+					acc0 = (acc0 + 0x2000) >> 14;
+					if (acc0 < 0) acc0 = 0; else if (acc0 > 255) acc0 = 255;
+
+					acc1	+= d1[ 0]*ksrc2[ 12]
+						+  d1[ 1]*ksrc2[ 13]
+						+  d1[ 2]*ksrc2[ 14]
+						+  d1[ 3]*ksrc2[ 15]
+						+  d1[ 4]*ksrc2[ 28]
+						+  d1[ 5]*ksrc2[ 29]
+						+  d1[ 6]*ksrc2[ 30]
+						+  d1[ 7]*ksrc2[ 31]
+						+  d1[ 8]*ksrc2[ 44]
+						+  d1[ 9]*ksrc2[ 45]
+						+  d1[10]*ksrc2[ 46]
+						+  d1[11]*ksrc2[ 47];
+
+					acc1 = (acc1 + 0x2000) >> 14;
+					if (acc1 < 0) acc1 = 0; else if (acc1 > 255) acc1 = 255;
+
+					acc2	+= d2[ 0]*ksrc2[ 16]
+						+  d2[ 1]*ksrc2[ 17]
+						+  d2[ 2]*ksrc2[ 18]
+						+  d2[ 3]*ksrc2[ 19]
+						+  d2[ 4]*ksrc2[ 32]
+						+  d2[ 5]*ksrc2[ 33]
+						+  d2[ 6]*ksrc2[ 34]
+						+  d2[ 7]*ksrc2[ 35]
+						+  d2[ 8]*ksrc2[ 48]
+						+  d2[ 9]*ksrc2[ 49]
+						+  d2[10]*ksrc2[ 50]
+						+  d2[11]*ksrc2[ 51];
+
+					acc2 = (acc2 + 0x2000) >> 14;
+					if (acc2 < 0) acc2 = 0; else if (acc2 > 255) acc2 = 255;
+
+					acc3	+= d3[ 0]*ksrc2[ 20]
+						+  d3[ 1]*ksrc2[ 21]
+						+  d3[ 2]*ksrc2[ 22]
+						+  d3[ 3]*ksrc2[ 23]
+						+  d3[ 4]*ksrc2[ 36]
+						+  d3[ 5]*ksrc2[ 37]
+						+  d3[ 6]*ksrc2[ 38]
+						+  d3[ 7]*ksrc2[ 39]
+						+  d3[ 8]*ksrc2[ 52]
+						+  d3[ 9]*ksrc2[ 53]
+						+  d3[10]*ksrc2[ 54]
+						+  d3[11]*ksrc2[ 55];
+
+					acc3 = (acc3 + 0x2000) >> 14;
+					if (acc3 < 0) acc3 = 0; else if (acc3 > 255) acc3 = 255;
+
+					ksrc2 += 56;
+
+					dst2[0] = (uint8)acc0;
+					dst2[1] = (uint8)acc1;
+					dst2[2] = (uint8)acc2;
+					dst2[3] = (uint8)acc3;
+					dst2 += 4;
+				} while(--w4);
+#endif
+			} else
+				vdasm_resize_table_row_8_k8_4x_MMX(dst, src, w >> 2, ksrc);
+		}
+
+		// Remaining 1-3 pixels: generic routine over the unswizzled tail.
+		if (w & 3)
+			vdasm_resize_table_row_8_MMX((char *)dst + (w & ~3), src, w & 3, ksrc + mTailOffset[byteOffset], ksize);
+	} else {
+		vdasm_resize_table_row_8_MMX(dst, src, w, ksrc, ksize);
+	}
+#endif
+}
+
+// Incremental-u entry point: forward to the shared MMX row resampler.
+// The bank holds 256 phases, so the per-phase kernel width is size()>>8.
+void VDResamplerSeparableTableRowStage8MMX::Process(void *dst, const void *src, uint32 w, uint32 u, uint32 dudx) {
+	const int *kernels = (const int *)mFilterBank.data();
+	const int kernelWidth = (int)mFilterBank.size() >> 8;
+
+	vdasm_resize_table_row_MMX((uint32 *)dst, (const uint32 *)src, kernels, kernelWidth, w, u, dudx);
+}
+
+// Builds the 32-bpp row stage and swizzles the coefficient bank into the
+// interleaved-pair layout the MMX pmaddwd loops expect.
+VDResamplerSeparableTableRowStageMMX::VDResamplerSeparableTableRowStageMMX(const IVDResamplerFilter& filter)
+	: VDResamplerRowStageSeparableTable32(filter)
+{
+	VDResamplerSwizzleTable(mFilterBank.data(), (unsigned)mFilterBank.size() >> 1);
+}
+
+// Delegate the 32-bpp row resample to the shared MMX routine; the bank
+// stores 256 phases, so the kernel width per phase is size()>>8.
+void VDResamplerSeparableTableRowStageMMX::Process(void *dst, const void *src, uint32 w, uint32 u, uint32 dudx) {
+	const int *kernels = (const int *)mFilterBank.data();
+	const int kernelWidth = (int)mFilterBank.size() >> 8;
+
+	vdasm_resize_table_row_MMX((uint32 *)dst, (const uint32 *)src, kernels, kernelWidth, w, u, dudx);
+}
+
+///////////////////////////////////////////////////////////////////////////
+
+// Builds the 8-bpp column stage and swizzles the coefficient bank into the
+// interleaved-pair layout consumed by the MMX column kernels below.
+VDResamplerSeparableTableColStage8MMX::VDResamplerSeparableTableColStage8MMX(const IVDResamplerFilter& filter)
+	: VDResamplerColStageSeparableTable8(filter)
+{
+	VDResamplerSwizzleTable(mFilterBank.data(), (unsigned)mFilterBank.size() >> 1);
+}
+
+// Vertical resample, 8 bpp, 2-tap kernel (MMX). Produces four output bytes
+// per iteration: dst[x] = clamp(round((row0[x]*k0 + row1[x]*k1) >> 14)).
+// 'width' must be a multiple of 4.
+void __declspec(naked) vdasm_resize_table_col_8_k2_MMX(void *dst, const void *const *srcs, uint32 width, const void *kernel) {
+	static const __declspec(align(8)) __int64 kRound = 0x0000200000002000;	// 0.5 in 1.14, both dwords
+
+	__asm {
+		push ebp
+		push edi
+		push esi
+		push ebx
+
+		pxor mm7, mm7
+		movq mm6, kRound
+
+		mov esi, [esp + 4 + 16]		;esi = dst
+		mov edi, [esp + 16 + 16]	;edi = kernel
+		mov ebp, [esp + 12 + 16]	;ebp = width
+
+		movq mm5, [edi]			;mm5 = swizzled 2-tap coefficient pair
+
+		mov edx, [esp + 8 + 16]		;edx = srcs
+		mov eax, [edx+0]
+		mov ebx, [edx+4]
+		;bias the row pointers by width and count ebp up to zero
+		add eax, ebp
+		add ebx, ebp
+		neg ebp
+yloop:
+		;eax = row0
+		;ebx = row1
+		;ecx =
+		;edx =
+		;edi = kernel
+		;esi = dest
+		;ebp = width counter
+
+		movd mm0, [eax+ebp]
+		punpcklbw mm0, mm7
+		movd mm2, [ebx+ebp]
+		punpcklbw mm2, mm7
+		;interleave the two rows word-wise so pmaddwd pairs them per column
+		movq mm1, mm0
+		punpcklwd mm0, mm2
+		punpckhwd mm1, mm2
+		pmaddwd mm0, mm5
+		pmaddwd mm1, mm5
+
+		paddd mm0, mm6
+		paddd mm1, mm6
+
+		psrad mm0, 14
+		psrad mm1, 14
+		packssdw mm0, mm1
+		packuswb mm0, mm0
+		movd [esi], mm0
+		add esi, 4
+		add ebp, 4
+		jne yloop
+
+		pop ebx
+		pop esi
+		pop edi
+		pop ebp
+		ret
+	}
+}
+
+// Vertical resample, 8 bpp, 4-tap kernel (MMX). Four output bytes per
+// iteration, rows 0/1 accumulated with the first coefficient pair and rows
+// 2/3 with the second, rounded in 1.14 and clamped. 'width' must be a
+// multiple of 4.
+void __declspec(naked) vdasm_resize_table_col_8_k4_MMX(void *dst, const void *const *srcs, uint32 width, const void *kernel) {
+	static const __declspec(align(8)) __int64 kRound = 0x0000200000002000;	// 0.5 in 1.14, both dwords
+
+	__asm {
+		push ebp
+		push edi
+		push esi
+		push ebx
+
+		pxor mm7, mm7
+		movq mm6, kRound
+
+		mov esi, [esp + 4 + 16]		;esi = dst
+		mov edi, [esp + 16 + 16]	;edi = kernel
+		xor ebp, ebp
+
+		mov edx, [esp + 8 + 16]		;edx = srcs
+		mov eax, [edx+0]
+		mov ebx, [edx+4]
+		mov ecx, [edx+8]
+		mov edx, [edx+12]
+yloop:
+		;eax = row0
+		;ebx = row1
+		;ecx = row2
+		;edx = row3
+		;edi = kernel
+		;esi = dest
+		;ebp = width counter
+
+		movd mm0, [eax+ebp]
+		punpcklbw mm0, mm7
+		movd mm2, [ebx+ebp]
+		punpcklbw mm2, mm7
+		movq mm1, mm0
+		punpcklwd mm0, mm2
+		movq mm5, [edi]
+		punpckhwd mm1, mm2
+		pmaddwd mm0, mm5
+		pmaddwd mm1, mm5
+
+		paddd mm0, mm6
+		paddd mm1, mm6
+
+		;rows 2 and 3 with the second coefficient pair
+		movd mm3, [ecx+ebp]
+		punpcklbw mm3, mm7
+		movd mm2, [edx+ebp]
+		punpcklbw mm2, mm7
+		movq mm4, mm3
+		punpcklwd mm3, mm2
+		movq mm5, [edi+8]
+		punpckhwd mm4, mm2
+		pmaddwd mm3, mm5
+		pmaddwd mm4, mm5
+
+		paddd mm0, mm3
+		paddd mm1, mm4
+
+		psrad mm0, 14
+		psrad mm1, 14
+		packssdw mm0, mm1
+		packuswb mm0, mm0
+		add ebp, 4
+		movd [esi], mm0
+		add esi, 4
+		cmp ebp, [esp + 12 + 16]
+		jb yloop
+
+		pop ebx
+		pop esi
+		pop edi
+		pop ebp
+		ret
+	}
+}
+
+// Vertical resample, 8 bpp, arbitrary even kernel width (MMX). The inner
+// loop consumes the source-row list and coefficients two rows at a time;
+// accumulators are pre-seeded with the 1.14 rounding bias. 'width' must be
+// a multiple of 4 and 'kwidth' a multiple of 2.
+void __declspec(naked) vdasm_resize_table_col_8_MMX(void *dst, const void *const *srcs, uint32 width, const void *kernel, uint32 kwidth) {
+	static const __declspec(align(8)) __int64 kRound = 0x0000200000002000;	// 0.5 in 1.14, both dwords
+
+	__asm {
+		push ebp
+		push edi
+		push esi
+		push ebx
+
+		pxor mm7, mm7
+		movq mm6, kRound
+
+		mov edi, [esp + 4 + 16]		;edi = dst
+		xor ebp, ebp
+yloop:
+		mov edx, [esp + 16 + 16]	;edx = kernel
+		mov ebx, [esp + 8 + 16]		;ebx = srcs
+		mov esi, [esp + 20 + 16]	;esi = kwidth
+		movq mm3, mm6
+		movq mm4, mm6
+xloop:
+		;two source rows per pass, paired word-wise for pmaddwd
+		mov ecx, [ebx]
+		movd mm0, [ecx+ebp]
+		punpcklbw mm0, mm7
+		mov ecx, [ebx+4]
+		movd mm2, [ecx+ebp]
+		punpcklbw mm2, mm7
+		movq mm1, mm0
+		punpcklwd mm0, mm2
+		punpckhwd mm1, mm2
+		movq mm5, [edx]
+		pmaddwd mm0, mm5
+		pmaddwd mm1, mm5
+
+		paddd mm3, mm0
+		paddd mm4, mm1
+		add ebx, 8
+		add edx, 8
+		sub esi, 2
+		jne xloop
+
+		psrad mm3, 14
+		psrad mm4, 14
+		packssdw mm3, mm4
+		packuswb mm3, mm3
+		movd [edi], mm3
+		add edi, 4
+		add ebp, 4
+		cmp ebp, [esp + 12 + 16]
+		jb yloop
+
+		pop ebx
+		pop esi
+		pop edi
+		pop ebp
+		ret
+	}
+}
+
+// Vertical resample of one 8-bpp row. The multiple-of-4 prefix is handled by
+// width-specialized MMX routines; the trailing w&3 pixels are computed in
+// scalar code against the same swizzled coefficient table (coefficient pairs
+// are interleaved per group of 4 output columns, hence filter2 += 4 while
+// only [0] and [1] are read).
+void VDResamplerSeparableTableColStage8MMX::Process(void *dst0, const void *const *src0, uint32 w, sint32 phase) {
+	uint8 *dst = (uint8 *)dst0;
+	const uint8 *const *src = (const uint8 *const *)src0;
+	const unsigned ksize = (unsigned)mFilterBank.size() >> 8;	// taps per phase; the bank holds 256 phases
+	const sint16 *filter = (const sint16 *)&mFilterBank[((phase>>8)&0xff) * ksize];
+
+	int w4 = w & ~3;
+
+	if (w4) {
+		switch(ksize) {
+		case 2:
+			vdasm_resize_table_col_8_k2_MMX(dst, (const void *const *)src, w4, filter);
+			break;
+
+		case 4:
+			vdasm_resize_table_col_8_k4_MMX(dst, (const void *const *)src, w4, filter);
+			break;
+
+		default:
+			vdasm_resize_table_col_8_MMX(dst, (const void *const *)src, w4, filter, ksize);
+			break;
+		}
+	}
+
+	for(uint32 i=w4; i<w; ++i) {
+		int b = 0x2000;		// 0.5 in 1.14 fixed point
+		const sint16 *filter2 = filter;
+		const uint8 *const *src2 = src;
+
+		for(unsigned j = ksize; j; j -= 2) {
+			sint32 p0 = (*src2++)[i];
+			sint32 p1 = (*src2++)[i];
+			sint32 coeff0 = filter2[0];
+			sint32 coeff1 = filter2[1];
+			filter2 += 4;
+
+			b += p0*coeff0;
+			b += p1*coeff1;
+		}
+
+		b >>= 14;
+
+		// Branchless clamp: if out of [0,255], b<0 yields 0 and b>255 yields 255.
+		if ((uint32)b >= 0x00000100)
+			b = ~b >> 31;
+
+		dst[i] = (uint8)b;
+	}
+}
+
+///////////////////////////////////////////////////////////////////////////
+
+// Builds the 32-bpp column stage and swizzles the coefficient bank into the
+// interleaved-pair layout required by the MMX column resampler.
+VDResamplerSeparableTableColStageMMX::VDResamplerSeparableTableColStageMMX(const IVDResamplerFilter& filter)
+	: VDResamplerColStageSeparableTable32(filter)
+{
+	VDResamplerSwizzleTable(mFilterBank.data(), (unsigned)mFilterBank.size() >> 1);
+}
+
+// Delegate the 32-bpp column resample to the shared MMX routine; the
+// 8-bit phase selects one of the 256 kernels in the bank.
+void VDResamplerSeparableTableColStageMMX::Process(void *dst, const void *const *src, uint32 w, sint32 phase) {
+	const int *kernels = (const int *)mFilterBank.data();
+	const int kernelWidth = (int)mFilterBank.size() >> 8;
+	const int phaseIndex = (phase >> 8) & 0xff;
+
+	vdasm_resize_table_col_MMX((uint32*)dst, (const uint32 *const *)src, kernels, kernelWidth, w, phaseIndex);
+}
+
+///////////////////////////////////////////////////////////////////////////
+//
+// resampler stages (SSE2, x86)
+//
+///////////////////////////////////////////////////////////////////////////
+
+extern "C" long vdasm_resize_table_col_SSE2(uint32 *out, const uint32 *const*in_table, const int *filter, int filter_width, uint32 w, long frac);
+extern "C" long vdasm_resize_table_row_SSE2(uint32 *out, const uint32 *in, const int *filter, int filter_width, uint32 w, long accum, long frac);
+extern "C" void vdasm_resize_ccint_col_SSE2(void *dst, const void *src1, const void *src2, const void *src3, const void *src4, uint32 count, const void *tbl);
+
+// SSE2 cubic column stage: reuses the MMX base's coefficient setup; only
+// Process() differs (SSE2 inner loop).
+VDResamplerSeparableCubicColStageSSE2::VDResamplerSeparableCubicColStageSSE2(double A)
+	: VDResamplerSeparableCubicColStageMMX(A)
+{
+}
+
+// Blends four source rows with the 4-coefficient group selected by the
+// phase: (phase>>6)&0x3fc equals ((phase>>8)&0xff)*4, i.e. 8-bit phase
+// index scaled to groups of four bank entries.
+void VDResamplerSeparableCubicColStageSSE2::Process(void *dst0, const void *const *srcarray, uint32 w, sint32 phase) {
+	vdasm_resize_ccint_col_SSE2(dst0, srcarray[0], srcarray[1], srcarray[2], srcarray[3], w, mFilterBank.data() + ((phase>>6)&0x3fc));
+}
+
+// SSE2 row stage: inherits the MMX stage's swizzled coefficient setup.
+VDResamplerSeparableTableRowStageSSE2::VDResamplerSeparableTableRowStageSSE2(const IVDResamplerFilter& filter)
+	: VDResamplerSeparableTableRowStageMMX(filter)
+{
+}
+
+// Row resample for the SSE2 stage.
+// NOTE(review): this calls the MMX row routine even though
+// vdasm_resize_table_row_SSE2 is declared above -- presumably intentional
+// (the SSE2 row path may not be profitable); confirm against upstream.
+void VDResamplerSeparableTableRowStageSSE2::Process(void *dst, const void *src, uint32 w, uint32 u, uint32 dudx) {
+	const int *kernels = (const int *)mFilterBank.data();
+	const int kernelWidth = (int)mFilterBank.size() >> 8;
+
+	vdasm_resize_table_row_MMX((uint32 *)dst, (const uint32 *)src, kernels, kernelWidth, w, u, dudx);
+}
+
+// SSE2 column stage: inherits the MMX stage's swizzled coefficient setup.
+VDResamplerSeparableTableColStageSSE2::VDResamplerSeparableTableColStageSSE2(const IVDResamplerFilter& filter)
+	: VDResamplerSeparableTableColStageMMX(filter)
+{
+}
+
+// Delegate the 32-bpp column resample to the SSE2 routine; the 8-bit phase
+// selects one of the 256 kernels in the bank.
+void VDResamplerSeparableTableColStageSSE2::Process(void *dst, const void *const *src, uint32 w, sint32 phase) {
+	const int *kernels = (const int *)mFilterBank.data();
+	const int kernelWidth = (int)mFilterBank.size() >> 8;
+	const int phaseIndex = (phase >> 8) & 0xff;
+
+	vdasm_resize_table_col_SSE2((uint32*)dst, (const uint32 *const *)src, kernels, kernelWidth, w, phaseIndex);
+}
+
+///////////////////////////////////////////////////////////////////////////
+//
+// resampler stages (SSE4.1, x86)
+//
+///////////////////////////////////////////////////////////////////////////
+
+// SSE4.1 8-bpp row stage. The kernel width is the filter window rounded up
+// to a multiple of 8 with headroom ((w+15)&~7); mAlignedKernelSize adds 16
+// more entries per kernel -- presumably header/offset padding for the
+// swizzled stream built in RedoRowFilters (TODO confirm).
+// The cached-geometry members start zeroed so the first Init() always
+// rebuilds the kernels.
+VDResamplerSeparableTableRowStage8SSE41::VDResamplerSeparableTableRowStage8SSE41(const IVDResamplerFilter& filter)
+	: VDResamplerRowStageSeparableTable32(filter)
+	, mLastSrcWidth(0)
+	, mLastDstWidth(0)
+	, mLastU(0)
+	, mLastDUDX(0)
+{
+	mAlignedKernelWidth = (GetWindowSize() + 15) & ~7;
+	mAlignedKernelSize = mAlignedKernelWidth + 16;
+}
+
+// Rebuilds the precomputed row kernels, but only when the resampling
+// geometry (source width, destination width, start coordinate or step)
+// actually changed since the last call.
+void VDResamplerSeparableTableRowStage8SSE41::Init(const VDResamplerAxis& axis, uint32 srcw) {
+	uint32 w = axis.dx_preclip + axis.dx_active + axis.dx_postclip + axis.dx_dualclip;
+
+	if (mLastSrcWidth != srcw || mLastDstWidth != w || mLastU != axis.u || mLastDUDX != axis.dudx) {
+		mLastSrcWidth = srcw;
+		mLastDstWidth = w;
+		mLastU = axis.u;
+		mLastDUDX = axis.dudx;
+
+		RedoRowFilters(axis, w, srcw);
+	}
+}
+
+// Rebuilds the per-misalignment row kernel tables.
+// For each of the 8 possible source byte offsets, a kernel stream is built
+// with one entry per output pixel: an 8-aligned 32-bit source offset plus
+// six zero padding words, followed by ksizeThisOffset coefficients.
+// Coefficients that would sample outside the source are folded (accumulated)
+// into the nearest valid tap, which implements edge clamping. Offsets whose
+// kernels fit in 8..16 taps are then reswizzled into the 4-pixel interleaved
+// layout consumed by the 4x fast-path routines.
+void VDResamplerSeparableTableRowStage8SSE41::RedoRowFilters(const VDResamplerAxis& axis, uint32 w, uint32 srcw) {
+	int kstride = mFilterBank.size() >> 8;		// taps per phase in the bank (256 phases)
+	int ksize = mAlignedKernelWidth;
+	int kesize = mAlignedKernelSize;
+
+	mRowKernels.clear();
+	mRowKernelSize = w * kesize;
+
+	mRowKernels.resize(mRowKernelSize * 8, 0);
+
+	for(int byteOffset = 0; byteOffset < 8; ++byteOffset) {
+		sint16 *dst = mRowKernels.data() + mRowKernelSize * byteOffset;
+		// Kernel width actually needed for this misalignment, capped by the
+		// rounded-up source extent.
+		int ksizeThisOffset = std::min<int>(ksize, (byteOffset + srcw + 7) & ~7);
+
+		mKernelSizeByOffset[byteOffset] = ksizeThisOffset;
+
+		sint32 u = axis.u;
+		sint32 uoffmin = -byteOffset;
+		sint32 uoffmax = ((srcw + byteOffset + 7) & ~7) - byteOffset - ksizeThisOffset;
+		for(uint32 i=0; i<w; ++i) {
+			sint32 uoffset = u >> 16;
+			// Snap the load base down to 8-byte alignment relative to the
+			// misaligned source pointer, then clamp to the valid range.
+			sint32 uoffset2 = ((uoffset + byteOffset) & ~7) - byteOffset;
+
+			if (uoffset2 < uoffmin)
+				uoffset2 = uoffmin;
+
+			if (uoffset2 > uoffmax)
+				uoffset2 = uoffmax;
+
+			// Entry header: 32-bit offset + six zero words of padding.
+			*(sint32 *)dst = uoffset2;
+			dst += 2;
+			*dst++ = 0;
+			*dst++ = 0;
+			*dst++ = 0;
+			*dst++ = 0;
+			*dst++ = 0;
+			*dst++ = 0;
+
+			uint32 phase = (u >> 8) & 255;
+			const sint32 *src = &mFilterBank[kstride * phase];
+
+			sint32 start = 0;
+			sint32 end = kstride;
+
+			int dstoffset = uoffset - uoffset2;
+
+			// check for filter kernel overlapping left source boundary
+			if (uoffset < 0)
+				start = -uoffset;
+
+			// check for filter kernel overlapping right source boundary
+			if (uoffset + end > (sint32)srcw)
+				end = srcw - uoffset;
+
+			VDASSERT(dstoffset + start >= 0);
+			VDASSERT(dstoffset + end <= ksizeThisOffset);
+
+			sint16 *dst2 = dst + dstoffset;
+			dst += ksizeThisOffset;
+
+			for(int j=start; j<end; ++j)
+				dst2[j] = src[j];
+
+			// Fold out-of-range taps into the first/last valid tap (edge clamp).
+			if (start > 0)
+				dst2[start] = std::accumulate(src, src+start, dst2[start]);
+
+			if (end < kstride)
+				dst2[end - 1] = std::accumulate(src+end, src+kstride, dst2[end - 1]);
+
+			u += axis.dudx;
+		}
+	}
+
+	// swizzle rows where optimization is possible
+	vdfastvector<sint16> temp;
+
+	int quads = w >> 2;
+	int quadRemainder = w & 3;
+
+	for(int byteOffset = 0; byteOffset < 8; ++byteOffset) {
+		int ksizeThisOffset = mKernelSizeByOffset[byteOffset];
+		int kpairs = ksizeThisOffset >> 3;
+
+		if (ksizeThisOffset < 8 || ksizeThisOffset > 16) {
+			mbQuadOptimizationEnabled[byteOffset] = false;
+		} else {
+			// Stride of one unswizzled entry, in dwords (header = 4 dwords).
+			ptrdiff_t unswizzledStride = (ksizeThisOffset >> 1) + 4;
+
+			mbQuadOptimizationEnabled[byteOffset] = true;
+			mTailOffset[byteOffset] = quads * (8 + ksizeThisOffset*4);
+
+			uint32 *dst = (uint32 *)&mRowKernels[mRowKernelSize * byteOffset];
+			temp.resize(mRowKernelSize);
+			memcpy(temp.data(), dst, mRowKernelSize*2);
+
+			const uint32 *src0 = (const uint32 *)temp.data();
+			const uint32 *src1 = src0 + unswizzledStride;
+			const uint32 *src2 = src1 + unswizzledStride;
+			const uint32 *src3 = src2 + unswizzledStride;
+			ptrdiff_t srcskip = unswizzledStride * 3;
+
+			for(int q = 0; q < quads; ++q) {
+				// Four offsets, then coefficients interleaved in groups of
+				// four dwords per pixel.
+				dst[0] = src0[0];
+				dst[1] = src1[0];
+				dst[2] = src2[0];
+				dst[3] = src3[0];
+				src0 += 4;
+				src1 += 4;
+				src2 += 4;
+				src3 += 4;
+				dst += 4;
+
+				for(int p = 0; p < kpairs; ++p) {
+					dst[ 0] = src0[0];
+					dst[ 1] = src0[1];
+					dst[ 2] = src0[2];
+					dst[ 3] = src0[3];
+					dst[ 4] = src1[0];
+					dst[ 5] = src1[1];
+					dst[ 6] = src1[2];
+					dst[ 7] = src1[3];
+					dst[ 8] = src2[0];
+					dst[ 9] = src2[1];
+					dst[10] = src2[2];
+					dst[11] = src2[3];
+					dst[12] = src3[0];
+					dst[13] = src3[1];
+					dst[14] = src3[2];
+					dst[15] = src3[3];
+					dst += 16;
+					src0 += 4;
+					src1 += 4;
+					src2 += 4;
+					src3 += 4;
+				}
+
+				src0 += srcskip;
+				src1 += srcskip;
+				src2 += srcskip;
+				src3 += srcskip;
+			}
+
+			// Leftover 1-3 entries stay in the unswizzled layout (tail path).
+			memcpy(dst, src0, unswizzledStride * 4 * quadRemainder);
+		}
+	}
+}
+
+// Horizontal resample of one 8-bpp row (SSE4.1 fast paths). The kernel set
+// is selected by the source pointer's misalignment (low 3 bits); groups of
+// four output pixels use the k8/k16 routines and the remaining 1-3 pixels
+// fall back to the generic routine with the tail kernel stream.
+void VDResamplerSeparableTableRowStage8SSE41::Process(void *dst, const void *src, uint32 w) {
+	int byteOffset = (int)(ptrdiff_t)src & 7;
+	const sint16 *ksrc = &mRowKernels[mRowKernelSize * byteOffset];
+
+	int ksize = mKernelSizeByOffset[byteOffset];
+	if (mbQuadOptimizationEnabled[byteOffset]) {
+		if (w >= 4) {
+			if (ksize == 16)
+				vdasm_resize_table_row_8_k16_4x_SSE41(dst, src, w >> 2, ksrc);
+			else
+				vdasm_resize_table_row_8_k8_4x_SSE41(dst, src, w >> 2, ksrc);
+		}
+
+		if (w & 3)
+			vdasm_resize_table_row_8_SSE41((char *)dst + (w & ~3), src, w & 3, ksrc + mTailOffset[byteOffset], ksize);
+	} else {
+		vdasm_resize_table_row_8_SSE41(dst, src, w, ksrc, ksize);
+	}
+}
+
+// Incremental-u (32-bpp) entry point: forward to the shared MMX routine;
+// the bank stores 256 phases, so the kernel width per phase is size()>>8.
+void VDResamplerSeparableTableRowStage8SSE41::Process(void *dst, const void *src, uint32 w, uint32 u, uint32 dudx) {
+	const int *kernels = (const int *)mFilterBank.data();
+	const int kernelWidth = (int)mFilterBank.size() >> 8;
+
+	vdasm_resize_table_row_MMX((uint32 *)dst, (const uint32 *)src, kernels, kernelWidth, w, u, dudx);
+}
+
+///////////////////////////////////////////////////////////////////////////
+
+// SSE4.1 8-bpp column stage: swizzles the coefficient bank into the
+// interleaved-pair layout shared with the MMX column kernels.
+VDResamplerSeparableTableColStage8SSE41::VDResamplerSeparableTableColStage8SSE41(const IVDResamplerFilter& filter)
+	: VDResamplerColStageSeparableTable8(filter)
+{
+	VDResamplerSwizzleTable(mFilterBank.data(), (unsigned)mFilterBank.size() >> 1);
+}
+
+// Vertical resample of one 8-bpp row (SSE4.1). 2- and 4-tap kernels use the
+// SSE4.1 routines; other widths fall back to the generic MMX routine. The
+// trailing w&3 pixels are computed in scalar code against the swizzled
+// coefficient table (pairs interleaved per 4 columns, hence filter2 += 4).
+void VDResamplerSeparableTableColStage8SSE41::Process(void *dst0, const void *const *src0, uint32 w, sint32 phase) {
+	uint8 *dst = (uint8 *)dst0;
+	const uint8 *const *src = (const uint8 *const *)src0;
+	const unsigned ksize = (unsigned)mFilterBank.size() >> 8;	// taps per phase; bank holds 256 phases
+	const sint16 *filter = (const sint16 *)&mFilterBank[((phase>>8)&0xff) * ksize];
+
+	int w4 = w & ~3;
+
+	if (w4) {
+		switch(ksize) {
+		case 2:
+			vdasm_resize_table_col_8_k2_SSE41(dst, (const void *const *)src, w4, filter);
+			break;
+
+		case 4:
+			vdasm_resize_table_col_8_k4_SSE41(dst, (const void *const *)src, w4, filter);
+			break;
+
+		default:
+			vdasm_resize_table_col_8_MMX(dst, (const void *const *)src, w4, filter, ksize);
+			break;
+		}
+	}
+
+	for(uint32 i=w4; i<w; ++i) {
+		int b = 0x2000;		// 0.5 in 1.14 fixed point
+		const sint16 *filter2 = filter;
+		const uint8 *const *src2 = src;
+
+		for(unsigned j = ksize; j; j -= 2) {
+			sint32 p0 = (*src2++)[i];
+			sint32 p1 = (*src2++)[i];
+			sint32 coeff0 = filter2[0];
+			sint32 coeff1 = filter2[1];
+			filter2 += 4;
+
+			b += p0*coeff0;
+			b += p1*coeff1;
+		}
+
+		b >>= 14;
+
+		// Branchless clamp: if out of [0,255], b<0 yields 0 and b>255 yields 255.
+		if ((uint32)b >= 0x00000100)
+			b = ~b >> 31;
+
+		dst[i] = (uint8)b;
+	}
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/stretchblt_reference.cpp b/src/thirdparty/VirtualDub/Kasumi/source/stretchblt_reference.cpp
new file mode 100644
index 000000000..3afdec910
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/stretchblt_reference.cpp
@@ -0,0 +1,816 @@
+#include <vd2/system/memory.h>
+#include <vd2/system/cpuaccel.h>
+#include <vd2/Kasumi/pixmap.h>
+#include <vd2/Kasumi/pixmaputils.h>
+
+namespace {
+	// Per-blit state for the nearest-neighbor reference loops.
+	// Horizontal sampling uses a 32-bit fraction 'u' stepped by 'dudx' plus
+	// an integer step 'uinc'; vertical sampling mirrors that with 'v'/'dvdy'
+	// and the whole-row increment 'srcinc'.
+	struct VDPixmapReferenceStretchBltParameters {
+		void *dst;			// current destination row
+		ptrdiff_t dstpitch;		// destination row pitch in bytes
+		const void *src;		// current source row (already offset horizontally)
+		ptrdiff_t srcpitch;		// source row pitch in bytes (carry step)
+		ptrdiff_t srcinc;		// integer vertical step per output row, in bytes
+		sint32 dx;			// main-span width in pixels
+		sint32 dy;			// number of output rows
+		uint32 u;			// horizontal source fraction (32-bit)
+		uint32 uinc;			// integer horizontal step per output pixel
+		uint32 dudx;			// horizontal fraction increment
+		uint32 v;			// vertical source fraction (32-bit)
+		uint32 dvdy;			// vertical fraction increment
+		sint32 xprecopy;		// pixels replicated before the main span
+		sint32 xpostcopy;		// pixels replicated after the main span
+		ptrdiff_t xprepos;		// byte offset of the left replicate texel
+		ptrdiff_t xpostpos;		// byte offset of the right replicate texel
+
+		// Step to the next output row: advance by the integer row step and
+		// add one extra source row when the 32-bit fraction wraps (carry).
+		void advance() {
+			dst = (char *)dst + dstpitch;
+			src = (char *)src + srcinc;
+
+			uint32 vt = v + dvdy;
+
+			if (vt < v)
+				src = (char *)src + srcpitch;
+
+			v = vt;
+		}
+	};
+}
+
+// Nearest-neighbor blit, 1 byte/pixel. Out-of-range left/right columns
+// replicate the edge texel at xprepos/xpostpos. Horizontal stepping: srcp
+// advances by the integer step (uinc) plus one extra pixel whenever the
+// 32-bit fraction wraps (ut < u detects the carry).
+void VDPixmapStretchBlt_Any8_nearest_reference(VDPixmapReferenceStretchBltParameters params) {
+	do {
+		uint8 *dstp = (uint8 *)params.dst;
+		const uint8 *srcp = (const uint8 *)params.src;
+		uint32 u = params.u;
+
+		if (params.xprecopy) {
+			VDMemset8(dstp, *(const uint8 *)((const char *)params.src + params.xprepos), params.xprecopy);
+			dstp += params.xprecopy;
+		}
+
+		sint32 wt = params.dx;
+
+		if (wt > 0)
+			do {
+				*dstp++ = *srcp;
+
+				uint32 ut = u + params.dudx;
+				srcp += ut<u;		// carry out of the fraction -> one extra pixel
+				srcp += params.uinc;
+				u = ut;
+			} while(--wt);
+
+		if (params.xpostcopy)
+			VDMemset8(dstp, *(const uint8 *)((const char *)params.src + params.xpostpos), params.xpostcopy);
+
+		params.advance();
+	} while(--params.dy);
+}
+
+// Nearest-neighbor blit, 2 bytes/pixel. Same structure as the 8-bpp loop:
+// edge replication outside the main span, fractional stepping with carry.
+void VDPixmapStretchBlt_Any16_nearest_reference(VDPixmapReferenceStretchBltParameters params) {
+	do {
+		uint16 *dstp = (uint16 *)params.dst;
+		const uint16 *srcp = (const uint16 *)params.src;
+		uint32 u = params.u;
+
+		if (params.xprecopy) {
+			VDMemset16(dstp, *(const uint16 *)((const char *)params.src + params.xprepos), params.xprecopy);
+			dstp += params.xprecopy;
+		}
+
+		sint32 wt = params.dx;
+
+		if (wt > 0)
+			do {
+				*dstp++ = *srcp;
+
+				uint32 ut = u + params.dudx;
+				srcp += ut<u;		// carry out of the fraction -> one extra pixel
+				srcp += params.uinc;
+				u = ut;
+			} while(--wt);
+
+		if (params.xpostcopy)
+			VDMemset16(dstp, *(const uint16 *)((const char *)params.src + params.xpostpos), params.xpostcopy);
+
+		params.advance();
+	} while(--params.dy);
+}
+
+// Nearest-neighbor blit, 3 bytes/pixel. No 24-bit memset exists, so the
+// replicate regions copy the three edge-texel bytes in a manual loop; the
+// fractional stepping multiplies both the carry and uinc by 3 bytes.
+void VDPixmapStretchBlt_Any24_nearest_reference(VDPixmapReferenceStretchBltParameters params) {
+	do {
+		uint8 *dstp = (uint8 *)params.dst;
+		const uint8 *srcp = (const uint8 *)params.src;
+		uint32 u = params.u;
+
+		if (params.xprecopy) {
+			const uint8 *repsrc = (const uint8 *)params.src + params.xprepos;
+			const uint8 p0 = repsrc[0];
+			const uint8 p1 = repsrc[1];
+			const uint8 p2 = repsrc[2];
+
+			for(sint32 i=0; i<params.xprecopy; ++i) {
+				dstp[0] = p0;
+				dstp[1] = p1;
+				dstp[2] = p2;
+				dstp += 3;
+			}
+		}
+
+		sint32 wt = params.dx;
+
+		if (wt > 0)
+			do {
+				dstp[0] = srcp[0];
+				dstp[1] = srcp[1];
+				dstp[2] = srcp[2];
+				dstp += 3;
+
+				uint32 ut = u + params.dudx;
+				srcp += (ut<u)*3;	// carry out of the fraction -> one extra pixel
+				srcp += params.uinc*3;
+				u = ut;
+			} while(--wt);
+
+		if (params.xpostcopy) {
+			const uint8 *repsrc = (const uint8 *)params.src + params.xpostpos;
+			const uint8 p0 = repsrc[0];
+			const uint8 p1 = repsrc[1];
+			const uint8 p2 = repsrc[2];
+
+			for(sint32 i=0; i<params.xpostcopy; ++i) {
+				dstp[0] = p0;
+				dstp[1] = p1;
+				dstp[2] = p2;
+				dstp += 3;
+			}
+		}
+
+		params.advance();
+	} while(--params.dy);
+}
+
+// Nearest-neighbor blit, 4 bytes/pixel. Same structure as the 8-bpp loop:
+// edge replication outside the main span, fractional stepping with carry.
+void VDPixmapStretchBlt_Any32_nearest_reference(VDPixmapReferenceStretchBltParameters params) {
+	do {
+		uint32 *dstp = (uint32 *)params.dst;
+		const uint32 *srcp = (const uint32 *)params.src;
+		uint32 u = params.u;
+
+		if (params.xprecopy) {
+			VDMemset32(dstp, *(const uint32 *)((const char *)params.src + params.xprepos), params.xprecopy);
+			dstp += params.xprecopy;
+		}
+
+		sint32 wt = params.dx;
+		if (wt > 0)
+			do {
+				*dstp++ = *srcp;
+
+				uint32 ut = u + params.dudx;
+				srcp += ut<u;		// carry out of the fraction -> one extra pixel
+				srcp += params.uinc;
+				u = ut;
+			} while(--wt);
+
+		if (params.xpostcopy)
+			VDMemset32(dstp, *(const uint32 *)((const char *)params.src + params.xpostpos), params.xpostcopy);
+
+		params.advance();
+	} while(--params.dy);
+}
+
+///////////////////////////////////////////////////////////////////////////
+
+namespace {
+	// Splits a destination span of dx pixels into [precopy][main][postcopy]
+	// bands for nearest-neighbor sampling: pixels whose 32.32 source
+	// coordinate falls left of the source replicate the texel at xprepos,
+	// pixels falling right of it replicate the texel at xpostpos, and the
+	// in-range middle is sampled normally. On return, u64 has been advanced
+	// past the precopy band. 'du' is the source extent in pixels.
+	void VDSetupNearestSamplingParameters(sint64& u64, sint64 dudx, sint32 dx, sint32 du, sint32& xprecopy, sint32& xprepos, sint32& xmain, sint32& xpostcopy, sint32& xpostpos) {
+		sint64 ulo = u64;
+		sint64 uhi = u64 + dudx * (dx - 1);
+		sint64 tdudx = dudx;
+		const sint64 ulimit = ((sint64)du << 32);
+
+		xprepos = 0;
+		xpostpos = du-1;
+
+		if (!tdudx) {
+			// Degenerate gradient: every output pixel samples the same column.
+			if (u64 < 0)
+				xprecopy = dx;
+			else if (u64 >= ulimit)
+				xpostcopy = dx;		// FIX: right overflow must replicate the right edge (was xprecopy = dx, which used the left texel)
+			else
+				xmain = dx;
+		} else {
+			// Work with an increasing coordinate; remember to swap the bands
+			// back below if the gradient was negative.
+			if (tdudx < 0) {
+				std::swap(ulo, uhi);
+				tdudx = -tdudx;
+			}
+
+			if (ulo < 0) {
+				if (uhi < 0)
+					xprecopy = dx;
+				else
+					xprecopy = (sint32)((-ulo-1) / tdudx) + 1;
+
+				VDASSERT(xprecopy <= 0 || (uint64)ulo >= (uint64)ulimit);
+				VDASSERT(xprecopy <= 0 || (uint64)(ulo + tdudx * (xprecopy-1)) >= (uint64)ulimit);
+			}
+
+			if (uhi >= ulimit) {
+				if (ulo >= ulimit)
+					xpostcopy = dx;
+				else
+					xpostcopy = (sint32)((uhi - ulimit) / tdudx) + 1;
+
+				VDASSERT(xpostcopy <= 0 || (uint64)uhi >= (uint64)ulimit);
+				VDASSERT(xpostcopy <= 0 || (uint64)(uhi - tdudx * (xpostcopy - 1)) >= (uint64)ulimit);
+			}
+
+			if (dudx < 0) {
+				std::swap(xprecopy, xpostcopy);
+				std::swap(xprepos, xpostpos);
+			}
+
+			xmain = dx - (xprecopy + xpostcopy);
+		}
+
+		// sanity-check parameters
+
+		VDASSERT(xprecopy>=0 && xprecopy <= dx);
+		VDASSERT(xpostcopy>=0 && xpostcopy <= dx);
+		VDASSERT(xmain>=0 && xmain <= dx);
+
+		VDASSERT(xprecopy <= 0 || (uint64)u64 >= (uint64)ulimit);
+		VDASSERT(xprecopy <= 0 || (uint64)(u64 + dudx * (xprecopy-1)) >= (uint64)ulimit);
+		VDASSERT(xmain <= 0 || (uint64)(u64 + dudx * xprecopy) < (uint64)ulimit);
+		VDASSERT(xmain <= 0 || (uint64)(u64 + dudx * (xprecopy+xmain-1)) < (uint64)ulimit);
+		VDASSERT(xpostcopy <= 0 || (uint64)(u64 + dudx * (xprecopy + xmain)) >= (uint64)ulimit);
+		VDASSERT(xpostcopy <= 0 || (uint64)(u64 + dudx * (xprecopy + xmain + xpostcopy - 1)) >= (uint64)ulimit);
+
+		u64 += dudx * xprecopy;
+	}
+}
+
+// Nearest-neighbor stretch/shrink blit (reference implementation).
+// Destination and source coordinates are 16.16 fixed point; gradients are
+// computed in 32.32 so per-pixel/per-row stepping carries the sub-pixel
+// fraction exactly. Out-of-source regions replicate the edge texels, and the
+// blit is issued as up to three vertical bands (pre/main/post).
+// Returns false if the format is unsupported or differs between src and dst.
+bool VDPixmapStretchBltNearest_reference(const VDPixmap& dst, sint32 x1, sint32 y1, sint32 x2, sint32 y2, const VDPixmap& src, sint32 u1, sint32 v1, sint32 u2, sint32 v2) {
+	// we don't support format conversion
+	if (dst.format != src.format)
+		return false;
+
+	void (*pBlitter)(VDPixmapReferenceStretchBltParameters);
+	int bpp;
+
+	switch(src.format) {
+	case nsVDPixmap::kPixFormat_Pal8:
+		pBlitter = VDPixmapStretchBlt_Any8_nearest_reference;
+		bpp = 1;
+		break;
+	case nsVDPixmap::kPixFormat_XRGB1555:
+	case nsVDPixmap::kPixFormat_RGB565:
+		pBlitter = VDPixmapStretchBlt_Any16_nearest_reference;
+		bpp = 2;
+		break;
+	case nsVDPixmap::kPixFormat_RGB888:
+		pBlitter = VDPixmapStretchBlt_Any24_nearest_reference;
+		bpp = 3;
+		break;
+	case nsVDPixmap::kPixFormat_XRGB8888:
+		pBlitter = VDPixmapStretchBlt_Any32_nearest_reference;
+		bpp = 4;
+		break;
+	default:
+		return false;
+	}
+
+	// preemptive clip to prevent gradient calculations from crashing
+	if (x2 == x1 || y2 == y1)
+		return true;
+
+	// translate destination flips into source flips
+	if (x1 > x2) {
+		std::swap(x1, x2);
+		std::swap(u1, u2);
+	}
+
+	if (y1 > y2) {
+		std::swap(y1, y2);
+		std::swap(v1, v2);
+	}
+
+	// compute gradients
+	sint32 dx = x2 - x1;
+	sint32 dy = y2 - y1;
+	sint32 du = u2 - u1;
+	sint32 dv = v2 - v1;
+	sint64 dudx = ((sint64)du << 32) / dx;		// must truncate toward zero to prevent overflow
+	sint64 dvdy = ((sint64)dv << 32) / dy;
+
+	// prestep top-left point to pixel center and convert destination coordinates to integer
+	sint64 u64 = (sint64)u1 << 16;
+	sint64 v64 = (sint64)v1 << 16;
+	sint32 prestepx = (0x8000 - x1) & 0xffff;
+	sint32 prestepy = (0x8000 - y1) & 0xffff;
+
+	u64 += (dudx * prestepx) >> 16;
+	v64 += (dvdy * prestepy) >> 16;
+
+	sint32 x1i = (x1 + 0x8000) >> 16;
+	sint32 y1i = (y1 + 0x8000) >> 16;
+	sint32 x2i = (x2 + 0x8000) >> 16;
+	sint32 y2i = (y2 + 0x8000) >> 16;
+
+	// destination clipping
+	if (x1i < 0) {
+		u64 -= dudx * x1i;		// x1i negative: steps the source coordinate forward
+		x1i = 0;
+	}
+
+	if (y1i < 0) {
+		v64 -= dvdy * y1i;
+		y1i = 0;
+	}
+
+	if (x2i > dst.w)
+		x2i = dst.w;
+
+	if (y2i > dst.h)
+		y2i = dst.h;
+
+	if (x1i >= x2i || y1i >= y2i)
+		return true;
+
+	// Calculate horizontal clip parameters
+	sint32 xprecopy = 0, xpostcopy = 0;
+	int xprepos = 0;
+	int xpostpos = src.w-1;
+	int xmain = 0;
+
+	VDSetupNearestSamplingParameters(u64, dudx, x2i-x1i, src.w, xprecopy, xprepos, xmain, xpostcopy, xpostpos);
+
+	// Calculate vertical clip parameters
+	sint32 yprecopy = 0, ypostcopy = 0;
+	int yprepos = 0;
+	int ypostpos = src.h-1;
+	int ymain = 0;
+
+	VDSetupNearestSamplingParameters(v64, dvdy, y2i-y1i, src.h, yprecopy, yprepos, ymain, ypostcopy, ypostpos);
+
+	// set up parameter block
+	VDPixmapReferenceStretchBltParameters params;
+
+	char *srcbase = (char *)src.data + (sint32)(u64 >> 32) * bpp;
+
+	params.dst = (char *)dst.data + y1i * dst.pitch + x1i * bpp;
+	params.dstpitch = dst.pitch;
+	params.src = srcbase + (sint32)(v64 >> 32) * src.pitch;
+	params.srcpitch = src.pitch;
+	params.srcinc = (sint32)(dvdy >> 32) * src.pitch;
+	params.dx = xmain;
+	params.dy = ymain;
+	params.u = (uint32)u64;
+	params.uinc = (uint32)(dudx >> 32);
+	params.dudx = (uint32)dudx;
+	params.v = (uint32)v64;
+	params.dvdy = (uint32)dvdy;
+	params.xprecopy = xprecopy;
+	params.xprepos = (xprepos - (sint32)(u64 >> 32)) * bpp;		// relative to params.src
+	params.xpostcopy = xpostcopy;
+	params.xpostpos = (xpostpos - (sint32)(u64 >> 32)) * bpp;
+
+	// band above the source: replicate the top source row
+	if (yprecopy > 0) {
+		VDPixmapReferenceStretchBltParameters preparams(params);
+
+		preparams.src = srcbase + yprepos * src.pitch;
+		preparams.srcinc = 0;
+		preparams.dy = yprecopy;
+		preparams.v = 0;
+		preparams.dvdy = 0;
+
+		pBlitter(preparams);
+
+		params.dst = (char *)params.dst + params.dstpitch * yprecopy;
+	}
+
+	// in-range band
+	if (ymain > 0)
+		pBlitter(params);
+
+	// band below the source: replicate the bottom source row
+	if (ypostcopy > 0) {
+		VDPixmapReferenceStretchBltParameters postparams(params);
+
+		postparams.dst = (char *)params.dst + params.dstpitch * params.dy;
+		postparams.src = srcbase + ypostpos * src.pitch;
+		postparams.srcpitch = 0;
+		postparams.srcinc = 0;
+		postparams.dy = ypostcopy;
+		postparams.v = 0;
+		postparams.dvdy = 0;
+
+		pBlitter(postparams);
+	}
+	return true;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace {
+	// Linear blend of two XRGB1555 pixels, f in [0,31]. Red+blue are lerped
+	// together in one masked term and green in another; each field carries a
+	// rounding bias before the >>5.
+	uint32 lerp_XRGB1555(sint32 a, sint32 b, sint32 f) {
+		sint32 a_rb = a & 0x7c1f;
+		sint32 a_g = a & 0x03e0;
+		sint32 b_rb = b & 0x7c1f;
+		sint32 b_g = b & 0x03e0;
+
+		const sint32 rb = (a_rb + (((b_rb - a_rb)*f + 0x4010) >> 5)) & 0x7c1f;
+		const sint32 g = (a_g + (((b_g - a_g )*f + 0x0200) >> 5)) & 0x03e0;
+
+		return rb + g;
+	}
+
+	// Linear blend of two XRGB8888 pixels, f in [0,255], same channel-pair
+	// technique (red+blue masked together, green separate).
+	uint32 lerp_XRGB8888(sint32 a, sint32 b, sint32 f) {
+		sint32 a_rb = a & 0xff00ff;
+		sint32 a_g = a & 0x00ff00;
+		sint32 b_rb = b & 0xff00ff;
+		sint32 b_g = b & 0x00ff00;
+
+		const uint32 rb = (a_rb + (((b_rb - a_rb)*f + 0x00800080) >> 8)) & 0xff00ff;
+		const uint32 g = (a_g + (((b_g - a_g )*f + 0x00008000) >> 8)) & 0x00ff00;
+
+		return rb + g;
+	}
+
+	// Bilinear blend of four XRGB8888 pixels (a=TL, b=TR, c=BL, d=BR),
+	// x and y in [0,255]: two horizontal lerps, then one vertical.
+	// NOTE(review): the final vertical lerp carries no rounding bias, unlike
+	// the horizontal ones -- presumably intentional; confirm against upstream.
+	uint32 bilerp_RGB888(sint32 a, sint32 b, sint32 c, sint32 d, sint32 x, sint32 y) {
+		sint32 a_rb = a & 0xff00ff;
+		sint32 a_g = a & 0x00ff00;
+		sint32 b_rb = b & 0xff00ff;
+		sint32 b_g = b & 0x00ff00;
+		sint32 c_rb = c & 0xff00ff;
+		sint32 c_g = c & 0x00ff00;
+		sint32 d_rb = d & 0xff00ff;
+		sint32 d_g = d & 0x00ff00;
+
+		const uint32 top_rb = (a_rb + (((b_rb - a_rb)*x + 0x00800080) >> 8)) & 0xff00ff;
+		const uint32 top_g = (a_g + (((b_g - a_g )*x + 0x00008000) >> 8)) & 0x00ff00;
+		const uint32 bot_rb = (c_rb + (((d_rb - c_rb)*x + 0x00800080) >> 8)) & 0xff00ff;
+		const uint32 bot_g = (c_g + (((d_g - c_g )*x + 0x00008000) >> 8)) & 0x00ff00;
+
+		const uint32 final_rb = (top_rb + (((bot_rb - top_rb)*y) >> 8)) & 0xff00ff;
+		const uint32 final_g = (top_g + (((bot_g - top_g )*y) >> 8)) & 0x00ff00;
+
+		return final_rb + final_g;
+	}
+
+	// Bilinear blend of four XRGB1555 pixels, x and y in [0,31].
+	uint32 bilerp_XRGB1555(sint32 a, sint32 b, sint32 c, sint32 d, sint32 x, sint32 y) {
+		sint32 a_rb = a & 0x7c1f;
+		sint32 a_g = a & 0x03e0;
+		sint32 b_rb = b & 0x7c1f;
+		sint32 b_g = b & 0x03e0;
+		sint32 c_rb = c & 0x7c1f;
+		sint32 c_g = c & 0x03e0;
+		sint32 d_rb = d & 0x7c1f;
+		sint32 d_g = d & 0x03e0;
+
+		const sint32 top_rb = (a_rb + (((b_rb - a_rb)*x + 0x4010) >> 5)) & 0x7c1f;
+		const sint32 top_g = (a_g + (((b_g - a_g )*x + 0x0200) >> 5)) & 0x03e0;
+		const sint32 bot_rb = (c_rb + (((d_rb - c_rb)*x + 0x4010) >> 5)) & 0x7c1f;
+		const sint32 bot_g = (c_g + (((d_g - c_g )*x + 0x0200) >> 5)) & 0x03e0;
+
+		const sint32 final_rb = (top_rb + (((bot_rb - top_rb)*y + 0x4010) >> 5)) & 0x7c1f;
+		const sint32 final_g = (top_g + (((bot_g - top_g )*y + 0x0200) >> 5)) & 0x03e0;
+
+		return final_rb + final_g;
+	}
+
+	// Bilinear blend of four RGB565 pixels, x and y in [0,63] (green has 6
+	// bits, so both fractions use the 6-bit scale).
+	uint32 bilerp_RGB565(sint32 a, sint32 b, sint32 c, sint32 d, sint32 x, sint32 y) {
+		sint32 a_rb = a & 0xf81f;
+		sint32 a_g = a & 0x07e0;
+		sint32 b_rb = b & 0xf81f;
+		sint32 b_g = b & 0x07e0;
+		sint32 c_rb = c & 0xf81f;
+		sint32 c_g = c & 0x07e0;
+		sint32 d_rb = d & 0xf81f;
+		sint32 d_g = d & 0x07e0;
+
+		const sint32 top_rb = (a_rb + (((b_rb - a_rb)*x + 0x8010) >> 6)) & 0xf81f;
+		const sint32 top_g = (a_g + (((b_g - a_g )*x + 0x0400) >> 6)) & 0x07e0;
+		const sint32 bot_rb = (c_rb + (((d_rb - c_rb)*x + 0x8010) >> 6)) & 0xf81f;
+		const sint32 bot_g = (c_g + (((d_g - c_g )*x + 0x0400) >> 6)) & 0x07e0;
+
+		const sint32 final_rb = (top_rb + (((bot_rb - top_rb)*y + 0x8010) >> 6)) & 0xf81f;
+		const sint32 final_g = (top_g + (((bot_g - top_g )*y + 0x0400) >> 6)) & 0x07e0;
+
+		return final_rb + final_g;
+	}
+}
+
+///////////////////////////////////////////////////////////////////////////
+
+namespace {
	// Per-row parameter block for the horizontal bilinear filter routines.
	// Filled in by VDPixmapStretchBltBilinear_reference and consumed by both
	// the C and the MMX horizontal filters.
	struct VDPixmapReferenceStretchBltBilinearParameters {
		void *dst;				// destination row (filter also writes xprecopy pixels before it)
		const void *src;		// source row
		uint32 u;				// fractional source u for the first middle pixel (top bits are the lerp fraction)
		uint32 uinc;			// whole-pixel source step per destination pixel (signed value stored in a uint32)
		uint32 dudx;			// fractional source step per destination pixel

		ptrdiff_t xprepos;		// byte offset from src of the pixel replicated across the left edge
		ptrdiff_t xpostpos;		// byte offset from src of the pixel replicated across the right edge
		sint32 xprecopy;		// number of left-edge pixels filled by replication
		sint32 xpostcopy;		// number of right-edge pixels filled by replication
		sint32 xmidsize;		// number of normally filtered (middle) pixels
	};
+
+ void VDPixmapStretchBiH_XRGB1555_to_XRGB1555(const VDPixmapReferenceStretchBltBilinearParameters& params) {
+ uint16 *dst = (uint16 *)params.dst;
+ const uint16 *src = (const uint16 *)params.src;
+
+ if (params.xprecopy)
+ VDMemset16(dst - params.xprecopy, *(const uint16 *)((const char *)params.src + params.xprepos), params.xprecopy);
+
+ if (params.xmidsize) {
+ sint32 w = params.xmidsize;
+ uint32 u = params.u;
+ const uint32 dudx = params.dudx;
+ const ptrdiff_t uinc = params.uinc;
+
+ do {
+ *dst++ = lerp_XRGB1555(src[0], src[1], u >> 27);
+
+ const uint32 ut = u + dudx;
+ src += uinc + (ut < u);
+ u = ut;
+ } while(--w);
+ }
+
+ if (params.xpostcopy)
+ VDMemset16(dst, *(const uint16 *)((const char *)params.src + params.xpostpos), params.xpostcopy);
+ }
+
+ void VDPixmapStretchBiH_XRGB8888_to_XRGB8888(const VDPixmapReferenceStretchBltBilinearParameters& params) {
+ uint32 *dst = (uint32 *)params.dst;
+ const uint32 *src = (const uint32 *)params.src;
+
+ if (params.xprecopy)
+ VDMemset32(dst - params.xprecopy, *(const uint32 *)((const char *)params.src + params.xprepos), params.xprecopy);
+
+ if (params.xmidsize) {
+ sint32 w = params.xmidsize;
+ uint32 u = params.u;
+ const uint32 dudx = params.dudx;
+ const ptrdiff_t uinc = params.uinc;
+
+ do {
+ *dst++ = lerp_XRGB8888(src[0], src[1], u >> 24);
+
+ const uint32 ut = u + dudx;
+ src += uinc + (ut < u);
+ u = ut;
+ } while(--w);
+ }
+
+ if (params.xpostcopy)
+ VDMemset32(dst, *(const uint32 *)((const char *)params.src + params.xpostpos), params.xpostcopy);
+ }
+
+ void VDPixmapStretchBiV_XRGB1555_to_XRGB1555(void *dstv, const void *src1v, const void *src2v, sint32 w, uint32 f) {
+ uint16 *dst = (uint16 *)dstv;
+ const uint16 *src1 = (const uint16 *)src1v;
+ const uint16 *src2 = (const uint16 *)src2v;
+
+ f >>= 27;
+
+ do {
+ *dst++ = lerp_XRGB1555(*src1++, *src2++, f);
+ } while(--w);
+ }
+
+ void VDPixmapStretchBiV_XRGB8888_to_XRGB8888(void *dstv, const void *src1v, const void *src2v, sint32 w, uint32 f) {
+ uint32 *dst = (uint32 *)dstv;
+ const uint32 *src1 = (const uint32 *)src1v;
+ const uint32 *src2 = (const uint32 *)src2v;
+
+ f >>= 24;
+
+ do {
+ *dst++ = lerp_XRGB8888(*src1++, *src2++, f);
+ } while(--w);
+ }
+}
+
+#ifdef _M_IX86
+extern "C" void vdasm_stretchbltH_XRGB8888_to_XRGB8888_MMX(const VDPixmapReferenceStretchBltBilinearParameters&);
+
+extern "C" void vdasm_stretchbltV_XRGB1555_to_XRGB1555_MMX(void *dstv, const void *src1v, const void *src2v, sint32 w, uint32 f);
+extern "C" void vdasm_stretchbltV_XRGB8888_to_XRGB8888_MMX(void *dstv, const void *src1v, const void *src2v, sint32 w, uint32 f);
+#endif
+
// Reference implementation of a bilinear stretch blit.  Maps the source
// rectangle (u1,v1)-(u2,v2) onto the destination rectangle (x1,y1)-(x2,y2);
// all eight coordinates are 16.16 fixed point.  Flipped destination
// rectangles are handled by mirroring the source; the destination is clipped
// to dst's bounds.  Returns false when the request cannot be handled (source
// coordinates outside the source bitmap, differing formats, or an
// unsupported format); returns true otherwise, including when everything
// clips away.
bool VDPixmapStretchBltBilinear_reference(const VDPixmap& dst, sint32 x1, sint32 y1, sint32 x2, sint32 y2, const VDPixmap& src, sint32 u1, sint32 v1, sint32 u2, sint32 v2) {
	// preemptive clip to prevent gradient calculations from crashing
	if (x2 == x1 || y2 == y1)
		return true;

	// we don't support source clipping
	if ((uint32)u1 > (uint32)(src.w << 16) || (uint32)v1 > (uint32)(src.h << 16))
		return false;

	if ((uint32)u2 > (uint32)(src.w << 16) || (uint32)v2 > (uint32)(src.h << 16))
		return false;

	// we don't support format changes (yet)
	if (dst.format != src.format)
		return false;

	// format determination: select C or MMX horizontal/vertical filter
	// routines and the pixel size for this format
	void (*pHorizontalFilter)(const VDPixmapReferenceStretchBltBilinearParameters& params);
	void (*pVerticalFilter)(void *dstv, const void *src1v, const void *src2v, sint32 w, uint32 f);
	int bpp;

#pragma vdpragma_TODO("fixme this is b0rken")
	switch(src.format) {
	case nsVDPixmap::kPixFormat_XRGB1555:
		pHorizontalFilter = VDPixmapStretchBiH_XRGB1555_to_XRGB1555;
#ifdef _M_IX86
		if (CPUGetEnabledExtensions() & CPUF_SUPPORTS_MMX)
			pVerticalFilter = vdasm_stretchbltV_XRGB1555_to_XRGB1555_MMX;
		else
#endif
			pVerticalFilter = VDPixmapStretchBiV_XRGB1555_to_XRGB1555;
		bpp = 2;
		break;
	case nsVDPixmap::kPixFormat_XRGB8888:
#ifdef _M_IX86
		if (CPUGetEnabledExtensions() & CPUF_SUPPORTS_MMX) {
			pHorizontalFilter = vdasm_stretchbltH_XRGB8888_to_XRGB8888_MMX;
			pVerticalFilter = vdasm_stretchbltV_XRGB8888_to_XRGB8888_MMX;
		} else
#endif
		{
			pHorizontalFilter = VDPixmapStretchBiH_XRGB8888_to_XRGB8888;
			pVerticalFilter = VDPixmapStretchBiV_XRGB8888_to_XRGB8888;
		}
		bpp = 4;
		break;
	default:
		return false;
	}

	// translate destination flips into source flips
	if (x1 > x2) {
		std::swap(x1, x2);
		std::swap(u1, u2);
	}

	if (y1 > y2) {
		std::swap(y1, y2);
		std::swap(v1, v2);
	}

	// compute gradients: source step per destination pixel, 32.32 fixed point
	sint32 dx = x2 - x1;
	sint32 dy = y2 - y1;
	sint32 du = u2 - u1;
	sint32 dv = v2 - v1;
	sint64 dudx = ((sint64)du << 32) / dx;		// must truncate toward zero to prevent overflow
	sint64 dvdy = ((sint64)dv << 32) / dy;

	// prestep top-left point to pixel center and convert destination coordinates to integer
	sint64 u64 = (sint64)u1 << 16;		// source coordinates widened to 32.32
	sint64 v64 = (sint64)v1 << 16;
	sint32 prestepx = (0x8000 - x1) & 0xffff;
	sint32 prestepy = (0x8000 - y1) & 0xffff;

	u64 += (dudx * prestepx) >> 16;
	v64 += (dvdy * prestepy) >> 16;

	sint32 x1i = (x1 + 0x8000) >> 16;
	sint32 y1i = (y1 + 0x8000) >> 16;
	sint32 x2i = (x2 + 0x8000) >> 16;
	sint32 y2i = (y2 + 0x8000) >> 16;

	// destination clipping: advance u/v to compensate for clipped rows/columns
	if (x1i < 0) {
		u64 -= dudx * x1i;
		x1i = 0;
	}

	if (y1i < 0) {
		v64 -= dvdy * y1i;
		y1i = 0;
	}

	if (x2i > dst.w)
		x2i = dst.w;

	if (y2i > dst.h)
		y2i = dst.h;

	if (x1i >= x2i || y1i >= y2i)
		return true;

	// bias by half a source pixel so the integer part of u64/v64 selects the
	// upper/left neighbor of each 2x2 filter footprint
	u64 -= 0x80000000;
	v64 -= 0x80000000;

	int xprepos = 0;		// source pixel replicated across the left edge
	int xpostpos = src.w-1;	// source pixel replicated across the right edge

	// lowest/highest u touched across the destination span
	sint64 ulo = u64;
	sint64 uhi = u64 + dudx * (x2i - x1i - 1);
	sint64 tdudx = dudx;

	if (ulo > uhi) {
		std::swap(ulo, uhi);
		tdudx = -tdudx;
	}

	// count destination pixels whose filter footprint falls off the left or
	// right source edge; those are filled by replication instead of filtering
	int xprecopy = 0;
	int xpostcopy = 0;

	if (ulo < 0) {
		xprecopy = (int)((1 - ulo) / tdudx) + 1;
	}

	const sint64 ulimit = ((sint64)(src.w-1) << 32);

	if (uhi >= ulimit)
		xpostcopy = (int)((uhi - ulimit - 1) / tdudx) + 1;

	// for mirrored blits the edge roles are swapped
	if (dudx < 0) {
		std::swap(xprecopy, xpostcopy);
		std::swap(xprepos, xpostpos);
	}

	u64 += dudx * xprecopy;		// skip over the replicated left-edge pixels
	const int xtotal = x2i - x1i;
	int xmidcopy = (x2i - x1i) - (xprecopy + xpostcopy);
	const sint32 ui = (sint32)(u64 >> 32);

	// set up parameter block

	VDPixmapReferenceStretchBltBilinearParameters params;

	params.u = (uint32)u64;
	params.uinc = (sint32)(dudx >> 32);
	params.dudx = (sint32)dudx;
	params.xprecopy = xprecopy;
	params.xprepos = (xprepos - ui) * bpp;		// byte offsets relative to srcp (which already includes ui)
	params.xpostcopy = xpostcopy;
	params.xpostpos = (xpostpos - ui) * bpp;
	params.xmidsize = xmidcopy;

	void *dstp = (char *)dst.data + y1i * dst.pitch + x1i * bpp;
	const void *srcp = (char *)src.data + ui * bpp;

	// two-row cache of horizontally filtered source rows; rows are only
	// refiltered when the vertical sweep moves onto new source lines
	VDPixmapBuffer window(xtotal, 2, src.format);

	void *pTempRow1 = window.data;
	void *pTempRow2 = (char *)window.data + window.pitch;
	int windowbottom = dvdy > 0 ? -0x7fffffff : 0x7fffffff;	// sentinel forces the initial fill

	do {
		sint32 iv = (sint32)(v64 >> 32);
		sint32 iv_bottom = iv + 1;

		// clamp the two-row footprint at the top/bottom of the source
		if (iv < 0)
			iv = iv_bottom = 0;

		if (iv >= src.h-1)
			iv = iv_bottom = src.h-1;

		if (dvdy < 0) {
			// sweeping upward: slide the window up, filtering rows as needed
			if (windowbottom > iv_bottom+1)
				windowbottom = iv_bottom+1;

			while(windowbottom > iv) {
				std::swap(pTempRow1, pTempRow2);

				--windowbottom;

				// the horizontal filter writes xprecopy pixels before dst,
				// so point dst past the replicated region
				params.dst = (char *)pTempRow1 + bpp * params.xprecopy;
				params.src = vdptroffset(srcp, windowbottom * src.pitch);

				pHorizontalFilter(params);
			}
		} else {
			// sweeping downward: slide the window down
			if (windowbottom < iv-1)
				windowbottom = iv-1;

			while(windowbottom < iv_bottom) {
				std::swap(pTempRow1, pTempRow2);

				++windowbottom;

				params.dst = (char *)pTempRow2 + bpp * params.xprecopy;
				params.src = vdptroffset(srcp, windowbottom * src.pitch);

				pHorizontalFilter(params);
			}
		}

		// blend the two cached rows into the destination; a clamped
		// (degenerate) footprint replicates a single row with fraction 0
		if (iv == iv_bottom)
			if (dvdy < 0)
				pVerticalFilter(dstp, pTempRow1, pTempRow1, xtotal, 0);
			else
				pVerticalFilter(dstp, pTempRow2, pTempRow2, xtotal, 0);
		else
			pVerticalFilter(dstp, pTempRow1, pTempRow2, xtotal, (uint32)v64);

		v64 += dvdy;
		dstp = (char *)dstp + dst.pitch;
	} while(++y1i < y2i);

	return true;
}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/tables.cpp b/src/thirdparty/VirtualDub/Kasumi/source/tables.cpp
new file mode 100644
index 000000000..bf1987500
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/tables.cpp
@@ -0,0 +1,204 @@
+// Automatically generated by Asuka 'maketables'. DO NOT EDIT!
+
+#include <vd2/system/vdtypes.h>
+
+extern "C" const sint32 kVDCubicInterpTableFX14_075[256][4]={
+ { 0, 16384, 0, 0 }, { -48, 16384, 48, 0 }, { -95, 16383, 97, -1 }, { -141, 16380, 147, -2 },
+ { -186, 16375, 198, -3 }, { -231, 16371, 249, -5 }, { -275, 16365, 301, -7 }, { -318, 16357, 354, -9 },
+ { -360, 16349, 407, -12 }, { -402, 16340, 461, -15 }, { -443, 16329, 516, -18 }, { -484, 16318, 572, -22 },
+ { -523, 16305, 628, -26 }, { -562, 16291, 685, -30 }, { -601, 16278, 742, -35 }, { -638, 16262, 800, -40 },
+ { -675, 16245, 859, -45 }, { -711, 16228, 918, -51 }, { -747, 16209, 978, -56 }, { -782, 16190, 1039, -63 },
+ { -816, 16169, 1100, -69 }, { -849, 16147, 1162, -76 }, { -882, 16124, 1225, -83 }, { -915, 16101, 1288, -90 },
+ { -946, 16077, 1351, -98 }, { -977, 16052, 1415, -106 }, { -1007, 16025, 1480, -114 }, { -1037, 15998, 1545, -122 },
+ { -1066, 15970, 1611, -131 }, { -1094, 15940, 1678, -140 }, { -1122, 15910, 1745, -149 }, { -1149, 15879, 1812, -158 },
+ { -1176, 15848, 1880, -168 }, { -1202, 15815, 1949, -178 }, { -1227, 15781, 2018, -188 }, { -1252, 15747, 2087, -198 },
+ { -1276, 15712, 2157, -209 }, { -1300, 15676, 2228, -220 }, { -1323, 15639, 2299, -231 }, { -1345, 15601, 2370, -242 },
+ { -1367, 15562, 2442, -253 }, { -1388, 15523, 2514, -265 }, { -1409, 15482, 2587, -276 }, { -1429, 15441, 2660, -288 },
+ { -1448, 15399, 2734, -301 }, { -1467, 15356, 2808, -313 }, { -1486, 15312, 2883, -325 }, { -1504, 15268, 2958, -338 },
+ { -1521, 15223, 3033, -351 }, { -1538, 15177, 3109, -364 }, { -1554, 15130, 3185, -377 }, { -1570, 15084, 3261, -391 },
+ { -1585, 15035, 3338, -404 }, { -1600, 14986, 3416, -418 }, { -1614, 14936, 3493, -431 }, { -1627, 14885, 3571, -445 },
+ { -1641, 14834, 3650, -459 }, { -1653, 14783, 3728, -474 }, { -1665, 14730, 3807, -488 }, { -1677, 14676, 3887, -502 },
+ { -1688, 14623, 3966, -517 }, { -1699, 14568, 4046, -531 }, { -1709, 14512, 4127, -546 }, { -1719, 14457, 4207, -561 },
+ { -1728, 14400, 4288, -576 }, { -1737, 14343, 4369, -591 }, { -1745, 14284, 4451, -606 }, { -1753, 14226, 4532, -621 },
+ { -1760, 14167, 4614, -637 }, { -1767, 14107, 4696, -652 }, { -1774, 14047, 4779, -668 }, { -1780, 13986, 4861, -683 },
+ { -1785, 13924, 4944, -699 }, { -1791, 13861, 5028, -714 }, { -1795, 13798, 5111, -730 }, { -1800, 13736, 5194, -746 },
+ { -1804, 13671, 5278, -761 }, { -1807, 13606, 5362, -777 }, { -1810, 13541, 5446, -793 }, { -1813, 13475, 5531, -809 },
+ { -1815, 13409, 5615, -825 }, { -1817, 13342, 5700, -841 }, { -1818, 13275, 5784, -857 }, { -1819, 13207, 5869, -873 },
+ { -1820, 13139, 5954, -889 }, { -1820, 13069, 6040, -905 }, { -1820, 13000, 6125, -921 }, { -1820, 12930, 6211, -937 },
+ { -1819, 12860, 6296, -953 }, { -1818, 12789, 6382, -969 }, { -1816, 12717, 6468, -985 }, { -1815, 12647, 6553, -1001 },
+ { -1812, 12574, 6639, -1017 }, { -1810, 12502, 6725, -1033 }, { -1807, 12427, 6812, -1048 }, { -1804, 12354, 6898, -1064 },
+ { -1800, 12280, 6984, -1080 }, { -1796, 12206, 7070, -1096 }, { -1792, 12130, 7157, -1111 }, { -1787, 12055, 7243, -1127 },
+ { -1782, 11980, 7329, -1143 }, { -1777, 11903, 7416, -1158 }, { -1772, 11827, 7502, -1173 }, { -1766, 11751, 7588, -1189 },
+ { -1760, 11673, 7675, -1204 }, { -1753, 11595, 7761, -1219 }, { -1747, 11517, 7848, -1234 }, { -1740, 11439, 7934, -1249 },
+ { -1733, 11361, 8020, -1264 }, { -1725, 11281, 8107, -1279 }, { -1717, 11202, 8193, -1294 }, { -1709, 11123, 8279, -1309 },
+ { -1701, 11043, 8365, -1323 }, { -1692, 10962, 8451, -1337 }, { -1684, 10883, 8537, -1352 }, { -1675, 10802, 8623, -1366 },
+ { -1665, 10720, 8709, -1380 }, { -1656, 10640, 8794, -1394 }, { -1646, 10557, 8880, -1407 }, { -1636, 10476, 8965, -1421 },
+ { -1626, 10393, 9051, -1434 }, { -1615, 10311, 9136, -1448 }, { -1604, 10228, 9221, -1461 }, { -1594, 10146, 9306, -1474 },
+ { -1582, 10062, 9391, -1487 }, { -1571, 9979, 9475, -1499 }, { -1560, 9896, 9560, -1512 }, { -1548, 9812, 9644, -1524 },
+ { -1536, 9728, 9728, -1536 }, { -1524, 9644, 9812, -1548 }, { -1512, 9560, 9896, -1560 }, { -1499, 9475, 9979, -1571 },
+ { -1487, 9391, 10062, -1582 }, { -1474, 9306, 10146, -1594 }, { -1461, 9221, 10228, -1604 }, { -1448, 9136, 10311, -1615 },
+ { -1434, 9051, 10393, -1626 }, { -1421, 8965, 10476, -1636 }, { -1407, 8880, 10557, -1646 }, { -1394, 8795, 10639, -1656 },
+ { -1380, 8709, 10720, -1665 }, { -1366, 8624, 10801, -1675 }, { -1352, 8538, 10882, -1684 }, { -1337, 8450, 10963, -1692 },
+ { -1323, 8365, 11043, -1701 }, { -1309, 8279, 11123, -1709 }, { -1294, 8192, 11203, -1717 }, { -1279, 8106, 11282, -1725 },
+ { -1264, 8020, 11361, -1733 }, { -1249, 7934, 11439, -1740 }, { -1234, 7847, 11518, -1747 }, { -1219, 7760, 11596, -1753 },
+ { -1204, 7675, 11673, -1760 }, { -1189, 7589, 11750, -1766 }, { -1173, 7502, 11827, -1772 }, { -1158, 7415, 11904, -1777 },
+ { -1143, 7329, 11980, -1782 }, { -1127, 7243, 12055, -1787 }, { -1111, 7156, 12131, -1792 }, { -1096, 7070, 12206, -1796 },
+ { -1080, 6984, 12280, -1800 }, { -1064, 6898, 12354, -1804 }, { -1048, 6811, 12428, -1807 }, { -1033, 6726, 12501, -1810 },
+ { -1017, 6639, 12574, -1812 }, { -1001, 6554, 12646, -1815 }, { -985, 6467, 12718, -1816 }, { -969, 6382, 12789, -1818 },
+ { -953, 6296, 12860, -1819 }, { -937, 6211, 12930, -1820 }, { -921, 6125, 13000, -1820 }, { -905, 6039, 13070, -1820 },
+ { -889, 5954, 13139, -1820 }, { -873, 5869, 13207, -1819 }, { -857, 5784, 13275, -1818 }, { -841, 5700, 13342, -1817 },
+ { -825, 5615, 13409, -1815 }, { -809, 5531, 13475, -1813 }, { -793, 5446, 13541, -1810 }, { -777, 5362, 13606, -1807 },
+ { -761, 5278, 13671, -1804 }, { -746, 5195, 13735, -1800 }, { -730, 5111, 13798, -1795 }, { -714, 5028, 13861, -1791 },
+ { -699, 4944, 13924, -1785 }, { -683, 4862, 13985, -1780 }, { -668, 4780, 14046, -1774 }, { -652, 4696, 14107, -1767 },
+ { -637, 4614, 14167, -1760 }, { -621, 4532, 14226, -1753 }, { -606, 4450, 14285, -1745 }, { -591, 4369, 14343, -1737 },
+ { -576, 4288, 14400, -1728 }, { -561, 4207, 14457, -1719 }, { -546, 4126, 14513, -1709 }, { -531, 4046, 14568, -1699 },
+ { -517, 3966, 14623, -1688 }, { -502, 3886, 14677, -1677 }, { -488, 3807, 14730, -1665 }, { -474, 3728, 14783, -1653 },
+ { -459, 3650, 14834, -1641 }, { -445, 3570, 14886, -1627 }, { -431, 3493, 14936, -1614 }, { -418, 3416, 14986, -1600 },
+ { -404, 3338, 15035, -1585 }, { -391, 3262, 15083, -1570 }, { -377, 3185, 15130, -1554 }, { -364, 3109, 15177, -1538 },
+ { -351, 3033, 15223, -1521 }, { -338, 2958, 15268, -1504 }, { -325, 2882, 15313, -1486 }, { -313, 2808, 15356, -1467 },
+ { -301, 2734, 15399, -1448 }, { -288, 2660, 15441, -1429 }, { -276, 2587, 15482, -1409 }, { -265, 2514, 15523, -1388 },
+ { -253, 2442, 15562, -1367 }, { -242, 2370, 15601, -1345 }, { -231, 2299, 15639, -1323 }, { -220, 2228, 15676, -1300 },
+ { -209, 2157, 15712, -1276 }, { -198, 2087, 15747, -1252 }, { -188, 2017, 15782, -1227 }, { -178, 1949, 15815, -1202 },
+ { -168, 1880, 15848, -1176 }, { -158, 1811, 15880, -1149 }, { -149, 1744, 15911, -1122 }, { -140, 1677, 15941, -1094 },
+ { -131, 1611, 15970, -1066 }, { -122, 1545, 15998, -1037 }, { -114, 1480, 16025, -1007 }, { -106, 1415, 16052, -977 },
+ { -98, 1351, 16077, -946 }, { -90, 1288, 16101, -915 }, { -83, 1224, 16125, -882 }, { -76, 1162, 16147, -849 },
+ { -69, 1100, 16169, -816 }, { -63, 1040, 16189, -782 }, { -56, 978, 16209, -747 }, { -51, 919, 16227, -711 },
+ { -45, 859, 16245, -675 }, { -40, 800, 16262, -638 }, { -35, 743, 16277, -601 }, { -30, 684, 16292, -562 },
+ { -26, 628, 16305, -523 }, { -22, 572, 16318, -484 }, { -18, 516, 16329, -443 }, { -15, 462, 16339, -402 },
+ { -12, 407, 16349, -360 }, { -9, 354, 16357, -318 }, { -7, 302, 16364, -275 }, { -5, 250, 16370, -231 },
+ { -3, 198, 16375, -186 }, { -2, 148, 16379, -141 }, { -1, 98, 16382, -95 }, { 0, 49, 16383, -48 },
+};
+
+#ifdef _M_IX86
+extern "C" const __declspec(align(16)) sint16 kVDCubicInterpTableFX14_075_MMX[256][8]={
+ { 0, 16384, 0, 16384, 0, 0, 0, 0 }, { -48, 16384, -48, 16384, 48, 0, 48, 0 },
+ { -95, 16383, -95, 16383, 97, -1, 97, -1 }, { -141, 16380, -141, 16380, 147, -2, 147, -2 },
+ { -186, 16375, -186, 16375, 198, -3, 198, -3 }, { -231, 16371, -231, 16371, 249, -5, 249, -5 },
+ { -275, 16365, -275, 16365, 301, -7, 301, -7 }, { -318, 16357, -318, 16357, 354, -9, 354, -9 },
+ { -360, 16349, -360, 16349, 407, -12, 407, -12 }, { -402, 16340, -402, 16340, 461, -15, 461, -15 },
+ { -443, 16329, -443, 16329, 516, -18, 516, -18 }, { -484, 16318, -484, 16318, 572, -22, 572, -22 },
+ { -523, 16305, -523, 16305, 628, -26, 628, -26 }, { -562, 16291, -562, 16291, 685, -30, 685, -30 },
+ { -601, 16278, -601, 16278, 742, -35, 742, -35 }, { -638, 16262, -638, 16262, 800, -40, 800, -40 },
+ { -675, 16245, -675, 16245, 859, -45, 859, -45 }, { -711, 16228, -711, 16228, 918, -51, 918, -51 },
+ { -747, 16209, -747, 16209, 978, -56, 978, -56 }, { -782, 16190, -782, 16190, 1039, -63, 1039, -63 },
+ { -816, 16169, -816, 16169, 1100, -69, 1100, -69 }, { -849, 16147, -849, 16147, 1162, -76, 1162, -76 },
+ { -882, 16124, -882, 16124, 1225, -83, 1225, -83 }, { -915, 16101, -915, 16101, 1288, -90, 1288, -90 },
+ { -946, 16077, -946, 16077, 1351, -98, 1351, -98 }, { -977, 16052, -977, 16052, 1415, -106, 1415, -106 },
+ { -1007, 16025, -1007, 16025, 1480, -114, 1480, -114 }, { -1037, 15998, -1037, 15998, 1545, -122, 1545, -122 },
+ { -1066, 15970, -1066, 15970, 1611, -131, 1611, -131 }, { -1094, 15940, -1094, 15940, 1678, -140, 1678, -140 },
+ { -1122, 15910, -1122, 15910, 1745, -149, 1745, -149 }, { -1149, 15879, -1149, 15879, 1812, -158, 1812, -158 },
+ { -1176, 15848, -1176, 15848, 1880, -168, 1880, -168 }, { -1202, 15815, -1202, 15815, 1949, -178, 1949, -178 },
+ { -1227, 15781, -1227, 15781, 2018, -188, 2018, -188 }, { -1252, 15747, -1252, 15747, 2087, -198, 2087, -198 },
+ { -1276, 15712, -1276, 15712, 2157, -209, 2157, -209 }, { -1300, 15676, -1300, 15676, 2228, -220, 2228, -220 },
+ { -1323, 15639, -1323, 15639, 2299, -231, 2299, -231 }, { -1345, 15601, -1345, 15601, 2370, -242, 2370, -242 },
+ { -1367, 15562, -1367, 15562, 2442, -253, 2442, -253 }, { -1388, 15523, -1388, 15523, 2514, -265, 2514, -265 },
+ { -1409, 15482, -1409, 15482, 2587, -276, 2587, -276 }, { -1429, 15441, -1429, 15441, 2660, -288, 2660, -288 },
+ { -1448, 15399, -1448, 15399, 2734, -301, 2734, -301 }, { -1467, 15356, -1467, 15356, 2808, -313, 2808, -313 },
+ { -1486, 15312, -1486, 15312, 2883, -325, 2883, -325 }, { -1504, 15268, -1504, 15268, 2958, -338, 2958, -338 },
+ { -1521, 15223, -1521, 15223, 3033, -351, 3033, -351 }, { -1538, 15177, -1538, 15177, 3109, -364, 3109, -364 },
+ { -1554, 15130, -1554, 15130, 3185, -377, 3185, -377 }, { -1570, 15084, -1570, 15084, 3261, -391, 3261, -391 },
+ { -1585, 15035, -1585, 15035, 3338, -404, 3338, -404 }, { -1600, 14986, -1600, 14986, 3416, -418, 3416, -418 },
+ { -1614, 14936, -1614, 14936, 3493, -431, 3493, -431 }, { -1627, 14885, -1627, 14885, 3571, -445, 3571, -445 },
+ { -1641, 14834, -1641, 14834, 3650, -459, 3650, -459 }, { -1653, 14783, -1653, 14783, 3728, -474, 3728, -474 },
+ { -1665, 14730, -1665, 14730, 3807, -488, 3807, -488 }, { -1677, 14676, -1677, 14676, 3887, -502, 3887, -502 },
+ { -1688, 14623, -1688, 14623, 3966, -517, 3966, -517 }, { -1699, 14568, -1699, 14568, 4046, -531, 4046, -531 },
+ { -1709, 14512, -1709, 14512, 4127, -546, 4127, -546 }, { -1719, 14457, -1719, 14457, 4207, -561, 4207, -561 },
+ { -1728, 14400, -1728, 14400, 4288, -576, 4288, -576 }, { -1737, 14343, -1737, 14343, 4369, -591, 4369, -591 },
+ { -1745, 14284, -1745, 14284, 4451, -606, 4451, -606 }, { -1753, 14226, -1753, 14226, 4532, -621, 4532, -621 },
+ { -1760, 14167, -1760, 14167, 4614, -637, 4614, -637 }, { -1767, 14107, -1767, 14107, 4696, -652, 4696, -652 },
+ { -1774, 14047, -1774, 14047, 4779, -668, 4779, -668 }, { -1780, 13986, -1780, 13986, 4861, -683, 4861, -683 },
+ { -1785, 13924, -1785, 13924, 4944, -699, 4944, -699 }, { -1791, 13861, -1791, 13861, 5028, -714, 5028, -714 },
+ { -1795, 13798, -1795, 13798, 5111, -730, 5111, -730 }, { -1800, 13736, -1800, 13736, 5194, -746, 5194, -746 },
+ { -1804, 13671, -1804, 13671, 5278, -761, 5278, -761 }, { -1807, 13606, -1807, 13606, 5362, -777, 5362, -777 },
+ { -1810, 13541, -1810, 13541, 5446, -793, 5446, -793 }, { -1813, 13475, -1813, 13475, 5531, -809, 5531, -809 },
+ { -1815, 13409, -1815, 13409, 5615, -825, 5615, -825 }, { -1817, 13342, -1817, 13342, 5700, -841, 5700, -841 },
+ { -1818, 13275, -1818, 13275, 5784, -857, 5784, -857 }, { -1819, 13207, -1819, 13207, 5869, -873, 5869, -873 },
+ { -1820, 13139, -1820, 13139, 5954, -889, 5954, -889 }, { -1820, 13069, -1820, 13069, 6040, -905, 6040, -905 },
+ { -1820, 13000, -1820, 13000, 6125, -921, 6125, -921 }, { -1820, 12930, -1820, 12930, 6211, -937, 6211, -937 },
+ { -1819, 12860, -1819, 12860, 6296, -953, 6296, -953 }, { -1818, 12789, -1818, 12789, 6382, -969, 6382, -969 },
+ { -1816, 12717, -1816, 12717, 6468, -985, 6468, -985 }, { -1815, 12647, -1815, 12647, 6553, -1001, 6553, -1001 },
+ { -1812, 12574, -1812, 12574, 6639, -1017, 6639, -1017 }, { -1810, 12502, -1810, 12502, 6725, -1033, 6725, -1033 },
+ { -1807, 12427, -1807, 12427, 6812, -1048, 6812, -1048 }, { -1804, 12354, -1804, 12354, 6898, -1064, 6898, -1064 },
+ { -1800, 12280, -1800, 12280, 6984, -1080, 6984, -1080 }, { -1796, 12206, -1796, 12206, 7070, -1096, 7070, -1096 },
+ { -1792, 12130, -1792, 12130, 7157, -1111, 7157, -1111 }, { -1787, 12055, -1787, 12055, 7243, -1127, 7243, -1127 },
+ { -1782, 11980, -1782, 11980, 7329, -1143, 7329, -1143 }, { -1777, 11903, -1777, 11903, 7416, -1158, 7416, -1158 },
+ { -1772, 11827, -1772, 11827, 7502, -1173, 7502, -1173 }, { -1766, 11751, -1766, 11751, 7588, -1189, 7588, -1189 },
+ { -1760, 11673, -1760, 11673, 7675, -1204, 7675, -1204 }, { -1753, 11595, -1753, 11595, 7761, -1219, 7761, -1219 },
+ { -1747, 11517, -1747, 11517, 7848, -1234, 7848, -1234 }, { -1740, 11439, -1740, 11439, 7934, -1249, 7934, -1249 },
+ { -1733, 11361, -1733, 11361, 8020, -1264, 8020, -1264 }, { -1725, 11281, -1725, 11281, 8107, -1279, 8107, -1279 },
+ { -1717, 11202, -1717, 11202, 8193, -1294, 8193, -1294 }, { -1709, 11123, -1709, 11123, 8279, -1309, 8279, -1309 },
+ { -1701, 11043, -1701, 11043, 8365, -1323, 8365, -1323 }, { -1692, 10962, -1692, 10962, 8451, -1337, 8451, -1337 },
+ { -1684, 10883, -1684, 10883, 8537, -1352, 8537, -1352 }, { -1675, 10802, -1675, 10802, 8623, -1366, 8623, -1366 },
+ { -1665, 10720, -1665, 10720, 8709, -1380, 8709, -1380 }, { -1656, 10640, -1656, 10640, 8794, -1394, 8794, -1394 },
+ { -1646, 10557, -1646, 10557, 8880, -1407, 8880, -1407 }, { -1636, 10476, -1636, 10476, 8965, -1421, 8965, -1421 },
+ { -1626, 10393, -1626, 10393, 9051, -1434, 9051, -1434 }, { -1615, 10311, -1615, 10311, 9136, -1448, 9136, -1448 },
+ { -1604, 10228, -1604, 10228, 9221, -1461, 9221, -1461 }, { -1594, 10146, -1594, 10146, 9306, -1474, 9306, -1474 },
+ { -1582, 10062, -1582, 10062, 9391, -1487, 9391, -1487 }, { -1571, 9979, -1571, 9979, 9475, -1499, 9475, -1499 },
+ { -1560, 9896, -1560, 9896, 9560, -1512, 9560, -1512 }, { -1548, 9812, -1548, 9812, 9644, -1524, 9644, -1524 },
+ { -1536, 9728, -1536, 9728, 9728, -1536, 9728, -1536 }, { -1524, 9644, -1524, 9644, 9812, -1548, 9812, -1548 },
+ { -1512, 9560, -1512, 9560, 9896, -1560, 9896, -1560 }, { -1499, 9475, -1499, 9475, 9979, -1571, 9979, -1571 },
+ { -1487, 9391, -1487, 9391, 10062, -1582, 10062, -1582 }, { -1474, 9306, -1474, 9306, 10146, -1594, 10146, -1594 },
+ { -1461, 9221, -1461, 9221, 10228, -1604, 10228, -1604 }, { -1448, 9136, -1448, 9136, 10311, -1615, 10311, -1615 },
+ { -1434, 9051, -1434, 9051, 10393, -1626, 10393, -1626 }, { -1421, 8965, -1421, 8965, 10476, -1636, 10476, -1636 },
+ { -1407, 8880, -1407, 8880, 10557, -1646, 10557, -1646 }, { -1394, 8795, -1394, 8795, 10639, -1656, 10639, -1656 },
+ { -1380, 8709, -1380, 8709, 10720, -1665, 10720, -1665 }, { -1366, 8624, -1366, 8624, 10801, -1675, 10801, -1675 },
+ { -1352, 8538, -1352, 8538, 10882, -1684, 10882, -1684 }, { -1337, 8450, -1337, 8450, 10963, -1692, 10963, -1692 },
+ { -1323, 8365, -1323, 8365, 11043, -1701, 11043, -1701 }, { -1309, 8279, -1309, 8279, 11123, -1709, 11123, -1709 },
+ { -1294, 8192, -1294, 8192, 11203, -1717, 11203, -1717 }, { -1279, 8106, -1279, 8106, 11282, -1725, 11282, -1725 },
+ { -1264, 8020, -1264, 8020, 11361, -1733, 11361, -1733 }, { -1249, 7934, -1249, 7934, 11439, -1740, 11439, -1740 },
+ { -1234, 7847, -1234, 7847, 11518, -1747, 11518, -1747 }, { -1219, 7760, -1219, 7760, 11596, -1753, 11596, -1753 },
+ { -1204, 7675, -1204, 7675, 11673, -1760, 11673, -1760 }, { -1189, 7589, -1189, 7589, 11750, -1766, 11750, -1766 },
+ { -1173, 7502, -1173, 7502, 11827, -1772, 11827, -1772 }, { -1158, 7415, -1158, 7415, 11904, -1777, 11904, -1777 },
+ { -1143, 7329, -1143, 7329, 11980, -1782, 11980, -1782 }, { -1127, 7243, -1127, 7243, 12055, -1787, 12055, -1787 },
+ { -1111, 7156, -1111, 7156, 12131, -1792, 12131, -1792 }, { -1096, 7070, -1096, 7070, 12206, -1796, 12206, -1796 },
+ { -1080, 6984, -1080, 6984, 12280, -1800, 12280, -1800 }, { -1064, 6898, -1064, 6898, 12354, -1804, 12354, -1804 },
+ { -1048, 6811, -1048, 6811, 12428, -1807, 12428, -1807 }, { -1033, 6726, -1033, 6726, 12501, -1810, 12501, -1810 },
+ { -1017, 6639, -1017, 6639, 12574, -1812, 12574, -1812 }, { -1001, 6554, -1001, 6554, 12646, -1815, 12646, -1815 },
+ { -985, 6467, -985, 6467, 12718, -1816, 12718, -1816 }, { -969, 6382, -969, 6382, 12789, -1818, 12789, -1818 },
+ { -953, 6296, -953, 6296, 12860, -1819, 12860, -1819 }, { -937, 6211, -937, 6211, 12930, -1820, 12930, -1820 },
+ { -921, 6125, -921, 6125, 13000, -1820, 13000, -1820 }, { -905, 6039, -905, 6039, 13070, -1820, 13070, -1820 },
+ { -889, 5954, -889, 5954, 13139, -1820, 13139, -1820 }, { -873, 5869, -873, 5869, 13207, -1819, 13207, -1819 },
+ { -857, 5784, -857, 5784, 13275, -1818, 13275, -1818 }, { -841, 5700, -841, 5700, 13342, -1817, 13342, -1817 },
+ { -825, 5615, -825, 5615, 13409, -1815, 13409, -1815 }, { -809, 5531, -809, 5531, 13475, -1813, 13475, -1813 },
+ { -793, 5446, -793, 5446, 13541, -1810, 13541, -1810 }, { -777, 5362, -777, 5362, 13606, -1807, 13606, -1807 },
+ { -761, 5278, -761, 5278, 13671, -1804, 13671, -1804 }, { -746, 5195, -746, 5195, 13735, -1800, 13735, -1800 },
+ { -730, 5111, -730, 5111, 13798, -1795, 13798, -1795 }, { -714, 5028, -714, 5028, 13861, -1791, 13861, -1791 },
+ { -699, 4944, -699, 4944, 13924, -1785, 13924, -1785 }, { -683, 4862, -683, 4862, 13985, -1780, 13985, -1780 },
+ { -668, 4780, -668, 4780, 14046, -1774, 14046, -1774 }, { -652, 4696, -652, 4696, 14107, -1767, 14107, -1767 },
+ { -637, 4614, -637, 4614, 14167, -1760, 14167, -1760 }, { -621, 4532, -621, 4532, 14226, -1753, 14226, -1753 },
+ { -606, 4450, -606, 4450, 14285, -1745, 14285, -1745 }, { -591, 4369, -591, 4369, 14343, -1737, 14343, -1737 },
+ { -576, 4288, -576, 4288, 14400, -1728, 14400, -1728 }, { -561, 4207, -561, 4207, 14457, -1719, 14457, -1719 },
+ { -546, 4126, -546, 4126, 14513, -1709, 14513, -1709 }, { -531, 4046, -531, 4046, 14568, -1699, 14568, -1699 },
+ { -517, 3966, -517, 3966, 14623, -1688, 14623, -1688 }, { -502, 3886, -502, 3886, 14677, -1677, 14677, -1677 },
+ { -488, 3807, -488, 3807, 14730, -1665, 14730, -1665 }, { -474, 3728, -474, 3728, 14783, -1653, 14783, -1653 },
+ { -459, 3650, -459, 3650, 14834, -1641, 14834, -1641 }, { -445, 3570, -445, 3570, 14886, -1627, 14886, -1627 },
+ { -431, 3493, -431, 3493, 14936, -1614, 14936, -1614 }, { -418, 3416, -418, 3416, 14986, -1600, 14986, -1600 },
+ { -404, 3338, -404, 3338, 15035, -1585, 15035, -1585 }, { -391, 3262, -391, 3262, 15083, -1570, 15083, -1570 },
+ { -377, 3185, -377, 3185, 15130, -1554, 15130, -1554 }, { -364, 3109, -364, 3109, 15177, -1538, 15177, -1538 },
+ { -351, 3033, -351, 3033, 15223, -1521, 15223, -1521 }, { -338, 2958, -338, 2958, 15268, -1504, 15268, -1504 },
+ { -325, 2882, -325, 2882, 15313, -1486, 15313, -1486 }, { -313, 2808, -313, 2808, 15356, -1467, 15356, -1467 },
+ { -301, 2734, -301, 2734, 15399, -1448, 15399, -1448 }, { -288, 2660, -288, 2660, 15441, -1429, 15441, -1429 },
+ { -276, 2587, -276, 2587, 15482, -1409, 15482, -1409 }, { -265, 2514, -265, 2514, 15523, -1388, 15523, -1388 },
+ { -253, 2442, -253, 2442, 15562, -1367, 15562, -1367 }, { -242, 2370, -242, 2370, 15601, -1345, 15601, -1345 },
+ { -231, 2299, -231, 2299, 15639, -1323, 15639, -1323 }, { -220, 2228, -220, 2228, 15676, -1300, 15676, -1300 },
+ { -209, 2157, -209, 2157, 15712, -1276, 15712, -1276 }, { -198, 2087, -198, 2087, 15747, -1252, 15747, -1252 },
+ { -188, 2017, -188, 2017, 15782, -1227, 15782, -1227 }, { -178, 1949, -178, 1949, 15815, -1202, 15815, -1202 },
+ { -168, 1880, -168, 1880, 15848, -1176, 15848, -1176 }, { -158, 1811, -158, 1811, 15880, -1149, 15880, -1149 },
+ { -149, 1744, -149, 1744, 15911, -1122, 15911, -1122 }, { -140, 1677, -140, 1677, 15941, -1094, 15941, -1094 },
+ { -131, 1611, -131, 1611, 15970, -1066, 15970, -1066 }, { -122, 1545, -122, 1545, 15998, -1037, 15998, -1037 },
+ { -114, 1480, -114, 1480, 16025, -1007, 16025, -1007 }, { -106, 1415, -106, 1415, 16052, -977, 16052, -977 },
+ { -98, 1351, -98, 1351, 16077, -946, 16077, -946 }, { -90, 1288, -90, 1288, 16101, -915, 16101, -915 },
+ { -83, 1224, -83, 1224, 16125, -882, 16125, -882 }, { -76, 1162, -76, 1162, 16147, -849, 16147, -849 },
+ { -69, 1100, -69, 1100, 16169, -816, 16169, -816 }, { -63, 1040, -63, 1040, 16189, -782, 16189, -782 },
+ { -56, 978, -56, 978, 16209, -747, 16209, -747 }, { -51, 919, -51, 919, 16227, -711, 16227, -711 },
+ { -45, 859, -45, 859, 16245, -675, 16245, -675 }, { -40, 800, -40, 800, 16262, -638, 16262, -638 },
+ { -35, 743, -35, 743, 16277, -601, 16277, -601 }, { -30, 684, -30, 684, 16292, -562, 16292, -562 },
+ { -26, 628, -26, 628, 16305, -523, 16305, -523 }, { -22, 572, -22, 572, 16318, -484, 16318, -484 },
+ { -18, 516, -18, 516, 16329, -443, 16329, -443 }, { -15, 462, -15, 462, 16339, -402, 16339, -402 },
+ { -12, 407, -12, 407, 16349, -360, 16349, -360 }, { -9, 354, -9, 354, 16357, -318, 16357, -318 },
+ { -7, 302, -7, 302, 16364, -275, 16364, -275 }, { -5, 250, -5, 250, 16370, -231, 16370, -231 },
+ { -3, 198, -3, 198, 16375, -186, 16375, -186 }, { -2, 148, -2, 148, 16379, -141, 16379, -141 },
+ { -1, 98, -1, 98, 16382, -95, 16382, -95 }, { 0, 49, 0, 49, 16383, -48, 16383, -48 },
+};
+
+#endif
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/triblt.cpp b/src/thirdparty/VirtualDub/Kasumi/source/triblt.cpp
new file mode 100644
index 000000000..8fe16138a
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/triblt.cpp
@@ -0,0 +1,1717 @@
+// VirtualDub - Video processing and capture application
+// Graphics support library
+// Copyright (C) 1998-2008 Avery Lee
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+#include <math.h>
+#include <vector>
+#include <vd2/system/math.h>
+#include <vd2/system/cpuaccel.h>
+#include <vd2/system/vdalloc.h>
+#include <vd2/Kasumi/pixmap.h>
+#include <vd2/Kasumi/pixmaputils.h>
+#include <vd2/Kasumi/pixmapops.h>
+#include <vd2/Kasumi/resample.h>
+#include <vd2/Kasumi/tables.h>
+#include <vd2/Kasumi/triblt.h>
+
+namespace {
+ uint32 lerp_RGB888(sint32 a, sint32 b, sint32 x) {
+ sint32 a_rb = a & 0xff00ff;
+ sint32 a_g = a & 0x00ff00;
+ sint32 b_rb = b & 0xff00ff;
+ sint32 b_g = b & 0x00ff00;
+
+ const uint32 top_rb = (a_rb + (((b_rb - a_rb)*x + 0x00800080) >> 8)) & 0xff00ff;
+ const uint32 top_g = (a_g + (((b_g - a_g )*x + 0x00008000) >> 8)) & 0x00ff00;
+
+ return top_rb + top_g;
+ }
+
+ uint32 bilerp_RGB888(sint32 a, sint32 b, sint32 c, sint32 d, sint32 x, sint32 y) {
+ sint32 a_rb = a & 0xff00ff;
+ sint32 a_g = a & 0x00ff00;
+ sint32 b_rb = b & 0xff00ff;
+ sint32 b_g = b & 0x00ff00;
+ sint32 c_rb = c & 0xff00ff;
+ sint32 c_g = c & 0x00ff00;
+ sint32 d_rb = d & 0xff00ff;
+ sint32 d_g = d & 0x00ff00;
+
+ const uint32 top_rb = (a_rb + (((b_rb - a_rb)*x + 0x00800080) >> 8)) & 0xff00ff;
+ const uint32 top_g = (a_g + (((b_g - a_g )*x + 0x00008000) >> 8)) & 0x00ff00;
+ const uint32 bot_rb = (c_rb + (((d_rb - c_rb)*x + 0x00800080) >> 8)) & 0xff00ff;
+ const uint32 bot_g = (c_g + (((d_g - c_g )*x + 0x00008000) >> 8)) & 0x00ff00;
+
+ const uint32 final_rb = (top_rb + (((bot_rb - top_rb)*y) >> 8)) & 0xff00ff;
+ const uint32 final_g = (top_g + (((bot_g - top_g )*y) >> 8)) & 0x00ff00;
+
+ return final_rb + final_g;
+ }
+
+ uint32 bicubic_RGB888(const uint32 *src0, const uint32 *src1, const uint32 *src2, const uint32 *src3, sint32 x, sint32 y) {
+ const uint32 p00 = src0[0];
+ const uint32 p01 = src0[1];
+ const uint32 p02 = src0[2];
+ const uint32 p03 = src0[3];
+ const uint32 p10 = src1[0];
+ const uint32 p11 = src1[1];
+ const uint32 p12 = src1[2];
+ const uint32 p13 = src1[3];
+ const uint32 p20 = src2[0];
+ const uint32 p21 = src2[1];
+ const uint32 p22 = src2[2];
+ const uint32 p23 = src2[3];
+ const uint32 p30 = src3[0];
+ const uint32 p31 = src3[1];
+ const uint32 p32 = src3[2];
+ const uint32 p33 = src3[3];
+
+ const sint32 *htab = kVDCubicInterpTableFX14_075[x];
+ const sint32 *vtab = kVDCubicInterpTableFX14_075[y];
+
+ const int ch0 = htab[0];
+ const int ch1 = htab[1];
+ const int ch2 = htab[2];
+ const int ch3 = htab[3];
+ const int cv0 = vtab[0];
+ const int cv1 = vtab[1];
+ const int cv2 = vtab[2];
+ const int cv3 = vtab[3];
+
+ int r0 = ((int)((p00>>16)&0xff) * ch0 + (int)((p01>>16)&0xff) * ch1 + (int)((p02>>16)&0xff) * ch2 + (int)((p03>>16)&0xff) * ch3 + 128) >> 8;
+ int g0 = ((int)((p00>> 8)&0xff) * ch0 + (int)((p01>> 8)&0xff) * ch1 + (int)((p02>> 8)&0xff) * ch2 + (int)((p03>> 8)&0xff) * ch3 + 128) >> 8;
+ int b0 = ((int)((p00 )&0xff) * ch0 + (int)((p01 )&0xff) * ch1 + (int)((p02 )&0xff) * ch2 + (int)((p03 )&0xff) * ch3 + 128) >> 8;
+ int r1 = ((int)((p10>>16)&0xff) * ch0 + (int)((p11>>16)&0xff) * ch1 + (int)((p12>>16)&0xff) * ch2 + (int)((p13>>16)&0xff) * ch3 + 128) >> 8;
+ int g1 = ((int)((p10>> 8)&0xff) * ch0 + (int)((p11>> 8)&0xff) * ch1 + (int)((p12>> 8)&0xff) * ch2 + (int)((p13>> 8)&0xff) * ch3 + 128) >> 8;
+ int b1 = ((int)((p10 )&0xff) * ch0 + (int)((p11 )&0xff) * ch1 + (int)((p12 )&0xff) * ch2 + (int)((p13 )&0xff) * ch3 + 128) >> 8;
+ int r2 = ((int)((p20>>16)&0xff) * ch0 + (int)((p21>>16)&0xff) * ch1 + (int)((p22>>16)&0xff) * ch2 + (int)((p23>>16)&0xff) * ch3 + 128) >> 8;
+ int g2 = ((int)((p20>> 8)&0xff) * ch0 + (int)((p21>> 8)&0xff) * ch1 + (int)((p22>> 8)&0xff) * ch2 + (int)((p23>> 8)&0xff) * ch3 + 128) >> 8;
+ int b2 = ((int)((p20 )&0xff) * ch0 + (int)((p21 )&0xff) * ch1 + (int)((p22 )&0xff) * ch2 + (int)((p23 )&0xff) * ch3 + 128) >> 8;
+ int r3 = ((int)((p30>>16)&0xff) * ch0 + (int)((p31>>16)&0xff) * ch1 + (int)((p32>>16)&0xff) * ch2 + (int)((p33>>16)&0xff) * ch3 + 128) >> 8;
+ int g3 = ((int)((p30>> 8)&0xff) * ch0 + (int)((p31>> 8)&0xff) * ch1 + (int)((p32>> 8)&0xff) * ch2 + (int)((p33>> 8)&0xff) * ch3 + 128) >> 8;
+ int b3 = ((int)((p30 )&0xff) * ch0 + (int)((p31 )&0xff) * ch1 + (int)((p32 )&0xff) * ch2 + (int)((p33 )&0xff) * ch3 + 128) >> 8;
+
+ int r = (r0 * cv0 + r1 * cv1 + r2 * cv2 + r3 * cv3 + (1<<19)) >> 20;
+ int g = (g0 * cv0 + g1 * cv1 + g2 * cv2 + g3 * cv3 + (1<<19)) >> 20;
+ int b = (b0 * cv0 + b1 * cv1 + b2 * cv2 + b3 * cv3 + (1<<19)) >> 20;
+
+ if (r<0) r=0; else if (r>255) r=255;
+ if (g<0) g=0; else if (g>255) g=255;
+ if (b<0) b=0; else if (b>255) b=255;
+
+ return (r<<16) + (g<<8) + b;
+ }
+}
+
+namespace {
+ enum {
+ kTop = 1,
+ kBottom = 2,
+ kLeft = 4,
+ kRight = 8,
+ kNear = 16,
+ kFar = 32
+ };
+
+ struct VDTriBltMipInfo {
+ const uint32 *mip;
+ ptrdiff_t pitch;
+ uint32 uvmul, _pad;
+ };
+
+ struct VDTriBltInfo {
+ VDTriBltMipInfo mips[16];
+ uint32 *dst;
+ const uint32 *src;
+ sint32 width;
+ const int *cubictab;
+ };
+
+ struct VDTriBltGenInfo {
+ float u;
+ float v;
+ float rhw;
+ float dudx;
+ float dvdx;
+ float drhwdx;
+ };
+
+ typedef void (*VDTriBltSpanFunction)(const VDTriBltInfo *);
+ typedef void (*VDTriBltGenFunction)(const VDTriBltGenInfo *);
+
+ void vd_triblt_span_point(const VDTriBltInfo *pInfo) {
+ sint32 w = -pInfo->width;
+ uint32 *dst = pInfo->dst + pInfo->width;
+ const uint32 *src = pInfo->src;
+ const uint32 *texture = pInfo->mips[0].mip;
+ const ptrdiff_t texpitch = pInfo->mips[0].pitch;
+
+ do {
+ dst[w] = vdptroffset(texture, texpitch * src[1])[src[0]];
+ src += 2;
+ } while(++w);
+ }
+
+ void vd_triblt_span_bilinear(const VDTriBltInfo *pInfo) {
+ sint32 w = -pInfo->width;
+ uint32 *dst = pInfo->dst + pInfo->width;
+ const uint32 *src = pInfo->src;
+ const uint32 *texture = pInfo->mips[0].mip;
+ const ptrdiff_t texpitch = pInfo->mips[0].pitch;
+
+ do {
+ const sint32 u = src[0];
+ const sint32 v = src[1];
+ src += 2;
+ const uint32 *src1 = vdptroffset(texture, texpitch * (v>>8)) + (u>>8);
+ const uint32 *src2 = vdptroffset(src1, texpitch);
+
+ dst[w] = bilerp_RGB888(src1[0], src1[1], src2[0], src2[1], u&255, v&255);
+ } while(++w);
+ }
+
+ void vd_triblt_span_trilinear(const VDTriBltInfo *pInfo) {
+ sint32 w = -pInfo->width;
+ uint32 *dst = pInfo->dst + pInfo->width;
+ const uint32 *src = pInfo->src;
+
+ do {
+ sint32 u = src[0];
+ sint32 v = src[1];
+ const sint32 lambda = src[2];
+ src += 3;
+
+ const sint32 lod = lambda >> 8;
+
+ const uint32 *texture1 = pInfo->mips[lod].mip;
+ const ptrdiff_t texpitch1 = pInfo->mips[lod].pitch;
+ const uint32 *texture2 = pInfo->mips[lod+1].mip;
+ const ptrdiff_t texpitch2 = pInfo->mips[lod+1].pitch;
+
+ u >>= lod;
+ v >>= lod;
+
+ u += 128;
+ v += 128;
+
+ const uint32 *src1 = vdptroffset(texture1, texpitch1 * (v>>8)) + (u>>8);
+ const uint32 *src2 = vdptroffset(src1, texpitch1);
+ const uint32 p1 = bilerp_RGB888(src1[0], src1[1], src2[0], src2[1], u&255, v&255);
+
+ u += 128;
+ v += 128;
+
+ const uint32 *src3 = vdptroffset(texture2, texpitch2 * (v>>9)) + (u>>9);
+ const uint32 *src4 = vdptroffset(src3, texpitch2);
+ const uint32 p2 = bilerp_RGB888(src3[0], src3[1], src4[0], src4[1], (u>>1)&255, (v>>1)&255);
+
+ dst[w] = lerp_RGB888(p1, p2, lambda & 255);
+ } while(++w);
+ }
+
+ void vd_triblt_span_bicubic_mip_linear(const VDTriBltInfo *pInfo) {
+ sint32 w = -pInfo->width;
+ uint32 *dst = pInfo->dst + pInfo->width;
+ const uint32 *src = pInfo->src;
+
+ do {
+ sint32 u = src[0];
+ sint32 v = src[1];
+ const sint32 lambda = src[2];
+ src += 3;
+
+ const sint32 lod = lambda >> 8;
+
+ const uint32 *texture1 = pInfo->mips[lod].mip;
+ const ptrdiff_t texpitch1 = pInfo->mips[lod].pitch;
+ const uint32 *texture2 = pInfo->mips[lod+1].mip;
+ const ptrdiff_t texpitch2 = pInfo->mips[lod+1].pitch;
+
+ u >>= lod;
+ v >>= lod;
+
+ u += 128;
+ v += 128;
+
+ const uint32 *src1 = vdptroffset(texture1, texpitch1 * (v>>8)) + (u>>8);
+ const uint32 *src2 = vdptroffset(src1, texpitch1);
+ const uint32 *src3 = vdptroffset(src2, texpitch1);
+ const uint32 *src4 = vdptroffset(src3, texpitch1);
+ const uint32 p1 = bicubic_RGB888(src1, src2, src3, src4, u&255, v&255);
+
+ u += 128;
+ v += 128;
+
+ const uint32 *src5 = vdptroffset(texture2, texpitch2 * (v>>9)) + (u>>9);
+ const uint32 *src6 = vdptroffset(src5, texpitch2);
+ const uint32 *src7 = vdptroffset(src6, texpitch2);
+ const uint32 *src8 = vdptroffset(src7, texpitch2);
+ const uint32 p2 = bicubic_RGB888(src5, src6, src7, src8, (u>>1)&255, (v>>1)&255);
+
+ dst[w] = lerp_RGB888(p1, p2, lambda & 255);
+ } while(++w);
+ }
+
+#ifdef _M_IX86
+ extern "C" void vdasm_triblt_span_bilinear_mmx(const VDTriBltInfo *pInfo);
+ extern "C" void vdasm_triblt_span_trilinear_mmx(const VDTriBltInfo *pInfo);
+ extern "C" void vdasm_triblt_span_bicubic_mip_linear_mmx(const VDTriBltInfo *pInfo);
+ extern "C" void vdasm_triblt_span_bicubic_mip_linear_sse2(const VDTriBltInfo *pInfo);
+ extern "C" void vdasm_triblt_span_point(const VDTriBltInfo *pInfo);
+#endif
+
+ struct VDTriBltTransformedVertex {
+ float x, y, z;
+ union {
+ float w;
+ float rhw;
+ };
+ float r, g, b, a;
+ float u, v;
+ int outcode;
+
+ void interp(const VDTriBltTransformedVertex *v1, const VDTriBltTransformedVertex *v2, float alpha) {
+ x = v1->x + alpha * (v2->x - v1->x);
+ y = v1->y + alpha * (v2->y - v1->y);
+ z = v1->z + alpha * (v2->z - v1->z);
+ w = v1->w + alpha * (v2->w - v1->w);
+
+ r = v1->r + alpha * (v2->r - v1->r);
+ g = v1->g + alpha * (v2->g - v1->g);
+ b = v1->b + alpha * (v2->b - v1->b);
+ a = v1->a + alpha * (v2->a - v1->a);
+
+ u = v1->u + alpha * (v2->u - v1->u);
+ v = v1->v + alpha * (v2->v - v1->v);
+
+ outcode = (x < -w ? kLeft : 0)
+ + (x > +w ? kRight : 0)
+ + (y < -w ? kTop : 0)
+ + (y > +w ? kBottom : 0)
+ + (z < -w ? kNear : 0)
+ + (z > +w ? kFar : 0);
+ }
+ };
+
+ void TransformVerts(VDTriBltTransformedVertex *dst, const VDTriBltVertex *src, int nVerts, const float xform[16]) {
+ const float xflocal[16]={
+ xform[ 0], xform[ 1], xform[ 2], xform[ 3],
+ xform[ 4], xform[ 5], xform[ 6], xform[ 7],
+ xform[ 8], xform[ 9], xform[10], xform[11],
+ xform[12], xform[13], xform[14], xform[15],
+ };
+
+ if (nVerts <= 0)
+ return;
+
+ do {
+ const float x0 = src->x;
+ const float y0 = src->y;
+ const float z0 = src->z;
+
+ const float w = x0*xflocal[12] + y0*xflocal[13] + z0*xflocal[14] + xflocal[15];
+ const float x = x0*xflocal[ 0] + y0*xflocal[ 1] + z0*xflocal[ 2] + xflocal[ 3];
+ const float y = x0*xflocal[ 4] + y0*xflocal[ 5] + z0*xflocal[ 6] + xflocal[ 7];
+ const float z = x0*xflocal[ 8] + y0*xflocal[ 9] + z0*xflocal[10] + xflocal[11];
+
+ int outcode = 0;
+
+ if (x < -w) outcode += kLeft;
+ if (x > w) outcode += kRight;
+ if (y < -w) outcode += kTop;
+ if (y > w) outcode += kBottom;
+ if (z < -w) outcode += kNear;
+ if (z > w) outcode += kFar;
+
+ dst->x = x;
+ dst->y = y;
+ dst->z = z;
+ dst->w = w;
+ dst->u = src->u;
+ dst->v = src->v;
+ dst->r = 1.0f;
+ dst->g = 1.0f;
+ dst->b = 1.0f;
+ dst->a = 1.0f;
+ dst->outcode = outcode;
+
+ ++src;
+ ++dst;
+ } while(--nVerts);
+ }
+
+ void TransformVerts(VDTriBltTransformedVertex *dst, const VDTriColorVertex *src, int nVerts, const float xform[16]) {
+ const float xflocal[16]={
+ xform[ 0], xform[ 1], xform[ 2], xform[ 3],
+ xform[ 4], xform[ 5], xform[ 6], xform[ 7],
+ xform[ 8], xform[ 9], xform[10], xform[11],
+ xform[12], xform[13], xform[14], xform[15],
+ };
+
+ if (nVerts <= 0)
+ return;
+
+ do {
+ const float x0 = src->x;
+ const float y0 = src->y;
+ const float z0 = src->z;
+
+ const float w = x0*xflocal[12] + y0*xflocal[13] + z0*xflocal[14] + xflocal[15];
+ const float x = x0*xflocal[ 0] + y0*xflocal[ 1] + z0*xflocal[ 2] + xflocal[ 3];
+ const float y = x0*xflocal[ 4] + y0*xflocal[ 5] + z0*xflocal[ 6] + xflocal[ 7];
+ const float z = x0*xflocal[ 8] + y0*xflocal[ 9] + z0*xflocal[10] + xflocal[11];
+
+ int outcode = 0;
+
+ if (x < -w) outcode += kLeft;
+ if (x > w) outcode += kRight;
+ if (y < -w) outcode += kTop;
+ if (y > w) outcode += kBottom;
+ if (z < -w) outcode += kNear;
+ if (z > w) outcode += kFar;
+
+ dst->x = x;
+ dst->y = y;
+ dst->z = z;
+ dst->w = w;
+ dst->u = 0.0f;
+ dst->v = 0.0f;
+ dst->r = src->r;
+ dst->g = src->g;
+ dst->b = src->b;
+ dst->a = src->a;
+ dst->outcode = outcode;
+
+ ++src;
+ ++dst;
+ } while(--nVerts);
+ }
+
+ struct VDTriangleSetupInfo {
+ const VDTriBltTransformedVertex *pt, *pr, *pl;
+ VDTriBltTransformedVertex tmp0, tmp1, tmp2;
+ };
+
+ void SetupTri(
+ VDTriangleSetupInfo& setup,
+ VDPixmap& dst,
+ const VDTriBltTransformedVertex *vx0,
+ const VDTriBltTransformedVertex *vx1,
+ const VDTriBltTransformedVertex *vx2,
+ const VDTriBltFilterMode *filterMode
+ )
+ {
+ setup.tmp0 = *vx0;
+ setup.tmp1 = *vx1;
+ setup.tmp2 = *vx2;
+
+ // adjust UVs for filter mode
+ if (filterMode) {
+ switch(*filterMode) {
+ case kTriBltFilterBilinear:
+ setup.tmp0.u += 0.5f;
+ setup.tmp0.v += 0.5f;
+ setup.tmp1.u += 0.5f;
+ setup.tmp1.v += 0.5f;
+ setup.tmp2.u += 0.5f;
+ setup.tmp2.v += 0.5f;
+ case kTriBltFilterTrilinear:
+ case kTriBltFilterBicubicMipLinear:
+ setup.tmp0.u *= 256.0f;
+ setup.tmp0.v *= 256.0f;
+ setup.tmp1.u *= 256.0f;
+ setup.tmp1.v *= 256.0f;
+ setup.tmp2.u *= 256.0f;
+ setup.tmp2.v *= 256.0f;
+ break;
+ case kTriBltFilterPoint:
+ setup.tmp0.u += 1.0f;
+ setup.tmp0.v += 1.0f;
+ setup.tmp1.u += 1.0f;
+ setup.tmp1.v += 1.0f;
+ setup.tmp2.u += 1.0f;
+ setup.tmp2.v += 1.0f;
+ break;
+ }
+ }
+
+ // do perspective divide and NDC space conversion
+ const float xscale = dst.w * 0.5f;
+ const float yscale = dst.h * 0.5f;
+
+ setup.tmp0.rhw = 1.0f / setup.tmp0.w;
+ setup.tmp0.x = (1.0f+setup.tmp0.x*setup.tmp0.rhw)*xscale;
+ setup.tmp0.y = (1.0f+setup.tmp0.y*setup.tmp0.rhw)*yscale;
+ setup.tmp0.u *= setup.tmp0.rhw;
+ setup.tmp0.v *= setup.tmp0.rhw;
+ setup.tmp0.r *= setup.tmp0.rhw;
+ setup.tmp0.g *= setup.tmp0.rhw;
+ setup.tmp0.b *= setup.tmp0.rhw;
+ setup.tmp0.a *= setup.tmp0.rhw;
+ setup.tmp1.rhw = 1.0f / setup.tmp1.w;
+ setup.tmp1.x = (1.0f+setup.tmp1.x*setup.tmp1.rhw)*xscale;
+ setup.tmp1.y = (1.0f+setup.tmp1.y*setup.tmp1.rhw)*yscale;
+ setup.tmp1.u *= setup.tmp1.rhw;
+ setup.tmp1.v *= setup.tmp1.rhw;
+ setup.tmp1.r *= setup.tmp1.rhw;
+ setup.tmp1.g *= setup.tmp1.rhw;
+ setup.tmp1.b *= setup.tmp1.rhw;
+ setup.tmp1.a *= setup.tmp1.rhw;
+ setup.tmp2.rhw = 1.0f / setup.tmp2.w;
+ setup.tmp2.x = (1.0f+setup.tmp2.x*setup.tmp2.rhw)*xscale;
+ setup.tmp2.y = (1.0f+setup.tmp2.y*setup.tmp2.rhw)*yscale;
+ setup.tmp2.u *= setup.tmp2.rhw;
+ setup.tmp2.v *= setup.tmp2.rhw;
+ setup.tmp2.r *= setup.tmp2.rhw;
+ setup.tmp2.g *= setup.tmp2.rhw;
+ setup.tmp2.b *= setup.tmp2.rhw;
+ setup.tmp2.a *= setup.tmp2.rhw;
+
+ // verify clipping
+ VDASSERT(setup.tmp0.x >= 0 && setup.tmp0.x <= dst.w);
+ VDASSERT(setup.tmp1.x >= 0 && setup.tmp1.x <= dst.w);
+ VDASSERT(setup.tmp2.x >= 0 && setup.tmp2.x <= dst.w);
+ VDASSERT(setup.tmp0.y >= 0 && setup.tmp0.y <= dst.h);
+ VDASSERT(setup.tmp1.y >= 0 && setup.tmp1.y <= dst.h);
+ VDASSERT(setup.tmp2.y >= 0 && setup.tmp2.y <= dst.h);
+
+ vx0 = &setup.tmp0;
+ vx1 = &setup.tmp1;
+ vx2 = &setup.tmp2;
+
+ const VDTriBltTransformedVertex *pt, *pl, *pr;
+
+ // sort points
+ if (vx0->y < vx1->y) // 1 < 2
+ if (vx0->y < vx2->y) { // 1 < 2,3
+ pt = vx0;
+ pr = vx1;
+ pl = vx2;
+ } else { // 3 < 1 < 2
+ pt = vx2;
+ pr = vx0;
+ pl = vx1;
+ }
+ else // 2 < 1
+ if (vx1->y < vx2->y) { // 2 < 1,3
+ pt = vx1;
+ pr = vx2;
+ pl = vx0;
+ } else { // 3 < 2 < 1
+ pt = vx2;
+ pr = vx0;
+ pl = vx1;
+ }
+
+ setup.pl = pl;
+ setup.pt = pt;
+ setup.pr = pr;
+ }
+
+ void RenderTri(VDPixmap& dst, const VDPixmap *const *pSources, int nMipmaps,
+ const VDTriBltTransformedVertex *vx0,
+ const VDTriBltTransformedVertex *vx1,
+ const VDTriBltTransformedVertex *vx2,
+ VDTriBltFilterMode filterMode,
+ float mipMapLODBias)
+ {
+ VDTriangleSetupInfo setup;
+
+ SetupTri(setup, dst, vx0, vx1, vx2, &filterMode);
+
+ const VDTriBltTransformedVertex *pt = setup.pt, *pl = setup.pl, *pr = setup.pr;
+
+ const float x10 = pl->x - pt->x;
+ const float x20 = pr->x - pt->x;
+ const float y10 = pl->y - pt->y;
+ const float y20 = pr->y - pt->y;
+ const float A = x20*y10 - x10*y20;
+
+ if (A <= 0.f)
+ return;
+
+ float invA = 0.f;
+ if (A >= 1e-5f)
+ invA = 1.0f / A;
+
+ float x10_A = x10 * invA;
+ float x20_A = x20 * invA;
+ float y10_A = y10 * invA;
+ float y20_A = y20 * invA;
+
+ float u10 = pl->u - pt->u;
+ float u20 = pr->u - pt->u;
+ float v10 = pl->v - pt->v;
+ float v20 = pr->v - pt->v;
+ float rhw10 = pl->rhw - pt->rhw;
+ float rhw20 = pr->rhw - pt->rhw;
+
+ float dudx = u20*y10_A - u10*y20_A;
+ float dudy = u10*x20_A - u20*x10_A;
+ float dvdx = v20*y10_A - v10*y20_A;
+ float dvdy = v10*x20_A - v20*x10_A;
+ float drhwdx = rhw20*y10_A - rhw10*y20_A;
+ float drhwdy = rhw10*x20_A - rhw20*x10_A;
+
+ // Compute edge walking parameters
+
+ float dxl1=0, dxr1=0, dul1=0, dvl1=0, drhwl1=0;
+ float dxl2=0, dxr2=0, dul2=0, dvl2=0, drhwl2=0;
+
+ // Compute left-edge interpolation parameters for first half.
+
+ if (pl->y != pt->y) {
+ dxl1 = (pl->x - pt->x) / (pl->y - pt->y);
+
+ dul1 = dudy + dxl1 * dudx;
+ dvl1 = dvdy + dxl1 * dvdx;
+ drhwl1 = drhwdy + dxl1 * drhwdx;
+ }
+
+ // Compute right-edge interpolation parameters for first half.
+
+ if (pr->y != pt->y) {
+ dxr1 = (pr->x - pt->x) / (pr->y - pt->y);
+ }
+
+ // Compute third-edge interpolation parameters.
+
+ if (pr->y != pl->y) {
+ dxl2 = (pr->x - pl->x) / (pr->y - pl->y);
+
+ dul2 = dudy + dxl2 * dudx;
+ dvl2 = dvdy + dxl2 * dvdx;
+ drhwl2 = drhwdy + dxl2 * drhwdx;
+
+ dxr2 = dxl2;
+ }
+
+ // Initialize parameters for first half.
+ //
+ // We place pixel centers at (x+0.5, y+0.5).
+
+ double xl, xr, ul, vl, rhwl, yf;
+ int y, y1, y2;
+
+ // y_start < y+0.5 to include pixel y.
+
+ y = (int)floor(pt->y + 0.5);
+ yf = (y+0.5) - pt->y;
+
+ xl = pt->x + dxl1 * yf;
+ xr = pt->x + dxr1 * yf;
+ ul = pt->u + dul1 * yf;
+ vl = pt->v + dvl1 * yf;
+ rhwl = pt->rhw + drhwl1 * yf;
+
+ // Initialize parameters for second half.
+
+ double xl2, xr2, ul2, vl2, rhwl2;
+
+ if (pl->y > pr->y) { // Left edge is long side
+ dxl2 = dxl1;
+ dul2 = dul1;
+ dvl2 = dvl1;
+ drhwl2 = drhwl1;
+
+ y1 = (int)floor(pr->y + 0.5);
+ y2 = (int)floor(pl->y + 0.5);
+
+ yf = (y1+0.5) - pr->y;
+
+ // Step left edge.
+
+ xl2 = xl + dxl1 * (y1 - y);
+ ul2 = ul + dul1 * (y1 - y);
+ vl2 = vl + dvl1 * (y1 - y);
+ rhwl2 = rhwl + drhwl1 * (y1 - y);
+
+ // Prestep right edge.
+
+ xr2 = pr->x + dxr2 * yf;
+ } else { // Right edge is long side
+ dxr2 = dxr1;
+
+ y1 = (int)floor(pl->y + 0.5);
+ y2 = (int)floor(pr->y + 0.5);
+
+ yf = (y1+0.5) - pl->y;
+
+ // Prestep left edge.
+
+ xl2 = pl->x + dxl2 * yf;
+ ul2 = pl->u + dul2 * yf;
+ vl2 = pl->v + dvl2 * yf;
+ rhwl2 = pl->rhw + drhwl2 * yf;
+
+ // Step right edge.
+
+ xr2 = xr + dxr1 * (y1 - y);
+ }
+
+ // rasterize
+ const ptrdiff_t dstpitch = dst.pitch;
+ uint32 *dstp = (uint32 *)((char *)dst.data + dstpitch * y);
+
+ VDTriBltInfo texinfo;
+ VDTriBltSpanFunction drawSpan;
+ uint32 cpuflags = CPUGetEnabledExtensions();
+
+ bool triBlt16 = false;
+
+ switch(filterMode) {
+ case kTriBltFilterBicubicMipLinear:
+#ifdef _M_IX86
+ if (cpuflags & CPUF_SUPPORTS_SSE2) {
+ drawSpan = vdasm_triblt_span_bicubic_mip_linear_sse2;
+ triBlt16 = true;
+ } else if (cpuflags & CPUF_SUPPORTS_MMX) {
+ drawSpan = vdasm_triblt_span_bicubic_mip_linear_mmx;
+ triBlt16 = true;
+ } else
+#endif
+ drawSpan = vd_triblt_span_bicubic_mip_linear;
+ break;
+ case kTriBltFilterTrilinear:
+#ifdef _M_IX86
+ if (cpuflags & CPUF_SUPPORTS_MMX) {
+ drawSpan = vdasm_triblt_span_trilinear_mmx;
+ triBlt16 = true;
+ } else
+#endif
+ drawSpan = vd_triblt_span_trilinear;
+ break;
+ case kTriBltFilterBilinear:
+#ifdef _M_IX86
+ if (cpuflags & CPUF_SUPPORTS_MMX) {
+ drawSpan = vdasm_triblt_span_bilinear_mmx;
+ triBlt16 = true;
+ } else
+#endif
+ drawSpan = vd_triblt_span_bilinear;
+ break;
+ case kTriBltFilterPoint:
+ drawSpan = vd_triblt_span_point;
+ break;
+ }
+
+ float rhobase = sqrtf(std::max<float>(dudx*dudx + dvdx*dvdx, dudy*dudy + dvdy*dvdy) * (1.0f / 65536.0f)) * powf(2.0f, mipMapLODBias);
+
+ if (triBlt16) {
+ ul *= 256.0f;
+ vl *= 256.0f;
+ ul2 *= 256.0f;
+ vl2 *= 256.0f;
+ dul1 *= 256.0f;
+ dvl1 *= 256.0f;
+ dul2 *= 256.0f;
+ dvl2 *= 256.0f;
+ dudx *= 256.0f;
+ dvdx *= 256.0f;
+ dudy *= 256.0f;
+ dvdy *= 256.0f;
+ }
+
+ int minx1 = (int)floor(std::min<float>(std::min<float>(pl->x, pr->x), pt->x) + 0.5);
+ int maxx2 = (int)floor(std::max<float>(std::max<float>(pl->x, pr->x), pt->x) + 0.5);
+
+ uint32 *const spanptr = new uint32[3 * (maxx2 - minx1)];
+
+ while(y < y2) {
+ if (y == y1) {
+ xl = xl2;
+ xr = xr2;
+ ul = ul2;
+ vl = vl2;
+ rhwl = rhwl2;
+ dxl1 = dxl2;
+ dxr1 = dxr2;
+ dul1 = dul2;
+ dvl1 = dvl2;
+ drhwl1 = drhwl2;
+ }
+
+ int x1, x2;
+ double xf;
+ double u, v, rhw;
+
+ // x_left must be less than (x+0.5) to include pixel x.
+
+ x1 = (int)floor(xl + 0.5);
+ x2 = (int)floor(xr + 0.5);
+ xf = (x1+0.5) - xl;
+
+ u = ul + xf * dudx;
+ v = vl + xf * dvdx;
+ rhw = rhwl + xf * drhwdx;
+
+ int x = x1;
+ uint32 *spanp = spanptr;
+
+ float w = 1.0f / (float)rhw;
+
+ if (x < x2) {
+ if (filterMode >= kTriBltFilterTrilinear) {
+ do {
+ int utexel = VDRoundToIntFastFullRange(u * w);
+ int vtexel = VDRoundToIntFastFullRange(v * w);
+ union{ float f; sint32 i; } rho = {rhobase * w};
+
+ int lambda = ((rho.i - 0x3F800000) >> (23-8));
+ if (lambda < 0)
+ lambda = 0;
+ if (lambda >= (nMipmaps<<8)-256)
+ lambda = (nMipmaps<<8)-257;
+
+ spanp[0] = utexel;
+ spanp[1] = vtexel;
+ spanp[2] = lambda;
+ spanp += 3;
+
+ u += dudx;
+ v += dvdx;
+ rhw += drhwdx;
+
+ w *= (2.0f - w*(float)rhw);
+ } while(++x < x2);
+ } else {
+ do {
+ int utexel = VDFloorToInt(u * w);
+ int vtexel = VDFloorToInt(v * w);
+
+ spanp[0] = utexel;
+ spanp[1] = vtexel;
+ spanp += 2;
+
+ u += dudx;
+ v += dvdx;
+ rhw += drhwdx;
+
+ w *= (2.0f - w*(float)rhw);
+ } while(++x < x2);
+ }
+ }
+
+ for(int i=0; i<nMipmaps; ++i) {
+ texinfo.mips[i].mip = (const uint32 *)pSources[i]->data;
+ texinfo.mips[i].pitch = pSources[i]->pitch;
+ texinfo.mips[i].uvmul = (pSources[i]->pitch << 16) + 4;
+ }
+ texinfo.dst = dstp+x1;
+ texinfo.src = spanptr;
+ texinfo.width = x2-x1;
+
+ if (texinfo.width>0)
+ drawSpan(&texinfo);
+
+ dstp = vdptroffset(dstp, dstpitch);
+ xl += dxl1;
+ xr += dxr1;
+ ul += dul1;
+ vl += dvl1;
+ rhwl += drhwl1;
+
+ ++y;
+ }
+
+ delete[] spanptr;
+ }
+
+ void FillTri(VDPixmap& dst, uint32 c,
+ const VDTriBltTransformedVertex *vx0,
+ const VDTriBltTransformedVertex *vx1,
+ const VDTriBltTransformedVertex *vx2
+ )
+ {
+
+ VDTriangleSetupInfo setup;
+
+ SetupTri(setup, dst, vx0, vx1, vx2, NULL);
+
+ const VDTriBltTransformedVertex *pt = setup.pt, *pl = setup.pl, *pr = setup.pr;
+
+ // Compute edge walking parameters
+ float dxl1=0, dxr1=0;
+ float dxl2=0, dxr2=0;
+
+ float x_lt = pl->x - pt->x;
+ float x_rt = pr->x - pt->x;
+ float x_rl = pr->x - pl->x;
+ float y_lt = pl->y - pt->y;
+ float y_rt = pr->y - pt->y;
+ float y_rl = pr->y - pl->y;
+
+ // reject backfaces
+ if (x_lt*y_rt >= x_rt*y_lt)
+ return;
+
+ // Compute left-edge interpolation parameters for first half.
+ if (pl->y != pt->y)
+ dxl1 = x_lt / y_lt;
+
+ // Compute right-edge interpolation parameters for first half.
+ if (pr->y != pt->y)
+ dxr1 = x_rt / y_rt;
+
+ // Compute third-edge interpolation parameters.
+ if (pr->y != pl->y) {
+ dxl2 = x_rl / y_rl;
+
+ dxr2 = dxl2;
+ }
+
+ // Initialize parameters for first half.
+ //
+ // We place pixel centers at (x+0.5, y+0.5).
+
+ double xl, xr, yf;
+ int y, y1, y2;
+
+ // y_start < y+0.5 to include pixel y.
+
+ y = (int)floor(pt->y + 0.5);
+ yf = (y+0.5) - pt->y;
+
+ xl = pt->x + dxl1 * yf;
+ xr = pt->x + dxr1 * yf;
+
+ // Initialize parameters for second half.
+ double xl2, xr2;
+
+ if (pl->y > pr->y) { // Left edge is long side
+ dxl2 = dxl1;
+
+ y1 = (int)floor(pr->y + 0.5);
+ y2 = (int)floor(pl->y + 0.5);
+
+ yf = (y1+0.5) - pr->y;
+
+ // Prestep right edge.
+ xr2 = pr->x + dxr2 * yf;
+
+ // Step left edge.
+ xl2 = xl + dxl1 * (y1 - y);
+ } else { // Right edge is long side
+ dxr2 = dxr1;
+
+ y1 = (int)floor(pl->y + 0.5);
+ y2 = (int)floor(pr->y + 0.5);
+
+ yf = (y1+0.5) - pl->y;
+
+ // Prestep left edge.
+ xl2 = pl->x + dxl2 * yf;
+
+ // Step right edge.
+ xr2 = xr + dxr1 * (y1 - y);
+ }
+
+ // rasterize
+ const ptrdiff_t dstpitch = dst.pitch;
+ uint32 *dstp = (uint32 *)((char *)dst.data + dstpitch * y);
+
+ while(y < y2) {
+ if (y == y1) {
+ xl = xl2;
+ xr = xr2;
+ dxl1 = dxl2;
+ dxr1 = dxr2;
+ }
+
+ int x1, x2;
+ double xf;
+
+ // x_left must be less than (x+0.5) to include pixel x.
+
+ x1 = (int)floor(xl + 0.5);
+ x2 = (int)floor(xr + 0.5);
+ xf = (x1+0.5) - xl;
+
+ while(x1 < x2)
+ dstp[x1++] = c;
+
+ dstp = vdptroffset(dstp, dstpitch);
+ xl += dxl1;
+ xr += dxr1;
+ ++y;
+ }
+ }
+
+ void FillTriGrad(VDPixmap& dst,
+ const VDTriBltTransformedVertex *vx0,
+ const VDTriBltTransformedVertex *vx1,
+ const VDTriBltTransformedVertex *vx2
+ )
+ {
+
+ VDTriangleSetupInfo setup;
+
+ SetupTri(setup, dst, vx0, vx1, vx2, NULL);
+
+ const VDTriBltTransformedVertex *pt = setup.pt, *pl = setup.pl, *pr = setup.pr;
+ const float x10 = pl->x - pt->x;
+ const float x20 = pr->x - pt->x;
+ const float y10 = pl->y - pt->y;
+ const float y20 = pr->y - pt->y;
+ const float A = x20*y10 - x10*y20;
+
+ if (A <= 0.f)
+ return;
+
+ float invA = 0.f;
+ if (A >= 1e-5f)
+ invA = 1.0f / A;
+
+ float x10_A = x10 * invA;
+ float x20_A = x20 * invA;
+ float y10_A = y10 * invA;
+ float y20_A = y20 * invA;
+
+ float r10 = pl->r - pt->r;
+ float r20 = pr->r - pt->r;
+ float g10 = pl->g - pt->g;
+ float g20 = pr->g - pt->g;
+ float b10 = pl->b - pt->b;
+ float b20 = pr->b - pt->b;
+ float a10 = pl->a - pt->a;
+ float a20 = pr->a - pt->a;
+ float rhw10 = pl->rhw - pt->rhw;
+ float rhw20 = pr->rhw - pt->rhw;
+
+ float drdx = r20*y10_A - r10*y20_A;
+ float drdy = r10*x20_A - r20*x10_A;
+ float dgdx = g20*y10_A - g10*y20_A;
+ float dgdy = g10*x20_A - g20*x10_A;
+ float dbdx = b20*y10_A - b10*y20_A;
+ float dbdy = b10*x20_A - b20*x10_A;
+ float dadx = a20*y10_A - a10*y20_A;
+ float dady = a10*x20_A - a20*x10_A;
+ float drhwdx = rhw20*y10_A - rhw10*y20_A;
+ float drhwdy = rhw10*x20_A - rhw20*x10_A;
+
+ // Compute edge walking parameters
+ float dxl1=0;
+ float drl1=0;
+ float dgl1=0;
+ float dbl1=0;
+ float dal1=0;
+ float drhwl1=0;
+ float dxr1=0;
+ float dxl2=0;
+ float drl2=0;
+ float dgl2=0;
+ float dbl2=0;
+ float dal2=0;
+ float drhwl2=0;
+ float dxr2=0;
+
+ float x_lt = pl->x - pt->x;
+ float x_rt = pr->x - pt->x;
+ float x_rl = pr->x - pl->x;
+ float y_lt = pl->y - pt->y;
+ float y_rt = pr->y - pt->y;
+ float y_rl = pr->y - pl->y;
+
+ // Compute left-edge interpolation parameters for first half.
+ if (pl->y != pt->y) {
+ dxl1 = x_lt / y_lt;
+ drl1 = drdy + dxl1 * drdx;
+ dgl1 = dgdy + dxl1 * dgdx;
+ dbl1 = dbdy + dxl1 * dbdx;
+ dal1 = dady + dxl1 * dadx;
+ drhwl1 = drhwdy + dxl1 * drhwdx;
+ }
+
+ // Compute right-edge interpolation parameters for first half.
+ if (pr->y != pt->y)
+ dxr1 = x_rt / y_rt;
+
+ // Compute third-edge interpolation parameters.
+ if (pr->y != pl->y) {
+ dxl2 = x_rl / y_rl;
+
+ drl2 = drdy + dxl2 * drdx;
+ dgl2 = dgdy + dxl2 * dgdx;
+ dbl2 = dbdy + dxl2 * dbdx;
+ dal2 = dady + dxl2 * dadx;
+ drhwl2 = drhwdy + dxl2 * drhwdx;
+
+ dxr2 = dxl2;
+ }
+
+ // Initialize parameters for first half.
+ //
+ // We place pixel centers at (x+0.5, y+0.5).
+
+ double xl, xr, yf;
+ double rl, gl, bl, al, rhwl;
+ double rl2, gl2, bl2, al2, rhwl2;
+ int y, y1, y2;
+
+ // y_start < y+0.5 to include pixel y.
+
+ y = (int)floor(pt->y + 0.5);
+ yf = (y+0.5) - pt->y;
+
+ xl = pt->x + dxl1 * yf;
+ xr = pt->x + dxr1 * yf;
+ rl = pt->r + drl1 * yf;
+ gl = pt->g + dgl1 * yf;
+ bl = pt->b + dbl1 * yf;
+ al = pt->a + dal1 * yf;
+ rhwl = pt->rhw + drhwl1 * yf;
+
+ // Initialize parameters for second half.
+ double xl2, xr2;
+
+ if (pl->y > pr->y) { // Left edge is long side
+ dxl2 = dxl1;
+ drl2 = drl1;
+ dgl2 = dgl1;
+ dbl2 = dbl1;
+ dal2 = dal1;
+ drhwl2 = drhwl1;
+
+ y1 = (int)floor(pr->y + 0.5);
+ y2 = (int)floor(pl->y + 0.5);
+
+ yf = (y1+0.5) - pr->y;
+
+ // Step left edge.
+ xl2 = xl + dxl1 * (y1 - y);
+ rl2 = rl + drl1 * (y1 - y);
+ gl2 = gl + dgl1 * (y1 - y);
+ bl2 = bl + dbl1 * (y1 - y);
+ al2 = al + dal1 * (y1 - y);
+ rhwl2 = rhwl + drhwl1 * (y1 - y);
+
+ // Prestep right edge.
+ xr2 = pr->x + dxr2 * yf;
+ } else { // Right edge is long side
+ dxr2 = dxr1;
+
+ y1 = (int)floor(pl->y + 0.5);
+ y2 = (int)floor(pr->y + 0.5);
+
+ yf = (y1+0.5) - pl->y;
+
+ // Prestep left edge.
+ xl2 = pl->x + dxl2 * yf;
+ rl2 = pl->r + drl2 * yf;
+ gl2 = pl->g + dgl2 * yf;
+ bl2 = pl->b + dbl2 * yf;
+ al2 = pl->a + dal2 * yf;
+ rhwl2 = pl->rhw + drhwl2 * yf;
+
+ // Step right edge.
+ xr2 = xr + dxr2 * (y1 - y);
+ }
+
+ // rasterize
+ const ptrdiff_t dstpitch = dst.pitch;
+ char *dstp0 = (char *)dst.data + dstpitch * y;
+
+ while(y < y2) {
+ if (y == y1) {
+ xl = xl2;
+ xr = xr2;
+ rl = rl2;
+ gl = gl2;
+ bl = bl2;
+ al = al2;
+ rhwl = rhwl2;
+ dxl1 = dxl2;
+ drl1 = drl2;
+ dgl1 = dgl2;
+ dbl1 = dbl2;
+ dal1 = dal2;
+ drhwl1 = drhwl2;
+ dxr1 = dxr2;
+ }
+
+ int x1, x2;
+ double xf;
+ double r, g, b, a, rhw;
+
+ // x_left must be less than (x+0.5) to include pixel x.
+
+ x1 = (int)floor(xl + 0.5);
+ x2 = (int)floor(xr + 0.5);
+ xf = (x1+0.5) - xl;
+
+ r = rl + xf * drdx;
+ g = gl + xf * dgdx;
+ b = bl + xf * dbdx;
+ a = al + xf * dadx;
+ rhw = rhwl + xf * drhwdx;
+
+ float w = 1.0f / (float)rhw;
+
+ if (x1 < x2) {
+ if (dst.format == nsVDPixmap::kPixFormat_XRGB8888) {
+ uint32 *dstp = (uint32 *)dstp0;
+
+ do {
+ float sr = (float)(r * w);
+ float sg = (float)(g * w);
+ float sb = (float)(b * w);
+ float sa = (float)(a * w);
+
+ uint8 ir = VDClampedRoundFixedToUint8Fast(sr);
+ uint8 ig = VDClampedRoundFixedToUint8Fast(sg);
+ uint8 ib = VDClampedRoundFixedToUint8Fast(sb);
+ uint8 ia = VDClampedRoundFixedToUint8Fast(sa);
+
+ dstp[x1] = ((uint32)ia << 24) + ((uint32)ir << 16) + ((uint32)ig << 8) + ib;
+
+ r += drdx;
+ g += dgdx;
+ b += dbdx;
+ a += dadx;
+ rhw += drhwdx;
+
+ w *= (2.0f - w*(float)rhw);
+ } while(++x1 < x2);
+ } else {
+ uint8 *dstp = (uint8 *)dstp0;
+
+ do {
+ float sg = (float)(g * w);
+
+ uint8 ig = VDClampedRoundFixedToUint8Fast(sg);
+
+ dstp[x1] = ig;
+
+ g += dgdx;
+ rhw += drhwdx;
+
+ w *= (2.0f - w*(float)rhw);
+ } while(++x1 < x2);
+ }
+ }
+
+ dstp0 = vdptroffset(dstp0, dstpitch);
+ xl += dxl1;
+ rl += drl1;
+ gl += dgl1;
+ bl += dbl1;
+ al += dal1;
+ rhwl += drhwl1;
+ xr += dxr1;
+ ++y;
+ }
+ }
+
+ struct VDTriClipWorkspace {
+ VDTriBltTransformedVertex *vxheapptr[2][19];
+ VDTriBltTransformedVertex vxheap[21];
+ };
+
+ VDTriBltTransformedVertex **VDClipTriangle(VDTriClipWorkspace& ws,
+ const VDTriBltTransformedVertex *vx0,
+ const VDTriBltTransformedVertex *vx1,
+ const VDTriBltTransformedVertex *vx2,
+ int orflags) {
+ // Each line segment can intersect all six planes, meaning the maximum bound is
+ // 18 vertices. Add 3 for the original.
+
+ VDTriBltTransformedVertex *vxheapnext;
+ VDTriBltTransformedVertex **vxlastheap = ws.vxheapptr[0], **vxnextheap = ws.vxheapptr[1];
+
+ ws.vxheap[0] = *vx0;
+ ws.vxheap[1] = *vx1;
+ ws.vxheap[2] = *vx2;
+
+ vxlastheap[0] = &ws.vxheap[0];
+ vxlastheap[1] = &ws.vxheap[1];
+ vxlastheap[2] = &ws.vxheap[2];
+ vxlastheap[3] = NULL;
+
+ vxheapnext = ws.vxheap + 3;
+
+ // Current Next Action
+ // ------- ---- ------
+ // Unclipped Unclipped Copy vertex
+ // Unclipped Clipped Copy vertex and add intersection
+ // Clipped Unclipped Add intersection
+ // Clipped Clipped No action
+
+#define DOCLIP(cliptype, _sign_, cliparg) \
+ if (orflags & k##cliptype) { \
+ VDTriBltTransformedVertex **src = vxlastheap; \
+ VDTriBltTransformedVertex **dst = vxnextheap; \
+ \
+ while(*src) { \
+ VDTriBltTransformedVertex *cur = *src; \
+ VDTriBltTransformedVertex *next = src[1]; \
+ \
+ if (!next) \
+ next = vxlastheap[0]; \
+ \
+ if (!(cur->outcode & k##cliptype)) \
+ *dst++ = cur; \
+ \
+ if ((cur->outcode ^ next->outcode) & k##cliptype) { \
+ double alpha = (cur->w _sign_ cur->cliparg) / ((cur->w _sign_ cur->cliparg) - (next->w _sign_ next->cliparg)); \
+ \
+ if (alpha >= 0.0 && alpha <= 1.0) { \
+ vxheapnext->interp(cur, next, (float)alpha); \
+ vxheapnext->cliparg = -(_sign_ vxheapnext->w); \
+ *dst++ = vxheapnext++; \
+ } \
+ } \
+ ++src; \
+ } \
+ *dst = NULL; \
+ if (dst < vxnextheap+3) return NULL; \
+ src = vxlastheap; vxlastheap = vxnextheap; vxnextheap = src; \
+ }
+
+
+ DOCLIP(Far, -, z);
+ DOCLIP(Near, +, z);
+ DOCLIP(Bottom, -, y);
+ DOCLIP(Top, +, y);
+ DOCLIP(Right, -, x);
+ DOCLIP(Left, +, x);
+
+#undef DOCLIP
+
+ return vxlastheap;
+ }
+
+ void RenderClippedTri(VDPixmap& dst, const VDPixmap *const *pSources, int nMipmaps,
+ const VDTriBltTransformedVertex *vx0,
+ const VDTriBltTransformedVertex *vx1,
+ const VDTriBltTransformedVertex *vx2,
+ VDTriBltFilterMode filterMode,
+ float mipMapLODBias,
+ int orflags)
+ {
+
+ VDTriBltTransformedVertex *vxheapnext;
+ VDTriBltTransformedVertex vxheap[21];
+
+ VDTriBltTransformedVertex *vxheapptr[2][19];
+ VDTriBltTransformedVertex **vxlastheap = vxheapptr[0], **vxnextheap = vxheapptr[1];
+
+ vxheap[0] = *vx0;
+ vxheap[1] = *vx1;
+ vxheap[2] = *vx2;
+
+ vxlastheap[0] = &vxheap[0];
+ vxlastheap[1] = &vxheap[1];
+ vxlastheap[2] = &vxheap[2];
+ vxlastheap[3] = NULL;
+
+ vxheapnext = vxheap + 3;
+
+ // Current Next Action
+ // ------- ---- ------
+ // Unclipped Unclipped Copy vertex
+ // Unclipped Clipped Copy vertex and add intersection
+ // Clipped Unclipped Add intersection
+ // Clipped Clipped No action
+
+#define DOCLIP(cliptype, _sign_, cliparg) \
+ if (orflags & k##cliptype) { \
+ VDTriBltTransformedVertex **src = vxlastheap; \
+ VDTriBltTransformedVertex **dst = vxnextheap; \
+ \
+ while(*src) { \
+ VDTriBltTransformedVertex *cur = *src; \
+ VDTriBltTransformedVertex *next = src[1]; \
+ \
+ if (!next) \
+ next = vxlastheap[0]; \
+ \
+ if (!(cur->outcode & k##cliptype)) \
+ *dst++ = cur; \
+ \
+ if ((cur->outcode ^ next->outcode) & k##cliptype) { \
+ double alpha = (cur->w _sign_ cur->cliparg) / ((cur->w _sign_ cur->cliparg) - (next->w _sign_ next->cliparg)); \
+ \
+ if (alpha >= 0.0 && alpha <= 1.0) { \
+ vxheapnext->interp(cur, next, (float)alpha); \
+ vxheapnext->cliparg = -(_sign_ vxheapnext->w); \
+ *dst++ = vxheapnext++; \
+ } \
+ } \
+ ++src; \
+ } \
+ *dst = NULL; \
+ if (dst < vxnextheap+3) return; \
+ src = vxlastheap; vxlastheap = vxnextheap; vxnextheap = src; \
+ }
+
+
+ DOCLIP(Far, -, z);
+ DOCLIP(Near, +, z);
+ DOCLIP(Bottom, -, y);
+ DOCLIP(Top, +, y);
+ DOCLIP(Right, -, x);
+ DOCLIP(Left, +, x);
+
+#undef DOCLIP
+
+ VDTriBltTransformedVertex **src = vxlastheap+1;
+
+ while(src[1]) {
+ RenderTri(dst, pSources, nMipmaps, vxlastheap[0], src[0], src[1], filterMode, mipMapLODBias);
+ ++src;
+ }
+ }
+
+}
+
+bool VDPixmapTriFill(VDPixmap& dst, const uint32 c, const VDTriBltVertex *pVertices, int nVertices, const int *pIndices, int nIndices, const float pTransform[16]) {
+ if (dst.format != nsVDPixmap::kPixFormat_XRGB8888)
+ return false;
+
+ static const float xf_ident[16]={1.f,0.f,0.f,0.f,0.f,1.f,0.f,0.f,0.f,0.f,1.f,0.f,0.f,0.f,0.f,1.f};
+ vdfastvector<VDTriBltTransformedVertex> xverts(nVertices);
+
+ if (!pTransform)
+ pTransform = xf_ident;
+
+ TransformVerts(xverts.data(), pVertices, nVertices, pTransform);
+
+ const VDTriBltTransformedVertex *xsrc = xverts.data();
+
+ VDTriClipWorkspace clipws;
+
+ while(nIndices >= 3) {
+ const int idx0 = pIndices[0];
+ const int idx1 = pIndices[1];
+ const int idx2 = pIndices[2];
+ const VDTriBltTransformedVertex *xv0 = &xsrc[idx0];
+ const VDTriBltTransformedVertex *xv1 = &xsrc[idx1];
+ const VDTriBltTransformedVertex *xv2 = &xsrc[idx2];
+ const int kode0 = xv0->outcode;
+ const int kode1 = xv1->outcode;
+ const int kode2 = xv2->outcode;
+
+ if (!(kode0 & kode1 & kode2)) {
+ if (int orflags = kode0 | kode1 | kode2) {
+ VDTriBltTransformedVertex **src = VDClipTriangle(clipws, xv0, xv1, xv2, orflags);
+
+ if (src) {
+ VDTriBltTransformedVertex *src0 = *src++;
+
+ // fan out triangles
+ while(src[1]) {
+ FillTri(dst, c, src0, src[0], src[1]);
+ ++src;
+ }
+ }
+ } else
+ FillTri(dst, c, xv0, xv1, xv2);
+ }
+
+ pIndices += 3;
+ nIndices -= 3;
+ }
+
+ return true;
+}
+
+bool VDPixmapTriFill(VDPixmap& dst, const VDTriColorVertex *pVertices, int nVertices, const int *pIndices, int nIndices, const float pTransform[16]) {
+ VDPixmap pxY;
+ VDPixmap pxCb;
+ VDPixmap pxCr;
+ bool ycbcr = false;
+ float ycbcr_xoffset = 0;
+
+ switch(dst.format) {
+ case nsVDPixmap::kPixFormat_XRGB8888:
+ case nsVDPixmap::kPixFormat_Y8:
+ break;
+ case nsVDPixmap::kPixFormat_YUV444_Planar:
+ case nsVDPixmap::kPixFormat_YUV422_Planar:
+ case nsVDPixmap::kPixFormat_YUV420_Planar:
+ case nsVDPixmap::kPixFormat_YUV410_Planar:
+ pxY.format = nsVDPixmap::kPixFormat_Y8;
+ pxY.data = dst.data;
+ pxY.pitch = dst.pitch;
+ pxY.w = dst.w;
+ pxY.h = dst.h;
+
+ pxCb.format = nsVDPixmap::kPixFormat_Y8;
+ pxCb.data = dst.data2;
+ pxCb.pitch = dst.pitch2;
+ pxCb.h = dst.h;
+
+ pxCr.format = nsVDPixmap::kPixFormat_Y8;
+ pxCr.data = dst.data3;
+ pxCr.pitch = dst.pitch3;
+ pxCr.h = dst.h;
+
+ if (dst.format == nsVDPixmap::kPixFormat_YUV410_Planar) {
+ pxCr.w = pxCb.w = dst.w >> 2;
+ pxCr.h = pxCb.h = dst.h >> 2;
+ ycbcr_xoffset = 0.75f / (float)pxCr.w;
+ } else if (dst.format == nsVDPixmap::kPixFormat_YUV420_Planar) {
+ pxCr.w = pxCb.w = dst.w >> 1;
+ pxCr.h = pxCb.h = dst.h >> 1;
+ ycbcr_xoffset = 0.5f / (float)pxCr.w;
+ } else if (dst.format == nsVDPixmap::kPixFormat_YUV422_Planar) {
+ pxCr.w = pxCb.w = dst.w >> 1;
+ ycbcr_xoffset = 0.5f / (float)pxCr.w;
+ } else if (dst.format == nsVDPixmap::kPixFormat_YUV444_Planar) {
+ pxCr.w = pxCb.w = dst.w;
+ ycbcr_xoffset = 0.0f;
+ }
+
+ ycbcr = true;
+ break;
+ default:
+ return false;
+ }
+
+ VDTriBltTransformedVertex fastxverts[64];
+ vdfastvector<VDTriBltTransformedVertex> xverts;
+
+ VDTriBltTransformedVertex *xsrc;
+ if (nVertices <= 64) {
+ xsrc = fastxverts;
+ } else {
+ xverts.resize(nVertices);
+ xsrc = xverts.data();
+ }
+
+ static const float xf_ident[16]={1.f,0.f,0.f,0.f,0.f,1.f,0.f,0.f,0.f,0.f,1.f,0.f,0.f,0.f,0.f,1.f};
+ if (!pTransform)
+ pTransform = xf_ident;
+
+ VDTriClipWorkspace clipws;
+ for(int plane=0; plane<(ycbcr?3:1); ++plane) {
+ VDPixmap& pxPlane = ycbcr ? plane == 0 ? pxY : plane == 1 ? pxCb : pxCr : dst;
+
+ if (ycbcr && plane) {
+ float xf_ycbcr[16];
+ memcpy(xf_ycbcr, pTransform, sizeof(float) * 16);
+
+ // translate in x by ycbcr_xoffset
+ xf_ycbcr[0] += xf_ycbcr[12]*ycbcr_xoffset;
+ xf_ycbcr[1] += xf_ycbcr[13]*ycbcr_xoffset;
+ xf_ycbcr[2] += xf_ycbcr[14]*ycbcr_xoffset;
+ xf_ycbcr[3] += xf_ycbcr[15]*ycbcr_xoffset;
+
+ TransformVerts(xsrc, pVertices, nVertices, xf_ycbcr);
+
+ switch(plane) {
+ case 1:
+ for(int i=0; i<nVertices; ++i)
+ xsrc[i].g = xsrc[i].b;
+ break;
+ case 2:
+ for(int i=0; i<nVertices; ++i)
+ xsrc[i].g = xsrc[i].r;
+ break;
+ }
+ } else {
+ TransformVerts(xsrc, pVertices, nVertices, pTransform);
+ }
+
+ const int *nextIndex = pIndices;
+ int indicesLeft = nIndices;
+ while(indicesLeft >= 3) {
+ const int idx0 = nextIndex[0];
+ const int idx1 = nextIndex[1];
+ const int idx2 = nextIndex[2];
+ const VDTriBltTransformedVertex *xv0 = &xsrc[idx0];
+ const VDTriBltTransformedVertex *xv1 = &xsrc[idx1];
+ const VDTriBltTransformedVertex *xv2 = &xsrc[idx2];
+ const int kode0 = xv0->outcode;
+ const int kode1 = xv1->outcode;
+ const int kode2 = xv2->outcode;
+
+ if (!(kode0 & kode1 & kode2)) {
+ if (int orflags = kode0 | kode1 | kode2) {
+ VDTriBltTransformedVertex **src = VDClipTriangle(clipws, xv0, xv1, xv2, orflags);
+
+ if (src) {
+ VDTriBltTransformedVertex *src0 = *src++;
+
+ // fan out triangles
+ while(src[1]) {
+ FillTriGrad(pxPlane, src0, src[0], src[1]);
+ ++src;
+ }
+ }
+ } else {
+ FillTriGrad(pxPlane, xv0, xv1, xv2);
+ }
+ }
+
+ nextIndex += 3;
+ indicesLeft -= 3;
+ }
+ }
+
+ return true;
+}
+
+bool VDPixmapTriBlt(VDPixmap& dst, const VDPixmap *const *pSources, int nMipmaps,
+ const VDTriBltVertex *pVertices, int nVertices,
+ const int *pIndices, int nIndices,
+ VDTriBltFilterMode filterMode,
+ float mipMapLODBias,
+ const float pTransform[16])
+{
+ if (dst.format != nsVDPixmap::kPixFormat_XRGB8888)
+ return false;
+
+ static const float xf_ident[16]={1.f,0.f,0.f,0.f,0.f,1.f,0.f,0.f,0.f,0.f,1.f,0.f,0.f,0.f,0.f,1.f};
+ vdfastvector<VDTriBltTransformedVertex> xverts(nVertices);
+
+ if (!pTransform)
+ pTransform = xf_ident;
+
+ TransformVerts(xverts.data(), pVertices, nVertices, pTransform);
+
+ const VDTriBltTransformedVertex *xsrc = xverts.data();
+
+ VDTriClipWorkspace clipws;
+
+ while(nIndices >= 3) {
+ const int idx0 = pIndices[0];
+ const int idx1 = pIndices[1];
+ const int idx2 = pIndices[2];
+ const VDTriBltTransformedVertex *xv0 = &xsrc[idx0];
+ const VDTriBltTransformedVertex *xv1 = &xsrc[idx1];
+ const VDTriBltTransformedVertex *xv2 = &xsrc[idx2];
+ const int kode0 = xv0->outcode;
+ const int kode1 = xv1->outcode;
+ const int kode2 = xv2->outcode;
+
+ if (!(kode0 & kode1 & kode2)) {
+ if (int orflags = kode0 | kode1 | kode2) {
+ VDTriBltTransformedVertex **src = VDClipTriangle(clipws, xv0, xv1, xv2, orflags);
+
+ if (src) {
+ VDTriBltTransformedVertex *src0 = *src++;
+
+ // fan out triangles
+ while(src[1]) {
+ RenderTri(dst, pSources, nMipmaps, src0, src[0], src[1], filterMode, mipMapLODBias);
+ ++src;
+ }
+ }
+ } else
+ RenderTri(dst, pSources, nMipmaps, xv0, xv1, xv2, filterMode, mipMapLODBias);
+ }
+
+ pIndices += 3;
+ nIndices -= 3;
+ }
+
+ return true;
+}
+
+///////////////////////////////////////////////////////////////////////////
+
+void VDPixmapSetTextureBorders(VDPixmap& px, bool wrap) {
+ const int w = px.w;
+ const int h = px.h;
+
+ VDPixmapBlt(px, 0, 1, px, wrap ? w-2 : 1, 1, 1, h-2);
+ VDPixmapBlt(px, w-1, 1, px, wrap ? 1 : w-2, 1, 1, h-2);
+
+ VDPixmapBlt(px, 0, 0, px, 0, wrap ? h-2 : 1, w, 1);
+ VDPixmapBlt(px, 0, h-1, px, 0, wrap ? 1 : h-2, w, 1);
+}
+
+void VDPixmapSetTextureBordersCubic(VDPixmap& px) {
+ const int w = px.w;
+ const int h = px.h;
+
+ VDPixmapBlt(px, 0, 1, px, 2, 1, 1, h-2);
+ VDPixmapBlt(px, 1, 1, px, 2, 1, 1, h-2);
+ VDPixmapBlt(px, w-2, 1, px, w-3, 1, 1, h-2);
+ VDPixmapBlt(px, w-1, 1, px, w-3, 1, 1, h-2);
+
+ VDPixmapBlt(px, 0, 0, px, 0, 2, w, 1);
+ VDPixmapBlt(px, 0, 1, px, 0, 2, w, 1);
+ VDPixmapBlt(px, 0, h-2, px, 0, h-3, w, 1);
+ VDPixmapBlt(px, 0, h-1, px, 0, h-3, w, 1);
+}
+
+///////////////////////////////////////////////////////////////////////////
+
+VDPixmapTextureMipmapChain::VDPixmapTextureMipmapChain(const VDPixmap& src, bool wrap, bool cubic, int maxlevels) {
+ int w = src.w;
+ int h = src.h;
+ int mipcount = 0;
+
+ while((w>1 || h>1) && maxlevels--) {
+ ++mipcount;
+ w >>= 1;
+ h >>= 1;
+ }
+
+ mBuffers.resize(mipcount);
+ mMipMaps.resize(mipcount);
+
+ vdautoptr<IVDPixmapResampler> r(VDCreatePixmapResampler());
+ r->SetFilters(IVDPixmapResampler::kFilterLinear, IVDPixmapResampler::kFilterLinear, false);
+
+ float fw = (float)src.w;
+ float fh = (float)src.h;
+ for(int mip=0; mip<mipcount; ++mip) {
+ const int mipw = VDCeilToInt(fw);
+ const int miph = VDCeilToInt(fh);
+
+ mMipMaps[mip] = &mBuffers[mip];
+
+ if (cubic) {
+ mBuffers[mip].init(mipw+4, miph+4, nsVDPixmap::kPixFormat_XRGB8888);
+
+ if (!mip) {
+ VDPixmapBlt(mBuffers[0], 2, 2, src, 0, 0, src.w, src.h);
+ VDPixmapSetTextureBordersCubic(mBuffers[0]);
+ } else {
+ const VDPixmap& curmip = mBuffers[mip];
+ const VDPixmap& prevmip = mBuffers[mip-1];
+
+ vdrect32f rdst( 0.0f, 0.0f, (float)curmip.w , (float)curmip.h );
+ vdrect32f rsrc(-2.0f, -2.0f, 2.0f*(float)curmip.w - 2.0f, 2.0f*(float)curmip.h - 2.0f);
+ r->Init(rdst, curmip.w, curmip.h, curmip.format, rsrc, prevmip.w, prevmip.h, prevmip.format);
+ r->Process(curmip, prevmip);
+ }
+ } else {
+ mBuffers[mip].init(mipw+2, miph+2, nsVDPixmap::kPixFormat_XRGB8888);
+
+ if (!mip) {
+ VDPixmapBlt(mBuffers[0], 1, 1, src, 0, 0, src.w, src.h);
+ VDPixmapSetTextureBorders(mBuffers[0], wrap);
+ } else {
+ const VDPixmap& curmip = mBuffers[mip];
+ const VDPixmap& prevmip = mBuffers[mip-1];
+
+ vdrect32f rdst( 0.0f, 0.0f, (float)curmip.w , (float)curmip.h );
+ vdrect32f rsrc(-1.0f, -1.0f, 2.0f*(float)curmip.w - 1.0f, 2.0f*(float)curmip.h - 1.0f);
+ r->Init(rdst, curmip.w, curmip.h, curmip.format, rsrc, prevmip.w, prevmip.h, prevmip.format);
+ r->Process(curmip, prevmip);
+ }
+ }
+
+ fw *= 0.5f;
+ fh *= 0.5f;
+ }
+}
+
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/uberblit.cpp b/src/thirdparty/VirtualDub/Kasumi/source/uberblit.cpp
new file mode 100644
index 000000000..6dc1b4334
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/uberblit.cpp
@@ -0,0 +1,903 @@
+#include <vd2/Kasumi/pixmap.h>
+#include <vd2/Kasumi/pixmaputils.h>
+#include "uberblit.h"
+#include "uberblit_gen.h"
+
+uint32 VDPixmapGetFormatTokenFromFormat(int format) {
+ using namespace nsVDPixmap;
+ switch(format) {
+ case kPixFormat_Pal1: return kVDPixType_1 | kVDPixSamp_444 | kVDPixSpace_Pal;
+ case kPixFormat_Pal2: return kVDPixType_2 | kVDPixSamp_444 | kVDPixSpace_Pal;
+ case kPixFormat_Pal4: return kVDPixType_4 | kVDPixSamp_444 | kVDPixSpace_Pal;
+ case kPixFormat_Pal8: return kVDPixType_8 | kVDPixSamp_444 | kVDPixSpace_Pal;
+ case kPixFormat_XRGB1555: return kVDPixType_1555_LE | kVDPixSamp_444 | kVDPixSpace_BGR;
+ case kPixFormat_RGB565: return kVDPixType_565_LE | kVDPixSamp_444 | kVDPixSpace_BGR;
+ case kPixFormat_RGB888: return kVDPixType_888 | kVDPixSamp_444 | kVDPixSpace_BGR;
+ case kPixFormat_XRGB8888: return kVDPixType_8888 | kVDPixSamp_444 | kVDPixSpace_BGR;
+ case kPixFormat_Y8: return kVDPixType_8 | kVDPixSamp_444 | kVDPixSpace_Y_601;
+ case kPixFormat_YUV422_UYVY: return kVDPixType_B8G8_R8G8 | kVDPixSamp_422 | kVDPixSpace_YCC_601;
+ case kPixFormat_YUV422_YUYV: return kVDPixType_G8B8_G8R8 | kVDPixSamp_422 | kVDPixSpace_YCC_601;
+ case kPixFormat_YUV444_XVYU: return kVDPixType_8888 | kVDPixSamp_444 | kVDPixSpace_YCC_601;
+ case kPixFormat_YUV444_Planar: return kVDPixType_8_8_8 | kVDPixSamp_444 | kVDPixSpace_YCC_601;
+ case kPixFormat_YUV422_Planar: return kVDPixType_8_8_8 | kVDPixSamp_422 | kVDPixSpace_YCC_601;
+ case kPixFormat_YUV422_Planar_16F: return kVDPixType_16F_16F_16F_LE | kVDPixSamp_422 | kVDPixSpace_YCC_601;
+ case kPixFormat_YUV420_Planar: return kVDPixType_8_8_8 | kVDPixSamp_420_MPEG2 | kVDPixSpace_YCC_601;
+ case kPixFormat_YUV411_Planar: return kVDPixType_8_8_8 | kVDPixSamp_411 | kVDPixSpace_YCC_601;
+ case kPixFormat_YUV410_Planar: return kVDPixType_8_8_8 | kVDPixSamp_410 | kVDPixSpace_YCC_601;
+ case kPixFormat_YUV422_Planar_Centered: return kVDPixType_8_8_8 | kVDPixSamp_422_JPEG | kVDPixSpace_YCC_601;
+ case kPixFormat_YUV420_Planar_Centered: return kVDPixType_8_8_8 | kVDPixSamp_420_MPEG1 | kVDPixSpace_YCC_601;
+ case kPixFormat_YUV422_V210: return kVDPixType_V210 | kVDPixSamp_422 | kVDPixSpace_YCC_601;
+ case kPixFormat_YUV422_UYVY_709: return kVDPixType_B8G8_R8G8 | kVDPixSamp_422 | kVDPixSpace_YCC_709;
+ case kPixFormat_YUV420_NV12: return kVDPixType_8_B8R8 | kVDPixSamp_420_MPEG2 | kVDPixSpace_YCC_601;
+ default:
+ VDASSERT(false);
+ return 0;
+ }
+}
+
+const VDPixmapSamplingInfo& VDPixmapGetSamplingInfo(uint32 samplingToken) {
+ static const VDPixmapSamplingInfo kPixmapSamplingInfo[]={
+ /* Null */ { 0, 0, 0, 0, 0 },
+ /* 444 */ { 0, 0, 0, 0, 0 },
+ /* 422 */ { -4, 0, 0, 1, 0 },
+ /* 422_JPEG */ { 0, 0, 0, 1, 0 },
+ /* 420_MPEG2 */ { -4, 0, 0, 1, 1 },
+ /* 420_MPEG2INT */ { -4, 0, 0, 1, 1 },
+ /* 420_MPEG1 */ { 0, 0, 0, 1, 1 },
+ /* 420_DVPAL */ { -4, 0, 0, 1, 1 },
+ /* 411 */ { -6, 0, 0, 2, 0 },
+ /* 410 */ { -6, 0, 0, 2, 2 }
+ };
+
+ uint32 index = (samplingToken & kVDPixSamp_Mask) >> kVDPixSamp_Bits;
+
+ return index >= sizeof(kPixmapSamplingInfo)/sizeof(kPixmapSamplingInfo[0]) ? kPixmapSamplingInfo[0] : kPixmapSamplingInfo[index];
+}
+
+namespace {
+ uint32 BlitterConvertSampling(VDPixmapUberBlitterGenerator& gen, uint32 srcToken, uint32 dstSamplingToken, sint32 w, sint32 h) {
+ // if the source type is 16F, we have to convert to 32F
+ if ((srcToken & kVDPixType_Mask) == kVDPixType_16F_16F_16F_LE) {
+ // 0 1 2
+ gen.conv_16F_to_32F();
+ gen.swap(1);
+ // 1 0 2
+ gen.conv_16F_to_32F();
+ gen.swap(2);
+ // 2 0 1
+ gen.conv_16F_to_32F();
+ gen.swap(2);
+ gen.swap(1);
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_32F_32F_32F_LE;
+ }
+
+ // look up sampling info
+ const VDPixmapSamplingInfo& srcInfo = VDPixmapGetSamplingInfo(srcToken);
+ const VDPixmapSamplingInfo& dstInfo = VDPixmapGetSamplingInfo(dstSamplingToken);
+
+ // convert destination chroma origin to luma space
+ int c_x = ((8 + dstInfo.mCXOffset16) << dstInfo.mCXBits) - 8;
+ int cr_y = ((8 + dstInfo.mCrYOffset16) << dstInfo.mCYBits) - 8;
+ int cb_y = ((8 + dstInfo.mCbYOffset16) << dstInfo.mCYBits) - 8;
+
+ // convert luma chroma location to source chroma space
+ c_x = ((8 + c_x) >> srcInfo.mCXBits) - 8 - srcInfo.mCXOffset16;
+ cr_y = ((8 + cr_y) >> srcInfo.mCYBits) - 8 - srcInfo.mCrYOffset16;
+ cb_y = ((8 + cb_y) >> srcInfo.mCYBits) - 8 - srcInfo.mCbYOffset16;
+
+ float cxo = c_x / 16.0f + 0.5f;
+ float cxf = ((16 << dstInfo.mCXBits) >> srcInfo.mCXBits) / 16.0f;
+ float cyf = ((16 << dstInfo.mCYBits) >> srcInfo.mCYBits) / 16.0f;
+ sint32 cw = -(-w >> dstInfo.mCXBits);
+ sint32 ch = -(-h >> dstInfo.mCYBits);
+
+ gen.swap(2);
+ gen.linear(cxo, cxf, cw, cb_y / 16.0f + 0.5f, cyf, ch);
+ gen.swap(2);
+ gen.linear(cxo, cxf, cw, cr_y / 16.0f + 0.5f, cyf, ch);
+
+ return (srcToken & ~kVDPixSamp_Mask) | (dstSamplingToken & kVDPixSamp_Mask);
+ }
+
+ uint32 BlitterConvertType(VDPixmapUberBlitterGenerator& gen, uint32 srcToken, uint32 dstToken, sint32 w, sint32 h) {
+ uint32 dstType = dstToken & kVDPixType_Mask;
+
+ while((srcToken ^ dstToken) & kVDPixType_Mask) {
+ uint32 srcType = srcToken & kVDPixType_Mask;
+ uint32 targetType = dstType;
+
+ type_reconvert:
+ switch(targetType) {
+ case kVDPixType_1555_LE:
+ switch(srcType) {
+ case kVDPixType_565_LE:
+ gen.conv_565_to_555();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_1555_LE;
+ break;
+
+ case kVDPixType_8888:
+ gen.conv_8888_to_555();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_1555_LE;
+ break;
+ case kVDPixType_B8G8_R8G8:
+ case kVDPixType_G8B8_G8R8:
+ targetType = kVDPixType_8_8_8;
+ goto type_reconvert;
+ default:
+ targetType = kVDPixType_8888;
+ goto type_reconvert;
+ }
+ break;
+
+ case kVDPixType_565_LE:
+ switch(srcType) {
+ case kVDPixType_1555_LE:
+ gen.conv_555_to_565();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_565_LE;
+ break;
+ case kVDPixType_8888:
+ gen.conv_8888_to_565();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_565_LE;
+ break;
+ case kVDPixType_B8G8_R8G8:
+ case kVDPixType_G8B8_G8R8:
+ targetType = kVDPixType_8_8_8;
+ goto type_reconvert;
+ default:
+ targetType = kVDPixType_8888;
+ goto type_reconvert;
+ }
+ break;
+
+ case kVDPixType_888:
+ switch(srcType) {
+ case kVDPixType_8888:
+ gen.conv_8888_to_888();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_888;
+ break;
+ default:
+ targetType = kVDPixType_8888;
+ goto type_reconvert;
+ }
+ break;
+
+ case kVDPixType_8888:
+ switch(srcType) {
+ case kVDPixType_1555_LE:
+ gen.conv_555_to_8888();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8888;
+ break;
+ case kVDPixType_565_LE:
+ gen.conv_565_to_8888();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8888;
+ break;
+ case kVDPixType_888:
+ gen.conv_888_to_8888();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8888;
+ break;
+ case kVDPixType_32Fx4_LE:
+ gen.conv_X32F_to_8888();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8888;
+ break;
+ case kVDPixType_8_8_8:
+ if ((srcToken & kVDPixSamp_Mask) != kVDPixSamp_444)
+ srcToken = BlitterConvertSampling(gen, srcToken, kVDPixSamp_444, w, h);
+ gen.interleave_X8R8G8B8();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8888;
+ break;
+ default:
+ VDASSERT(false);
+ break;
+ }
+ break;
+
+ case kVDPixType_8:
+ switch(srcType) {
+ case kVDPixType_8_8_8:
+ gen.pop();
+ gen.swap(1);
+ gen.pop();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8;
+ break;
+
+ case kVDPixType_16F_LE:
+ targetType = kVDPixType_32F_LE;
+ goto type_reconvert;
+
+ case kVDPixType_32F_LE:
+ gen.conv_32F_to_8();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8;
+ break;
+
+ default:
+ targetType = kVDPixType_8_8_8;
+ goto type_reconvert;
+ }
+ break;
+
+ case kVDPixType_8_8_8:
+ switch(srcType) {
+ case kVDPixType_B8G8_R8G8:
+ gen.dup();
+ gen.dup();
+ gen.extract_8in32(2, (w + 1) >> 1, h);
+ gen.swap(2);
+ gen.extract_8in16(1, w, h);
+ gen.swap(1);
+ gen.extract_8in32(0, (w + 1) >> 1, h);
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSamp_Mask)) | kVDPixType_8_8_8 | kVDPixSamp_422;
+ break;
+ case kVDPixType_G8B8_G8R8:
+ gen.dup();
+ gen.dup();
+ gen.extract_8in32(3, (w + 1) >> 1, h);
+ gen.swap(2);
+ gen.extract_8in16(0, w, h);
+ gen.swap(1);
+ gen.extract_8in32(1, (w + 1) >> 1, h);
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSamp_Mask)) | kVDPixType_8_8_8 | kVDPixSamp_422;
+ break;
+ case kVDPixType_16F_16F_16F_LE:
+ case kVDPixType_V210:
+ targetType = kVDPixType_32F_32F_32F_LE;
+ goto type_reconvert;
+ case kVDPixType_32F_32F_32F_LE:
+ // 0 1 2
+ gen.conv_32F_to_8();
+ gen.swap(1);
+ // 1 0 2
+ gen.conv_32F_to_8();
+ gen.swap(2);
+ // 2 0 1
+ gen.conv_32F_to_8();
+ gen.swap(2);
+ gen.swap(1);
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8_8_8;
+ break;
+ case kVDPixType_8_B8R8:
+ {
+ const VDPixmapSamplingInfo& sampInfo = VDPixmapGetSamplingInfo(srcToken);
+ int cw = -(-w >> sampInfo.mCXBits);
+ int ch = -(-h >> sampInfo.mCYBits);
+
+ gen.dup();
+ gen.extract_8in16(1, cw, ch);
+ gen.swap(2);
+ gen.swap(1);
+ gen.extract_8in16(0, cw, ch);
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8_8_8;
+ }
+ break;
+ default:
+ VDASSERT(false);
+ break;
+ }
+ break;
+
+ case kVDPixType_B8G8_R8G8:
+ switch(srcType) {
+ case kVDPixType_8_8_8:
+ if ((srcToken ^ dstToken) & kVDPixSamp_Mask)
+ srcToken = BlitterConvertSampling(gen, srcToken, dstToken, w, h);
+
+ gen.interleave_B8G8_R8G8();
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSamp_Mask)) | kVDPixType_B8G8_R8G8;
+ break;
+ case kVDPixType_G8B8_G8R8:
+ gen.swap_8in16(w, h, w*2);
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSamp_Mask)) | kVDPixType_B8G8_R8G8;
+ break;
+ default:
+ targetType = kVDPixType_8_8_8;
+ goto type_reconvert;
+ }
+ break;
+
+ case kVDPixType_G8B8_G8R8:
+ switch(srcType) {
+ case kVDPixType_8_8_8:
+ if ((srcToken ^ dstToken) & kVDPixSamp_Mask)
+ srcToken = BlitterConvertSampling(gen, srcToken, dstToken, w, h);
+
+ gen.interleave_G8B8_G8R8();
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSamp_Mask)) | kVDPixType_G8B8_G8R8;
+ break;
+ case kVDPixType_B8G8_R8G8:
+ gen.swap_8in16(w, h, w*2);
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSamp_Mask)) | kVDPixType_G8B8_G8R8;
+ break;
+ default:
+ targetType = kVDPixType_8_8_8;
+ goto type_reconvert;
+ }
+ break;
+
+ case kVDPixType_16F_16F_16F_LE:
+ switch(srcType) {
+ case kVDPixType_32F_32F_32F_LE:
+ // 0 1 2
+ gen.conv_32F_to_16F();
+ gen.swap(1);
+ // 1 0 2
+ gen.conv_32F_to_16F();
+ gen.swap(2);
+ // 2 0 1
+ gen.conv_32F_to_16F();
+ gen.swap(2);
+ gen.swap(1);
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_16F_16F_16F_LE;
+ break;
+
+ default:
+ targetType = kVDPixType_32F_32F_32F_LE;
+ goto type_reconvert;
+ }
+ break;
+
+ case kVDPixType_32F_32F_32F_LE:
+ switch(srcType) {
+ case kVDPixType_8_8_8:
+ // 0 1 2
+ gen.conv_8_to_32F();
+ gen.swap(1);
+ // 1 0 2
+ gen.conv_8_to_32F();
+ gen.swap(2);
+ // 2 0 1
+ gen.conv_8_to_32F();
+ gen.swap(2);
+ gen.swap(1);
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_32F_32F_32F_LE;
+ break;
+
+ case kVDPixType_16F_16F_16F_LE:
+ // 0 1 2
+ gen.conv_16F_to_32F();
+ gen.swap(1);
+ // 1 0 2
+ gen.conv_16F_to_32F();
+ gen.swap(2);
+ // 2 0 1
+ gen.conv_16F_to_32F();
+ gen.swap(2);
+ gen.swap(1);
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_32F_32F_32F_LE;
+ break;
+
+ case kVDPixType_B8G8_R8G8:
+ case kVDPixType_G8B8_G8R8:
+ case kVDPixType_8_B8R8:
+ targetType = kVDPixType_8_8_8;
+ goto type_reconvert;
+
+ case kVDPixType_V210:
+ gen.conv_V210_to_32F();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_32F_32F_32F_LE;
+ break;
+
+ default:
+ VDASSERT(false);
+ }
+ break;
+
+ case kVDPixType_V210:
+ switch(srcType) {
+ case kVDPixType_32F_32F_32F_LE:
+ if ((srcToken & kVDPixSamp_Mask) != kVDPixSamp_422)
+ srcToken = BlitterConvertSampling(gen, srcToken, kVDPixSamp_422, w, h);
+
+ gen.conv_32F_to_V210();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_V210;
+ break;
+
+ case kVDPixType_16F_16F_16F_LE:
+ targetType = kVDPixType_32F_32F_32F_LE;
+ goto type_reconvert;
+
+ case kVDPixType_8_8_8:
+ if ((srcToken & kVDPixSamp_Mask) != kVDPixSamp_422)
+ srcToken = BlitterConvertSampling(gen, srcToken, kVDPixSamp_422, w, h);
+
+ targetType = kVDPixType_32F_32F_32F_LE;
+ goto type_reconvert;
+
+ case kVDPixType_B8G8_R8G8:
+ case kVDPixType_G8B8_G8R8:
+ case kVDPixType_8_B8R8:
+ targetType = kVDPixType_8_8_8;
+ goto type_reconvert;
+
+ default:
+ VDASSERT(false);
+ }
+ break;
+
+ case kVDPixType_32F_LE:
+ switch(srcType) {
+ case kVDPixType_8:
+ gen.conv_8_to_32F();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_32F_LE;
+ break;
+ case kVDPixType_16F_LE:
+ gen.conv_16F_to_32F();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_32F_LE;
+ break;
+ default:
+ VDASSERT(false);
+ }
+ break;
+
+ case kVDPixType_8_B8R8:
+ switch(srcType) {
+ case kVDPixType_8_8_8:
+ gen.swap(1);
+ gen.swap(2);
+ gen.interleave_B8R8();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8_B8R8;
+ break;
+ default:
+ VDASSERT(false);
+ break;
+ }
+ break;
+
+ default:
+ VDASSERT(false);
+ break;
+ }
+ }
+
+ return srcToken;
+ }
+}
+
+IVDPixmapBlitter *VDPixmapCreateBlitter(const VDPixmap& dst, const VDPixmap& src) {
+ const VDPixmapLayout& dstlayout = VDPixmapToLayoutFromBase(dst, dst.data);
+ const VDPixmapLayout& srclayout = VDPixmapToLayoutFromBase(src, src.data);
+
+ return VDPixmapCreateBlitter(dstlayout, srclayout);
+}
+
+IVDPixmapBlitter *VDPixmapCreateBlitter(const VDPixmapLayout& dst, const VDPixmapLayout& src) {
+ if (src.format == dst.format) {
+ return VDCreatePixmapUberBlitterDirectCopy(dst, src);
+ }
+
+ uint32 srcToken = VDPixmapGetFormatTokenFromFormat(src.format);
+ uint32 dstToken = VDPixmapGetFormatTokenFromFormat(dst.format);
+
+ VDPixmapUberBlitterGenerator gen;
+
+ // load source channels
+ int w = src.w;
+ int h = src.h;
+
+ switch(srcToken & kVDPixType_Mask) {
+ case kVDPixType_1:
+ gen.ldsrc(0, 0, 0, 0, w, h, srcToken, (w + 7) >> 3);
+ break;
+
+ case kVDPixType_2:
+ gen.ldsrc(0, 0, 0, 0, w, h, srcToken, (w + 3) >> 2);
+ break;
+
+ case kVDPixType_4:
+ gen.ldsrc(0, 0, 0, 0, w, h, srcToken, (w + 1) >> 1);
+ break;
+
+ case kVDPixType_8:
+ gen.ldsrc(0, 0, 0, 0, w, h, srcToken, w);
+ break;
+
+ case kVDPixType_555_LE:
+ case kVDPixType_565_LE:
+ case kVDPixType_1555_LE:
+ gen.ldsrc(0, 0, 0, 0, w, h, srcToken, w*2);
+ break;
+
+ case kVDPixType_888:
+ gen.ldsrc(0, 0, 0, 0, w, h, srcToken, w*3);
+ break;
+
+ case kVDPixType_8888:
+ case kVDPixType_32F_LE:
+ gen.ldsrc(0, 0, 0, 0, w, h, srcToken, w*4);
+ break;
+
+ case kVDPixType_32Fx4_LE:
+ gen.ldsrc(0, 0, 0, 0, w, h, srcToken, w*16);
+ break;
+
+ case kVDPixType_B8G8_R8G8:
+ case kVDPixType_G8B8_G8R8:
+ gen.ldsrc(0, 0, 0, 0, w, h, srcToken, ((w + 1) & ~1)*2);
+ break;
+
+ case kVDPixType_8_8_8:
+ {
+ uint32 ytoken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8;
+ uint32 cbtoken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8;
+ uint32 crtoken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8;
+
+ const VDPixmapSamplingInfo& sampInfo = VDPixmapGetSamplingInfo(srcToken);
+
+ int cxbits = sampInfo.mCXBits;
+ int cybits = sampInfo.mCYBits;
+ int w2 = -(-w >> cxbits);
+ int h2 = -(-h >> cybits);
+ gen.ldsrc(0, 2, 0, 0, w2, h2, cbtoken, w2);
+ gen.ldsrc(0, 0, 0, 0, w, h, srcToken, w);
+ gen.ldsrc(0, 1, 0, 0, w2, h2, crtoken, w2);
+ }
+ break;
+
+ case kVDPixType_16F_16F_16F_LE:
+ {
+ uint32 ytoken = (srcToken & ~kVDPixType_Mask) | kVDPixType_16F_LE;
+ uint32 cbtoken = (srcToken & ~kVDPixType_Mask) | kVDPixType_16F_LE;
+ uint32 crtoken = (srcToken & ~kVDPixType_Mask) | kVDPixType_16F_LE;
+
+ const VDPixmapSamplingInfo& sampInfo = VDPixmapGetSamplingInfo(srcToken);
+
+ int cxbits = sampInfo.mCXBits;
+ int cybits = sampInfo.mCYBits;
+ int w2 = -(-w >> cxbits);
+ int h2 = -(-h >> cybits);
+ gen.ldsrc(0, 2, 0, 0, w2, h2, cbtoken, w2 * 2);
+ gen.ldsrc(0, 0, 0, 0, w, h, srcToken, w*2);
+ gen.ldsrc(0, 1, 0, 0, w2, h2, crtoken, w2 * 2);
+ }
+ break;
+
+ case kVDPixType_32F_32F_32F_LE:
+ {
+ uint32 ytoken = (srcToken & ~kVDPixType_Mask) | kVDPixType_32F_LE;
+ uint32 cbtoken = (srcToken & ~kVDPixType_Mask) | kVDPixType_32F_LE;
+ uint32 crtoken = (srcToken & ~kVDPixType_Mask) | kVDPixType_32F_LE;
+
+ const VDPixmapSamplingInfo& sampInfo = VDPixmapGetSamplingInfo(srcToken);
+
+ int cxbits = sampInfo.mCXBits;
+ int cybits = sampInfo.mCYBits;
+ int w2 = -(-w >> cxbits);
+ int h2 = -(-h >> cybits);
+ gen.ldsrc(0, 2, 0, 0, w2, h2, cbtoken, w2 * 4);
+ gen.ldsrc(0, 0, 0, 0, w, h, srcToken, w*4);
+ gen.ldsrc(0, 1, 0, 0, w2, h2, crtoken, w2 * 4);
+ }
+ break;
+
+ case kVDPixType_V210:
+ gen.ldsrc(0, 0, 0, 0, w, h, srcToken, ((w + 5) / 6) * 4);
+ break;
+
+ case kVDPixType_8_B8R8:
+ {
+ uint32 ytoken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8;
+ uint32 ctoken = (srcToken & ~kVDPixType_Mask) | kVDPixType_B8R8;
+
+ const VDPixmapSamplingInfo& sampInfo = VDPixmapGetSamplingInfo(srcToken);
+
+ int cxbits = sampInfo.mCXBits;
+ int cybits = sampInfo.mCYBits;
+ int w2 = -(-w >> cxbits);
+ int h2 = -(-h >> cybits);
+ gen.ldsrc(0, 0, 0, 0, w, h, srcToken, w);
+ gen.ldsrc(0, 1, 0, 0, w2, h2, ctoken, w2*2);
+ }
+ break;
+
+ default:
+ VDASSERT(false);
+ }
+
+ // check if we need a color space change
+ if ((srcToken ^ dstToken) & kVDPixSpace_Mask) {
+ // first, if we're dealing with an interleaved format, deinterleave it
+ switch(srcToken & kVDPixType_Mask) {
+ case kVDPixType_B8G8_R8G8:
+ gen.dup();
+ gen.dup();
+ gen.extract_8in32(2, (w + 1) >> 1, h);
+ gen.swap(2);
+ gen.extract_8in16(1, w, h);
+ gen.swap(1);
+ gen.extract_8in32(0, (w + 1) >> 1, h);
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8_8_8;
+ break;
+
+ case kVDPixType_G8B8_G8R8:
+ gen.dup();
+ gen.dup();
+ gen.extract_8in32(3, (w + 1) >> 1, h);
+ gen.swap(2);
+ gen.extract_8in16(0, w, h);
+ gen.swap(1);
+ gen.extract_8in32(1, (w + 1) >> 1, h);
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8_8_8;
+ break;
+
+ case kVDPixType_8_B8R8:
+ gen.dup();
+ gen.extract_8in16(1, (w + 1) >> 1, (h + 1) >> 1);
+ gen.swap(2);
+ gen.swap(1);
+ gen.extract_8in16(0, (w + 1) >> 1, (h + 1) >> 1);
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_8_8_8;
+ break;
+
+ case kVDPixType_V210:
+ gen.conv_V210_to_32F();
+ srcToken = (srcToken & ~kVDPixType_Mask) | kVDPixType_32F_32F_32F_LE;
+ break;
+ }
+
+ // if the source is subsampled, converge on 4:4:4 subsampling, but only if we actually need
+ // the auxiliary channels
+ const VDPixmapSamplingInfo& sampInfo = VDPixmapGetSamplingInfo(srcToken);
+
+ if ((dstToken & kVDPixSpace_Mask) != kVDPixSpace_Y_601 && (dstToken & kVDPixSpace_Mask) != kVDPixSpace_Y_709) {
+ if (sampInfo.mCXBits | sampInfo.mCYBits | sampInfo.mCXOffset16 | sampInfo.mCbYOffset16 | sampInfo.mCrYOffset16)
+ srcToken = BlitterConvertSampling(gen, srcToken, kVDPixSamp_444, w, h);
+ }
+
+ // change color spaces
+ uint32 dstSpace = dstToken & kVDPixSpace_Mask;
+ while((srcToken ^ dstToken) & kVDPixSpace_Mask) {
+ uint32 srcSpace = srcToken & kVDPixSpace_Mask;
+ uint32 targetSpace = dstSpace;
+
+space_reconvert:
+ switch(targetSpace) {
+ case kVDPixSpace_BGR:
+ switch(srcSpace) {
+ case kVDPixSpace_YCC_709:
+ switch(srcToken & kVDPixType_Mask) {
+ case kVDPixType_8_8_8:
+ gen.ycbcr709_to_rgb32();
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_BGR | kVDPixType_8888;
+ break;
+
+ case kVDPixType_16F_16F_16F_LE:
+ srcToken = BlitterConvertType(gen, srcToken, kVDPixType_32F_32F_32F_LE, w, h);
+ gen.ycbcr709_to_rgb32_32f();
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_BGR | kVDPixType_32Fx4_LE;
+ break;
+
+ case kVDPixType_32F_32F_32F_LE:
+ gen.ycbcr709_to_rgb32_32f();
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_BGR | kVDPixType_32Fx4_LE;
+ break;
+
+ default:
+ VDASSERT(false);
+ break;
+ }
+ break;
+
+ case kVDPixSpace_YCC_601:
+ switch(srcToken & kVDPixType_Mask) {
+ case kVDPixType_8_8_8:
+ gen.ycbcr601_to_rgb32();
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_BGR | kVDPixType_8888;
+ break;
+
+ case kVDPixType_16F_16F_16F_LE:
+ srcToken = BlitterConvertType(gen, srcToken, kVDPixType_32F_32F_32F_LE, w, h);
+ gen.ycbcr601_to_rgb32_32f();
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_BGR | kVDPixType_32Fx4_LE;
+ break;
+
+ case kVDPixType_32F_32F_32F_LE:
+ gen.ycbcr601_to_rgb32_32f();
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_BGR | kVDPixType_32Fx4_LE;
+ break;
+
+ default:
+ VDASSERT(false);
+ break;
+ }
+ break;
+
+ case kVDPixSpace_Y_601:
+ targetSpace = kVDPixSpace_YCC_601;
+ goto space_reconvert;
+
+ case kVDPixSpace_Pal:
+ switch(srcToken & kVDPixType_Mask) {
+ case kVDPixType_1:
+ gen.conv_Pal1_to_8888(0);
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_BGR | kVDPixType_8888;
+ break;
+
+ case kVDPixType_2:
+ gen.conv_Pal2_to_8888(0);
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_BGR | kVDPixType_8888;
+ break;
+
+ case kVDPixType_4:
+ gen.conv_Pal4_to_8888(0);
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_BGR | kVDPixType_8888;
+ break;
+
+ case kVDPixType_8:
+ gen.conv_Pal8_to_8888(0);
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_BGR | kVDPixType_8888;
+ break;
+
+ default:
+ VDASSERT(false);
+ break;
+ }
+ break;
+
+ default:
+ VDASSERT(false);
+ break;
+ }
+ break;
+ case kVDPixSpace_Y_601:
+ if (srcSpace == kVDPixSpace_YCC_601) {
+ gen.pop();
+ gen.swap(1);
+ gen.pop();
+ switch(srcToken & kVDPixType_Mask) {
+ case kVDPixType_32F_32F_32F_LE:
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_Y_601 | kVDPixType_32F_LE;
+ break;
+ case kVDPixType_16F_16F_16F_LE:
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_Y_601 | kVDPixType_16F_LE;
+ break;
+ case kVDPixType_8_8_8:
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_Y_601 | kVDPixType_8;
+ break;
+
+ default:
+ VDASSERT(false);
+ }
+ srcToken = BlitterConvertType(gen, srcToken, kVDPixType_8, w, h);
+ break;
+ } else if (srcSpace == kVDPixSpace_YCC_709) {
+ gen.pop();
+ gen.swap(1);
+ gen.pop();
+ switch(srcToken & kVDPixType_Mask) {
+ case kVDPixType_32F_32F_32F_LE:
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_Y_709 | kVDPixType_32F_LE;
+ break;
+ case kVDPixType_16F_16F_16F_LE:
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_Y_709 | kVDPixType_16F_LE;
+ break;
+ case kVDPixType_8_8_8:
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_Y_709 | kVDPixType_8;
+ break;
+
+ default:
+ VDASSERT(false);
+ }
+ srcToken = BlitterConvertType(gen, srcToken, kVDPixType_8, w, h);
+ break;
+ }
+ // fall through
+ case kVDPixSpace_YCC_601:
+ switch(srcSpace) {
+ case kVDPixSpace_BGR:
+ srcToken = BlitterConvertType(gen, srcToken, kVDPixType_8888, w, h);
+ gen.rgb32_to_ycbcr601();
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_YCC_601 | kVDPixType_8_8_8;
+ break;
+ case kVDPixSpace_Y_601:
+ case kVDPixSpace_Y_709:
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_YCC_601 | kVDPixType_8;
+
+ {
+ const VDPixmapSamplingInfo& sinfo = VDPixmapGetSamplingInfo(dstToken);
+ int cw = ((w - 1) >> sinfo.mCXBits) + 1;
+ int ch = ((h - 1) >> sinfo.mCYBits) + 1;
+
+ gen.ldconst(0x80, cw, cw, ch, srcToken);
+ }
+
+ gen.dup();
+ gen.swap(2);
+ gen.swap(1);
+ srcToken = kVDPixSpace_YCC_601 | kVDPixType_8_8_8 | (dstToken & kVDPixSamp_Mask);
+ break;
+ case kVDPixSpace_YCC_709:
+ VDASSERT((srcToken & kVDPixType_Mask) == kVDPixType_8_8_8);
+ gen.ycbcr709_to_ycbcr601();
+ srcToken = (srcToken & ~kVDPixSpace_Mask) | kVDPixSpace_YCC_601;
+ break;
+
+ case kVDPixSpace_Pal:
+ targetSpace = kVDPixSpace_BGR;
+ goto space_reconvert;
+
+ default:
+ VDASSERT(false);
+ break;
+ }
+ break;
+ case kVDPixSpace_YCC_709:
+ switch(srcSpace) {
+ case kVDPixSpace_BGR:
+ srcToken = BlitterConvertType(gen, srcToken, kVDPixType_8888, w, h);
+ gen.rgb32_to_ycbcr709();
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_YCC_709 | kVDPixType_8_8_8;
+ break;
+ case kVDPixSpace_Y_709:
+ case kVDPixSpace_Y_601:
+ srcToken = (srcToken & ~(kVDPixType_Mask | kVDPixSpace_Mask)) | kVDPixSpace_YCC_709 | kVDPixType_8;
+
+ {
+ const VDPixmapSamplingInfo& sinfo = VDPixmapGetSamplingInfo(dstToken);
+ int cw = ((w - 1) >> sinfo.mCXBits) + 1;
+ int ch = ((h - 1) >> sinfo.mCYBits) + 1;
+
+ gen.ldconst(0x80, cw, cw, ch, srcToken);
+ }
+
+ gen.dup();
+ gen.swap(2);
+ gen.swap(1);
+ srcToken = kVDPixSpace_YCC_709 | kVDPixType_8_8_8 | (dstToken & kVDPixSamp_Mask);
+ break;
+ case kVDPixSpace_YCC_601:
+ VDASSERT((srcToken & kVDPixType_Mask) == kVDPixType_8_8_8 || (srcToken & kVDPixType_Mask) == kVDPixType_32F_32F_32F_LE);
+ gen.ycbcr601_to_ycbcr709();
+ srcToken = (srcToken & ~kVDPixSpace_Mask) | kVDPixSpace_YCC_709;
+ break;
+ case kVDPixSpace_Pal:
+ targetSpace = kVDPixSpace_BGR;
+ goto space_reconvert;
+ default:
+ VDASSERT(false);
+ break;
+ }
+ break;
+
+ default:
+ VDASSERT(false);
+ break;
+ }
+ }
+ }
+
+ // check if we need a type change
+ //
+ // Note: If the sampling is also different, we have to be careful about what types we
+ // target. The type conversion may itself involve a sampling conversion, so things get
+ // VERY tricky here.
+ if ((srcToken ^ dstToken) & kVDPixType_Mask) {
+ bool samplingDifferent = 0 != ((srcToken ^ dstToken) & kVDPixSamp_Mask);
+ uint32 intermediateTypeToken = dstToken & kVDPixType_Mask;
+
+ if (samplingDifferent) {
+ switch(dstToken & kVDPixType_Mask) {
+ case kVDPixType_16F_16F_16F_LE:
+ intermediateTypeToken = kVDPixType_32F_32F_32F_LE;
+ break;
+ case kVDPixType_8_B8R8:
+ intermediateTypeToken = kVDPixType_8_8_8;
+ break;
+ }
+ }
+
+ srcToken = BlitterConvertType(gen, srcToken, (dstToken & ~kVDPixType_Mask) | intermediateTypeToken, w, h);
+ }
+
+ // convert subsampling if necessary
+ switch(srcToken & kVDPixType_Mask) {
+ case kVDPixType_8_8_8:
+ case kVDPixType_16F_16F_16F_LE:
+ case kVDPixType_32F_32F_32F_LE:
+ if ((srcToken ^ dstToken) & kVDPixSamp_Mask)
+ srcToken = BlitterConvertSampling(gen, srcToken, dstToken, w, h);
+ break;
+ }
+
+ // check if we need a type change (possible with 16F)
+ srcToken = BlitterConvertType(gen, srcToken, dstToken, w, h);
+
+ return gen.create();
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/uberblit_16f.cpp b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_16f.cpp
new file mode 100644
index 000000000..3e9af1a1b
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_16f.cpp
@@ -0,0 +1,40 @@
+#include <vd2/system/halffloat.h>
+#include "uberblit_16f.h"
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Row generator: converts one plane of 32-bit LE floats to 16-bit half-floats.
+void VDPixmapGen_32F_To_16F::Start() {
+	// Output row buffer holds one 16-bit half per pixel.
+	StartWindow(mWidth * sizeof(uint16));
+}
+
+uint32 VDPixmapGen_32F_To_16F::GetType(uint32 output) const {
+	// Same space/sampling as the source; only the type field is rewritten.
+	return (mpSrc->GetType(mSrcIndex) & ~kVDPixType_Mask) | kVDPixType_16F_LE;
+}
+
+void VDPixmapGen_32F_To_16F::Compute(void *dst0, sint32 y) {
+	uint16 *dst = (uint16 *)dst0;
+	const float *src = (const float *)mpSrc->GetRow(y, mSrcIndex);
+	uint32 w = mWidth;
+
+	// NOTE(review): src is passed un-dereferenced, so VDConvertFloatToHalf
+	// presumably takes a pointer to the source float — confirm against
+	// vd2/system/halffloat.h.
+	for(uint32 i=0; i<w; ++i)
+		*dst++ = VDConvertFloatToHalf(src++);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Row generator: converts one plane of 16-bit half-floats back to 32-bit floats.
+void VDPixmapGen_16F_To_32F::Start() {
+	StartWindow(mWidth * sizeof(float));
+}
+
+uint32 VDPixmapGen_16F_To_32F::GetType(uint32 output) const {
+	return (mpSrc->GetType(mSrcIndex) & ~kVDPixType_Mask) | kVDPixType_32F_LE;
+}
+
+void VDPixmapGen_16F_To_32F::Compute(void *dst0, sint32 y) {
+	float *dst = (float *)dst0;
+	const uint16 *src = (const uint16 *)mpSrc->GetRow(y, mSrcIndex);
+	uint32 w = mWidth;
+
+	// VDConvertHalfToFloat writes the expanded value through dst.
+	for(uint32 i=0; i<w; ++i)
+		VDConvertHalfToFloat(*src++, dst++);
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/uberblit_gen.cpp b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_gen.cpp
new file mode 100644
index 000000000..f93ca322e
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_gen.cpp
@@ -0,0 +1,1597 @@
+#include <vd2/system/vdalloc.h>
+#include <vd2/Kasumi/pixmaputils.h>
+#include "uberblit.h"
+#include "uberblit_gen.h"
+#include "uberblit_fill.h"
+#include "uberblit_input.h"
+#include "uberblit_resample.h"
+#include "uberblit_resample_special.h"
+#include "uberblit_ycbcr.h"
+#include "uberblit_rgb.h"
+#include "uberblit_swizzle.h"
+#include "uberblit_pal.h"
+#include "uberblit_16f.h"
+#include "uberblit_v210.h"
+
+#ifdef VD_CPU_X86
+ #include "uberblit_swizzle_x86.h"
+ #include "uberblit_ycbcr_x86.h"
+ #include "uberblit_rgb_x86.h"
+ #include "uberblit_resample_special_x86.h"
+#endif
+
+// Pulls 'height' rows from generator output 'genIndex' and copies 'bpr' bytes
+// of each into dst, stepping by 'pitch' (pitch may be negative for flipped
+// surfaces).
+void VDPixmapGenerate(void *dst, ptrdiff_t pitch, sint32 bpr, sint32 height, IVDPixmapGen *gen, int genIndex) {
+	for(sint32 y=0; y<height; ++y) {
+		memcpy(dst, gen->GetRow(y, genIndex), bpr);
+		vdptrstep(dst, pitch);
+	}
+	// Reset CPU extension state (e.g. MMX) that generator kernels may have dirtied.
+	VDCPUCleanupExtensions();
+}
+
+// Like VDPixmapGenerate, but lets the generator write each row directly into
+// the destination, skipping the intermediate row-cache copy.
+void VDPixmapGenerateFast(void *dst, ptrdiff_t pitch, sint32 height, IVDPixmapGen *gen) {
+	for(sint32 y=0; y<height; ++y) {
+		gen->ProcessRow(dst, y);
+		vdptrstep(dst, pitch);
+	}
+	VDCPUCleanupExtensions();
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Factory: same-format blits need no conversion pipeline, just a plane copy.
+// The dst/src arguments are accepted for interface symmetry and ignored.
+IVDPixmapBlitter *VDCreatePixmapUberBlitterDirectCopy(const VDPixmap& dst, const VDPixmap& src) {
+	return new VDPixmapUberBlitterDirectCopy;
+}
+
+// Layout-based overload of the factory above.
+IVDPixmapBlitter *VDCreatePixmapUberBlitterDirectCopy(const VDPixmapLayout& dst, const VDPixmapLayout& src) {
+	return new VDPixmapUberBlitterDirectCopy;
+}
+
+VDPixmapUberBlitterDirectCopy::VDPixmapUberBlitterDirectCopy() {
+}
+
+VDPixmapUberBlitterDirectCopy::~VDPixmapUberBlitterDirectCopy() {
+}
+
+// Full-surface blit: forwards with a null destination rectangle.
+void VDPixmapUberBlitterDirectCopy::Blit(const VDPixmap& dst, const VDPixmap& src) {
+	Blit(dst, NULL, src);
+}
+
+// Plane-by-plane memcpy blit between two pixmaps of identical format.
+// If rDst is given, only that destination subrectangle is written; the
+// rectangle must respect the format's chroma subsampling (asserted below).
+void VDPixmapUberBlitterDirectCopy::Blit(const VDPixmap& dst, const vdrect32 *rDst, const VDPixmap& src) {
+	VDASSERT(dst.format == src.format);
+
+	const VDPixmapFormatInfo& formatInfo = VDPixmapGetInfo(dst.format);
+
+	void *p = dst.data;
+	void *p2 = dst.data2;
+	void *p3 = dst.data3;
+	int w = dst.w;
+	int h = dst.h;
+
+	// Chunky formats (e.g. packed YUV) are measured in quantum units.
+	if (formatInfo.qchunky) {
+		w = (w + formatInfo.qw - 1) / formatInfo.qw;
+		h = -(-h >> formatInfo.qhbits);
+	}
+
+	// Auxiliary (chroma) plane size: ceil-divide by the subsampling factors.
+	int w2 = -(-dst.w >> formatInfo.auxwbits);
+	int h2 = -(-dst.h >> formatInfo.auxhbits);
+
+	if (rDst) {
+		int x1 = rDst->left;
+		int y1 = rDst->top;
+		int x2 = rDst->right;
+		int y2 = rDst->bottom;
+
+		VDASSERT(x1 >= 0 && y1 >= 0 && x2 <= w && y2 <= h && x2 >= x1 && y2 >= y1);
+
+		if (x2 < x1 || y2 < y1)
+			return;
+
+		// NOTE(review): src.data is used un-offset below, so the source is
+		// always read from its top-left corner even when rDst offsets the
+		// destination — confirm this is the intended semantic.
+		p = vdptroffset(dst.data, dst.pitch * y1 + x1 * formatInfo.qsize);
+		w = x2 - x1;
+		h = y2 - y1;
+
+		if (formatInfo.auxbufs >= 1) {
+			// Rectangle edges must fall on chroma-sample boundaries.
+			VDASSERT(!((x1|x2) & ((1 << formatInfo.auxwbits) - 1)));
+			VDASSERT(!((y1|y2) & ((1 << formatInfo.auxhbits) - 1)));
+
+			int ax1 = x1 >> formatInfo.auxwbits;
+			int ay1 = y1 >> formatInfo.auxhbits;
+			int ax2 = x2 >> formatInfo.auxwbits;
+			int ay2 = y2 >> formatInfo.auxhbits;
+
+			p2 = vdptroffset(dst.data2, dst.pitch2 * ay1 + ax1);
+			w2 = ax2 - ax1;
+			h2 = ay2 - ay1;
+
+			if (formatInfo.auxbufs >= 2)
+				p3 = vdptroffset(dst.data3, dst.pitch3 * ay1 + ax1);
+		}
+	}
+
+	uint32 bpr = formatInfo.qsize * w;
+
+	VDMemcpyRect(p, dst.pitch, src.data, src.pitch, bpr, h);
+
+	if (formatInfo.auxbufs >= 1) {
+		VDMemcpyRect(p2, dst.pitch2, src.data2, src.pitch2, w2 * formatInfo.auxsize, h2);
+
+		if (formatInfo.auxbufs >= 2)
+			VDMemcpyRect(p3, dst.pitch3, src.data3, src.pitch3, w2 * formatInfo.auxsize, h2);
+	}
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+VDPixmapUberBlitter::VDPixmapUberBlitter() {
+}
+
+VDPixmapUberBlitter::~VDPixmapUberBlitter() {
+	// The blitter owns every generator node in its pipeline.
+	while(!mGenerators.empty()) {
+		delete mGenerators.back();
+		mGenerators.pop_back();
+	}
+}
+
+// Full-surface blit: forwards with a null destination rectangle.
+void VDPixmapUberBlitter::Blit(const VDPixmap& dst, const VDPixmap& src) {
+	Blit(dst, NULL, src);
+}
+
+// Binds each registered source entry to the matching plane of 'src', then
+// dispatches to the 1/2/3-plane emission routine depending on how many
+// outputs the pipeline produced and whether the planes are independent.
+void VDPixmapUberBlitter::Blit(const VDPixmap& dst, const vdrect32 *rDst, const VDPixmap& src) {
+	for(Sources::const_iterator it(mSources.begin()), itEnd(mSources.end()); it!=itEnd; ++it) {
+		const SourceEntry& se = *it;
+		const void *p;
+		ptrdiff_t pitch;
+
+		switch(se.mSrcPlane) {
+		case 0:
+			p = src.data;
+			pitch = src.pitch;
+			break;
+		case 1:
+			p = src.data2;
+			pitch = src.pitch2;
+			break;
+		case 2:
+			p = src.data3;
+			pitch = src.pitch3;
+			break;
+		default:
+			// NOTE(review): in release builds VDASSERT compiles out and
+			// p/pitch would be used uninitialized below — confirm mSrcPlane
+			// is always 0-2 by construction.
+			VDASSERT(false);
+			break;
+		}
+
+		se.mpSrc->SetSource((const char *)p + pitch*se.mSrcY + se.mSrcX, pitch, src.palette);
+	}
+
+	if (mOutputs[2].mpSrc) {
+		if (mbIndependentPlanes)
+			Blit3Separated(dst, rDst);
+		else if (mbIndependentChromaPlanes)
+			Blit3Split(dst, rDst);
+		else
+			Blit3(dst, rDst);
+	} else if (mOutputs[1].mpSrc) {
+		if (mbIndependentPlanes)
+			Blit2Separated(dst, rDst);
+		else
+			Blit2(dst, rDst);
+	} else
+		Blit(dst, rDst);
+}
+
+// Single-plane emission: runs the pipeline's sole output into dst, optionally
+// restricted to rDst (scaled to quantum units for chunky formats).
+void VDPixmapUberBlitter::Blit(const VDPixmap& dst, const vdrect32 *rDst) {
+	const VDPixmapFormatInfo& formatInfo = VDPixmapGetInfo(dst.format);
+
+	mOutputs[0].mpSrc->AddWindowRequest(0, 0);
+	mOutputs[0].mpSrc->Start();
+
+	void *p = dst.data;
+	int w = dst.w;
+	int h = dst.h;
+
+	if (formatInfo.qchunky) {
+		w = (w + formatInfo.qw - 1) / formatInfo.qw;
+		h = -(-h >> formatInfo.qhbits);
+	}
+
+	if (rDst) {
+		int x1 = rDst->left;
+		int y1 = rDst->top;
+		int x2 = rDst->right;
+		int y2 = rDst->bottom;
+
+		// Round the rectangle outward to whole quanta for chunky formats.
+		if (formatInfo.qchunky) {
+			x1 = x1 / formatInfo.qw;
+			y1 = y1 / formatInfo.qh;
+			x2 = (x2 + formatInfo.qw - 1) / formatInfo.qw;
+			y2 = (y2 + formatInfo.qh - 1) / formatInfo.qh;
+		}
+
+		VDASSERT(x1 >= 0 && y1 >= 0 && x2 <= w && y2 <= h && x2 >= x1 && y2 >= y1);
+
+		if (x2 < x1 || y2 < y1)
+			return;
+
+		p = vdptroffset(dst.data, dst.pitch * y1 + x1 * formatInfo.qsize);
+		w = x2 - x1;
+		h = y2 - y1;
+	}
+
+	uint32 bpr = formatInfo.qsize * w;
+
+	// Output index 0 means the generator can write rows in place (fast path);
+	// otherwise rows come out of the generator's row cache and are copied.
+	if (mOutputs[0].mSrcIndex == 0)
+		VDPixmapGenerateFast(p, dst.pitch, h, mOutputs[0].mpSrc);
+	else
+		VDPixmapGenerate(p, dst.pitch, bpr, h, mOutputs[0].mpSrc, mOutputs[0].mSrcIndex);
+}
+
+// Three-plane emission with interlocked generators: all three outputs are
+// pulled in step, advancing the chroma rows only when the fixed-point
+// accumulator wraps. rDst is currently unused here.
+void VDPixmapUberBlitter::Blit3(const VDPixmap& px, const vdrect32 *rDst) {
+	const VDPixmapFormatInfo& formatInfo = VDPixmapGetInfo(px.format);
+	// NOTE(review): output-index mapping is 1 -> data, 2 -> data2, 0 -> data3;
+	// looks intentional but confirm against how the generator pushes outputs.
+	IVDPixmapGen *gen = mOutputs[1].mpSrc;
+	int idx = mOutputs[1].mSrcIndex;
+	IVDPixmapGen *gen1 = mOutputs[2].mpSrc;
+	int idx1 = mOutputs[2].mSrcIndex;
+	IVDPixmapGen *gen2 = mOutputs[0].mpSrc;
+	int idx2 = mOutputs[0].mSrcIndex;
+
+	gen->AddWindowRequest(0, 0);
+	gen->Start();
+	gen1->AddWindowRequest(0, 0);
+	gen1->Start();
+	gen2->AddWindowRequest(0, 0);
+	gen2->Start();
+
+	// auxaccum wraps to zero exactly once per (1 << auxhbits) rows; doubling
+	// the step makes auxhbits==0 degenerate to step 0, i.e. a chroma row is
+	// emitted for every luma row.
+	uint32 auxstep = 0x80000000UL >> formatInfo.auxhbits;
+	uint32 auxaccum = 0;
+
+	auxstep += auxstep;
+
+	int qw = px.w;
+	int qh = px.h;
+
+	if (formatInfo.qchunky) {
+		qw = (qw + formatInfo.qw - 1) / formatInfo.qw;
+		qh = -(-qh >> formatInfo.qhbits);
+	}
+
+	uint32 height = qh;
+	uint32 bpr = formatInfo.qsize * qw;
+	uint32 bpr2 = formatInfo.auxsize * -(-px.w >> formatInfo.auxwbits);
+	uint8 *dst = (uint8 *)px.data;
+	uint8 *dst2 = (uint8 *)px.data2;
+	uint8 *dst3 = (uint8 *)px.data3;
+	ptrdiff_t pitch = px.pitch;
+	ptrdiff_t pitch2 = px.pitch2;
+	ptrdiff_t pitch3 = px.pitch3;
+	uint32 y2 = 0;
+	for(uint32 y=0; y<height; ++y) {
+		memcpy(dst, gen->GetRow(y, idx), bpr);
+		vdptrstep(dst, pitch);
+
+		if (!auxaccum) {
+			memcpy(dst2, gen1->GetRow(y2, idx1), bpr2);
+			vdptrstep(dst2, pitch2);
+			memcpy(dst3, gen2->GetRow(y2, idx2), bpr2);
+			vdptrstep(dst3, pitch3);
+			++y2;
+		}
+
+		auxaccum += auxstep;
+	}
+
+	VDCPUCleanupExtensions();
+}
+
+// Three-plane emission where the primary plane is independent of the two
+// chroma planes: the primary plane is written in one pass, then the chroma
+// planes are pulled together using the auxaccum row stepper (see Blit3).
+void VDPixmapUberBlitter::Blit3Split(const VDPixmap& px, const vdrect32 *rDst) {
+	const VDPixmapFormatInfo& formatInfo = VDPixmapGetInfo(px.format);
+	IVDPixmapGen *gen = mOutputs[1].mpSrc;
+	int idx = mOutputs[1].mSrcIndex;
+	IVDPixmapGen *gen1 = mOutputs[2].mpSrc;
+	int idx1 = mOutputs[2].mSrcIndex;
+	IVDPixmapGen *gen2 = mOutputs[0].mpSrc;
+	int idx2 = mOutputs[0].mSrcIndex;
+
+	gen->AddWindowRequest(0, 0);
+	gen->Start();
+	gen1->AddWindowRequest(0, 0);
+	gen1->Start();
+	gen2->AddWindowRequest(0, 0);
+	gen2->Start();
+
+	uint32 auxstep = 0x80000000UL >> formatInfo.auxhbits;
+	uint32 auxaccum = 0;
+
+	auxstep += auxstep;
+
+	int qw = px.w;
+	int qh = px.h;
+
+	if (formatInfo.qchunky) {
+		qw = (qw + formatInfo.qw - 1) / formatInfo.qw;
+		qh = -(-qh >> formatInfo.qhbits);
+	}
+
+	uint32 height = qh;
+	uint32 bpr = formatInfo.qsize * qw;
+	uint8 *dst = (uint8 *)px.data;
+	ptrdiff_t pitch = px.pitch;
+
+	// Index 0 allows direct in-place row processing; otherwise copy from cache.
+	if (idx == 0) {
+		for(uint32 y=0; y<height; ++y) {
+			gen->ProcessRow(dst, y);
+			vdptrstep(dst, pitch);
+		}
+	} else {
+		for(uint32 y=0; y<height; ++y) {
+			memcpy(dst, gen->GetRow(y, idx), bpr);
+			vdptrstep(dst, pitch);
+		}
+	}
+
+	uint32 bpr2 = -(-px.w >> formatInfo.auxwbits) * formatInfo.auxsize;
+	uint8 *dst2 = (uint8 *)px.data2;
+	uint8 *dst3 = (uint8 *)px.data3;
+	ptrdiff_t pitch2 = px.pitch2;
+	ptrdiff_t pitch3 = px.pitch3;
+	uint32 y2 = 0;
+	for(uint32 y=0; y<height; ++y) {
+		if (!auxaccum) {
+			memcpy(dst2, gen1->GetRow(y2, idx1), bpr2);
+			vdptrstep(dst2, pitch2);
+			memcpy(dst3, gen2->GetRow(y2, idx2), bpr2);
+			vdptrstep(dst3, pitch3);
+			++y2;
+		}
+
+		auxaccum += auxstep;
+	}
+
+	VDCPUCleanupExtensions();
+}
+
+// Three-plane emission where all planes are fully independent: each plane is
+// emitted in its own pass, using ProcessRow when its output index is 0.
+void VDPixmapUberBlitter::Blit3Separated(const VDPixmap& px, const vdrect32 *rDst) {
+	const VDPixmapFormatInfo& formatInfo = VDPixmapGetInfo(px.format);
+	IVDPixmapGen *gen = mOutputs[1].mpSrc;
+	int idx = mOutputs[1].mSrcIndex;
+	IVDPixmapGen *gen1 = mOutputs[2].mpSrc;
+	int idx1 = mOutputs[2].mSrcIndex;
+	IVDPixmapGen *gen2 = mOutputs[0].mpSrc;
+	int idx2 = mOutputs[0].mSrcIndex;
+
+	gen->AddWindowRequest(0, 0);
+	gen->Start();
+	gen1->AddWindowRequest(0, 0);
+	gen1->Start();
+	gen2->AddWindowRequest(0, 0);
+	gen2->Start();
+
+	int qw = px.w;
+	int qh = px.h;
+
+	if (formatInfo.qchunky) {
+		qw = (qw + formatInfo.qw - 1) / formatInfo.qw;
+		qh = -(-qh >> formatInfo.qhbits);
+	}
+
+	uint32 height = qh;
+	uint32 bpr = formatInfo.qsize * qw;
+	uint8 *dst = (uint8 *)px.data;
+	ptrdiff_t pitch = px.pitch;
+
+	// Primary plane.
+	if (idx == 0) {
+		for(uint32 y=0; y<height; ++y) {
+			gen->ProcessRow(dst, y);
+			vdptrstep(dst, pitch);
+		}
+	} else {
+		for(uint32 y=0; y<height; ++y) {
+			memcpy(dst, gen->GetRow(y, idx), bpr);
+			vdptrstep(dst, pitch);
+		}
+	}
+
+	// Chroma planes: ceil-divided dimensions per the subsampling bits.
+	uint32 bpr2 = -(-px.w >> formatInfo.auxwbits) * formatInfo.auxsize;
+	uint32 h2 = -(-px.h >> formatInfo.auxhbits);
+	uint8 *dst2 = (uint8 *)px.data2;
+	ptrdiff_t pitch2 = px.pitch2;
+	if (idx1 == 0) {
+		for(uint32 y2=0; y2<h2; ++y2) {
+			gen1->ProcessRow(dst2, y2);
+			vdptrstep(dst2, pitch2);
+		}
+	} else {
+		for(uint32 y2=0; y2<h2; ++y2) {
+			memcpy(dst2, gen1->GetRow(y2, idx1), bpr2);
+			vdptrstep(dst2, pitch2);
+		}
+	}
+
+	uint8 *dst3 = (uint8 *)px.data3;
+	ptrdiff_t pitch3 = px.pitch3;
+	if (idx2 == 0) {
+		for(uint32 y2=0; y2<h2; ++y2) {
+			gen2->ProcessRow(dst3, y2);
+			vdptrstep(dst3, pitch3);
+		}
+	} else {
+		for(uint32 y2=0; y2<h2; ++y2) {
+			memcpy(dst3, gen2->GetRow(y2, idx2), bpr2);
+			vdptrstep(dst3, pitch3);
+		}
+	}
+
+	VDCPUCleanupExtensions();
+}
+
+// Two-plane emission with interlocked generators; the auxiliary plane row
+// advances once per (1 << auxhbits) primary rows via the auxaccum stepper
+// (see Blit3 for the stepping trick).
+void VDPixmapUberBlitter::Blit2(const VDPixmap& px, const vdrect32 *rDst) {
+	const VDPixmapFormatInfo& formatInfo = VDPixmapGetInfo(px.format);
+	IVDPixmapGen *gen = mOutputs[0].mpSrc;
+	int idx = mOutputs[0].mSrcIndex;
+	IVDPixmapGen *gen1 = mOutputs[1].mpSrc;
+	int idx1 = mOutputs[1].mSrcIndex;
+
+	gen->AddWindowRequest(0, 0);
+	gen->Start();
+	gen1->AddWindowRequest(0, 0);
+	gen1->Start();
+
+	uint32 auxstep = 0x80000000UL >> formatInfo.auxhbits;
+	uint32 auxaccum = 0;
+
+	auxstep += auxstep;
+
+	int qw = px.w;
+	int qh = px.h;
+
+	if (formatInfo.qchunky) {
+		qw = (qw + formatInfo.qw - 1) / formatInfo.qw;
+		qh = -(-qh >> formatInfo.qhbits);
+	}
+
+	uint32 height = qh;
+	uint32 bpr = formatInfo.qsize * qw;
+	uint32 bpr2 = formatInfo.auxsize * -(-px.w >> formatInfo.auxwbits);
+	uint8 *dst = (uint8 *)px.data;
+	uint8 *dst2 = (uint8 *)px.data2;
+	ptrdiff_t pitch = px.pitch;
+	ptrdiff_t pitch2 = px.pitch2;
+	uint32 y2 = 0;
+	for(uint32 y=0; y<height; ++y) {
+		memcpy(dst, gen->GetRow(y, idx), bpr);
+		vdptrstep(dst, pitch);
+
+		if (!auxaccum) {
+			memcpy(dst2, gen1->GetRow(y2, idx1), bpr2);
+			vdptrstep(dst2, pitch2);
+			++y2;
+		}
+
+		auxaccum += auxstep;
+	}
+
+	VDCPUCleanupExtensions();
+}
+
+// Two-plane emission where the planes are independent: each plane is written
+// in its own pass, using the in-place ProcessRow fast path when possible.
+void VDPixmapUberBlitter::Blit2Separated(const VDPixmap& px, const vdrect32 *rDst) {
+	const VDPixmapFormatInfo& formatInfo = VDPixmapGetInfo(px.format);
+	IVDPixmapGen *gen = mOutputs[0].mpSrc;
+	int idx = mOutputs[0].mSrcIndex;
+	IVDPixmapGen *gen1 = mOutputs[1].mpSrc;
+	int idx1 = mOutputs[1].mSrcIndex;
+
+	gen->AddWindowRequest(0, 0);
+	gen->Start();
+	gen1->AddWindowRequest(0, 0);
+	gen1->Start();
+
+	int qw = px.w;
+	int qh = px.h;
+
+	if (formatInfo.qchunky) {
+		qw = (qw + formatInfo.qw - 1) / formatInfo.qw;
+		qh = -(-qh >> formatInfo.qhbits);
+	}
+
+	uint32 height = qh;
+	uint32 bpr = formatInfo.qsize * qw;
+	uint8 *dst = (uint8 *)px.data;
+	ptrdiff_t pitch = px.pitch;
+
+	if (idx == 0) {
+		for(uint32 y=0; y<height; ++y) {
+			gen->ProcessRow(dst, y);
+			vdptrstep(dst, pitch);
+		}
+	} else {
+		for(uint32 y=0; y<height; ++y) {
+			memcpy(dst, gen->GetRow(y, idx), bpr);
+			vdptrstep(dst, pitch);
+		}
+	}
+
+	uint32 bpr2 = -(-px.w >> formatInfo.auxwbits) * formatInfo.auxsize;
+	uint32 h2 = -(-px.h >> formatInfo.auxhbits);
+	uint8 *dst2 = (uint8 *)px.data2;
+	ptrdiff_t pitch2 = px.pitch2;
+	if (idx1 == 0) {
+		for(uint32 y2=0; y2<h2; ++y2) {
+			gen1->ProcessRow(dst2, y2);
+			vdptrstep(dst2, pitch2);
+		}
+	} else {
+		for(uint32 y2=0; y2<h2; ++y2) {
+			memcpy(dst2, gen1->GetRow(y2, idx1), bpr2);
+			vdptrstep(dst2, pitch2);
+		}
+	}
+
+	VDCPUCleanupExtensions();
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+VDPixmapUberBlitterGenerator::VDPixmapUberBlitterGenerator() {
+}
+
+VDPixmapUberBlitterGenerator::~VDPixmapUberBlitterGenerator() {
+	// The generator owns every pipeline node it created until create() hands
+	// them off; any leftovers are destroyed here.
+	while(!mGenerators.empty()) {
+		delete mGenerators.back();
+		mGenerators.pop_back();
+	}
+}
+
+// Swaps the stack top with the entry 'index' slots below it (index 0 is a no-op).
+void VDPixmapUberBlitterGenerator::swap(int index) {
+	std::swap(mStack.back(), (&mStack.back())[-index]);
+}
+
+// Duplicates the stack top.
+void VDPixmapUberBlitterGenerator::dup() {
+	mStack.push_back(mStack.back());
+}
+
+// Discards the stack top.
+void VDPixmapUberBlitterGenerator::pop() {
+	mStack.pop_back();
+}
+
+// Pushes a raw source-plane reader and records a SourceEntry so the blitter
+// can bind the actual plane pointer at Blit() time.
+void VDPixmapUberBlitterGenerator::ldsrc(int srcIndex, int srcPlane, int x, int y, uint32 w, uint32 h, uint32 type, uint32 bpr) {
+	VDPixmapGenSrc *src = new VDPixmapGenSrc;
+
+	src->Init(w, h, type, bpr);
+
+	mGenerators.push_back(src);
+	mStack.push_back(StackEntry(src, 0));
+
+	SourceEntry se;
+	se.mpSrc = src;
+	se.mSrcIndex = srcIndex;
+	se.mSrcPlane = srcPlane;
+	se.mSrcX = x;
+	se.mSrcY = y;
+	mSources.push_back(se);
+}
+
+// Pushes a constant-fill plane (every byte set to 'fill').
+void VDPixmapUberBlitterGenerator::ldconst(uint8 fill, uint32 bpr, uint32 w, uint32 h, uint32 type) {
+	VDPixmapGenFill8 *src = new VDPixmapGenFill8;
+
+	src->Init(fill, bpr, w, h, type);
+
+	mGenerators.push_back(src);
+	mStack.push_back(StackEntry(src, 0));
+}
+
+// Replaces the stack top with a node extracting byte 'offset' of each 16-bit
+// unit (MMX-accelerated variants for offsets 0/1 where available).
+// NOTE(review): this file mixes `#if VD_CPU_X86` here with `#ifdef VD_CPU_X86`
+// at the top of the file; `#if` silently evaluates an undefined macro as 0 —
+// confirm VD_CPU_X86 is defined to a nonzero value.
+void VDPixmapUberBlitterGenerator::extract_8in16(int offset, uint32 w, uint32 h) {
+	StackEntry *args = &mStack.back();
+	VDPixmapGen_8In16 *src = NULL;
+
+#if VD_CPU_X86
+	if (MMX_enabled) {
+		if (offset == 0)
+			src = new VDPixmapGen_8In16_Even_MMX;
+		else if (offset == 1)
+			src = new VDPixmapGen_8In16_Odd_MMX;
+	}
+#endif
+	if (!src)
+		src = new VDPixmapGen_8In16;
+
+	src->Init(args[0].mpSrc, args[0].mSrcIndex, offset, w, h);
+
+	mGenerators.push_back(src);
+	MarkDependency(src, args[0].mpSrc);
+	args[0] = StackEntry(src, 0);
+}
+
+// Replaces the stack top with a node extracting byte 'offset' of each 32-bit
+// unit.
+void VDPixmapUberBlitterGenerator::extract_8in32(int offset, uint32 w, uint32 h) {
+	StackEntry *args = &mStack.back();
+	VDPixmapGen_8In32 *src = NULL;
+
+#if VD_CPU_X86
+	if (MMX_enabled) {
+		if ((unsigned)offset < 4)
+			src = new VDPixmapGen_8In32_MMX;
+	}
+#endif
+
+	if (!src)
+		src = new VDPixmapGen_8In32;
+
+	src->Init(args[0].mpSrc, args[0].mSrcIndex, offset, w, h);
+
+	mGenerators.push_back(src);
+	MarkDependency(src, args[0].mpSrc);
+	args[0] = StackEntry(src, 0);
+}
+
+// Replaces the stack top with a node that swaps the two bytes of every
+// 16-bit unit in the row.
+void VDPixmapUberBlitterGenerator::swap_8in16(uint32 w, uint32 h, uint32 bpr) {
+	StackEntry *args = &mStack.back();
+
+#if VD_CPU_X86
+	VDPixmapGen_Swap8In16 *src = MMX_enabled ? new VDPixmapGen_Swap8In16_MMX : new VDPixmapGen_Swap8In16;
+#else
+	VDPixmapGen_Swap8In16 *src = new VDPixmapGen_Swap8In16;
+#endif
+
+	src->Init(args[0].mpSrc, args[0].mSrcIndex, w, h, bpr);
+
+	mGenerators.push_back(src);
+	MarkDependency(src, args[0].mpSrc);
+	args[0] = StackEntry(src, 0);
+}
+
+// Replaces the stack top with a 1-bpp-palette -> X8R8G8B8 expander. The node
+// is also registered as a source entry (plane 0) — presumably so SetSource
+// can hand it the palette at Blit() time; confirm against SetSource usage.
+void VDPixmapUberBlitterGenerator::conv_Pal1_to_8888(int srcIndex) {
+	StackEntry *args = &mStack.back();
+	VDPixmapGen_Pal1_To_X8R8G8B8 *src = new VDPixmapGen_Pal1_To_X8R8G8B8;
+
+	src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+	mGenerators.push_back(src);
+	MarkDependency(src, args[0].mpSrc);
+	args[0] = StackEntry(src, 0);
+
+	SourceEntry se;
+	se.mpSrc = src;
+	se.mSrcIndex = srcIndex;
+	se.mSrcPlane = 0;
+	se.mSrcX = 0;
+	se.mSrcY = 0;
+	mSources.push_back(se);
+}
+
+// 2-bpp palette -> X8R8G8B8 (same pattern as conv_Pal1_to_8888).
+void VDPixmapUberBlitterGenerator::conv_Pal2_to_8888(int srcIndex) {
+	StackEntry *args = &mStack.back();
+	VDPixmapGen_Pal2_To_X8R8G8B8 *src = new VDPixmapGen_Pal2_To_X8R8G8B8;
+
+	src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+	mGenerators.push_back(src);
+	MarkDependency(src, args[0].mpSrc);
+	args[0] = StackEntry(src, 0);
+
+	SourceEntry se;
+	se.mpSrc = src;
+	se.mSrcIndex = srcIndex;
+	se.mSrcPlane = 0;
+	se.mSrcX = 0;
+	se.mSrcY = 0;
+	mSources.push_back(se);
+}
+
+// 4-bpp palette -> X8R8G8B8 (same pattern as conv_Pal1_to_8888).
+void VDPixmapUberBlitterGenerator::conv_Pal4_to_8888(int srcIndex) {
+	StackEntry *args = &mStack.back();
+	VDPixmapGen_Pal4_To_X8R8G8B8 *src = new VDPixmapGen_Pal4_To_X8R8G8B8;
+
+	src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+	mGenerators.push_back(src);
+	MarkDependency(src, args[0].mpSrc);
+	args[0] = StackEntry(src, 0);
+
+	SourceEntry se;
+	se.mpSrc = src;
+	se.mSrcIndex = srcIndex;
+	se.mSrcPlane = 0;
+	se.mSrcX = 0;
+	se.mSrcY = 0;
+	mSources.push_back(se);
+}
+
+// 8-bpp palette -> X8R8G8B8 (same pattern as conv_Pal1_to_8888).
+void VDPixmapUberBlitterGenerator::conv_Pal8_to_8888(int srcIndex) {
+	StackEntry *args = &mStack.back();
+	VDPixmapGen_Pal8_To_X8R8G8B8 *src = new VDPixmapGen_Pal8_To_X8R8G8B8;
+
+	src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+	mGenerators.push_back(src);
+	MarkDependency(src, args[0].mpSrc);
+	args[0] = StackEntry(src, 0);
+
+	SourceEntry se;
+	se.mpSrc = src;
+	se.mSrcIndex = srcIndex;
+	se.mSrcPlane = 0;
+	se.mSrcX = 0;
+	se.mSrcY = 0;
+	mSources.push_back(se);
+}
+
+// Horizontal point (nearest-neighbor) resample to width 'w'. The identity
+// mapping (offset 0.5, factor 1.0) is a no-op and adds no node.
+void VDPixmapUberBlitterGenerator::pointh(float xoffset, float xfactor, uint32 w) {
+	StackEntry *args = &mStack.back();
+
+	if (xoffset != 0.5f || xfactor != 1.0f) {
+		VDPixmapGenResampleRow *src = new VDPixmapGenResampleRow;
+
+		src->Init(args[0].mpSrc, args[0].mSrcIndex, w, xoffset, xfactor, nsVDPixmap::kFilterPoint, 0, false);
+
+		mGenerators.push_back(src);
+		MarkDependency(src, args[0].mpSrc);
+		args[0] = StackEntry(src, 0);
+	}
+}
+
+// Vertical point (nearest-neighbor) resample to height 'h'; identity is a no-op.
+void VDPixmapUberBlitterGenerator::pointv(float yoffset, float yfactor, uint32 h) {
+	StackEntry *args = &mStack.back();
+
+	if (yoffset != 0.5f || yfactor != 1.0f) {
+		VDPixmapGenResampleCol *src = new VDPixmapGenResampleCol;
+
+		src->Init(args[0].mpSrc, args[0].mSrcIndex, h, yoffset, yfactor, nsVDPixmap::kFilterPoint, 0, false);
+
+		mGenerators.push_back(src);
+		MarkDependency(src, args[0].mpSrc);
+		args[0] = StackEntry(src, 0);
+	}
+}
+
+// Horizontal linear resample to width 'w'. Identity is a no-op; for 8-bit
+// phase-0 sources, dedicated 2:1/4:1 decimation and x2/x4 interpolation
+// kernels (some ISSE/MMX-accelerated) are used instead of the generic path.
+void VDPixmapUberBlitterGenerator::linearh(float xoffset, float xfactor, uint32 w, bool interpOnly) {
+	StackEntry *args = &mStack.back();
+	IVDPixmapGen *src = args[0].mpSrc;
+	int srcIndex = args[0].mSrcIndex;
+
+	sint32 srcw = src->GetWidth(srcIndex);
+	if (xoffset == 0.5f && xfactor == 1.0f && srcw == w)
+		return;
+
+	if (xoffset == 0.5f && (src->GetType(srcIndex) & kVDPixType_Mask) == kVDPixType_8) {
+		// 2:1 decimation.
+		if (xfactor == 2.0f && w == ((srcw + 1) >> 1)) {
+			VDPixmapGenResampleRow_d2_p0_lin_u8 *out = new VDPixmapGenResampleRow_d2_p0_lin_u8;
+
+			out->Init(src, srcIndex);
+			mGenerators.push_back(out);
+			MarkDependency(out, src);
+			args[0] = StackEntry(out, 0);
+			return;
+		}
+
+		// 4:1 decimation.
+		if (xfactor == 4.0f && w == ((srcw + 3) >> 2)) {
+			VDPixmapGenResampleRow_d4_p0_lin_u8 *out = new VDPixmapGenResampleRow_d4_p0_lin_u8;
+
+			out->Init(src, srcIndex);
+			mGenerators.push_back(out);
+			MarkDependency(out, src);
+			args[0] = StackEntry(out, 0);
+			return;
+		}
+
+		// x2 interpolation.
+		if (xfactor == 0.5f && w == srcw*2) {
+#if VD_CPU_X86
+			VDPixmapGenResampleRow_x2_p0_lin_u8 *out = ISSE_enabled ? new VDPixmapGenResampleRow_x2_p0_lin_u8_ISSE : new VDPixmapGenResampleRow_x2_p0_lin_u8;
+#else
+			VDPixmapGenResampleRow_x2_p0_lin_u8 *out = new VDPixmapGenResampleRow_x2_p0_lin_u8;
+#endif
+
+			out->Init(src, srcIndex);
+			mGenerators.push_back(out);
+			MarkDependency(out, src);
+			args[0] = StackEntry(out, 0);
+			return;
+		}
+
+		// x4 interpolation.
+		if (xfactor == 0.25f && w == srcw*4) {
+#if VD_CPU_X86
+			VDPixmapGenResampleRow_x4_p0_lin_u8 *out = MMX_enabled ? new VDPixmapGenResampleRow_x4_p0_lin_u8_MMX : new VDPixmapGenResampleRow_x4_p0_lin_u8;
+#else
+			VDPixmapGenResampleRow_x4_p0_lin_u8 *out = new VDPixmapGenResampleRow_x4_p0_lin_u8;
+#endif
+
+			out->Init(src, srcIndex);
+			mGenerators.push_back(out);
+			MarkDependency(out, src);
+			args[0] = StackEntry(out, 0);
+			return;
+		}
+	}
+
+	// Generic linear row resampler.
+	VDPixmapGenResampleRow *out = new VDPixmapGenResampleRow;
+
+	out->Init(args[0].mpSrc, args[0].mSrcIndex, w, xoffset, xfactor, nsVDPixmap::kFilterLinear, 0, interpOnly);
+
+	mGenerators.push_back(out);
+	MarkDependency(out, src);
+	args[0] = StackEntry(out, 0);
+}
+
+// Vertical linear resample to height 'h'. Identity is a no-op; 8-bit sources
+// with the specific phase offsets below take dedicated decimation or
+// interpolation kernels instead of the generic column resampler.
+void VDPixmapUberBlitterGenerator::linearv(float yoffset, float yfactor, uint32 h, bool interpOnly) {
+	StackEntry *args = &mStack.back();
+	IVDPixmapGen *src = args[0].mpSrc;
+	int srcIndex = args[0].mSrcIndex;
+
+	sint32 srch = src->GetHeight(srcIndex);
+	if (yoffset == 0.5f && yfactor == 1.0f && srch == h)
+		return;
+
+	if ((src->GetType(srcIndex) & kVDPixType_Mask) == kVDPixType_8) {
+		// 2:1 vertical decimation, half-pel phase.
+		if (yoffset == 1.0f && yfactor == 2.0f && h == ((srch + 1) >> 1)) {
+			VDPixmapGenResampleCol_x2_phalf_lin_u8 *out = new VDPixmapGenResampleCol_x2_phalf_lin_u8;
+
+			out->Init(src, srcIndex);
+			mGenerators.push_back(out);
+			MarkDependency(out, src);
+			args[0] = StackEntry(out, 0);
+			return;
+		}
+
+		// 4:1 vertical decimation.
+		if (yoffset == 2.0f && yfactor == 4.0f && h == ((srch + 2) >> 2)) {
+			VDPixmapGenResampleCol_x4_p1half_lin_u8 *out = new VDPixmapGenResampleCol_x4_p1half_lin_u8;
+
+			out->Init(src, srcIndex);
+			mGenerators.push_back(out);
+			MarkDependency(out, src);
+			args[0] = StackEntry(out, 0);
+			return;
+		}
+
+		// x2 vertical interpolation.
+		if (yoffset == 0.25f && yfactor == 0.5f && h == srch*2) {
+#if VD_CPU_X86
+			VDPixmapGenResampleCol_d2_pnqrtr_lin_u8 *out = ISSE_enabled ? new VDPixmapGenResampleCol_d2_pnqrtr_lin_u8_ISSE : new VDPixmapGenResampleCol_d2_pnqrtr_lin_u8;
+#else
+			VDPixmapGenResampleCol_d2_pnqrtr_lin_u8 *out = new VDPixmapGenResampleCol_d2_pnqrtr_lin_u8;
+#endif
+
+			out->Init(src, srcIndex);
+			mGenerators.push_back(out);
+			MarkDependency(out, src);
+			args[0] = StackEntry(out, 0);
+			return;
+		}
+
+		// x4 vertical interpolation.
+		if (yoffset == 0.125f && yfactor == 0.25f && h == srch*4) {
+#if VD_CPU_X86
+			VDPixmapGenResampleCol_d4_pn38_lin_u8 *out = ISSE_enabled ? new VDPixmapGenResampleCol_d4_pn38_lin_u8_ISSE : new VDPixmapGenResampleCol_d4_pn38_lin_u8;
+#else
+			VDPixmapGenResampleCol_d4_pn38_lin_u8 *out = new VDPixmapGenResampleCol_d4_pn38_lin_u8;
+#endif
+
+			out->Init(src, srcIndex);
+			mGenerators.push_back(out);
+			MarkDependency(out, src);
+			args[0] = StackEntry(out, 0);
+			return;
+		}
+	}
+
+	// Generic linear column resampler.
+	VDPixmapGenResampleCol *out = new VDPixmapGenResampleCol;
+
+	out->Init(src, srcIndex, h, yoffset, yfactor, nsVDPixmap::kFilterLinear, 0, interpOnly);
+
+	mGenerators.push_back(out);
+	MarkDependency(out, src);
+	args[0] = StackEntry(out, 0);
+}
+
+// Combined 2-D bilinear resample: horizontal pass then vertical pass.
+void VDPixmapUberBlitterGenerator::linear(float xoffset, float xfactor, uint32 w, float yoffset, float yfactor, uint32 h) {
+	linearh(xoffset, xfactor, w, false);
+	linearv(yoffset, yfactor, h, false);
+}
+
+void VDPixmapUberBlitterGenerator::cubich(float xoffset, float xfactor, uint32 w, float splineFactor, bool interpOnly) {
+ StackEntry *args = &mStack.back();
+
+ if (xoffset != 0.5f || xfactor != 1.0f) {
+ VDPixmapGenResampleRow *src = new VDPixmapGenResampleRow;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex, w, xoffset, xfactor, nsVDPixmap::kFilterCubic, splineFactor, interpOnly);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+ }
+}
+
+void VDPixmapUberBlitterGenerator::cubicv(float yoffset, float yfactor, uint32 h, float splineFactor, bool interpOnly) {
+ StackEntry *args = &mStack.back();
+
+ if (yoffset != 0.5f || yfactor != 1.0f) {
+ VDPixmapGenResampleCol *src = new VDPixmapGenResampleCol;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex, h, yoffset, yfactor, nsVDPixmap::kFilterCubic, splineFactor, interpOnly);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+ }
+}
+
+void VDPixmapUberBlitterGenerator::cubic(float xoffset, float xfactor, uint32 w, float yoffset, float yfactor, uint32 h, float splineFactor) {
+ cubich(xoffset, xfactor, w, splineFactor, false);
+ cubicv(yoffset, yfactor, h, splineFactor, false);
+}
+
+void VDPixmapUberBlitterGenerator::lanczos3h(float xoffset, float xfactor, uint32 w) {
+ StackEntry *args = &mStack.back();
+
+ if (xoffset != 0.5f || xfactor != 1.0f) {
+ VDPixmapGenResampleRow *src = new VDPixmapGenResampleRow;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex, w, xoffset, xfactor, nsVDPixmap::kFilterLanczos3, 0, false);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+ }
+}
+
+void VDPixmapUberBlitterGenerator::lanczos3v(float yoffset, float yfactor, uint32 h) {
+ StackEntry *args = &mStack.back();
+
+ if (yoffset != 0.5f || yfactor != 1.0f) {
+ VDPixmapGenResampleCol *src = new VDPixmapGenResampleCol;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex, h, yoffset, yfactor, nsVDPixmap::kFilterLanczos3, 0, false);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+ }
+}
+
+void VDPixmapUberBlitterGenerator::lanczos3(float xoffset, float xfactor, uint32 w, float yoffset, float yfactor, uint32 h) {
+ lanczos3h(xoffset, xfactor, w);
+ lanczos3v(yoffset, yfactor, h);
+}
+
+void VDPixmapUberBlitterGenerator::conv_555_to_8888() {
+ StackEntry *args = &mStack.back();
+#ifdef VD_CPU_X86
+ VDPixmapGen_X1R5G5B5_To_X8R8G8B8 *src = MMX_enabled ? new VDPixmapGen_X1R5G5B5_To_X8R8G8B8_MMX : new VDPixmapGen_X1R5G5B5_To_X8R8G8B8;
+#else
+ VDPixmapGen_X1R5G5B5_To_X8R8G8B8 *src = new VDPixmapGen_X1R5G5B5_To_X8R8G8B8;
+#endif
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::conv_565_to_8888() {
+ StackEntry *args = &mStack.back();
+#ifdef VD_CPU_X86
+ VDPixmapGen_R5G6B5_To_X8R8G8B8 *src = MMX_enabled ? new VDPixmapGen_R5G6B5_To_X8R8G8B8_MMX : new VDPixmapGen_R5G6B5_To_X8R8G8B8;
+#else
+ VDPixmapGen_R5G6B5_To_X8R8G8B8 *src = new VDPixmapGen_R5G6B5_To_X8R8G8B8;
+#endif
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::conv_888_to_8888() {
+ StackEntry *args = &mStack.back();
+#ifdef VD_CPU_X86
+ VDPixmapGen_R8G8B8_To_A8R8G8B8 *src = MMX_enabled ? new VDPixmapGen_R8G8B8_To_X8R8G8B8_MMX : new VDPixmapGen_R8G8B8_To_A8R8G8B8;
+#else
+ VDPixmapGen_R8G8B8_To_A8R8G8B8 *src = new VDPixmapGen_R8G8B8_To_A8R8G8B8;
+#endif
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::conv_8_to_32F() {
+ StackEntry *args = &mStack.back();
+ VDPixmapGen_8_To_32F *src = new VDPixmapGen_8_To_32F;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::conv_16F_to_32F() {
+ StackEntry *args = &mStack.back();
+ VDPixmapGen_16F_To_32F *src = new VDPixmapGen_16F_To_32F;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::conv_V210_to_32F() {
+ StackEntry *args = &mStack.back();
+ VDPixmapGen_V210_To_32F *src = new VDPixmapGen_V210_To_32F;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+ mStack.push_back(StackEntry(src, 1));
+ mStack.push_back(StackEntry(src, 2));
+}
+
+void VDPixmapUberBlitterGenerator::conv_8888_to_X32F() {
+ StackEntry *args = &mStack.back();
+ VDPixmapGen_X8R8G8B8_To_X32B32G32R32F *src = new VDPixmapGen_X8R8G8B8_To_X32B32G32R32F;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::conv_8888_to_555() {
+ StackEntry *args = &mStack.back();
+#ifdef VD_CPU_X86
+ VDPixmapGen_X8R8G8B8_To_X1R5G5B5 *src = MMX_enabled ? new VDPixmapGen_X8R8G8B8_To_X1R5G5B5_MMX : new VDPixmapGen_X8R8G8B8_To_X1R5G5B5;
+#else
+ VDPixmapGen_X8R8G8B8_To_X1R5G5B5 *src = new VDPixmapGen_X8R8G8B8_To_X1R5G5B5;
+#endif
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::conv_555_to_565() {
+ StackEntry *args = &mStack.back();
+#ifdef VD_CPU_X86
+ VDPixmapGen_X1R5G5B5_To_R5G6B5 *src = MMX_enabled ? new VDPixmapGen_X1R5G5B5_To_R5G6B5_MMX : new VDPixmapGen_X1R5G5B5_To_R5G6B5;
+#else
+ VDPixmapGen_X1R5G5B5_To_R5G6B5 *src = new VDPixmapGen_X1R5G5B5_To_R5G6B5;
+#endif
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::conv_565_to_555() {
+ StackEntry *args = &mStack.back();
+#ifdef VD_CPU_X86
+ VDPixmapGen_R5G6B5_To_X1R5G5B5 *src = MMX_enabled ? new VDPixmapGen_R5G6B5_To_X1R5G5B5_MMX : new VDPixmapGen_R5G6B5_To_X1R5G5B5;
+#else
+ VDPixmapGen_R5G6B5_To_X1R5G5B5 *src = new VDPixmapGen_R5G6B5_To_X1R5G5B5;
+#endif
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::conv_8888_to_565() {
+ StackEntry *args = &mStack.back();
+#ifdef VD_CPU_X86
+ VDPixmapGen_X8R8G8B8_To_R5G6B5 *src = MMX_enabled ? new VDPixmapGen_X8R8G8B8_To_R5G6B5_MMX : new VDPixmapGen_X8R8G8B8_To_R5G6B5;
+#else
+ VDPixmapGen_X8R8G8B8_To_R5G6B5 *src = new VDPixmapGen_X8R8G8B8_To_R5G6B5;
+#endif
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::conv_8888_to_888() {
+ StackEntry *args = &mStack.back();
+#ifdef VD_CPU_X86
+ VDPixmapGen_X8R8G8B8_To_R8G8B8 *src = MMX_enabled ? new VDPixmapGen_X8R8G8B8_To_R8G8B8_MMX : new VDPixmapGen_X8R8G8B8_To_R8G8B8;
+#else
+ VDPixmapGen_X8R8G8B8_To_R8G8B8 *src = new VDPixmapGen_X8R8G8B8_To_R8G8B8;
+#endif
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::conv_32F_to_8() {
+ StackEntry *args = &mStack.back();
+ VDPixmapGen_32F_To_8 *src = new VDPixmapGen_32F_To_8;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::conv_X32F_to_8888() {
+ StackEntry *args = &mStack.back();
+ VDPixmapGen_X32B32G32R32F_To_X8R8G8B8 *src = new VDPixmapGen_X32B32G32R32F_To_X8R8G8B8;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::conv_32F_to_16F() {
+ StackEntry *args = &mStack.back();
+ VDPixmapGen_32F_To_16F *src = new VDPixmapGen_32F_To_16F;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::conv_32F_to_V210() {
+ StackEntry *args = &*(mStack.end() - 3);
+ VDPixmapGen_32F_To_V210 *src = new VDPixmapGen_32F_To_V210;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex, args[1].mpSrc, args[1].mSrcIndex, args[2].mpSrc, args[2].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ MarkDependency(src, args[1].mpSrc);
+ MarkDependency(src, args[2].mpSrc);
+ args[0] = StackEntry(src, 0);
+ mStack.pop_back();
+ mStack.pop_back();
+}
+
+void VDPixmapUberBlitterGenerator::convd_8888_to_555() {
+ StackEntry *args = &mStack.back();
+ VDPixmapGen_X8R8G8B8_To_X1R5G5B5_Dithered *src = new VDPixmapGen_X8R8G8B8_To_X1R5G5B5_Dithered;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::convd_8888_to_565() {
+ StackEntry *args = &mStack.back();
+ VDPixmapGen_X8R8G8B8_To_R5G6B5_Dithered *src = new VDPixmapGen_X8R8G8B8_To_R5G6B5_Dithered;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::convd_32F_to_8() {
+ StackEntry *args = &mStack.back();
+ VDPixmapGen_32F_To_8_Dithered *src = new VDPixmapGen_32F_To_8_Dithered;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::convd_X32F_to_8888() {
+ StackEntry *args = &mStack.back();
+ VDPixmapGen_X32B32G32R32F_To_X8R8G8B8_Dithered *src = new VDPixmapGen_X32B32G32R32F_To_X8R8G8B8_Dithered;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+}
+
+void VDPixmapUberBlitterGenerator::interleave_B8G8_R8G8() {
+ StackEntry *args = &mStack.back() - 2;
+ VDPixmapGen_B8x3_To_B8G8_R8G8 *src = NULL;
+
+#if VD_CPU_X86
+ if (MMX_enabled)
+ src = new VDPixmapGen_B8x3_To_B8G8_R8G8_MMX;
+#endif
+
+ if (!src)
+ src = new VDPixmapGen_B8x3_To_B8G8_R8G8;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex, args[1].mpSrc, args[1].mSrcIndex, args[2].mpSrc, args[2].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ MarkDependency(src, args[1].mpSrc);
+ MarkDependency(src, args[2].mpSrc);
+ args[0] = StackEntry(src, 0);
+ mStack.pop_back();
+ mStack.pop_back();
+}
+
+void VDPixmapUberBlitterGenerator::interleave_G8B8_G8R8() {
+ StackEntry *args = &mStack.back() - 2;
+ VDPixmapGen_B8x3_To_G8B8_G8R8 *src = NULL;
+
+#if VD_CPU_X86
+ if (MMX_enabled)
+ src = new VDPixmapGen_B8x3_To_G8B8_G8R8_MMX;
+#endif
+
+ if (!src)
+ src = new VDPixmapGen_B8x3_To_G8B8_G8R8;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex, args[1].mpSrc, args[1].mSrcIndex, args[2].mpSrc, args[2].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ MarkDependency(src, args[1].mpSrc);
+ MarkDependency(src, args[2].mpSrc);
+ args[0] = StackEntry(src, 0);
+ mStack.pop_back();
+ mStack.pop_back();
+}
+
+void VDPixmapUberBlitterGenerator::interleave_X8R8G8B8() {
+ StackEntry *args = &mStack.back() - 2;
+ VDPixmapGen_B8x3_To_X8R8G8B8 *src = new VDPixmapGen_B8x3_To_X8R8G8B8;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex, args[1].mpSrc, args[1].mSrcIndex, args[2].mpSrc, args[2].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ MarkDependency(src, args[1].mpSrc);
+ MarkDependency(src, args[2].mpSrc);
+ args[0] = StackEntry(src, 0);
+ mStack.pop_back();
+ mStack.pop_back();
+}
+
+void VDPixmapUberBlitterGenerator::interleave_B8R8() {
+ StackEntry *args = &mStack.back() - 1;
+
+#if VD_CPU_X86
+ VDPixmapGen_B8x2_To_B8R8 *src = MMX_enabled ? new VDPixmapGen_B8x2_To_B8R8_MMX : new VDPixmapGen_B8x2_To_B8R8;
+#else
+ VDPixmapGen_B8x2_To_B8R8 *src = new VDPixmapGen_B8x2_To_B8R8;
+#endif
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex, args[1].mpSrc, args[1].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ MarkDependency(src, args[1].mpSrc);
+ args[0] = StackEntry(src, 0);
+ mStack.pop_back();
+}
+
+void VDPixmapUberBlitterGenerator::ycbcr601_to_rgb32() {
+ StackEntry *args = &mStack.back() - 2;
+
+#ifdef VD_CPU_X86
+ VDPixmapGenYCbCr601ToRGB32 *src = MMX_enabled ? new VDPixmapGenYCbCr601ToRGB32_MMX : new VDPixmapGenYCbCr601ToRGB32;
+#else
+ VDPixmapGenYCbCr601ToRGB32 *src = new VDPixmapGenYCbCr601ToRGB32;
+#endif
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex, args[1].mpSrc, args[1].mSrcIndex, args[2].mpSrc, args[2].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ MarkDependency(src, args[1].mpSrc);
+ MarkDependency(src, args[2].mpSrc);
+ args[0] = StackEntry(src, 0);
+ mStack.pop_back();
+ mStack.pop_back();
+}
+
+void VDPixmapUberBlitterGenerator::ycbcr709_to_rgb32() {
+ StackEntry *args = &mStack.back() - 2;
+
+ VDPixmapGenYCbCr709ToRGB32 *src = new VDPixmapGenYCbCr709ToRGB32;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex, args[1].mpSrc, args[1].mSrcIndex, args[2].mpSrc, args[2].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ MarkDependency(src, args[1].mpSrc);
+ MarkDependency(src, args[2].mpSrc);
+ args[0] = StackEntry(src, 0);
+ mStack.pop_back();
+ mStack.pop_back();
+}
+
+void VDPixmapUberBlitterGenerator::rgb32_to_ycbcr601() {
+ StackEntry *args = &mStack.back();
+#ifdef VD_CPU_X86
+ VDPixmapGenRGB32ToYCbCr601 *src = SSE2_enabled ? new VDPixmapGenRGB32ToYCbCr601_SSE2 : new VDPixmapGenRGB32ToYCbCr601;
+#else
+ VDPixmapGenRGB32ToYCbCr601 *src = new VDPixmapGenRGB32ToYCbCr601;
+#endif
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+ mStack.push_back(StackEntry(src, 1));
+ mStack.push_back(StackEntry(src, 2));
+}
+
+void VDPixmapUberBlitterGenerator::rgb32_to_ycbcr709() {
+ StackEntry *args = &mStack.back();
+ VDPixmapGenRGB32ToYCbCr709 *src = new VDPixmapGenRGB32ToYCbCr709;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+ mStack.push_back(StackEntry(src, 1));
+ mStack.push_back(StackEntry(src, 2));
+}
+
+void VDPixmapUberBlitterGenerator::ycbcr601_to_rgb32_32f() {
+ StackEntry *args = &mStack.back() - 2;
+
+ VDPixmapGenYCbCr601ToRGB32F *src = new VDPixmapGenYCbCr601ToRGB32F;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex, args[1].mpSrc, args[1].mSrcIndex, args[2].mpSrc, args[2].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ MarkDependency(src, args[1].mpSrc);
+ MarkDependency(src, args[2].mpSrc);
+ args[0] = StackEntry(src, 0);
+ mStack.pop_back();
+ mStack.pop_back();
+}
+
+void VDPixmapUberBlitterGenerator::ycbcr709_to_rgb32_32f() {
+ StackEntry *args = &mStack.back() - 2;
+
+ VDPixmapGenYCbCr709ToRGB32F *src = new VDPixmapGenYCbCr709ToRGB32F;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex, args[1].mpSrc, args[1].mSrcIndex, args[2].mpSrc, args[2].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ MarkDependency(src, args[1].mpSrc);
+ MarkDependency(src, args[2].mpSrc);
+ args[0] = StackEntry(src, 0);
+ mStack.pop_back();
+ mStack.pop_back();
+}
+
+void VDPixmapUberBlitterGenerator::rgb32_to_ycbcr601_32f() {
+ StackEntry *args = &mStack.back();
+ VDPixmapGenRGB32FToYCbCr601 *src = new VDPixmapGenRGB32FToYCbCr601;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+ mStack.push_back(StackEntry(src, 1));
+ mStack.push_back(StackEntry(src, 2));
+}
+
+void VDPixmapUberBlitterGenerator::rgb32_to_ycbcr709_32f() {
+ StackEntry *args = &mStack.back();
+ VDPixmapGenRGB32FToYCbCr709 *src = new VDPixmapGenRGB32FToYCbCr709;
+
+ src->Init(args[0].mpSrc, args[0].mSrcIndex);
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ args[0] = StackEntry(src, 0);
+ mStack.push_back(StackEntry(src, 1));
+ mStack.push_back(StackEntry(src, 2));
+}
+
+void VDPixmapUberBlitterGenerator::ycbcr601_to_ycbcr709() {
+ StackEntry *args = &mStack.back() - 2;
+
+ IVDPixmapGen *src;
+ if ((args[0].mpSrc->GetType(args[0].mSrcIndex) & kVDPixType_Mask) == kVDPixType_32F_LE) {
+ VDPixmapGenYCbCr601ToYCbCr709_32F *src2 = new VDPixmapGenYCbCr601ToYCbCr709_32F;
+
+ src2->Init(args[0].mpSrc, args[0].mSrcIndex, args[1].mpSrc, args[1].mSrcIndex, args[2].mpSrc, args[2].mSrcIndex);
+ src = src2;
+ } else {
+ VDPixmapGenYCbCr601ToYCbCr709 *src2 = new VDPixmapGenYCbCr601ToYCbCr709;
+
+ src2->Init(args[0].mpSrc, args[0].mSrcIndex, args[1].mpSrc, args[1].mSrcIndex, args[2].mpSrc, args[2].mSrcIndex);
+ src = src2;
+ }
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ MarkDependency(src, args[1].mpSrc);
+ MarkDependency(src, args[2].mpSrc);
+ args[0] = StackEntry(src, 0);
+ args[1] = StackEntry(src, 1);
+ args[2] = StackEntry(src, 2);
+}
+
+void VDPixmapUberBlitterGenerator::ycbcr709_to_ycbcr601() {
+ StackEntry *args = &mStack.back() - 2;
+
+ IVDPixmapGen *src;
+ if ((args[0].mpSrc->GetType(args[0].mSrcIndex) & kVDPixType_Mask) == kVDPixType_32F_LE) {
+ VDPixmapGenYCbCr709ToYCbCr601_32F *src2 = new VDPixmapGenYCbCr709ToYCbCr601_32F;
+
+ src2->Init(args[0].mpSrc, args[0].mSrcIndex, args[1].mpSrc, args[1].mSrcIndex, args[2].mpSrc, args[2].mSrcIndex);
+ src = src2;
+ } else {
+ VDPixmapGenYCbCr709ToYCbCr601 *src2 = new VDPixmapGenYCbCr709ToYCbCr601;
+
+ src2->Init(args[0].mpSrc, args[0].mSrcIndex, args[1].mpSrc, args[1].mSrcIndex, args[2].mpSrc, args[2].mSrcIndex);
+ src = src2;
+ }
+
+ mGenerators.push_back(src);
+ MarkDependency(src, args[0].mpSrc);
+ MarkDependency(src, args[1].mpSrc);
+ MarkDependency(src, args[2].mpSrc);
+ args[0] = StackEntry(src, 0);
+ args[1] = StackEntry(src, 1);
+ args[2] = StackEntry(src, 2);
+}
+
+IVDPixmapBlitter *VDPixmapUberBlitterGenerator::create() {
+ vdautoptr<VDPixmapUberBlitter> blitter(new VDPixmapUberBlitter);
+
+ int numStackEntries = (int)mStack.size();
+
+ for(int i=0; i<3; ++i) {
+ if (i < numStackEntries) {
+ blitter->mOutputs[i].mpSrc = mStack[i].mpSrc;
+ blitter->mOutputs[i].mSrcIndex = mStack[i].mSrcIndex;
+ } else {
+ blitter->mOutputs[i].mpSrc = NULL;
+ blitter->mOutputs[i].mSrcIndex = 0;
+ }
+ }
+
+ mStack.clear();
+
+ // If this blitter has three outputs, determine if outputs 1 and 2 are independent
+ // from output 0.
+ blitter->mbIndependentChromaPlanes = true;
+ blitter->mbIndependentPlanes = true;
+ if (numStackEntries >= 3) {
+ int numGens = mGenerators.size();
+ vdfastvector<uint8> genflags(numGens, 0);
+
+ enum {
+ kFlagStateful = 0x80,
+ kFlagY = 0x01,
+ kFlagCb = 0x02,
+ kFlagCr = 0x04,
+ kFlagYCbCr = 0x07
+ };
+
+ for(int i=0; i<3; ++i)
+ genflags[std::find(mGenerators.begin(), mGenerators.end(), blitter->mOutputs[i].mpSrc) - mGenerators.begin()] |= (1 << i);
+
+ for(int i=0; i<numGens; ++i) {
+ IVDPixmapGen *gen = mGenerators[i];
+
+ if (gen->IsStateful())
+ genflags[i] |= kFlagStateful;
+ }
+
+ while(!mDependencies.empty()) {
+ const Dependency& dep = mDependencies.back();
+
+ genflags[dep.mSrcIdx] |= (genflags[dep.mDstIdx] & ~kFlagStateful);
+
+ mDependencies.pop_back();
+ }
+
+ for(int i=0; i<numGens; ++i) {
+ uint8 flags = genflags[i];
+
+ if (!(flags & kFlagStateful))
+ continue;
+
+ switch(flags & kFlagYCbCr) {
+ case 0:
+ case kFlagY:
+ case kFlagCb:
+ case kFlagCr:
+ break;
+ case kFlagCr | kFlagCb:
+ blitter->mbIndependentPlanes = false;
+ break;
+ case kFlagCb | kFlagY:
+ case kFlagCr | kFlagY:
+ case kFlagCr | kFlagCb | kFlagY:
+ blitter->mbIndependentPlanes = false;
+ blitter->mbIndependentChromaPlanes = false;
+ break;
+ }
+ }
+ } else if (numStackEntries >= 2) {
+ int numGens = mGenerators.size();
+ vdfastvector<uint8> genflags(numGens, 0);
+
+ enum {
+ kFlagStateful = 0x80,
+ kFlagY = 0x01,
+ kFlagC = 0x02,
+ kFlagYC = 0x03
+ };
+
+ for(int i=0; i<2; ++i)
+ genflags[std::find(mGenerators.begin(), mGenerators.end(), blitter->mOutputs[i].mpSrc) - mGenerators.begin()] |= (1 << i);
+
+ for(int i=0; i<numGens; ++i) {
+ IVDPixmapGen *gen = mGenerators[i];
+
+ if (gen->IsStateful())
+ genflags[i] |= kFlagStateful;
+ }
+
+ while(!mDependencies.empty()) {
+ const Dependency& dep = mDependencies.back();
+
+ genflags[dep.mSrcIdx] |= (genflags[dep.mDstIdx] & ~kFlagStateful);
+
+ mDependencies.pop_back();
+ }
+
+ for(int i=0; i<numGens; ++i) {
+ uint8 flags = genflags[i];
+
+ if (!(flags & kFlagStateful))
+ continue;
+
+ switch(flags & kFlagYC) {
+ case kFlagYC:
+ blitter->mbIndependentPlanes = false;
+ blitter->mbIndependentChromaPlanes = false;
+ break;
+ }
+ }
+ }
+
+ blitter->mGenerators.swap(mGenerators);
+ blitter->mSources.swap(mSources);
+ return blitter.release();
+}
+
+void VDPixmapUberBlitterGenerator::MarkDependency(IVDPixmapGen *dst, IVDPixmapGen *src) {
+ Generators::const_iterator it1(std::find(mGenerators.begin(), mGenerators.end(), dst));
+ Generators::const_iterator it2(std::find(mGenerators.begin(), mGenerators.end(), src));
+
+ VDASSERT(it1 != mGenerators.end());
+ VDASSERT(it2 != mGenerators.end());
+
+ int idx1 = it1 - mGenerators.begin();
+ int idx2 = it2 - mGenerators.begin();
+
+ Dependency dep = { idx1, idx2 };
+
+ mDependencies.push_back(dep);
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/uberblit_resample.cpp b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_resample.cpp
new file mode 100644
index 000000000..1363fb730
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_resample.cpp
@@ -0,0 +1,623 @@
+#include <float.h>
+#include <math.h>
+#include <vd2/system/vdstl.h>
+#include <vd2/system/memory.h>
+#include <vd2/system/math.h>
+#include <vd2/system/cpuaccel.h>
+#include <vd2/Kasumi/pixmap.h>
+#include <vd2/Kasumi/pixmaputils.h>
+#include <vd2/Kasumi/resample.h>
+
+#include <vd2/Kasumi/resample_kernels.h>
+#include "resample_stages_x86.h"
+#include "uberblit_resample.h"
+
+namespace {
+ sint32 scale32x32_fp16(sint32 x, sint32 y) {
+ return (sint32)(((sint64)x * y + 0x8000) >> 16);
+ }
+
+ template<class T>
+ IVDResamplerSeparableRowStage *RowFactory(double cutoff, float filterFactor) {
+ return new T;
+ }
+
+ template<class T>
+ IVDResamplerSeparableRowStage *RowFactoryLinear(double cutoff, float filterFactor) {
+ return new T(VDResamplerLinearFilter(cutoff));
+ }
+
+ template<class T>
+ IVDResamplerSeparableRowStage *RowFactoryCubic(double cutoff, float filterFactor) {
+ return new T(VDResamplerCubicFilter(cutoff, filterFactor));
+ }
+
+ template<class T>
+ IVDResamplerSeparableRowStage *RowFactoryCubic2(double cutoff, float filterFactor) {
+ return new T(filterFactor);
+ }
+
+ template<class T>
+ IVDResamplerSeparableRowStage *RowFactoryLanczos3(double cutoff, float filterFactor) {
+ return new T(VDResamplerLanczos3Filter(cutoff));
+ }
+
+ template<class T>
+ IVDResamplerSeparableColStage *ColFactory(double cutoff, float filterFactor) {
+ return new T;
+ }
+
+ template<class T>
+ IVDResamplerSeparableColStage *ColFactoryLinear(double cutoff, float filterFactor) {
+ return new T(VDResamplerLinearFilter(cutoff));
+ }
+
+ template<class T>
+ IVDResamplerSeparableColStage *ColFactoryCubic(double cutoff, float filterFactor) {
+ return new T(VDResamplerCubicFilter(cutoff, filterFactor));
+ }
+
+ template<class T>
+ IVDResamplerSeparableColStage *ColFactoryCubic2(double cutoff, float filterFactor) {
+ return new T(filterFactor);
+ }
+
+ template<class T>
+ IVDResamplerSeparableColStage *ColFactoryLanczos3(double cutoff, float filterFactor) {
+ return new T(VDResamplerLanczos3Filter(cutoff));
+ }
+}
+
+///////////////////////////////////////////////////////////////////////////
+//
+// VDPixmapGenResampleRow
+//
+///////////////////////////////////////////////////////////////////////////
+
+VDPixmapGenResampleRow::VDPixmapGenResampleRow()
+ : mpRowStage(NULL)
+ , mpRowStage2(NULL)
+{
+}
+
+VDPixmapGenResampleRow::~VDPixmapGenResampleRow() {
+ if (mpRowStage)
+ delete mpRowStage;
+}
+
+void VDPixmapGenResampleRow::Init(IVDPixmapGen *src, uint32 srcIndex, uint32 width, float offset, float step, nsVDPixmap::FilterMode filterMode, float filterFactor, bool interpolationOnly) {
+ InitSource(src, srcIndex);
+
+ sint32 u0 = (sint32)(offset * 65536.0);
+ sint32 dudx = (sint32)(step * 65536.0);
+
+ mAxis.Init(dudx);
+
+ double x_2fc = 1.0;
+ if (!interpolationOnly && step > 1.0f)
+ x_2fc = 1.0 / step;
+
+ struct SpecialCaseSpanRoutine {
+ sint32 mPhase;
+ sint32 mStep;
+ uint32 mType;
+ nsVDPixmap::FilterMode mFilterMode;
+ uint32 mCPUFlags;
+ IVDResamplerSeparableRowStage *(*mpClassFactory)(double filterCutoff, float filterFactor);
+ };
+
+ static const SpecialCaseSpanRoutine kSpecialCaseSpanRoutines[]={
+ // Generic
+#if defined _M_IX86
+ { +0x0000, 0x008000, kVDPixType_8, nsVDPixmap::kFilterLinear, CPUF_SUPPORTS_INTEGER_SSE, RowFactory<VDResamplerRowStageSeparableLinear8_phaseZeroStepHalf_ISSE> },
+#endif
+
+ { +0x0000, 0x008000, kVDPixType_8, nsVDPixmap::kFilterLinear, 0, RowFactory<VDResamplerRowStageSeparableLinear8_phaseZeroStepHalf> },
+ };
+
+ long flags = CPUGetEnabledExtensions();
+ uint32 type = mpSrc->GetType(mSrcIndex) & kVDPixType_Mask;
+
+ for(int i=0; i<sizeof(kSpecialCaseSpanRoutines)/sizeof(kSpecialCaseSpanRoutines[0]); ++i) {
+ const SpecialCaseSpanRoutine& rout = kSpecialCaseSpanRoutines[i];
+
+ if (rout.mType != type)
+ continue;
+
+ if (x_2fc < 1.0)
+ continue;
+
+ if (rout.mStep != dudx)
+ continue;
+
+ if (rout.mPhase != u0)
+ continue;
+
+ if (rout.mFilterMode != filterMode)
+ continue;
+
+ if ((rout.mCPUFlags & flags) != rout.mCPUFlags)
+ continue;
+
+ mpRowStage = rout.mpClassFactory(x_2fc, filterFactor);
+ mpRowStage2 = mpRowStage->AsRowStage2();
+ break;
+ }
+
+ if (!mpRowStage) {
+ struct SpanRoutine {
+ uint32 mType;
+ bool mbInterpOnly;
+ nsVDPixmap::FilterMode mFilterMode;
+ uint32 mCPUFlags;
+ IVDResamplerSeparableRowStage *(*mpClassFactory)(double filterCutoff, float filterFactor);
+ };
+
+ static const SpanRoutine kSpanRoutines[]={
+#if defined _M_IX86
+ // X86
+ { kVDPixType_8888, false, nsVDPixmap::kFilterPoint, CPUF_SUPPORTS_MMX, RowFactory<VDResamplerSeparablePointRowStageMMX> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterPoint, 0, RowFactory<VDResamplerSeparablePointRowStageX86> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterLinear, CPUF_SUPPORTS_SSE41, RowFactoryLinear<VDResamplerSeparableTableRowStage8SSE41> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterLinear, CPUF_SUPPORTS_MMX, RowFactoryLinear<VDResamplerSeparableTableRowStage8MMX> },
+ { kVDPixType_8888, true, nsVDPixmap::kFilterLinear, CPUF_SUPPORTS_MMX, RowFactory<VDResamplerSeparableLinearRowStageMMX> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterLinear, CPUF_SUPPORTS_SSE2, RowFactoryLinear<VDResamplerSeparableTableRowStageSSE2> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterLinear, CPUF_SUPPORTS_MMX, RowFactoryLinear<VDResamplerSeparableTableRowStageMMX> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterCubic, CPUF_SUPPORTS_SSE41, RowFactoryCubic<VDResamplerSeparableTableRowStage8SSE41> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterCubic, CPUF_SUPPORTS_MMX, RowFactoryCubic<VDResamplerSeparableTableRowStage8MMX> },
+ { kVDPixType_8888, true, nsVDPixmap::kFilterCubic, CPUF_SUPPORTS_MMX, RowFactoryCubic2<VDResamplerSeparableCubicRowStageMMX> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterCubic, CPUF_SUPPORTS_SSE2, RowFactoryCubic<VDResamplerSeparableTableRowStageSSE2> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterCubic, CPUF_SUPPORTS_MMX, RowFactoryCubic<VDResamplerSeparableTableRowStageMMX> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterLanczos3, CPUF_SUPPORTS_SSE41, RowFactoryLanczos3<VDResamplerSeparableTableRowStage8SSE41> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterLanczos3, CPUF_SUPPORTS_MMX, RowFactoryLanczos3<VDResamplerSeparableTableRowStage8MMX> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterLanczos3, CPUF_SUPPORTS_SSE2, RowFactoryLanczos3<VDResamplerSeparableTableRowStageSSE2> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterLanczos3, CPUF_SUPPORTS_MMX, RowFactoryLanczos3<VDResamplerSeparableTableRowStageMMX> },
+#elif defined _M_AMD64
+ // AMD64
+ { kVDPixType_8888, false, nsVDPixmap::kFilterLinear, CPUF_SUPPORTS_SSE2, RowFactoryLinear<VDResamplerSeparableTableRowStageSSE2> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterCubic, CPUF_SUPPORTS_SSE2, RowFactoryCubic<VDResamplerSeparableTableRowStageSSE2> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterLanczos3, CPUF_SUPPORTS_SSE2, RowFactoryLanczos3<VDResamplerSeparableTableRowStageSSE2> },
+#endif
+ // Generic
+ { kVDPixType_8, false, nsVDPixmap::kFilterPoint, 0, RowFactory<VDResamplerRowStageSeparablePoint8> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterPoint, 0, RowFactory<VDResamplerRowStageSeparablePoint32> },
+ { kVDPixType_8, true, nsVDPixmap::kFilterLinear, 0, RowFactory<VDResamplerRowStageSeparableLinear8> },
+ { kVDPixType_8888, true, nsVDPixmap::kFilterLinear, 0, RowFactory<VDResamplerRowStageSeparableLinear32> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterLinear, 0, RowFactoryLinear<VDResamplerRowStageSeparableTable8> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterLinear, 0, RowFactoryLinear<VDResamplerRowStageSeparableTable32> },
+ { kVDPixType_32F_LE, false, nsVDPixmap::kFilterLinear, 0, RowFactoryLinear<VDResamplerRowStageSeparableTable32F> },
+ { kVDPixType_32Fx4_LE, false, nsVDPixmap::kFilterLinear, 0, RowFactoryLinear<VDResamplerRowStageSeparableTable32Fx4> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterCubic, 0, RowFactoryCubic<VDResamplerRowStageSeparableTable8> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterCubic, 0, RowFactoryCubic<VDResamplerRowStageSeparableTable32> },
+ { kVDPixType_32F_LE, false, nsVDPixmap::kFilterCubic, 0, RowFactoryCubic<VDResamplerRowStageSeparableTable32F> },
+ { kVDPixType_32Fx4_LE, false, nsVDPixmap::kFilterCubic, 0, RowFactoryCubic<VDResamplerRowStageSeparableTable32Fx4> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterLanczos3, 0, RowFactoryLanczos3<VDResamplerRowStageSeparableTable8> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterLanczos3, 0, RowFactoryLanczos3<VDResamplerRowStageSeparableTable32> },
+ { kVDPixType_32F_LE, false, nsVDPixmap::kFilterLanczos3, 0, RowFactoryLanczos3<VDResamplerRowStageSeparableTable32F> },
+ { kVDPixType_32Fx4_LE, false, nsVDPixmap::kFilterLanczos3, 0, RowFactoryLanczos3<VDResamplerRowStageSeparableTable32Fx4> },
+ };
+
+ for(int i=0; i<sizeof(kSpanRoutines)/sizeof(kSpanRoutines[0]); ++i) {
+ const SpanRoutine& rout = kSpanRoutines[i];
+
+ if (rout.mType != type)
+ continue;
+
+ if (rout.mbInterpOnly && x_2fc < 1.0)
+ continue;
+
+ if (rout.mFilterMode != filterMode)
+ continue;
+
+ if ((rout.mCPUFlags & flags) != rout.mCPUFlags)
+ continue;
+
+ mpRowStage = rout.mpClassFactory(x_2fc, filterFactor);
+ mpRowStage2 = mpRowStage->AsRowStage2();
+ break;
+ }
+ }
+
+ VDASSERT(mpRowStage);
+
+ mRowFiltW = mpRowStage->GetWindowSize();
+
+ mpSrc->AddWindowRequest(0, 0);
+
+ sint32 fsx1 = (sint32)(offset * 65536.0) - ((mRowFiltW-1) << 15);
+ mAxis.Compute(width, fsx1, mSrcWidth, mRowFiltW);
+ mWidth = width;
+
+ switch(type) {
+ case kVDPixType_8:
+ mBytesPerSample = 1;
+ break;
+ case kVDPixType_8888:
+ case kVDPixType_32F_LE:
+ mBytesPerSample = 4;
+ break;
+ case kVDPixType_32Fx4_LE:
+ mBytesPerSample = 16;
+ break;
+
+ default:
+ VDASSERT(false);
+ }
+}
+
+void VDPixmapGenResampleRow::Start() {
+ StartWindow(mWidth * mBytesPerSample);
+
+ uint32 clipSpace = ((mRowFiltW*3*mBytesPerSample + 15) >> 4) << 2;
+ mTempSpace.resize(clipSpace);
+
+ if (mpRowStage2)
+ mpRowStage2->Init(mAxis, mSrcWidth);
+}
+
+void VDPixmapGenResampleRow::Compute(void *dst0, sint32 y) {
+ switch(mBytesPerSample) {
+ case 1:
+ Compute8(dst0, y);
+ break;
+ case 4:
+ Compute32(dst0, y);
+ break;
+ case 16:
+ Compute128(dst0, y);
+ break;
+ }
+}
+
+void VDPixmapGenResampleRow::Compute8(void *dst0, sint32 y) {
+ const uint8 *src = (const uint8 *)mpSrc->GetRow(y, mSrcIndex);
+ uint8 *dst = (uint8 *)dst0;
+
+ // process pre-copy region
+ if (uint32 count = mAxis.dx_precopy) {
+ VDMemset8(dst, src[0], count);
+ dst += count;
+ }
+
+ uint8 *p = (uint8*)mTempSpace.data();
+ sint32 u = mAxis.u;
+ const sint32 dudx = mAxis.dudx;
+
+ // process dual-clip region
+ if (mpRowStage2) {
+ uint32 count = mAxis.dx_preclip + mAxis.dx_active + mAxis.dx_postclip + mAxis.dx_dualclip;
+ mpRowStage2->Process(dst, src, count);
+ dst += count;
+ } else if (uint32 count = mAxis.dx_dualclip) {
+ VDMemset8(p, src[0], mRowFiltW);
+ memcpy(p + mRowFiltW, src+1, (mSrcWidth-2));
+ VDMemset8(p + mRowFiltW + (mSrcWidth-2), src[mSrcWidth-1], mRowFiltW);
+
+ mpRowStage->Process(dst, p, count, u + ((mRowFiltW-1)<<16), dudx);
+ u += dudx*count;
+ dst += count;
+ } else {
+ // process pre-clip region
+ if (uint32 count = mAxis.dx_preclip) {
+ VDMemset8(p, src[0], mRowFiltW);
+ memcpy(p + mRowFiltW, src+1, (mRowFiltW-1));
+
+ mpRowStage->Process(dst, p, count, u + ((mRowFiltW-1)<<16), dudx);
+ u += dudx*count;
+ dst += count;
+ }
+
+ // process active region
+ if (uint32 count = mAxis.dx_active) {
+ mpRowStage->Process(dst, src, count, u, dudx);
+ u += dudx*count;
+ dst += count;
+ }
+
+ // process post-clip region
+ if (uint32 count = mAxis.dx_postclip) {
+ uint32 offset = mSrcWidth + 1 - mRowFiltW;
+
+ memcpy(p, src+offset, (mRowFiltW-1));
+ VDMemset8(p + (mRowFiltW-1), src[mSrcWidth-1], mRowFiltW);
+
+ mpRowStage->Process(dst, p, count, u - (offset<<16), dudx);
+ dst += count;
+ }
+ }
+
+ // process post-copy region
+ if (uint32 count = mAxis.dx_postcopy) {
+ VDMemset8(dst, src[mSrcWidth-1], count);
+ }
+}
+
+void VDPixmapGenResampleRow::Compute32(void *dst0, sint32 y) {
+ const uint32 *src = (const uint32 *)mpSrc->GetRow(y, mSrcIndex);
+ uint32 *dst = (uint32 *)dst0;
+
+ // process pre-copy region
+ if (uint32 count = mAxis.dx_precopy) {
+ VDMemset32(dst, src[0], count);
+ dst += count;
+ }
+
+ uint32 *p = mTempSpace.data();
+ sint32 u = mAxis.u;
+ const sint32 dudx = mAxis.dudx;
+
+ // process dual-clip region
+ if (uint32 count = mAxis.dx_dualclip) {
+ VDMemset32(p, src[0], mRowFiltW);
+ memcpy(p + mRowFiltW, src+1, (mSrcWidth-2)*sizeof(uint32));
+ VDMemset32(p + mRowFiltW + (mSrcWidth-2), src[mSrcWidth-1], mRowFiltW);
+
+ mpRowStage->Process(dst, p, count, u + ((mRowFiltW-1)<<16), dudx);
+ u += dudx*count;
+ dst += count;
+ } else if (mpRowStage2) {
+ mpRowStage2->Process(dst, p, mAxis.dx_preclip + mAxis.dx_active + mAxis.dx_postclip);
+ } else {
+ // process pre-clip region
+ if (uint32 count = mAxis.dx_preclip) {
+ VDMemset32(p, src[0], mRowFiltW);
+ memcpy(p + mRowFiltW, src+1, (mRowFiltW-1)*sizeof(uint32));
+
+ mpRowStage->Process(dst, p, count, u + ((mRowFiltW-1)<<16), dudx);
+ u += dudx*count;
+ dst += count;
+ }
+
+ // process active region
+ if (uint32 count = mAxis.dx_active) {
+ mpRowStage->Process(dst, src, count, u, dudx);
+ u += dudx*count;
+ dst += count;
+ }
+
+ // process post-clip region
+ if (uint32 count = mAxis.dx_postclip) {
+ uint32 offset = mSrcWidth + 1 - mRowFiltW;
+
+ memcpy(p, src+offset, (mRowFiltW-1)*sizeof(uint32));
+ VDMemset32(p + (mRowFiltW-1), src[mSrcWidth-1], mRowFiltW);
+
+ mpRowStage->Process(dst, p, count, u - (offset<<16), dudx);
+ dst += count;
+ }
+ }
+
+ // process post-copy region
+ if (uint32 count = mAxis.dx_postcopy) {
+ VDMemset32(dst, src[mSrcWidth-1], count);
+ }
+}
+
+void VDPixmapGenResampleRow::Compute128(void *dst0, sint32 y) {
+ const uint32 *src = (const uint32 *)mpSrc->GetRow(y, mSrcIndex);
+ uint32 *dst = (uint32 *)dst0;
+
+ // process pre-copy region
+ if (uint32 count = mAxis.dx_precopy) {
+ VDMemset128(dst, src, count);
+ dst += 4*count;
+ }
+
+ uint32 *p = mTempSpace.data();
+ sint32 u = mAxis.u;
+ const sint32 dudx = mAxis.dudx;
+
+ // process dual-clip region
+ if (uint32 count = mAxis.dx_dualclip) {
+ VDMemset128(p, src, mRowFiltW);
+ memcpy(p + 4*mRowFiltW, src+1, (mSrcWidth-2)*sizeof(uint32)*4);
+ VDMemset128(p + 4*(mRowFiltW + (mSrcWidth-2)), src + 4*(mSrcWidth-1), mRowFiltW);
+
+ mpRowStage->Process(dst, p, count, u + ((mRowFiltW-1)<<16), dudx);
+ u += dudx*count;
+ dst += count * 4;
+ } else if (mpRowStage2) {
+ mpRowStage2->Process(dst, p, mAxis.dx_preclip + mAxis.dx_active + mAxis.dx_postclip);
+ } else {
+ // process pre-clip region
+ if (uint32 count = mAxis.dx_preclip) {
+ VDMemset128(p, src, mRowFiltW);
+ memcpy(p + 4*mRowFiltW, src+1, (mRowFiltW-1)*sizeof(uint32)*4);
+
+ mpRowStage->Process(dst, p, count, u + ((mRowFiltW-1)<<16), dudx);
+ u += dudx*count;
+ dst += count*4;
+ }
+
+ // process active region
+ if (uint32 count = mAxis.dx_active) {
+ mpRowStage->Process(dst, src, count, u, dudx);
+ u += dudx*count;
+ dst += count*4;
+ }
+
+ // process post-clip region
+ if (uint32 count = mAxis.dx_postclip) {
+ uint32 offset = mSrcWidth + 1 - mRowFiltW;
+
+ memcpy(p, src+offset*4, (mRowFiltW-1)*sizeof(uint32)*4);
+ VDMemset128(p + 4*(mRowFiltW-1), src + 4*(mSrcWidth-1), mRowFiltW);
+
+ mpRowStage->Process(dst, p, count, u - (offset<<16), dudx);
+ dst += count*4;
+ }
+ }
+
+ // process post-copy region
+ if (uint32 count = mAxis.dx_postcopy) {
+ VDMemset128(dst, src + 4*(mSrcWidth-1), count);
+ }
+}
+
+///////////////////////////////////////////////////////////////////////////
+//
+// VDPixmapGenResampleCol
+//
+///////////////////////////////////////////////////////////////////////////
+
+VDPixmapGenResampleCol::VDPixmapGenResampleCol()
+ : mpColStage(NULL)
+{
+}
+
+VDPixmapGenResampleCol::~VDPixmapGenResampleCol() {
+ if (mpColStage)
+ delete mpColStage;
+}
+
+void VDPixmapGenResampleCol::Init(IVDPixmapGen *src, uint32 srcIndex, uint32 height, float offset, float step, nsVDPixmap::FilterMode filterMode, float filterFactor, bool interpolationOnly) {
+ InitSource(src, srcIndex);
+
+ sint32 dvdy = (sint32)(step * 65536.0);
+
+ mAxis.Init(dvdy);
+
+ // construct stages
+ double y_2fc = 1.0;
+ if (!interpolationOnly && step > 1.0f)
+ y_2fc = 1.0 / step;
+
+ struct SpanRoutine {
+ uint32 mType;
+ bool mbInterpOnly;
+ nsVDPixmap::FilterMode mFilterMode;
+ uint32 mCPUFlags;
+ IVDResamplerSeparableColStage *(*mpClassFactory)(double filterCutoff, float filterFactor);
+ };
+
+ static const SpanRoutine kSpanRoutines[]={
+#if defined _M_IX86
+ // X86
+ { kVDPixType_8, false, nsVDPixmap::kFilterLinear, CPUF_SUPPORTS_SSE41, ColFactoryLinear<VDResamplerSeparableTableColStage8SSE41> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterLinear, CPUF_SUPPORTS_MMX, ColFactoryLinear<VDResamplerSeparableTableColStage8MMX> },
+ { kVDPixType_8888, true, nsVDPixmap::kFilterLinear, CPUF_SUPPORTS_MMX, ColFactory<VDResamplerSeparableLinearColStageMMX> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterLinear, CPUF_SUPPORTS_SSE2, ColFactoryLinear<VDResamplerSeparableTableColStageSSE2> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterLinear, CPUF_SUPPORTS_MMX, ColFactoryLinear<VDResamplerSeparableTableColStageMMX> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterCubic, CPUF_SUPPORTS_SSE41, ColFactoryCubic<VDResamplerSeparableTableColStage8SSE41> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterCubic, CPUF_SUPPORTS_MMX, ColFactoryCubic<VDResamplerSeparableTableColStage8MMX> },
+ { kVDPixType_8888, true, nsVDPixmap::kFilterCubic, CPUF_SUPPORTS_SSE2, ColFactoryCubic2<VDResamplerSeparableCubicColStageSSE2> },
+ { kVDPixType_8888, true, nsVDPixmap::kFilterCubic, CPUF_SUPPORTS_MMX, ColFactoryCubic2<VDResamplerSeparableCubicColStageMMX> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterCubic, CPUF_SUPPORTS_SSE2, ColFactoryCubic<VDResamplerSeparableTableColStageSSE2> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterCubic, CPUF_SUPPORTS_MMX, ColFactoryCubic<VDResamplerSeparableTableColStageMMX> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterLanczos3, CPUF_SUPPORTS_SSE41, ColFactoryLanczos3<VDResamplerSeparableTableColStage8SSE41> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterLanczos3, CPUF_SUPPORTS_MMX, ColFactoryLanczos3<VDResamplerSeparableTableColStage8MMX> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterLanczos3, CPUF_SUPPORTS_SSE2, ColFactoryLanczos3<VDResamplerSeparableTableColStageSSE2> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterLanczos3, CPUF_SUPPORTS_MMX, ColFactoryLanczos3<VDResamplerSeparableTableColStageMMX> },
+#elif defined _M_AMD64
+ // AMD64
+ { kVDPixType_8888, false, nsVDPixmap::kFilterLinear, CPUF_SUPPORTS_SSE2, ColFactoryLinear<VDResamplerSeparableTableColStageSSE2> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterCubic, CPUF_SUPPORTS_SSE2, ColFactoryCubic<VDResamplerSeparableTableColStageSSE2> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterLanczos3, CPUF_SUPPORTS_SSE2, ColFactoryLanczos3<VDResamplerSeparableTableColStageSSE2> },
+#endif
+ // Generic
+ { kVDPixType_8, true, nsVDPixmap::kFilterLinear, 0, ColFactory<VDResamplerColStageSeparableLinear8> },
+ { kVDPixType_8888, true, nsVDPixmap::kFilterLinear, 0, ColFactory<VDResamplerColStageSeparableLinear32> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterLinear, 0, ColFactoryLinear<VDResamplerColStageSeparableTable8> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterLinear, 0, ColFactoryLinear<VDResamplerColStageSeparableTable32> },
+ { kVDPixType_32F_LE, false, nsVDPixmap::kFilterLinear, 0, ColFactoryLinear<VDResamplerColStageSeparableTable32F> },
+ { kVDPixType_32Fx4_LE, false, nsVDPixmap::kFilterLinear, 0, ColFactoryLinear<VDResamplerColStageSeparableTable32Fx4> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterCubic, 0, ColFactoryCubic<VDResamplerColStageSeparableTable8> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterCubic, 0, ColFactoryCubic<VDResamplerColStageSeparableTable32> },
+ { kVDPixType_32F_LE, false, nsVDPixmap::kFilterCubic, 0, ColFactoryCubic<VDResamplerColStageSeparableTable32F> },
+ { kVDPixType_32Fx4_LE, false, nsVDPixmap::kFilterCubic, 0, ColFactoryCubic<VDResamplerColStageSeparableTable32Fx4> },
+ { kVDPixType_8, false, nsVDPixmap::kFilterLanczos3, 0, ColFactoryLanczos3<VDResamplerColStageSeparableTable8> },
+ { kVDPixType_8888, false, nsVDPixmap::kFilterLanczos3, 0, ColFactoryLanczos3<VDResamplerColStageSeparableTable32> },
+ { kVDPixType_32F_LE, false, nsVDPixmap::kFilterLanczos3, 0, ColFactoryLanczos3<VDResamplerColStageSeparableTable32F> },
+ { kVDPixType_32Fx4_LE, false, nsVDPixmap::kFilterLanczos3, 0, ColFactoryLanczos3<VDResamplerColStageSeparableTable32Fx4> },
+ };
+
+ long flags = CPUGetEnabledExtensions();
+ uint32 type = src->GetType(srcIndex) & kVDPixType_Mask;
+ for(int i=0; i<sizeof(kSpanRoutines)/sizeof(kSpanRoutines[0]); ++i) {
+ const SpanRoutine& rout = kSpanRoutines[i];
+
+ if (rout.mType != type)
+ continue;
+
+ if (rout.mbInterpOnly && y_2fc < 1.0)
+ continue;
+
+ if (rout.mFilterMode != filterMode)
+ continue;
+
+ if ((rout.mCPUFlags & flags) != rout.mCPUFlags)
+ continue;
+
+ mpColStage = rout.mpClassFactory(y_2fc, filterFactor);
+ break;
+ }
+
+ mWinSize = mpColStage ? mpColStage->GetWindowSize() : 1;
+ mWindow.resize(mWinSize);
+
+ int delta = (mWinSize + 1) >> 1;
+ mpSrc->AddWindowRequest(-delta, delta);
+
+ sint32 fsy1 = (sint32)(offset * 65536.0) - ((mWinSize-1)<<15);
+ mAxis.Compute(height, fsy1, mSrcHeight, mWinSize);
+ mHeight = height;
+
+ switch(type) {
+ case kVDPixType_8:
+ mBytesPerSample = 1;
+ break;
+ case kVDPixType_8888:
+ case kVDPixType_32F_LE:
+ mBytesPerSample = 4;
+ break;
+ case kVDPixType_32Fx4_LE:
+ mBytesPerSample = 16;
+ break;
+
+ default:
+ VDASSERT(false);
+ }
+}
+
+void VDPixmapGenResampleCol::Start() {
+ mBytesPerRow = mWidth * mBytesPerSample;
+ StartWindow(mBytesPerRow);
+}
+
+void VDPixmapGenResampleCol::Compute(void *dst0, sint32 y) {
+ const uint32 winsize = mWinSize;
+ const uint32 dx = mSrcWidth;
+
+ y -= (sint32)mAxis.dx_precopy;
+
+ if (y < 0) {
+ const void *srcrow0 = mpSrc->GetRow(0, mSrcIndex);
+ memcpy(dst0, srcrow0, mBytesPerRow);
+ return;
+ }
+
+ uint32 midrange = mAxis.dx_preclip + mAxis.dx_active + mAxis.dx_postclip + mAxis.dx_dualclip;
+
+ if (y < (sint32)midrange) {
+ sint32 v = mAxis.u + mAxis.dudx * y;
+
+ if (mpColStage) {
+ for(uint32 i=0; i<winsize; ++i) {
+ int sy = (v >> 16) + i;
+
+ if ((unsigned)sy >= (unsigned)mSrcHeight)
+ sy = (~sy >> 31) & (mSrcHeight - 1);
+
+ mWindow[i] = mpSrc->GetRow(sy, mSrcIndex);
+ }
+
+ mpColStage->Process(dst0, mWindow.data(), dx, v);
+ } else
+ memcpy(dst0, mpSrc->GetRow(v >> 16, mSrcIndex), mBytesPerRow);
+ return;
+ }
+
+ const void *p = mpSrc->GetRow(mSrcHeight - 1, mSrcIndex);
+
+ memcpy(dst0, p, mBytesPerRow);
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/uberblit_resample_special.cpp b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_resample_special.cpp
new file mode 100644
index 000000000..0c649dd5c
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_resample_special.cpp
@@ -0,0 +1,186 @@
+#include "uberblit_resample_special.h"
+#include "blt_spanutils.h"
+
+///////////////////////////////////////////////////////////////////////////////
+
+void VDPixmapGenResampleRow_d2_p0_lin_u8::Init(IVDPixmapGen *src, uint32 srcIndex) {
+ InitSource(src, srcIndex);
+ src->AddWindowRequest(0, 0);
+
+ mWidth = (mSrcWidth + 1) >> 1;
+}
+
+void VDPixmapGenResampleRow_d2_p0_lin_u8::Start() {
+ mpSrc->Start();
+ StartWindow(mWidth);
+}
+
+void VDPixmapGenResampleRow_d2_p0_lin_u8::Compute(void *dst0, sint32 y) {
+ const uint8 *src = (const uint8 *)mpSrc->GetRow(y, mSrcIndex);
+
+ nsVDPixmapSpanUtils::horiz_compress2x_coaligned((uint8 *)dst0, src, mSrcWidth);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+void VDPixmapGenResampleRow_d4_p0_lin_u8::Init(IVDPixmapGen *src, uint32 srcIndex) {
+ InitSource(src, srcIndex);
+ src->AddWindowRequest(0, 0);
+
+ mWidth = (mSrcWidth + 3) >> 2;
+}
+
+void VDPixmapGenResampleRow_d4_p0_lin_u8::Start() {
+ mpSrc->Start();
+ StartWindow(mWidth);
+}
+
+void VDPixmapGenResampleRow_d4_p0_lin_u8::Compute(void *dst0, sint32 y) {
+ const uint8 *src = (const uint8 *)mpSrc->GetRow(y, mSrcIndex);
+
+ nsVDPixmapSpanUtils::horiz_compress4x_coaligned((uint8 *)dst0, src, mSrcWidth);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+void VDPixmapGenResampleRow_x2_p0_lin_u8::Init(IVDPixmapGen *src, uint32 srcIndex) {
+ InitSource(src, srcIndex);
+ src->AddWindowRequest(0, 0);
+
+ mWidth = mSrcWidth * 2;
+}
+
+void VDPixmapGenResampleRow_x2_p0_lin_u8::Start() {
+ mpSrc->Start();
+ StartWindow(mWidth);
+}
+
+void VDPixmapGenResampleRow_x2_p0_lin_u8::Compute(void *dst0, sint32 y) {
+ const uint8 *src = (const uint8 *)mpSrc->GetRow(y, mSrcIndex);
+
+ nsVDPixmapSpanUtils::horiz_expand2x_coaligned((uint8 *)dst0, src, mWidth);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+void VDPixmapGenResampleRow_x4_p0_lin_u8::Init(IVDPixmapGen *src, uint32 srcIndex) {
+ InitSource(src, srcIndex);
+ src->AddWindowRequest(0, 0);
+
+ mWidth = mSrcWidth * 4;
+}
+
+void VDPixmapGenResampleRow_x4_p0_lin_u8::Start() {
+ mpSrc->Start();
+ StartWindow(mWidth);
+}
+
+void VDPixmapGenResampleRow_x4_p0_lin_u8::Compute(void *dst0, sint32 y) {
+ const uint8 *src = (const uint8 *)mpSrc->GetRow(y, mSrcIndex);
+
+ nsVDPixmapSpanUtils::horiz_expand4x_coaligned((uint8 *)dst0, src, mWidth);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+void VDPixmapGenResampleCol_x2_phalf_lin_u8::Init(IVDPixmapGen *src, uint32 srcIndex) {
+ InitSource(src, srcIndex);
+ src->AddWindowRequest(-2, 2);
+
+ mHeight = (mSrcHeight + 1) >> 1;
+}
+
+void VDPixmapGenResampleCol_x2_phalf_lin_u8::Start() {
+ mpSrc->Start();
+ StartWindow(mWidth);
+}
+
+void VDPixmapGenResampleCol_x2_phalf_lin_u8::Compute(void *dst0, sint32 y) {
+ sint32 y2 = y+y;
+ const uint8 *src[4] = {
+ (const uint8 *)mpSrc->GetRow(y2 > 0 ? y2-1 : 0, mSrcIndex),
+ (const uint8 *)mpSrc->GetRow(y2 , mSrcIndex),
+ (const uint8 *)mpSrc->GetRow(y2+1, mSrcIndex),
+ (const uint8 *)mpSrc->GetRow(y2+2, mSrcIndex)
+ };
+
+ nsVDPixmapSpanUtils::vert_compress2x_centered((uint8 *)dst0, src, mWidth, 0);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+void VDPixmapGenResampleCol_x4_p1half_lin_u8::Init(IVDPixmapGen *src, uint32 srcIndex) {
+ InitSource(src, srcIndex);
+ src->AddWindowRequest(-4, 4);
+
+ mHeight = (mSrcHeight + 2) >> 2;
+}
+
+void VDPixmapGenResampleCol_x4_p1half_lin_u8::Start() {
+ mpSrc->Start();
+ StartWindow(mWidth);
+}
+
+void VDPixmapGenResampleCol_x4_p1half_lin_u8::Compute(void *dst0, sint32 y) {
+ sint32 y4 = y*4;
+ const uint8 *src[8] = {
+ (const uint8 *)mpSrc->GetRow(y4 > 2 ? y4-2 : 0, mSrcIndex),
+ (const uint8 *)mpSrc->GetRow(y4 > 1 ? y4-1 : 0, mSrcIndex),
+ (const uint8 *)mpSrc->GetRow(y4 , mSrcIndex),
+ (const uint8 *)mpSrc->GetRow(y4+1, mSrcIndex),
+ (const uint8 *)mpSrc->GetRow(y4+2, mSrcIndex),
+ (const uint8 *)mpSrc->GetRow(y4+3, mSrcIndex),
+ (const uint8 *)mpSrc->GetRow(y4+4, mSrcIndex),
+ (const uint8 *)mpSrc->GetRow(y4+5, mSrcIndex)
+ };
+
+ nsVDPixmapSpanUtils::vert_compress4x_centered((uint8 *)dst0, src, mWidth, 0);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+void VDPixmapGenResampleCol_d2_pnqrtr_lin_u8::Init(IVDPixmapGen *src, uint32 srcIndex) {
+ InitSource(src, srcIndex);
+ src->AddWindowRequest(-1, 1);
+
+ mHeight = mSrcHeight * 2;
+}
+
+void VDPixmapGenResampleCol_d2_pnqrtr_lin_u8::Start() {
+ mpSrc->Start();
+ StartWindow(mWidth);
+}
+
+void VDPixmapGenResampleCol_d2_pnqrtr_lin_u8::Compute(void *dst0, sint32 y) {
+ sint32 y2 = (y - 1) >> 1;
+ const uint8 *src[2] = {
+ (const uint8 *)mpSrc->GetRow(y2, mSrcIndex),
+ (const uint8 *)mpSrc->GetRow(y2+1, mSrcIndex),
+ };
+
+ nsVDPixmapSpanUtils::vert_expand2x_centered((uint8 *)dst0, src, mWidth, ~y << 7);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+void VDPixmapGenResampleCol_d4_pn38_lin_u8::Init(IVDPixmapGen *src, uint32 srcIndex) {
+ InitSource(src, srcIndex);
+ src->AddWindowRequest(-1, 1);
+
+ mHeight = mSrcHeight * 4;
+}
+
+void VDPixmapGenResampleCol_d4_pn38_lin_u8::Start() {
+ mpSrc->Start();
+ StartWindow(mWidth);
+}
+
+void VDPixmapGenResampleCol_d4_pn38_lin_u8::Compute(void *dst0, sint32 y) {
+ sint32 y2 = (y - 2) >> 2;
+ const uint8 *src[2] = {
+ (const uint8 *)mpSrc->GetRow(y2, mSrcIndex),
+ (const uint8 *)mpSrc->GetRow(y2+1, mSrcIndex),
+ };
+
+ nsVDPixmapSpanUtils::vert_expand4x_centered((uint8 *)dst0, src, mWidth, (y - 2) << 6);
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/uberblit_resample_special_x86.cpp b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_resample_special_x86.cpp
new file mode 100644
index 000000000..b1828fcca
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_resample_special_x86.cpp
@@ -0,0 +1,35 @@
+#include "uberblit_resample_special_x86.h"
+#include "blt_spanutils.h"
+#include "blt_spanutils_x86.h"
+
+void VDPixmapGenResampleRow_x2_p0_lin_u8_ISSE::Compute(void *dst0, sint32 y) {
+ const uint8 *src = (const uint8 *)mpSrc->GetRow(y, mSrcIndex);
+
+ nsVDPixmapSpanUtils::horiz_expand2x_coaligned_ISSE((uint8 *)dst0, src, mWidth);
+}
+
+void VDPixmapGenResampleRow_x4_p0_lin_u8_MMX::Compute(void *dst0, sint32 y) {
+ const uint8 *src = (const uint8 *)mpSrc->GetRow(y, mSrcIndex);
+
+ nsVDPixmapSpanUtils::horiz_expand4x_coaligned_MMX((uint8 *)dst0, src, mWidth);
+}
+
+void VDPixmapGenResampleCol_d2_pnqrtr_lin_u8_ISSE::Compute(void *dst0, sint32 y) {
+ sint32 y2 = (y - 1) >> 1;
+ const uint8 *src[2] = {
+ (const uint8 *)mpSrc->GetRow(y2, mSrcIndex),
+ (const uint8 *)mpSrc->GetRow(y2+1, mSrcIndex),
+ };
+
+ nsVDPixmapSpanUtils::vert_expand2x_centered_ISSE((uint8 *)dst0, src, mWidth, ~y << 7);
+}
+
+void VDPixmapGenResampleCol_d4_pn38_lin_u8_ISSE::Compute(void *dst0, sint32 y) {
+ sint32 y2 = (y - 2) >> 2;
+ const uint8 *src[2] = {
+ (const uint8 *)mpSrc->GetRow(y2, mSrcIndex),
+ (const uint8 *)mpSrc->GetRow(y2+1, mSrcIndex),
+ };
+
+ nsVDPixmapSpanUtils::vert_expand4x_centered_ISSE((uint8 *)dst0, src, mWidth, (y - 2) << 6);
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/uberblit_swizzle.cpp b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_swizzle.cpp
new file mode 100644
index 000000000..4cb5e4409
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_swizzle.cpp
@@ -0,0 +1,89 @@
+#include "uberblit_swizzle.h"
+
+void VDPixmapGen_Swap8In16::Init(IVDPixmapGen *gen, int srcIndex, uint32 w, uint32 h, uint32 bpr) {
+ InitSource(gen, srcIndex);
+ mRowLength = bpr;
+ SetOutputSize(w, h);
+ gen->AddWindowRequest(0, 0);
+}
+
+void VDPixmapGen_Swap8In16::Start() {
+ StartWindow(mRowLength);
+}
+
+uint32 VDPixmapGen_Swap8In16::GetType(uint32 index) const {
+ return mpSrc->GetType(mSrcIndex);
+}
+
+void VDPixmapGen_Swap8In16::Compute(void *dst0, sint32 y) {
+ const uint8 *src = (const uint8 *)mpSrc->GetRow(y, mSrcIndex);
+ uint8 *dst = (uint8 *)dst0;
+ sint32 w = mRowLength;
+
+ uint32 n4 = w >> 2;
+
+ for(uint32 i=0; i<n4; ++i) {
+ uint32 p = *(uint32 *)src;
+ src += 4;
+
+ uint32 r = ((p & 0xff00ff00) >> 8) + ((p & 0x00ff00ff) << 8);
+
+ *(uint32 *)dst = r;
+ dst += 4;
+ }
+
+ if (w & 2) {
+ dst[0] = src[1];
+ dst[1] = src[0];
+ dst += 2;
+ src += 2;
+ }
+
+ if (w & 1) {
+ *dst = *src;
+ }
+}
+
+/////////////////////////////////////////////////////////////////////////////
+
+void VDPixmapGen_B8x2_To_B8R8::Init(IVDPixmapGen *srcCb, uint32 srcindexCb, IVDPixmapGen *srcCr, uint32 srcindexCr) {
+ mpSrcCb = srcCb;
+ mSrcIndexCb = srcindexCb;
+ mpSrcCr = srcCr;
+ mSrcIndexCr = srcindexCr;
+ mWidth = srcCb->GetWidth(srcindexCb);
+ mHeight = srcCb->GetHeight(srcindexCb);
+
+ srcCb->AddWindowRequest(0, 0);
+ srcCr->AddWindowRequest(0, 0);
+}
+
+void VDPixmapGen_B8x2_To_B8R8::Start() {
+ mpSrcCb->Start();
+ mpSrcCr->Start();
+
+ StartWindow(mWidth * 2);
+}
+
+uint32 VDPixmapGen_B8x2_To_B8R8::GetType(uint32 output) const {
+ return (mpSrcCb->GetType(mSrcIndexCb) & ~kVDPixType_Mask) | kVDPixType_B8R8;
+}
+
+void VDPixmapGen_B8x2_To_B8R8::Compute(void *dst0, sint32 y) {
+ uint8 *VDRESTRICT dst = (uint8 *)dst0;
+ const uint8 *VDRESTRICT srcCb = (const uint8 *)mpSrcCb->GetRow(y, mSrcIndexCb);
+ const uint8 *VDRESTRICT srcCr = (const uint8 *)mpSrcCr->GetRow(y, mSrcIndexCr);
+
+ sint32 w = mWidth;
+ for(sint32 x=0; x<w; ++x) {
+ uint8 cb = srcCb[0];
+ uint8 cr = srcCr[0];
+
+ dst[0] = cb;
+ dst[1] = cr;
+
+ ++srcCb;
+ ++srcCr;
+ dst += 2;
+ }
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/uberblit_swizzle_x86.cpp b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_swizzle_x86.cpp
new file mode 100644
index 000000000..3a87d5a68
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_swizzle_x86.cpp
@@ -0,0 +1,400 @@
+#include "uberblit_swizzle_x86.h"
+
+#ifdef VD_COMPILER_MSVC
+ #pragma warning(disable: 4799) // warning C4799: function 'vdasm_extract_8in16_even_MMX' has no EMMS instruction
+#endif
+
+// Copies the even (low) byte of each 16-bit unit of src[] to dst[], writing
+// 'count' bytes. __fastcall: ecx = dst, edx = src, count at [esp+4].
+// Deliberately leaves MMX state dirty (no EMMS) -- see the pragma above;
+// the caller is responsible for cleanup.
+void __declspec(naked) __fastcall vdasm_extract_8in16_even_MMX(void *dst, const void *src, uint32 count) {
+	__asm {
+		mov eax, [esp+4]	; eax = remaining output count
+		pcmpeqb mm2, mm2
+		psrlw mm2, 8		; mm2 = 0x00ff per word (low-byte mask)
+		sub eax, 8
+		jc xtra			; fewer than 8 outputs: scalar path only
+xloop:
+		; 16 source bytes -> 8 masked+packed destination bytes per pass
+		movq mm0, [edx]
+		movq mm1, [edx+8]
+		pand mm0, mm2
+		pand mm1, mm2
+		packuswb mm0, mm1
+		add edx, 16
+		movq [ecx], mm0
+		add ecx, 8
+		sub eax, 8
+		jns xloop
+xtra:
+		add eax, 8		; restore count of leftover bytes (0..7)
+		jz fin
+		push ebx
+xtraloop:
+		; scalar tail: copy every other source byte
+		mov bl, [edx]
+		add edx, 2
+		mov [ecx], bl
+		add ecx, 1
+		sub eax, 1
+		jnz xtraloop
+
+		pop ebx
+fin:
+		ret 4			; pop the single stack argument
+	}
+}
+
+// Copies the odd (high) byte of each 16-bit unit of src[] to dst[], writing
+// 'count' bytes. __fastcall: ecx = dst, edx = src, count at [esp+4].
+// No EMMS; MMX state is cleaned up by the caller.
+void __declspec(naked) __fastcall vdasm_extract_8in16_odd_MMX(void *dst, const void *src, uint32 count) {
+	__asm {
+		mov eax, [esp+4]	; eax = remaining output count
+		sub eax, 8
+		jc xtra			; fewer than 8 outputs: scalar path only
+xloop:
+		; shift the high byte of each word down, then pack 16 -> 8 bytes
+		movq mm0, [edx]
+		movq mm1, [edx+8]
+		psrlw mm0, 8
+		psrlw mm1, 8
+		add edx, 16
+		packuswb mm0, mm1
+		movq [ecx], mm0
+		add ecx, 8
+		sub eax, 8
+		jns xloop
+xtra:
+		add eax, 8		; restore count of leftover bytes (0..7)
+		jz fin
+		push ebx
+xtraloop:
+		; scalar tail: copy every other source byte, offset by one
+		mov bl, [edx+1]
+		add edx, 2
+		mov [ecx], bl
+		add ecx, 1
+		sub eax, 1
+		jnz xtraloop
+
+		pop ebx
+fin:
+		ret 4			; pop the single stack argument
+	}
+}
+
+// Extracts byte lane 'byteshift' (0..3) of each 32-bit unit of src[] into
+// dst[], writing 'count' bytes. __fastcall: ecx = dst, edx = src,
+// count at [esp+4], byteshift at [esp+8]. No EMMS; caller cleans up.
+void __declspec(naked) __fastcall vdasm_extract_8in32_MMX(void *dst, const void *src, uint32 count, int byteshift) {
+	__asm {
+		movd mm4, [esp+8]
+		pcmpeqb mm5, mm5
+		pslld mm4, 3		; mm4 = byteshift * 8 = bit shift per dword
+		mov eax, [esp+4]
+		psrld mm5, 24		; mm5 = 0x000000ff per dword (byte mask)
+		sub eax, 8
+		jc xtra
+xloop:
+		; 32 source bytes -> 8 destination bytes per pass:
+		; shift lane to bit 0, mask, then pack dwords->words->bytes
+		movq mm0, [edx]
+		movq mm1, [edx+8]
+		psrld mm0, mm4
+		movq mm2, [edx+16]
+		psrld mm1, mm4
+		pand mm0, mm5
+		movq mm3, [edx+24]
+		psrld mm2, mm4
+		pand mm1, mm5
+		packssdw mm0, mm1
+		psrld mm3, mm4
+		pand mm2, mm5
+		pand mm3, mm5
+		add edx, 32
+		packssdw mm2, mm3
+		packuswb mm0, mm2
+		movq [ecx], mm0
+		add ecx, 8
+		sub eax, 8
+		jns xloop
+xtra:
+		add eax, 8		; restore count of leftover bytes (0..7)
+		jz fin
+		add edx, dword ptr [esp+8]	; point at the selected byte lane
+		push ebx
+xtraloop:
+		mov bl, [edx]
+		add edx, 4
+		mov [ecx], bl
+		add ecx, 1
+		sub eax, 1
+		jnz xtraloop
+
+		pop ebx
+fin:
+		ret 8			; pop the two stack arguments
+	}
+}
+
+// Swaps the two bytes of each 16-bit unit: src[] -> dst[], 'count' bytes.
+// A trailing odd byte is copied unchanged. __fastcall: ecx = dst,
+// edx = src, count at [esp+4]. No EMMS; caller cleans up.
+void __declspec(naked) __fastcall vdasm_swap_8in16_MMX(void *dst, const void *src, uint32 count) {
+	__asm {
+		mov eax, [esp+4]	; eax = remaining byte count
+		sub eax, 8
+		js xtra
+xloop:
+		; swap within each word: (x << 8) + (x >> 8)
+		movq mm0, [edx]
+		add edx, 8
+		movq mm1, mm0
+		psllw mm0, 8
+		psrlw mm1, 8
+		paddb mm0, mm1
+		movq [ecx], mm0
+		add ecx, 8
+		sub eax, 8
+		jns xloop
+xtra:
+		add eax, 6		; eax = leftover count - 2
+		js nopairs
+		push ebx
+pairloop:
+		; scalar tail: swap one byte pair per iteration
+		mov bl, [edx]
+		mov bh, [edx+1]
+		add edx, 2
+		mov [ecx], bh
+		mov [ecx+1], bl
+		add ecx, 2
+		sub eax, 2
+		jns pairloop
+		pop ebx
+nopairs:
+		add eax, 2		; eax = 1 iff an odd byte remains
+		jz noodd
+		mov al, [edx]
+		mov [ecx], al
+noodd:
+		ret 4			; pop the single stack argument
+	}
+}
+
+// Interleaves three planes into B G R G groups (4 output bytes per count
+// unit); with Cr/Y/Cb passed as R/G/B this produces UYVY. The G plane is
+// consumed at twice the rate of R and B. __fastcall: ecx = dst, edx = srcR;
+// stack: srcG [esp+4], srcB [esp+8], count [esp+12]. No EMMS.
+void __declspec(naked) __fastcall vdasm_interleave_BGRG_MMX(void *dst, const void *srcR, const void *srcG, const void *srcB, uint32 count) {
+	__asm {
+		push edi
+		push esi
+		push ebx
+		mov esi, [esp+12+12]	; esi = count (groups)
+		mov edi, [esp+8+12]	; edi = srcB
+		mov ebx, [esp+4+12]	; ebx = srcG
+		sub esi, 4
+		jc xtra
+		; ecx = dst
+		; edx = srcR
+		; ebx = srcG
+		; edi = srcB
+xloop:
+		; build B,R pairs then interleave with G: 4 groups (16 bytes) per pass
+		movd mm0, [edi]
+		movd mm1, [edx]
+		punpcklbw mm0, mm1
+		movq mm1, [ebx]
+		movq mm2, mm0
+		punpcklbw mm0, mm1
+		add edx, 4
+		punpckhbw mm2, mm1
+		add edi, 4
+		movq [ecx], mm0
+		add ebx, 8
+		movq [ecx+8], mm2
+		add ecx, 16
+		sub esi, 4
+		jns xloop
+xtra:
+		add esi, 4		; restore leftover group count (0..3)
+		jz fin
+xtraloop:
+		; scalar tail: emit one B G R G group
+		mov al, [edi]
+		mov [ecx], al
+		mov al, [ebx]
+		mov [ecx+1], al
+		mov al, [edx]
+		mov [ecx+2], al
+		mov al, [ebx+1]
+		mov [ecx+3], al
+		add ebx, 2
+		add edx, 1
+		add edi, 1
+		add ecx, 4
+		sub esi, 1
+		jnz xtraloop
+fin:
+		pop ebx
+		pop esi
+		pop edi
+		ret 12			; pop the three stack arguments
+	}
+}
+
+// Interleaves three planes into G B G R groups (4 output bytes per count
+// unit); with Cr/Y/Cb passed as R/G/B this produces YUY2. The G plane is
+// consumed at twice the rate of R and B. __fastcall: ecx = dst, edx = srcR;
+// stack: srcG [esp+4], srcB [esp+8], count [esp+12]. No EMMS.
+void __declspec(naked) __fastcall vdasm_interleave_GBGR_MMX(void *dst, const void *srcR, const void *srcG, const void *srcB, uint32 count) {
+	__asm {
+		push edi
+		push esi
+		push ebx
+		mov esi, [esp+12+12]	; esi = count (groups)
+		mov edi, [esp+8+12]	; edi = srcB
+		mov ebx, [esp+4+12]	; ebx = srcG
+		sub esi, 4
+		jc xtra
+		; ecx = dst
+		; edx = srcR
+		; ebx = srcG
+		; edi = srcB
+xloop:
+		; build B,R pairs then interleave under G: 4 groups (16 bytes) per pass
+		movd mm0, [edi]
+		movd mm1, [edx]
+		punpcklbw mm0, mm1
+		movq mm2, [ebx]
+		movq mm1, mm2
+		punpcklbw mm2, mm0
+		add edx, 4
+		punpckhbw mm1, mm0
+		add edi, 4
+		movq [ecx], mm2
+		add ebx, 8
+		movq [ecx+8], mm1
+		add ecx, 16
+		sub esi, 4
+		jns xloop
+xtra:
+		add esi, 4		; restore leftover group count (0..3)
+		jz fin
+xtraloop:
+		; scalar tail: emit one G B G R group
+		mov al, [ebx]
+		mov [ecx], al
+		mov al, [edi]
+		mov [ecx+1], al
+		mov al, [ebx+1]
+		mov [ecx+2], al
+		mov al, [edx]
+		mov [ecx+3], al
+		add ebx, 2
+		add edx, 1
+		add edi, 1
+		add ecx, 4
+		sub esi, 1
+		jnz xtraloop
+fin:
+		pop ebx
+		pop esi
+		pop edi
+		ret 12			; pop the three stack arguments
+	}
+}
+
+// Interleaves two planes into B R byte pairs (2 output bytes per count
+// unit). __fastcall: ecx = dst, edx = srcB; stack: srcR [esp+4],
+// count [esp+8]. No EMMS; caller cleans up.
+// NOTE(review): the inline comment below labels ebx as "srcG", but it is
+// loaded from the srcR argument -- the label looks like a copy/paste from
+// the three-plane routines above.
+void __declspec(naked) __fastcall vdasm_interleave_BR_MMX(void *dst, const void *srcB, const void *srcR, uint32 count) {
+	__asm {
+		push edi		; edi is saved but unused in this routine
+		push esi
+		push ebx
+		mov esi, [esp+8+12]	; esi = count (pairs)
+		mov ebx, [esp+4+12]	; ebx = srcR
+		sub esi, 8
+		jc xtra
+		; ecx = dst
+		; edx = srcB
+		; ebx = srcG
+xloop:
+		; interleave 8 B bytes with 8 R bytes -> 16 output bytes per pass
+		movq mm0, [edx]
+		movq mm1, [ebx]
+		movq mm2, mm0
+		punpcklbw mm0, mm1
+		punpckhbw mm2, mm1
+		add edx, 8
+		movq [ecx], mm0
+		add ebx, 8
+		movq [ecx+8], mm2
+		add ecx, 16
+		sub esi, 8
+		jns xloop
+xtra:
+		add esi, 8		; restore leftover pair count (0..7)
+		jz fin
+xtraloop:
+		; scalar tail: emit one B R pair
+		mov al, [edx]
+		mov [ecx], al
+		mov al, [ebx]
+		mov [ecx+1], al
+		add ebx, 1
+		add edx, 1
+		add ecx, 2
+		sub esi, 1
+		jnz xtraloop
+fin:
+		pop ebx
+		pop esi
+		pop edi
+		ret 8			; pop the two stack arguments
+	}
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// MMX row kernel: extract the even (low) byte of each 16-bit source unit.
+void VDPixmapGen_8In16_Even_MMX::Compute(void *dst, sint32 y) {
+	const uint8 *srcp = (const uint8 *)mpSrc->GetRow(y, mSrcIndex);
+
+	vdasm_extract_8in16_even_MMX(dst, srcp, mWidth);
+}
+
+// MMX row kernel: extract the odd (high) byte of each 16-bit source unit.
+void VDPixmapGen_8In16_Odd_MMX::Compute(void *dst, sint32 y) {
+	const uint8 *srcp = (const uint8 *)mpSrc->GetRow(y, mSrcIndex);
+
+	vdasm_extract_8in16_odd_MMX(dst, srcp, mWidth);
+}
+
+// MMX row kernel: extract byte lane mOffset (0..3) of each 32-bit unit.
+void VDPixmapGen_8In32_MMX::Compute(void *dst, sint32 y) {
+	const uint8 *srcp = (const uint8 *)mpSrc->GetRow(y, mSrcIndex);
+
+	vdasm_extract_8in32_MMX(dst, srcp, mWidth, mOffset);
+}
+
+// MMX row kernel: swap the bytes of each 16-bit unit across mRowLength bytes.
+void VDPixmapGen_Swap8In16_MMX::Compute(void *dst, sint32 y) {
+	const uint8 *src = (const uint8 *)mpSrc->GetRow(y, mSrcIndex);
+
+	vdasm_swap_8in16_MMX(dst, src, mRowLength);
+}
+
+// MMX row kernel: interleave the Cb and Cr planes into CbCr byte pairs.
+void VDPixmapGen_B8x2_To_B8R8_MMX::Compute(void *dst0, sint32 y) {
+	uint8 *VDRESTRICT dst = (uint8 *VDRESTRICT)dst0;
+	const uint8 *VDRESTRICT srcCb = (const uint8 *VDRESTRICT)mpSrcCb->GetRow(y, mSrcIndexCb);
+	const uint8 *VDRESTRICT srcCr = (const uint8 *VDRESTRICT)mpSrcCr->GetRow(y, mSrcIndexCr);
+
+	vdasm_interleave_BR_MMX(dst, srcCb, srcCr, mWidth);
+}
+
+// Packs planar Y/Cb/Cr into YUY2-style Y Cb Y Cr groups. The MMX routine
+// handles mWidth >> 1 full pixel pairs; an odd trailing pixel is emitted
+// by hand as Y Cb 0 Cr.
+void VDPixmapGen_B8x3_To_G8B8_G8R8_MMX::Compute(void *VDRESTRICT dst0, sint32 y) {
+	uint8 *VDRESTRICT dst = (uint8 *VDRESTRICT)dst0;
+	const uint8 *VDRESTRICT srcY = (const uint8 *VDRESTRICT)mpSrcY->GetRow(y, mSrcIndexY);
+	const uint8 *VDRESTRICT srcCb = (const uint8 *VDRESTRICT)mpSrcCb->GetRow(y, mSrcIndexCb);
+	const uint8 *VDRESTRICT srcCr = (const uint8 *VDRESTRICT)mpSrcCr->GetRow(y, mSrcIndexCr);
+
+	vdasm_interleave_GBGR_MMX(dst, srcCr, srcY, srcCb, mWidth >> 1);
+
+	if (mWidth & 1) {
+		// Reposition from the original row pointers: the locals were not
+		// advanced by the asm call. dst lands two bytes past the asm output,
+		// so dst[-2..1] covers the final four-byte group.
+		int w2 = mWidth >> 1;
+		srcY += mWidth;
+		srcCb += w2;
+		srcCr += w2;
+		dst += mWidth * 2;
+
+		dst[-2] = srcY[-1];
+		dst[-1] = srcCb[0];
+		dst[ 0] = 0; // must be zero for QuickTime compatibility
+		dst[ 1] = srcCr[0];
+	}
+}
+
+// Packs planar Y/Cb/Cr into UYVY-style Cb Y Cr Y groups. The MMX routine
+// handles mWidth >> 1 full pixel pairs; an odd trailing pixel is emitted
+// by hand as Cb Y Cr 0.
+void VDPixmapGen_B8x3_To_B8G8_R8G8_MMX::Compute(void *VDRESTRICT dst0, sint32 y) {
+	uint8 *VDRESTRICT dst = (uint8 *VDRESTRICT)dst0;
+	const uint8 *VDRESTRICT srcY = (const uint8 * VDRESTRICT)mpSrcY->GetRow(y, mSrcIndexY);
+	const uint8 *VDRESTRICT srcCb = (const uint8 * VDRESTRICT)mpSrcCb->GetRow(y, mSrcIndexCb);
+	const uint8 *VDRESTRICT srcCr = (const uint8 * VDRESTRICT)mpSrcCr->GetRow(y, mSrcIndexCr);
+
+	vdasm_interleave_BGRG_MMX(dst, srcCr, srcY, srcCb, mWidth >> 1);
+
+	if (mWidth & 1) {
+		// Reposition from the original row pointers: the locals were not
+		// advanced by the asm call. dst lands two bytes past the asm output,
+		// so dst[-2..1] covers the final four-byte group.
+		int w2 = mWidth >> 1;
+		srcY += mWidth;
+		srcCb += w2;
+		srcCr += w2;
+		dst += mWidth * 2;
+
+		dst[-2] = srcCb[0];
+		dst[-1] = srcY[-1];
+		dst[ 0] = srcCr[0];
+		dst[ 1] = 0; // must be zero for QuickTime compatibility
+	}
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/uberblit_v210.cpp b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_v210.cpp
new file mode 100644
index 000000000..78793f477
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_v210.cpp
@@ -0,0 +1,199 @@
+#include <vd2/system/halffloat.h>
+#include <vd2/system/math.h>
+#include "uberblit_v210.h"
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Packs three planar 32F channels into v210 (10-bit 4:2:2, three samples per
+// dword). Channel naming is generic: per the dword layout comments below,
+// R maps to Cr, G to Y (double rate), B to Cb. Each sample is clamped to
+// [0,1] and scaled by 1024 -- NOTE(review): scale is 1024 here while the
+// v210 unpacker below divides by 1023; confirm this asymmetry is intended
+// upstream.
+void VDPixmapGen_32F_To_V210::Compute(void *dst0, sint32 y) {
+	uint32 *dst = (uint32 *)dst0;
+	const float *srcR = (const float *)mpSrcR->GetRow(y, mSrcIndexR);
+	const float *srcG = (const float *)mpSrcG->GetRow(y, mSrcIndexG);
+	const float *srcB = (const float *)mpSrcB->GetRow(y, mSrcIndexB);
+
+	// Clear MMX/x87 interaction before doing float math.
+	VDCPUCleanupExtensions();
+
+	// Main loop: each group of 6 pixels produces 4 output dwords and
+	// consumes 6 G (luma) and 3 R/B (chroma) samples.
+	int w6 = mWidth / 6;
+	for(sint32 i=0; i<w6; ++i) {
+		float r0 = srcR[0];
+		float r1 = srcR[1];
+		float r2 = srcR[2];
+		srcR += 3;
+
+		float b0 = srcB[0];
+		float b1 = srcB[1];
+		float b2 = srcB[2];
+		srcB += 3;
+
+		float g0 = srcG[0];
+		float g1 = srcG[1];
+		float g2 = srcG[2];
+		float g3 = srcG[3];
+		float g4 = srcG[4];
+		float g5 = srcG[5];
+		srcG += 6;
+
+		// Clamp all samples to [0, 1] before quantization.
+		if (r0 < 0.0f) r0 = 0.0f; else if (r0 > 1.0f) r0 = 1.0f;
+		if (r1 < 0.0f) r1 = 0.0f; else if (r1 > 1.0f) r1 = 1.0f;
+		if (r2 < 0.0f) r2 = 0.0f; else if (r2 > 1.0f) r2 = 1.0f;
+		if (g0 < 0.0f) g0 = 0.0f; else if (g0 > 1.0f) g0 = 1.0f;
+		if (g1 < 0.0f) g1 = 0.0f; else if (g1 > 1.0f) g1 = 1.0f;
+		if (g2 < 0.0f) g2 = 0.0f; else if (g2 > 1.0f) g2 = 1.0f;
+		if (g3 < 0.0f) g3 = 0.0f; else if (g3 > 1.0f) g3 = 1.0f;
+		if (g4 < 0.0f) g4 = 0.0f; else if (g4 > 1.0f) g4 = 1.0f;
+		if (g5 < 0.0f) g5 = 0.0f; else if (g5 > 1.0f) g5 = 1.0f;
+		if (b0 < 0.0f) b0 = 0.0f; else if (b0 > 1.0f) b0 = 1.0f;
+		if (b1 < 0.0f) b1 = 0.0f; else if (b1 > 1.0f) b1 = 1.0f;
+		if (b2 < 0.0f) b2 = 0.0f; else if (b2 > 1.0f) b2 = 1.0f;
+
+		uint32 ir0 = (uint32)VDRoundToIntFast(r0 * 1024.0f);
+		uint32 ir1 = (uint32)VDRoundToIntFast(r1 * 1024.0f);
+		uint32 ir2 = (uint32)VDRoundToIntFast(r2 * 1024.0f);
+		uint32 ib0 = (uint32)VDRoundToIntFast(b0 * 1024.0f);
+		uint32 ib1 = (uint32)VDRoundToIntFast(b1 * 1024.0f);
+		uint32 ib2 = (uint32)VDRoundToIntFast(b2 * 1024.0f);
+		uint32 ig0 = (uint32)VDRoundToIntFast(g0 * 1024.0f);
+		uint32 ig1 = (uint32)VDRoundToIntFast(g1 * 1024.0f);
+		uint32 ig2 = (uint32)VDRoundToIntFast(g2 * 1024.0f);
+		uint32 ig3 = (uint32)VDRoundToIntFast(g3 * 1024.0f);
+		uint32 ig4 = (uint32)VDRoundToIntFast(g4 * 1024.0f);
+		uint32 ig5 = (uint32)VDRoundToIntFast(g5 * 1024.0f);
+
+		// dword 0: XX Cr0 Y0 Cb0
+		// dword 1: XX Y2 Cb1 Y1
+		// dword 2: XX Cb2 Y3 Cr1
+		// dword 3: XX Y5 Cr2 Y4
+		dst[0] = (ir0 << 20) + (ig0 << 10) + ib0;
+		dst[1] = (ig2 << 20) + (ib1 << 10) + ig1;
+		dst[2] = (ib2 << 20) + (ig3 << 10) + ir1;
+		dst[3] = (ig5 << 20) + (ir2 << 10) + ig4;
+
+		dst += 4;
+	}
+
+	// Partial final group: missing samples are zero-filled.
+	int leftovers = mWidth - w6*6;
+	if (leftovers) {
+		float g0 = 0;
+		float g1 = 0;
+		float g2 = 0;
+		float g3 = 0;
+		float g4 = 0;
+		float r0 = 0;
+		float r1 = 0;
+		float r2 = 0;
+		float b0 = 0;
+		float b1 = 0;
+		float b2 = 0;
+
+		// Deliberate fallthrough: each case loads the samples that exist
+		// for that many leftover pixels.
+		switch(leftovers) {
+		case 5:	r2 = srcR[2];
+				b2 = srcB[2];
+				g4 = srcG[4];
+				// fall through
+		case 4:	g3 = srcG[3];
+				// fall through
+		case 3:	r1 = srcR[1];
+				b1 = srcB[1];
+				g2 = srcG[2];
+				// fall through
+		case 2:	g1 = srcG[1];
+				// fall through
+		case 1:	r0 = srcR[0];
+				b0 = srcB[0];
+				g0 = srcG[0];
+		}
+
+		if (r0 < 0.0f) r0 = 0.0f; else if (r0 > 1.0f) r0 = 1.0f;
+		if (r1 < 0.0f) r1 = 0.0f; else if (r1 > 1.0f) r1 = 1.0f;
+		if (r2 < 0.0f) r2 = 0.0f; else if (r2 > 1.0f) r2 = 1.0f;
+		if (g0 < 0.0f) g0 = 0.0f; else if (g0 > 1.0f) g0 = 1.0f;
+		if (g1 < 0.0f) g1 = 0.0f; else if (g1 > 1.0f) g1 = 1.0f;
+		if (g2 < 0.0f) g2 = 0.0f; else if (g2 > 1.0f) g2 = 1.0f;
+		if (g3 < 0.0f) g3 = 0.0f; else if (g3 > 1.0f) g3 = 1.0f;
+		if (g4 < 0.0f) g4 = 0.0f; else if (g4 > 1.0f) g4 = 1.0f;
+		if (b0 < 0.0f) b0 = 0.0f; else if (b0 > 1.0f) b0 = 1.0f;
+		if (b1 < 0.0f) b1 = 0.0f; else if (b1 > 1.0f) b1 = 1.0f;
+		if (b2 < 0.0f) b2 = 0.0f; else if (b2 > 1.0f) b2 = 1.0f;
+
+		uint32 ir0 = (uint32)VDRoundToIntFast(r0 * 1024.0f);
+		uint32 ir1 = (uint32)VDRoundToIntFast(r1 * 1024.0f);
+		uint32 ir2 = (uint32)VDRoundToIntFast(r2 * 1024.0f);
+		uint32 ib0 = (uint32)VDRoundToIntFast(b0 * 1024.0f);
+		uint32 ib1 = (uint32)VDRoundToIntFast(b1 * 1024.0f);
+		uint32 ib2 = (uint32)VDRoundToIntFast(b2 * 1024.0f);
+		uint32 ig0 = (uint32)VDRoundToIntFast(g0 * 1024.0f);
+		uint32 ig1 = (uint32)VDRoundToIntFast(g1 * 1024.0f);
+		uint32 ig2 = (uint32)VDRoundToIntFast(g2 * 1024.0f);
+		uint32 ig3 = (uint32)VDRoundToIntFast(g3 * 1024.0f);
+		uint32 ig4 = (uint32)VDRoundToIntFast(g4 * 1024.0f);
+
+		// dword 0: XX Cr0 Y0 Cb0
+		// dword 1: XX Y2 Cb1 Y1
+		// dword 2: XX Cb2 Y3 Cr1
+		// dword 3: XX Y5 Cr2 Y4
+		dst[0] = (ir0 << 20) + (ig0 << 10) + ib0;
+		dst[1] = (ig2 << 20) + (ib1 << 10) + ig1;
+		dst[2] = (ib2 << 20) + (ig3 << 10) + ir1;
+		dst[3] = (ir2 << 10) + ig4;		// g5 cannot exist when leftovers <= 5
+		dst += 4;
+	}
+
+	// QuickTime defines the v210 format and requires zero padding in all unused samples.
+	int w48up = (mWidth + 23) / 24;
+	int w6up = (mWidth + 5) / 6;
+	int zeropad = w48up * 16 - w6up * 4;
+	memset(dst, 0, zeropad * 4);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Allocates the output window: three stacked float planes, each padded to a
+// multiple of 6 samples so whole v210 groups can be decoded without bounds
+// checks.
+void VDPixmapGen_V210_To_32F::Start() {
+	StartWindow(((mWidth + 5) / 6) * 6 * sizeof(float), 3);
+}
+
+// The three output planes are stored consecutively within one window row;
+// select the requested plane by offsetting by the window pitch.
+const void *VDPixmapGen_V210_To_32F::GetRow(sint32 y, uint32 index) {
+	return (const uint8 *)VDPixmapGenWindowBasedOneSource::GetRow(y, index) + mWindowPitch * index;
+}
+
+// Output 1 is the full-rate (luma) plane; outputs 0 and 2 are the 4:2:2
+// half-rate chroma planes, rounded up for odd widths.
+sint32 VDPixmapGen_V210_To_32F::GetWidth(int index) const {
+	return index == 1 ? mWidth : (mWidth + 1) >> 1;
+}
+
+// All outputs are 32-bit little-endian float samples; other type flags pass
+// through from the v210 source.
+uint32 VDPixmapGen_V210_To_32F::GetType(uint32 output) const {
+	return (mpSrc->GetType(mSrcIndex) & ~kVDPixType_Mask) | kVDPixType_32F_LE;
+}
+
+// Unpacks a v210 row into three planar float rows (R = Cr, G = Y, B = Cb
+// per the dword layout below), normalizing 10-bit samples by 1023. Whole
+// 4-dword groups are decoded; the window was padded to a multiple of 6
+// samples in Start(), so the final partial group writes into padding.
+// NOTE(review): this assumes the source row also holds ceil(w/6)*4 readable
+// dwords, which the v210 format guarantees.
+void VDPixmapGen_V210_To_32F::Compute(void *dst0, sint32 y) {
+	float *dstR = (float *)dst0;
+	float *dstG = (float *)((char *)dstR + mWindowPitch);
+	float *dstB = (float *)((char *)dstG + mWindowPitch);
+	const uint32 *src = (const uint32 *)mpSrc->GetRow(y, mSrcIndex);
+	uint32 w = (mWidth + 5) / 6;
+
+	// Clear MMX/x87 interaction before doing float math.
+	VDCPUCleanupExtensions();
+
+	// dword 0: XX Cr0 Y0 Cb0
+	// dword 1: XX Y2 Cb1 Y1
+	// dword 2: XX Cb2 Y3 Cr1
+	// dword 3: XX Y5 Cr2 Y4
+
+	for(uint32 i=0; i<w; ++i) {
+		const uint32 w0 = src[0];
+		const uint32 w1 = src[1];
+		const uint32 w2 = src[2];
+		const uint32 w3 = src[3];
+		src += 4;
+
+		dstB[0] = (float)( w0        & 0x3ff) / 1023.0f;
+		dstG[0] = (float)((w0 >> 10) & 0x3ff) / 1023.0f;
+		dstR[0] = (float)((w0 >> 20) & 0x3ff) / 1023.0f;
+		dstG[1] = (float)( w1        & 0x3ff) / 1023.0f;
+		dstB[1] = (float)((w1 >> 10) & 0x3ff) / 1023.0f;
+		dstG[2] = (float)((w1 >> 20) & 0x3ff) / 1023.0f;
+		dstR[1] = (float)( w2        & 0x3ff) / 1023.0f;
+		dstG[3] = (float)((w2 >> 10) & 0x3ff) / 1023.0f;
+		dstB[2] = (float)((w2 >> 20) & 0x3ff) / 1023.0f;
+		dstG[4] = (float)( w3        & 0x3ff) / 1023.0f;
+		dstR[2] = (float)((w3 >> 10) & 0x3ff) / 1023.0f;
+		dstG[5] = (float)((w3 >> 20) & 0x3ff) / 1023.0f;
+
+		dstR += 3;
+		dstG += 6;
+		dstB += 3;
+	}
+}
diff --git a/src/thirdparty/VirtualDub/Kasumi/source/uberblit_ycbcr_x86.cpp b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_ycbcr_x86.cpp
new file mode 100644
index 000000000..d34f731f1
--- /dev/null
+++ b/src/thirdparty/VirtualDub/Kasumi/source/uberblit_ycbcr_x86.cpp
@@ -0,0 +1,35 @@
+#include "uberblit_ycbcr_x86.h"
+
+extern "C" void vdasm_pixblt_XRGB8888_to_YUV444Planar_scan_SSE2(void *dstY, void *dstCb, void *dstCr, const void *srcRGB, uint32 count, const void *coeffs);
+
+// Converts one XRGB8888 row to planar Rec.601 YCbCr via the external SSE2
+// scan routine. The three output planes share one window row: Cb first,
+// then Y, then Cr, each mWindowPitch apart.
+void VDPixmapGenRGB32ToYCbCr601_SSE2::Compute(void *dst0, sint32 y) {
+	uint8 *dstCb = (uint8 *)dst0;
+	uint8 *dstY = dstCb + mWindowPitch;
+	uint8 *dstCr = dstY + mWindowPitch;
+	const uint8 *srcRGB = (const uint8 *)mpSrc->GetRow(y, mSrcIndex);
+
+	// 16-byte-aligned coefficient block consumed by the asm routine; the
+	// memory layout (declaration order) is what matters, not the names.
+	// NOTE(review): the field names below do not match the initializer
+	// comments from the third row on (e.g. 'rb_to_cr' holds the "g to y"
+	// values). The asm reads by byte offset, so behavior follows the
+	// initializer order; the field names look mislabeled -- verify against
+	// vdasm_pixblt_XRGB8888_to_YUV444Planar_scan_SSE2 before relying on them.
+	static const __declspec(align(16)) struct {
+		sint16 rb_to_y[8];
+		sint16 rb_to_cb[8];
+		sint16 rb_to_cr[8];
+		sint16 g_to_y[8];
+		sint16 g_to_cb[8];
+		sint16 g_to_cr[8];
+		sint32 y_bias[4];
+		sint32 c_bias[4];
+	} kCoeffs={
+		// Cb = (28784*r - 24103*g - 4681*b + 8388608 + 32768) >> 16;
+		// Y = (16829*r + 33039*g + 6416*b + 1048576 + 32768) >> 16;
+		// Cr = (-9714*r - 19071*g + 28784*b + 8388608 + 32768) >> 16;
+		{  3208,  8414,  3208,  8414,  3208,  8414,  3208,  8414, },	// rb to y
+		{ -2340, 14392, -2340, 14392, -2340, 14392, -2340, 14392, },	// rb to cb
+		{ 16519,     0, 16519,     0, 16519,     0, 16519,     0, },	// g to y
+		{ -12050,    0, -12050,    0, -12050,    0, -12050,    0, },	// g to cb
+		{ 14392, -4857, 14392, -4857, 14392, -4857, 14392, -4857, },	// rb to cr
+		{ -9535,     0, -9535,     0, -9535,     0, -9535,     0, },	// g to cr
+		{ 0x084000, 0x084000, 0x084000, 0x084000, },	// y bias
+		{ 0x404000, 0x404000, 0x404000, 0x404000, },	// c bias
+	};
+
+	vdasm_pixblt_XRGB8888_to_YUV444Planar_scan_SSE2(dstY, dstCb, dstCr, srcRGB, mWidth, &kCoeffs);
+}