Welcome to mirror list, hosted at ThFree Co, Russian Federation.

a_triblt_mmx.asm « source « Kasumi « VirtualDub « thirdparty « src - github.com/mpc-hc/mpc-hc.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 3836488aa79df3bba13461e85da19a644e204a97 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
		section	.rdata, align=16

; Packed constants: each label holds the same 32-bit value in both dword
; lanes so it can be applied to two dwords of an MMX register at once.
correct:	dd	00008000h, 00008000h	;added to the (u,v) pair after the per-mip shift
round:		dd	00002000h, 00002000h	;not referenced by the routines visible in this file
round1:		dd	00000200h, 00000200h	;rounding bias added before 'psrad 10'
round2:		dd	00020000h, 00020000h	;rounding bias added before 'psrad 18'

		section	.text

		%include	"a_triblt.inc"

		extern	_kVDCubicInterpTableFX14_075_MMX

;--------------------------------------------------------------------------
	global	_vdasm_triblt_span_bilinear_mmx

; Bilinear-filtered texture span, single mip level (MMX).
;
; In:	first stack argument = pointer to a texinfo record, of which
;		texinfo.dst		destination; one 32-bit pixel is stored per iteration
;		texinfo.src		per-pixel coordinate records, 8 bytes each
;		texinfo.w		pixel count; zero returns without touching dst/src
;		texinfo.mips	first mipmap entry supplies bits, pitch and uvmul
; Out:	nothing is returned in registers.
; Regs:	ebp, edi, esi, ebx are saved and restored; ecx, edx, the flags and
;		mm0-mm7 are overwritten.  EMMS is executed before returning.
;
; Each coordinate dword is split into its upper 16 bits (texel index) and
; lower 16 bits (blend fraction).  The low dword of a record drives the
; horizontal blend ([ecx] vs [ecx+4]); the high dword drives the vertical
; blend (row vs row+pitch).
_vdasm_triblt_span_bilinear_mmx:
		push		ebp
		push		edi
		push		esi
		push		ebx
		mov			edi,[esp+4+16]		;texinfo pointer (return address + 4 pushes)
		mov			edx,[edi+texinfo.dst]
		mov			ebp,[edi+texinfo.w]
		shl			ebp,2				;span length in bytes
		mov			ebx,[edi+texinfo.mips+mipmap.bits]
		add			edx,ebp				;edx = end of destination span
		mov			esi,[edi+texinfo.mips+mipmap.pitch]
		neg			ebp					;ebp = negative byte offset, counts up to zero
		jz			.done				;empty span: the do-while loop below would overrun
		movd		mm6,[edi+texinfo.mips+mipmap.uvmul]
		pxor		mm7,mm7				;mm7 = 0, used to widen bytes to words
		mov			edi,[edi+texinfo.src]
.xloop:
		movq		mm4,[edi]			;both coordinates of this pixel
		movq		mm0,mm4
		psrld		mm0,16				;upper 16 bits of each coordinate
		movq		mm5,mm4
		packssdw	mm0,mm0				;low dword = the two texel indices as words
		pmaddwd		mm0,mm6				;low dword = index0*uvmul.word0 + index1*uvmul.word1
		add			edi,8
		punpcklwd	mm4,mm4
		punpckldq	mm4,mm4				;low 16 bits of the first coordinate in all 4 words
		movd		ecx,mm0
		add			ecx,ebx				;ecx = address of the top-left texel
		psrlw		mm4,1				;15-bit weight so pmulhw sees a non-negative value
		movd		mm0,dword [ecx]		;top-left
		movd		mm1,dword [ecx+4]	;top-right
		punpcklbw	mm0,mm7
		movd		mm2,dword [ecx+esi]	;bottom-left
		punpcklbw	mm1,mm7
		movd		mm3,dword [ecx+esi+4]	;bottom-right
		punpcklbw	mm2,mm7
		punpcklbw	mm3,mm7
		psubw		mm1,mm0				;horizontal deltas
		psubw		mm3,mm2
		paddw		mm1,mm1				;doubled to compensate for the halved weight
		paddw		mm3,mm3
		pmulhw		mm1,mm4
		pmulhw		mm3,mm4
		punpckhwd	mm5,mm5
		punpckldq	mm5,mm5				;low 16 bits of the second coordinate in all 4 words
		paddw		mm0,mm1				;mm0 = blended top row
		psrlw		mm5,1
		paddw		mm2,mm3				;mm2 = blended bottom row
		psubw		mm2,mm0				;vertical delta
		paddw		mm2,mm2
		pmulhw		mm2,mm5
		paddw		mm0,mm2				;final blend of the two rows
		packuswb	mm0,mm0				;saturate the words back to bytes
		movd		dword [edx+ebp],mm0
		add			ebp,4
		jnc			.xloop				;carry is set when the offset wraps to zero
.done:
		pop			ebx
		pop			esi
		pop			edi
		pop			ebp
		emms
		ret
		
;--------------------------------------------------------------------------
	global	_vdasm_triblt_span_trilinear_mmx

; Trilinear-filtered texture span (MMX): two bilinear fetches from adjacent
; mipmap entries, blended by the low 8 bits of the per-pixel lambda.
;
; In:	first stack argument = pointer to a texinfo record, of which
;		texinfo.dst		destination; one 32-bit pixel is stored per iteration
;		texinfo.src		array of mipspan records (u, v, lambda)
;		texinfo.w		pixel count; zero returns without touching dst/src
;		texinfo.mips	array of mipmap entries, indexed in 16-byte steps
; Out:	nothing is returned in registers.
; Regs:	ebp, edi, esi, ebx are saved and restored; eax, ecx, edx, the flags
;		and mm0-mm7 are overwritten.  EMMS is executed before returning.
;
; The mip entry stride is hard-coded as 16 bytes (the 'and -16' mask and the
; '+16' displacement for the second fetch).
_vdasm_triblt_span_trilinear_mmx:
		push		ebp
		push		edi
		push		esi
		push		ebx
		mov			esi,[esp+4+16]		;texinfo pointer (return address + 4 pushes)
		mov			edx,[esi+texinfo.dst]
		mov			ebp,[esi+texinfo.w]
		shl			ebp,2				;span length in bytes
		add			edx,ebp				;edx = end of destination span
		neg			ebp					;ebp = negative byte offset, counts up to zero
		jz			.done				;empty span: the do-while loop below would overrun
		mov			edi,[esi+texinfo.src]
		pxor		mm7,mm7				;mm7 = 0, used to widen bytes to words
.xloop:
		movd		mm6,[edi+mipspan.u]
		punpckldq	mm6,[edi+mipspan.v]	;mm6 = v:u
		mov			eax,[edi+mipspan.lambda]
		shr			eax,4
		and			eax,byte -16		;eax = (lambda >> 8) * 16 = byte offset of mip entry
		movd		mm2,eax
		psrlq		mm2,4				;mm2 = lambda >> 8, used as the coordinate shift
		psrld		mm6,mm2				;scale u,v down to the selected entry
		paddd		mm6,[correct]

		;fetch mipmap 1
		mov			ebx,[esi+eax+texinfo.mips+mipmap.pitch]
		movd		mm1,[esi+eax+texinfo.mips+mipmap.uvmul]
		movq		mm4,mm6
		movq		mm0,mm6
		psrld		mm0,16				;upper 16 bits of u and v
		packssdw	mm0,mm0
		pmaddwd		mm0,mm1				;low dword = byte offset of the top-left texel
		movq		mm5,mm4
		punpcklwd	mm4,mm4
		punpckldq	mm4,mm4				;low 16 bits of u in all 4 words
		punpckhwd	mm5,mm5
		punpckldq	mm5,mm5				;low 16 bits of v in all 4 words
		movd		ecx,mm0
		add			ecx,[esi+eax+texinfo.mips+mipmap.bits]
		psrlw		mm4,1				;15-bit weight so pmulhw sees a non-negative value
		movd		mm0,dword [ecx]
		movd		mm1,dword [ecx+4]
		punpcklbw	mm0,mm7
		movd		mm2,dword [ecx+ebx]
		punpcklbw	mm1,mm7
		movd		mm3,dword [ecx+ebx+4]
		punpcklbw	mm2,mm7
		punpcklbw	mm3,mm7
		psubw		mm1,mm0
		psubw		mm3,mm2
		paddw		mm1,mm1				;doubled to compensate for the halved weight
		paddw		mm3,mm3
		pmulhw		mm1,mm4
		pmulhw		mm3,mm4
		paddw		mm0,mm1				;blended top row
		psrlw		mm5,1
		paddw		mm2,mm3				;blended bottom row
		psubw		mm2,mm0
		paddw		mm2,mm2
		pmulhw		mm2,mm5
		paddw		mm0,mm2				;mm0 = sample from the first entry (4 words)

		;fetch mipmap 2
		mov			ebx,[esi+eax+16+texinfo.mips+mipmap.pitch]
		movd		mm1,[esi+eax+16+texinfo.mips+mipmap.uvmul]
		paddd		mm6,[correct]		;(x + 2*correct) >> 1 == x/2 + correct:
		psrld		mm6,1				;half-scale coordinates with the same bias
		movq		mm4,mm6
		psrld		mm6,16
		packssdw	mm6,mm6
		pmaddwd		mm6,mm1
		movq		mm5,mm4
		punpcklwd	mm4,mm4
		punpckldq	mm4,mm4
		punpckhwd	mm5,mm5
		punpckldq	mm5,mm5
		movd		ecx,mm6
		add			ecx,[esi+eax+16+texinfo.mips+mipmap.bits]
		psrlw		mm4,1
		movd		mm6,dword [ecx]
		movd		mm1,dword [ecx+4]
		punpcklbw	mm6,mm7
		movd		mm2,dword [ecx+ebx]
		punpcklbw	mm1,mm7
		movd		mm3,dword [ecx+ebx+4]
		punpcklbw	mm2,mm7
		punpcklbw	mm3,mm7
		psubw		mm1,mm6
		psubw		mm3,mm2
		paddw		mm1,mm1
		paddw		mm3,mm3
		pmulhw		mm1,mm4
		pmulhw		mm3,mm4
		paddw		mm6,mm1
		psrlw		mm5,1
		paddw		mm2,mm3
		psubw		mm2,mm6
		paddw		mm2,mm2
		pmulhw		mm2,mm5
		paddw		mm6,mm2				;mm6 = sample from the second entry (4 words)

		;blend mips
		movd		mm1,[edi+mipspan.lambda]
		punpcklwd	mm1,mm1
		punpckldq	mm1,mm1				;low word of lambda in all 4 words
		psllw		mm1,8				;keep only the low 8 bits, moved to the top byte
		psrlq		mm1,1				;bit 0 of every word is clear, so this halves each word
		psubw		mm6,mm0
		paddw		mm6,mm6
		pmulhw		mm6,mm1
		paddw		mm0,mm6
		packuswb	mm0,mm0				;saturate the words back to bytes

		movd		dword [edx+ebp],mm0
		add			edi, mipspan_size
		add			ebp,4
		jnc			.xloop				;carry is set when the offset wraps to zero
.done:
		pop			ebx
		pop			esi
		pop			edi
		pop			ebp
		emms
		ret

;--------------------------------------------------------------------------
; .SETUPADDR n
;
; Prepares one bicubic fetch for the mipspan record at edi, using the mip
; entry n steps after the one selected by lambda.
;
; Reads:	edi (mipspan record), [esp + .af_mipbase], [correct]
; Leaves:	eax = texel address (mipmap.bits + offset from uvmul)
;			ebx = address of the mip entry
;			ecx = row of _kVDCubicInterpTableFX14_075_MMX picked from u
;			edx = row of the same table picked from v
; Clobbers mm0, mm1, mm2 and the flags.
%macro .SETUPADDR 1
		;locate the mip entry; its byte offset / 16 doubles as the UV shift count
		mov			ebx, [edi + mipspan.lambda]
		shr			ebx, 4
		and			ebx, byte -16
		add			ebx, mipmap_size*%1
		movd		mm2, ebx
		psrlq		mm2, 4
		add			ebx, [esp + .af_mipbase]

		;scale and bias the coordinate pair
		movd		mm0, [edi + mipspan.u]
		punpckldq	mm0, [edi + mipspan.v]
		psrad		mm0, mm2
		paddd		mm0, [correct]

		;horizontal filter row: bits 8..15 of u select a 16-byte table row
		movd		ecx, mm0
		shr			ecx, 4
		and			ecx, 0ff0h
		add			ecx, _kVDCubicInterpTableFX14_075_MMX

		;vertical filter row: same selection applied to v
		movq		mm1, mm0
		psrlq		mm1, 32
		movd		edx, mm1
		shr			edx, 4
		and			edx, 0ff0h
		add			edx, _kVDCubicInterpTableFX14_075_MMX

		;texel address from the upper 16 bits of u and v
		psrld		mm0, 16
		packssdw	mm0, mm0
		movd		mm1, [ebx + mipmap.uvmul]
		pmaddwd		mm0, mm1
		movd		eax, mm0
		add			eax, [ebx + mipmap.bits]
%endmacro
		
; .HCUBIC lo, hi, tmplo, tmphi
;
; Four-tap horizontal filter over the four 32-bit texels at [eax..eax+15],
; weighted by the 16-byte coefficient row at [ecx].
;
; Reads:	eax, ecx, mm7 (the callers in this file zero mm7 beforehand)
; Leaves:	%1 = two dword sums for byte channels 0 and 1
;			%2 = two dword sums for byte channels 2 and 3
; %3 and %4 are scratch.  The four arguments must be distinct MMX registers
; other than mm7.
%macro .HCUBIC 4
		;texels 0 and 1, bytes interleaved, against the first coefficient qword
		movd		%1, dword [eax]
		punpcklbw	%1, qword [eax+4]
		movq		%2, %1
		punpcklbw	%1, mm7
		punpckhbw	%2, mm7
		pmaddwd		%1, [ecx]
		pmaddwd		%2, [ecx]

		;texels 2 and 3, bytes interleaved, against the second coefficient qword
		movd		%3, dword [eax+8]
		punpcklbw	%3, qword [eax+12]
		movq		%4, %3
		punpcklbw	%3, mm7
		punpckhbw	%4, mm7
		pmaddwd		%3, [ecx+8]
		pmaddwd		%4, [ecx+8]

		;combine the two partial sums per channel
		paddd		%1, %3
		paddd		%2, %4
%endmacro

; .VCUBIC pitch
;
; Filters a 4x4 texel block: .HCUBIC is run on four consecutive rows starting
; at eax, then the four row results are combined with the coefficient row at
; [edx].  %1 is the operand added to eax to step to the next row.
;
; Reads:	eax, ecx, edx, mm7, [round1], [round2]
; Leaves:	mm0 = four signed 16-bit channel results; eax advanced by 3 * %1
; Clobbers mm1-mm5, the flags, and the stack slots .af_htemp0 / .af_htemp1.
%macro	.VCUBIC		1
		;rows 0 and 1: mm0/mm1 hold row 0, mm4/mm5 hold row 1 (mm2/mm3 are scratch)
		.HCUBIC		mm0, mm1, mm2, mm3
		add			eax, %1

		.HCUBIC		mm4, mm5, mm2, mm3
		add			eax, %1
		
		;round and drop 10 bits from every horizontal sum
		movq		mm2, [round1]
		
		paddd		mm0, mm2
		paddd		mm1, mm2
		paddd		mm4, mm2
		paddd		mm5, mm2

		psrad		mm0, 10
		psrad		mm1, 10
		psrad		mm4, 10
		psrad		mm5, 10
		
		;narrow to words, then interleave row 0 with row 1 per channel
		packssdw	mm0, mm0
		packssdw	mm1, mm1
		packssdw	mm4, mm4
		packssdw	mm5, mm5
				
		punpcklwd	mm0, mm4
		punpcklwd	mm1, mm5
		
		;first qword of the vertical coefficient row
		movq		mm3, [edx]
		
		pmaddwd		mm0, mm3
		pmaddwd		mm1, mm3
		
		;park the partial vertical sums while the registers are reused
		movq		[esp + .af_htemp0], mm0
		movq		[esp + .af_htemp1], mm1
		
		;rows 2 and 3, same sequence as above
		.HCUBIC		mm0, mm1, mm2, mm3
		add			eax, %1
		.HCUBIC		mm4, mm5, mm2, mm3

		movq		mm2, [round1]
		
		paddd		mm0, mm2
		paddd		mm1, mm2
		paddd		mm4, mm2
		paddd		mm5, mm2

		psrad		mm0, 10
		psrad		mm1, 10
		psrad		mm4, 10
		psrad		mm5, 10
		
		packssdw	mm0, mm0
		packssdw	mm1, mm1
		packssdw	mm4, mm4
		packssdw	mm5, mm5
				
		punpcklwd	mm0, mm4
		punpcklwd	mm1, mm5

		;second qword of the vertical coefficient row
		movq		mm2, [round2]		
		movq		mm3, [edx + 8]
		
		pmaddwd		mm0, mm3
		pmaddwd		mm1, mm3
		
		;add the partial sums from rows 0 and 1
		paddd		mm0, [esp + .af_htemp0]
		paddd		mm1, [esp + .af_htemp1]
		
		;round, drop 18 bits, and pack all four channels into mm0
		paddd		mm0, mm2
		paddd		mm1, mm2
		
		psrad		mm0, 18
		psrad		mm1, 18
		packssdw	mm0, mm1
%endmacro

	global	_vdasm_triblt_span_bicubic_mip_linear_mmx

; Bicubic-filtered texture span with linear blending between two adjacent
; mipmap entries (MMX).
;
; In:	first stack argument = pointer to a texinfo record, of which
;		texinfo.dst		destination; one 32-bit pixel is stored per iteration
;		texinfo.src		array of mipspan records (u, v, lambda)
;		texinfo.w		pixel count; zero returns without touching dst/src
; Out:	nothing is returned in registers.
; Regs:	ebp, edi, esi, ebx are saved and restored; eax, ecx, edx, the flags
;		and mm0-mm7 are overwritten.  EMMS is executed before returning.
;
; The texinfo pointer itself is stored as the mip base, so mip entries are
; addressed from offset 0 of the record.
; NOTE(review): this matches texinfo.mips only if that field is at offset 0;
; confirm against a_triblt.inc.
_vdasm_triblt_span_bicubic_mip_linear_mmx:

;parameters
;offset from ebp as set by the prologue: 3 saved registers + saved ebp +
;return address = 20 bytes
%define .p_texinfo	20

;aligned frame
;8-byte aligned scratch area addressed from esp
%define .af_htemp0	0
%define .af_htemp1	8
%define .af_vtemp0	16
%define .af_mipbase	24
%define	.af_prevesp	28
%define .afsize		32

		push		ebp
		lea			ebp, [esp-12]		;ebp = value esp will have after the next 3 pushes
		push		edi
		push		esi
		push		ebx
		
		sub			esp, .afsize
		and			esp, -8				;align the frame for the movq temporaries
		
		mov			[esp + .af_prevesp], ebp	;remembered so the epilogue can undo the alignment
		
		mov			ebx, [ebp + .p_texinfo]
		mov			ebp, [ebx + texinfo.dst]
		mov			esi, [ebx + texinfo.w]
		shl			esi, 2				;span length in bytes
		add			ebp,esi				;ebp = end of destination span
		neg			esi					;esi = negative byte offset, counts up to zero
		jz			.done				;empty span: the do-while loop below would overrun

		mov			edi, [ebx + texinfo.src]
		mov			[esp + .af_mipbase], ebx
		pxor		mm7, mm7			;mm7 = 0, required by .HCUBIC

.xloop:

		;registers:
		;	eax		base texel address
		;	ebx		first mip info
		;	ecx		horizontal filter
		;	edx		vertical filter
		;	esi		horizontal count
		;	edi		mipspan
		;	ebp		destination

		;fetch mipmap 1
		.SETUPADDR	0
		.VCUBIC		[ebx+mipmap.pitch]
		
		movq		[esp + .af_vtemp0], mm0	;keep the first sample across the second fetch

		;fetch mipmap 2		
		.SETUPADDR	1
		.VCUBIC		[ebx+mipmap.pitch]
		
		;blend mips
		movq		mm1, [esp + .af_vtemp0]
		
		psubw		mm0, mm1			;difference between the two samples

		movd		mm3,[edi+mipspan.lambda]
		punpcklwd	mm3,mm3
		punpckldq	mm3,mm3				;low word of lambda in all 4 words
		psllw		mm3,8				;keep only the low 8 bits, moved to the top byte
		psrlq		mm3,1				;bit 0 of every word is clear, so this halves each word
		
		paddw		mm0,mm0				;doubled to compensate for the halved weight
		pmulhw		mm0,mm3
		paddw		mm0,mm1
		packuswb	mm0,mm0				;saturate the words back to bytes

		movd		dword [ebp+esi],mm0
		add			edi, mipspan_size
		add			esi,4
		jnc			.xloop				;carry is set when the offset wraps to zero

.done:
		mov			esp, [esp + .af_prevesp]	;drop the aligned frame
		pop			ebx
		pop			esi
		pop			edi
		pop			ebp
		emms
		ret

		end