Welcome to mirror list, hosted at ThFree Co, Russian Federation.

a_resample_sse41.asm « source « Kasumi « VirtualDub « thirdparty « src - github.com/mpc-hc/mpc-hc.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: cf7332cb2bf12db1b64d524e80d1a2f8dbbbe8c3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
		segment	.rdata, align=16

; Rounding bias 2000h (= 1 << 13), i.e. half of the 1 << 14 scale that the
; row routines remove with 'psrad ..., 14'. Only the low dword is non-zero.
round		dq		0000000000002000h
; The same 2000h bias stored in both dwords. The column routines load it with
; movq and broadcast the low dword before their own 'psrad ..., 14'.
colround	dq		0000200000002000h

		segment	.text
		
		global		_vdasm_resize_table_row_8_k8_4x_SSE41

;-----------------------------------------------------------------------
; _vdasm_resize_table_row_8_k8_4x_SSE41
;
; 32-bit routine; arguments are read from the stack and the routine ends
; with a plain 'ret'. Offsets below are relative to esp at entry:
;	[esp+ 4]	dst		output bytes, 4 are written per iteration
;	[esp+ 8]	src		base address added to every table offset
;	[esp+12]	count	number of iterations; the loop tests after the
;						decrement, so it must be non-zero
;	[esp+16]	kernel	table of 80-byte records, one per iteration
;
; Record layout as consumed by the loop:
;	+0		4 dwords		byte offsets from src, one per output
;	+16		4 rows x 8 words	filter taps, one 16-byte row per output
; The tap rows are used directly as pmaddwd memory operands.
;
; Each output is sum(source byte * tap) over 8 taps, plus 2000h, shifted
; right arithmetically by 14, then saturated to 0..255.
;
; ebp, edi, esi and ebx are saved and restored; eax, ecx, edx,
; xmm0-xmm3, xmm6 and the flags are overwritten.
;-----------------------------------------------------------------------
_vdasm_resize_table_row_8_k8_4x_SSE41:
		push		ebp
		push		edi
		push		esi
		push		ebx

		; four pushes: argument at entry offset N is now at [esp + 16 + N]
		mov			edi, [esp + 16 + 16]		;edi = current kernel record
		mov			esi, [esp + 16 + 12]		;esi = iterations remaining
		mov			ebp, [esp + 16 + 4]			;ebp = output cursor

		movq		xmm6, [round]
		pshufd		xmm6, xmm6, 0				;xmm6 = 2000h in all four lanes

.quad:
		; all seven general registers are in use inside the loop, so the
		; source base is re-read from its stack slot every iteration
		mov			edx, [esp + 16 + 8]			;edx = src
		mov			eax, [edi]
		mov			ebx, [edi + 4]
		mov			ecx, [edi + 8]
		add			eax, edx					;eax = source for output 0
		add			ebx, edx					;ebx = source for output 1
		add			ecx, edx					;ecx = source for output 2
		add			edx, [edi + 12]				;edx = source for output 3

		; widen 8 source bytes per output to words, multiply by the taps
		pmovzxbw	xmm0, [eax]
		pmovzxbw	xmm1, [ebx]
		pmovzxbw	xmm2, [ecx]
		pmovzxbw	xmm3, [edx]
		pmaddwd		xmm0, [edi + 16]
		pmaddwd		xmm1, [edi + 32]
		pmaddwd		xmm2, [edi + 48]
		pmaddwd		xmm3, [edi + 64]
		lea			edi, [edi + 80]				;step to the next record

		; horizontal adds collapse each register to one sum:
		; xmm0 = { out0, out1, out2, out3 }
		phaddd		xmm0, xmm1
		phaddd		xmm2, xmm3
		phaddd		xmm0, xmm2

		paddd		xmm0, xmm6
		psrad		xmm0, 14
		packssdw	xmm0, xmm0
		packuswb	xmm0, xmm0
		movd		[ebp], xmm0					;store 4 output bytes

		lea			ebp, [ebp + 4]
		dec			esi
		jnz			.quad

		pop			ebx
		pop			esi
		pop			edi
		pop			ebp
		ret

		global		_vdasm_resize_table_row_8_k16_4x_SSE41

;-----------------------------------------------------------------------
; _vdasm_resize_table_row_8_k16_4x_SSE41
;
; 16-tap variant of the 4-outputs-per-iteration row filter. 32-bit
; routine; arguments are read from the stack and the routine ends with a
; plain 'ret'. Offsets below are relative to esp at entry:
;	[esp+ 4]	dst		output bytes, 4 are written per iteration
;	[esp+ 8]	src		base address added to every table offset
;	[esp+12]	count	number of iterations; the loop tests after the
;						decrement, so it must be non-zero
;	[esp+16]	kernel	table of 144-byte records, one per iteration
;
; Record layout as consumed by the loop:
;	+0		4 dwords		byte offsets from src, one per output
;	+16		4 rows x 8 words	taps 0..7,  one 16-byte row per output
;	+80		4 rows x 8 words	taps 8..15, one 16-byte row per output
; The tap rows are used directly as pmaddwd memory operands.
;
; Each output is sum(source byte * tap) over 16 taps, plus 2000h,
; shifted right arithmetically by 14, then saturated to 0..255.
;
; ebp, edi, esi and ebx are saved and restored; eax, ecx, edx,
; xmm0-xmm6 and the flags are overwritten.
;-----------------------------------------------------------------------
_vdasm_resize_table_row_8_k16_4x_SSE41:
		push		ebp
		push		edi
		push		esi
		push		ebx

		; four pushes: argument at entry offset N is now at [esp + 16 + N]
		mov			edi, [esp + 16 + 16]		;edi = current kernel record
		mov			esi, [esp + 16 + 12]		;esi = iterations remaining
		mov			ebp, [esp + 16 + 4]			;ebp = output cursor

		movq		xmm6, [round]
		pshufd		xmm6, xmm6, 0				;xmm6 = 2000h in all four lanes

.quad:
		; no general register is free to cache src, so reload it
		mov			edx, [esp + 16 + 8]			;edx = src
		mov			eax, [edi]
		mov			ebx, [edi + 4]
		mov			ecx, [edi + 8]
		add			eax, edx					;eax = source for output 0
		add			ebx, edx					;ebx = source for output 1
		add			ecx, edx					;ecx = source for output 2
		add			edx, [edi + 12]				;edx = source for output 3

		; output 0: taps 0..7 in xmm0, taps 8..15 folded in via xmm4
		pmovzxbw	xmm0, [eax]
		pmaddwd		xmm0, [edi + 16]
		pmovzxbw	xmm4, [eax + 8]
		pmaddwd		xmm4, [edi + 80]
		paddd		xmm0, xmm4

		; output 1
		pmovzxbw	xmm1, [ebx]
		pmaddwd		xmm1, [edi + 32]
		pmovzxbw	xmm5, [ebx + 8]
		pmaddwd		xmm5, [edi + 96]
		paddd		xmm1, xmm5

		; output 2
		pmovzxbw	xmm2, [ecx]
		pmaddwd		xmm2, [edi + 48]
		pmovzxbw	xmm4, [ecx + 8]
		pmaddwd		xmm4, [edi + 112]
		paddd		xmm2, xmm4

		; output 3
		pmovzxbw	xmm3, [edx]
		pmaddwd		xmm3, [edi + 64]
		pmovzxbw	xmm5, [edx + 8]
		pmaddwd		xmm5, [edi + 128]
		paddd		xmm3, xmm5

		lea			edi, [edi + 144]			;step to the next record

		; horizontal adds collapse each register to one sum:
		; xmm0 = { out0, out1, out2, out3 }
		phaddd		xmm0, xmm1
		phaddd		xmm2, xmm3
		phaddd		xmm0, xmm2

		paddd		xmm0, xmm6
		psrad		xmm0, 14
		packssdw	xmm0, xmm0
		packuswb	xmm0, xmm0
		movd		[ebp], xmm0					;store 4 output bytes

		lea			ebp, [ebp + 4]
		dec			esi
		jnz			.quad

		pop			ebx
		pop			esi
		pop			edi
		pop			ebp
		ret

		global		_vdasm_resize_table_row_8_SSE41

;-----------------------------------------------------------------------
; _vdasm_resize_table_row_8_SSE41
;
; Row filter for an arbitrary kernel width, one output byte per outer
; iteration. 32-bit routine; arguments are read from the stack and the
; routine ends with a plain 'ret'. Offsets relative to esp at entry:
;	[esp+ 4]	dst				output bytes
;	[esp+ 8]	src				base address added to each table offset
;	[esp+12]	width			number of output bytes; tested after the
;								decrement, so it must be non-zero
;	[esp+16]	kernel			filter table (layout below)
;	[esp+20]	kernel width	taps per output; the inner loop subtracts 8
;								until it reaches zero, so it must be a
;								positive multiple of 8
;
; Table record per output, as consumed below:
;	+0		dword			byte offset from src (the rest of the first
;							16 bytes is skipped)
;	+16		kernel width words	taps, read 16 bytes at a time as pmaddwd
;							memory operands
;
; Each output is sum(source byte * tap) + 2000h, shifted right
; arithmetically by 14, then saturated to 0..255.
;
; ebp, edi, esi and ebx are saved and restored; eax, ecx, edx, xmm0,
; xmm2, xmm6 and the flags are overwritten.
;-----------------------------------------------------------------------
_vdasm_resize_table_row_8_SSE41:
		push		ebp
		push		edi
		push		esi
		push		ebx

		movq		xmm6, [round]				;xmm6 = { 2000h, 0, 0, 0 }

		mov			edi, [esp +  4 + 16]		;edi = dst
		mov			ebx, [esp +  8 + 16]		;ebx = src
		mov			ebp, [esp + 12 + 16]		;ebp = width
		mov			edx, [esp + 16 + 16]		;edx = kernel
.yloop:
		;one pass per output byte
		;eax = temp
		;ebx = source base address
		;ecx = (temp) source
		;edx = filter list
		;esi = (temp) kernel width
		;edi = destination
		;ebp = horiz counter

		mov			eax, [edx]					;source offset for this output
		add			edx, 16						;skip the 16-byte record header
		lea			ecx, [ebx + eax]
		mov			esi, [esp + 20 + 16]		;esi = kernel width

		movq		xmm2, xmm6					;accumulator starts at the rounding bias (lane 0 only)
.xloop:
		;8 taps per pass
		pmovzxbw	xmm0, [ecx]
		add			ecx, 8
		pmaddwd		xmm0, [edx]
		paddd		xmm2, xmm0
		add			edx, 16
		sub			esi, 8
		jne			.xloop

		;two horizontal adds fold the four partial sums (and the bias) into every lane
		phaddd		xmm2, xmm2
		phaddd		xmm2, xmm2
		psrad		xmm2, 14
		packssdw	xmm2, xmm2
		packuswb	xmm2, xmm2
		movd		eax, xmm2
		mov			[edi], al					;only the low byte is the result
		add			edi, 1
		sub			ebp, 1
		jne			.yloop

		pop			ebx
		pop			esi
		pop			edi
		pop			ebp
		ret
		

		global		_vdasm_resize_table_col_8_k2_SSE41

;-----------------------------------------------------------------------
; _vdasm_resize_table_col_8_k2_SSE41
;
; Two-row column filter, 4 output bytes per iteration. 32-bit routine;
; arguments are read from the stack and the routine ends with a plain
; 'ret'. Offsets relative to esp at entry:
;	[esp+ 4]	dst		output bytes
;	[esp+ 8]	srcs	array of row pointers; entries 0 and 1 are used
;	[esp+12]	width	byte count; the loop adds 4 to a negated copy and
;						stops only on exactly zero, so it must be a
;						non-zero multiple of 4
;	[esp+16]	kernel	8 bytes are read, the low dword (two word
;						coefficients) is broadcast and used
;
; out[i] = sat8((row0[i] * k0 + row1[i] * k1 + 2000h) >> 14)
;
; ebp, edi, esi and ebx are saved and restored; eax, edx, xmm0, xmm2,
; xmm6, xmm7 and the flags are overwritten.
;-----------------------------------------------------------------------
_vdasm_resize_table_col_8_k2_SSE41:
		push		ebp
		push		edi
		push		esi
		push		ebx

		; four pushes: argument at entry offset N is now at [esp + 16 + N]
		mov			edi, [esp + 16 + 16]		;edi = kernel
		mov			edx, [esp + 16 + 8]			;edx = srcs
		mov			ebp, [esp + 16 + 12]		;ebp = width
		mov			esi, [esp + 16 + 4]			;esi = output cursor

		movq		xmm7, [edi]
		pshufd		xmm7, xmm7, 0				;xmm7 = (k0, k1) word pair in every dword
		movq		xmm6, [colround]
		pshufd		xmm6, xmm6, 0				;xmm6 = 2000h in all four lanes

		; point both rows at their end and count ebp up from -width to 0
		mov			eax, [edx]
		mov			ebx, [edx + 4]
		lea			eax, [eax + ebp]			;eax = row0 + width
		lea			ebx, [ebx + ebp]			;ebx = row1 + width
		neg			ebp

.quad:
		movd		xmm2, [ebx + ebp]			;4 bytes of row1
		movd		xmm0, [eax + ebp]			;4 bytes of row0
		punpcklbw	xmm0, xmm2					;interleave: r0,r1,r0,r1,...
		pmovzxbw	xmm0, xmm0					;widen the 8 interleaved bytes to words
		pmaddwd		xmm0, xmm7					;each dword = r0*k0 + r1*k1

		paddd		xmm0, xmm6
		psrad		xmm0, 14
		packssdw	xmm0, xmm0
		packuswb	xmm0, xmm0
		movd		[esi], xmm0

		lea			esi, [esi + 4]
		add			ebp, 4
		jnz			.quad

		pop			ebx
		pop			esi
		pop			edi
		pop			ebp
		ret

		global		_vdasm_resize_table_col_8_k4_SSE41

;-----------------------------------------------------------------------
; _vdasm_resize_table_col_8_k4_SSE41
;
; Four-row column filter. The main loop produces 8 output bytes per
; pass; a trailing 4-byte group is handled by the .odd block. 32-bit
; routine; arguments are read from the stack and the routine ends with a
; plain 'ret'. Offsets relative to esp at entry:
;	[esp+ 4]	dst		output bytes
;	[esp+ 8]	srcs	array of row pointers; entries 0..3 are used
;	[esp+12]	width	byte count
;						NOTE(review): the loop/tail logic looks designed
;						for a non-zero multiple of 4 -- confirm with the
;						caller
;	[esp+16]	kernel	16 bytes are read (unaligned load)
;
; ebp, edi, esi and ebx are saved and restored; eax, ecx, edx,
; xmm0-xmm7 and the flags are overwritten.
;-----------------------------------------------------------------------
_vdasm_resize_table_col_8_k4_SSE41:
		push		ebp
		push		edi
		push		esi
		push		ebx

		movq		xmm7, [colround]
		pshufd		xmm7, xmm7, 0				;xmm7 = 2000h in all four lanes

		mov			esi, [esp +  4 + 16]		;esi = dst
		mov			edi, [esp + 16 + 16]		;edi = kernel

		movdqu		xmm6, [edi]
		pshufd		xmm5, xmm6, 0				;xmm5 = kernel dword 0 broadcast (coefficient pair for rows 0/1)
		; NOTE(review): 0aah broadcasts kernel dword 2 (bytes 8..11), not
		; dword 1 -- confirm this matches the caller's kernel layout.
		pshufd		xmm6, xmm6, 0aah			;xmm6 = kernel dword 2 broadcast (coefficient pair for rows 2/3)

		mov			edx, [esp +  8 + 16]		;edx = srcs
		mov			ebp, [esp + 12 + 16]		;ebp = width
		mov			eax, [edx+0]
		mov			ebx, [edx+4]
		mov			ecx, [edx+8]
		mov			edx, [edx+12]
		; Bias every row pointer and dst to (base + width - 4) and run ebp
		; upward from (4 - width): [ptr+ebp] then starts at the base, and
		; [ptr] with no index addresses the final 4-byte group.
		lea			eax, [eax+ebp-4]
		lea			ebx, [ebx+ebp-4]
		lea			ecx, [ecx+ebp-4]
		lea			edx, [edx+ebp-4]
		lea			esi, [esi+ebp-4]
		neg			ebp
		add			ebp,4
		jz			.odd						;width == 4: only the tail group exists
.yloop:
		;one pass per 8 output bytes
		;eax = row0
		;ebx = row1
		;ecx = row2
		;edx = row3
		;edi = kernel
		;esi = dest
		;ebp = width counter

		;first 4 columns: interleave rows 0/1 and rows 2/3 bytewise
		movd		xmm0, [eax+ebp]
		movd		xmm1, [ebx+ebp]
		punpcklbw	xmm0, xmm1

		movd		xmm1, [ecx+ebp]
		movd		xmm2, [edx+ebp]
		punpcklbw	xmm1, xmm2

		;next 4 columns, same interleave
		movd		xmm2, [eax+ebp+4]
		movd		xmm3, [ebx+ebp+4]
		punpcklbw	xmm2, xmm3
		
		movd		xmm3, [ecx+ebp+4]
		movd		xmm4, [edx+ebp+4]
		punpcklbw	xmm3, xmm4
		
		;widen to words and multiply-add each row pair by its coefficient pair
		pmovzxbw	xmm0, xmm0
		pmaddwd		xmm0, xmm5
		
		pmovzxbw	xmm1, xmm1
		pmaddwd		xmm1, xmm6
		
		pmovzxbw	xmm2, xmm2
		pmaddwd		xmm2, xmm5
		
		pmovzxbw	xmm3, xmm3
		pmaddwd		xmm3, xmm6

		;combine the rows 0/1 and rows 2/3 contributions
		paddd		xmm0, xmm1
		paddd		xmm2, xmm3

		;add rounding bias, drop the 14 fraction bits
		paddd		xmm0, xmm7
		paddd		xmm2, xmm7

		psrad		xmm0, 14
		psrad		xmm2, 14
		
		;saturate to bytes and store 8 outputs
		packssdw	xmm0, xmm2
		packuswb	xmm0, xmm0
		movq		[esi+ebp], xmm0
		add			ebp, 8
		js			.yloop						;ebp still negative: at least 8 more bytes before the tail slot
		jnz			.noodd						;ebp > 0: the last pass already covered the tail

		;ebp == 0: exactly one 4-byte group remains, at the unindexed pointers
.odd:
		movd		xmm0, [eax]
		movd		xmm1, [ebx]
		movd		xmm2, [ecx]
		movd		xmm3, [edx]
		punpcklbw	xmm0, xmm1
		punpcklbw	xmm2, xmm3
		pmovzxbw	xmm0, xmm0
		pmovzxbw	xmm2, xmm2
		pmaddwd		xmm0, xmm5
		pmaddwd		xmm2, xmm6
		paddd		xmm0, xmm2
		paddd		xmm0, xmm7
		psrad		xmm0, 14
		packssdw	xmm0, xmm0
		packuswb	xmm0, xmm0
		movd		[esi], xmm0					;store the final 4 outputs
.noodd:

		pop			ebx
		pop			esi
		pop			edi
		pop			ebp
		ret

		end