1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
|
/*
* Copyright (C) 2010-2013 Hendrik Leppkes
* http://www.1f0.de
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "stdafx.h"
#include <emmintrin.h>
#include "pixconv_internal.h"
#include "pixconv_sse2_templates.h"
DECLARE_CONV_FUNC_IMPL(convert_yuv444_y410)
{
const uint16_t *y = (const uint16_t *)src[0];
const uint16_t *u = (const uint16_t *)src[1];
const uint16_t *v = (const uint16_t *)src[2];
const ptrdiff_t inStride = srcStride[0] >> 1;
const ptrdiff_t outStride = dstStride[0];
int shift = 10 - bpp;
ptrdiff_t line, i;
__m128i xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7;
xmm7 = _mm_set1_epi32(0xC0000000);
xmm6 = _mm_setzero_si128();
_mm_sfence();
for (line = 0; line < height; ++line) {
__m128i *dst128 = (__m128i *)(dst[0] + line * outStride);
for (i = 0; i < width; i+=8) {
PIXCONV_LOAD_PIXEL8_ALIGNED(xmm0, (y+i));
xmm0 = _mm_slli_epi16(xmm0, shift);
PIXCONV_LOAD_PIXEL8_ALIGNED(xmm1, (u+i));
xmm1 = _mm_slli_epi16(xmm1, shift);
PIXCONV_LOAD_PIXEL8_ALIGNED(xmm2, (v+i));
xmm2 = _mm_slli_epi16(xmm2, shift+4); // +4 so its directly aligned properly (data from bit 14 to bit 4)
xmm3 = _mm_unpacklo_epi16(xmm1, xmm2); // 0VVVVV00000UUUUU
xmm4 = _mm_unpackhi_epi16(xmm1, xmm2); // 0VVVVV00000UUUUU
xmm3 = _mm_or_si128(xmm3, xmm7); // AVVVVV00000UUUUU
xmm4 = _mm_or_si128(xmm4, xmm7); // AVVVVV00000UUUUU
xmm5 = _mm_unpacklo_epi16(xmm0, xmm6); // 00000000000YYYYY
xmm2 = _mm_unpackhi_epi16(xmm0, xmm6); // 00000000000YYYYY
xmm5 = _mm_slli_epi32(xmm5, 10); // 000000YYYYY00000
xmm2 = _mm_slli_epi32(xmm2, 10); // 000000YYYYY00000
xmm3 = _mm_or_si128(xmm3, xmm5); // AVVVVVYYYYYUUUUU
xmm4 = _mm_or_si128(xmm4, xmm2); // AVVVVVYYYYYUUUUU
// Write data back
_mm_stream_si128(dst128++, xmm3);
_mm_stream_si128(dst128++, xmm4);
}
y += inStride;
u += inStride;
v += inStride;
}
return S_OK;
}
|