diff options
Diffstat (limited to 'extern/libopenjpeg/dwt.c')
-rw-r--r-- | extern/libopenjpeg/dwt.c | 83 |
1 files changed, 58 insertions, 25 deletions
diff --git a/extern/libopenjpeg/dwt.c b/extern/libopenjpeg/dwt.c index 357b475b9ac..0fbfc2033fe 100644 --- a/extern/libopenjpeg/dwt.c +++ b/extern/libopenjpeg/dwt.c @@ -64,12 +64,12 @@ typedef struct v4dwt_local { int cas ; } v4dwt_t ; -static const float dwt_alpha = 1.586134342f; // 12994 -static const float dwt_beta = 0.052980118f; // 434 -static const float dwt_gamma = -0.882911075f; // -7233 -static const float dwt_delta = -0.443506852f; // -3633 +static const float dwt_alpha = 1.586134342f; /* 12994 */ +static const float dwt_beta = 0.052980118f; /* 434 */ +static const float dwt_gamma = -0.882911075f; /* -7233 */ +static const float dwt_delta = -0.443506852f; /* -3633 */ -static const float K = 1.230174105f; // 10078 +static const float K = 1.230174105f; /* 10078 */ /* FIXME: What is this constant? */ static const float c13318 = 1.625732422f; @@ -527,7 +527,7 @@ static void dwt_decode_tile(opj_tcd_tilecomp_t* tilec, int numres, DWT1DFN dwt_1 int w = tilec->x1 - tilec->x0; - h.mem = opj_aligned_malloc(dwt_decode_max_resolution(tr, numres) * sizeof(int)); + h.mem = (int*)opj_aligned_malloc(dwt_decode_max_resolution(tr, numres) * sizeof(int)); v.mem = h.mem; while( --numres) { @@ -570,6 +570,20 @@ static void v4dwt_interleave_h(v4dwt_t* restrict w, float* restrict a, int x, in int count = w->sn; int i, k; for(k = 0; k < 2; ++k){ + if (count + 3 * x < size && ((size_t) a & 0x0f) == 0 && ((size_t) bi & 0x0f) == 0 && (x & 0x0f) == 0) { + /* Fast code path */ + for(i = 0; i < count; ++i){ + int j = i; + bi[i*8 ] = a[j]; + j += x; + bi[i*8 + 1] = a[j]; + j += x; + bi[i*8 + 2] = a[j]; + j += x; + bi[i*8 + 3] = a[j]; + } + } else { + /* Slow code path */ for(i = 0; i < count; ++i){ int j = i; bi[i*8 ] = a[j]; @@ -583,6 +597,7 @@ static void v4dwt_interleave_h(v4dwt_t* restrict w, float* restrict a, int x, in if(j > size) continue; bi[i*8 + 3] = a[j]; } + } bi = (float*) (w->wavelet + 1 - w->cas); a += w->sn; size -= w->sn; @@ -608,9 +623,21 @@ static void v4dwt_interleave_v(v4dwt_t* restrict v , float* restrict a , int x){ static void v4dwt_decode_step1_sse(v4* w, int count, const __m128 c){ __m128* restrict vw = (__m128*) w; int i; + /* 4x unrolled loop */ + for(i = 0; i < count >> 2; ++i){ + *vw = _mm_mul_ps(*vw, c); + vw += 2; + *vw = _mm_mul_ps(*vw, c); + vw += 2; + *vw = _mm_mul_ps(*vw, c); + vw += 2; + *vw = _mm_mul_ps(*vw, c); + vw += 2; + } + count &= 3; for(i = 0; i < count; ++i){ - __m128 tmp = vw[i*2]; - vw[i*2] = _mm_mul_ps(tmp, c); + *vw = _mm_mul_ps(*vw, c); + vw += 2; } } @@ -618,14 +645,16 @@ static void v4dwt_decode_step2_sse(v4* l, v4* w, int k, int m, __m128 c){ __m128* restrict vl = (__m128*) l; __m128* restrict vw = (__m128*) w; int i; + __m128 tmp1, tmp2, tmp3; + tmp1 = vl[0]; for(i = 0; i < m; ++i){ - __m128 tmp1 = vl[ 0]; - __m128 tmp2 = vw[-1]; - __m128 tmp3 = vw[ 0]; + tmp2 = vw[-1]; + tmp3 = vw[ 0]; vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c)); - vl = vw; + tmp1 = tmp3; vw += 2; } + vl = vw - 2; if(m >= k){ return; } @@ -773,19 +802,24 @@ void dwt_decode_real(opj_tcd_tilecomp_t* restrict tilec, int numres){ h.dn = rw - h.sn; h.cas = res->x0 % 2; - for(j = rh; j > 0; j -= 4){ + for(j = rh; j > 3; j -= 4){ + int k; v4dwt_interleave_h(&h, aj, w, bufsize); v4dwt_decode(&h); - if(j >= 4){ - int k; for(k = rw; --k >= 0;){ aj[k ] = h.wavelet[k].f[0]; aj[k+w ] = h.wavelet[k].f[1]; aj[k+w*2] = h.wavelet[k].f[2]; aj[k+w*3] = h.wavelet[k].f[3]; } - }else{ + aj += w*4; + bufsize -= w*4; + } + if (rh & 0x03) { int k; + j = rh & 0x03; + v4dwt_interleave_h(&h, aj, w, bufsize); + v4dwt_decode(&h); for(k = rw; --k >= 0;){ switch(j) { case 3: aj[k+w*2] = h.wavelet[k].f[2]; @@ -794,30 +828,29 @@ void dwt_decode_real(opj_tcd_tilecomp_t* restrict tilec, int numres){ } } } - aj += w*4; - bufsize -= w*4; - } v.dn = rh - v.sn; v.cas = res->y0 % 2; aj = (float*) tilec->data; - for(j = rw; j > 0; j -= 4){ + for(j = rw; j > 3; j -= 4){ + int k; v4dwt_interleave_v(&v, aj, w); v4dwt_decode(&v); - if(j >= 4){ - int k; for(k = 0; k < rh; ++k){ memcpy(&aj[k*w], &v.wavelet[k], 4 * sizeof(float)); } - }else{ + aj += 4; + } + if (rw & 0x03){ int k; + j = rw & 0x03; + v4dwt_interleave_v(&v, aj, w); + v4dwt_decode(&v); for(k = 0; k < rh; ++k){ memcpy(&aj[k*w], &v.wavelet[k], j * sizeof(float)); } } - aj += 4; - } } opj_aligned_free(h.wavelet); |