Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'extern/libopenjpeg/dwt.c')
-rw-r--r--extern/libopenjpeg/dwt.c83
1 files changed, 58 insertions, 25 deletions
diff --git a/extern/libopenjpeg/dwt.c b/extern/libopenjpeg/dwt.c
index 357b475b9ac..0fbfc2033fe 100644
--- a/extern/libopenjpeg/dwt.c
+++ b/extern/libopenjpeg/dwt.c
@@ -64,12 +64,12 @@ typedef struct v4dwt_local {
int cas ;
} v4dwt_t ;
-static const float dwt_alpha = 1.586134342f; // 12994
-static const float dwt_beta = 0.052980118f; // 434
-static const float dwt_gamma = -0.882911075f; // -7233
-static const float dwt_delta = -0.443506852f; // -3633
+static const float dwt_alpha = 1.586134342f; /* 12994 */
+static const float dwt_beta = 0.052980118f; /* 434 */
+static const float dwt_gamma = -0.882911075f; /* -7233 */
+static const float dwt_delta = -0.443506852f; /* -3633 */
-static const float K = 1.230174105f; // 10078
+static const float K = 1.230174105f; /* 10078 */
/* FIXME: What is this constant? */
static const float c13318 = 1.625732422f;
@@ -527,7 +527,7 @@ static void dwt_decode_tile(opj_tcd_tilecomp_t* tilec, int numres, DWT1DFN dwt_1
int w = tilec->x1 - tilec->x0;
- h.mem = opj_aligned_malloc(dwt_decode_max_resolution(tr, numres) * sizeof(int));
+ h.mem = (int*)opj_aligned_malloc(dwt_decode_max_resolution(tr, numres) * sizeof(int));
v.mem = h.mem;
while( --numres) {
@@ -570,6 +570,20 @@ static void v4dwt_interleave_h(v4dwt_t* restrict w, float* restrict a, int x, in
int count = w->sn;
int i, k;
for(k = 0; k < 2; ++k){
+ if (count + 3 * x < size && ((size_t) a & 0x0f) == 0 && ((size_t) bi & 0x0f) == 0 && (x & 0x0f) == 0) {
+ /* Fast code path */
+ for(i = 0; i < count; ++i){
+ int j = i;
+ bi[i*8 ] = a[j];
+ j += x;
+ bi[i*8 + 1] = a[j];
+ j += x;
+ bi[i*8 + 2] = a[j];
+ j += x;
+ bi[i*8 + 3] = a[j];
+ }
+ } else {
+ /* Slow code path */
for(i = 0; i < count; ++i){
int j = i;
bi[i*8 ] = a[j];
@@ -583,6 +597,7 @@ static void v4dwt_interleave_h(v4dwt_t* restrict w, float* restrict a, int x, in
if(j > size) continue;
bi[i*8 + 3] = a[j];
}
+ }
bi = (float*) (w->wavelet + 1 - w->cas);
a += w->sn;
size -= w->sn;
@@ -608,9 +623,21 @@ static void v4dwt_interleave_v(v4dwt_t* restrict v , float* restrict a , int x){
static void v4dwt_decode_step1_sse(v4* w, int count, const __m128 c){
__m128* restrict vw = (__m128*) w;
int i;
+ /* 4x unrolled loop */
+ for(i = 0; i < count >> 2; ++i){
+ *vw = _mm_mul_ps(*vw, c);
+ vw += 2;
+ *vw = _mm_mul_ps(*vw, c);
+ vw += 2;
+ *vw = _mm_mul_ps(*vw, c);
+ vw += 2;
+ *vw = _mm_mul_ps(*vw, c);
+ vw += 2;
+ }
+ count &= 3;
for(i = 0; i < count; ++i){
- __m128 tmp = vw[i*2];
- vw[i*2] = _mm_mul_ps(tmp, c);
+ *vw = _mm_mul_ps(*vw, c);
+ vw += 2;
}
}
@@ -618,14 +645,16 @@ static void v4dwt_decode_step2_sse(v4* l, v4* w, int k, int m, __m128 c){
__m128* restrict vl = (__m128*) l;
__m128* restrict vw = (__m128*) w;
int i;
+ __m128 tmp1, tmp2, tmp3;
+ tmp1 = vl[0];
for(i = 0; i < m; ++i){
- __m128 tmp1 = vl[ 0];
- __m128 tmp2 = vw[-1];
- __m128 tmp3 = vw[ 0];
+ tmp2 = vw[-1];
+ tmp3 = vw[ 0];
vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c));
- vl = vw;
+ tmp1 = tmp3;
vw += 2;
}
+ vl = vw - 2;
if(m >= k){
return;
}
@@ -773,19 +802,24 @@ void dwt_decode_real(opj_tcd_tilecomp_t* restrict tilec, int numres){
h.dn = rw - h.sn;
h.cas = res->x0 % 2;
- for(j = rh; j > 0; j -= 4){
+ for(j = rh; j > 3; j -= 4){
+ int k;
v4dwt_interleave_h(&h, aj, w, bufsize);
v4dwt_decode(&h);
- if(j >= 4){
- int k;
for(k = rw; --k >= 0;){
aj[k ] = h.wavelet[k].f[0];
aj[k+w ] = h.wavelet[k].f[1];
aj[k+w*2] = h.wavelet[k].f[2];
aj[k+w*3] = h.wavelet[k].f[3];
}
- }else{
+ aj += w*4;
+ bufsize -= w*4;
+ }
+ if (rh & 0x03) {
int k;
+ j = rh & 0x03;
+ v4dwt_interleave_h(&h, aj, w, bufsize);
+ v4dwt_decode(&h);
for(k = rw; --k >= 0;){
switch(j) {
case 3: aj[k+w*2] = h.wavelet[k].f[2];
@@ -794,30 +828,29 @@ void dwt_decode_real(opj_tcd_tilecomp_t* restrict tilec, int numres){
}
}
}
- aj += w*4;
- bufsize -= w*4;
- }
v.dn = rh - v.sn;
v.cas = res->y0 % 2;
aj = (float*) tilec->data;
- for(j = rw; j > 0; j -= 4){
+ for(j = rw; j > 3; j -= 4){
+ int k;
v4dwt_interleave_v(&v, aj, w);
v4dwt_decode(&v);
- if(j >= 4){
- int k;
for(k = 0; k < rh; ++k){
memcpy(&aj[k*w], &v.wavelet[k], 4 * sizeof(float));
}
- }else{
+ aj += 4;
+ }
+ if (rw & 0x03){
int k;
+ j = rw & 0x03;
+ v4dwt_interleave_v(&v, aj, w);
+ v4dwt_decode(&v);
for(k = 0; k < rh; ++k){
memcpy(&aj[k*w], &v.wavelet[k], j * sizeof(float));
}
}
- aj += 4;
- }
}
opj_aligned_free(h.wavelet);