Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/FFmpeg/FFmpeg.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLauri Kasanen <cand@gmx.com>2018-11-17 11:12:14 +0300
committerMichael Niedermayer <michael@niedermayer.cc>2018-11-26 04:56:25 +0300
commit46c5693ea3a9364e24e2f5336bcdb5b191a2329f (patch)
treee1b3f250598d8e62048e8ecdc8fa75f591fc2562 /libswscale/ppc
parentcc25529420e3c559fcb4c7a993f112983da7d397 (diff)
swscale/output: Altivec-optimize yuv2plane1_8
./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt yuv420p \
    -f null -vframes 100 -v error -nostats -

1158 UNITS in planar1, 65528 runs, 8 skips

-cpuflags 0
19082 UNITS in planar1, 65533 runs, 3 skips

16.48 speedup ratio. On x86, SSE2 is ~7.

Curiously, the Power C version takes as many cycles as the x86 SSE2 version; yikes, it's fast.

Note that this function uses VSX instructions but is not marked as such. This is because several existing functions also make that mistake. I'll submit a patch moving them once this is reviewed.

Signed-off-by: Lauri Kasanen <cand@gmx.com>
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
Diffstat (limited to 'libswscale/ppc')
-rw-r--r--libswscale/ppc/swscale_altivec.c53
1 files changed, 53 insertions, 0 deletions
diff --git a/libswscale/ppc/swscale_altivec.c b/libswscale/ppc/swscale_altivec.c
index 2fb2337769..8c6056d841 100644
--- a/libswscale/ppc/swscale_altivec.c
+++ b/libswscale/ppc/swscale_altivec.c
@@ -324,6 +324,53 @@ static void hScale_altivec_real(SwsContext *c, int16_t *dst, int dstW,
}
}
}
+
+/**
+ * Scalar 16-bit to 8-bit plane output for the sample range [start, dstW).
+ *
+ * For every index in the range, the dither entry selected by
+ * (index + offset) & 7 is added to the source sample, the sum is shifted
+ * right by 7 and the result is stored after passing through av_clip_uint8().
+ *
+ * @param src    16-bit input samples, indexed like dest
+ * @param dest   8-bit output samples
+ * @param dstW   index one past the last sample to write
+ * @param dither dither table; only entries 0..7 are read
+ * @param offset phase added to the sample index when selecting a dither entry
+ * @param start  index of the first sample to write
+ */
+static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW,
+                           const uint8_t *dither, int offset, int start)
+{
+    int x = start;
+
+    while (x < dstW) {
+        const int bias    = dither[(x + offset) & 7];
+        const int rounded = (src[x] + bias) >> 7;
+
+        dest[x] = av_clip_uint8(rounded);
+        x++;
+    }
+}
+
+/**
+ * AltiVec/VSX version of the 8-bit yuv2plane1 output stage.
+ *
+ * Produces the same per-sample result as yuv2plane1_8_u(): source sample plus
+ * dither[(i + offset) & 7], shifted right by 7 and narrowed to 8 bits. The row
+ * is handled in three parts:
+ *   1. a scalar head up to the first 16-byte aligned address of dest,
+ *   2. a vector body writing 16 samples per iteration with aligned stores,
+ *   3. a scalar tail for whatever does not fill a whole vector.
+ *
+ * @param src    16-bit input samples (loaded with unaligned VSX loads)
+ * @param dest   8-bit output samples; any alignment is accepted
+ * @param dstW   number of samples in the row
+ * @param dither dither table; only entries 0..7 are read
+ * @param offset phase added to the sample index when selecting a dither entry
+ */
+static void yuv2plane1_8_altivec(const int16_t *src, uint8_t *dest, int dstW,
+                                 const uint8_t *dither, int offset)
+{
+    /* Bytes from dest up to the next 16-byte boundary (0 when aligned). */
+    const int dst_u = -(uintptr_t)dest & 15;
+    /* Scalar head length, capped at dstW so that a row shorter than dst_u
+     * is never read or written past its end. */
+    const int head  = dst_u < dstW ? dst_u : dstW;
+    int i, j;
+    LOCAL_ALIGNED(16, int16_t, val, [16]);
+    const vector uint16_t shifts = (vector uint16_t) {7, 7, 7, 7, 7, 7, 7, 7};
+    vector int16_t vi, vileft, ditherleft, ditherright;
+    vector uint8_t vd;
+
+    /* Widen the dither pattern to 16 bits, phased for sample index dst_u
+     * (the first vectorised sample). The body advances by 16 samples, a
+     * multiple of the 8-entry pattern period, so the same two vectors are
+     * valid for every iteration. */
+    for (j = 0; j < 16; j++) {
+        val[j] = dither[(dst_u + offset + j) & 7];
+    }
+
+    ditherleft  = vec_ld(0, val);
+    ditherright = vec_ld(0, &val[8]);
+
+    /* Head: scalar samples in front of the aligned region. */
+    yuv2plane1_8_u(src, dest, head, dither, offset, 0);
+
+    /* Body: only entered when head == dst_u, i.e. &dest[i] is 16-byte
+     * aligned as the vec_st() store expects. */
+    for (i = head; i < dstW - 15; i += 16) {
+        /* Low eight samples: saturating add of dither, arithmetic shift. */
+        vi     = vec_vsx_ld(0, &src[i]);
+        vi     = vec_adds(ditherleft, vi);
+        vileft = vec_sra(vi, shifts);
+
+        /* High eight samples. */
+        vi = vec_vsx_ld(0, &src[i + 8]);
+        vi = vec_adds(ditherright, vi);
+        vi = vec_sra(vi, shifts);
+
+        /* Pack both halves to 16 unsigned bytes with saturation and store. */
+        vd = vec_packsu(vileft, vi);
+        vec_st(vd, 0, &dest[i]);
+    }
+
+    /* Tail: remaining samples that do not fill a whole vector. */
+    yuv2plane1_8_u(src, dest, dstW, dither, offset, i);
+}
+
#endif /* HAVE_ALTIVEC */
av_cold void ff_sws_init_swscale_ppc(SwsContext *c)
@@ -367,6 +414,12 @@ av_cold void ff_sws_init_swscale_ppc(SwsContext *c)
c->yuv2packedX = ff_yuv2rgb24_X_altivec;
break;
}
+
+ switch (c->dstBpc) {
+ case 8:
+ c->yuv2plane1 = yuv2plane1_8_altivec;
+ break;
+ }
}
#endif /* HAVE_ALTIVEC */
}