From a10c435ec7fbba8873f3547313e4cf571254a6cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20B=C5=93sch?= Date: Tue, 14 Jan 2014 08:09:48 +0100 Subject: [PATCH] vp9/x86: add AVX for itxfm and lpf. 4412 decicycles in ff_vp9_loop_filter_h_16_16_ssse3, 4193462 runs, 842 skips 3600 decicycles in ff_vp9_loop_filter_h_16_16_avx, 4193621 runs, 683 skips 3010 decicycles in ff_vp9_loop_filter_v_16_16_ssse3, 4193528 runs, 776 skips 2678 decicycles in ff_vp9_loop_filter_v_16_16_avx, 4193742 runs, 562 skips 23025 decicycles in ff_vp9_idct_idct_32x32_add_ssse3, 2096871 runs, 281 skips 19943 decicycles in ff_vp9_idct_idct_32x32_add_avx, 2096815 runs, 337 skips 4675 decicycles in ff_vp9_idct_idct_16x16_add_ssse3, 4194018 runs, 286 skips 3980 decicycles in ff_vp9_idct_idct_16x16_add_avx, 4194022 runs, 282 skips 967 decicycles in ff_vp9_idct_idct_8x8_add_ssse3, 16776972 runs, 244 skips 887 decicycles in ff_vp9_idct_idct_8x8_add_avx, 16777002 runs, 214 skips --- libavcodec/x86/vp9dsp_init.c | 18 ++++++++++++++++++ libavcodec/x86/vp9itxfm.asm | 21 ++++++++++++++++++--- libavcodec/x86/vp9lpf.asm | 7 ++++++- 3 files changed, 42 insertions(+), 4 deletions(-) diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c index c3ef73d..3651641 100644 --- a/libavcodec/x86/vp9dsp_init.c +++ b/libavcodec/x86/vp9dsp_init.c @@ -159,11 +159,16 @@ filters_8tap_1d_fn3(avg) void ff_vp9_idct_idct_4x4_add_ssse3(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); void ff_vp9_idct_idct_8x8_add_ssse3(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); +void ff_vp9_idct_idct_8x8_add_avx (uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); void ff_vp9_idct_idct_16x16_add_ssse3(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); +void ff_vp9_idct_idct_16x16_add_avx (uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); void ff_vp9_idct_idct_32x32_add_ssse3(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); +void ff_vp9_idct_idct_32x32_add_avx (uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); void ff_vp9_loop_filter_v_16_16_ssse3(uint8_t *dst, ptrdiff_t stride, int E, int I, int H); +void ff_vp9_loop_filter_v_16_16_avx (uint8_t *dst, ptrdiff_t stride, int E, int I, int H); void ff_vp9_loop_filter_h_16_16_ssse3(uint8_t *dst, ptrdiff_t stride, int E, int I, int H); +void ff_vp9_loop_filter_h_16_16_avx (uint8_t *dst, ptrdiff_t stride, int E, int I, int H); #endif /* HAVE_YASM */ @@ -231,6 +236,19 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp) } } + if (EXTERNAL_AVX(cpu_flags)) { + if (ARCH_X86_64) { + dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_avx; + dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx; + dsp->itxfm_add[TX_32X32][ADST_ADST] = + dsp->itxfm_add[TX_32X32][ADST_DCT] = + dsp->itxfm_add[TX_32X32][DCT_ADST] = + dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx; + dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_avx; + dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_avx; + } + } + #undef init_fpel #undef init_subpel1 #undef init_subpel2 diff --git a/libavcodec/x86/vp9itxfm.asm b/libavcodec/x86/vp9itxfm.asm index fe04b81..33c0bc7 100644 --- a/libavcodec/x86/vp9itxfm.asm +++ b/libavcodec/x86/vp9itxfm.asm @@ -289,7 +289,8 @@ cglobal vp9_idct_idct_4x4_add, 4,4,0, dst, stride, block, eob VP9_STORE_2X 10, 11, 6, 7, 4 %endmacro -INIT_XMM ssse3 +%macro VP9_IDCT_IDCT_8x8_ADD_XMM 1 +INIT_XMM %1 cglobal vp9_idct_idct_8x8_add, 4,4,13, dst, stride, block, eob mova m12, [pw_11585x2] ; often used @@ -376,6 +377,10 @@ cglobal vp9_idct_idct_8x8_add, 4,4,13, dst, stride, block, eob mova [blockq+112], m4 VP9_IDCT8_WRITEOUT RET +%endmacro + +VP9_IDCT_IDCT_8x8_ADD_XMM ssse3 +VP9_IDCT_IDCT_8x8_ADD_XMM avx ;--------------------------------------------------------------------------------------------- ; void vp9_idct_idct_16x16_add_(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); @@ -655,7 +660,8 @@ cglobal vp9_idct_idct_8x8_add, 4,4,13, dst, stride, block, eob mova [dstq+%7], m%4 %endmacro -INIT_XMM ssse3 +%macro VP9_IDCT_IDCT_16x16_ADD_XMM 1 +INIT_XMM %1 cglobal vp9_idct_idct_16x16_add, 4, 5, 16, 512, dst, stride, block, eob ; 2x2=eob=3, 4x4=eob=10 cmp eobd, 38 @@ -724,6 +730,10 @@ cglobal vp9_idct_idct_16x16_add, 4, 5, 16, 512, dst, stride, block, eob ; use that to zero out block coefficients ZERO_BLOCK blockq, 32, 16, m0 RET +%endmacro + +VP9_IDCT_IDCT_16x16_ADD_XMM ssse3 +VP9_IDCT_IDCT_16x16_ADD_XMM avx ;--------------------------------------------------------------------------------------------- ; void vp9_idct_idct_32x32_add_(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); @@ -1102,7 +1112,8 @@ cglobal vp9_idct_idct_16x16_add, 4, 5, 16, 512, dst, stride, block, eob %endif %endmacro -INIT_XMM ssse3 +%macro VP9_IDCT_IDCT_32x32_ADD_XMM 1 +INIT_XMM %1 cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, dst, stride, block, eob cmp eobd, 135 jg .idctfull @@ -1213,5 +1224,9 @@ cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, dst, stride, block, eob ; use that to zero out block coefficients ZERO_BLOCK blockq, 64, 32, m7 RET +%endmacro + +VP9_IDCT_IDCT_32x32_ADD_XMM ssse3 +VP9_IDCT_IDCT_32x32_ADD_XMM avx %endif ; x86-64 diff --git a/libavcodec/x86/vp9lpf.asm b/libavcodec/x86/vp9lpf.asm index e2dc8d9..c5e5df9 100644 --- a/libavcodec/x86/vp9lpf.asm +++ b/libavcodec/x86/vp9lpf.asm @@ -655,12 +655,17 @@ SECTION .text %endif %endmacro -INIT_XMM ssse3 +%macro LPF_16_16_VH 1 +INIT_XMM %1 cglobal vp9_loop_filter_v_16_16, 5,8,16, dst, stride, E, I, H, mstride, dst1, dst2 LPF_16_16 v RET cglobal vp9_loop_filter_h_16_16, 5,8,16, 256, dst, stride, E, I, H, mstride, dst1, dst2 LPF_16_16 h RET +%endmacro + +LPF_16_16_VH ssse3 +LPF_16_16_VH avx %endif ; x86-64 -- 1.8.5.2