libav/libavcodec/arm/vp9mc_neon.S ffmpeg/libavcodec/arm/vp9mc_neon.S
  1 /*
  2  * Copyright (c) 2016 Google Inc.
  3  *
  4  * This file is part of Libav.                                                                                                                                                 
  5  *
  6  * Libav is free software; you can redistribute it and/or                                                                                                                      
  7  * modify it under the terms of the GNU Lesser General Public
  8  * License as published by the Free Software Foundation; either
  9  * version 2.1 of the License, or (at your option) any later version.
 10  *
 11  * Libav is distributed in the hope that it will be useful,                                                                                                                    
 12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 14  * Lesser General Public License for more details.
 15  *
 16  * You should have received a copy of the GNU Lesser General Public
 17  * License along with Libav; if not, write to the Free Software                                                                                                                
 18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 19  */
 20 
 21 #include "libavutil/arm/asm.S"
 22 
 23 @ All public functions in this file have the following signature:
 24 +--259 lines: @ typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,----------------------------------------------------------------------------------------------------
283         @ postincrement
284 .if \size >= 16
285         sub             r3,  r3,  r5
286         sub             r3,  r3,  #8
287 .endif
288         @ Load the filter vector
289         vld1.8          {d0},  [r12,:64]                                                                                                                                       
290         vmovl.s8        q0,  d0                                                                                                                                                
291 1:
292 .if \size >= 16
293         mov             r12, r5
294 .endif
295         @ Load src
296 .if \size >= 16
297 +--116 lines: vld1.8          {d18, d19, d20}, [r2]!-------------------------------------------------------------------------------------------------------------------------------
413         ldr             r5,  [sp, #68]
414 .else
415         ldr             r4,  [sp, #16]
416         ldr             r5,  [sp, #20]
417 .endif
418         movrelx         r12, X(ff_vp9_subpel_filters), r6
419         add             r12, r12, 120*\offset - 8                                                                                                                              
420         cmp             r5,  #8
421         add             r12, r12, r5, lsl #3                                                                                                                                   
422         mov             r5,  #\size
423 .if \size >= 16
424         bge             \type\()_8tap_16h_34
425         b               \type\()_8tap_16h_43
426 .else
427         bge             \type\()_8tap_\size\()h_34
428 +--120 lines: b               \type\()_8tap_\size\()h_43---------------------------------------------------------------------------------------------------------------------------
548 @ in r12. idx2 is the index of the largest filter coefficient (3 or 4)
549 @ and idx1 is the other one of them.
550 .macro do_8tap_8v type, idx1, idx2
551 function \type\()_8tap_8v_\idx1\idx2
552         sub             r2,  r2,  r3, lsl #1
553         sub             r2,  r2,  r3
554         vld1.8          {d0},  [r12, :64]                                                                                                                                      
555         vmovl.s8        q0,  d0                                                                                                                                                
556 1:
557         mov             r12, r4
558 
559         loadl           q5,  q6,  q7
560         loadl           q8,  q9,  q10, q11
561 2:
562 +-- 57 lines: loadl           q12, q13, q14, q15-----------------------------------------------------------------------------------------------------------------------------------
619 @ registers are rows 3 and 4.
 620 @ This is only designed to work for 4 or 8 output lines.
621 .macro do_8tap_4v type, idx1, idx2
622 function \type\()_8tap_4v_\idx1\idx2
623         sub             r2,  r2,  r3, lsl #1
624         sub             r2,  r2,  r3
625         vld1.8          {d0},  [r12, :64]                                                                                                                                      
626         vmovl.s8        q0,  d0                                                                                                                                                
627 
628         vld1.32         {d2[]},   [r2], r3
629         vld1.32         {d3[]},   [r2], r3
630         vld1.32         {d4[]},   [r2], r3
631         vld1.32         {d5[]},   [r2], r3
632         vld1.32         {d6[]},   [r2], r3
633 +-- 57 lines: vld1.32         {d7[]},   [r2], r3-----------------------------------------------------------------------------------------------------------------------------------
690 function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
691         push            {r4-r5}
692         vpush           {q4-q7}
693         ldr             r4,  [sp, #72]
694         movrelx         r12, X(ff_vp9_subpel_filters), r5
695         ldr             r5,  [sp, #80]
696         add             r12, r12, 120*\offset - 8                                                                                                                              
697         add             r12, r12, r5, lsl #3                                                                                                                                   
698         cmp             r5,  #8
699         mov             r5,  #\size
700 .if \size >= 8
701         bge             \type\()_8tap_8v_34
702         b               \type\()_8tap_8v_43
703 .else
704 +-- 20 lines: bge             \type\()_8tap_4v_34----------------------------------------------------------------------------------------------------------------------------------
  1 /*
  2  * Copyright (c) 2016 Google Inc.
  3  *
  4  * This file is part of FFmpeg.                                                                                                                                                
  5  *
  6  * FFmpeg is free software; you can redistribute it and/or                                                                                                                     
  7  * modify it under the terms of the GNU Lesser General Public
  8  * License as published by the Free Software Foundation; either
  9  * version 2.1 of the License, or (at your option) any later version.
 10  *
 11  * FFmpeg is distributed in the hope that it will be useful,                                                                                                                   
 12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 14  * Lesser General Public License for more details.
 15  *
 16  * You should have received a copy of the GNU Lesser General Public
 17  * License along with FFmpeg; if not, write to the Free Software                                                                                                               
 18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 19  */
 20 
 21 #include "libavutil/arm/asm.S"
 22 
 23 @ All public functions in this file have the following signature:
 24 +--259 lines: @ typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,----------------------------------------------------------------------------------------------------
283         @ postincrement
284 .if \size >= 16
285         sub             r3,  r3,  r5
286         sub             r3,  r3,  #8
287 .endif
288         @ Load the filter vector
289         vld1.16         {q0},  [r12,:128]                                                                                                                                      
    -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
290 1:
291 .if \size >= 16
292         mov             r12, r5
293 .endif
294         @ Load src
295 .if \size >= 16
296 +--116 lines: vld1.8          {d18, d19, d20}, [r2]!-------------------------------------------------------------------------------------------------------------------------------
412         ldr             r5,  [sp, #68]
413 .else
414         ldr             r4,  [sp, #16]
415         ldr             r5,  [sp, #20]
416 .endif
417         movrelx         r12, X(ff_vp9_subpel_filters), r6
418         add             r12, r12, 256*\offset                                                                                                                                  
419         cmp             r5,  #8
420         add             r12, r12, r5, lsl #4                                                                                                                                   
421         mov             r5,  #\size
422 .if \size >= 16
423         bge             \type\()_8tap_16h_34
424         b               \type\()_8tap_16h_43
425 .else
426         bge             \type\()_8tap_\size\()h_34
427 +--120 lines: b               \type\()_8tap_\size\()h_43---------------------------------------------------------------------------------------------------------------------------
547 @ in r12. idx2 is the index of the largest filter coefficient (3 or 4)
548 @ and idx1 is the other one of them.
549 .macro do_8tap_8v type, idx1, idx2
550 function \type\()_8tap_8v_\idx1\idx2
551         sub             r2,  r2,  r3, lsl #1
552         sub             r2,  r2,  r3
553         vld1.16         {q0},  [r12, :128]                                                                                                                                     
    -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
554 1:
555         mov             r12, r4
556 
557         loadl           q5,  q6,  q7
558         loadl           q8,  q9,  q10, q11
559 2:
560 +-- 57 lines: loadl           q12, q13, q14, q15-----------------------------------------------------------------------------------------------------------------------------------
617 @ registers are rows 3 and 4.
 618 @ This is only designed to work for 4 or 8 output lines.
619 .macro do_8tap_4v type, idx1, idx2
620 function \type\()_8tap_4v_\idx1\idx2
621         sub             r2,  r2,  r3, lsl #1
622         sub             r2,  r2,  r3
623         vld1.16         {q0},  [r12, :128]                                                                                                                                     
    -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
624 
625         vld1.32         {d2[]},   [r2], r3
626         vld1.32         {d3[]},   [r2], r3
627         vld1.32         {d4[]},   [r2], r3
628         vld1.32         {d5[]},   [r2], r3
629         vld1.32         {d6[]},   [r2], r3
630 +-- 57 lines: vld1.32         {d7[]},   [r2], r3-----------------------------------------------------------------------------------------------------------------------------------
687 function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
688         push            {r4-r5}
689         vpush           {q4-q7}
690         ldr             r4,  [sp, #72]
691         movrelx         r12, X(ff_vp9_subpel_filters), r5
692         ldr             r5,  [sp, #80]
693         add             r12, r12, 256*\offset                                                                                                                                  
694         add             r12, r12, r5, lsl #4                                                                                                                                   
695         cmp             r5,  #8
696         mov             r5,  #\size
697 .if \size >= 8
698         bge             \type\()_8tap_8v_34
699         b               \type\()_8tap_8v_43
700 .else
701 +-- 20 lines: bge             \type\()_8tap_4v_34----------------------------------------------------------------------------------------------------------------------------------