diff

libav/libavcodec/arm/vp9mc_neon.S	ffmpeg/libavcodec/arm/vp9mc_neon.S
1 /* 2 * Copyright (c) 2016 Google Inc. 3 * 4 * This file is part of Libav. 5 * 6 * Libav is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * Libav is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with Libav; if not, write to the Free Software 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19 / 20 21 #include "libavutil/arm/asm.S" 22 23 @ All public functions in this file have the following signature: 24 +--259 lines: @ typedef void (vp9_mc_func)(uint8_t dst, ptrdiff_t dst_stride,---------------------------------------------------------------------------------------------------- 283 @ postincrement 284 .if \size >= 16 285 sub r3, r3, r5 286 sub r3, r3, #8 287 .endif 288 @ Load the filter vector 289 vld1.8 {d0}, [r12,:64] 290 vmovl.s8 q0, d0 291 1: 292 .if \size >= 16 293 mov r12, r5 294 .endif 295 @ Load src 296 .if \size >= 16 297 +--116 lines: vld1.8 {d18, d19, d20}, [r2]!------------------------------------------------------------------------------------------------------------------------------- 413 ldr r5, [sp, #68] 414 .else 415 ldr r4, [sp, #16] 416 ldr r5, [sp, #20] 417 .endif 418 movrelx r12, X(ff_vp9_subpel_filters), r6 419 add r12, r12, 120\offset - 8 420 cmp r5, #8 421 add r12, r12, r5, lsl #3 422 mov r5, #\size 423 .if \size >= 16 424 bge \type\()_8tap_16h_34 425 b \type\()_8tap_16h_43 426 .else 427 bge \type\()_8tap_\size\()h_34 428 +--120 lines: b 
\type\()_8tap_\size\()h_43--------------------------------------------------------------------------------------------------------------------------- 548 @ in r12. idx2 is the index of the largest filter coefficient (3 or 4) 549 @ and idx1 is the other one of them. 550 .macro do_8tap_8v type, idx1, idx2 551 function \type\()_8tap_8v_\idx1\idx2 552 sub r2, r2, r3, lsl #1 553 sub r2, r2, r3 554 vld1.8 {d0}, [r12, :64] 555 vmovl.s8 q0, d0 556 1: 557 mov r12, r4 558 559 loadl q5, q6, q7 560 loadl q8, q9, q10, q11 561 2: 562 +-- 57 lines: loadl q12, q13, q14, q15----------------------------------------------------------------------------------------------------------------------------------- 619 @ registers are rows 3 and 4. 620 @ This only is designed to work for 4 or 8 output lines. 621 .macro do_8tap_4v type, idx1, idx2 622 function \type\()_8tap_4v_\idx1\idx2 623 sub r2, r2, r3, lsl #1 624 sub r2, r2, r3 625 vld1.8 {d0}, [r12, :64] 626 vmovl.s8 q0, d0 627 628 vld1.32 {d2[]}, [r2], r3 629 vld1.32 {d3[]}, [r2], r3 630 vld1.32 {d4[]}, [r2], r3 631 vld1.32 {d5[]}, [r2], r3 632 vld1.32 {d6[]}, [r2], r3 633 +-- 57 lines: vld1.32 {d7[]}, [r2], r3----------------------------------------------------------------------------------------------------------------------------------- 690 function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1 691 push {r4-r5} 692 vpush {q4-q7} 693 ldr r4, [sp, #72] 694 movrelx r12, X(ff_vp9_subpel_filters), r5 695 ldr r5, [sp, #80] 696 add r12, r12, 120*\offset - 8 697 add r12, r12, r5, lsl #3 698 cmp r5, #8 699 mov r5, #\size 700 .if \size >= 8 701 bge \type\()_8tap_8v_34 702 b \type\()_8tap_8v_43 703 .else 704 +-- 20 lines: bge \type\()_8tap_4v_34----------------------------------------------------------------------------------------------------------------------------------	1 /* 2 * Copyright (c) 2016 Google Inc. 3 * 4 * This file is part of FFmpeg. 
5 * 6 * FFmpeg is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * FFmpeg is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with FFmpeg; if not, write to the Free Software 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19 / 20 21 #include "libavutil/arm/asm.S" 22 23 @ All public functions in this file have the following signature: 24 +--259 lines: @ typedef void (vp9_mc_func)(uint8_t dst, ptrdiff_t dst_stride,---------------------------------------------------------------------------------------------------- 283 @ postincrement 284 .if \size >= 16 285 sub r3, r3, r5 286 sub r3, r3, #8 287 .endif 288 @ Load the filter vector 289 vld1.16 {q0}, [r12,:128] ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 290 1: 291 .if \size >= 16 292 mov r12, r5 293 .endif 294 @ Load src 295 .if \size >= 16 296 +--116 lines: vld1.8 {d18, d19, d20}, [r2]!------------------------------------------------------------------------------------------------------------------------------- 412 ldr r5, [sp, #68] 413 .else 414 ldr r4, [sp, #16] 415 ldr r5, [sp, #20] 416 .endif 417 movrelx r12, X(ff_vp9_subpel_filters), r6 418 add r12, r12, 256\offset 419 cmp r5, #8 420 add r12, r12, r5, lsl #4 421 mov r5, #\size 422 .if \size >= 16 423 bge \type\()_8tap_16h_34 424 b \type\()_8tap_16h_43 425 .else 426 bge \type\()_8tap_\size\()h_34 427 +--120 lines: b 
\type\()_8tap_\size\()h_43--------------------------------------------------------------------------------------------------------------------------- 547 @ in r12. idx2 is the index of the largest filter coefficient (3 or 4) 548 @ and idx1 is the other one of them. 549 .macro do_8tap_8v type, idx1, idx2 550 function \type\()_8tap_8v_\idx1\idx2 551 sub r2, r2, r3, lsl #1 552 sub r2, r2, r3 553 vld1.16 {q0}, [r12, :128] ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 554 1: 555 mov r12, r4 556 557 loadl q5, q6, q7 558 loadl q8, q9, q10, q11 559 2: 560 +-- 57 lines: loadl q12, q13, q14, q15----------------------------------------------------------------------------------------------------------------------------------- 617 @ registers are rows 3 and 4. 618 @ This only is designed to work for 4 or 8 output lines. 619 .macro do_8tap_4v type, idx1, idx2 620 function \type\()_8tap_4v_\idx1\idx2 621 sub r2, r2, r3, lsl #1 622 sub r2, r2, r3 623 vld1.16 {q0}, [r12, :128] ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 624 625 vld1.32 {d2[]}, [r2], r3 626 vld1.32 {d3[]}, [r2], r3 627 vld1.32 {d4[]}, [r2], r3 628 vld1.32 {d5[]}, [r2], r3 629 vld1.32 {d6[]}, [r2], r3 630 +-- 57 lines: vld1.32 {d7[]}, [r2], r3----------------------------------------------------------------------------------------------------------------------------------- 687 function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1 688 push {r4-r5} 689 vpush {q4-q7} 690 ldr r4, [sp, #72] 691 movrelx r12, X(ff_vp9_subpel_filters), r5 692 ldr r5, [sp, #80] 693 add r12, r12, 256*\offset 694 add r12, r12, r5, lsl #4 695 cmp r5, #8 696 mov r5, #\size 697 .if \size >= 8 698 bge \type\()_8tap_8v_34 699 b \type\()_8tap_8v_43 700 .else 701 +-- 20 lines: bge 
\type\()_8tap_4v_34----------------------------------------------------------------------------------------------------------------------------------

  1 /*
  2  * Copyright (c) 2016 Google Inc.
  3  *
  4  * This file is part of FFmpeg.                                                                                                                                                
  5  *
  6  * FFmpeg is free software; you can redistribute it and/or                                                                                                                     
  7  * modify it under the terms of the GNU Lesser General Public
  8  * License as published by the Free Software Foundation; either
  9  * version 2.1 of the License, or (at your option) any later version.
 10  *
 11  * FFmpeg is distributed in the hope that it will be useful,                                                                                                                   
 12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 14  * Lesser General Public License for more details.
 15  *
 16  * You should have received a copy of the GNU Lesser General Public
 17  * License along with FFmpeg; if not, write to the Free Software                                                                                                               
 18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 19  */
 20 
 21 #include "libavutil/arm/asm.S"
 22 
 23 @ All public functions in this file have the following signature:
 24 +--259 lines: @ typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,----------------------------------------------------------------------------------------------------
283         @ postincrement
284 .if \size >= 16
285         sub             r3,  r3,  r5
286         sub             r3,  r3,  #8
287 .endif
288         @ Load the filter vector
289         vld1.16         {q0},  [r12,:128]                                                                                                                                      
    -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
290 1:
291 .if \size >= 16
292         mov             r12, r5
293 .endif
294         @ Load src
295 .if \size >= 16
296 +--116 lines: vld1.8          {d18, d19, d20}, [r2]!-------------------------------------------------------------------------------------------------------------------------------
412         ldr             r5,  [sp, #68]
413 .else
414         ldr             r4,  [sp, #16]
415         ldr             r5,  [sp, #20]
416 .endif
417         movrelx         r12, X(ff_vp9_subpel_filters), r6
418         add             r12, r12, 256*\offset                                                                                                                                  
419         cmp             r5,  #8
420         add             r12, r12, r5, lsl #4                                                                                                                                   
421         mov             r5,  #\size
422 .if \size >= 16
423         bge             \type\()_8tap_16h_34
424         b               \type\()_8tap_16h_43
425 .else
426         bge             \type\()_8tap_\size\()h_34
427 +--120 lines: b               \type\()_8tap_\size\()h_43---------------------------------------------------------------------------------------------------------------------------
547 @ in r12. idx2 is the index of the largest filter coefficient (3 or 4)
548 @ and idx1 is the other one of them.
549 .macro do_8tap_8v type, idx1, idx2
550 function \type\()_8tap_8v_\idx1\idx2
551         sub             r2,  r2,  r3, lsl #1
552         sub             r2,  r2,  r3
553         vld1.16         {q0},  [r12, :128]                                                                                                                                     
    -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
554 1:
555         mov             r12, r4
556 
557         loadl           q5,  q6,  q7
558         loadl           q8,  q9,  q10, q11
559 2:
560 +-- 57 lines: loadl           q12, q13, q14, q15-----------------------------------------------------------------------------------------------------------------------------------
617 @ registers are rows 3 and 4.
618 @ This is only designed to work for 4 or 8 output lines.
619 .macro do_8tap_4v type, idx1, idx2
620 function \type\()_8tap_4v_\idx1\idx2
621         sub             r2,  r2,  r3, lsl #1
622         sub             r2,  r2,  r3
623         vld1.16         {q0},  [r12, :128]                                                                                                                                     
    -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
624 
625         vld1.32         {d2[]},   [r2], r3
626         vld1.32         {d3[]},   [r2], r3
627         vld1.32         {d4[]},   [r2], r3
628         vld1.32         {d5[]},   [r2], r3
629         vld1.32         {d6[]},   [r2], r3
630 +-- 57 lines: vld1.32         {d7[]},   [r2], r3-----------------------------------------------------------------------------------------------------------------------------------
687 function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
688         push            {r4-r5}
689         vpush           {q4-q7}
690         ldr             r4,  [sp, #72]
691         movrelx         r12, X(ff_vp9_subpel_filters), r5
692         ldr             r5,  [sp, #80]
693         add             r12, r12, 256*\offset                                                                                                                                  
694         add             r12, r12, r5, lsl #4                                                                                                                                   
695         cmp             r5,  #8
696         mov             r5,  #\size
697 .if \size >= 8
698         bge             \type\()_8tap_8v_34
699         b               \type\()_8tap_8v_43
700 .else
701 +-- 20 lines: bge             \type\()_8tap_4v_34----------------------------------------------------------------------------------------------------------------------------------