aarch64: h264 qpel NEON optimizations

Ported from ARMv7 NEON.
pull/53/head
Janne Grunau 11 years ago
parent 8438b3f09f
commit d5dd8c7bf0
  1. libavcodec/aarch64/Makefile (2 lines changed)
  2. libavcodec/aarch64/h264qpel_init_aarch64.c (172 lines changed)
  3. libavcodec/aarch64/h264qpel_neon.S (934 lines changed)
  4. libavcodec/aarch64/neon.S (64 lines changed)
  5. libavcodec/h264qpel.c (2 lines changed)
  6. libavcodec/h264qpel.h (1 line changed)
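As background for reading the assembly: the routines added here implement the standard H.264 six-tap lowpass filter (1, -5, 20, 20, -5, 1) for half-pel samples, with averaging for the remaining quarter-pel positions. A minimal C sketch of one horizontal filter row, with an illustrative function name that is not part of the commit, for comparison with the NEON code:

#include <stdint.h>

/* Reference only: one horizontal 6-tap lowpass row of `width` pixels.
 * The rounding matches sqrshrun #5 in the NEON code: (sum + 16) >> 5,
 * saturated to the 0..255 byte range. */
static void h264_lowpass_h_row(uint8_t *dst, const uint8_t *src, int width)
{
    for (int i = 0; i < width; i++) {
        int sum = src[i - 2] - 5 * src[i - 1] + 20 * src[i]
                + 20 * src[i + 1] - 5 * src[i + 2] + src[i + 3];
        sum = (sum + 16) >> 5;
        dst[i] = sum < 0 ? 0 : sum > 255 ? 255 : sum;
    }
}

The mcXY entry points below apply this filter horizontally, vertically (via transposes), or both, and average the result with the nearest full-pel or half-pel sample for the in-between quarter-pel positions.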

libavcodec/aarch64/Makefile
@@ -1,7 +1,9 @@
 OBJS-$(CONFIG_H264CHROMA)               += aarch64/h264chroma_init_aarch64.o
 OBJS-$(CONFIG_H264DSP)                  += aarch64/h264dsp_init_aarch64.o
+OBJS-$(CONFIG_H264QPEL)                 += aarch64/h264qpel_init_aarch64.o
 OBJS-$(CONFIG_RV40_DECODER)             += aarch64/rv40dsp_init_aarch64.o
 OBJS-$(CONFIG_VC1_DECODER)              += aarch64/vc1dsp_init_aarch64.o
 
 NEON-OBJS-$(CONFIG_H264CHROMA)          += aarch64/h264cmc_neon.o
 NEON-OBJS-$(CONFIG_H264DSP)             += aarch64/h264idct_neon.o
+NEON-OBJS-$(CONFIG_H264QPEL)            += aarch64/h264qpel_neon.o

libavcodec/aarch64/h264qpel_init_aarch64.c (new file)
@@ -0,0 +1,172 @@
/*
* ARM NEON optimised DSP functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of Libav.
*
* Libav is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* Libav is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with Libav; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/h264qpel.h"
void ff_put_h264_qpel16_mc00_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc10_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc20_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc30_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc01_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc11_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc21_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc31_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc02_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc12_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc22_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc32_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc03_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc13_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc23_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc33_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc00_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc10_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc30_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc01_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc11_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc21_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc31_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc02_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc12_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc22_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc32_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc03_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc13_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc23_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc33_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc00_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc10_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc20_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc30_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc01_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc11_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc21_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc31_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc02_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc12_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc22_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc32_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc03_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc13_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc23_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc33_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc00_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc10_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc20_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc30_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc01_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc11_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc21_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc31_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc02_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc12_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc22_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc32_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc03_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc13_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc23_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc33_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
av_cold void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth)
{
const int high_bit_depth = bit_depth > 8;
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags) && !high_bit_depth) {
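/* pixels_tab[i][j]: i selects the block width (0: 16, 1: 8) and
 * j = x + 4 * y the quarter-pel position; the mc00 (plain copy)
 * entries are left to the existing implementations. */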
/* c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon; */
c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon;
c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon;
c->put_h264_qpel_pixels_tab[0][ 3] = ff_put_h264_qpel16_mc30_neon;
c->put_h264_qpel_pixels_tab[0][ 4] = ff_put_h264_qpel16_mc01_neon;
c->put_h264_qpel_pixels_tab[0][ 5] = ff_put_h264_qpel16_mc11_neon;
c->put_h264_qpel_pixels_tab[0][ 6] = ff_put_h264_qpel16_mc21_neon;
c->put_h264_qpel_pixels_tab[0][ 7] = ff_put_h264_qpel16_mc31_neon;
c->put_h264_qpel_pixels_tab[0][ 8] = ff_put_h264_qpel16_mc02_neon;
c->put_h264_qpel_pixels_tab[0][ 9] = ff_put_h264_qpel16_mc12_neon;
c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_neon;
c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_neon;
c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_neon;
c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_neon;
c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_neon;
c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_neon;
/* c->put_h264_qpel_pixels_tab[1][ 0] = ff_put_h264_qpel8_mc00_neon; */
c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon;
c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon;
c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon;
c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon;
c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon;
c->put_h264_qpel_pixels_tab[1][ 6] = ff_put_h264_qpel8_mc21_neon;
c->put_h264_qpel_pixels_tab[1][ 7] = ff_put_h264_qpel8_mc31_neon;
c->put_h264_qpel_pixels_tab[1][ 8] = ff_put_h264_qpel8_mc02_neon;
c->put_h264_qpel_pixels_tab[1][ 9] = ff_put_h264_qpel8_mc12_neon;
c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_neon;
c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_neon;
c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_neon;
c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_neon;
c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_neon;
c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_neon;
/* c->avg_h264_qpel_pixels_tab[0][ 0] = ff_avg_h264_qpel16_mc00_neon; */
c->avg_h264_qpel_pixels_tab[0][ 1] = ff_avg_h264_qpel16_mc10_neon;
c->avg_h264_qpel_pixels_tab[0][ 2] = ff_avg_h264_qpel16_mc20_neon;
c->avg_h264_qpel_pixels_tab[0][ 3] = ff_avg_h264_qpel16_mc30_neon;
c->avg_h264_qpel_pixels_tab[0][ 4] = ff_avg_h264_qpel16_mc01_neon;
c->avg_h264_qpel_pixels_tab[0][ 5] = ff_avg_h264_qpel16_mc11_neon;
c->avg_h264_qpel_pixels_tab[0][ 6] = ff_avg_h264_qpel16_mc21_neon;
c->avg_h264_qpel_pixels_tab[0][ 7] = ff_avg_h264_qpel16_mc31_neon;
c->avg_h264_qpel_pixels_tab[0][ 8] = ff_avg_h264_qpel16_mc02_neon;
c->avg_h264_qpel_pixels_tab[0][ 9] = ff_avg_h264_qpel16_mc12_neon;
c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_neon;
c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_neon;
c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_neon;
c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_neon;
c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_neon;
c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_neon;
/* c->avg_h264_qpel_pixels_tab[1][ 0] = ff_avg_h264_qpel8_mc00_neon; */
c->avg_h264_qpel_pixels_tab[1][ 1] = ff_avg_h264_qpel8_mc10_neon;
c->avg_h264_qpel_pixels_tab[1][ 2] = ff_avg_h264_qpel8_mc20_neon;
c->avg_h264_qpel_pixels_tab[1][ 3] = ff_avg_h264_qpel8_mc30_neon;
c->avg_h264_qpel_pixels_tab[1][ 4] = ff_avg_h264_qpel8_mc01_neon;
c->avg_h264_qpel_pixels_tab[1][ 5] = ff_avg_h264_qpel8_mc11_neon;
c->avg_h264_qpel_pixels_tab[1][ 6] = ff_avg_h264_qpel8_mc21_neon;
c->avg_h264_qpel_pixels_tab[1][ 7] = ff_avg_h264_qpel8_mc31_neon;
c->avg_h264_qpel_pixels_tab[1][ 8] = ff_avg_h264_qpel8_mc02_neon;
c->avg_h264_qpel_pixels_tab[1][ 9] = ff_avg_h264_qpel8_mc12_neon;
c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_neon;
c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_neon;
c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_neon;
c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_neon;
c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_neon;
c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon;
}
}

libavcodec/aarch64/h264qpel_neon.S (new file)
@@ -0,0 +1,934 @@
/*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
*
* This file is part of Libav.
*
* Libav is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* Libav is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with Libav; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
#include "neon.S"
/* H.264 qpel MC */
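// The lowpass filter below is the H.264 6-tap half-pel filter
// (1, -5, 20, 20, -5, 1). lowpass_const packs the two non-unit
// coefficients into v6 (v6.H[1] = 20, v6.H[0] = 5) so the macros
// can apply them with mla/mls by element.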
.macro lowpass_const r
movz \r, #20, lsl #16
movk \r, #5
mov v6.S[0], \r
.endm
//trashes v0-v5
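// lowpass_8: horizontal 6-tap filter over two 8-pixel rows; \r0:\r1 and
// \r2:\r3 each hold the 13 source bytes needed (starting at x-2), results
// go to \d0/\d1. With narrow=1 the 16-bit sums are rounded, shifted by 5
// and saturated back to bytes (sqrshrun #5).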
.macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
ext v2.8B, \r0\().8B, \r1\().8B, #2
ext v3.8B, \r0\().8B, \r1\().8B, #3
uaddl v2.8H, v2.8B, v3.8B
ext v4.8B, \r0\().8B, \r1\().8B, #1
ext v5.8B, \r0\().8B, \r1\().8B, #4
uaddl v4.8H, v4.8B, v5.8B
ext v1.8B, \r0\().8B, \r1\().8B, #5
uaddl \d0\().8H, \r0\().8B, v1.8B
ext v0.8B, \r2\().8B, \r3\().8B, #2
mla \d0\().8H, v2.8H, v6.H[1]
ext v1.8B, \r2\().8B, \r3\().8B, #3
uaddl v0.8H, v0.8B, v1.8B
ext v1.8B, \r2\().8B, \r3\().8B, #1
mls \d0\().8H, v4.8H, v6.H[0]
ext v3.8B, \r2\().8B, \r3\().8B, #4
uaddl v1.8H, v1.8B, v3.8B
ext v2.8B, \r2\().8B, \r3\().8B, #5
uaddl \d1\().8H, \r2\().8B, v2.8B
mla \d1\().8H, v0.8H, v6.H[1]
mls \d1\().8H, v1.8H, v6.H[0]
.if \narrow
sqrshrun \d0\().8B, \d0\().8H, #5
sqrshrun \d1\().8B, \d1\().8H, #5
.endif
.endm
//trashes v0-v5, v7, v30-v31
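// lowpass_8H: same horizontal filter on one 16-byte row per operand, but
// keeps the full 16-bit result in place (no narrowing); used as the first
// pass of the 2D (hv) filter.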
.macro lowpass_8H r0, r1
ext v0.16B, \r0\().16B, \r0\().16B, #2
ext v1.16B, \r0\().16B, \r0\().16B, #3
uaddl v0.8H, v0.8B, v1.8B
ext v2.16B, \r0\().16B, \r0\().16B, #1
ext v3.16B, \r0\().16B, \r0\().16B, #4
uaddl v2.8H, v2.8B, v3.8B
ext v30.16B, \r0\().16B, \r0\().16B, #5
uaddl \r0\().8H, \r0\().8B, v30.8B
ext v4.16B, \r1\().16B, \r1\().16B, #2
mla \r0\().8H, v0.8H, v6.H[1]
ext v5.16B, \r1\().16B, \r1\().16B, #3
uaddl v4.8H, v4.8B, v5.8B
ext v7.16B, \r1\().16B, \r1\().16B, #1
mls \r0\().8H, v2.8H, v6.H[0]
ext v0.16B, \r1\().16B, \r1\().16B, #4
uaddl v7.8H, v7.8B, v0.8B
ext v31.16B, \r1\().16B, \r1\().16B, #5
uaddl \r1\().8H, \r1\().8B, v31.8B
mla \r1\().8H, v4.8H, v6.H[1]
mls \r1\().8H, v7.8H, v6.H[0]
.endm
// trashes v2-v5, v30
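// lowpass_8_1: single-row variant of lowpass_8.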
.macro lowpass_8_1 r0, r1, d0, narrow=1
ext v2.8B, \r0\().8B, \r1\().8B, #2
ext v3.8B, \r0\().8B, \r1\().8B, #3
uaddl v2.8H, v2.8B, v3.8B
ext v4.8B, \r0\().8B, \r1\().8B, #1
ext v5.8B, \r0\().8B, \r1\().8B, #4
uaddl v4.8H, v4.8B, v5.8B
ext v30.8B, \r0\().8B, \r1\().8B, #5
uaddl \d0\().8H, \r0\().8B, v30.8B
mla \d0\().8H, v2.8H, v6.H[1]
mls \d0\().8H, v4.8H, v6.H[0]
.if \narrow
sqrshrun \d0\().8B, \d0\().8H, #5
.endif
.endm
// trashes v0-v7
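// lowpass_8.16: vertical pass of the 2D filter on the 16-bit intermediates
// from lowpass_8H. 20*x is formed as (x << 4) + (x << 2) and 5*x as
// (x << 2) + x; the combined rounding shift for both passes is 10
// (rshrn #10), then the result saturates to bytes (sqxtun).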
.macro lowpass_8.16 r0, r1, r2
ext v1.16B, \r0\().16B, \r1\().16B, #4
ext v0.16B, \r0\().16B, \r1\().16B, #6
saddl v5.4S, v1.4H, v0.4H
ext v2.16B, \r0\().16B, \r1\().16B, #2
saddl2 v1.4S, v1.8H, v0.8H
ext v3.16B, \r0\().16B, \r1\().16B, #8
saddl v6.4S, v2.4H, v3.4H
ext \r1\().16B, \r0\().16B, \r1\().16B, #10
saddl2 v2.4S, v2.8H, v3.8H
saddl v0.4S, \r0\().4H, \r1\().4H
saddl2 v4.4S, \r0\().8H, \r1\().8H
shl v3.4S, v5.4S, #4
shl v5.4S, v5.4S, #2
shl v7.4S, v6.4S, #2
add v5.4S, v5.4S, v3.4S
add v6.4S, v6.4S, v7.4S
shl v3.4S, v1.4S, #4
shl v1.4S, v1.4S, #2
shl v7.4S, v2.4S, #2
add v1.4S, v1.4S, v3.4S
add v2.4S, v2.4S, v7.4S
add v5.4S, v5.4S, v0.4S
sub v5.4S, v5.4S, v6.4S
add v1.4S, v1.4S, v4.4S
sub v1.4S, v1.4S, v2.4S
rshrn v5.4H, v5.4S, #10
rshrn2 v5.8H, v1.4S, #10
sqxtun \r2\().8B, v5.8H
.endm
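// put/avg_h264_qpel{8,16}_h_lowpass: x0 = dst, x1 = src (the callers
// pre-adjust x1 by -2), x2 = source stride, x3 = destination stride,
// x12 = row count. The 16-wide versions run the 8-wide body twice,
// advancing by 8 columns.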
function put_h264_qpel16_h_lowpass_neon_packed
mov x4, x30
mov x12, #16
mov x3, #8
bl put_h264_qpel8_h_lowpass_neon
sub x1, x1, x2, lsl #4
add x1, x1, #8
mov x12, #16
mov x30, x4
b put_h264_qpel8_h_lowpass_neon
endfunc
.macro h264_qpel_h_lowpass type
function \type\()_h264_qpel16_h_lowpass_neon
mov x13, x30
mov x12, #16
bl \type\()_h264_qpel8_h_lowpass_neon
sub x0, x0, x3, lsl #4
sub x1, x1, x2, lsl #4
add x0, x0, #8
add x1, x1, #8
mov x12, #16
mov x30, x13
endfunc
function \type\()_h264_qpel8_h_lowpass_neon
1: ld1 {v28.8B, v29.8B}, [x1], x2
ld1 {v16.8B, v17.8B}, [x1], x2
subs x12, x12, #2
lowpass_8 v28, v29, v16, v17, v28, v16
.ifc \type,avg
ld1 {v2.8B}, [x0], x3
urhadd v28.8B, v28.8B, v2.8B
ld1 {v3.8B}, [x0]
urhadd v16.8B, v16.8B, v3.8B
sub x0, x0, x3
.endif
st1 {v28.8B}, [x0], x3
st1 {v16.8B}, [x0], x3
b.ne 1b
ret
endfunc
.endm
h264_qpel_h_lowpass put
h264_qpel_h_lowpass avg
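// The _l2 variants also average the filtered row with a second source row
// (pointer in x3, read with the same stride x2 as x1) using urhadd, which
// yields the quarter-pel positions halfway between a filtered and an
// unfiltered pixel.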
.macro h264_qpel_h_lowpass_l2 type
function \type\()_h264_qpel16_h_lowpass_l2_neon
mov x13, x30
mov x12, #16
bl \type\()_h264_qpel8_h_lowpass_l2_neon
sub x0, x0, x2, lsl #4
sub x1, x1, x2, lsl #4
sub x3, x3, x2, lsl #4
add x0, x0, #8
add x1, x1, #8
add x3, x3, #8
mov x12, #16
mov x30, x13
endfunc
function \type\()_h264_qpel8_h_lowpass_l2_neon
1: ld1 {v26.8B, v27.8B}, [x1], x2
ld1 {v16.8B, v17.8B}, [x1], x2
ld1 {v28.8B}, [x3], x2
ld1 {v29.8B}, [x3], x2
subs x12, x12, #2
lowpass_8 v26, v27, v16, v17, v26, v27
urhadd v26.8B, v26.8B, v28.8B
urhadd v27.8B, v27.8B, v29.8B
.ifc \type,avg
ld1 {v2.8B}, [x0], x2
urhadd v26.8B, v26.8B, v2.8B
ld1 {v3.8B}, [x0]
urhadd v27.8B, v27.8B, v3.8B
sub x0, x0, x2
.endif
st1 {v26.8B}, [x0], x2
st1 {v27.8B}, [x0], x2
b.ne 1b
ret
endfunc
.endm
h264_qpel_h_lowpass_l2 put
h264_qpel_h_lowpass_l2 avg
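// Vertical lowpass: 13 rows of 8 source bytes are loaded, transposed with
// transpose_8x8B, filtered with the same horizontal lowpass_8 macro and
// transposed back, so one 6-tap implementation covers both directions.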
function put_h264_qpel16_v_lowpass_neon_packed
mov x4, x30
mov x2, #8
bl put_h264_qpel8_v_lowpass_neon
sub x1, x1, x3, lsl #2
bl put_h264_qpel8_v_lowpass_neon
sub x1, x1, x3, lsl #4
sub x1, x1, x3, lsl #2
add x1, x1, #8
bl put_h264_qpel8_v_lowpass_neon
sub x1, x1, x3, lsl #2
mov x30, x4
b put_h264_qpel8_v_lowpass_neon
endfunc
.macro h264_qpel_v_lowpass type
function \type\()_h264_qpel16_v_lowpass_neon
mov x4, x30
bl \type\()_h264_qpel8_v_lowpass_neon
sub x1, x1, x3, lsl #2
bl \type\()_h264_qpel8_v_lowpass_neon
sub x0, x0, x2, lsl #4
add x0, x0, #8
sub x1, x1, x3, lsl #4
sub x1, x1, x3, lsl #2
add x1, x1, #8
bl \type\()_h264_qpel8_v_lowpass_neon
sub x1, x1, x3, lsl #2
mov x30, x4
endfunc
function \type\()_h264_qpel8_v_lowpass_neon
ld1 {v16.8B}, [x1], x3
ld1 {v18.8B}, [x1], x3
ld1 {v20.8B}, [x1], x3
ld1 {v22.8B}, [x1], x3
ld1 {v24.8B}, [x1], x3
ld1 {v26.8B}, [x1], x3
ld1 {v28.8B}, [x1], x3
ld1 {v30.8B}, [x1], x3
ld1 {v17.8B}, [x1], x3
ld1 {v19.8B}, [x1], x3
ld1 {v21.8B}, [x1], x3
ld1 {v23.8B}, [x1], x3
ld1 {v25.8B}, [x1]
transpose_8x8B v16, v18, v20, v22, v24, v26, v28, v30, v0, v1
transpose_8x8B v17, v19, v21, v23, v25, v27, v29, v31, v0, v1
lowpass_8 v16, v17, v18, v19, v16, v17
lowpass_8 v20, v21, v22, v23, v18, v19
lowpass_8 v24, v25, v26, v27, v20, v21
lowpass_8 v28, v29, v30, v31, v22, v23
transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
.ifc \type,avg
ld1 {v24.8B}, [x0], x2
urhadd v16.8B, v16.8B, v24.8B
ld1 {v25.8B}, [x0], x2
urhadd v17.8B, v17.8B, v25.8B
ld1 {v26.8B}, [x0], x2
urhadd v18.8B, v18.8B, v26.8B
ld1 {v27.8B}, [x0], x2
urhadd v19.8B, v19.8B, v27.8B
ld1 {v28.8B}, [x0], x2
urhadd v20.8B, v20.8B, v28.8B
ld1 {v29.8B}, [x0], x2
urhadd v21.8B, v21.8B, v29.8B
ld1 {v30.8B}, [x0], x2
urhadd v22.8B, v22.8B, v30.8B
ld1 {v31.8B}, [x0], x2
urhadd v23.8B, v23.8B, v31.8B
sub x0, x0, x2, lsl #3
.endif
st1 {v16.8B}, [x0], x2
st1 {v17.8B}, [x0], x2
st1 {v18.8B}, [x0], x2
st1 {v19.8B}, [x0], x2
st1 {v20.8B}, [x0], x2
st1 {v21.8B}, [x0], x2
st1 {v22.8B}, [x0], x2
st1 {v23.8B}, [x0], x2
ret
endfunc
.endm
h264_qpel_v_lowpass put
h264_qpel_v_lowpass avg
.macro h264_qpel_v_lowpass_l2 type
function \type\()_h264_qpel16_v_lowpass_l2_neon
mov x4, x30
bl \type\()_h264_qpel8_v_lowpass_l2_neon
sub x1, x1, x3, lsl #2
bl \type\()_h264_qpel8_v_lowpass_l2_neon
sub x0, x0, x3, lsl #4
sub x12, x12, x2, lsl #4
add x0, x0, #8
add x12, x12, #8
sub x1, x1, x3, lsl #4
sub x1, x1, x3, lsl #2
add x1, x1, #8
bl \type\()_h264_qpel8_v_lowpass_l2_neon
sub x1, x1, x3, lsl #2
mov x30, x4
endfunc
function \type\()_h264_qpel8_v_lowpass_l2_neon
ld1 {v16.8B}, [x1], x3
ld1 {v18.8B}, [x1], x3
ld1 {v20.8B}, [x1], x3
ld1 {v22.8B}, [x1], x3
ld1 {v24.8B}, [x1], x3
ld1 {v26.8B}, [x1], x3
ld1 {v28.8B}, [x1], x3
ld1 {v30.8B}, [x1], x3
ld1 {v17.8B}, [x1], x3
ld1 {v19.8B}, [x1], x3
ld1 {v21.8B}, [x1], x3
ld1 {v23.8B}, [x1], x3
ld1 {v25.8B}, [x1]
transpose_8x8B v16, v18, v20, v22, v24, v26, v28, v30, v0, v1
transpose_8x8B v17, v19, v21, v23, v25, v27, v29, v31, v0, v1
lowpass_8 v16, v17, v18, v19, v16, v17
lowpass_8 v20, v21, v22, v23, v18, v19
lowpass_8 v24, v25, v26, v27, v20, v21
lowpass_8 v28, v29, v30, v31, v22, v23
transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
ld1 {v24.8B}, [x12], x2
ld1 {v25.8B}, [x12], x2
ld1 {v26.8B}, [x12], x2
ld1 {v27.8B}, [x12], x2
ld1 {v28.8B}, [x12], x2
urhadd v16.8B, v24.8B, v16.8B
urhadd v17.8B, v25.8B, v17.8B
ld1 {v29.8B}, [x12], x2
urhadd v18.8B, v26.8B, v18.8B
urhadd v19.8B, v27.8B, v19.8B
ld1 {v30.8B}, [x12], x2
urhadd v20.8B, v28.8B, v20.8B
urhadd v21.8B, v29.8B, v21.8B
ld1 {v31.8B}, [x12], x2
urhadd v22.8B, v30.8B, v22.8B
urhadd v23.8B, v31.8B, v23.8B
.ifc \type,avg
ld1 {v24.8B}, [x0], x3
urhadd v16.8B, v16.8B, v24.8B
ld1 {v25.8B}, [x0], x3
urhadd v17.8B, v17.8B, v25.8B
ld1 {v26.8B}, [x0], x3
urhadd v18.8B, v18.8B, v26.8B
ld1 {v27.8B}, [x0], x3
urhadd v19.8B, v19.8B, v27.8B
ld1 {v28.8B}, [x0], x3
urhadd v20.8B, v20.8B, v28.8B
ld1 {v29.8B}, [x0], x3
urhadd v21.8B, v21.8B, v29.8B
ld1 {v30.8B}, [x0], x3
urhadd v22.8B, v22.8B, v30.8B
ld1 {v31.8B}, [x0], x3
urhadd v23.8B, v23.8B, v31.8B
sub x0, x0, x3, lsl #3
.endif
st1 {v16.8B}, [x0], x3
st1 {v17.8B}, [x0], x3
st1 {v18.8B}, [x0], x3
st1 {v19.8B}, [x0], x3
st1 {v20.8B}, [x0], x3
st1 {v21.8B}, [x0], x3
st1 {v22.8B}, [x0], x3
st1 {v23.8B}, [x0], x3
ret
endfunc
.endm
h264_qpel_v_lowpass_l2 put
h264_qpel_v_lowpass_l2 avg
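// 2D (hv) lowpass: _neon_top loads 13 rows of 16 source bytes, applies the
// horizontal filter without narrowing (lowpass_8H), transposes the 16-bit
// intermediates and runs the vertical pass (lowpass_8.16), leaving the 8x8
// byte result in v16-v23 for the caller to average and store.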
function put_h264_qpel8_hv_lowpass_neon_top
lowpass_const w12
ld1 {v16.8H}, [x1], x3
ld1 {v17.8H}, [x1], x3
ld1 {v18.8H}, [x1], x3
ld1 {v19.8H}, [x1], x3
ld1 {v20.8H}, [x1], x3
ld1 {v21.8H}, [x1], x3
ld1 {v22.8H}, [x1], x3
ld1 {v23.8H}, [x1], x3
ld1 {v24.8H}, [x1], x3
ld1 {v25.8H}, [x1], x3
ld1 {v26.8H}, [x1], x3
ld1 {v27.8H}, [x1], x3
ld1 {v28.8H}, [x1]
lowpass_8H v16, v17
lowpass_8H v18, v19
lowpass_8H v20, v21
lowpass_8H v22, v23
lowpass_8H v24, v25
lowpass_8H v26, v27
lowpass_8H v28, v29
transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v0, v1
lowpass_8.16 v16, v24, v16
lowpass_8.16 v17, v25, v17
lowpass_8.16 v18, v26, v18
lowpass_8.16 v19, v27, v19
lowpass_8.16 v20, v28, v20
lowpass_8.16 v21, v29, v21
lowpass_8.16 v22, v30, v22
lowpass_8.16 v23, v31, v23
transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
ret
endfunc
.macro h264_qpel8_hv_lowpass type
function \type\()_h264_qpel8_hv_lowpass_neon
mov x10, x30
bl put_h264_qpel8_hv_lowpass_neon_top
.ifc \type,avg
ld1 {v0.8B}, [x0], x2
urhadd v16.8B, v16.8B, v0.8B
ld1 {v1.8B}, [x0], x2
urhadd v17.8B, v17.8B, v1.8B
ld1 {v2.8B}, [x0], x2
urhadd v18.8B, v18.8B, v2.8B
ld1 {v3.8B}, [x0], x2
urhadd v19.8B, v19.8B, v3.8B
ld1 {v4.8B}, [x0], x2
urhadd v20.8B, v20.8B, v4.8B
ld1 {v5.8B}, [x0], x2
urhadd v21.8B, v21.8B, v5.8B
ld1 {v6.8B}, [x0], x2
urhadd v22.8B, v22.8B, v6.8B
ld1 {v7.8B}, [x0], x2
urhadd v23.8B, v23.8B, v7.8B
sub x0, x0, x2, lsl #3
.endif
st1 {v16.8B}, [x0], x2
st1 {v17.8B}, [x0], x2
st1 {v18.8B}, [x0], x2
st1 {v19.8B}, [x0], x2
st1 {v20.8B}, [x0], x2
st1 {v21.8B}, [x0], x2
st1 {v22.8B}, [x0], x2
st1 {v23.8B}, [x0], x2
ret x10
endfunc
.endm
h264_qpel8_hv_lowpass put
h264_qpel8_hv_lowpass avg
.macro h264_qpel8_hv_lowpass_l2 type
function \type\()_h264_qpel8_hv_lowpass_l2_neon
mov x10, x30
bl put_h264_qpel8_hv_lowpass_neon_top
ld1 {v0.8B, v1.8B}, [x2], #16
ld1 {v2.8B, v3.8B}, [x2], #16
urhadd v0.8B, v0.8B, v16.8B
urhadd v1.8B, v1.8B, v17.8B
ld1 {v4.8B, v5.8B}, [x2], #16
urhadd v2.8B, v2.8B, v18.8B
urhadd v3.8B, v3.8B, v19.8B
ld1 {v6.8B, v7.8B}, [x2], #16
urhadd v4.8B, v4.8B, v20.8B
urhadd v5.8B, v5.8B, v21.8B
urhadd v6.8B, v6.8B, v22.8B
urhadd v7.8B, v7.8B, v23.8B
.ifc \type,avg
ld1 {v16.8B}, [x0], x3
urhadd v0.8B, v0.8B, v16.8B
ld1 {v17.8B}, [x0], x3
urhadd v1.8B, v1.8B, v17.8B
ld1 {v18.8B}, [x0], x3
urhadd v2.8B, v2.8B, v18.8B
ld1 {v19.8B}, [x0], x3
urhadd v3.8B, v3.8B, v19.8B
ld1 {v20.8B}, [x0], x3
urhadd v4.8B, v4.8B, v20.8B
ld1 {v21.8B}, [x0], x3
urhadd v5.8B, v5.8B, v21.8B
ld1 {v22.8B}, [x0], x3
urhadd v6.8B, v6.8B, v22.8B
ld1 {v23.8B}, [x0], x3
urhadd v7.8B, v7.8B, v23.8B
sub x0, x0, x3, lsl #3
.endif
st1 {v0.8B}, [x0], x3
st1 {v1.8B}, [x0], x3
st1 {v2.8B}, [x0], x3
st1 {v3.8B}, [x0], x3
st1 {v4.8B}, [x0], x3
st1 {v5.8B}, [x0], x3
st1 {v6.8B}, [x0], x3
st1 {v7.8B}, [x0], x3
ret x10
endfunc
.endm
h264_qpel8_hv_lowpass_l2 put
h264_qpel8_hv_lowpass_l2 avg
.macro h264_qpel16_hv type
function \type\()_h264_qpel16_hv_lowpass_neon
mov x13, x30
bl \type\()_h264_qpel8_hv_lowpass_neon
sub x1, x1, x3, lsl #2
bl \type\()_h264_qpel8_hv_lowpass_neon
sub x1, x1, x3, lsl #4
sub x1, x1, x3, lsl #2
add x1, x1, #8
sub x0, x0, x2, lsl #4
add x0, x0, #8
bl \type\()_h264_qpel8_hv_lowpass_neon
sub x1, x1, x3, lsl #2
mov x30, x13
b \type\()_h264_qpel8_hv_lowpass_neon
endfunc
function \type\()_h264_qpel16_hv_lowpass_l2_neon
mov x13, x30
sub x2, x4, #256
bl \type\()_h264_qpel8_hv_lowpass_l2_neon
sub x1, x1, x3, lsl #2
bl \type\()_h264_qpel8_hv_lowpass_l2_neon
sub x1, x1, x3, lsl #4
sub x1, x1, x3, lsl #2
add x1, x1, #8
sub x0, x0, x3, lsl #4
add x0, x0, #8
bl \type\()_h264_qpel8_hv_lowpass_l2_neon
sub x1, x1, x3, lsl #2
mov x30, x13
b \type\()_h264_qpel8_hv_lowpass_l2_neon
endfunc
.endm
h264_qpel16_hv put
h264_qpel16_hv avg
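// Public entry points: ff_{put,avg}_h264_qpel{8,16}_mcXY_neon(dst, src,
// stride) with x0 = dst, x1 = src, x2 = stride; X/Y are the horizontal and
// vertical quarter-pel offsets. Each wrapper adjusts the source pointer,
// sets up strides and scratch space, then branches into the lowpass helpers.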
.macro h264_qpel8 type
function ff_\type\()_h264_qpel8_mc10_neon, export=1
lowpass_const w3
mov x3, x1
sub x1, x1, #2
mov x12, #8
b \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc
function ff_\type\()_h264_qpel8_mc20_neon, export=1
lowpass_const w3
sub x1, x1, #2
mov x3, x2
mov x12, #8
b \type\()_h264_qpel8_h_lowpass_neon
endfunc
function ff_\type\()_h264_qpel8_mc30_neon, export=1
lowpass_const w3
add x3, x1, #1
sub x1, x1, #2
mov x12, #8
b \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc
function ff_\type\()_h264_qpel8_mc01_neon, export=1
mov x14, x30
mov x12, x1
\type\()_h264_qpel8_mc01:
lowpass_const w3
mov x3, x2
sub x1, x1, x2, lsl #1
bl \type\()_h264_qpel8_v_lowpass_l2_neon
ret x14
endfunc
function ff_\type\()_h264_qpel8_mc11_neon, export=1
mov x14, x30
mov x8, x0
mov x9, x1
\type\()_h264_qpel8_mc11:
lowpass_const w3
mov x11, sp
sub sp, sp, #64
mov x0, sp
sub x1, x1, #2
mov x3, #8
mov x12, #8
bl put_h264_qpel8_h_lowpass_neon
mov x0, x8
mov x3, x2
mov x12, sp
sub x1, x9, x2, lsl #1
mov x2, #8
bl \type\()_h264_qpel8_v_lowpass_l2_neon
mov sp, x11
ret x14
endfunc
function ff_\type\()_h264_qpel8_mc21_neon, export=1
mov x14, x30
mov x8, x0
mov x9, x1
\type\()_h264_qpel8_mc21:
lowpass_const w3
mov x11, sp
sub sp, sp, #(8*8+16*12)
sub x1, x1, #2
mov x3, #8
mov x0, sp
mov x12, #8
bl put_h264_qpel8_h_lowpass_neon
mov x4, x0
mov x0, x8
sub x1, x9, x2, lsl #1
sub x1, x1, #2
mov x3, x2
sub x2, x4, #64
bl \type\()_h264_qpel8_hv_lowpass_l2_neon
mov sp, x11
ret x14
endfunc
function ff_\type\()_h264_qpel8_mc31_neon, export=1
add x1, x1, #1
mov x14, x30
mov x8, x0
mov x9, x1
sub x1, x1, #1
b \type\()_h264_qpel8_mc11
endfunc
function ff_\type\()_h264_qpel8_mc02_neon, export=1
mov x14, x30
lowpass_const w3
sub x1, x1, x2, lsl #1
mov x3, x2
bl \type\()_h264_qpel8_v_lowpass_neon
ret x14
endfunc
function ff_\type\()_h264_qpel8_mc12_neon, export=1
mov x14, x30
mov x8, x0
mov x9, x1
\type\()_h264_qpel8_mc12:
lowpass_const w3
mov x11, sp
sub sp, sp, #(8*8+16*12)
sub x1, x1, x2, lsl #1
mov x3, x2
mov x2, #8
mov x0, sp
bl put_h264_qpel8_v_lowpass_neon
mov x4, x0
mov x0, x8
sub x1, x9, x3, lsl #1
sub x1, x1, #2
sub x2, x4, #64
bl \type\()_h264_qpel8_hv_lowpass_l2_neon
mov sp, x11
ret x14
endfunc
function ff_\type\()_h264_qpel8_mc22_neon, export=1
mov x14, x30
mov x11, sp
sub x1, x1, x2, lsl #1
sub x1, x1, #2
mov x3, x2
bl \type\()_h264_qpel8_hv_lowpass_neon
mov sp, x11
ret x14
endfunc
function ff_\type\()_h264_qpel8_mc32_neon, export=1
mov x14, x30
mov x8, x0
mov x9, x1
add x1, x1, #1
b \type\()_h264_qpel8_mc12
endfunc
function ff_\type\()_h264_qpel8_mc03_neon, export=1
mov x14, x30
add x12, x1, x2
b \type\()_h264_qpel8_mc01
endfunc
function ff_\type\()_h264_qpel8_mc13_neon, export=1
mov x14, x30
mov x8, x0
mov x9, x1
add x1, x1, x2
b \type\()_h264_qpel8_mc11
endfunc
function ff_\type\()_h264_qpel8_mc23_neon, export=1
mov x14, x30
mov x8, x0
mov x9, x1
add x1, x1, x2
b \type\()_h264_qpel8_mc21
endfunc
function ff_\type\()_h264_qpel8_mc33_neon, export=1
add x1, x1, #1
mov x14, x30
mov x8, x0
mov x9, x1
add x1, x1, x2
sub x1, x1, #1
b \type\()_h264_qpel8_mc11
endfunc
.endm
h264_qpel8 put
h264_qpel8 avg
.macro h264_qpel16 type
function ff_\type\()_h264_qpel16_mc10_neon, export=1
lowpass_const w3
mov x3, x1
sub x1, x1, #2
b \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc
function ff_\type\()_h264_qpel16_mc20_neon, export=1
lowpass_const w3
sub x1, x1, #2
mov x3, x2
b \type\()_h264_qpel16_h_lowpass_neon
endfunc
function ff_\type\()_h264_qpel16_mc30_neon, export=1
lowpass_const w3
add x3, x1, #1
sub x1, x1, #2
b \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc
function ff_\type\()_h264_qpel16_mc01_neon, export=1
mov x14, x30
mov x12, x1
\type\()_h264_qpel16_mc01:
lowpass_const w3
mov x3, x2
sub x1, x1, x2, lsl #1
bl \type\()_h264_qpel16_v_lowpass_l2_neon
ret x14
endfunc
function ff_\type\()_h264_qpel16_mc11_neon, export=1
mov x14, x30
mov x8, x0
mov x9, x1
\type\()_h264_qpel16_mc11:
lowpass_const w3
mov x11, sp
sub sp, sp, #256
mov x0, sp
sub x1, x1, #2
mov x3, #16
bl put_h264_qpel16_h_lowpass_neon
mov x0, x8
mov x3, x2
mov x12, sp
sub x1, x9, x2, lsl #1
mov x2, #16
bl \type\()_h264_qpel16_v_lowpass_l2_neon
mov sp, x11
ret x14
endfunc
function ff_\type\()_h264_qpel16_mc21_neon, export=1
mov x14, x30
mov x8, x0
mov x9, x1
\type\()_h264_qpel16_mc21:
lowpass_const w3
mov x11, sp
sub sp, sp, #(16*16+16*12)
sub x1, x1, #2
mov x0, sp
bl put_h264_qpel16_h_lowpass_neon_packed
mov x4, x0
mov x0, x8
sub x1, x9, x2, lsl #1
sub x1, x1, #2
mov x3, x2
bl \type\()_h264_qpel16_hv_lowpass_l2_neon
mov sp, x11
ret x14
endfunc
function ff_\type\()_h264_qpel16_mc31_neon, export=1
add x1, x1, #1
mov x14, x30
mov x8, x0
mov x9, x1
sub x1, x1, #1
b \type\()_h264_qpel16_mc11
endfunc
function ff_\type\()_h264_qpel16_mc02_neon, export=1
mov x14, x30
lowpass_const w3
sub x1, x1, x2, lsl #1
mov x3, x2
bl \type\()_h264_qpel16_v_lowpass_neon
ret x14
endfunc
function ff_\type\()_h264_qpel16_mc12_neon, export=1
mov x14, x30
mov x8, x0
mov x9, x1
\type\()_h264_qpel16_mc12:
lowpass_const w3
mov x11, sp
sub sp, sp, #(16*16+16*12)
sub x1, x1, x2, lsl #1
mov x0, sp
mov x3, x2
bl put_h264_qpel16_v_lowpass_neon_packed
mov x4, x0
mov x0, x8
sub x1, x9, x3, lsl #1
sub x1, x1, #2
mov x2, x3
bl \type\()_h264_qpel16_hv_lowpass_l2_neon
mov sp, x11
ret x14
endfunc
function ff_\type\()_h264_qpel16_mc22_neon, export=1
mov x14, x30
lowpass_const w3
mov x11, sp
sub x1, x1, x2, lsl #1
sub x1, x1, #2
mov x3, x2
bl \type\()_h264_qpel16_hv_lowpass_neon
mov sp, x11 // restore stack
ret x14
endfunc
function ff_\type\()_h264_qpel16_mc32_neon, export=1
mov x14, x30
mov x8, x0
mov x9, x1
add x1, x1, #1
b \type\()_h264_qpel16_mc12
endfunc
function ff_\type\()_h264_qpel16_mc03_neon, export=1
mov x14, x30
add x12, x1, x2
b \type\()_h264_qpel16_mc01
endfunc
function ff_\type\()_h264_qpel16_mc13_neon, export=1
mov x14, x30
mov x8, x0
mov x9, x1
add x1, x1, x2
b \type\()_h264_qpel16_mc11
endfunc
function ff_\type\()_h264_qpel16_mc23_neon, export=1
mov x14, x30
mov x8, x0
mov x9, x1
add x1, x1, x2
b \type\()_h264_qpel16_mc21
endfunc
function ff_\type\()_h264_qpel16_mc33_neon, export=1
add x1, x1, #1
mov x14, x30
mov x8, x0
mov x9, x1
add x1, x1, x2
sub x1, x1, #1
b \type\()_h264_qpel16_mc11
endfunc
.endm
h264_qpel16 put
h264_qpel16 avg

libavcodec/aarch64/neon.S
@@ -16,6 +16,70 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/

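// transpose_8x8B transposes an 8x8 block of bytes held in \r0-\r7, using
// \r8/\r9 as temporaries; transpose_8x16B performs the same 8x8 transpose
// independently on the low and high halves of eight 16-byte registers.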
.macro transpose_8x8B r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
trn1 \r8\().8B, \r0\().8B, \r1\().8B
trn2 \r9\().8B, \r0\().8B, \r1\().8B
trn1 \r1\().8B, \r2\().8B, \r3\().8B
trn2 \r3\().8B, \r2\().8B, \r3\().8B
trn1 \r0\().8B, \r4\().8B, \r5\().8B
trn2 \r5\().8B, \r4\().8B, \r5\().8B
trn1 \r2\().8B, \r6\().8B, \r7\().8B
trn2 \r7\().8B, \r6\().8B, \r7\().8B
trn1 \r4\().4H, \r0\().4H, \r2\().4H
trn2 \r2\().4H, \r0\().4H, \r2\().4H
trn1 \r6\().4H, \r5\().4H, \r7\().4H
trn2 \r7\().4H, \r5\().4H, \r7\().4H
trn1 \r5\().4H, \r9\().4H, \r3\().4H
trn2 \r9\().4H, \r9\().4H, \r3\().4H
trn1 \r3\().4H, \r8\().4H, \r1\().4H
trn2 \r8\().4H, \r8\().4H, \r1\().4H
trn1 \r0\().2S, \r3\().2S, \r4\().2S
trn2 \r4\().2S, \r3\().2S, \r4\().2S
trn1 \r1\().2S, \r5\().2S, \r6\().2S
trn2 \r5\().2S, \r5\().2S, \r6\().2S
trn2 \r6\().2S, \r8\().2S, \r2\().2S
trn1 \r2\().2S, \r8\().2S, \r2\().2S
trn1 \r3\().2S, \r9\().2S, \r7\().2S
trn2 \r7\().2S, \r9\().2S, \r7\().2S
.endm
.macro transpose_8x16B r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
trn1 \t0\().16B, \r0\().16B, \r1\().16B
trn2 \t1\().16B, \r0\().16B, \r1\().16B
trn1 \r1\().16B, \r2\().16B, \r3\().16B
trn2 \r3\().16B, \r2\().16B, \r3\().16B
trn1 \r0\().16B, \r4\().16B, \r5\().16B
trn2 \r5\().16B, \r4\().16B, \r5\().16B
trn1 \r2\().16B, \r6\().16B, \r7\().16B
trn2 \r7\().16B, \r6\().16B, \r7\().16B
trn1 \r4\().8H, \r0\().8H, \r2\().8H
trn2 \r2\().8H, \r0\().8H, \r2\().8H
trn1 \r6\().8H, \r5\().8H, \r7\().8H
trn2 \r7\().8H, \r5\().8H, \r7\().8H
trn1 \r5\().8H, \t1\().8H, \r3\().8H
trn2 \t1\().8H, \t1\().8H, \r3\().8H
trn1 \r3\().8H, \t0\().8H, \r1\().8H
trn2 \t0\().8H, \t0\().8H, \r1\().8H
trn1 \r0\().4S, \r3\().4S, \r4\().4S
trn2 \r4\().4S, \r3\().4S, \r4\().4S
trn1 \r1\().4S, \r5\().4S, \r6\().4S
trn2 \r5\().4S, \r5\().4S, \r6\().4S
trn2 \r6\().4S, \t0\().4S, \r2\().4S
trn1 \r2\().4S, \t0\().4S, \r2\().4S
trn1 \r3\().4S, \t1\().4S, \r7\().4S
trn2 \r7\().4S, \t1\().4S, \r7\().4S
.endm
.macro transpose_4x4H r0, r1, r2, r3, r4, r5, r6, r7
trn1 \r4\().4H, \r0\().4H, \r1\().4H
trn2 \r5\().4H, \r0\().4H, \r1\().4H

libavcodec/h264qpel.c
@@ -78,6 +78,8 @@ av_cold void ff_h264qpel_init(H264QpelContext *c, int bit_depth)
         break;
     }
 
+    if (ARCH_AARCH64)
+        ff_h264qpel_init_aarch64(c, bit_depth);
     if (ARCH_ARM)
         ff_h264qpel_init_arm(c, bit_depth);
     if (ARCH_PPC)

libavcodec/h264qpel.h
@@ -31,6 +31,7 @@ typedef struct H264QpelContext {
 void ff_h264qpel_init(H264QpelContext *c, int bit_depth);
 
+void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth);
 void ff_h264qpel_init_arm(H264QpelContext *c, int bit_depth);
 void ff_h264qpel_init_ppc(H264QpelContext *c, int bit_depth);
 void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth);
