H264: change weight/biweight functions to take a height argument.

Neon parts by Mans Rullgard <mans@mansr.com>.
pull/2/head
Ronald S. Bultje 13 years ago
parent 229d263cc9
commit c2d337429c
  1. 77
      libavcodec/arm/h264dsp_init_arm.c
  2. 86
      libavcodec/arm/h264dsp_neon.S
  3. 126
      libavcodec/h264.c
  4. 28
      libavcodec/h264dsp.c
  5. 10
      libavcodec/h264dsp.h
  6. 28
      libavcodec/h264dsp_template.c
  7. 44
      libavcodec/ppc/h264_altivec.c
  8. 210
      libavcodec/x86/h264_weight.asm
  9. 145
      libavcodec/x86/h264_weight_10bit.asm
  10. 175
      libavcodec/x86/h264dsp_mmx.c

@ -32,47 +32,22 @@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0); int beta, int8_t *tc0);
void ff_weight_h264_pixels_16x16_neon(uint8_t *ds, int stride, int log2_den, void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
int weight, int offset); int log2_den, int weight, int offset);
void ff_weight_h264_pixels_16x8_neon(uint8_t *ds, int stride, int log2_den, void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height,
int weight, int offset); int log2_den, int weight, int offset);
void ff_weight_h264_pixels_8x16_neon(uint8_t *ds, int stride, int log2_den, void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height,
int weight, int offset); int log2_den, int weight, int offset);
void ff_weight_h264_pixels_8x8_neon(uint8_t *ds, int stride, int log2_den,
int weight, int offset);
void ff_weight_h264_pixels_8x4_neon(uint8_t *ds, int stride, int log2_den,
int weight, int offset);
void ff_weight_h264_pixels_4x8_neon(uint8_t *ds, int stride, int log2_den,
int weight, int offset);
void ff_weight_h264_pixels_4x4_neon(uint8_t *ds, int stride, int log2_den,
int weight, int offset);
void ff_weight_h264_pixels_4x2_neon(uint8_t *ds, int stride, int log2_den,
int weight, int offset);
void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride, void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride,
int log2_den, int weightd, int weights, int height, int log2_den, int weightd,
int offset); int weights, int offset);
void ff_biweight_h264_pixels_16x8_neon(uint8_t *dst, uint8_t *src, int stride, void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride,
int log2_den, int weightd, int weights, int height, int log2_den, int weightd,
int offset); int weights, int offset);
void ff_biweight_h264_pixels_8x16_neon(uint8_t *dst, uint8_t *src, int stride, void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride,
int log2_den, int weightd, int weights, int height, int log2_den, int weightd,
int offset); int weights, int offset);
void ff_biweight_h264_pixels_8x8_neon(uint8_t *dst, uint8_t *src, int stride,
int log2_den, int weightd, int weights,
int offset);
void ff_biweight_h264_pixels_8x4_neon(uint8_t *dst, uint8_t *src, int stride,
int log2_den, int weightd, int weights,
int offset);
void ff_biweight_h264_pixels_4x8_neon(uint8_t *dst, uint8_t *src, int stride,
int log2_den, int weightd, int weights,
int offset);
void ff_biweight_h264_pixels_4x4_neon(uint8_t *dst, uint8_t *src, int stride,
int log2_den, int weightd, int weights,
int offset);
void ff_biweight_h264_pixels_4x2_neon(uint8_t *dst, uint8_t *src, int stride,
int log2_den, int weightd, int weights,
int offset);
void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride); void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride);
void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride); void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride);
@ -100,23 +75,13 @@ static void ff_h264dsp_init_neon(H264DSPContext *c, const int bit_depth, const i
c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon; c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon; c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16x16_neon; c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_16x8_neon; c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_8x16_neon; c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon;
c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels_8x8_neon;
c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels_8x4_neon;
c->weight_h264_pixels_tab[5] = ff_weight_h264_pixels_4x8_neon;
c->weight_h264_pixels_tab[6] = ff_weight_h264_pixels_4x4_neon;
c->weight_h264_pixels_tab[7] = ff_weight_h264_pixels_4x2_neon;
c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon; c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon;
c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon; c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon;
c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon; c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon;
c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels_8x8_neon;
c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels_8x4_neon;
c->biweight_h264_pixels_tab[5] = ff_biweight_h264_pixels_4x8_neon;
c->biweight_h264_pixels_tab[6] = ff_biweight_h264_pixels_4x4_neon;
c->biweight_h264_pixels_tab[7] = ff_biweight_h264_pixels_4x2_neon;
c->h264_idct_add = ff_h264_idct_add_neon; c->h264_idct_add = ff_h264_idct_add_neon;
c->h264_idct_dc_add = ff_h264_idct_dc_add_neon; c->h264_idct_dc_add = ff_h264_idct_dc_add_neon;

@ -1592,7 +1592,7 @@ endfunc
vdup.8 d1, r5 vdup.8 d1, r5
vmov q2, q8 vmov q2, q8
vmov q3, q8 vmov q3, q8
1: subs ip, ip, #2 1: subs r3, r3, #2
vld1.8 {d20-d21},[r0,:128], r2 vld1.8 {d20-d21},[r0,:128], r2
\macd q2, d0, d20 \macd q2, d0, d20
pld [r0] pld [r0]
@ -1632,7 +1632,7 @@ endfunc
vdup.8 d1, r5 vdup.8 d1, r5
vmov q1, q8 vmov q1, q8
vmov q10, q8 vmov q10, q8
1: subs ip, ip, #2 1: subs r3, r3, #2
vld1.8 {d4},[r0,:64], r2 vld1.8 {d4},[r0,:64], r2
\macd q1, d0, d4 \macd q1, d0, d4
pld [r0] pld [r0]
@ -1662,7 +1662,7 @@ endfunc
vdup.8 d1, r5 vdup.8 d1, r5
vmov q1, q8 vmov q1, q8
vmov q10, q8 vmov q10, q8
1: subs ip, ip, #4 1: subs r3, r3, #4
vld1.32 {d4[0]},[r0,:32], r2 vld1.32 {d4[0]},[r0,:32], r2
vld1.32 {d4[1]},[r0,:32], r2 vld1.32 {d4[1]},[r0,:32], r2
\macd q1, d0, d4 \macd q1, d0, d4
@ -1700,16 +1700,17 @@ endfunc
.endm .endm
.macro biweight_func w .macro biweight_func w
function biweight_h264_pixels_\w\()_neon function ff_biweight_h264_pixels_\w\()_neon, export=1
push {r4-r6, lr} push {r4-r6, lr}
add r4, sp, #16 ldr r12, [sp, #16]
add r4, sp, #20
ldm r4, {r4-r6} ldm r4, {r4-r6}
lsr lr, r4, #31 lsr lr, r4, #31
add r6, r6, #1 add r6, r6, #1
eors lr, lr, r5, lsr #30 eors lr, lr, r5, lsr #30
orr r6, r6, #1 orr r6, r6, #1
vdup.16 q9, r3 vdup.16 q9, r12
lsl r6, r6, r3 lsl r6, r6, r12
vmvn q9, q9 vmvn q9, q9
vdup.16 q8, r6 vdup.16 q8, r6
mov r6, r0 mov r6, r0
@ -1730,34 +1731,15 @@ function biweight_h264_pixels_\w\()_neon
endfunc endfunc
.endm .endm
.macro biweight_entry w, h, b=1
function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
mov ip, #\h
.if \b
b biweight_h264_pixels_\w\()_neon
.endif
endfunc
.endm
biweight_entry 16, 8
biweight_entry 16, 16, b=0
biweight_func 16 biweight_func 16
biweight_entry 8, 16
biweight_entry 8, 4
biweight_entry 8, 8, b=0
biweight_func 8 biweight_func 8
biweight_entry 4, 8
biweight_entry 4, 2
biweight_entry 4, 4, b=0
biweight_func 4 biweight_func 4
@ Weighted prediction @ Weighted prediction
.macro weight_16 add .macro weight_16 add
vdup.8 d0, r3 vdup.8 d0, r12
1: subs ip, ip, #2 1: subs r2, r2, #2
vld1.8 {d20-d21},[r0,:128], r1 vld1.8 {d20-d21},[r0,:128], r1
vmull.u8 q2, d0, d20 vmull.u8 q2, d0, d20
pld [r0] pld [r0]
@ -1785,8 +1767,8 @@ endfunc
.endm .endm
.macro weight_8 add .macro weight_8 add
vdup.8 d0, r3 vdup.8 d0, r12
1: subs ip, ip, #2 1: subs r2, r2, #2
vld1.8 {d4},[r0,:64], r1 vld1.8 {d4},[r0,:64], r1
vmull.u8 q1, d0, d4 vmull.u8 q1, d0, d4
pld [r0] pld [r0]
@ -1806,10 +1788,10 @@ endfunc
.endm .endm
.macro weight_4 add .macro weight_4 add
vdup.8 d0, r3 vdup.8 d0, r12
vmov q1, q8 vmov q1, q8
vmov q10, q8 vmov q10, q8
1: subs ip, ip, #4 1: subs r2, r2, #4
vld1.32 {d4[0]},[r0,:32], r1 vld1.32 {d4[0]},[r0,:32], r1
vld1.32 {d4[1]},[r0,:32], r1 vld1.32 {d4[1]},[r0,:32], r1
vmull.u8 q1, d0, d4 vmull.u8 q1, d0, d4
@ -1842,50 +1824,32 @@ endfunc
.endm .endm
.macro weight_func w .macro weight_func w
function weight_h264_pixels_\w\()_neon function ff_weight_h264_pixels_\w\()_neon, export=1
push {r4, lr} push {r4, lr}
ldr r4, [sp, #8] ldr r12, [sp, #8]
cmp r2, #1 ldr r4, [sp, #12]
lsl r4, r4, r2 cmp r3, #1
lsl r4, r4, r3
vdup.16 q8, r4 vdup.16 q8, r4
mov r4, r0 mov r4, r0
ble 20f ble 20f
rsb lr, r2, #1 rsb lr, r3, #1
vdup.16 q9, lr vdup.16 q9, lr
cmp r3, #0 cmp r12, #0
blt 10f blt 10f
weight_\w vhadd.s16 weight_\w vhadd.s16
10: rsb r3, r3, #0 10: rsb r12, r12, #0
weight_\w vhsub.s16 weight_\w vhsub.s16
20: rsb lr, r2, #0 20: rsb lr, r3, #0
vdup.16 q9, lr vdup.16 q9, lr
cmp r3, #0 cmp r12, #0
blt 10f blt 10f
weight_\w vadd.s16 weight_\w vadd.s16
10: rsb r3, r3, #0 10: rsb r12, r12, #0
weight_\w vsub.s16 weight_\w vsub.s16
endfunc endfunc
.endm .endm
.macro weight_entry w, h, b=1
function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
mov ip, #\h
.if \b
b weight_h264_pixels_\w\()_neon
.endif
endfunc
.endm
weight_entry 16, 8
weight_entry 16, 16, b=0
weight_func 16 weight_func 16
weight_entry 8, 16
weight_entry 8, 4
weight_entry 8, 8, b=0
weight_func 8 weight_func 8
weight_entry 4, 8
weight_entry 4, 2
weight_entry 4, 4, b=0
weight_func 4 weight_func 4

@ -438,7 +438,8 @@ static void chroma_dc_dct_c(DCTELEM *block){
} }
#endif #endif
static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list, static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
int height, int delta, int list,
uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
int src_x_offset, int src_y_offset, int src_x_offset, int src_y_offset,
qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op, qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op,
@ -518,16 +519,16 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422)); s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422));
src_cb= s->edge_emu_buffer; src_cb= s->edge_emu_buffer;
} }
chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height << !!(CHROMA422), mx&7, (my << !!(CHROMA422)) &7); chroma_op(dest_cb, src_cb, h->mb_uvlinesize, height >> !(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
if(emu){ if(emu){
s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422)); s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422));
src_cr= s->edge_emu_buffer; src_cr= s->edge_emu_buffer;
} }
chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height << !!(CHROMA422), mx&7, (my << !!(CHROMA422)) &7); chroma_op(dest_cr, src_cr, h->mb_uvlinesize, height >> !(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
} }
static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta, static inline void mc_part_std(H264Context *h, int n, int square, int height, int delta,
uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
int x_offset, int y_offset, int x_offset, int y_offset,
qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
@ -553,7 +554,7 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
if(list0){ if(list0){
Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ]; Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
mc_dir_part(h, ref, n, square, chroma_height, delta, 0, mc_dir_part(h, ref, n, square, height, delta, 0,
dest_y, dest_cb, dest_cr, x_offset, y_offset, dest_y, dest_cb, dest_cr, x_offset, y_offset,
qpix_op, chroma_op, pixel_shift, chroma444); qpix_op, chroma_op, pixel_shift, chroma444);
@ -563,13 +564,13 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
if(list1){ if(list1){
Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ]; Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
mc_dir_part(h, ref, n, square, chroma_height, delta, 1, mc_dir_part(h, ref, n, square, height, delta, 1,
dest_y, dest_cb, dest_cr, x_offset, y_offset, dest_y, dest_cb, dest_cr, x_offset, y_offset,
qpix_op, chroma_op, pixel_shift, chroma444); qpix_op, chroma_op, pixel_shift, chroma444);
} }
} }
static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta, static inline void mc_part_weighted(H264Context *h, int n, int square, int height, int delta,
uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
int x_offset, int y_offset, int x_offset, int y_offset,
qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
@ -577,17 +578,21 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg, h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
int list0, int list1, int pixel_shift, int chroma444){ int list0, int list1, int pixel_shift, int chroma444){
MpegEncContext * const s = &h->s; MpegEncContext * const s = &h->s;
int chroma_height;
dest_y += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; dest_y += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
if(chroma444){ if(chroma444){
chroma_height = height;
chroma_weight_avg = luma_weight_avg; chroma_weight_avg = luma_weight_avg;
chroma_weight_op = luma_weight_op; chroma_weight_op = luma_weight_op;
dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
} else if (CHROMA422) { } else if (CHROMA422) {
chroma_height = height;
dest_cb += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize; dest_cb += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
dest_cr += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize; dest_cr += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
}else{ }else{
chroma_height = height >> 1;
dest_cb += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize; dest_cb += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize;
dest_cr += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize; dest_cr += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize;
} }
@ -603,78 +608,53 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
int refn0 = h->ref_cache[0][ scan8[n] ]; int refn0 = h->ref_cache[0][ scan8[n] ];
int refn1 = h->ref_cache[1][ scan8[n] ]; int refn1 = h->ref_cache[1][ scan8[n] ];
mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0, mc_dir_part(h, &h->ref_list[0][refn0], n, square, height, delta, 0,
dest_y, dest_cb, dest_cr, dest_y, dest_cb, dest_cr,
x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444); x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444);
mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1, mc_dir_part(h, &h->ref_list[1][refn1], n, square, height, delta, 1,
tmp_y, tmp_cb, tmp_cr, tmp_y, tmp_cb, tmp_cr,
x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444); x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444);
if(h->use_weight == 2){ if(h->use_weight == 2){
int weight0 = h->implicit_weight[refn0][refn1][s->mb_y&1]; int weight0 = h->implicit_weight[refn0][refn1][s->mb_y&1];
int weight1 = 64 - weight0; int weight1 = 64 - weight0;
luma_weight_avg( dest_y, tmp_y, h-> mb_linesize, 5, weight0, weight1, 0); luma_weight_avg( dest_y, tmp_y, h-> mb_linesize,
chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0); height, 5, weight0, weight1, 0);
chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0); chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize,
if (CHROMA422) { chroma_height, 5, weight0, weight1, 0);
chroma_weight_avg(dest_cb + chroma_height * h->mb_uvlinesize, chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize,
tmp_cb + chroma_height * h->mb_uvlinesize, chroma_height, 5, weight0, weight1, 0);
h->mb_uvlinesize, 5, weight0, weight1, 0);
chroma_weight_avg(dest_cr + chroma_height * h->mb_uvlinesize,
tmp_cr + chroma_height * h->mb_uvlinesize,
h->mb_uvlinesize, 5, weight0, weight1, 0);
}
}else{ }else{
luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom, luma_weight_avg(dest_y, tmp_y, h->mb_linesize, height, h->luma_log2_weight_denom,
h->luma_weight[refn0][0][0] , h->luma_weight[refn1][1][0], h->luma_weight[refn0][0][0] , h->luma_weight[refn1][1][0],
h->luma_weight[refn0][0][1] + h->luma_weight[refn1][1][1]); h->luma_weight[refn0][0][1] + h->luma_weight[refn1][1][1]);
chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom, chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0],
h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]);
chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0],
h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]);
if (CHROMA422) {
chroma_weight_avg(dest_cb + chroma_height * h->mb_uvlinesize,
tmp_cb + chroma_height * h->mb_uvlinesize,
h->mb_uvlinesize, h->chroma_log2_weight_denom,
h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0], h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0],
h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]); h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]);
chroma_weight_avg(dest_cr + chroma_height * h->mb_uvlinesize, chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
tmp_cr + chroma_height * h->mb_uvlinesize,
h->mb_uvlinesize, h->chroma_log2_weight_denom,
h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0], h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0],
h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]); h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]);
} }
}
}else{ }else{
int list = list1 ? 1 : 0; int list = list1 ? 1 : 0;
int refn = h->ref_cache[list][ scan8[n] ]; int refn = h->ref_cache[list][ scan8[n] ];
Picture *ref= &h->ref_list[list][refn]; Picture *ref= &h->ref_list[list][refn];
mc_dir_part(h, ref, n, square, chroma_height, delta, list, mc_dir_part(h, ref, n, square, height, delta, list,
dest_y, dest_cb, dest_cr, x_offset, y_offset, dest_y, dest_cb, dest_cr, x_offset, y_offset,
qpix_put, chroma_put, pixel_shift, chroma444); qpix_put, chroma_put, pixel_shift, chroma444);
luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom, luma_weight_op(dest_y, h->mb_linesize, height, h->luma_log2_weight_denom,
h->luma_weight[refn][list][0], h->luma_weight[refn][list][1]); h->luma_weight[refn][list][0], h->luma_weight[refn][list][1]);
if(h->use_weight_chroma){ if(h->use_weight_chroma){
chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom, chroma_weight_op(dest_cb, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]);
chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]);
if (CHROMA422) {
chroma_weight_op(dest_cb + chroma_height * h->mb_uvlinesize,
h->mb_uvlinesize, h->chroma_log2_weight_denom,
h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]); h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]);
chroma_weight_op(dest_cr + chroma_height * h->mb_uvlinesize, chroma_weight_op(dest_cr, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
h->mb_uvlinesize, h->chroma_log2_weight_denom,
h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]); h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]);
} }
} }
} }
}
static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta, static inline void mc_part(H264Context *h, int n, int square, int height, int delta,
uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
int x_offset, int y_offset, int x_offset, int y_offset,
qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
@ -684,12 +664,12 @@ static inline void mc_part(H264Context *h, int n, int square, int chroma_height,
if((h->use_weight==2 && list0 && list1 if((h->use_weight==2 && list0 && list1
&& (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ][h->s.mb_y&1] != 32)) && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ][h->s.mb_y&1] != 32))
|| h->use_weight==1) || h->use_weight==1)
mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr, mc_part_weighted(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
x_offset, y_offset, qpix_put, chroma_put, x_offset, y_offset, qpix_put, chroma_put,
weight_op[0], weight_op[3], weight_avg[0], weight_op[0], weight_op[1], weight_avg[0],
weight_avg[3], list0, list1, pixel_shift, chroma444); weight_avg[1], list0, list1, pixel_shift, chroma444);
else else
mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr, mc_part_std(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
x_offset, y_offset, qpix_put, chroma_put, qpix_avg, x_offset, y_offset, qpix_put, chroma_put, qpix_avg,
chroma_avg, list0, list1, pixel_shift, chroma444); chroma_avg, list0, list1, pixel_shift, chroma444);
} }
@ -731,31 +711,31 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
prefetch_motion(h, 0, pixel_shift, chroma444); prefetch_motion(h, 0, pixel_shift, chroma444);
if(IS_16X16(mb_type)){ if(IS_16X16(mb_type)){
mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0, mc_part(h, 0, 1, 16, 0, dest_y, dest_cb, dest_cr, 0, 0,
qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0], qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
weight_op, weight_avg, weight_op, weight_avg,
IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
pixel_shift, chroma444); pixel_shift, chroma444);
}else if(IS_16X8(mb_type)){ }else if(IS_16X8(mb_type)){
mc_part(h, 0, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0, mc_part(h, 0, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0,
qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0], qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
&weight_op[1], &weight_avg[1], weight_op, weight_avg,
IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
pixel_shift, chroma444); pixel_shift, chroma444);
mc_part(h, 8, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4, mc_part(h, 8, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4,
qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0], qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
&weight_op[1], &weight_avg[1], weight_op, weight_avg,
IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
pixel_shift, chroma444); pixel_shift, chroma444);
}else if(IS_8X16(mb_type)){ }else if(IS_8X16(mb_type)){
mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0, mc_part(h, 0, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
&weight_op[2], &weight_avg[2], &weight_op[1], &weight_avg[1],
IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
pixel_shift, chroma444); pixel_shift, chroma444);
mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0, mc_part(h, 4, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
&weight_op[2], &weight_avg[2], &weight_op[1], &weight_avg[1],
IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
pixel_shift, chroma444); pixel_shift, chroma444);
}else{ }else{
@ -770,31 +750,31 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
int y_offset= (i&2)<<1; int y_offset= (i&2)<<1;
if(IS_SUB_8X8(sub_mb_type)){ if(IS_SUB_8X8(sub_mb_type)){
mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset, mc_part(h, n, 1, 8, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
&weight_op[3], &weight_avg[3], &weight_op[1], &weight_avg[1],
IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
pixel_shift, chroma444); pixel_shift, chroma444);
}else if(IS_SUB_8X4(sub_mb_type)){ }else if(IS_SUB_8X4(sub_mb_type)){
mc_part(h, n , 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset, mc_part(h, n , 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset,
qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1], qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
&weight_op[4], &weight_avg[4], &weight_op[1], &weight_avg[1],
IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
pixel_shift, chroma444); pixel_shift, chroma444);
mc_part(h, n+2, 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2, mc_part(h, n+2, 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1], qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
&weight_op[4], &weight_avg[4], &weight_op[1], &weight_avg[1],
IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
pixel_shift, chroma444); pixel_shift, chroma444);
}else if(IS_SUB_4X8(sub_mb_type)){ }else if(IS_SUB_4X8(sub_mb_type)){
mc_part(h, n , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset, mc_part(h, n , 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
&weight_op[5], &weight_avg[5], &weight_op[2], &weight_avg[2],
IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
pixel_shift, chroma444); pixel_shift, chroma444);
mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset, mc_part(h, n+1, 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
&weight_op[5], &weight_avg[5], &weight_op[2], &weight_avg[2],
IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
pixel_shift, chroma444); pixel_shift, chroma444);
}else{ }else{
@ -803,9 +783,9 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
for(j=0; j<4; j++){ for(j=0; j<4; j++){
int sub_x_offset= x_offset + 2*(j&1); int sub_x_offset= x_offset + 2*(j&1);
int sub_y_offset= y_offset + (j&2); int sub_y_offset= y_offset + (j&2);
mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset, mc_part(h, n+j, 1, 4, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
&weight_op[6], &weight_avg[6], &weight_op[2], &weight_avg[2],
IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
pixel_shift, chroma444); pixel_shift, chroma444);
} }

@ -64,26 +64,14 @@ void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_fo
else\ else\
c->h264_chroma_dc_dequant_idct= FUNC(ff_h264_chroma422_dc_dequant_idct, depth);\ c->h264_chroma_dc_dequant_idct= FUNC(ff_h264_chroma422_dc_dequant_idct, depth);\
\ \
c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16x16, depth);\ c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16, depth);\
c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels16x8, depth);\ c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels8, depth);\
c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels8x16, depth);\ c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels4, depth);\
c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels8x8, depth);\ c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels2, depth);\
c->weight_h264_pixels_tab[4]= FUNC(weight_h264_pixels8x4, depth);\ c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16, depth);\
c->weight_h264_pixels_tab[5]= FUNC(weight_h264_pixels4x8, depth);\ c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels8, depth);\
c->weight_h264_pixels_tab[6]= FUNC(weight_h264_pixels4x4, depth);\ c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels4, depth);\
c->weight_h264_pixels_tab[7]= FUNC(weight_h264_pixels4x2, depth);\ c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels2, depth);\
c->weight_h264_pixels_tab[8]= FUNC(weight_h264_pixels2x4, depth);\
c->weight_h264_pixels_tab[9]= FUNC(weight_h264_pixels2x2, depth);\
c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16x16, depth);\
c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels16x8, depth);\
c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels8x16, depth);\
c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels8x8, depth);\
c->biweight_h264_pixels_tab[4]= FUNC(biweight_h264_pixels8x4, depth);\
c->biweight_h264_pixels_tab[5]= FUNC(biweight_h264_pixels4x8, depth);\
c->biweight_h264_pixels_tab[6]= FUNC(biweight_h264_pixels4x4, depth);\
c->biweight_h264_pixels_tab[7]= FUNC(biweight_h264_pixels4x2, depth);\
c->biweight_h264_pixels_tab[8]= FUNC(biweight_h264_pixels2x4, depth);\
c->biweight_h264_pixels_tab[9]= FUNC(biweight_h264_pixels2x2, depth);\
\ \
c->h264_v_loop_filter_luma= FUNC(h264_v_loop_filter_luma, depth);\ c->h264_v_loop_filter_luma= FUNC(h264_v_loop_filter_luma, depth);\
c->h264_h_loop_filter_luma= FUNC(h264_h_loop_filter_luma, depth);\ c->h264_h_loop_filter_luma= FUNC(h264_h_loop_filter_luma, depth);\

@ -31,16 +31,18 @@
#include "dsputil.h" #include "dsputil.h"
//typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y); //typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);
typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset); typedef void (*h264_weight_func)(uint8_t *block, int stride, int height,
typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset); int log2_denom, int weight, int offset);
typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int height,
int log2_denom, int weightd, int weights, int offset);
/** /**
* Context for storing H.264 DSP functions * Context for storing H.264 DSP functions
*/ */
typedef struct H264DSPContext{ typedef struct H264DSPContext{
/* weighted MC */ /* weighted MC */
h264_weight_func weight_h264_pixels_tab[10]; h264_weight_func weight_h264_pixels_tab[4];
h264_biweight_func biweight_h264_pixels_tab[10]; h264_biweight_func biweight_h264_pixels_tab[4];
/* loop filter */ /* loop filter */
void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0); void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0);

@ -29,14 +29,16 @@
#define op_scale1(x) block[x] = av_clip_pixel( (block[x]*weight + offset) >> log2_denom ) #define op_scale1(x) block[x] = av_clip_pixel( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x) dst[x] = av_clip_pixel( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1)) #define op_scale2(x) dst[x] = av_clip_pixel( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \ #define H264_WEIGHT(W) \
static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *_block, int stride, int log2_denom, int weight, int offset){ \ static void FUNCC(weight_h264_pixels ## W)(uint8_t *_block, int stride, int height, \
int log2_denom, int weight, int offset) \
{ \
int y; \ int y; \
pixel *block = (pixel*)_block; \ pixel *block = (pixel*)_block; \
stride /= sizeof(pixel); \ stride /= sizeof(pixel); \
offset <<= (log2_denom + (BIT_DEPTH-8)); \ offset <<= (log2_denom + (BIT_DEPTH-8)); \
if(log2_denom) offset += 1<<(log2_denom-1); \ if(log2_denom) offset += 1<<(log2_denom-1); \
for(y=0; y<H; y++, block += stride){ \ for (y = 0; y < height; y++, block += stride) { \
op_scale1(0); \ op_scale1(0); \
op_scale1(1); \ op_scale1(1); \
if(W==2) continue; \ if(W==2) continue; \
@ -58,14 +60,16 @@ static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *_block, int stride
op_scale1(15); \ op_scale1(15); \
} \ } \
} \ } \
static void FUNCC(biweight_h264_pixels ## W ## x ## H)(uint8_t *_dst, uint8_t *_src, int stride, int log2_denom, int weightd, int weights, int offset){ \ static void FUNCC(biweight_h264_pixels ## W)(uint8_t *_dst, uint8_t *_src, int stride, int height, \
int log2_denom, int weightd, int weights, int offset) \
{ \
int y; \ int y; \
pixel *dst = (pixel*)_dst; \ pixel *dst = (pixel*)_dst; \
pixel *src = (pixel*)_src; \ pixel *src = (pixel*)_src; \
stride /= sizeof(pixel); \ stride /= sizeof(pixel); \
offset <<= (BIT_DEPTH-8); \ offset <<= (BIT_DEPTH-8); \
offset = ((offset + 1) | 1) << log2_denom; \ offset = ((offset + 1) | 1) << log2_denom; \
for(y=0; y<H; y++, dst += stride, src += stride){ \ for (y = 0; y < height; y++, dst += stride, src += stride) { \
op_scale2(0); \ op_scale2(0); \
op_scale2(1); \ op_scale2(1); \
if(W==2) continue; \ if(W==2) continue; \
@ -88,16 +92,10 @@ static void FUNCC(biweight_h264_pixels ## W ## x ## H)(uint8_t *_dst, uint8_t *_
} \ } \
} }
H264_WEIGHT(16,16) H264_WEIGHT(16)
H264_WEIGHT(16,8) H264_WEIGHT(8)
H264_WEIGHT(8,16) H264_WEIGHT(4)
H264_WEIGHT(8,8) H264_WEIGHT(2)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)
#undef op_scale1 #undef op_scale1
#undef op_scale2 #undef op_scale2

@ -843,7 +843,8 @@ static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha,
} }
static av_always_inline static av_always_inline
void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset, int w, int h) void weight_h264_W_altivec(uint8_t *block, int stride, int height,
int log2_denom, int weight, int offset, int w)
{ {
int y, aligned; int y, aligned;
vec_u8 vblock; vec_u8 vblock;
@ -864,7 +865,7 @@ void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int wei
voffset = vec_splat(vtemp, 5); voffset = vec_splat(vtemp, 5);
aligned = !((unsigned long)block & 0xf); aligned = !((unsigned long)block & 0xf);
for (y=0; y<h; y++) { for (y = 0; y < height; y++) {
vblock = vec_ld(0, block); vblock = vec_ld(0, block);
v0 = (vec_s16)vec_mergeh(zero_u8v, vblock); v0 = (vec_s16)vec_mergeh(zero_u8v, vblock);
@ -888,8 +889,8 @@ void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int wei
} }
static av_always_inline static av_always_inline
void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom, void biweight_h264_W_altivec(uint8_t *dst, uint8_t *src, int stride, int height,
int weightd, int weights, int offset, int w, int h) int log2_denom, int weightd, int weights, int offset, int w)
{ {
int y, dst_aligned, src_aligned; int y, dst_aligned, src_aligned;
vec_u8 vsrc, vdst; vec_u8 vsrc, vdst;
@ -912,7 +913,7 @@ void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_
dst_aligned = !((unsigned long)dst & 0xf); dst_aligned = !((unsigned long)dst & 0xf);
src_aligned = !((unsigned long)src & 0xf); src_aligned = !((unsigned long)src & 0xf);
for (y=0; y<h; y++) { for (y = 0; y < height; y++) {
vdst = vec_ld(0, dst); vdst = vec_ld(0, dst);
vsrc = vec_ld(0, src); vsrc = vec_ld(0, src);
@ -952,19 +953,18 @@ void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_
} }
} }
#define H264_WEIGHT(W,H) \ #define H264_WEIGHT(W) \
static void ff_weight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \ static void ff_weight_h264_pixels ## W ## _altivec(uint8_t *block, int stride, int height, \
weight_h264_WxH_altivec(block, stride, log2_denom, weight, offset, W, H); \ int log2_denom, int weight, int offset){ \
weight_h264_WxH_altivec(block, stride, height, log2_denom, weight, offset, W); \
}\ }\
static void ff_biweight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \ static void ff_biweight_h264_pixels ## W ## _altivec(uint8_t *dst, uint8_t *src, int stride, int height, \
biweight_h264_WxH_altivec(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \ int log2_denom, int weightd, int weights, int offset){ \
biweight_h264_WxH_altivec(dst, src, stride, height, log2_denom, weightd, weights, offset, W); \
} }
H264_WEIGHT(16,16) H264_WEIGHT(16)
H264_WEIGHT(16, 8) H264_WEIGHT( 8)
H264_WEIGHT( 8,16)
H264_WEIGHT( 8, 8)
H264_WEIGHT( 8, 4)
void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) { void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) {
const int high_bit_depth = avctx->bits_per_raw_sample > 8; const int high_bit_depth = avctx->bits_per_raw_sample > 8;
@ -1015,16 +1015,10 @@ void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth, const int chrom
c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec; c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec;
c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec; c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec;
c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16x16_altivec; c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16_altivec;
c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels16x8_altivec; c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels8_altivec;
c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels8x16_altivec; c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16_altivec;
c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels8x8_altivec; c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_altivec;
c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels8x4_altivec;
c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16x16_altivec;
c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels16x8_altivec;
c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels8x16_altivec;
c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels8x8_altivec;
c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels8x4_altivec;
} }
} }
} }

@ -28,21 +28,20 @@ SECTION .text
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; biweight pred: ; biweight pred:
; ;
; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride, ; void h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
; int log2_denom, int weightd, int weights, ; int height, int log2_denom, int weightd,
; int offset); ; int weights, int offset);
; and ; and
; void h264_weight_16x16_sse2(uint8_t *dst, int stride, ; void h264_weight_16_sse2(uint8_t *dst, int stride, int height,
; int log2_denom, int weight, ; int log2_denom, int weight, int offset);
; int offset);
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro WEIGHT_SETUP 0 %macro WEIGHT_SETUP 0
add r4, r4 add r5, r5
inc r4 inc r5
movd m3, r3d movd m3, r4d
movd m5, r4d movd m5, r5d
movd m6, r2d movd m6, r3d
pslld m5, m6 pslld m5, m6
psrld m5, 1 psrld m5, 1
%if mmsize == 16 %if mmsize == 16
@ -71,60 +70,41 @@ SECTION .text
packuswb m0, m1 packuswb m0, m1
%endmacro %endmacro
%macro WEIGHT_FUNC_DBL_MM 1 INIT_MMX
cglobal h264_weight_16x%1_mmx2, 5, 5, 0 cglobal h264_weight_16_mmx2, 6, 6, 0
WEIGHT_SETUP WEIGHT_SETUP
mov r2, %1
%if %1 == 16
.nextrow .nextrow
WEIGHT_OP 0, 4 WEIGHT_OP 0, 4
mova [r0 ], m0 mova [r0 ], m0
WEIGHT_OP 8, 12 WEIGHT_OP 8, 12
mova [r0+8], m0 mova [r0+8], m0
add r0, r1 add r0, r1
dec r2 dec r2d
jnz .nextrow jnz .nextrow
REP_RET REP_RET
%else
jmp mangle(ff_h264_weight_16x16_mmx2.nextrow)
%endif
%endmacro
INIT_MMX %macro WEIGHT_FUNC_MM 3
WEIGHT_FUNC_DBL_MM 16 cglobal h264_weight_%1_%3, 6, 6, %2
WEIGHT_FUNC_DBL_MM 8
%macro WEIGHT_FUNC_MM 4
cglobal h264_weight_%1x%2_%4, 7, 7, %3
WEIGHT_SETUP WEIGHT_SETUP
mov r2, %2
%if %2 == 16
.nextrow .nextrow
WEIGHT_OP 0, mmsize/2 WEIGHT_OP 0, mmsize/2
mova [r0], m0 mova [r0], m0
add r0, r1 add r0, r1
dec r2 dec r2d
jnz .nextrow jnz .nextrow
REP_RET REP_RET
%else
jmp mangle(ff_h264_weight_%1x16_%4.nextrow)
%endif
%endmacro %endmacro
INIT_MMX INIT_MMX
WEIGHT_FUNC_MM 8, 16, 0, mmx2 WEIGHT_FUNC_MM 8, 0, mmx2
WEIGHT_FUNC_MM 8, 8, 0, mmx2
WEIGHT_FUNC_MM 8, 4, 0, mmx2
INIT_XMM INIT_XMM
WEIGHT_FUNC_MM 16, 16, 8, sse2 WEIGHT_FUNC_MM 16, 8, sse2
WEIGHT_FUNC_MM 16, 8, 8, sse2
%macro WEIGHT_FUNC_HALF_MM 5 %macro WEIGHT_FUNC_HALF_MM 3
cglobal h264_weight_%1x%2_%5, 5, 5, %4 cglobal h264_weight_%1_%3, 6, 6, %2
WEIGHT_SETUP WEIGHT_SETUP
mov r2, %2/2 sar r2d, 1
lea r3, [r1*2] lea r3, [r1*2]
%if %2 == mmsize
.nextrow .nextrow
WEIGHT_OP 0, r1 WEIGHT_OP 0, r1
movh [r0], m0 movh [r0], m0
@ -135,31 +115,34 @@ cglobal h264_weight_%1x%2_%5, 5, 5, %4
movh [r0+r1], m0 movh [r0+r1], m0
%endif %endif
add r0, r3 add r0, r3
dec r2 dec r2d
jnz .nextrow jnz .nextrow
REP_RET REP_RET
%else
jmp mangle(ff_h264_weight_%1x%3_%5.nextrow)
%endif
%endmacro %endmacro
INIT_MMX INIT_MMX
WEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2 WEIGHT_FUNC_HALF_MM 4, 0, mmx2
WEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2 WEIGHT_FUNC_HALF_MM 4, 0, mmx2
WEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2 WEIGHT_FUNC_HALF_MM 4, 0, mmx2
INIT_XMM INIT_XMM
WEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2 WEIGHT_FUNC_HALF_MM 8, 8, sse2
WEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2 WEIGHT_FUNC_HALF_MM 8, 8, sse2
WEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 WEIGHT_FUNC_HALF_MM 8, 8, sse2
%macro BIWEIGHT_SETUP 0 %macro BIWEIGHT_SETUP 0
add r6, 1 %ifdef ARCH_X86_64
or r6, 1 %define off_regd r11d
add r3, 1 %else
movd m3, r4d %define off_regd r3d
movd m4, r5d %endif
movd m5, r6d mov off_regd, r7m
movd m6, r3d add off_regd, 1
or off_regd, 1
add r4, 1
movd m3, r5d
movd m4, r6d
movd m5, off_regd
movd m6, r4d
pslld m5, m6 pslld m5, m6
psrld m5, 1 psrld m5, 1
%if mmsize == 16 %if mmsize == 16
@ -195,11 +178,10 @@ WEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2
packuswb m0, m1 packuswb m0, m1
%endmacro %endmacro
%macro BIWEIGHT_FUNC_DBL_MM 1 INIT_MMX
cglobal h264_biweight_16x%1_mmx2, 7, 7, 0 cglobal h264_biweight_16_mmx2, 7, 7, 0
BIWEIGHT_SETUP BIWEIGHT_SETUP
mov r3, %1 movifnidn r3d, r3m
%if %1 == 16
.nextrow .nextrow
BIWEIGHT_STEPA 0, 1, 0 BIWEIGHT_STEPA 0, 1, 0
BIWEIGHT_STEPA 1, 2, 4 BIWEIGHT_STEPA 1, 2, 4
@ -211,23 +193,14 @@ cglobal h264_biweight_16x%1_mmx2, 7, 7, 0
mova [r0+8], m0 mova [r0+8], m0
add r0, r2 add r0, r2
add r1, r2 add r1, r2
dec r3 dec r3d
jnz .nextrow jnz .nextrow
REP_RET REP_RET
%else
jmp mangle(ff_h264_biweight_16x16_mmx2.nextrow)
%endif
%endmacro
INIT_MMX %macro BIWEIGHT_FUNC_MM 3
BIWEIGHT_FUNC_DBL_MM 16 cglobal h264_biweight_%1_%3, 7, 7, %2
BIWEIGHT_FUNC_DBL_MM 8
%macro BIWEIGHT_FUNC_MM 4
cglobal h264_biweight_%1x%2_%4, 7, 7, %3
BIWEIGHT_SETUP BIWEIGHT_SETUP
mov r3, %2 movifnidn r3d, r3m
%if %2 == 16
.nextrow .nextrow
BIWEIGHT_STEPA 0, 1, 0 BIWEIGHT_STEPA 0, 1, 0
BIWEIGHT_STEPA 1, 2, mmsize/2 BIWEIGHT_STEPA 1, 2, mmsize/2
@ -235,28 +208,22 @@ cglobal h264_biweight_%1x%2_%4, 7, 7, %3
mova [r0], m0 mova [r0], m0
add r0, r2 add r0, r2
add r1, r2 add r1, r2
dec r3 dec r3d
jnz .nextrow jnz .nextrow
REP_RET REP_RET
%else
jmp mangle(ff_h264_biweight_%1x16_%4.nextrow)
%endif
%endmacro %endmacro
INIT_MMX INIT_MMX
BIWEIGHT_FUNC_MM 8, 16, 0, mmx2 BIWEIGHT_FUNC_MM 8, 0, mmx2
BIWEIGHT_FUNC_MM 8, 8, 0, mmx2
BIWEIGHT_FUNC_MM 8, 4, 0, mmx2
INIT_XMM INIT_XMM
BIWEIGHT_FUNC_MM 16, 16, 8, sse2 BIWEIGHT_FUNC_MM 16, 8, sse2
BIWEIGHT_FUNC_MM 16, 8, 8, sse2
%macro BIWEIGHT_FUNC_HALF_MM 5 %macro BIWEIGHT_FUNC_HALF_MM 3
cglobal h264_biweight_%1x%2_%5, 7, 7, %4 cglobal h264_biweight_%1_%3, 7, 7, %2
BIWEIGHT_SETUP BIWEIGHT_SETUP
mov r3, %2/2 movifnidn r3d, r3m
sar r3, 1
lea r4, [r2*2] lea r4, [r2*2]
%if %2 == mmsize
.nextrow .nextrow
BIWEIGHT_STEPA 0, 1, 0 BIWEIGHT_STEPA 0, 1, 0
BIWEIGHT_STEPA 1, 2, r2 BIWEIGHT_STEPA 1, 2, r2
@ -270,31 +237,30 @@ cglobal h264_biweight_%1x%2_%5, 7, 7, %4
%endif %endif
add r0, r4 add r0, r4
add r1, r4 add r1, r4
dec r3 dec r3d
jnz .nextrow jnz .nextrow
REP_RET REP_RET
%else
jmp mangle(ff_h264_biweight_%1x%3_%5.nextrow)
%endif
%endmacro %endmacro
INIT_MMX INIT_MMX
BIWEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2 BIWEIGHT_FUNC_HALF_MM 4, 0, mmx2
BIWEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2
BIWEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2
INIT_XMM INIT_XMM
BIWEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2 BIWEIGHT_FUNC_HALF_MM 8, 8, sse2
BIWEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2
BIWEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2
%macro BIWEIGHT_SSSE3_SETUP 0 %macro BIWEIGHT_SSSE3_SETUP 0
add r6, 1 %ifdef ARCH_X86_64
or r6, 1 %define off_regd r11d
add r3, 1 %else
movd m4, r4d %define off_regd r3d
movd m0, r5d %endif
movd m5, r6d mov off_regd, r7m
movd m6, r3d add off_regd, 1
or off_regd, 1
add r4, 1
movd m4, r5d
movd m0, r6d
movd m5, off_regd
movd m6, r4d
pslld m5, m6 pslld m5, m6
psrld m5, 1 psrld m5, 1
punpcklbw m4, m0 punpcklbw m4, m0
@ -314,12 +280,11 @@ BIWEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2
packuswb m0, m2 packuswb m0, m2
%endmacro %endmacro
%macro BIWEIGHT_SSSE3_16 1 INIT_XMM
cglobal h264_biweight_16x%1_ssse3, 7, 7, 8 cglobal h264_biweight_16_ssse3, 7, 7, 8
BIWEIGHT_SSSE3_SETUP BIWEIGHT_SSSE3_SETUP
mov r3, %1 movifnidn r3d, r3m
%if %1 == 16
.nextrow .nextrow
movh m0, [r0] movh m0, [r0]
movh m2, [r0+8] movh m2, [r0+8]
@ -330,25 +295,17 @@ cglobal h264_biweight_16x%1_ssse3, 7, 7, 8
mova [r0], m0 mova [r0], m0
add r0, r2 add r0, r2
add r1, r2 add r1, r2
dec r3 dec r3d
jnz .nextrow jnz .nextrow
REP_RET REP_RET
%else
jmp mangle(ff_h264_biweight_16x16_ssse3.nextrow)
%endif
%endmacro
INIT_XMM INIT_XMM
BIWEIGHT_SSSE3_16 16 cglobal h264_biweight_8_ssse3, 7, 7, 8
BIWEIGHT_SSSE3_16 8
%macro BIWEIGHT_SSSE3_8 1
cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
BIWEIGHT_SSSE3_SETUP BIWEIGHT_SSSE3_SETUP
mov r3, %1/2 movifnidn r3d, r3m
sar r3, 1
lea r4, [r2*2] lea r4, [r2*2]
%if %1 == 16
.nextrow .nextrow
movh m0, [r0] movh m0, [r0]
movh m1, [r1] movh m1, [r1]
@ -361,15 +318,6 @@ cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
movhps [r0+r2], m0 movhps [r0+r2], m0
add r0, r4 add r0, r4
add r1, r4 add r1, r4
dec r3 dec r3d
jnz .nextrow jnz .nextrow
REP_RET REP_RET
%else
jmp mangle(ff_h264_biweight_8x16_ssse3.nextrow)
%endif
%endmacro
INIT_XMM
BIWEIGHT_SSSE3_8 16
BIWEIGHT_SSSE3_8 8
BIWEIGHT_SSSE3_8 4

@ -36,33 +36,26 @@ cextern pw_1
SECTION .text SECTION .text
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void h264_weight(uint8_t *dst, int stride, int log2_denom, ; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom,
; int weight, int offset); ; int weight, int offset);
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%ifdef ARCH_X86_32 %macro WEIGHT_PROLOGUE 0
DECLARE_REG_TMP 2
%else
DECLARE_REG_TMP 10
%endif
%macro WEIGHT_PROLOGUE 1
mov t0, %1
.prologue .prologue
PROLOGUE 0,5,8 PROLOGUE 0,6,8
movifnidn r0, r0mp movifnidn r0, r0mp
movifnidn r1d, r1m movifnidn r1d, r1m
movifnidn r3d, r3m
movifnidn r4d, r4m movifnidn r4d, r4m
movifnidn r5d, r5m
%endmacro %endmacro
%macro WEIGHT_SETUP 1 %macro WEIGHT_SETUP 1
mova m0, [pw_1] mova m0, [pw_1]
movd m2, r2m movd m2, r3m
pslld m0, m2 ; 1<<log2_denom pslld m0, m2 ; 1<<log2_denom
SPLATW m0, m0 SPLATW m0, m0
shl r4, 19 ; *8, move to upper half of dword shl r5, 19 ; *8, move to upper half of dword
lea r4, [r4+r3*2+0x10000] lea r5, [r5+r4*2+0x10000]
movd m3, r4d ; weight<<1 | 1+(offset<<(3)) movd m3, r5d ; weight<<1 | 1+(offset<<(3))
pshufd m3, m3, 0 pshufd m3, m3, 0
mova m4, [pw_pixel_max] mova m4, [pw_pixel_max]
paddw m2, [sq_1] ; log2_denom+1 paddw m2, [sq_1] ; log2_denom+1
@ -96,8 +89,8 @@ DECLARE_REG_TMP 10
%endmacro %endmacro
%macro WEIGHT_FUNC_DBL 1 %macro WEIGHT_FUNC_DBL 1
cglobal h264_weight_16x16_10_%1 cglobal h264_weight_16_10_%1
WEIGHT_PROLOGUE 16 WEIGHT_PROLOGUE
WEIGHT_SETUP %1 WEIGHT_SETUP %1
.nextrow .nextrow
WEIGHT_OP %1, 0 WEIGHT_OP %1, 0
@ -105,13 +98,9 @@ cglobal h264_weight_16x16_10_%1
WEIGHT_OP %1, 16 WEIGHT_OP %1, 16
mova [r0+16], m5 mova [r0+16], m5
add r0, r1 add r0, r1
dec t0 dec r2d
jnz .nextrow jnz .nextrow
REP_RET REP_RET
cglobal h264_weight_16x8_10_%1
mov t0, 8
jmp mangle(ff_h264_weight_16x16_10_%1.prologue)
%endmacro %endmacro
INIT_XMM INIT_XMM
@ -120,24 +109,16 @@ WEIGHT_FUNC_DBL sse4
%macro WEIGHT_FUNC_MM 1 %macro WEIGHT_FUNC_MM 1
cglobal h264_weight_8x16_10_%1 cglobal h264_weight_8_10_%1
WEIGHT_PROLOGUE 16 WEIGHT_PROLOGUE
WEIGHT_SETUP %1 WEIGHT_SETUP %1
.nextrow .nextrow
WEIGHT_OP %1, 0 WEIGHT_OP %1, 0
mova [r0], m5 mova [r0], m5
add r0, r1 add r0, r1
dec t0 dec r2d
jnz .nextrow jnz .nextrow
REP_RET REP_RET
cglobal h264_weight_8x8_10_%1
mov t0, 8
jmp mangle(ff_h264_weight_8x16_10_%1.prologue)
cglobal h264_weight_8x4_10_%1
mov t0, 4
jmp mangle(ff_h264_weight_8x16_10_%1.prologue)
%endmacro %endmacro
INIT_XMM INIT_XMM
@ -146,8 +127,9 @@ WEIGHT_FUNC_MM sse4
%macro WEIGHT_FUNC_HALF_MM 1 %macro WEIGHT_FUNC_HALF_MM 1
cglobal h264_weight_4x8_10_%1 cglobal h264_weight_4_10_%1
WEIGHT_PROLOGUE 4 WEIGHT_PROLOGUE
sar r2d, 1
WEIGHT_SETUP %1 WEIGHT_SETUP %1
lea r3, [r1*2] lea r3, [r1*2]
.nextrow .nextrow
@ -155,17 +137,9 @@ cglobal h264_weight_4x8_10_%1
movh [r0], m5 movh [r0], m5
movhps [r0+r1], m5 movhps [r0+r1], m5
add r0, r3 add r0, r3
dec t0 dec r2d
jnz .nextrow jnz .nextrow
REP_RET REP_RET
cglobal h264_weight_4x4_10_%1
mov t0, 2
jmp mangle(ff_h264_weight_4x8_10_%1.prologue)
cglobal h264_weight_4x2_10_%1
mov t0, 1
jmp mangle(ff_h264_weight_4x8_10_%1.prologue)
%endmacro %endmacro
INIT_XMM INIT_XMM
@ -174,40 +148,40 @@ WEIGHT_FUNC_HALF_MM sse4
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int log2_denom, ; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int height,
; int weightd, int weights, int offset); ; int log2_denom, int weightd, int weights, int offset);
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%ifdef ARCH_X86_32 %ifdef ARCH_X86_32
DECLARE_REG_TMP 2,3 DECLARE_REG_TMP 3
%else %else
DECLARE_REG_TMP 10,2 DECLARE_REG_TMP 10
%endif %endif
%macro BIWEIGHT_PROLOGUE 1 %macro BIWEIGHT_PROLOGUE 0
mov t0, %1
.prologue .prologue
PROLOGUE 0,7,8 PROLOGUE 0,7,8
movifnidn r0, r0mp movifnidn r0, r0mp
movifnidn r1, r1mp movifnidn r1, r1mp
movifnidn t1d, r2m movifnidn r2d, r2m
movifnidn r4d, r4m
movifnidn r5d, r5m movifnidn r5d, r5m
movifnidn r6d, r6m movifnidn r6d, r6m
movifnidn t0d, r7m
%endmacro %endmacro
%macro BIWEIGHT_SETUP 1 %macro BIWEIGHT_SETUP 1
lea r6, [r6*4+1] ; (offset<<2)+1 lea t0, [t0*4+1] ; (offset<<2)+1
or r6, 1 or t0, 1
shl r5, 16 shl r6, 16
or r4, r5 or r5, r6
movd m4, r4d ; weightd | weights movd m4, r5d ; weightd | weights
movd m5, r6d ; (offset+1)|1 movd m5, t0d ; (offset+1)|1
movd m6, r3m ; log2_denom movd m6, r4m ; log2_denom
pslld m5, m6 ; (((offset<<2)+1)|1)<<log2_denom pslld m5, m6 ; (((offset<<2)+1)|1)<<log2_denom
paddd m6, [sq_1] paddd m6, [sq_1]
pshufd m4, m4, 0 pshufd m4, m4, 0
pshufd m5, m5, 0 pshufd m5, m5, 0
mova m3, [pw_pixel_max] mova m3, [pw_pixel_max]
movifnidn r3d, r3m
%ifnidn %1, sse4 %ifnidn %1, sse4
pxor m7, m7 pxor m7, m7
%endif %endif
@ -243,23 +217,19 @@ DECLARE_REG_TMP 10,2
%endmacro %endmacro
%macro BIWEIGHT_FUNC_DBL 1 %macro BIWEIGHT_FUNC_DBL 1
cglobal h264_biweight_16x16_10_%1 cglobal h264_biweight_16_10_%1
BIWEIGHT_PROLOGUE 16 BIWEIGHT_PROLOGUE
BIWEIGHT_SETUP %1 BIWEIGHT_SETUP %1
.nextrow .nextrow
BIWEIGHT %1, 0 BIWEIGHT %1, 0
mova [r0 ], m0 mova [r0 ], m0
BIWEIGHT %1, 16 BIWEIGHT %1, 16
mova [r0+16], m0 mova [r0+16], m0
add r0, t1 add r0, r2
add r1, t1 add r1, r2
dec t0 dec r3d
jnz .nextrow jnz .nextrow
REP_RET REP_RET
cglobal h264_biweight_16x8_10_%1
mov t0, 8
jmp mangle(ff_h264_biweight_16x16_10_%1.prologue)
%endmacro %endmacro
INIT_XMM INIT_XMM
@ -267,25 +237,17 @@ BIWEIGHT_FUNC_DBL sse2
BIWEIGHT_FUNC_DBL sse4 BIWEIGHT_FUNC_DBL sse4
%macro BIWEIGHT_FUNC 1 %macro BIWEIGHT_FUNC 1
cglobal h264_biweight_8x16_10_%1 cglobal h264_biweight_8_10_%1
BIWEIGHT_PROLOGUE 16 BIWEIGHT_PROLOGUE
BIWEIGHT_SETUP %1 BIWEIGHT_SETUP %1
.nextrow .nextrow
BIWEIGHT %1, 0 BIWEIGHT %1, 0
mova [r0], m0 mova [r0], m0
add r0, t1 add r0, r2
add r1, t1 add r1, r2
dec t0 dec r3d
jnz .nextrow jnz .nextrow
REP_RET REP_RET
cglobal h264_biweight_8x8_10_%1
mov t0, 8
jmp mangle(ff_h264_biweight_8x16_10_%1.prologue)
cglobal h264_biweight_8x4_10_%1
mov t0, 4
jmp mangle(ff_h264_biweight_8x16_10_%1.prologue)
%endmacro %endmacro
INIT_XMM INIT_XMM
@ -293,27 +255,20 @@ BIWEIGHT_FUNC sse2
BIWEIGHT_FUNC sse4 BIWEIGHT_FUNC sse4
%macro BIWEIGHT_FUNC_HALF 1 %macro BIWEIGHT_FUNC_HALF 1
cglobal h264_biweight_4x8_10_%1 cglobal h264_biweight_4_10_%1
BIWEIGHT_PROLOGUE 4 BIWEIGHT_PROLOGUE
BIWEIGHT_SETUP %1 BIWEIGHT_SETUP %1
lea r4, [t1*2] sar r3d, 1
lea r4, [r2*2]
.nextrow .nextrow
BIWEIGHT %1, 0, t1 BIWEIGHT %1, 0, r2
movh [r0 ], m0 movh [r0 ], m0
movhps [r0+t1], m0 movhps [r0+r2], m0
add r0, r4 add r0, r4
add r1, r4 add r1, r4
dec t0 dec r3d
jnz .nextrow jnz .nextrow
REP_RET REP_RET
cglobal h264_biweight_4x4_10_%1
mov t0, 2
jmp mangle(ff_h264_biweight_4x8_10_%1.prologue)
cglobal h264_biweight_4x2_10_%1
mov t0, 1
jmp mangle(ff_h264_biweight_4x8_10_%1.prologue)
%endmacro %endmacro
INIT_XMM INIT_XMM

@ -298,57 +298,47 @@ LF_IFUNC(v, luma_intra, 10, mmxext)
/***********************************/ /***********************************/
/* weighted prediction */ /* weighted prediction */
#define H264_WEIGHT(W, H, OPT) \ #define H264_WEIGHT(W, OPT) \
void ff_h264_weight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \ void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, \
int stride, int log2_denom, int weight, int offset); int stride, int height, int log2_denom, int weight, int offset);
#define H264_BIWEIGHT(W, H, OPT) \ #define H264_BIWEIGHT(W, OPT) \
void ff_h264_biweight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \ void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, \
uint8_t *src, int stride, int log2_denom, int weightd, \ uint8_t *src, int stride, int height, int log2_denom, int weightd, \
int weights, int offset); int weights, int offset);
#define H264_BIWEIGHT_MMX(W,H) \ #define H264_BIWEIGHT_MMX(W) \
H264_WEIGHT (W, H, mmx2) \ H264_WEIGHT (W, mmx2) \
H264_BIWEIGHT(W, H, mmx2) H264_BIWEIGHT(W, mmx2)
#define H264_BIWEIGHT_MMX_SSE(W,H) \ #define H264_BIWEIGHT_MMX_SSE(W) \
H264_BIWEIGHT_MMX(W, H) \ H264_BIWEIGHT_MMX(W) \
H264_WEIGHT (W, H, sse2) \ H264_WEIGHT (W, sse2) \
H264_BIWEIGHT (W, H, sse2) \ H264_BIWEIGHT (W, sse2) \
H264_BIWEIGHT (W, H, ssse3) H264_BIWEIGHT (W, ssse3)
H264_BIWEIGHT_MMX_SSE(16, 16) H264_BIWEIGHT_MMX_SSE(16)
H264_BIWEIGHT_MMX_SSE(16, 8) H264_BIWEIGHT_MMX_SSE( 8)
H264_BIWEIGHT_MMX_SSE( 8, 16) H264_BIWEIGHT_MMX ( 4)
H264_BIWEIGHT_MMX_SSE( 8, 8)
H264_BIWEIGHT_MMX_SSE( 8, 4) #define H264_WEIGHT_10(W, DEPTH, OPT) \
H264_BIWEIGHT_MMX ( 4, 8) void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
H264_BIWEIGHT_MMX ( 4, 4) int stride, int height, int log2_denom, int weight, int offset);
H264_BIWEIGHT_MMX ( 4, 2)
#define H264_BIWEIGHT_10(W, DEPTH, OPT) \
#define H264_WEIGHT_10(W, H, DEPTH, OPT) \ void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT \
void ff_h264_weight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \ (uint8_t *dst, uint8_t *src, int stride, int height, int log2_denom, \
int stride, int log2_denom, int weight, int offset);
#define H264_BIWEIGHT_10(W, H, DEPTH, OPT) \
void ff_h264_biweight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT \
(uint8_t *dst, uint8_t *src, int stride, int log2_denom, \
int weightd, int weights, int offset); int weightd, int weights, int offset);
#define H264_BIWEIGHT_10_SSE(W, H, DEPTH) \ #define H264_BIWEIGHT_10_SSE(W, DEPTH) \
H264_WEIGHT_10 (W, H, DEPTH, sse2) \ H264_WEIGHT_10 (W, DEPTH, sse2) \
H264_WEIGHT_10 (W, H, DEPTH, sse4) \ H264_WEIGHT_10 (W, DEPTH, sse4) \
H264_BIWEIGHT_10(W, H, DEPTH, sse2) \ H264_BIWEIGHT_10(W, DEPTH, sse2) \
H264_BIWEIGHT_10(W, H, DEPTH, sse4) H264_BIWEIGHT_10(W, DEPTH, sse4)
H264_BIWEIGHT_10_SSE(16, 16, 10) H264_BIWEIGHT_10_SSE(16, 10)
H264_BIWEIGHT_10_SSE(16, 8, 10) H264_BIWEIGHT_10_SSE( 8, 10)
H264_BIWEIGHT_10_SSE( 8, 16, 10) H264_BIWEIGHT_10_SSE( 4, 10)
H264_BIWEIGHT_10_SSE( 8, 8, 10)
H264_BIWEIGHT_10_SSE( 8, 4, 10)
H264_BIWEIGHT_10_SSE( 4, 8, 10)
H264_BIWEIGHT_10_SSE( 4, 4, 10)
H264_BIWEIGHT_10_SSE( 4, 2, 10)
void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chroma_format_idc) void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chroma_format_idc)
{ {
@ -394,23 +384,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext; c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext; c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
#endif #endif
c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; c->weight_h264_pixels_tab[0]= ff_h264_weight_16_mmx2;
c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; c->weight_h264_pixels_tab[1]= ff_h264_weight_8_mmx2;
c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2; c->weight_h264_pixels_tab[2]= ff_h264_weight_4_mmx2;
c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2; c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_mmx2;
c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2; c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_mmx2;
c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2; c->biweight_h264_pixels_tab[2]= ff_h264_biweight_4_mmx2;
c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
if (mm_flags&AV_CPU_FLAG_SSE2) { if (mm_flags&AV_CPU_FLAG_SSE2) {
c->h264_idct8_add = ff_h264_idct8_add_8_sse2; c->h264_idct8_add = ff_h264_idct8_add_8_sse2;
@ -422,17 +402,11 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
c->h264_idct_add16intra = ff_h264_idct_add16intra_8_sse2; c->h264_idct_add16intra = ff_h264_idct_add16intra_8_sse2;
c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2; c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2;
c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2; c->weight_h264_pixels_tab[0]= ff_h264_weight_16_sse2;
c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2; c->weight_h264_pixels_tab[1]= ff_h264_weight_8_sse2;
c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_sse2;
c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_sse2;
c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_sse2;
c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2; c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_sse2;
c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_sse2; c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_sse2;
c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_sse2;
c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2;
c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2;
#if HAVE_ALIGNED_STACK #if HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2; c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2;
@ -442,11 +416,8 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
#endif #endif
} }
if (mm_flags&AV_CPU_FLAG_SSSE3) { if (mm_flags&AV_CPU_FLAG_SSSE3) {
c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3; c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_ssse3;
c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_ssse3; c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_ssse3;
c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_ssse3;
c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3;
c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3;
} }
if (mm_flags&AV_CPU_FLAG_AVX) { if (mm_flags&AV_CPU_FLAG_AVX) {
#if HAVE_ALIGNED_STACK #if HAVE_ALIGNED_STACK
@ -485,23 +456,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2; c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2;
#endif #endif
c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse2; c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2;
c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse2; c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2;
c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse2; c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2;
c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse2;
c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse2; c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2;
c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse2; c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2;
c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse2; c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2;
c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse2;
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse2;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse2;
c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse2;
c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse2;
c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse2;
c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse2;
c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse2;
c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse2;
c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2; c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2;
c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2; c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2;
@ -513,23 +474,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
#endif #endif
} }
if (mm_flags&AV_CPU_FLAG_SSE4) { if (mm_flags&AV_CPU_FLAG_SSE4) {
c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse4; c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse4; c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4;
c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse4; c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4;
c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse4;
c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse4; c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4;
c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse4; c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4;
c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse4; c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4;
c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse4;
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse4;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse4;
c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse4;
c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse4;
c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse4;
c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse4;
c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse4;
c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse4;
} }
#if HAVE_AVX #if HAVE_AVX
if (mm_flags&AV_CPU_FLAG_AVX) { if (mm_flags&AV_CPU_FLAG_AVX) {

Loading…
Cancel
Save