diff --git a/Changelog b/Changelog index 1b59a1459e..6d47eaf01b 100644 --- a/Changelog +++ b/Changelog @@ -67,6 +67,7 @@ easier to use. The changes are: - aevalsrc audio source added - Ut Video decoder - Speex encoding via libspeex +- 4:2:2 H.264 decoding support version 0.8: diff --git a/libavcodec/arm/h264dsp_init_arm.c b/libavcodec/arm/h264dsp_init_arm.c index e51026800d..cc4c688c8b 100644 --- a/libavcodec/arm/h264dsp_init_arm.c +++ b/libavcodec/arm/h264dsp_init_arm.c @@ -32,47 +32,22 @@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); -void ff_weight_h264_pixels_16x16_neon(uint8_t *ds, int stride, int log2_den, - int weight, int offset); -void ff_weight_h264_pixels_16x8_neon(uint8_t *ds, int stride, int log2_den, - int weight, int offset); -void ff_weight_h264_pixels_8x16_neon(uint8_t *ds, int stride, int log2_den, - int weight, int offset); -void ff_weight_h264_pixels_8x8_neon(uint8_t *ds, int stride, int log2_den, - int weight, int offset); -void ff_weight_h264_pixels_8x4_neon(uint8_t *ds, int stride, int log2_den, - int weight, int offset); -void ff_weight_h264_pixels_4x8_neon(uint8_t *ds, int stride, int log2_den, - int weight, int offset); -void ff_weight_h264_pixels_4x4_neon(uint8_t *ds, int stride, int log2_den, - int weight, int offset); -void ff_weight_h264_pixels_4x2_neon(uint8_t *ds, int stride, int log2_den, - int weight, int offset); +void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height, + int log2_den, int weight, int offset); +void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height, + int log2_den, int weight, int offset); +void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height, + int log2_den, int weight, int offset); -void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride, - int log2_den, int weightd, int weights, - int offset); -void ff_biweight_h264_pixels_16x8_neon(uint8_t *dst, uint8_t *src, int stride, - int log2_den, int weightd, int weights, - int offset); -void ff_biweight_h264_pixels_8x16_neon(uint8_t *dst, uint8_t *src, int stride, - int log2_den, int weightd, int weights, - int offset); -void ff_biweight_h264_pixels_8x8_neon(uint8_t *dst, uint8_t *src, int stride, - int log2_den, int weightd, int weights, - int offset); -void ff_biweight_h264_pixels_8x4_neon(uint8_t *dst, uint8_t *src, int stride, - int log2_den, int weightd, int weights, - int offset); -void ff_biweight_h264_pixels_4x8_neon(uint8_t *dst, uint8_t *src, int stride, - int log2_den, int weightd, int weights, - int offset); -void ff_biweight_h264_pixels_4x4_neon(uint8_t *dst, uint8_t *src, int stride, - int log2_den, int weightd, int weights, - int offset); -void ff_biweight_h264_pixels_4x2_neon(uint8_t *dst, uint8_t *src, int stride, - int log2_den, int weightd, int weights, - int offset); +void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride, + int height, int log2_den, int weightd, + int weights, int offset); +void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride, + int height, int log2_den, int weightd, + int weights, int offset); +void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride, + int height, int log2_den, int weightd, + int weights, int offset); void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride); void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride); @@ -101,23 +76,14 @@ static void 
ff_h264dsp_init_neon(H264DSPContext *c, const int bit_depth, const i c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon; c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon; } - c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16x16_neon; - c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_16x8_neon; - c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_8x16_neon; - c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels_8x8_neon; - c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels_8x4_neon; - c->weight_h264_pixels_tab[5] = ff_weight_h264_pixels_4x8_neon; - c->weight_h264_pixels_tab[6] = ff_weight_h264_pixels_4x4_neon; - c->weight_h264_pixels_tab[7] = ff_weight_h264_pixels_4x2_neon; - c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon; - c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon; - c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon; - c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels_8x8_neon; - c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels_8x4_neon; - c->biweight_h264_pixels_tab[5] = ff_biweight_h264_pixels_4x8_neon; - c->biweight_h264_pixels_tab[6] = ff_biweight_h264_pixels_4x4_neon; - c->biweight_h264_pixels_tab[7] = ff_biweight_h264_pixels_4x2_neon; + c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon; + c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon; + c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon; + + c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon; + c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon; + c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon; c->h264_idct_add = ff_h264_idct_add_neon; c->h264_idct_dc_add = ff_h264_idct_dc_add_neon; diff --git a/libavcodec/arm/h264dsp_neon.S b/libavcodec/arm/h264dsp_neon.S index 338de6f643..6426f46637 100644 --- a/libavcodec/arm/h264dsp_neon.S +++ b/libavcodec/arm/h264dsp_neon.S @@ -1592,7 +1592,7 @@ endfunc vdup.8 d1, r5 vmov q2, q8 vmov q3, q8 -1: subs ip, ip, #2 +1: subs r3, r3, #2 vld1.8 {d20-d21},[r0,:128], r2 \macd q2, d0, d20 pld [r0] @@ -1632,7 +1632,7 @@ endfunc vdup.8 d1, r5 vmov q1, q8 vmov q10, q8 -1: subs ip, ip, #2 +1: subs r3, r3, #2 vld1.8 {d4},[r0,:64], r2 \macd q1, d0, d4 pld [r0] @@ -1662,7 +1662,7 @@ endfunc vdup.8 d1, r5 vmov q1, q8 vmov q10, q8 -1: subs ip, ip, #4 +1: subs r3, r3, #4 vld1.32 {d4[0]},[r0,:32], r2 vld1.32 {d4[1]},[r0,:32], r2 \macd q1, d0, d4 @@ -1700,16 +1700,17 @@ endfunc .endm .macro biweight_func w -function biweight_h264_pixels_\w\()_neon +function ff_biweight_h264_pixels_\w\()_neon, export=1 push {r4-r6, lr} - add r4, sp, #16 + ldr r12, [sp, #16] + add r4, sp, #20 ldm r4, {r4-r6} lsr lr, r4, #31 add r6, r6, #1 eors lr, lr, r5, lsr #30 orr r6, r6, #1 - vdup.16 q9, r3 - lsl r6, r6, r3 + vdup.16 q9, r12 + lsl r6, r6, r12 vmvn q9, q9 vdup.16 q8, r6 mov r6, r0 @@ -1730,34 +1731,15 @@ function biweight_h264_pixels_\w\()_neon endfunc .endm - .macro biweight_entry w, h, b=1 -function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1 - mov ip, #\h -.if \b - b biweight_h264_pixels_\w\()_neon -.endif -endfunc - .endm - - biweight_entry 16, 8 - biweight_entry 16, 16, b=0 biweight_func 16 - - biweight_entry 8, 16 - biweight_entry 8, 4 - biweight_entry 8, 8, b=0 biweight_func 8 - - biweight_entry 4, 8 - biweight_entry 4, 2 - biweight_entry 4, 4, b=0 biweight_func 4 @ Weighted prediction .macro weight_16 add - vdup.8 d0, r3 -1: subs ip, ip, #2 + vdup.8 d0, r12 +1: subs r2, r2, #2 vld1.8 {d20-d21},[r0,:128], r1 vmull.u8 q2, 
d0, d20 pld [r0] @@ -1785,8 +1767,8 @@ endfunc .endm .macro weight_8 add - vdup.8 d0, r3 -1: subs ip, ip, #2 + vdup.8 d0, r12 +1: subs r2, r2, #2 vld1.8 {d4},[r0,:64], r1 vmull.u8 q1, d0, d4 pld [r0] @@ -1806,10 +1788,10 @@ endfunc .endm .macro weight_4 add - vdup.8 d0, r3 + vdup.8 d0, r12 vmov q1, q8 vmov q10, q8 -1: subs ip, ip, #4 +1: subs r2, r2, #4 vld1.32 {d4[0]},[r0,:32], r1 vld1.32 {d4[1]},[r0,:32], r1 vmull.u8 q1, d0, d4 @@ -1842,50 +1824,32 @@ endfunc .endm .macro weight_func w -function weight_h264_pixels_\w\()_neon +function ff_weight_h264_pixels_\w\()_neon, export=1 push {r4, lr} - ldr r4, [sp, #8] - cmp r2, #1 - lsl r4, r4, r2 + ldr r12, [sp, #8] + ldr r4, [sp, #12] + cmp r3, #1 + lsl r4, r4, r3 vdup.16 q8, r4 mov r4, r0 ble 20f - rsb lr, r2, #1 + rsb lr, r3, #1 vdup.16 q9, lr - cmp r3, #0 + cmp r12, #0 blt 10f weight_\w vhadd.s16 -10: rsb r3, r3, #0 +10: rsb r12, r12, #0 weight_\w vhsub.s16 -20: rsb lr, r2, #0 +20: rsb lr, r3, #0 vdup.16 q9, lr - cmp r3, #0 + cmp r12, #0 blt 10f weight_\w vadd.s16 -10: rsb r3, r3, #0 +10: rsb r12, r12, #0 weight_\w vsub.s16 endfunc .endm - .macro weight_entry w, h, b=1 -function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1 - mov ip, #\h -.if \b - b weight_h264_pixels_\w\()_neon -.endif -endfunc - .endm - - weight_entry 16, 8 - weight_entry 16, 16, b=0 weight_func 16 - - weight_entry 8, 16 - weight_entry 8, 4 - weight_entry 8, 8, b=0 weight_func 8 - - weight_entry 4, 8 - weight_entry 4, 2 - weight_entry 4, 4, b=0 weight_func 4 diff --git a/libavcodec/fmtconvert.h b/libavcodec/fmtconvert.h index 825422bed6..c0584753cd 100644 --- a/libavcodec/fmtconvert.h +++ b/libavcodec/fmtconvert.h @@ -70,7 +70,15 @@ typedef struct FmtConvertContext { long len, int channels); /** - * Convert an array of interleaved float to multiple arrays of float. + * Convert multiple arrays of float to an array of interleaved float. + * + * @param dst destination array of interleaved float. + * constraints: 16-byte aligned + * @param src source array of float arrays, one for each channel. + * constraints: 16-byte aligned + * @param len number of elements to convert. 
+ * constraints: multiple of 8 + * @param channels number of channels */ void (*float_interleave)(float *dst, const float **src, unsigned int len, int channels); diff --git a/libavcodec/h264.c b/libavcodec/h264.c index addebce07d..4906f92ea8 100644 --- a/libavcodec/h264.c +++ b/libavcodec/h264.c @@ -460,11 +460,14 @@ static void chroma_dc_dct_c(DCTELEM *block){ } #endif -static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list, - uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, - int src_x_offset, int src_y_offset, - qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op, - int pixel_shift, int chroma444){ +static av_always_inline void +mc_dir_part(H264Context *h, Picture *pic, int n, int square, + int height, int delta, int list, + uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, + int src_x_offset, int src_y_offset, + qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op, + int pixel_shift, int chroma_idc) +{ MpegEncContext * const s = &h->s; const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8; int my= h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8; @@ -479,6 +482,7 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, const int full_my= my>>2; const int pic_width = 16*s->mb_width; const int pic_height = 16*s->mb_height >> MB_FIELD; + int ysh; if(mx&7) extra_width -= 3; if(my&7) extra_height -= 3; @@ -487,7 +491,8 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, || full_my < 0-extra_height || full_mx + 16/*FIXME*/ > pic_width + extra_width || full_my + 16/*FIXME*/ > pic_height + extra_height){ - s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_y - (2 << pixel_shift) - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height); + s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_y - (2 << pixel_shift) - 2*h->mb_linesize, h->mb_linesize, + 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height); src_y= s->edge_emu_buffer + (2 << pixel_shift) + 2*h->mb_linesize; emu=1; } @@ -499,7 +504,7 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, if(CONFIG_GRAY && s->flags&CODEC_FLAG_GRAY) return; - if(chroma444){ + if(chroma_idc == 3 /* yuv444 */){ src_cb = pic->f.data[1] + offset; if(emu){ s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb - (2 << pixel_shift) - 2*h->mb_linesize, h->mb_linesize, @@ -524,42 +529,55 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, return; } - if(MB_FIELD){ + ysh = 3 - (chroma_idc == 2 /* yuv422 */); + if(chroma_idc == 1 /* yuv420 */ && MB_FIELD){ // chroma offset when predicting from a field of opposite parity my += 2 * ((s->mb_y & 1) - (pic->f.reference - 1)); emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1); } - src_cb = pic->f.data[1] + ((mx >> 3) << pixel_shift) + (my >> 3) * h->mb_uvlinesize; - src_cr = pic->f.data[2] + ((mx >> 3) << pixel_shift) + (my >> 3) * h->mb_uvlinesize; + + src_cb = pic->f.data[1] + ((mx >> 3) << pixel_shift) + (my >> ysh) * h->mb_uvlinesize; + src_cr = pic->f.data[2] + ((mx >> 3) << pixel_shift) + (my >> ysh) * h->mb_uvlinesize; if(emu){ - s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1); + s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, + 9, 8 * chroma_idc + 1, (mx >> 3), (my >> ysh), + pic_width >> 1, pic_height >> (chroma_idc == 1 /* yuv420 */)); src_cb= 
s->edge_emu_buffer; } - chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7); + chroma_op(dest_cb, src_cb, h->mb_uvlinesize, height >> (chroma_idc == 1 /* yuv420 */), + mx&7, (my << (chroma_idc == 2 /* yuv422 */)) &7); if(emu){ - s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1); + s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, + 9, 8 * chroma_idc + 1, (mx >> 3), (my >> ysh), + pic_width >> 1, pic_height >> (chroma_idc == 1 /* yuv420 */)); src_cr= s->edge_emu_buffer; } - chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7); + chroma_op(dest_cr, src_cr, h->mb_uvlinesize, height >> (chroma_idc == 1 /* yuv420 */), + mx&7, (my << (chroma_idc == 2 /* yuv422 */)) &7); } -static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta, - uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, - int x_offset, int y_offset, - qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, - qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg, - int list0, int list1, int pixel_shift, int chroma444){ +static av_always_inline void +mc_part_std(H264Context *h, int n, int square, int height, int delta, + uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, + int x_offset, int y_offset, + qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, + qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg, + int list0, int list1, int pixel_shift, int chroma_idc) +{ MpegEncContext * const s = &h->s; qpel_mc_func *qpix_op= qpix_put; h264_chroma_mc_func chroma_op= chroma_put; dest_y += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; - if(chroma444){ + if (chroma_idc == 3 /* yuv444 */) { dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; - }else{ + } else if (chroma_idc == 2 /* yuv422 */) { + dest_cb += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize; + dest_cr += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize; + } else /* yuv420 */ { dest_cb += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize; dest_cr += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize; } @@ -568,9 +586,9 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei if(list0){ Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ]; - mc_dir_part(h, ref, n, square, chroma_height, delta, 0, + mc_dir_part(h, ref, n, square, height, delta, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset, - qpix_op, chroma_op, pixel_shift, chroma444); + qpix_op, chroma_op, pixel_shift, chroma_idc); qpix_op= qpix_avg; chroma_op= chroma_avg; @@ -578,28 +596,36 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei if(list1){ Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ]; - mc_dir_part(h, ref, n, square, chroma_height, delta, 1, + mc_dir_part(h, ref, n, square, height, delta, 1, dest_y, dest_cb, dest_cr, x_offset, y_offset, - qpix_op, chroma_op, pixel_shift, chroma444); + qpix_op, chroma_op, pixel_shift, chroma_idc); } } -static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta, - uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, - int x_offset, int y_offset, - qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, - h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op, - h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg, - int 
list0, int list1, int pixel_shift, int chroma444){ +static av_always_inline void +mc_part_weighted(H264Context *h, int n, int square, int height, int delta, + uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, + int x_offset, int y_offset, + qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, + h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op, + h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg, + int list0, int list1, int pixel_shift, int chroma_idc){ MpegEncContext * const s = &h->s; + int chroma_height; dest_y += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; - if(chroma444){ + if (chroma_idc == 3 /* yuv444 */) { + chroma_height = height; chroma_weight_avg = luma_weight_avg; chroma_weight_op = luma_weight_op; dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; - }else{ + } else if (chroma_idc == 2 /* yuv422 */) { + chroma_height = height; + dest_cb += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize; + dest_cr += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize; + } else /* yuv420 */ { + chroma_height = height >> 1; dest_cb += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize; dest_cr += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize; } @@ -615,27 +641,32 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom int refn0 = h->ref_cache[0][ scan8[n] ]; int refn1 = h->ref_cache[1][ scan8[n] ]; - mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0, + mc_dir_part(h, &h->ref_list[0][refn0], n, square, height, delta, 0, dest_y, dest_cb, dest_cr, - x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444); - mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1, + x_offset, y_offset, qpix_put, chroma_put, + pixel_shift, chroma_idc); + mc_dir_part(h, &h->ref_list[1][refn1], n, square, height, delta, 1, tmp_y, tmp_cb, tmp_cr, - x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444); + x_offset, y_offset, qpix_put, chroma_put, + pixel_shift, chroma_idc); if(h->use_weight == 2){ int weight0 = h->implicit_weight[refn0][refn1][s->mb_y&1]; int weight1 = 64 - weight0; - luma_weight_avg( dest_y, tmp_y, h-> mb_linesize, 5, weight0, weight1, 0); - chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0); - chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0); + luma_weight_avg( dest_y, tmp_y, h-> mb_linesize, + height, 5, weight0, weight1, 0); + chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, + chroma_height, 5, weight0, weight1, 0); + chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, + chroma_height, 5, weight0, weight1, 0); }else{ - luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom, + luma_weight_avg(dest_y, tmp_y, h->mb_linesize, height, h->luma_log2_weight_denom, h->luma_weight[refn0][0][0] , h->luma_weight[refn1][1][0], h->luma_weight[refn0][0][1] + h->luma_weight[refn1][1][1]); - chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom, + chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom, h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0], h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]); - chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom, + chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, chroma_height, 
h->chroma_log2_weight_denom, h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0], h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]); } @@ -643,42 +674,46 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom int list = list1 ? 1 : 0; int refn = h->ref_cache[list][ scan8[n] ]; Picture *ref= &h->ref_list[list][refn]; - mc_dir_part(h, ref, n, square, chroma_height, delta, list, + mc_dir_part(h, ref, n, square, height, delta, list, dest_y, dest_cb, dest_cr, x_offset, y_offset, - qpix_put, chroma_put, pixel_shift, chroma444); + qpix_put, chroma_put, pixel_shift, chroma_idc); - luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom, + luma_weight_op(dest_y, h->mb_linesize, height, h->luma_log2_weight_denom, h->luma_weight[refn][list][0], h->luma_weight[refn][list][1]); if(h->use_weight_chroma){ - chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom, + chroma_weight_op(dest_cb, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom, h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]); - chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom, + chroma_weight_op(dest_cr, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom, h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]); } } } -static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta, - uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, - int x_offset, int y_offset, - qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, - qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg, - h264_weight_func *weight_op, h264_biweight_func *weight_avg, - int list0, int list1, int pixel_shift, int chroma444){ +static av_always_inline void +mc_part(H264Context *h, int n, int square, int height, int delta, + uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, + int x_offset, int y_offset, + qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, + qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg, + h264_weight_func *weight_op, h264_biweight_func *weight_avg, + int list0, int list1, int pixel_shift, int chroma_idc) +{ if((h->use_weight==2 && list0 && list1 && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ][h->s.mb_y&1] != 32)) || h->use_weight==1) - mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr, + mc_part_weighted(h, n, square, height, delta, dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_put, chroma_put, - weight_op[0], weight_op[3], weight_avg[0], - weight_avg[3], list0, list1, pixel_shift, chroma444); + weight_op[0], weight_op[1], weight_avg[0], + weight_avg[1], list0, list1, pixel_shift, chroma_idc); else - mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr, + mc_part_std(h, n, square, height, delta, dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_put, chroma_put, qpix_avg, - chroma_avg, list0, list1, pixel_shift, chroma444); + chroma_avg, list0, list1, pixel_shift, chroma_idc); } -static inline void prefetch_motion(H264Context *h, int list, int pixel_shift, int chroma444){ +static av_always_inline void +prefetch_motion(H264Context *h, int list, int pixel_shift, int chroma_idc) +{ /* fetch pixels for estimated mv 4 macroblocks ahead * optimized for 64byte cache lines */ MpegEncContext * const s = &h->s; @@ -689,7 +724,7 @@ static inline void prefetch_motion(H264Context *h, int list, int pixel_shift, in uint8_t **src = h->ref_list[list][refn].f.data; int off= 
(mx << pixel_shift) + (my + (s->mb_x&3)*4)*h->mb_linesize + (64 << pixel_shift); s->dsp.prefetch(src[0]+off, s->linesize, 4); - if(chroma444){ + if (chroma_idc == 3 /* yuv444 */) { s->dsp.prefetch(src[1]+off, s->linesize, 4); s->dsp.prefetch(src[2]+off, s->linesize, 4); }else{ @@ -703,7 +738,8 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put), qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg), h264_weight_func *weight_op, h264_biweight_func *weight_avg, - int pixel_shift, int chroma444){ + int pixel_shift, int chroma_idc) +{ MpegEncContext * const s = &h->s; const int mb_xy= h->mb_xy; const int mb_type = s->current_picture.f.mb_type[mb_xy]; @@ -712,36 +748,36 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t if(HAVE_THREADS && (s->avctx->active_thread_type & FF_THREAD_FRAME)) await_references(h); - prefetch_motion(h, 0, pixel_shift, chroma444); + prefetch_motion(h, 0, pixel_shift, chroma_idc); if(IS_16X16(mb_type)){ - mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0, + mc_part(h, 0, 1, 16, 0, dest_y, dest_cb, dest_cr, 0, 0, qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0], weight_op, weight_avg, IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), - pixel_shift, chroma444); + pixel_shift, chroma_idc); }else if(IS_16X8(mb_type)){ - mc_part(h, 0, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0, + mc_part(h, 0, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0, qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0], - &weight_op[1], &weight_avg[1], + weight_op, weight_avg, IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), - pixel_shift, chroma444); - mc_part(h, 8, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4, + pixel_shift, chroma_idc); + mc_part(h, 8, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4, qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0], - &weight_op[1], &weight_avg[1], + weight_op, weight_avg, IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), - pixel_shift, chroma444); + pixel_shift, chroma_idc); }else if(IS_8X16(mb_type)){ - mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0, + mc_part(h, 0, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0, qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], - &weight_op[2], &weight_avg[2], + &weight_op[1], &weight_avg[1], IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), - pixel_shift, chroma444); - mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0, + pixel_shift, chroma_idc); + mc_part(h, 4, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0, qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], - &weight_op[2], &weight_avg[2], + &weight_op[1], &weight_avg[1], IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), - pixel_shift, chroma444); + pixel_shift, chroma_idc); }else{ int i; @@ -754,50 +790,72 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t int y_offset= (i&2)<<1; if(IS_SUB_8X8(sub_mb_type)){ - mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset, + mc_part(h, n, 1, 8, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], - &weight_op[3], &weight_avg[3], + &weight_op[1], &weight_avg[1], IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), - pixel_shift, chroma444); + pixel_shift, chroma_idc); }else if(IS_SUB_8X4(sub_mb_type)){ - mc_part(h, n , 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset, + 
mc_part(h, n , 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1], - &weight_op[4], &weight_avg[4], + &weight_op[1], &weight_avg[1], IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), - pixel_shift, chroma444); - mc_part(h, n+2, 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2, + pixel_shift, chroma_idc); + mc_part(h, n+2, 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2, qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1], - &weight_op[4], &weight_avg[4], + &weight_op[1], &weight_avg[1], IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), - pixel_shift, chroma444); + pixel_shift, chroma_idc); }else if(IS_SUB_4X8(sub_mb_type)){ - mc_part(h, n , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset, + mc_part(h, n , 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], - &weight_op[5], &weight_avg[5], + &weight_op[2], &weight_avg[2], IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), - pixel_shift, chroma444); - mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset, + pixel_shift, chroma_idc); + mc_part(h, n+1, 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset, qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], - &weight_op[5], &weight_avg[5], + &weight_op[2], &weight_avg[2], IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), - pixel_shift, chroma444); + pixel_shift, chroma_idc); }else{ int j; assert(IS_SUB_4X4(sub_mb_type)); for(j=0; j<4; j++){ int sub_x_offset= x_offset + 2*(j&1); int sub_y_offset= y_offset + (j&2); - mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset, + mc_part(h, n+j, 1, 4, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset, qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], - &weight_op[6], &weight_avg[6], + &weight_op[2], &weight_avg[2], IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), - pixel_shift, chroma444); + pixel_shift, chroma_idc); } } } } - prefetch_motion(h, 1, pixel_shift, chroma444); + prefetch_motion(h, 1, pixel_shift, chroma_idc); +} + +static av_always_inline void +hl_motion_420(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, + qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put), + qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg), + h264_weight_func *weight_op, h264_biweight_func *weight_avg, + int pixel_shift) +{ + hl_motion(h, dest_y, dest_cb, dest_cr, qpix_put, chroma_put, + qpix_avg, chroma_avg, weight_op, weight_avg, pixel_shift, 1); +} + +static av_always_inline void +hl_motion_422(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, + qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put), + qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg), + h264_weight_func *weight_op, h264_biweight_func *weight_avg, + int pixel_shift) +{ + hl_motion(h, dest_y, dest_cb, dest_cr, qpix_put, chroma_put, + qpix_avg, chroma_avg, weight_op, weight_avg, pixel_shift, 2); } static void free_tables(H264Context *h, int free_rbsp){ @@ -1468,7 +1526,10 @@ static void decode_postinit(H264Context *h, int setup_finished){ ff_thread_finish_setup(s->avctx); } -static av_always_inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){ +static av_always_inline void backup_mb_border(H264Context *h, 
uint8_t *src_y,
+                                              uint8_t *src_cb, uint8_t *src_cr,
+                                              int linesize, int uvlinesize, int simple)
+{
     MpegEncContext * const s = &h->s;
     uint8_t *top_border;
     int top_idx = 1;
@@ -1813,7 +1874,8 @@ static av_always_inline void hl_decode_mb_idct_luma(H264Context *h, int mb_type,
     }
 }
 
-static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, int pixel_shift){
+static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, int pixel_shift)
+{
     MpegEncContext * const s = &h->s;
     const int mb_x= s->mb_x;
     const int mb_y= s->mb_y;
@@ -1827,7 +1889,8 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
     /* is_h264 should always be true if SVQ3 is disabled. */
     const int is_h264 = !CONFIG_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264;
     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
-    const int block_h = 16>>s->chroma_y_shift;
+    const int block_h = 16 >> s->chroma_y_shift;
+    const int chroma422 = CHROMA422;
 
     dest_y = s->current_picture.f.data[0] + ((mb_x << pixel_shift) + mb_y * s->linesize ) * 16;
     dest_cb = s->current_picture.f.data[1] + (mb_x << pixel_shift)*8 + mb_y * s->uvlinesize * block_h;
@@ -1844,8 +1907,8 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
         block_offset = &h->block_offset[48];
         if(mb_y&1){ //FIXME move out of this function?
             dest_y -= s->linesize*15;
-            dest_cb-= s->uvlinesize*(block_h-1);
-            dest_cr-= s->uvlinesize*(block_h-1);
+            dest_cb-= s->uvlinesize * (block_h - 1);
+            dest_cr-= s->uvlinesize * (block_h - 1);
         }
         if(FRAME_MBAFF) {
             int list;
@@ -1884,7 +1947,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
             }
             if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
                 if (!h->sps.chroma_format_idc) {
-                    for (i = 0; i < 8; i++) {
+                    for (i = 0; i < block_h; i++) {
                         uint16_t *tmp_cb = (uint16_t*)(dest_cb + i*uvlinesize);
                         uint16_t *tmp_cr = (uint16_t*)(dest_cr + i*uvlinesize);
                         for (j = 0; j < 8; j++) {
@@ -1911,13 +1974,13 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
             if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
                 if (!h->sps.chroma_format_idc) {
                     for (i=0; i<8; i++) {
-                        memset(dest_cb+ i*uvlinesize, 1 << (bit_depth - 1), 8);
-                        memset(dest_cr+ i*uvlinesize, 1 << (bit_depth - 1), 8);
+                        memset(dest_cb + i*uvlinesize, 1 << (bit_depth - 1), 8);
+                        memset(dest_cr + i*uvlinesize, 1 << (bit_depth - 1), 8);
                     }
                 } else {
                     for (i=0; i<block_h; i++) {
-                        memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4, 8);
-                        memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4, 8);
+                        memcpy(dest_cb + i*uvlinesize, h->mb + 128 + i*4, 8);
+                        memcpy(dest_cr + i*uvlinesize, h->mb + 160 + i*4, 8);
                     }
                 }
             }
@@ -1937,11 +2000,21 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
         if(h->deblocking_filter)
             xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, 0, simple, pixel_shift);
     }else if(is_h264){
-        hl_motion(h, dest_y, dest_cb, dest_cr,
-                  s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
-                  s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
-                  h->h264dsp.weight_h264_pixels_tab,
-                  h->h264dsp.biweight_h264_pixels_tab, pixel_shift, 0);
+        if (chroma422) {
+            hl_motion_422(h, dest_y, dest_cb, dest_cr,
+                          s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
+                          s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
+                          h->h264dsp.weight_h264_pixels_tab,
+                          h->h264dsp.biweight_h264_pixels_tab,
+                          pixel_shift);
+        } else {
+            hl_motion_420(h, dest_y, dest_cb, dest_cr,
+                          s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
+                          s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
+                          h->h264dsp.weight_h264_pixels_tab,
+                          h->h264dsp.biweight_h264_pixels_tab,
+                          pixel_shift);
+        }
     }
 
     hl_decode_mb_idct_luma(h, mb_type, is_h264, simple, transform_bypass, pixel_shift, block_offset, linesize, dest_y, 0);
@@ -1959,14 +2032,20 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
                         if(h->non_zero_count_cache[ scan8[i] ] || dctcoef_get(h->mb, pixel_shift, i*16))
                             idct_add (dest[j-1] + block_offset[i], h->mb + (i*16 << pixel_shift), uvlinesize);
                     }
+                    if (chroma422) {
+                        for(i=j*16+4; i<j*16+8; i++){
+                            if(h->non_zero_count_cache[ scan8[i] ] || dctcoef_get(h->mb, pixel_shift, i*16))
+                                idct_add (dest[j-1] + block_offset[i+4], h->mb + (i*16 << pixel_shift), uvlinesize);
+                        }
+                    }
                 }
             }
         }else{
            if(is_h264){
                int qp[2];
-                if (CHROMA422) {
-                    qp[0] = h->chroma_qp[0]+3;
-                    qp[1] = h->chroma_qp[1]+3;
+                if (chroma422) {
+                    qp[0] = h->chroma_qp[0] + 3;
+                    qp[1] = h->chroma_qp[1] + 3;
                 } else {
                     qp[0] = h->chroma_qp[0];
                     qp[1] = h->chroma_qp[1];
@@ -2086,7 +2165,7 @@ static av_always_inline void hl_decode_mb_444_internal(H264Context *h, int simpl
                   s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
                   s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
                   h->h264dsp.weight_h264_pixels_tab,
-                  h->h264dsp.biweight_h264_pixels_tab, pixel_shift, 1);
+                  h->h264dsp.biweight_h264_pixels_tab, pixel_shift, 3);
     }
 
     for (p = 0; p < plane_count; p++)
@@ -2690,6 +2769,8 @@ static int decode_slice_header(H264Context *h, H264Context *h0){
         case 9 :
             if (CHROMA444)
                 s->avctx->pix_fmt = PIX_FMT_YUV444P9;
+            else if (CHROMA422)
+                s->avctx->pix_fmt = PIX_FMT_YUV422P9;
             else
                 s->avctx->pix_fmt = PIX_FMT_YUV420P9;
             break;
@@ -2708,7 +2789,7 @@ static int decode_slice_header(H264Context *h, H264Context *h0){
                     s->avctx->pix_fmt = PIX_FMT_GBR24P;
                     av_log(h->s.avctx, AV_LOG_DEBUG, "Detected GBR colorspace.\n");
                 }
-            }else if (CHROMA422) {
+            } else if (CHROMA422) {
                 s->avctx->pix_fmt = s->avctx->color_range == AVCOL_RANGE_JPEG ? PIX_FMT_YUVJ422P : PIX_FMT_YUV422P;
             }else{
                 s->avctx->pix_fmt = s->avctx->get_format(s->avctx,
@@ -3384,7 +3465,7 @@ static void loop_filter(H264Context *h, int start_x, int end_x){
     const int end_mb_y= s->mb_y + FRAME_MBAFF;
     const int old_slice_type= h->slice_type;
     const int pixel_shift = h->pixel_shift;
-    const int block_h = 16>>s->chroma_y_shift;
+    const int block_h = 16 >> s->chroma_y_shift;
 
     if(h->deblocking_filter) {
         for(mb_x= start_x; mb_x<end_x; mb_x++){
@@ ... @@ static void loop_filter(H264Context *h, int start_x, int end_x){
             s->mb_x= mb_x;
             s->mb_y= mb_y;
             dest_y = s->current_picture.f.data[0] + ((mb_x << pixel_shift) + mb_y * s->linesize ) * 16;
-            dest_cb = s->current_picture.f.data[1] + (mb_x << pixel_shift)*(8<<CHROMA444) + mb_y * s->uvlinesize * block_h;
-            dest_cr = s->current_picture.f.data[2] + (mb_x << pixel_shift)*(8<<CHROMA444) + mb_y * s->uvlinesize * block_h;
+            dest_cb = s->current_picture.f.data[1] + (mb_x << pixel_shift) * (8 << CHROMA444) + mb_y * s->uvlinesize * block_h;
+            dest_cr = s->current_picture.f.data[2] + (mb_x << pixel_shift) * (8 << CHROMA444) + mb_y * s->uvlinesize * block_h;
             //FIXME simplify above
             if (MB_FIELD) {
@@ -3410,8 +3491,8 @@ static void loop_filter(H264Context *h, int start_x, int end_x){
                 uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
                 if(mb_y&1){ //FIXME move out of this function?
dest_y -= s->linesize*15; - dest_cb-= s->uvlinesize*(block_h-1); - dest_cr-= s->uvlinesize*(block_h-1); + dest_cb-= s->uvlinesize * (block_h - 1); + dest_cr-= s->uvlinesize * (block_h - 1); } } else { linesize = h->mb_linesize = s->linesize; diff --git a/libavcodec/h264_cabac.c b/libavcodec/h264_cabac.c index f4cae4621f..31c2658a6b 100644 --- a/libavcodec/h264_cabac.c +++ b/libavcodec/h264_cabac.c @@ -1565,7 +1565,12 @@ DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8)[63] = { 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8 }; -static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) { +static av_always_inline void +decode_cabac_residual_internal(H264Context *h, DCTELEM *block, + int cat, int n, const uint8_t *scantable, + const uint32_t *qmul, int max_coeff, + int is_dc, int chroma422) +{ static const int significant_coeff_flag_offset[2][14] = { { 105+0, 105+15, 105+29, 105+44, 105+47, 402, 484+0, 484+15, 484+29, 660, 528+0, 528+15, 528+29, 718 }, { 277+0, 277+15, 277+29, 277+44, 277+47, 436, 776+0, 776+15, 776+29, 675, 820+0, 820+15, 820+29, 733 } @@ -1593,7 +1598,10 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT * map node ctx => cabac ctx for level=1 */ static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 }; /* map node ctx => cabac ctx for level>1 */ - static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 }; + static const uint8_t coeff_abs_levelgt1_ctx[2][8] = { + { 5, 5, 5, 5, 6, 7, 8, 9 }, + { 5, 5, 5, 5, 6, 7, 8, 8 }, // 422/dc case + }; static const uint8_t coeff_abs_level_transition[2][8] = { /* update node ctx after decoding a level=1 */ { 1, 2, 3, 3, 4, 5, 6, 7 }, @@ -1652,7 +1660,7 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, last_coeff_ctx_base, sig_off); } else { - if (is_dc && max_coeff == 8) { // dc 422 + if (is_dc && chroma422) { // dc 422 DECODE_SIGNIFICANCE(7, sig_coeff_offset_dc[last], sig_coeff_offset_dc[last]); } else { coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index, @@ -1661,7 +1669,7 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT #else DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] ); } else { - if (is_dc && max_coeff == 8) { // dc 422 + if (is_dc && chroma422) { // dc 422 DECODE_SIGNIFICANCE(7, sig_coeff_offset_dc[last], sig_coeff_offset_dc[last]); } else { DECODE_SIGNIFICANCE(max_coeff - 1, last, last); @@ -1701,9 +1709,7 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT } \ } else { \ int coeff_abs = 2; \ - if (is_dc && max_coeff == 8) \ - node_ctx = FFMIN(node_ctx, 6); \ - ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base; \ + ctx = coeff_abs_levelgt1_ctx[is_dc && chroma422][node_ctx] + abs_level_m1_ctx_base; \ node_ctx = coeff_abs_level_transition[1][node_ctx]; \ \ while( coeff_abs < 15 && get_cabac( CC, ctx ) ) { \ @@ -1745,11 +1751,18 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT } static void decode_cabac_residual_dc_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, int max_coeff ) { - decode_cabac_residual_internal(h, block, cat, n, scantable, NULL, max_coeff, 1); + decode_cabac_residual_internal(h, block, cat, n, 
scantable, NULL, max_coeff, 1, 0); +} + +static void decode_cabac_residual_dc_internal_422(H264Context *h, DCTELEM *block, + int cat, int n, const uint8_t *scantable, + int max_coeff) +{ + decode_cabac_residual_internal(h, block, cat, n, scantable, NULL, max_coeff, 1, 1); } static void decode_cabac_residual_nondc_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) { - decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0); + decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0, 0); } /* cat: 0-> DC 16x16 n = 0 @@ -1773,6 +1786,19 @@ static av_always_inline void decode_cabac_residual_dc( H264Context *h, DCTELEM * decode_cabac_residual_dc_internal( h, block, cat, n, scantable, max_coeff ); } +static av_always_inline void +decode_cabac_residual_dc_422(H264Context *h, DCTELEM *block, + int cat, int n, const uint8_t *scantable, + int max_coeff) +{ + /* read coded block flag */ + if (get_cabac(&h->cabac, &h->cabac_state[get_cabac_cbf_ctx(h, cat, n, max_coeff, 1)]) == 0) { + h->non_zero_count_cache[scan8[n]] = 0; + return; + } + decode_cabac_residual_dc_internal_422(h, block, cat, n, scantable, max_coeff); +} + static av_always_inline void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) { /* read coded block flag */ if( (cat != 5 || CHROMA444) && get_cabac( &h->cabac, &h->cabac_state[get_cabac_cbf_ctx( h, cat, n, max_coeff, 0 ) ] ) == 0 ) { @@ -2325,17 +2351,14 @@ decode_intra_mb: if(CHROMA444){ decode_cabac_luma_residual(h, scan, scan8x8, pixel_shift, mb_type, cbp, 1); decode_cabac_luma_residual(h, scan, scan8x8, pixel_shift, mb_type, cbp, 2); - } else { - const int num_c8x8 = h->sps.chroma_format_idc; - + } else if (CHROMA422) { if( cbp&0x30 ){ int c; for( c = 0; c < 2; c++ ) { //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c ); - decode_cabac_residual_dc(h, h->mb + ((256 + 16*16*c) << pixel_shift), 3, - CHROMA_DC_BLOCK_INDEX+c, - CHROMA422 ? chroma422_dc_scan : chroma_dc_scan, - 4*num_c8x8); + decode_cabac_residual_dc_422(h, h->mb + ((256 + 16*16*c) << pixel_shift), 3, + CHROMA_DC_BLOCK_INDEX + c, + chroma422_dc_scan, 8); } } @@ -2344,7 +2367,7 @@ decode_intra_mb: for( c = 0; c < 2; c++ ) { DCTELEM *mb = h->mb + (16*(16 + 16*c) << pixel_shift); qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]]; - for (i8x8 = 0; i8x8 < num_c8x8; i8x8++) { + for (i8x8 = 0; i8x8 < 2; i8x8++) { for (i = 0; i < 4; i++) { const int index = 16 + 16 * c + 8*i8x8 + i; //av_log(s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16); @@ -2357,6 +2380,29 @@ decode_intra_mb: fill_rectangle(&h->non_zero_count_cache[scan8[16]], 4, 4, 8, 0, 1); fill_rectangle(&h->non_zero_count_cache[scan8[32]], 4, 4, 8, 0, 1); } + } else /* yuv420 */ { + if( cbp&0x30 ){ + int c; + for( c = 0; c < 2; c++ ) { + //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c ); + decode_cabac_residual_dc(h, h->mb + ((256 + 16*16*c) << pixel_shift), 3, CHROMA_DC_BLOCK_INDEX+c, chroma_dc_scan, 4); + } + } + + if( cbp&0x20 ) { + int c, i; + for( c = 0; c < 2; c++ ) { + qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 
0:3)][h->chroma_qp[c]]; + for( i = 0; i < 4; i++ ) { + const int index = 16 + 16 * c + i; + //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 ); + decode_cabac_residual_nondc(h, h->mb + (16*index << pixel_shift), 4, index, scan + 1, qmul, 15); + } + } + } else { + fill_rectangle(&h->non_zero_count_cache[scan8[16]], 4, 4, 8, 0, 1); + fill_rectangle(&h->non_zero_count_cache[scan8[32]], 4, 4, 8, 0, 1); + } } } else { fill_rectangle(&h->non_zero_count_cache[scan8[ 0]], 4, 4, 8, 0, 1); diff --git a/libavcodec/h264_ps.c b/libavcodec/h264_ps.c index 93697a83c1..a7de122c53 100644 --- a/libavcodec/h264_ps.c +++ b/libavcodec/h264_ps.c @@ -415,7 +415,7 @@ int ff_h264_decode_seq_parameter_set(H264Context *h){ #endif sps->crop= get_bits1(&s->gb); if(sps->crop){ - int crop_vertical_limit = sps->chroma_format_idc & 2 ? 16 : 8; + int crop_vertical_limit = sps->chroma_format_idc & 2 ? 16 : 8; int crop_horizontal_limit = sps->chroma_format_idc == 3 ? 16 : 8; sps->crop_left = get_ue_golomb(&s->gb); sps->crop_right = get_ue_golomb(&s->gb); diff --git a/libavcodec/h264dsp.c b/libavcodec/h264dsp.c index cf0067b8f0..bd35aa3065 100644 --- a/libavcodec/h264dsp.c +++ b/libavcodec/h264dsp.c @@ -64,26 +64,14 @@ void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_fo else\ c->h264_chroma_dc_dequant_idct= FUNC(ff_h264_chroma422_dc_dequant_idct, depth);\ \ - c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16x16, depth);\ - c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels16x8, depth);\ - c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels8x16, depth);\ - c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels8x8, depth);\ - c->weight_h264_pixels_tab[4]= FUNC(weight_h264_pixels8x4, depth);\ - c->weight_h264_pixels_tab[5]= FUNC(weight_h264_pixels4x8, depth);\ - c->weight_h264_pixels_tab[6]= FUNC(weight_h264_pixels4x4, depth);\ - c->weight_h264_pixels_tab[7]= FUNC(weight_h264_pixels4x2, depth);\ - c->weight_h264_pixels_tab[8]= FUNC(weight_h264_pixels2x4, depth);\ - c->weight_h264_pixels_tab[9]= FUNC(weight_h264_pixels2x2, depth);\ - c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16x16, depth);\ - c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels16x8, depth);\ - c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels8x16, depth);\ - c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels8x8, depth);\ - c->biweight_h264_pixels_tab[4]= FUNC(biweight_h264_pixels8x4, depth);\ - c->biweight_h264_pixels_tab[5]= FUNC(biweight_h264_pixels4x8, depth);\ - c->biweight_h264_pixels_tab[6]= FUNC(biweight_h264_pixels4x4, depth);\ - c->biweight_h264_pixels_tab[7]= FUNC(biweight_h264_pixels4x2, depth);\ - c->biweight_h264_pixels_tab[8]= FUNC(biweight_h264_pixels2x4, depth);\ - c->biweight_h264_pixels_tab[9]= FUNC(biweight_h264_pixels2x2, depth);\ + c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16, depth);\ + c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels8, depth);\ + c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels4, depth);\ + c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels2, depth);\ + c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16, depth);\ + c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels8, depth);\ + c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels4, depth);\ + c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels2, depth);\ \ c->h264_v_loop_filter_luma= FUNC(h264_v_loop_filter_luma, depth);\ c->h264_h_loop_filter_luma= FUNC(h264_h_loop_filter_luma, depth);\ diff --git a/libavcodec/h264dsp.h 
b/libavcodec/h264dsp.h index c79ab0a625..490a936310 100644 --- a/libavcodec/h264dsp.h +++ b/libavcodec/h264dsp.h @@ -31,16 +31,18 @@ #include "dsputil.h" //typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y); -typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset); -typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset); +typedef void (*h264_weight_func)(uint8_t *block, int stride, int height, + int log2_denom, int weight, int offset); +typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int height, + int log2_denom, int weightd, int weights, int offset); /** * Context for storing H.264 DSP functions */ typedef struct H264DSPContext{ /* weighted MC */ - h264_weight_func weight_h264_pixels_tab[10]; - h264_biweight_func biweight_h264_pixels_tab[10]; + h264_weight_func weight_h264_pixels_tab[4]; + h264_biweight_func biweight_h264_pixels_tab[4]; /* loop filter */ void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0); diff --git a/libavcodec/h264dsp_template.c b/libavcodec/h264dsp_template.c index 3023541634..4d5faf01c0 100644 --- a/libavcodec/h264dsp_template.c +++ b/libavcodec/h264dsp_template.c @@ -29,14 +29,16 @@ #define op_scale1(x) block[x] = av_clip_pixel( (block[x]*weight + offset) >> log2_denom ) #define op_scale2(x) dst[x] = av_clip_pixel( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1)) -#define H264_WEIGHT(W,H) \ -static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *p_block, int stride, int log2_denom, int weight, int offset){ \ +#define H264_WEIGHT(W) \ +static void FUNCC(weight_h264_pixels ## W)(uint8_t *_block, int stride, int height, \ + int log2_denom, int weight, int offset) \ +{ \ int y; \ - pixel *block = (pixel*)p_block; \ + pixel *block = (pixel*)_block; \ stride >>= sizeof(pixel)-1; \ offset <<= (log2_denom + (BIT_DEPTH-8)); \ if(log2_denom) offset += 1<<(log2_denom-1); \ - for(y=0; y>= sizeof(pixel)-1; \ offset <<= (BIT_DEPTH-8); \ offset = ((offset + 1) | 1) << log2_denom; \ - for(y=0; y> 8; block[stride*3+offset]= ((z0 - z3)*qmul + 128) >> 8; } - -#if 0 - av_log(NULL, AV_LOG_INFO, "after chroma dc\n"); - for (int i = 0; i < 256; i++) { - av_log(NULL, AV_LOG_INFO, "%5d ", block[i]); - if (!((i+1) % 16)) - av_log(NULL, AV_LOG_INFO, "\n"); - } -#endif } -void FUNCC(ff_h264_chroma_dc_dequant_idct)(DCTELEM *p_block, int qmul){ +void FUNCC(ff_h264_chroma_dc_dequant_idct)(DCTELEM *_block, int qmul){ const int stride= 16*2; const int xStride= 16; int a,b,c,d,e; - dctcoef *block = (dctcoef*)p_block; + dctcoef *block = (dctcoef*)_block; a= block[stride*0 + xStride*0]; b= block[stride*0 + xStride*1]; diff --git a/libavcodec/h264pred.c b/libavcodec/h264pred.c index dce29f9f0b..a174b4ca3c 100644 --- a/libavcodec/h264pred.c +++ b/libavcodec/h264pred.c @@ -462,10 +462,10 @@ void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth, co h->pred8x8[DC_PRED8x8 ]= FUNCC(pred8x16_dc , depth);\ h->pred8x8[LEFT_DC_PRED8x8]= FUNCC(pred8x16_left_dc , depth);\ h->pred8x8[TOP_DC_PRED8x8 ]= FUNCC(pred8x16_top_dc , depth);\ - h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_l0t, depth);\ - h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_0lt, depth);\ - h->pred8x8[ALZHEIMER_DC_L00_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_l00, depth);\ - h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8 ]= 
FUNC(pred8x8_mad_cow_dc_0l0, depth);\ + h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8 ]= FUNC(pred8x16_mad_cow_dc_l0t, depth);\ + h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8 ]= FUNC(pred8x16_mad_cow_dc_0lt, depth);\ + h->pred8x8[ALZHEIMER_DC_L00_PRED8x8 ]= FUNC(pred8x16_mad_cow_dc_l00, depth);\ + h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8 ]= FUNC(pred8x16_mad_cow_dc_0l0, depth);\ }\ }else{\ h->pred8x8[DC_PRED8x8 ]= FUNCD(pred8x8_dc_rv40);\ @@ -510,8 +510,13 @@ void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth, co h->pred4x4_add [ HOR_PRED ]= FUNCC(pred4x4_horizontal_add , depth);\ h->pred8x8l_add [VERT_PRED ]= FUNCC(pred8x8l_vertical_add , depth);\ h->pred8x8l_add [ HOR_PRED ]= FUNCC(pred8x8l_horizontal_add , depth);\ + if (chroma_format_idc == 1) {\ h->pred8x8_add [VERT_PRED8x8]= FUNCC(pred8x8_vertical_add , depth);\ h->pred8x8_add [ HOR_PRED8x8]= FUNCC(pred8x8_horizontal_add , depth);\ + } else {\ + h->pred8x8_add [VERT_PRED8x8]= FUNCC(pred8x16_vertical_add , depth);\ + h->pred8x8_add [ HOR_PRED8x8]= FUNCC(pred8x16_horizontal_add , depth);\ + }\ h->pred16x16_add[VERT_PRED8x8]= FUNCC(pred16x16_vertical_add , depth);\ h->pred16x16_add[ HOR_PRED8x8]= FUNCC(pred16x16_horizontal_add , depth);\ diff --git a/libavcodec/h264pred_template.c b/libavcodec/h264pred_template.c index 8021895378..074fad50ca 100644 --- a/libavcodec/h264pred_template.c +++ b/libavcodec/h264pred_template.c @@ -663,23 +663,45 @@ static void FUNC(pred8x8_mad_cow_dc_l0t)(uint8_t *src, int stride){ FUNCC(pred4x4_dc)(src, NULL, stride); } +static void FUNC(pred8x16_mad_cow_dc_l0t)(uint8_t *src, int stride){ + FUNCC(pred8x16_top_dc)(src, stride); + FUNCC(pred4x4_dc)(src, NULL, stride); +} + static void FUNC(pred8x8_mad_cow_dc_0lt)(uint8_t *src, int stride){ FUNCC(pred8x8_dc)(src, stride); FUNCC(pred4x4_top_dc)(src, NULL, stride); } +static void FUNC(pred8x16_mad_cow_dc_0lt)(uint8_t *src, int stride){ + FUNCC(pred8x16_dc)(src, stride); + FUNCC(pred4x4_top_dc)(src, NULL, stride); +} + static void FUNC(pred8x8_mad_cow_dc_l00)(uint8_t *src, int stride){ FUNCC(pred8x8_left_dc)(src, stride); FUNCC(pred4x4_128_dc)(src + 4*stride , NULL, stride); FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride); } +static void FUNC(pred8x16_mad_cow_dc_l00)(uint8_t *src, int stride){ + FUNCC(pred8x16_left_dc)(src, stride); + FUNCC(pred4x4_128_dc)(src + 4*stride , NULL, stride); + FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride); +} + static void FUNC(pred8x8_mad_cow_dc_0l0)(uint8_t *src, int stride){ FUNCC(pred8x8_left_dc)(src, stride); FUNCC(pred4x4_128_dc)(src , NULL, stride); FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride); } +static void FUNC(pred8x16_mad_cow_dc_0l0)(uint8_t *src, int stride){ + FUNCC(pred8x16_left_dc)(src, stride); + FUNCC(pred4x4_128_dc)(src , NULL, stride); + FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride); +} + static void FUNCC(pred8x8_plane)(uint8_t *_src, int _stride){ int j, k; int a; @@ -1126,8 +1148,24 @@ static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset, c FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride); } +static void FUNCC(pred8x16_vertical_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ + int i; + for(i=0; i<4; i++) + FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride); + for(i=4; i<8; i++) + FUNCC(pred4x4_vertical_add)(pix + block_offset[i+4], block + i*16*sizeof(pixel), stride); +} + static void 
FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
     int i;
     for(i=0; i<4; i++)
         FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
 }
+
+static void FUNCC(pred8x16_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
+    int i;
+    for(i=0; i<4; i++)
+        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
+    for(i=4; i<8; i++)
+        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i+4], block + i*16*sizeof(pixel), stride);
+}
diff --git a/libavcodec/libspeexdec.c b/libavcodec/libspeexdec.c
index 7ee53b04e5..91f190525d 100644
--- a/libavcodec/libspeexdec.c
+++ b/libavcodec/libspeexdec.c
@@ -18,11 +18,11 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "avcodec.h"
 #include <speex/speex.h>
 #include <speex/speex_header.h>
 #include <speex/speex_stereo.h>
 #include <speex/speex_callbacks.h>
+#include "avcodec.h"
 
 typedef struct {
     SpeexBits bits;
@@ -60,14 +60,14 @@ static av_cold int libspeex_decode_init(AVCodecContext *avctx)
         mode = speex_lib_get_mode(s->header->mode);
         if (!mode) {
             av_log(avctx, AV_LOG_ERROR, "Unknown Speex mode %d", s->header->mode);
-            return -1;
+            return AVERROR_INVALIDDATA;
         }
     } else
         av_log(avctx, AV_LOG_INFO, "Missing Speex header, assuming defaults.\n");
 
     if (avctx->channels > 2) {
         av_log(avctx, AV_LOG_ERROR, "Only stereo and mono are supported.\n");
-        return -1;
+        return AVERROR(EINVAL);
     }
 
     speex_bits_init(&s->bits);
@@ -99,32 +99,42 @@ static int libspeex_decode_frame(AVCodecContext *avctx,
     uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     LibSpeexContext *s = avctx->priv_data;
-    int16_t *output = data, *end;
-    int i, num_samples;
-
-    num_samples = s->frame_size * avctx->channels;
-    end = output + *data_size / sizeof(*output);
-
-    speex_bits_read_from(&s->bits, buf, buf_size);
-
-    for (i = 0; speex_bits_remaining(&s->bits) && output + num_samples < end; i++) {
-        int ret = speex_decode_int(s->dec_state, &s->bits, output);
-        if (ret <= -2) {
-            av_log(avctx, AV_LOG_ERROR, "Error decoding Speex frame.\n");
-            return -1;
-        } else if (ret == -1)
-            // end of stream
-            break;
+    int16_t *output = data;
+    int out_size, ret, consumed = 0;
+
+    /* check output buffer size */
+    out_size = s->frame_size * avctx->channels *
+               av_get_bytes_per_sample(avctx->sample_fmt);
+    if (*data_size < out_size) {
+        av_log(avctx, AV_LOG_ERROR, "Output buffer is too small\n");
+        return AVERROR(EINVAL);
+    }
 
-        if (avctx->channels == 2)
-            speex_decode_stereo_int(output, s->frame_size, &s->stereo);
+    /* if there is not enough data left for the smallest possible frame,
+       reset the libspeex buffer using the current packet, otherwise ignore
+       the current packet and keep decoding frames from the libspeex buffer.
*/ + if (speex_bits_remaining(&s->bits) < 43) { + /* check for flush packet */ + if (!buf || !buf_size) { + *data_size = 0; + return buf_size; + } + /* set new buffer */ + speex_bits_read_from(&s->bits, buf, buf_size); + consumed = buf_size; + } - output += num_samples; + /* decode a single frame */ + ret = speex_decode_int(s->dec_state, &s->bits, output); + if (ret <= -2) { + av_log(avctx, AV_LOG_ERROR, "Error decoding Speex frame.\n"); + return AVERROR_INVALIDDATA; } + if (avctx->channels == 2) + speex_decode_stereo_int(output, s->frame_size, &s->stereo); - avctx->frame_size = s->frame_size * i; - *data_size = avctx->channels * avctx->frame_size * sizeof(*output); - return buf_size; + *data_size = out_size; + return consumed; } static av_cold int libspeex_decode_close(AVCodecContext *avctx) @@ -138,6 +148,12 @@ static av_cold int libspeex_decode_close(AVCodecContext *avctx) return 0; } +static av_cold void libspeex_decode_flush(AVCodecContext *avctx) +{ + LibSpeexContext *s = avctx->priv_data; + speex_bits_reset(&s->bits); +} + AVCodec ff_libspeex_decoder = { .name = "libspeex", .type = AVMEDIA_TYPE_AUDIO, @@ -146,5 +162,7 @@ AVCodec ff_libspeex_decoder = { .init = libspeex_decode_init, .close = libspeex_decode_close, .decode = libspeex_decode_frame, + .flush = libspeex_decode_flush, + .capabilities = CODEC_CAP_SUBFRAMES | CODEC_CAP_DELAY, .long_name = NULL_IF_CONFIG_SMALL("libspeex Speex"), }; diff --git a/libavcodec/mpegaudiodec.c b/libavcodec/mpegaudiodec.c index b5ba285bc9..f5f169a8e3 100644 --- a/libavcodec/mpegaudiodec.c +++ b/libavcodec/mpegaudiodec.c @@ -1893,24 +1893,50 @@ typedef struct MP3On4DecodeContext { int syncword; ///< syncword patch const uint8_t *coff; ///< channels offsets in output buffer MPADecodeContext *mp3decctx[5]; ///< MPADecodeContext for every decoder instance + OUT_INT *decoded_buf; ///< output buffer for decoded samples } MP3On4DecodeContext; #include "mpeg4audio.h" /* Next 3 arrays are indexed by channel config number (passed via codecdata) */ static const uint8_t mp3Frames[8] = {0,1,1,2,3,3,4,5}; /* number of mp3 decoder instances */ -/* offsets into output buffer, assume output order is FL FR BL BR C LFE */ +/* offsets into output buffer, assume output order is FL FR C LFE BL BR SL SR */ static const uint8_t chan_offset[8][5] = { {0}, {0}, // C {0}, // FLR {2,0}, // C FLR {2,0,3}, // C FLR BS - {4,0,2}, // C FLR BLRS - {4,0,2,5}, // C FLR BLRS LFE - {4,0,2,6,5}, // C FLR BLRS BLR LFE + {2,0,3}, // C FLR BLRS + {2,0,4,3}, // C FLR BLRS LFE + {2,0,6,4,3}, // C FLR BLRS BLR LFE }; +/* mp3on4 channel layouts */ +static const int16_t chan_layout[8] = { + 0, + AV_CH_LAYOUT_MONO, + AV_CH_LAYOUT_STEREO, + AV_CH_LAYOUT_SURROUND, + AV_CH_LAYOUT_4POINT0, + AV_CH_LAYOUT_5POINT0, + AV_CH_LAYOUT_5POINT1, + AV_CH_LAYOUT_7POINT1 +}; + +static av_cold int decode_close_mp3on4(AVCodecContext * avctx) +{ + MP3On4DecodeContext *s = avctx->priv_data; + int i; + + for (i = 0; i < s->frames; i++) + av_free(s->mp3decctx[i]); + + av_freep(&s->decoded_buf); + + return 0; +} + static int decode_init_mp3on4(AVCodecContext * avctx) { @@ -1931,6 +1957,7 @@ static int decode_init_mp3on4(AVCodecContext * avctx) s->frames = mp3Frames[cfg.chan_config]; s->coff = chan_offset[cfg.chan_config]; avctx->channels = ff_mpeg4audio_channels[cfg.chan_config]; + avctx->channel_layout = chan_layout[cfg.chan_config]; if (cfg.sample_rate < 16000) s->syncword = 0xffe00000; @@ -1944,6 +1971,8 @@ static int decode_init_mp3on4(AVCodecContext * avctx) */ // Allocate zeroed memory for the first decoder 
context s->mp3decctx[0] = av_mallocz(sizeof(MPADecodeContext)); + if (!s->mp3decctx[0]) + goto alloc_fail; // Put decoder context in place to make init_decode() happy avctx->priv_data = s->mp3decctx[0]; decode_init(avctx); @@ -1956,23 +1985,38 @@ static int decode_init_mp3on4(AVCodecContext * avctx) */ for (i = 1; i < s->frames; i++) { s->mp3decctx[i] = av_mallocz(sizeof(MPADecodeContext)); + if (!s->mp3decctx[i]) + goto alloc_fail; s->mp3decctx[i]->adu_mode = 1; s->mp3decctx[i]->avctx = avctx; + s->mp3decctx[i]->mpadsp = s->mp3decctx[0]->mpadsp; + } + + /* Allocate buffer for multi-channel output if needed */ + if (s->frames > 1) { + s->decoded_buf = av_malloc(MPA_FRAME_SIZE * MPA_MAX_CHANNELS * + sizeof(*s->decoded_buf)); + if (!s->decoded_buf) + goto alloc_fail; } return 0; +alloc_fail: + decode_close_mp3on4(avctx); + return AVERROR(ENOMEM); } -static av_cold int decode_close_mp3on4(AVCodecContext * avctx) +static void flush_mp3on4(AVCodecContext *avctx) { - MP3On4DecodeContext *s = avctx->priv_data; int i; + MP3On4DecodeContext *s = avctx->priv_data; - for (i = 0; i < s->frames; i++) - av_free(s->mp3decctx[i]); - - return 0; + for (i = 0; i < s->frames; i++) { + MPADecodeContext *m = s->mp3decctx[i]; + memset(m->synth_buf, 0, sizeof(m->synth_buf)); + m->last_buf_size = 0; + } } @@ -1987,12 +2031,13 @@ static int decode_frame_mp3on4(AVCodecContext * avctx, int fsize, len = buf_size, out_size = 0; uint32_t header; OUT_INT *out_samples = data; - OUT_INT decoded_buf[MPA_FRAME_SIZE * MPA_MAX_CHANNELS]; OUT_INT *outptr, *bp; - int fr, j, n; + int fr, j, n, ch; - if(*data_size < MPA_FRAME_SIZE * MPA_MAX_CHANNELS * s->frames * sizeof(OUT_INT)) - return -1; + if (*data_size < MPA_FRAME_SIZE * avctx->channels * sizeof(OUT_INT)) { + av_log(avctx, AV_LOG_ERROR, "output buffer is too small\n"); + return AVERROR(EINVAL); + } *data_size = 0; // Discard too short frames @@ -2000,10 +2045,11 @@ static int decode_frame_mp3on4(AVCodecContext * avctx, return -1; // If only one decoder interleave is not needed - outptr = s->frames == 1 ? out_samples : decoded_buf; + outptr = s->frames == 1 ? 
out_samples : s->decoded_buf; avctx->bit_rate = 0; + ch = 0; for (fr = 0; fr < s->frames; fr++) { fsize = AV_RB16(buf) >> 4; fsize = FFMIN3(fsize, len, MPA_MAX_CODED_FRAME_SIZE); @@ -2016,6 +2062,14 @@ static int decode_frame_mp3on4(AVCodecContext * avctx, break; avpriv_mpegaudio_decode_header((MPADecodeHeader *)m, header); + + if (ch + m->nb_channels > avctx->channels) { + av_log(avctx, AV_LOG_ERROR, "frame channel count exceeds codec " + "channel count\n"); + return AVERROR_INVALIDDATA; + } + ch += m->nb_channels; + out_size += mp_decode_frame(m, outptr, buf, fsize); buf += fsize; len -= fsize; @@ -2026,13 +2080,13 @@ static int decode_frame_mp3on4(AVCodecContext * avctx, bp = out_samples + s->coff[fr]; if(m->nb_channels == 1) { for(j = 0; j < n; j++) { - *bp = decoded_buf[j]; + *bp = s->decoded_buf[j]; bp += avctx->channels; } } else { for(j = 0; j < n; j++) { - bp[0] = decoded_buf[j++]; - bp[1] = decoded_buf[j]; + bp[0] = s->decoded_buf[j++]; + bp[1] = s->decoded_buf[j]; bp += avctx->channels; } } @@ -2110,7 +2164,7 @@ AVCodec ff_mp3on4_decoder = { .init = decode_init_mp3on4, .close = decode_close_mp3on4, .decode = decode_frame_mp3on4, - .flush = flush, + .flush = flush_mp3on4, .long_name = NULL_IF_CONFIG_SMALL("MP3onMP4"), }; #endif diff --git a/libavcodec/mpegaudiodec_float.c b/libavcodec/mpegaudiodec_float.c index 2fde46a5cd..312b84278f 100644 --- a/libavcodec/mpegaudiodec_float.c +++ b/libavcodec/mpegaudiodec_float.c @@ -83,7 +83,7 @@ AVCodec ff_mp3on4float_decoder = { .init = decode_init_mp3on4, .close = decode_close_mp3on4, .decode = decode_frame_mp3on4, - .flush = flush, + .flush = flush_mp3on4, .long_name = NULL_IF_CONFIG_SMALL("MP3onMP4"), }; #endif diff --git a/libavcodec/ppc/h264_altivec.c b/libavcodec/ppc/h264_altivec.c index 00fb0a73d9..bf4fd0f016 100644 --- a/libavcodec/ppc/h264_altivec.c +++ b/libavcodec/ppc/h264_altivec.c @@ -843,7 +843,8 @@ static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, } static av_always_inline -void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset, int w, int h) +void weight_h264_W_altivec(uint8_t *block, int stride, int height, + int log2_denom, int weight, int offset, int w) { int y, aligned; vec_u8 vblock; @@ -864,7 +865,7 @@ void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int wei voffset = vec_splat(vtemp, 5); aligned = !((unsigned long)block & 0xf); - for (y=0; ybits_per_raw_sample > 8; @@ -1015,16 +1015,10 @@ void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth, const int chrom c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec; c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec; - c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16x16_altivec; - c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels16x8_altivec; - c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels8x16_altivec; - c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels8x8_altivec; - c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels8x4_altivec; - c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16x16_altivec; - c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels16x8_altivec; - c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels8x16_altivec; - c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels8x8_altivec; - c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels8x4_altivec; + c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16_altivec; + c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels8_altivec; + 
c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16_altivec; + c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_altivec; } } } diff --git a/libavcodec/utils.c b/libavcodec/utils.c index 205e3600c5..405fc31318 100644 --- a/libavcodec/utils.c +++ b/libavcodec/utils.c @@ -158,6 +158,8 @@ void avcodec_align_dimensions2(AVCodecContext *s, int *width, int *height, int l case PIX_FMT_YUV420P9BE: case PIX_FMT_YUV420P10LE: case PIX_FMT_YUV420P10BE: + case PIX_FMT_YUV422P9LE: + case PIX_FMT_YUV422P9BE: case PIX_FMT_YUV422P10LE: case PIX_FMT_YUV422P10BE: case PIX_FMT_YUV444P9LE: diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c index 2643b34d67..8c0380315e 100644 --- a/libavcodec/vp8.c +++ b/libavcodec/vp8.c @@ -41,24 +41,57 @@ static void free_buffers(VP8Context *s) av_freep(&s->top_nnz); av_freep(&s->edge_emu_buffer); av_freep(&s->top_border); - av_freep(&s->segmentation_map); s->macroblocks = NULL; } -static void vp8_decode_flush(AVCodecContext *avctx) +static int vp8_alloc_frame(VP8Context *s, AVFrame *f) +{ + int ret; + if ((ret = ff_thread_get_buffer(s->avctx, f)) < 0) + return ret; + if (!s->maps_are_invalid && s->num_maps_to_be_freed) { + f->ref_index[0] = s->segmentation_maps[--s->num_maps_to_be_freed]; + } else if (!(f->ref_index[0] = av_mallocz(s->mb_width * s->mb_height))) { + ff_thread_release_buffer(s->avctx, f); + return AVERROR(ENOMEM); + } + return 0; +} + +static void vp8_release_frame(VP8Context *s, AVFrame *f, int is_close) +{ + if (!is_close) { + if (f->ref_index[0]) { + assert(s->num_maps_to_be_freed < FF_ARRAY_ELEMS(s->segmentation_maps)); + s->segmentation_maps[s->num_maps_to_be_freed++] = f->ref_index[0]; + f->ref_index[0] = NULL; + } + } else { + av_freep(&f->ref_index[0]); + } + ff_thread_release_buffer(s->avctx, f); +} + +static void vp8_decode_flush_impl(AVCodecContext *avctx, int force, int is_close) { VP8Context *s = avctx->priv_data; int i; - if (!avctx->is_copy) { + if (!avctx->is_copy || force) { for (i = 0; i < 5; i++) if (s->frames[i].data[0]) - ff_thread_release_buffer(avctx, &s->frames[i]); + vp8_release_frame(s, &s->frames[i], is_close); } memset(s->framep, 0, sizeof(s->framep)); free_buffers(s); + s->maps_are_invalid = 1; +} + +static void vp8_decode_flush(AVCodecContext *avctx) +{ + vp8_decode_flush_impl(avctx, 0, 0); } static int update_dimensions(VP8Context *s, int width, int height) @@ -68,7 +101,7 @@ static int update_dimensions(VP8Context *s, int width, int height) if (av_image_check_size(width, height, 0, s->avctx)) return AVERROR_INVALIDDATA; - vp8_decode_flush(s->avctx); + vp8_decode_flush_impl(s->avctx, 1, 0); avcodec_set_dimensions(s->avctx, width, height); } @@ -81,10 +114,9 @@ static int update_dimensions(VP8Context *s, int width, int height) s->intra4x4_pred_mode_top = av_mallocz(s->mb_width*4); s->top_nnz = av_mallocz(s->mb_width*sizeof(*s->top_nnz)); s->top_border = av_mallocz((s->mb_width+1)*sizeof(*s->top_border)); - s->segmentation_map = av_mallocz(s->mb_width*s->mb_height); if (!s->macroblocks_base || !s->filter_strength || !s->intra4x4_pred_mode_top || - !s->top_nnz || !s->top_border || !s->segmentation_map) + !s->top_nnz || !s->top_border) return AVERROR(ENOMEM); s->macroblocks = s->macroblocks_base + 1; @@ -1508,6 +1540,14 @@ static void filter_mb_row_simple(VP8Context *s, AVFrame *curframe, int mb_y) } } +static void release_queued_segmaps(VP8Context *s, int is_close) +{ + int leave_behind = is_close ? 
0 : !s->maps_are_invalid; + while (s->num_maps_to_be_freed > leave_behind) + av_freep(&s->segmentation_maps[--s->num_maps_to_be_freed]); + s->maps_are_invalid = 0; +} + static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size, AVPacket *avpkt) { @@ -1516,6 +1556,8 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size, enum AVDiscard skip_thresh; AVFrame *av_uninit(curframe), *prev_frame = s->framep[VP56_FRAME_CURRENT]; + release_queued_segmaps(s, 0); + if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0) return ret; @@ -1538,7 +1580,7 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size, &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] && &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] && &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) - ff_thread_release_buffer(avctx, &s->frames[i]); + vp8_release_frame(s, &s->frames[i], 0); // find a free buffer for (i = 0; i < 5; i++) @@ -1559,8 +1601,7 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size, curframe->key_frame = s->keyframe; curframe->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P; curframe->reference = referenced ? 3 : 0; - curframe->ref_index[0] = s->segmentation_map; - if ((ret = ff_thread_get_buffer(avctx, curframe))) { + if ((ret = vp8_alloc_frame(s, curframe))) { av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n"); return ret; } @@ -1652,8 +1693,8 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size, s->dsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4); s->dsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2); - decode_mb_mode(s, mb, mb_x, mb_y, s->segmentation_map + mb_xy, - prev_frame ? prev_frame->ref_index[0] + mb_xy : NULL); + decode_mb_mode(s, mb, mb_x, mb_y, curframe->ref_index[0] + mb_xy, + prev_frame && prev_frame->ref_index[0] ? prev_frame->ref_index[0] + mb_xy : NULL); prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS); @@ -1736,7 +1777,8 @@ static av_cold int vp8_decode_init(AVCodecContext *avctx) static av_cold int vp8_decode_free(AVCodecContext *avctx) { - vp8_decode_flush(avctx); + vp8_decode_flush_impl(avctx, 0, 1); + release_queued_segmaps(avctx->priv_data, 1); return 0; } diff --git a/libavcodec/vp8.h b/libavcodec/vp8.h index 468e28e8d5..36c21df217 100644 --- a/libavcodec/vp8.h +++ b/libavcodec/vp8.h @@ -130,7 +130,6 @@ typedef struct { uint8_t *intra4x4_pred_mode_top; uint8_t intra4x4_pred_mode_left[4]; - uint8_t *segmentation_map; /** * Macroblocks can have one of 4 different quants in a frame when @@ -237,6 +236,16 @@ typedef struct { H264PredContext hpc; vp8_mc_func put_pixels_tab[3][3][3]; AVFrame frames[5]; + + /** + * A list of segmentation_map buffers that are to be free()'ed in + * the next decoding iteration. We can't free() them right away + * because the map may still be used by subsequent decoding threads. + * Unused if frame threading is off. 
+ */ + uint8_t *segmentation_maps[5]; + int num_maps_to_be_freed; + int maps_are_invalid; } VP8Context; #endif /* AVCODEC_VP8_H */ diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm index 70f1b86c44..6627d21bd8 100644 --- a/libavcodec/x86/dsputil_yasm.asm +++ b/libavcodec/x86/dsputil_yasm.asm @@ -1055,14 +1055,6 @@ emu_edge mmx ; int32_t max, unsigned int len) ;----------------------------------------------------------------------------- -%macro SPLATD_MMX 1 - punpckldq %1, %1 -%endmacro - -%macro SPLATD_SSE2 1 - pshufd %1, %1, 0 -%endmacro - %macro VECTOR_CLIP_INT32 4 cglobal vector_clip_int32_%1, 5,5,%2, dst, src, min, max, len %ifidn %1, sse2 diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm index 2deb577ca6..37e7a094ce 100644 --- a/libavcodec/x86/fmtconvert.asm +++ b/libavcodec/x86/fmtconvert.asm @@ -24,6 +24,146 @@ SECTION_TEXT +;--------------------------------------------------------------------------------- +; void int32_to_float_fmul_scalar(float *dst, const int *src, float mul, int len); +;--------------------------------------------------------------------------------- +%macro INT32_TO_FLOAT_FMUL_SCALAR 2 +%ifdef ARCH_X86_64 +cglobal int32_to_float_fmul_scalar_%1, 3,3,%2, dst, src, len +%else +cglobal int32_to_float_fmul_scalar_%1, 4,4,%2, dst, src, mul, len + movss m0, mulm +%endif + SPLATD m0 + shl lenq, 2 + add srcq, lenq + add dstq, lenq + neg lenq +.loop: +%ifidn %1, sse2 + cvtdq2ps m1, [srcq+lenq ] + cvtdq2ps m2, [srcq+lenq+16] +%else + cvtpi2ps m1, [srcq+lenq ] + cvtpi2ps m3, [srcq+lenq+ 8] + cvtpi2ps m2, [srcq+lenq+16] + cvtpi2ps m4, [srcq+lenq+24] + movlhps m1, m3 + movlhps m2, m4 +%endif + mulps m1, m0 + mulps m2, m0 + mova [dstq+lenq ], m1 + mova [dstq+lenq+16], m2 + add lenq, 32 + jl .loop + REP_RET +%endmacro + +INIT_XMM +%define SPLATD SPLATD_SSE +%define movdqa movaps +INT32_TO_FLOAT_FMUL_SCALAR sse, 5 +%undef movdqa +%define SPLATD SPLATD_SSE2 +INT32_TO_FLOAT_FMUL_SCALAR sse2, 3 +%undef SPLATD + + +;------------------------------------------------------------------------------ +; void ff_float_to_int16(int16_t *dst, const float *src, long len); +;------------------------------------------------------------------------------ +%macro FLOAT_TO_INT16 2 +cglobal float_to_int16_%1, 3,3,%2, dst, src, len + add lenq, lenq + lea srcq, [srcq+2*lenq] + add dstq, lenq + neg lenq +.loop: +%ifidn %1, sse2 + cvtps2dq m0, [srcq+2*lenq ] + cvtps2dq m1, [srcq+2*lenq+16] + packssdw m0, m1 + mova [dstq+lenq], m0 +%else + cvtps2pi m0, [srcq+2*lenq ] + cvtps2pi m1, [srcq+2*lenq+ 8] + cvtps2pi m2, [srcq+2*lenq+16] + cvtps2pi m3, [srcq+2*lenq+24] + packssdw m0, m1 + packssdw m2, m3 + mova [dstq+lenq ], m0 + mova [dstq+lenq+8], m2 +%endif + add lenq, 16 + js .loop +%ifnidn %1, sse2 + emms +%endif + REP_RET +%endmacro + +INIT_XMM +FLOAT_TO_INT16 sse2, 2 +INIT_MMX +FLOAT_TO_INT16 sse, 0 +%define cvtps2pi pf2id +FLOAT_TO_INT16 3dnow, 0 +%undef cvtps2pi + + +;------------------------------------------------------------------------------- +; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len); +;------------------------------------------------------------------------------- +%macro FLOAT_TO_INT16_INTERLEAVE2 1 +cglobal float_to_int16_interleave2_%1, 3,4,2, dst, src0, src1, len + lea lenq, [4*r2q] + mov src1q, [src0q+gprsize] + mov src0q, [src0q] + add dstq, lenq + add src0q, lenq + add src1q, lenq + neg lenq +.loop: +%ifidn %1, sse2 + cvtps2dq m0, [src0q+lenq] + cvtps2dq m1, [src1q+lenq] + packssdw m0, m1 + movhlps 
m1, m0 + punpcklwd m0, m1 + mova [dstq+lenq], m0 +%else + cvtps2pi m0, [src0q+lenq ] + cvtps2pi m1, [src0q+lenq+8] + cvtps2pi m2, [src1q+lenq ] + cvtps2pi m3, [src1q+lenq+8] + packssdw m0, m1 + packssdw m2, m3 + mova m1, m0 + punpcklwd m0, m2 + punpckhwd m1, m2 + mova [dstq+lenq ], m0 + mova [dstq+lenq+8], m1 +%endif + add lenq, 16 + js .loop +%ifnidn %1, sse2 + emms +%endif + REP_RET +%endmacro + +INIT_MMX +%define cvtps2pi pf2id +FLOAT_TO_INT16_INTERLEAVE2 3dnow +%undef cvtps2pi +%define movdqa movaps +FLOAT_TO_INT16_INTERLEAVE2 sse +%undef movdqa +INIT_XMM +FLOAT_TO_INT16_INTERLEAVE2 sse2 + + %macro PSWAPD_SSE 2 pshufw %1, %2, 0x4e %endmacro diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c index ba2c2c9bd5..a3d8f89816 100644 --- a/libavcodec/x86/fmtconvert_mmx.c +++ b/libavcodec/x86/fmtconvert_mmx.c @@ -26,133 +26,32 @@ #include "libavutil/x86_cpu.h" #include "libavcodec/fmtconvert.h" -static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len) -{ - x86_reg i = -4*len; - __asm__ volatile( - "movss %3, %%xmm4 \n" - "shufps $0, %%xmm4, %%xmm4 \n" - "1: \n" - "cvtpi2ps (%2,%0), %%xmm0 \n" - "cvtpi2ps 8(%2,%0), %%xmm1 \n" - "cvtpi2ps 16(%2,%0), %%xmm2 \n" - "cvtpi2ps 24(%2,%0), %%xmm3 \n" - "movlhps %%xmm1, %%xmm0 \n" - "movlhps %%xmm3, %%xmm2 \n" - "mulps %%xmm4, %%xmm0 \n" - "mulps %%xmm4, %%xmm2 \n" - "movaps %%xmm0, (%1,%0) \n" - "movaps %%xmm2, 16(%1,%0) \n" - "add $32, %0 \n" - "jl 1b \n" - :"+r"(i) - :"r"(dst+len), "r"(src+len), "m"(mul) - ); -} - -static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len) -{ - x86_reg i = -4*len; - __asm__ volatile( - "movss %3, %%xmm4 \n" - "shufps $0, %%xmm4, %%xmm4 \n" - "1: \n" - "cvtdq2ps (%2,%0), %%xmm0 \n" - "cvtdq2ps 16(%2,%0), %%xmm1 \n" - "mulps %%xmm4, %%xmm0 \n" - "mulps %%xmm4, %%xmm1 \n" - "movaps %%xmm0, (%1,%0) \n" - "movaps %%xmm1, 16(%1,%0) \n" - "add $32, %0 \n" - "jl 1b \n" - :"+r"(i) - :"r"(dst+len), "r"(src+len), "m"(mul) - ); -} +#if HAVE_YASM -static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){ - x86_reg reglen = len; - // not bit-exact: pf2id uses different rounding than C and SSE - __asm__ volatile( - "add %0 , %0 \n\t" - "lea (%2,%0,2) , %2 \n\t" - "add %0 , %1 \n\t" - "neg %0 \n\t" - "1: \n\t" - "pf2id (%2,%0,2) , %%mm0 \n\t" - "pf2id 8(%2,%0,2) , %%mm1 \n\t" - "pf2id 16(%2,%0,2) , %%mm2 \n\t" - "pf2id 24(%2,%0,2) , %%mm3 \n\t" - "packssdw %%mm1 , %%mm0 \n\t" - "packssdw %%mm3 , %%mm2 \n\t" - "movq %%mm0 , (%1,%0) \n\t" - "movq %%mm2 , 8(%1,%0) \n\t" - "add $16 , %0 \n\t" - " js 1b \n\t" - "femms \n\t" - :"+r"(reglen), "+r"(dst), "+r"(src) - ); -} +void ff_int32_to_float_fmul_scalar_sse (float *dst, const int *src, float mul, int len); +void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len); -static void float_to_int16_sse(int16_t *dst, const float *src, long len){ - x86_reg reglen = len; - __asm__ volatile( - "add %0 , %0 \n\t" - "lea (%2,%0,2) , %2 \n\t" - "add %0 , %1 \n\t" - "neg %0 \n\t" - "1: \n\t" - "cvtps2pi (%2,%0,2) , %%mm0 \n\t" - "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t" - "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t" - "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t" - "packssdw %%mm1 , %%mm0 \n\t" - "packssdw %%mm3 , %%mm2 \n\t" - "movq %%mm0 , (%1,%0) \n\t" - "movq %%mm2 , 8(%1,%0) \n\t" - "add $16 , %0 \n\t" - " js 1b \n\t" - "emms \n\t" - :"+r"(reglen), "+r"(dst), "+r"(src) - ); -} +void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len); +void ff_float_to_int16_sse 
(int16_t *dst, const float *src, long len); +void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len); -static void float_to_int16_sse2(int16_t *dst, const float *src, long len){ - x86_reg reglen = len; - __asm__ volatile( - "add %0 , %0 \n\t" - "lea (%2,%0,2) , %2 \n\t" - "add %0 , %1 \n\t" - "neg %0 \n\t" - "1: \n\t" - "cvtps2dq (%2,%0,2) , %%xmm0 \n\t" - "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t" - "packssdw %%xmm1 , %%xmm0 \n\t" - "movdqa %%xmm0 , (%1,%0) \n\t" - "add $16 , %0 \n\t" - " js 1b \n\t" - :"+r"(reglen), "+r"(dst), "+r"(src) - ); -} +void ff_float_to_int16_interleave2_3dnow(int16_t *dst, const float **src, long len); +void ff_float_to_int16_interleave2_sse (int16_t *dst, const float **src, long len); +void ff_float_to_int16_interleave2_sse2 (int16_t *dst, const float **src, long len); void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len); void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len); void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len); -#if !HAVE_YASM -#define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6) -#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) -#define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) -#endif #define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse -#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \ +#define FLOAT_TO_INT16_INTERLEAVE(cpu) \ /* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\ static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\ DECLARE_ALIGNED(16, int16_t, tmp)[len];\ int i,j,c;\ for(c=0; cfloat_interleave = float_interleave_mmx; -#endif - if(mm_flags & AV_CPU_FLAG_3DNOW){ + if (HAVE_AMD3DNOW && mm_flags & AV_CPU_FLAG_3DNOW) { if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ - c->float_to_int16 = float_to_int16_3dnow; + c->float_to_int16 = ff_float_to_int16_3dnow; c->float_to_int16_interleave = float_to_int16_interleave_3dnow; } } - if(mm_flags & AV_CPU_FLAG_3DNOWEXT){ + if (HAVE_AMD3DNOWEXT && mm_flags & AV_CPU_FLAG_3DNOWEXT) { if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ c->float_to_int16_interleave = float_to_int16_interleave_3dn2; } } - if(mm_flags & AV_CPU_FLAG_SSE){ - c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; - c->float_to_int16 = float_to_int16_sse; + if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE) { + c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse; + c->float_to_int16 = ff_float_to_int16_sse; c->float_to_int16_interleave = float_to_int16_interleave_sse; -#if HAVE_YASM c->float_interleave = float_interleave_sse; -#endif } - if(mm_flags & AV_CPU_FLAG_SSE2){ - c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; - c->float_to_int16 = float_to_int16_sse2; + if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE2) { + c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2; + c->float_to_int16 = ff_float_to_int16_sse2; c->float_to_int16_interleave = float_to_int16_interleave_sse2; } } +#endif } diff --git a/libavcodec/x86/h264_weight.asm b/libavcodec/x86/h264_weight.asm index bb0af86097..cc96cb1f3b 100644 --- a/libavcodec/x86/h264_weight.asm +++ b/libavcodec/x86/h264_weight.asm @@ -28,21 +28,20 @@ SECTION .text ;----------------------------------------------------------------------------- ; biweight pred: ; -; void 
h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride, -; int log2_denom, int weightd, int weights, -; int offset); +; void h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride, +; int height, int log2_denom, int weightd, +; int weights, int offset); ; and -; void h264_weight_16x16_sse2(uint8_t *dst, int stride, -; int log2_denom, int weight, -; int offset); +; void h264_weight_16_sse2(uint8_t *dst, int stride, int height, +; int log2_denom, int weight, int offset); ;----------------------------------------------------------------------------- %macro WEIGHT_SETUP 0 - add r4, r4 - inc r4 - movd m3, r3d - movd m5, r4d - movd m6, r2d + add r5, r5 + inc r5 + movd m3, r4d + movd m5, r5d + movd m6, r3d pslld m5, m6 psrld m5, 1 %if mmsize == 16 @@ -71,60 +70,41 @@ SECTION .text packuswb m0, m1 %endmacro -%macro WEIGHT_FUNC_DBL_MM 1 -cglobal h264_weight_16x%1_mmx2, 5, 5, 0 +INIT_MMX +cglobal h264_weight_16_mmx2, 6, 6, 0 WEIGHT_SETUP - mov r2, %1 -%if %1 == 16 .nextrow WEIGHT_OP 0, 4 mova [r0 ], m0 WEIGHT_OP 8, 12 mova [r0+8], m0 add r0, r1 - dec r2 + dec r2d jnz .nextrow REP_RET -%else - jmp mangle(ff_h264_weight_16x16_mmx2.nextrow) -%endif -%endmacro -INIT_MMX -WEIGHT_FUNC_DBL_MM 16 -WEIGHT_FUNC_DBL_MM 8 - -%macro WEIGHT_FUNC_MM 4 -cglobal h264_weight_%1x%2_%4, 7, 7, %3 +%macro WEIGHT_FUNC_MM 3 +cglobal h264_weight_%1_%3, 6, 6, %2 WEIGHT_SETUP - mov r2, %2 -%if %2 == 16 .nextrow WEIGHT_OP 0, mmsize/2 mova [r0], m0 add r0, r1 - dec r2 + dec r2d jnz .nextrow REP_RET -%else - jmp mangle(ff_h264_weight_%1x16_%4.nextrow) -%endif %endmacro INIT_MMX -WEIGHT_FUNC_MM 8, 16, 0, mmx2 -WEIGHT_FUNC_MM 8, 8, 0, mmx2 -WEIGHT_FUNC_MM 8, 4, 0, mmx2 +WEIGHT_FUNC_MM 8, 0, mmx2 INIT_XMM -WEIGHT_FUNC_MM 16, 16, 8, sse2 -WEIGHT_FUNC_MM 16, 8, 8, sse2 +WEIGHT_FUNC_MM 16, 8, sse2 -%macro WEIGHT_FUNC_HALF_MM 5 -cglobal h264_weight_%1x%2_%5, 5, 5, %4 +%macro WEIGHT_FUNC_HALF_MM 3 +cglobal h264_weight_%1_%3, 6, 6, %2 WEIGHT_SETUP - mov r2, %2/2 + sar r2d, 1 lea r3, [r1*2] -%if %2 == mmsize .nextrow WEIGHT_OP 0, r1 movh [r0], m0 @@ -135,31 +115,34 @@ cglobal h264_weight_%1x%2_%5, 5, 5, %4 movh [r0+r1], m0 %endif add r0, r3 - dec r2 + dec r2d jnz .nextrow REP_RET -%else - jmp mangle(ff_h264_weight_%1x%3_%5.nextrow) -%endif %endmacro INIT_MMX -WEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2 -WEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2 -WEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2 +WEIGHT_FUNC_HALF_MM 4, 0, mmx2 +WEIGHT_FUNC_HALF_MM 4, 0, mmx2 +WEIGHT_FUNC_HALF_MM 4, 0, mmx2 INIT_XMM -WEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2 -WEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2 -WEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 +WEIGHT_FUNC_HALF_MM 8, 8, sse2 +WEIGHT_FUNC_HALF_MM 8, 8, sse2 +WEIGHT_FUNC_HALF_MM 8, 8, sse2 %macro BIWEIGHT_SETUP 0 - add r6, 1 - or r6, 1 - add r3, 1 - movd m3, r4d - movd m4, r5d - movd m5, r6d - movd m6, r3d +%ifdef ARCH_X86_64 +%define off_regd r11d +%else +%define off_regd r3d +%endif + mov off_regd, r7m + add off_regd, 1 + or off_regd, 1 + add r4, 1 + movd m3, r5d + movd m4, r6d + movd m5, off_regd + movd m6, r4d pslld m5, m6 psrld m5, 1 %if mmsize == 16 @@ -195,11 +178,10 @@ WEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 packuswb m0, m1 %endmacro -%macro BIWEIGHT_FUNC_DBL_MM 1 -cglobal h264_biweight_16x%1_mmx2, 7, 7, 0 +INIT_MMX +cglobal h264_biweight_16_mmx2, 7, 7, 0 BIWEIGHT_SETUP - mov r3, %1 -%if %1 == 16 + movifnidn r3d, r3m .nextrow BIWEIGHT_STEPA 0, 1, 0 BIWEIGHT_STEPA 1, 2, 4 @@ -211,23 +193,14 @@ cglobal h264_biweight_16x%1_mmx2, 7, 7, 0 mova [r0+8], m0 add r0, r2 add r1, r2 - dec r3 + dec r3d jnz .nextrow REP_RET -%else - jmp 
mangle(ff_h264_biweight_16x16_mmx2.nextrow) -%endif -%endmacro -INIT_MMX -BIWEIGHT_FUNC_DBL_MM 16 -BIWEIGHT_FUNC_DBL_MM 8 - -%macro BIWEIGHT_FUNC_MM 4 -cglobal h264_biweight_%1x%2_%4, 7, 7, %3 +%macro BIWEIGHT_FUNC_MM 3 +cglobal h264_biweight_%1_%3, 7, 7, %2 BIWEIGHT_SETUP - mov r3, %2 -%if %2 == 16 + movifnidn r3d, r3m .nextrow BIWEIGHT_STEPA 0, 1, 0 BIWEIGHT_STEPA 1, 2, mmsize/2 @@ -235,28 +208,22 @@ cglobal h264_biweight_%1x%2_%4, 7, 7, %3 mova [r0], m0 add r0, r2 add r1, r2 - dec r3 + dec r3d jnz .nextrow REP_RET -%else - jmp mangle(ff_h264_biweight_%1x16_%4.nextrow) -%endif %endmacro INIT_MMX -BIWEIGHT_FUNC_MM 8, 16, 0, mmx2 -BIWEIGHT_FUNC_MM 8, 8, 0, mmx2 -BIWEIGHT_FUNC_MM 8, 4, 0, mmx2 +BIWEIGHT_FUNC_MM 8, 0, mmx2 INIT_XMM -BIWEIGHT_FUNC_MM 16, 16, 8, sse2 -BIWEIGHT_FUNC_MM 16, 8, 8, sse2 +BIWEIGHT_FUNC_MM 16, 8, sse2 -%macro BIWEIGHT_FUNC_HALF_MM 5 -cglobal h264_biweight_%1x%2_%5, 7, 7, %4 +%macro BIWEIGHT_FUNC_HALF_MM 3 +cglobal h264_biweight_%1_%3, 7, 7, %2 BIWEIGHT_SETUP - mov r3, %2/2 + movifnidn r3d, r3m + sar r3, 1 lea r4, [r2*2] -%if %2 == mmsize .nextrow BIWEIGHT_STEPA 0, 1, 0 BIWEIGHT_STEPA 1, 2, r2 @@ -270,31 +237,30 @@ cglobal h264_biweight_%1x%2_%5, 7, 7, %4 %endif add r0, r4 add r1, r4 - dec r3 + dec r3d jnz .nextrow REP_RET -%else - jmp mangle(ff_h264_biweight_%1x%3_%5.nextrow) -%endif %endmacro INIT_MMX -BIWEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2 -BIWEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2 -BIWEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2 +BIWEIGHT_FUNC_HALF_MM 4, 0, mmx2 INIT_XMM -BIWEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2 -BIWEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2 -BIWEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 +BIWEIGHT_FUNC_HALF_MM 8, 8, sse2 %macro BIWEIGHT_SSSE3_SETUP 0 - add r6, 1 - or r6, 1 - add r3, 1 - movd m4, r4d - movd m0, r5d - movd m5, r6d - movd m6, r3d +%ifdef ARCH_X86_64 +%define off_regd r11d +%else +%define off_regd r3d +%endif + mov off_regd, r7m + add off_regd, 1 + or off_regd, 1 + add r4, 1 + movd m4, r5d + movd m0, r6d + movd m5, off_regd + movd m6, r4d pslld m5, m6 psrld m5, 1 punpcklbw m4, m0 @@ -314,12 +280,11 @@ BIWEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 packuswb m0, m2 %endmacro -%macro BIWEIGHT_SSSE3_16 1 -cglobal h264_biweight_16x%1_ssse3, 7, 7, 8 +INIT_XMM +cglobal h264_biweight_16_ssse3, 7, 7, 8 BIWEIGHT_SSSE3_SETUP - mov r3, %1 + movifnidn r3d, r3m -%if %1 == 16 .nextrow movh m0, [r0] movh m2, [r0+8] @@ -330,25 +295,17 @@ cglobal h264_biweight_16x%1_ssse3, 7, 7, 8 mova [r0], m0 add r0, r2 add r1, r2 - dec r3 + dec r3d jnz .nextrow REP_RET -%else - jmp mangle(ff_h264_biweight_16x16_ssse3.nextrow) -%endif -%endmacro INIT_XMM -BIWEIGHT_SSSE3_16 16 -BIWEIGHT_SSSE3_16 8 - -%macro BIWEIGHT_SSSE3_8 1 -cglobal h264_biweight_8x%1_ssse3, 7, 7, 8 +cglobal h264_biweight_8_ssse3, 7, 7, 8 BIWEIGHT_SSSE3_SETUP - mov r3, %1/2 + movifnidn r3d, r3m + sar r3, 1 lea r4, [r2*2] -%if %1 == 16 .nextrow movh m0, [r0] movh m1, [r1] @@ -361,15 +318,6 @@ cglobal h264_biweight_8x%1_ssse3, 7, 7, 8 movhps [r0+r2], m0 add r0, r4 add r1, r4 - dec r3 + dec r3d jnz .nextrow REP_RET -%else - jmp mangle(ff_h264_biweight_8x16_ssse3.nextrow) -%endif -%endmacro - -INIT_XMM -BIWEIGHT_SSSE3_8 16 -BIWEIGHT_SSSE3_8 8 -BIWEIGHT_SSSE3_8 4 diff --git a/libavcodec/x86/h264_weight_10bit.asm b/libavcodec/x86/h264_weight_10bit.asm index 1c58d72d94..20df6fbab5 100644 --- a/libavcodec/x86/h264_weight_10bit.asm +++ b/libavcodec/x86/h264_weight_10bit.asm @@ -36,33 +36,26 @@ cextern pw_1 SECTION .text ;----------------------------------------------------------------------------- -; void h264_weight(uint8_t *dst, int 
stride, int log2_denom, +; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom, ; int weight, int offset); ;----------------------------------------------------------------------------- -%ifdef ARCH_X86_32 -DECLARE_REG_TMP 2 -%else -DECLARE_REG_TMP 10 -%endif - -%macro WEIGHT_PROLOGUE 1 - mov t0, %1 +%macro WEIGHT_PROLOGUE 0 .prologue - PROLOGUE 0,5,8 + PROLOGUE 0,6,8 movifnidn r0, r0mp movifnidn r1d, r1m - movifnidn r3d, r3m movifnidn r4d, r4m + movifnidn r5d, r5m %endmacro %macro WEIGHT_SETUP 1 mova m0, [pw_1] - movd m2, r2m + movd m2, r3m pslld m0, m2 ; 1<h264_loop_filter_strength= h264_loop_filter_strength_mmx2; } @@ -394,23 +384,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext; c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext; #endif - c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; - c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; - c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2; - c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2; - c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2; - c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2; - c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2; - c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2; - - c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2; - c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2; - c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2; - c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2; - c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2; - c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2; - c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2; - c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2; + c->weight_h264_pixels_tab[0]= ff_h264_weight_16_mmx2; + c->weight_h264_pixels_tab[1]= ff_h264_weight_8_mmx2; + c->weight_h264_pixels_tab[2]= ff_h264_weight_4_mmx2; + + c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_mmx2; + c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_mmx2; + c->biweight_h264_pixels_tab[2]= ff_h264_biweight_4_mmx2; if (mm_flags&AV_CPU_FLAG_SSE2) { c->h264_idct8_add = ff_h264_idct8_add_8_sse2; @@ -422,17 +402,11 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom c->h264_idct_add16intra = ff_h264_idct_add16intra_8_sse2; c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2; - c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2; - c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2; - c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_sse2; - c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_sse2; - c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_sse2; + c->weight_h264_pixels_tab[0]= ff_h264_weight_16_sse2; + c->weight_h264_pixels_tab[1]= ff_h264_weight_8_sse2; - c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2; - c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_sse2; - c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_sse2; - c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2; - c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2; + c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_sse2; + c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_sse2; #if HAVE_ALIGNED_STACK c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2; @@ -442,11 +416,8 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom #endif } if (mm_flags&AV_CPU_FLAG_SSSE3) { - 
c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3; - c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_ssse3; - c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_ssse3; - c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3; - c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3; + c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_ssse3; + c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_ssse3; } if (HAVE_AVX && mm_flags&AV_CPU_FLAG_AVX) { #if HAVE_ALIGNED_STACK @@ -485,23 +456,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2; #endif - c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse2; - c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse2; - c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse2; - c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse2; - c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse2; - c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse2; - c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse2; - c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse2; - - c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse2; - c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse2; - c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse2; - c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse2; - c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse2; - c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse2; - c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse2; - c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse2; + c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2; + c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2; + c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2; + + c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2; + c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2; + c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2; c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2; c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2; @@ -513,23 +474,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom #endif } if (mm_flags&AV_CPU_FLAG_SSE4) { - c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse4; - c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse4; - c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse4; - c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse4; - c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse4; - c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse4; - c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse4; - c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse4; - - c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse4; - c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse4; - c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse4; - c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse4; - c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse4; - c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse4; - c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse4; - c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse4; + c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4; + c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4; + c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4; + + c->biweight_h264_pixels_tab[0] 
= ff_h264_biweight_16_10_sse4; + c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4; + c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4; } #if HAVE_AVX if (mm_flags&AV_CPU_FLAG_AVX) { diff --git a/libavdevice/Makefile b/libavdevice/Makefile index 879e933994..97dd380776 100644 --- a/libavdevice/Makefile +++ b/libavdevice/Makefile @@ -10,7 +10,7 @@ OBJS = alldevices.o avdevice.o # input/output devices OBJS-$(CONFIG_ALSA_INDEV) += alsa-audio-common.o \ - alsa-audio-dec.o + alsa-audio-dec.o timefilter.o OBJS-$(CONFIG_ALSA_OUTDEV) += alsa-audio-common.o \ alsa-audio-enc.o OBJS-$(CONFIG_BKTR_INDEV) += bktr.o @@ -19,7 +19,7 @@ OBJS-$(CONFIG_DSHOW_INDEV) += dshow.o dshow_enummediatypes.o \ dshow_pin.o dshow_common.o OBJS-$(CONFIG_DV1394_INDEV) += dv1394.o OBJS-$(CONFIG_FBDEV_INDEV) += fbdev.o -OBJS-$(CONFIG_JACK_INDEV) += jack_audio.o +OBJS-$(CONFIG_JACK_INDEV) += jack_audio.o timefilter.o OBJS-$(CONFIG_LAVFI_INDEV) += lavfi.o OBJS-$(CONFIG_OPENAL_INDEV) += openal-dec.o OBJS-$(CONFIG_OSS_INDEV) += oss_audio.o @@ -39,4 +39,6 @@ OBJS-$(CONFIG_LIBDC1394_INDEV) += libdc1394.o SKIPHEADERS-$(HAVE_ALSA_ASOUNDLIB_H) += alsa-audio.h SKIPHEADERS-$(HAVE_SNDIO_H) += sndio_common.h +TESTPROGS = timefilter + include $(SRC_PATH)/subdir.mak diff --git a/libavdevice/alsa-audio.h b/libavdevice/alsa-audio.h index 83af464865..e453a2011b 100644 --- a/libavdevice/alsa-audio.h +++ b/libavdevice/alsa-audio.h @@ -33,7 +33,7 @@ #include #include "config.h" #include "libavutil/log.h" -#include "libavformat/timefilter.h" +#include "timefilter.h" #include "avdevice.h" /* XXX: we make the assumption that the soundcard accepts this format */ diff --git a/libavdevice/jack_audio.c b/libavdevice/jack_audio.c index 72554f0ebe..42499d363b 100644 --- a/libavdevice/jack_audio.c +++ b/libavdevice/jack_audio.c @@ -28,7 +28,8 @@ #include "libavutil/fifo.h" #include "libavutil/opt.h" #include "libavcodec/avcodec.h" -#include "libavformat/timefilter.h" +#include "libavformat/avformat.h" +#include "timefilter.h" #include "avdevice.h" /** diff --git a/libavformat/timefilter.c b/libavdevice/timefilter.c similarity index 99% rename from libavformat/timefilter.c rename to libavdevice/timefilter.c index 5c780c8d90..3c67a59881 100644 --- a/libavformat/timefilter.c +++ b/libavdevice/timefilter.c @@ -24,8 +24,8 @@ #include "config.h" -#include "avformat.h" #include "timefilter.h" +#include "libavutil/mem.h" struct TimeFilter { /// Delay Locked Loop data. 
These variables refer to mathematical diff --git a/libavformat/timefilter.h b/libavdevice/timefilter.h similarity index 97% rename from libavformat/timefilter.h rename to libavdevice/timefilter.h index ded8ec7bed..4580904092 100644 --- a/libavformat/timefilter.h +++ b/libavdevice/timefilter.h @@ -22,8 +22,8 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#ifndef AVFORMAT_TIMEFILTER_H -#define AVFORMAT_TIMEFILTER_H +#ifndef AVDEVICE_TIMEFILTER_H +#define AVDEVICE_TIMEFILTER_H /** * Opaque type representing a time filter state @@ -94,4 +94,4 @@ void ff_timefilter_reset(TimeFilter *); */ void ff_timefilter_destroy(TimeFilter *); -#endif /* AVFORMAT_TIMEFILTER_H */ +#endif /* AVDEVICE_TIMEFILTER_H */ diff --git a/libavformat/Makefile b/libavformat/Makefile index ddd1ab7833..15ccb291ed 100644 --- a/libavformat/Makefile +++ b/libavformat/Makefile @@ -354,11 +354,8 @@ OBJS-$(CONFIG_RTP_PROTOCOL) += rtpproto.o OBJS-$(CONFIG_TCP_PROTOCOL) += tcp.o OBJS-$(CONFIG_UDP_PROTOCOL) += udp.o -# libavdevice dependencies -OBJS-$(CONFIG_ALSA_INDEV) += timefilter.o -OBJS-$(CONFIG_JACK_INDEV) += timefilter.o -TESTPROGS = seek timefilter +TESTPROGS = seek TOOLS = pktdumper probetest include $(SRC_PATH)/subdir.mak diff --git a/libavformat/flvdec.c b/libavformat/flvdec.c index 7025ddd014..d48d367500 100644 --- a/libavformat/flvdec.c +++ b/libavformat/flvdec.c @@ -228,8 +228,9 @@ static int amf_parse_object(AVFormatContext *s, AVStream *astream, AVStream *vst case AMF_DATA_TYPE_OBJECT: { unsigned int keylen; - if (vstream && ioc->seekable && key && !strcmp(KEYFRAMES_TAG, key) && depth == 1) - if (parse_keyframes_index(s, ioc, vstream, max_pos) < 0) + if ((vstream || astream) && ioc->seekable && key && !strcmp(KEYFRAMES_TAG, key) && depth == 1) + if (parse_keyframes_index(s, ioc, vstream ? vstream : astream, + max_pos) < 0) av_log(s, AV_LOG_ERROR, "Keyframe index parsing failed\n"); while(avio_tell(ioc) < max_pos - 2 && (keylen = avio_rb16(ioc))) { diff --git a/libavformat/flvenc.c b/libavformat/flvenc.c index 98c4170311..627bb6d3ab 100644 --- a/libavformat/flvenc.c +++ b/libavformat/flvenc.c @@ -60,10 +60,13 @@ typedef struct FLVContext { int64_t duration_offset; int64_t filesize_offset; int64_t duration; - int delay; ///< first dts delay for AVC - int64_t last_ts; } FLVContext; +typedef struct FLVStreamContext { + int delay; ///< first dts delay for each stream (needed for AVC & Speex) + int64_t last_ts; ///< last timestamp for each stream +} FLVStreamContext; + static int get_audio_flags(AVCodecContext *enc){ int flags = (enc->bits_per_coded_sample == 16) ? 
FLV_SAMPLESSIZE_16BIT : FLV_SAMPLESSIZE_8BIT; @@ -182,6 +185,7 @@ static int flv_write_header(AVFormatContext *s) for(i=0; inb_streams; i++){ AVCodecContext *enc = s->streams[i]->codec; + FLVStreamContext *sc; if (enc->codec_type == AVMEDIA_TYPE_VIDEO) { if (s->streams[i]->r_frame_rate.den && s->streams[i]->r_frame_rate.num) { framerate = av_q2d(s->streams[i]->r_frame_rate); @@ -199,6 +203,12 @@ static int flv_write_header(AVFormatContext *s) return -1; } av_set_pts_info(s->streams[i], 32, 1, 1000); /* 32 bit pts in ms */ + + sc = av_mallocz(sizeof(FLVStreamContext)); + if (!sc) + return AVERROR(ENOMEM); + s->streams[i]->priv_data = sc; + sc->last_ts = -1; } avio_write(pb, "FLV", 3); avio_w8(pb,1); @@ -218,8 +228,6 @@ static int flv_write_header(AVFormatContext *s) } } - flv->last_ts = -1; - /* write meta_tag */ avio_w8(pb, 18); // tag type META metadata_size_pos= avio_tell(pb); @@ -361,9 +369,10 @@ static int flv_write_trailer(AVFormatContext *s) /* Add EOS tag */ for (i = 0; i < s->nb_streams; i++) { AVCodecContext *enc = s->streams[i]->codec; + FLVStreamContext *sc = s->streams[i]->priv_data; if (enc->codec_type == AVMEDIA_TYPE_VIDEO && (enc->codec_id == CODEC_ID_H264 || enc->codec_id == CODEC_ID_MPEG4)) { - put_avc_eos_tag(pb, flv->last_ts); + put_avc_eos_tag(pb, sc->last_ts); } } @@ -384,6 +393,7 @@ static int flv_write_packet(AVFormatContext *s, AVPacket *pkt) AVIOContext *pb = s->pb; AVCodecContext *enc = s->streams[pkt->stream_index]->codec; FLVContext *flv = s->priv_data; + FLVStreamContext *sc = s->streams[pkt->stream_index]->priv_data; unsigned ts; int size= pkt->size; uint8_t *data= NULL; @@ -434,20 +444,20 @@ static int flv_write_packet(AVFormatContext *s, AVPacket *pkt) av_log(s, AV_LOG_ERROR, "malformated aac bitstream, use -absf aac_adtstoasc\n"); return -1; } - if (!flv->delay && pkt->dts < 0) - flv->delay = -pkt->dts; + if (!sc->delay && pkt->dts < 0) + sc->delay = -pkt->dts; - ts = pkt->dts + flv->delay; // add delay to force positive dts + ts = pkt->dts + sc->delay; // add delay to force positive dts /* check Speex packet duration */ - if (enc->codec_id == CODEC_ID_SPEEX && ts - flv->last_ts > 160) { + if (enc->codec_id == CODEC_ID_SPEEX && ts - sc->last_ts > 160) { av_log(s, AV_LOG_WARNING, "Warning: Speex stream has more than " "8 frames per packet. Adobe Flash " "Player cannot handle this!\n"); } - if (flv->last_ts < ts) - flv->last_ts = ts; + if (sc->last_ts < ts) + sc->last_ts = ts; avio_wb24(pb,size + flags_size); avio_wb24(pb,ts); @@ -471,7 +481,7 @@ static int flv_write_packet(AVFormatContext *s, AVPacket *pkt) avio_write(pb, data ? 
data : pkt->data, size); avio_wb32(pb,size+flags_size+11); // previous tag size - flv->duration = FFMAX(flv->duration, pkt->pts + flv->delay + pkt->duration); + flv->duration = FFMAX(flv->duration, pkt->pts + sc->delay + pkt->duration); avio_flush(pb); diff --git a/libavformat/mov.c b/libavformat/mov.c index 3c8bbc9ca1..198f3cd938 100644 --- a/libavformat/mov.c +++ b/libavformat/mov.c @@ -35,6 +35,7 @@ #include "riff.h" #include "isom.h" #include "libavcodec/get_bits.h" +#include "id3v1.h" #if CONFIG_ZLIB #include @@ -99,31 +100,48 @@ static int mov_metadata_track_or_disc_number(MOVContext *c, AVIOContext *pb, return 0; } -static int mov_metadata_int8(MOVContext *c, AVIOContext *pb, - unsigned len, const char *key) +static int mov_metadata_int8_bypass_padding(MOVContext *c, AVIOContext *pb, + unsigned len, const char *key) { - char buf[16]; + char buf[16]; + + /* bypass padding bytes */ + avio_r8(pb); + avio_r8(pb); + avio_r8(pb); + + snprintf(buf, sizeof(buf), "%hu", avio_r8(pb)); + av_dict_set(&c->fc->metadata, key, buf, 0); + + return 0; +} - /* bypass padding bytes */ - avio_r8(pb); - avio_r8(pb); - avio_r8(pb); +static int mov_metadata_int8_no_padding(MOVContext *c, AVIOContext *pb, + unsigned len, const char *key) +{ + char buf[16]; - snprintf(buf, sizeof(buf), "%hu", avio_r8(pb)); - av_dict_set(&c->fc->metadata, key, buf, 0); + snprintf(buf, sizeof(buf), "%hu", avio_r8(pb)); + av_dict_set(&c->fc->metadata, key, buf, 0); - return 0; + return 0; } -static int mov_metadata_stik(MOVContext *c, AVIOContext *pb, +static int mov_metadata_gnre(MOVContext *c, AVIOContext *pb, unsigned len, const char *key) { - char buf[16]; + short genre; + char buf[20]; - snprintf(buf, sizeof(buf), "%hu", avio_r8(pb)); - av_dict_set(&c->fc->metadata, key, buf, 0); + avio_r8(pb); // unknown - return 0; + genre = avio_r8(pb); + if (genre < 1 || genre > ID3v1_GENRE_MAX) + return 0; + snprintf(buf, sizeof(buf), "%s", ff_id3v1_genre_str[genre-1]); + av_dict_set(&c->fc->metadata, key, buf, 0); + + return 0; } static const uint32_t mac_to_unicode[128] = { @@ -189,6 +207,8 @@ static int mov_read_udta_string(MOVContext *c, AVIOContext *pb, MOVAtom atom) case MKTAG(0xa9,'a','l','b'): key = "album"; break; case MKTAG(0xa9,'d','a','y'): key = "date"; break; case MKTAG(0xa9,'g','e','n'): key = "genre"; break; + case MKTAG( 'g','n','r','e'): key = "genre"; + parse = mov_metadata_gnre; break; case MKTAG(0xa9,'t','o','o'): case MKTAG(0xa9,'s','w','r'): key = "encoder"; break; case MKTAG(0xa9,'e','n','c'): key = "encoder"; break; @@ -202,11 +222,15 @@ static int mov_read_udta_string(MOVContext *c, AVIOContext *pb, MOVAtom atom) case MKTAG( 'd','i','s','k'): key = "disc"; parse = mov_metadata_track_or_disc_number; break; case MKTAG( 't','v','e','s'): key = "episode_sort"; - parse = mov_metadata_int8; break; + parse = mov_metadata_int8_bypass_padding; break; case MKTAG( 't','v','s','n'): key = "season_number"; - parse = mov_metadata_int8; break; + parse = mov_metadata_int8_bypass_padding; break; case MKTAG( 's','t','i','k'): key = "media_type"; - parse = mov_metadata_stik; break; + parse = mov_metadata_int8_no_padding; break; + case MKTAG( 'h','d','v','d'): key = "hd_video"; + parse = mov_metadata_int8_no_padding; break; + case MKTAG( 'p','g','a','p'): key = "gapless_playback"; + parse = mov_metadata_int8_no_padding; break; } if (c->itunes_metadata && atom.size > 8) { diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c index 13c2f0fd48..5d0a9dfaeb 100644 --- a/libavutil/pixdesc.c +++ b/libavutil/pixdesc.c @@ -859,6 +859,29 
@@ const AVPixFmtDescriptor av_pix_fmt_descriptors[PIX_FMT_NB] = {
         },
         .flags = PIX_FMT_BE,
     },
+    [PIX_FMT_YUV422P9LE] = {
+        .name = "yuv422p9le",
+        .nb_components= 3,
+        .log2_chroma_w= 1,
+        .log2_chroma_h= 0,
+        .comp = {
+            {0,1,1,0,8},        /* Y */
+            {1,1,1,0,8},        /* U */
+            {2,1,1,0,8},        /* V */
+        },
+    },
+    [PIX_FMT_YUV422P9BE] = {
+        .name = "yuv422p9be",
+        .nb_components= 3,
+        .log2_chroma_w= 1,
+        .log2_chroma_h= 0,
+        .comp = {
+            {0,1,1,0,8},        /* Y */
+            {1,1,1,0,8},        /* U */
+            {2,1,1,0,8},        /* V */
+        },
+        .flags = PIX_FMT_BE,
+    },
     [PIX_FMT_YUV422P10LE] = {
         .name = "yuv422p10le",
         .nb_components= 3,
diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h
index f23d0c5054..25c9ef99ff 100644
--- a/libavutil/pixfmt.h
+++ b/libavutil/pixfmt.h
@@ -149,12 +149,15 @@ enum PixelFormat {
     PIX_FMT_YUV444P9LE, ///< planar YUV 4:4:4, 27bpp, (1 Cr & Cb sample per 1x1 Y samples), little-endian
     PIX_FMT_YUV444P10BE,///< planar YUV 4:4:4, 30bpp, (1 Cr & Cb sample per 1x1 Y samples), big-endian
     PIX_FMT_YUV444P10LE,///< planar YUV 4:4:4, 30bpp, (1 Cr & Cb sample per 1x1 Y samples), little-endian
+    PIX_FMT_YUV422P9BE, ///< planar YUV 4:2:2, 18bpp, (1 Cr & Cb sample per 2x1 Y samples), big-endian
+    PIX_FMT_YUV422P9LE, ///< planar YUV 4:2:2, 18bpp, (1 Cr & Cb sample per 2x1 Y samples), little-endian
     PIX_FMT_RGBA64BE,   ///< packed RGBA 16:16:16:16, 64bpp, 16R, 16G, 16B, 16A, the 2-byte value for each R/G/B/A component is stored as big-endian
     PIX_FMT_RGBA64LE,   ///< packed RGBA 16:16:16:16, 64bpp, 16R, 16G, 16B, 16A, the 2-byte value for each R/G/B/A component is stored as little-endian
     PIX_FMT_BGRA64BE,   ///< packed RGBA 16:16:16:16, 64bpp, 16B, 16G, 16R, 16A, the 2-byte value for each R/G/B/A component is stored as big-endian
     PIX_FMT_BGRA64LE,   ///< packed RGBA 16:16:16:16, 64bpp, 16B, 16G, 16R, 16A, the 2-byte value for each R/G/B/A component is stored as little-endian
     PIX_FMT_GBR24P,     ///< planar GBR, 24bpp, 8G, 8B, 8R.
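Aside, not part of the patch: the new descriptors above encode 9-bit depth as depth_minus1 == 8 and 4:2:2 subsampling as log2_chroma_w == 1 with log2_chroma_h == 0. A minimal sketch of reading those fields back through the public pixdesc API of this period (the helper name describe_fmt is made up for illustration):

#include <stdio.h>
#include "libavutil/pixdesc.h"

static void describe_fmt(enum PixelFormat fmt)
{
    const AVPixFmtDescriptor *d = &av_pix_fmt_descriptors[fmt];
    /* For PIX_FMT_YUV422P9LE this prints: "yuv422p9le: 9 bits, chroma 2x1 subsampled" */
    printf("%s: %d bits, chroma %dx%d subsampled\n", d->name,
           d->comp[0].depth_minus1 + 1,
           1 << d->log2_chroma_w, 1 << d->log2_chroma_h);
}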
+    PIX_FMT_NB,        ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions
 };
@@ -182,6 +185,7 @@ enum PixelFormat {
 #define PIX_FMT_BGR444 PIX_FMT_NE(BGR444BE, BGR444LE)
 
 #define PIX_FMT_YUV420P9  PIX_FMT_NE(YUV420P9BE , YUV420P9LE)
+#define PIX_FMT_YUV422P9  PIX_FMT_NE(YUV422P9BE , YUV422P9LE)
 #define PIX_FMT_YUV444P9  PIX_FMT_NE(YUV444P9BE , YUV444P9LE)
 #define PIX_FMT_YUV420P10 PIX_FMT_NE(YUV420P10BE, YUV420P10LE)
 #define PIX_FMT_YUV422P10 PIX_FMT_NE(YUV422P10BE, YUV422P10LE)
diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
index fc4781b532..c486b06425 100644
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@@ -536,6 +536,18 @@
 %endif
 %endmacro
 
+%macro SPLATD_MMX 1
+    punpckldq %1, %1
+%endmacro
+
+%macro SPLATD_SSE 1
+    shufps %1, %1, 0
+%endmacro
+
+%macro SPLATD_SSE2 1
+    pshufd %1, %1, 0
+%endmacro
+
 %macro CLIPW 3 ;(dst, min, max)
     pmaxsw %1, %2
     pminsw %1, %3
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index 31346a3521..29b7b4c212 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -2843,6 +2843,7 @@ static av_cold void sws_init_swScale_c(SwsContext *c)
     case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV_c; break;
 #if HAVE_BIGENDIAN
     case PIX_FMT_YUV444P9LE:
+    case PIX_FMT_YUV422P9LE:
     case PIX_FMT_YUV420P9LE:
     case PIX_FMT_YUV422P10LE:
     case PIX_FMT_YUV420P10LE:
@@ -2852,6 +2853,7 @@ static av_cold void sws_init_swScale_c(SwsContext *c)
     case PIX_FMT_YUV444P16LE: c->chrToYV12 = bswap16UV_c; break;
 #else
     case PIX_FMT_YUV444P9BE:
+    case PIX_FMT_YUV422P9BE:
     case PIX_FMT_YUV420P9BE:
     case PIX_FMT_YUV444P10BE:
     case PIX_FMT_YUV422P10BE:
@@ -2912,6 +2914,7 @@ static av_cold void sws_init_swScale_c(SwsContext *c)
     switch (srcFormat) {
 #if HAVE_BIGENDIAN
     case PIX_FMT_YUV444P9LE:
+    case PIX_FMT_YUV422P9LE:
     case PIX_FMT_YUV420P9LE:
     case PIX_FMT_YUV422P10LE:
     case PIX_FMT_YUV420P10LE:
@@ -2922,6 +2925,7 @@ static av_cold void sws_init_swScale_c(SwsContext *c)
     case PIX_FMT_GRAY16LE: c->lumToYV12 = bswap16Y_c; break;
 #else
     case PIX_FMT_YUV444P9BE:
+    case PIX_FMT_YUV422P9BE:
     case PIX_FMT_YUV420P9BE:
     case PIX_FMT_YUV444P10BE:
     case PIX_FMT_YUV422P10BE:
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 0e62b38bcc..c8e8685f72 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -547,6 +547,8 @@ const char *sws_format_name(enum PixelFormat format);
 #define isNBPS(x)       ( \
            (x)==PIX_FMT_YUV420P9LE  \
         || (x)==PIX_FMT_YUV420P9BE  \
+        || (x)==PIX_FMT_YUV422P9LE  \
+        || (x)==PIX_FMT_YUV422P9BE  \
         || (x)==PIX_FMT_YUV444P9BE  \
         || (x)==PIX_FMT_YUV444P9LE  \
         || (x)==PIX_FMT_YUV422P10BE \
@@ -574,6 +576,7 @@ const char *sws_format_name(enum PixelFormat format);
 #define isPlanarYUV(x)  ( \
            isPlanar8YUV(x)          \
         || (x)==PIX_FMT_YUV420P9LE  \
+        || (x)==PIX_FMT_YUV422P9LE  \
         || (x)==PIX_FMT_YUV444P9LE  \
         || (x)==PIX_FMT_YUV420P10LE \
         || (x)==PIX_FMT_YUV422P10LE \
@@ -583,6 +586,7 @@ const char *sws_format_name(enum PixelFormat format);
         || (x)==PIX_FMT_YUV422P16LE \
         || (x)==PIX_FMT_YUV444P16LE \
         || (x)==PIX_FMT_YUV420P9BE  \
+        || (x)==PIX_FMT_YUV422P9BE  \
         || (x)==PIX_FMT_YUV444P9BE  \
         || (x)==PIX_FMT_YUV420P10BE \
         || (x)==PIX_FMT_YUV422P10BE \
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 6c2a8de120..673dae0adc 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -136,6 +136,8 @@ const static FormatEntry format_entries[PIX_FMT_NB] = {
     [PIX_FMT_YUV420P9LE]  = { 1 , 1 },
     [PIX_FMT_YUV420P10BE] = { 1 , 1 },
     [PIX_FMT_YUV420P10LE] = { 1 , 1 },
+    [PIX_FMT_YUV422P9BE]  = { 1 , 1 },
+    [PIX_FMT_YUV422P9LE]  = { 1 , 1 },
     [PIX_FMT_YUV422P10BE] = { 1 , 1 },
     [PIX_FMT_YUV422P10LE] = { 1 , 1 },
     [PIX_FMT_YUV444P9BE]  = { 1 , 1 },
@@ -280,15 +282,18 @@ static int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSi
             if (flags & SWS_BICUBIC) {
                 int64_t B= (param[0] != SWS_PARAM_DEFAULT ? param[0] :   0) * (1<<24);
                 int64_t C= (param[1] != SWS_PARAM_DEFAULT ? param[1] : 0.6) * (1<<24);
-                int64_t dd = ( d*d)>>30;
-                int64_t ddd= (dd*d)>>30;
-                if      (d < 1LL<<30)
-                    coeff = (12*(1<<24)-9*B-6*C)*ddd + (-18*(1<<24)+12*B+6*C)*dd + (6*(1<<24)-2*B)*(1<<30);
-                else if (d < 1LL<<31)
-                    coeff = (-B-6*C)*ddd + (6*B+30*C)*dd + (-12*B-48*C)*d + (8*B+24*C)*(1<<30);
-                else
-                    coeff=0.0;
+                if (d >= 1LL<<31) {
+                    coeff = 0.0;
+                } else {
+                    int64_t dd  = (d  * d) >> 30;
+                    int64_t ddd = (dd * d) >> 30;
+
+                    if (d < 1LL<<30)
+                        coeff = (12*(1<<24)-9*B-6*C)*ddd + (-18*(1<<24)+12*B+6*C)*dd + (6*(1<<24)-2*B)*(1<<30);
+                    else
+                        coeff = (-B-6*C)*ddd + (6*B+30*C)*dd + (-12*B-48*C)*d + (8*B+24*C)*(1<<30);
+                }
                 coeff *= fone>>(30+24);
             } /* else if (flags & SWS_X) {
diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c
index 36182a5ea9..eb478e2186 100644
--- a/libswscale/yuv2rgb.c
+++ b/libswscale/yuv2rgb.c
@@ -790,8 +790,8 @@ av_cold int ff_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4], int
         y_table32 = c->yuvTable;
         yb = -(384<<16) - oy;
         for (i = 0; i < 1024; i++) {
-            uint8_t yval = av_clip_uint8((yb + 0x8000) >> 16);
-            y_table32[i      ] = (yval << rbase) + (needAlpha ? 0 : (255 << abase));
+            unsigned yval = av_clip_uint8((yb + 0x8000) >> 16);
+            y_table32[i      ] = (yval << rbase) + (needAlpha ? 0 : (255u << abase));
             y_table32[i+1024] = yval << gbase;
             y_table32[i+2048] = yval << bbase;
             yb += cy;
diff --git a/tests/ref/lavfi/pixdesc b/tests/ref/lavfi/pixdesc
index 8bf3801524..0b18dfb7da 100644
--- a/tests/ref/lavfi/pixdesc
+++ b/tests/ref/lavfi/pixdesc
@@ -42,6 +42,8 @@ yuv422p10be bdc13b630fd668b34c6fe1aae28dfc71
 yuv422p10le d0607c260a45c973e6639f4e449730ad
 yuv422p16be 4e9b3b3467aeebb6a528cee5966800ed
 yuv422p16le f87c81bf16916b64d201359be0b4b6f4
+yuv422p9be 29b71579946940a8c00fa844c9dff507
+yuv422p9le 062b7f9cbb972bf36b5bdb1a7623701a
 yuv440p 5a064afe2b453bb52cdb3f176b1aa1cf
 yuv444p 0a98447b78fd476aa39686da6a74fa2e
 yuv444p10be e65cbae7e4f1892c23defbc8e8052cf6
diff --git a/tests/ref/lavfi/pixfmts_copy b/tests/ref/lavfi/pixfmts_copy
index 8bf3801524..0b18dfb7da 100644
--- a/tests/ref/lavfi/pixfmts_copy
+++ b/tests/ref/lavfi/pixfmts_copy
@@ -42,6 +42,8 @@ yuv422p10be bdc13b630fd668b34c6fe1aae28dfc71
 yuv422p10le d0607c260a45c973e6639f4e449730ad
 yuv422p16be 4e9b3b3467aeebb6a528cee5966800ed
 yuv422p16le f87c81bf16916b64d201359be0b4b6f4
+yuv422p9be 29b71579946940a8c00fa844c9dff507
+yuv422p9le 062b7f9cbb972bf36b5bdb1a7623701a
 yuv440p 5a064afe2b453bb52cdb3f176b1aa1cf
 yuv444p 0a98447b78fd476aa39686da6a74fa2e
 yuv444p10be e65cbae7e4f1892c23defbc8e8052cf6
diff --git a/tests/ref/lavfi/pixfmts_null b/tests/ref/lavfi/pixfmts_null
index 8bf3801524..0b18dfb7da 100644
--- a/tests/ref/lavfi/pixfmts_null
+++ b/tests/ref/lavfi/pixfmts_null
@@ -42,6 +42,8 @@ yuv422p10be bdc13b630fd668b34c6fe1aae28dfc71
 yuv422p10le d0607c260a45c973e6639f4e449730ad
 yuv422p16be 4e9b3b3467aeebb6a528cee5966800ed
 yuv422p16le f87c81bf16916b64d201359be0b4b6f4
+yuv422p9be 29b71579946940a8c00fa844c9dff507
+yuv422p9le 062b7f9cbb972bf36b5bdb1a7623701a
 yuv440p 5a064afe2b453bb52cdb3f176b1aa1cf
 yuv444p 0a98447b78fd476aa39686da6a74fa2e
 yuv444p10be e65cbae7e4f1892c23defbc8e8052cf6
diff --git a/tests/ref/lavfi/pixfmts_scale b/tests/ref/lavfi/pixfmts_scale
index eee76706d3..e17544dbe6 100644
--- a/tests/ref/lavfi/pixfmts_scale
+++ b/tests/ref/lavfi/pixfmts_scale
@@ -42,6 +42,8 @@ yuv422p10be cea7ca6b0e66d6f29539885896c88603
 yuv422p10le a10c4a5837547716f13cd61918b145f9
 yuv422p16be 285993ee0c0f4f8e511ee46f93c5f38c
 yuv422p16le 61bfcee8e54465f760164f5a75d40b5e
+yuv422p9be 82494823944912f73cebc58ad2979bbd
+yuv422p9le fc69c8a21f473916a4b4225636b97e06
 yuv440p 461503fdb9b90451020aa3b25ddf041c
 yuv444p 81b2eba962d12e8d64f003ac56f6faf2
 yuv444p10be e9d3c8e744b8b0d8187ca092fa203fc9
diff --git a/tests/ref/lavfi/pixfmts_vflip b/tests/ref/lavfi/pixfmts_vflip
index 763e0bb069..c8301e380f 100644
--- a/tests/ref/lavfi/pixfmts_vflip
+++ b/tests/ref/lavfi/pixfmts_vflip
@@ -42,6 +42,8 @@ yuv422p10be 588fe319b96513c32e21d3e32b45447f
 yuv422p10le 11b57f2bd9661024153f3973b9090cdb
 yuv422p16be c092d083548c2a144c372a98c46875c7
 yuv422p16le c071b9397a416d51cbe339345cbcba84
+yuv422p9be 7c6f1e140b3999ee7d923854e507752a
+yuv422p9le 51f10d79c07989060dd06e767e6d7d60
 yuv440p 876385e96165acf51271b20e5d85a416
 yuv444p 9c3c667d1613b72d15bc6d851c5eb8f7
 yuv444p10be 944a4997c4edb3a8dd0f0493cfd5a1fd
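
Note on the new SPLATD_* macros in x86util.asm (not part of the patch itself): each variant broadcasts the low 32-bit element of a register to all lanes, using the cheapest instruction available for that instruction set. A rough C-intrinsics equivalent of the SSE2 variant, with an illustrative function name, would be:

    #include <emmintrin.h>  /* SSE2 intrinsics */

    /* Sketch only: broadcast the low 32-bit lane of v to all four lanes,
     * the same operation as "pshufd %1, %1, 0" in SPLATD_SSE2.  The name
     * splatd_sse2 is hypothetical, not an API of the patch. */
    static inline __m128i splatd_sse2(__m128i v)
    {
        return _mm_shuffle_epi32(v, 0x00);
    }

The MMX variant has to make do with punpckldq (duplicating the low dword of a 64-bit register), and the SSE variant uses shufps for the same effect on a float register.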
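Note on the initFilter() hunk in libswscale/utils.c: the rewrite only hoists the d >= 2 early-out so that dd and ddd are computed when they are actually used; the polynomial itself is unchanged. For orientation, the same bicubic kernel in plain floating point looks roughly like the sketch below. This is a reference sketch with an illustrative name, not libswscale code; the fixed-point version above keeps B, C and the constant terms scaled by 1<<24 and distances by 1<<30, and leaves out the usual 1/6 normalization, which is folded into the later "coeff *= fone>>(30+24)" step.

    #include <math.h>

    /* d: distance from the sample position in pixels,
     * B, C: the SWS_BICUBIC tuning parameters (defaults 0 and 0.6 above). */
    static double bicubic_weight(double d, double B, double C)
    {
        d = fabs(d);
        if (d < 1.0)
            return (12 - 9*B - 6*C) * d*d*d + (-18 + 12*B + 6*C) * d*d + (6 - 2*B);
        else if (d < 2.0)
            return (-B - 6*C) * d*d*d + (6*B + 30*C) * d*d + (-12*B - 48*C) * d + (8*B + 24*C);
        else
            return 0.0;
    }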
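Note on the yuv2rgb.c hunk: switching yval to unsigned and writing 255u matters because the value is shifted by rbase/abase, which can be 24 for 32-bit output formats. With signed int operands the shift can reach the sign bit, which is undefined behaviour in C; with unsigned operands the same shift is well defined. A minimal illustration (the name pack_channel and the parameter base are purely hypothetical):

    #include <stdint.h>

    /* val << 24 is well defined here because val is unsigned;
     * the equivalent shift on a promoted signed int would not be. */
    static inline uint32_t pack_channel(unsigned val, int base)
    {
        return (uint32_t)(val << base);
    }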