From 40e26453c48303feaecb76c8c05f342b934cd676 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Reimar=20D=C3=B6ffinger?= Date: Mon, 21 Sep 2009 10:28:31 +0000 Subject: [PATCH] Use a custom radix sort implementation instead of qsort in dnxhd encoder. This is mainly to avoid test failures due to implementation-defined behaviour of qsort when elements are equal, giving different results for each of FreeBSD, Linux/glibc and Solaris. In addition it is about 35 % faster, effect on overall speed is minimal though (< 2%). Regression tests are unchanged (i.e. identical to Linux/glibc). Originally committed as revision 19949 to svn://svn.ffmpeg.org/ffmpeg/trunk --- libavcodec/dnxhdenc.c | 57 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 54 insertions(+), 3 deletions(-) diff --git a/libavcodec/dnxhdenc.c b/libavcodec/dnxhdenc.c index 3e68ef2dcc..16632b8d09 100644 --- a/libavcodec/dnxhdenc.c +++ b/libavcodec/dnxhdenc.c @@ -651,9 +651,60 @@ static int dnxhd_find_qscale(DNXHDEncContext *ctx) return 0; } -static int dnxhd_rc_cmp(const void *a, const void *b) +#define BUCKET_BITS 8 +#define RADIX_PASSES 4 +#define NBUCKETS (1 << BUCKET_BITS) + +static inline int get_bucket(int value, int shift) +{ + value >>= shift; + value &= NBUCKETS - 1; + return NBUCKETS - 1 - value; +} + +static void radix_count(const RCCMPEntry *data, int size, int buckets[RADIX_PASSES][NBUCKETS]) +{ + int i, j; + memset(buckets, 0, sizeof(buckets[0][0]) * RADIX_PASSES * NBUCKETS); + for (i = 0; i < size; i++) { + int v = data[i].value; + for (j = 0; j < RADIX_PASSES; j++) { + buckets[j][get_bucket(v, 0)]++; + v >>= BUCKET_BITS; + } + assert(!v); + } + for (j = 0; j < RADIX_PASSES; j++) { + int offset = size; + for (i = NBUCKETS - 1; i >= 0; i--) + buckets[j][i] = offset -= buckets[j][i]; + assert(!buckets[j][0]); + } +} + +static void radix_sort_pass(RCCMPEntry *dst, const RCCMPEntry *data, int size, int buckets[NBUCKETS], int pass) +{ + int shift = pass * BUCKET_BITS; + int i; + for (i = 0; i < size; i++) { + int v = get_bucket(data[i].value, shift); + int pos = buckets[v]++; + dst[pos] = data[i]; + } +} + +static void radix_sort(RCCMPEntry *data, int size) { - return ((const RCCMPEntry *)b)->value - ((const RCCMPEntry *)a)->value; + int buckets[RADIX_PASSES][NBUCKETS]; + RCCMPEntry *tmp = av_malloc(sizeof(*tmp) * size); + radix_count(data, size, buckets); + radix_sort_pass(tmp, data, size, buckets[0], 0); + radix_sort_pass(data, tmp, size, buckets[1], 1); + if (buckets[2][NBUCKETS - 1] || buckets[3][NBUCKETS - 1]) { + radix_sort_pass(tmp, data, size, buckets[2], 2); + radix_sort_pass(data, tmp, size, buckets[3], 3); + } + av_free(tmp); } static int dnxhd_encode_fast(AVCodecContext *avctx, DNXHDEncContext *ctx) @@ -682,7 +733,7 @@ static int dnxhd_encode_fast(AVCodecContext *avctx, DNXHDEncContext *ctx) if (!ret) { if (RC_VARIANCE) avctx->execute(avctx, dnxhd_mb_var_thread, (void**)&ctx->thread[0], NULL, avctx->thread_count, sizeof(void*)); - qsort(ctx->mb_cmp, ctx->m.mb_num, sizeof(RCEntry), dnxhd_rc_cmp); + radix_sort(ctx->mb_cmp, ctx->m.mb_num); for (x = 0; x < ctx->m.mb_num && max_bits > ctx->frame_bits; x++) { int mb = ctx->mb_cmp[x].mb; max_bits -= ctx->mb_rc[ctx->qscale][mb].bits - ctx->mb_rc[ctx->qscale+1][mb].bits;