|
|
|
/*
|
|
|
|
* Copyright (c) 2006 Konstantin Shishkov
|
|
|
|
* Copyright (c) 2007 Loren Merritt
|
|
|
|
*
|
|
|
|
* This file is part of FFmpeg.
|
|
|
|
*
|
|
|
|
* FFmpeg is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
|
|
* License as published by the Free Software Foundation; either
|
|
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
|
|
*
|
|
|
|
* FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
* Lesser General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
|
|
* License along with FFmpeg; if not, write to the Free Software
|
|
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
*/
|
|
|
|
|
|
|
|
/**
 * @file
 * Huffman tree builder and VLC generator
 */
|
|
|
|
|
|
|
|
#include <stdint.h>
|
|
|
|
|
avcodec/huffman: replace qsort with AV_QSORT
ff_huff_build_tree uses qsort underneath. AV_QSORT is substantially
faster due to the inlining of the comparison callback. Furthermore, this
code is reasonably performance critical, since in e.g the fraps codec,
ff_huff_build_tree is called on every frame. This routine is also called
in vp6 on every frame in some circumstances.
Sample benchmark (x86-64, Haswell, GNU/Linux), vp6 from FATE:
vp6 (old):
78930 decicycles in qsort, 1 runs, 0 skips
45330 decicycles in qsort, 2 runs, 0 skips
27825 decicycles in qsort, 4 runs, 0 skips
17471 decicycles in qsort, 8 runs, 0 skips
12296 decicycles in qsort, 16 runs, 0 skips
9554 decicycles in qsort, 32 runs, 0 skips
8404 decicycles in qsort, 64 runs, 0 skips
7405 decicycles in qsort, 128 runs, 0 skips
6740 decicycles in qsort, 256 runs, 0 skips
7540 decicycles in qsort, 512 runs, 0 skips
9498 decicycles in qsort, 1024 runs, 0 skips
9938 decicycles in qsort, 2048 runs, 0 skips
8043 decicycles in qsort, 4095 runs, 1 skips
vp6 (new):
15880 decicycles in qsort, 1 runs, 0 skips
10730 decicycles in qsort, 2 runs, 0 skips
10155 decicycles in qsort, 4 runs, 0 skips
7805 decicycles in qsort, 8 runs, 0 skips
6883 decicycles in qsort, 16 runs, 0 skips
6305 decicycles in qsort, 32 runs, 0 skips
5854 decicycles in qsort, 64 runs, 0 skips
5152 decicycles in qsort, 128 runs, 0 skips
4452 decicycles in qsort, 256 runs, 0 skips
4161 decicycles in qsort, 511 runs, 1 skips
4081 decicycles in qsort, 1023 runs, 1 skips
4072 decicycles in qsort, 2047 runs, 1 skips
4004 decicycles in qsort, 4095 runs, 1 skips
Reviewed-by: Timothy Gu <timothygu99@gmail.com>
Reviewed-by: Michael Niedermayer <michael@niedermayer.cc>
Signed-off-by: Ganesh Ajjanagadde <gajjanagadde@gmail.com>
9 years ago
|
|
|
#include "libavutil/qsort.h"
|
|
|
|
#include "libavutil/common.h"
|
|
|
|
|
|
|
|
#include "avcodec.h"
|
|
|
|
#include "huffman.h"
|
|
|
|
#include "vlc.h"
|
|
|
|
|
|
|
|
/* Value stored in Node.sym to mark an internal (non-leaf) Huffman tree node */
#define HNODE (-1)
|
|
|
|
|
|
|
|
/** One entry of the min-heap used while generating code lengths. */
typedef struct HeapElem {
    uint64_t val;   /* ordering key; the heap keeps the smallest val at index 0 */
    int name;       /* index of the tree node this entry stands for */
} HeapElem;
|
|
|
|
|
|
|
|
/**
 * Sift the element at index root down until the min-heap property holds.
 *
 * The element is repeatedly swapped with its smaller child as long as that
 * child has a strictly smaller val.
 *
 * @param h    heap array ordered by HeapElem.val
 * @param root index of the element to move down
 * @param size number of elements in h
 */
static void heap_sift(HeapElem *h, int root, int size)
{
    for (;;) {
        int left = root * 2 + 1;
        int smallest;

        if (left >= size)
            return;                 /* root has no children left */

        /* pick the smaller of the (up to two) children */
        smallest = left;
        if (left < size - 1 && h[left + 1].val < h[left].val)
            smallest = left + 1;

        if (h[smallest].val >= h[root].val)
            return;                 /* heap property already satisfied */

        FFSWAP(HeapElem, h[root], h[smallest]);
        root = smallest;
    }
}
|
|
|
|
|
|
|
|
int ff_huff_gen_len_table(uint8_t *dst, const uint64_t *stats, int stats_size, int skip0)
|
|
|
|
{
|
|
|
|
HeapElem *h = av_malloc_array(sizeof(*h), stats_size);
|
|
|
|
int *up = av_malloc_array(sizeof(*up) * 2, stats_size);
|
|
|
|
uint8_t *len = av_malloc_array(sizeof(*len) * 2, stats_size);
|
|
|
|
uint16_t *map= av_malloc_array(sizeof(*map), stats_size);
|
|
|
|
int offset, i, next;
|
|
|
|
int size = 0;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
if (!h || !up || !len || !map) {
|
|
|
|
ret = AVERROR(ENOMEM);
|
|
|
|
goto end;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 0; i<stats_size; i++) {
|
|
|
|
dst[i] = 255;
|
|
|
|
if (stats[i] || !skip0)
|
|
|
|
map[size++] = i;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (offset = 1; ; offset <<= 1) {
|
|
|
|
for (i=0; i < size; i++) {
|
|
|
|
h[i].name = i;
|
|
|
|
h[i].val = (stats[map[i]] << 14) + offset;
|
|
|
|
}
|
|
|
|
for (i = size / 2 - 1; i >= 0; i--)
|
|
|
|
heap_sift(h, i, size);
|
|
|
|
|
|
|
|
for (next = size; next < size * 2 - 1; next++) {
|
|
|
|
// merge the two smallest entries, and put it back in the heap
|
|
|
|
uint64_t min1v = h[0].val;
|
|
|
|
up[h[0].name] = next;
|
|
|
|
h[0].val = INT64_MAX;
|
|
|
|
heap_sift(h, 0, size);
|
|
|
|
up[h[0].name] = next;
|
|
|
|
h[0].name = next;
|
|
|
|
h[0].val += min1v;
|
|
|
|
heap_sift(h, 0, size);
|
|
|
|
}
|
|
|
|
|
|
|
|
len[2 * size - 2] = 0;
|
|
|
|
for (i = 2 * size - 3; i >= size; i--)
|
|
|
|
len[i] = len[up[i]] + 1;
|
|
|
|
for (i = 0; i < size; i++) {
|
|
|
|
dst[map[i]] = len[up[i]] + 1;
|
|
|
|
if (dst[map[i]] >= 32) break;
|
|
|
|
}
|
|
|
|
if (i==size) break;
|
|
|
|
}
|
|
|
|
end:
|
|
|
|
av_free(h);
|
|
|
|
av_free(up);
|
|
|
|
av_free(len);
|
|
|
|
av_free(map);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void get_tree_codes(uint32_t *bits, int16_t *lens, uint8_t *xlat,
|
|
|
|
Node *nodes, int node,
|
|
|
|
uint32_t pfx, int pl, int *pos, int no_zero_count)
|
|
|
|
{
|
|
|
|
int s;
|
|
|
|
|
|
|
|
s = nodes[node].sym;
|
|
|
|
if (s != HNODE || (no_zero_count && !nodes[node].count)) {
|
|
|
|
bits[*pos] = pfx;
|
|
|
|
lens[*pos] = pl;
|
|
|
|
xlat[*pos] = s;
|
|
|
|
(*pos)++;
|
|
|
|
} else {
|
|
|
|
pfx <<= 1;
|
|
|
|
pl++;
|
|
|
|
get_tree_codes(bits, lens, xlat, nodes, nodes[node].n0, pfx, pl,
|
|
|
|
pos, no_zero_count);
|
|
|
|
pfx |= 1;
|
|
|
|
get_tree_codes(bits, lens, xlat, nodes, nodes[node].n0 + 1, pfx, pl,
|
|
|
|
pos, no_zero_count);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
 * Collect the codes of the tree rooted at nodes[head] and initialise vlc
 * from them.
 *
 * @param vlc     VLC to initialise
 * @param nodes   tree nodes
 * @param head    index of the root node
 * @param flags   FF_HUFFMAN_FLAG_* bits; only FF_HUFFMAN_FLAG_ZERO_COUNT is
 *                examined here
 * @param nb_bits passed through to ff_init_vlc_sparse()
 * @return the return value of ff_init_vlc_sparse()
 */
static int build_huff_tree(VLC *vlc, Node *nodes, int head, int flags, int nb_bits)
{
    uint32_t code_bits[256];
    int16_t  code_lens[256];
    uint8_t  code_syms[256];
    int nb_found = 0;
    const int no_zero_count = (flags & FF_HUFFMAN_FLAG_ZERO_COUNT) == 0;

    get_tree_codes(code_bits, code_lens, code_syms, nodes, head, 0, 0,
                   &nb_found, no_zero_count);

    return ff_init_vlc_sparse(vlc, nb_bits, nb_found,
                              code_lens, 2, 2,
                              code_bits, 4, 4,
                              code_syms, 1, 1, 0);
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* nodes size must be 2*nb_codes
|
|
|
|
* first nb_codes nodes.count must be set
|
|
|
|
*/
|
|
|
|
int ff_huff_build_tree(AVCodecContext *avctx, VLC *vlc, int nb_codes, int nb_bits,
|
|
|
|
Node *nodes, HuffCmp cmp, int flags)
|
|
|
|
{
|
|
|
|
int i, j;
|
|
|
|
int cur_node;
|
|
|
|
int64_t sum = 0;
|
|
|
|
|
|
|
|
for (i = 0; i < nb_codes; i++) {
|
|
|
|
nodes[i].sym = i;
|
|
|
|
nodes[i].n0 = -2;
|
|
|
|
sum += nodes[i].count;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (sum >> 31) {
|
|
|
|
av_log(avctx, AV_LOG_ERROR,
|
|
|
|
"Too high symbol frequencies. "
|
|
|
|
"Tree construction is not possible\n");
|
|
|
|
return -1;
|
|
|
|
}
|
avcodec/huffman: replace qsort with AV_QSORT
ff_huff_build_tree uses qsort underneath. AV_QSORT is substantially
faster due to the inlining of the comparison callback. Furthermore, this
code is reasonably performance critical, since in e.g the fraps codec,
ff_huff_build_tree is called on every frame. This routine is also called
in vp6 on every frame in some circumstances.
Sample benchmark (x86-64, Haswell, GNU/Linux), vp6 from FATE:
vp6 (old):
78930 decicycles in qsort, 1 runs, 0 skips
45330 decicycles in qsort, 2 runs, 0 skips
27825 decicycles in qsort, 4 runs, 0 skips
17471 decicycles in qsort, 8 runs, 0 skips
12296 decicycles in qsort, 16 runs, 0 skips
9554 decicycles in qsort, 32 runs, 0 skips
8404 decicycles in qsort, 64 runs, 0 skips
7405 decicycles in qsort, 128 runs, 0 skips
6740 decicycles in qsort, 256 runs, 0 skips
7540 decicycles in qsort, 512 runs, 0 skips
9498 decicycles in qsort, 1024 runs, 0 skips
9938 decicycles in qsort, 2048 runs, 0 skips
8043 decicycles in qsort, 4095 runs, 1 skips
vp6 (new):
15880 decicycles in qsort, 1 runs, 0 skips
10730 decicycles in qsort, 2 runs, 0 skips
10155 decicycles in qsort, 4 runs, 0 skips
7805 decicycles in qsort, 8 runs, 0 skips
6883 decicycles in qsort, 16 runs, 0 skips
6305 decicycles in qsort, 32 runs, 0 skips
5854 decicycles in qsort, 64 runs, 0 skips
5152 decicycles in qsort, 128 runs, 0 skips
4452 decicycles in qsort, 256 runs, 0 skips
4161 decicycles in qsort, 511 runs, 1 skips
4081 decicycles in qsort, 1023 runs, 1 skips
4072 decicycles in qsort, 2047 runs, 1 skips
4004 decicycles in qsort, 4095 runs, 1 skips
Reviewed-by: Timothy Gu <timothygu99@gmail.com>
Reviewed-by: Michael Niedermayer <michael@niedermayer.cc>
Signed-off-by: Ganesh Ajjanagadde <gajjanagadde@gmail.com>
9 years ago
|
|
|
AV_QSORT(nodes, nb_codes, Node, cmp);
|
|
|
|
cur_node = nb_codes;
|
|
|
|
nodes[nb_codes*2-1].count = 0;
|
|
|
|
for (i = 0; i < nb_codes * 2 - 1; i += 2) {
|
|
|
|
uint32_t cur_count = nodes[i].count + nodes[i+1].count;
|
|
|
|
// find correct place to insert new node, and
|
|
|
|
// make space for the new node while at it
|
|
|
|
for(j = cur_node; j > i + 2; j--){
|
|
|
|
if(cur_count > nodes[j-1].count ||
|
|
|
|
(cur_count == nodes[j-1].count &&
|
|
|
|
!(flags & FF_HUFFMAN_FLAG_HNODE_FIRST)))
|
|
|
|
break;
|
|
|
|
nodes[j] = nodes[j - 1];
|
|
|
|
}
|
|
|
|
nodes[j].sym = HNODE;
|
|
|
|
nodes[j].count = cur_count;
|
|
|
|
nodes[j].n0 = i;
|
|
|
|
cur_node++;
|
|
|
|
}
|
|
|
|
if (build_huff_tree(vlc, nodes, nb_codes * 2 - 2, flags, nb_bits) < 0) {
|
|
|
|
av_log(avctx, AV_LOG_ERROR, "Error building tree\n");
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|