FFmpeg/libavutil/intmath.h

/*
 * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVUTIL_INTMATH_H
#define AVUTIL_INTMATH_H

#include <stdint.h>

#include "config.h"
#include "attributes.h"

#if ARCH_ARM
#   include "arm/intmath.h"
#endif
#if ARCH_X86
#   include "x86/intmath.h"
#endif

/*
 * Builtin-based ff_log2.
 *
 * Used only when the build configuration sets HAVE_FAST_CLZ, the compiler
 * passes the AV_GCC_VERSION_AT_LEAST(3,4) check, and ff_log2 has not been
 * defined already (presumably by one of the architecture headers included
 * above -- they are not visible from this file).
 */
#if HAVE_FAST_CLZ
#if AV_GCC_VERSION_AT_LEAST(3,4)
#ifndef ff_log2
/* OR-ing the argument with 1 keeps __builtin_clz away from a zero input, for
 * which the builtin's result is undefined; the low bit never changes the
 * position of the highest set bit of a non-zero value.
 * NOTE(review): the constant 31 assumes a 32-bit unsigned int. */
#   define ff_log2(x) (31 - __builtin_clz((x)|1))
/* With a builtin ff_log2 there is no separate 16-bit variant; route it to
 * av_log2, which is defined further down in this header as ff_log2. */
#   ifndef ff_log2_16bit
#      define ff_log2_16bit av_log2
#   endif
#endif /* ff_log2 */
#endif /* AV_GCC_VERSION_AT_LEAST(3,4) */
#endif /* HAVE_FAST_CLZ */

/* 256-entry byte table indexed by the C fallback log2 routines in this
 * header; it is only declared here, the definition lives elsewhere.
 * NOTE(review): presumably ff_log2_tab[i] == floor(log2(i)), with 0 for
 * i == 0 -- confirm against the definition. */
extern const uint8_t ff_log2_tab[256];

#ifndef ff_log2
#define ff_log2 ff_log2_c
/**
 * Portable, table-driven fallback for ff_log2.
 *
 * The value is narrowed to its most significant non-zero byte: first the
 * upper 16 bits are tested, then the upper 8 bits of what remains. The
 * number of bits shifted out is added to the ff_log2_tab entry of the
 * remaining byte.
 *
 * @param v  input value; the masks used here only cover bits 0..31
 * @return   the accumulated shift (0, 8, 16 or 24) plus ff_log2_tab[byte]
 */
static av_always_inline av_const int ff_log2_c(unsigned int v)
{
    int shifted = 0;

    /* Anything in bits 16..31? Continue with that half. */
    if ((v & 0xffff0000) != 0) {
        shifted = 16;
        v     >>= 16;
    }
    /* Anything in bits 8..15 of what is left? Continue with that byte. */
    if ((v & 0xff00) != 0) {
        shifted += 8;
        v      >>= 8;
    }

    return shifted + ff_log2_tab[v];
}
#endif

#ifndef ff_log2_16bit
#define ff_log2_16bit ff_log2_16bit_c
/**
 * Portable, table-driven fallback for ff_log2_16bit.
 *
 * Only bits 8..15 are tested before the table lookup, so the caller must
 * pass a value that fits in 16 bits; larger values would index past the
 * end of ff_log2_tab.
 *
 * @param v  input value, expected to be below 65536
 * @return   8 + ff_log2_tab[v >> 8] when the upper byte is non-zero,
 *           ff_log2_tab[v] otherwise
 */
static av_always_inline av_const int ff_log2_16bit_c(unsigned int v)
{
    const int upper_byte_used = (v & 0xff00) != 0;

    return upper_byte_used ? 8 + ff_log2_tab[v >> 8]
                           : ff_log2_tab[v];
}
#endif

/* Map the av_ names onto whichever ff_ variants were selected above, so code
 * that includes this header uses them directly under the av_ names. */
#define av_log2       ff_log2
#define av_log2_16bit ff_log2_16bit

/**
 * @addtogroup lavu_math
 * @{
 */

/*
 * Builtin-based trailing-zero counts.
 *
 * Selected under the same conditions as the builtin ff_log2: HAVE_FAST_CLZ
 * set and the AV_GCC_VERSION_AT_LEAST(3,4) check passing. Each macro is
 * defined only if nothing earlier has already defined it. The builtins'
 * result is undefined for a zero argument.
 */
#if HAVE_FAST_CLZ
#if AV_GCC_VERSION_AT_LEAST(3,4)
#ifndef ff_ctz
#define ff_ctz(v) __builtin_ctz(v)
#endif
#ifndef ff_ctzll
#define ff_ctzll(v) __builtin_ctzll(v)
#endif
#endif /* AV_GCC_VERSION_AT_LEAST(3,4) */
#endif /* HAVE_FAST_CLZ */

#ifndef ff_ctz
#define ff_ctz ff_ctz_c
/**
 * Trailing zero bit count.
 *
 * Portable fallback used when no builtin or architecture-specific ff_ctz
 * has been defined.
 *
 * @param v  input value. If v is 0, the result is undefined (this fallback
 *           happens to return 0, but the builtin variant gives no such
 *           guarantee, so callers must not rely on it).
 * @return   the number of trailing 0-bits
 */
/* We use the De-Bruijn method outlined in:
 * http://supertech.csail.mit.edu/papers/debruijn.pdf. */
static av_always_inline av_const int ff_ctz_c(int v)
{
    static const uint8_t debruijn_ctz32[32] = {
        0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
        31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
    };
    /* Work on the unsigned bit pattern: negating a signed int overflows
     * (undefined behaviour) when v is INT_MIN, whereas unsigned negation
     * wraps and is well defined. */
    const unsigned int bits = (unsigned int)v;
    /* bits & -bits keeps only the lowest set bit. */
    const unsigned int lowest = bits & (0U - bits);

    /* Multiplying by the De Bruijn constant shifts it left by the bit's
     * position; the top 5 bits of the 32-bit product then form a unique
     * index into the table. */
    return debruijn_ctz32[(uint32_t)(lowest * 0x077CB531U) >> 27];
}
#endif

#ifndef ff_ctzll
#define ff_ctzll ff_ctzll_c
/**
 * Trailing zero bit count for long long values.
 *
 * Portable fallback used when no builtin or architecture-specific ff_ctzll
 * has been defined.
 *
 * @param v  input value. If v is 0, the result is undefined (this fallback
 *           happens to return 0, but the builtin variant gives no such
 *           guarantee, so callers must not rely on it).
 * @return   the number of trailing 0-bits
 */
/* We use the De-Bruijn method outlined in:
 * http://supertech.csail.mit.edu/papers/debruijn.pdf. */
static av_always_inline av_const int ff_ctzll_c(long long v)
{
    static const uint8_t debruijn_ctz64[64] = {
        0, 1, 2, 53, 3, 7, 54, 27, 4, 38, 41, 8, 34, 55, 48, 28,
        62, 5, 39, 46, 44, 42, 22, 9, 24, 35, 59, 56, 49, 18, 29, 11,
        63, 52, 6, 26, 37, 40, 33, 47, 61, 45, 43, 21, 23, 58, 17, 10,
        51, 25, 36, 32, 60, 20, 57, 16, 50, 31, 19, 15, 30, 14, 13, 12
    };
    /* Work on the unsigned bit pattern: negating a signed long long
     * overflows (undefined behaviour) when v is LLONG_MIN, whereas unsigned
     * negation wraps and is well defined. */
    const unsigned long long bits = (unsigned long long)v;
    /* bits & -bits keeps only the lowest set bit. */
    const unsigned long long lowest = bits & (0ULL - bits);

    /* Multiplying by the De Bruijn constant shifts it left by the bit's
     * position; the top 6 bits of the 64-bit product then form a unique
     * index into the table. */
    return debruijn_ctz64[(uint64_t)(lowest * 0x022FDD63CC95386DU) >> 58];
}
#endif

/**
 * @}
 */
#endif /* AVUTIL_INTMATH_H */
Optimise av_log2 with clz when available 10% faster flac decoding on x86 and ARM. Originally committed as revision 21217 to svn://svn.ffmpeg.org/ffmpeg/trunk 15 years ago			`/*`
			`* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>`
			`*`
			`* This file is part of FFmpeg.`
			`*`
			`* FFmpeg is free software; you can redistribute it and/or`
			`* modify it under the terms of the GNU Lesser General Public`
			`* License as published by the Free Software Foundation; either`
			`* version 2.1 of the License, or (at your option) any later version.`
			`*`
			`* FFmpeg is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public`
			`* License along with FFmpeg; if not, write to the Free Software`
			`* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
			`*/`

			`#ifndef AVUTIL_INTMATH_H`
			`#define AVUTIL_INTMATH_H`

avutil: remove inline av_log2 from public API This removes inline av_log2 and av_log2_16bit from the public API, instead exporting them as regular functions. In-tree code still gets the inline and otherwise optimised variants. Signed-off-by: Mans Rullgard <mans@mansr.com> 12 years ago			`#include <stdint.h>`

Optimise av_log2 with clz when available 10% faster flac decoding on x86 and ARM. Originally committed as revision 21217 to svn://svn.ffmpeg.org/ffmpeg/trunk 15 years ago			`#include "config.h"`
Fix build on configurations without fast av_log2() This is a bit hackish. I will try to think of something nicer, but this will do for now. Originally committed as revision 22366 to svn://svn.ffmpeg.org/ffmpeg/trunk 15 years ago			`#include "attributes.h"`
Optimise av_log2 with clz when available 10% faster flac decoding on x86 and ARM. Originally committed as revision 21217 to svn://svn.ffmpeg.org/ffmpeg/trunk 15 years ago
ARM: reinstate optimised intmath.h Use of the ARM optimised intmath.h was accidentally dropped in 9734b8b. Signed-off-by: Mans Rullgard <mans@mansr.com> 12 years ago			`#if ARCH_ARM`
			`# include "arm/intmath.h"`
			`#endif`
libavutil: add x86 optimized av_popcount Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com> Signed-off-by: James Almer <jamrial@gmail.com> 10 years ago			`#if ARCH_X86`
			`# include "x86/intmath.h"`
			`#endif`
ARM: reinstate optimised intmath.h Use of the ARM optimised intmath.h was accidentally dropped in 9734b8b. Signed-off-by: Mans Rullgard <mans@mansr.com> 12 years ago
avutil/intmath: enable builtin intrinsics for icl and msvc. Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 10 years ago			`#if HAVE_FAST_CLZ`
lavu/intmath.h: Move x86 only msvc/icl functions to x86 specific header. Signed-off-by: Matt Oliver <protogonoi@gmail.com> 9 years ago			`#if AV_GCC_VERSION_AT_LEAST(3,4)`
avutil/intmath: enable builtin intrinsics for icl and msvc. Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 10 years ago			`#ifndef ff_log2`
avutil/intmath: check for ICC before GCC Intel compiler also defines __GNUC__, so the Intel specific intrinsics were not really being used. Reviewed-by: Michael Niedermayer <michaelni@gmx.at> Signed-off-by: James Almer <jamrial@gmail.com> 9 years ago			`# define ff_log2(x) (31 - __builtin_clz((x)\|1))`
avutil/intmath: enable builtin intrinsics for icl and msvc. Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 10 years ago			`# ifndef ff_log2_16bit`
			`# define ff_log2_16bit av_log2`
			`# endif`
			`#endif /* ff_log2 */`
Optimise av_log2 with clz when available 10% faster flac decoding on x86 and ARM. Originally committed as revision 21217 to svn://svn.ffmpeg.org/ffmpeg/trunk 15 years ago			`#endif /* AV_GCC_VERSION_AT_LEAST(3,4) */`
avutil/intmath: check for ICC before GCC Intel compiler also defines __GNUC__, so the Intel specific intrinsics were not really being used. Reviewed-by: Michael Niedermayer <michaelni@gmx.at> Signed-off-by: James Almer <jamrial@gmail.com> 9 years ago			`#endif`
Optimise av_log2 with clz when available 10% faster flac decoding on x86 and ARM. Originally committed as revision 21217 to svn://svn.ffmpeg.org/ffmpeg/trunk 15 years ago
avutil: remove inline av_log2 from public API This removes inline av_log2 and av_log2_16bit from the public API, instead exporting them as regular functions. In-tree code still gets the inline and otherwise optimised variants. Signed-off-by: Mans Rullgard <mans@mansr.com> 12 years ago			`extern const uint8_t ff_log2_tab[256];`

			`#ifndef ff_log2`
			`#define ff_log2 ff_log2_c`
			`static av_always_inline av_const int ff_log2_c(unsigned int v)`
			`{`
			`int n = 0;`
			`if (v & 0xffff0000) {`
			`v >>= 16;`
			`n += 16;`
			`}`
			`if (v & 0xff00) {`
			`v >>= 8;`
			`n += 8;`
			`}`
			`n += ff_log2_tab[v];`

			`return n;`
			`}`
			`#endif`

			`#ifndef ff_log2_16bit`
			`#define ff_log2_16bit ff_log2_16bit_c`
			`static av_always_inline av_const int ff_log2_16bit_c(unsigned int v)`
			`{`
			`int n = 0;`
			`if (v & 0xff00) {`
			`v >>= 8;`
			`n += 8;`
			`}`
			`n += ff_log2_tab[v];`

			`return n;`
			`}`
			`#endif`

			`#define av_log2 ff_log2`
			`#define av_log2_16bit ff_log2_16bit`

lavu: add av_ctz() for trailing zero bit count 12 years ago			`/**`
			`* @addtogroup lavu_math`
			`* @{`
			`*/`

avutil/intmath: enable builtin intrinsics for icl and msvc. Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 10 years ago			`#if HAVE_FAST_CLZ`
lavu/intmath.h: Move x86 only msvc/icl functions to x86 specific header. Signed-off-by: Matt Oliver <protogonoi@gmail.com> 9 years ago			`#if AV_GCC_VERSION_AT_LEAST(3,4)`
avutil/intmath: enable builtin intrinsics for icl and msvc. Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 10 years ago			`#ifndef ff_ctz`
avutil/intmath: check for ICC before GCC Intel compiler also defines __GNUC__, so the Intel specific intrinsics were not really being used. Reviewed-by: Michael Niedermayer <michaelni@gmx.at> Signed-off-by: James Almer <jamrial@gmail.com> 9 years ago			`#define ff_ctz(v) __builtin_ctz(v)`
avutil/intmath: enable builtin intrinsics for icl and msvc. Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 10 years ago			`#endif`
avutil/mathematics: speed up av_gcd by using Stein's binary GCD algorithm This uses Stein's binary GCD algorithm: https://en.wikipedia.org/wiki/Binary_GCD_algorithm to get a roughly 4x speedup over Euclidean GCD on standard architectures with a compiler intrinsic for ctzll, and a roughly 2x speedup otherwise. At the moment, the compiler intrinsic is used on GCC and Clang due to its easy availability. Quick note regarding overflow: yes, subtractions on int64_t can, but the llabs takes care of that. The llabs is also guaranteed to be safe, with no annoying INT64_MIN business since INT64_MIN being a power of 2, is shifted down before being sent to llabs. The binary GCD needs ff_ctzll, an extension of ff_ctz for long long (int64_t). On GCC, this is provided by a built-in. On Microsoft, there is a BitScanForward64 analog of BitScanForward that should work; but I can't confirm. Apparently it is not available on 32 bit builds; so this may or may not work correctly. On Intel, per the documentation there is only an intrinsic for _bit_scan_forward and people have posted on forums regarding _bit_scan_forward64, but often their documentation is woeful. Again, I don't have it, so I can't test. As such, to be safe, for now only the GCC/Clang intrinsic is added, the rest use a compiled version based on the De-Bruijn method of Leiserson et al: http://supertech.csail.mit.edu/papers/debruijn.pdf. Tested with FATE, sample benchmark (x86-64, GCC 5.2.0, Haswell) with a START_TIMER and STOP_TIMER in libavutil/rationsl.c, followed by a make fate. aac-am00_88.err: builtin: 714 decicycles in av_gcd, 4095 runs, 1 skips de-bruijn: 1440 decicycles in av_gcd, 4096 runs, 0 skips previous: 2889 decicycles in av_gcd, 4096 runs, 0 skips Signed-off-by: Ganesh Ajjanagadde <gajjanagadde@gmail.com> Signed-off-by: Michael Niedermayer <michael@niedermayer.cc> 9 years ago			`#ifndef ff_ctzll`
			`#define ff_ctzll(v) __builtin_ctzll(v)`
			`#endif`
avutil/intmath: enable builtin intrinsics for icl and msvc. Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 10 years ago			`#endif`
lavu: add av_ctz() for trailing zero bit count 12 years ago			`#endif`

			`#ifndef ff_ctz`
			`#define ff_ctz ff_ctz_c`
intmath: remove av_ctz. It's a non-installed header and only used in one place (flacenc). Since ff_ctz is static inline, it's fine to use that instead. 9 years ago			`/**`
			`* Trailing zero bit count.`
			`*`
			`* @param v input value. If v is 0, the result is undefined.`
			`* @return the number of trailing 0-bits`
			`*/`
avutil/intmath: use de Bruijn based ff_ctz It has already been demonstrated that the de Bruijn method has benefits over the current implementation: commit 971d12b7f9d7be3ca8eb98e6c04ed521f83cbd3c. That commit implemented it for long long, this extends it to the int version. Tested with FATE. Signed-off-by: Ganesh Ajjanagadde <gajjanagadde@gmail.com> Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 9 years ago			`/* We use the De-Bruijn method outlined in:`
			`* http://supertech.csail.mit.edu/papers/debruijn.pdf. */`
lavu: add av_ctz() for trailing zero bit count 12 years ago			`static av_always_inline av_const int ff_ctz_c(int v)`
			`{`
avutil/intmath: use de Bruijn based ff_ctz It has already been demonstrated that the de Bruijn method has benefits over the current implementation: commit 971d12b7f9d7be3ca8eb98e6c04ed521f83cbd3c. That commit implemented it for long long, this extends it to the int version. Tested with FATE. Signed-off-by: Ganesh Ajjanagadde <gajjanagadde@gmail.com> Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 9 years ago			`static const uint8_t debruijn_ctz32[32] = {`
			`0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,`
			`31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9`
			`};`
			`return debruijn_ctz32[(uint32_t)((v & -v) * 0x077CB531U) >> 27];`
lavu: add av_ctz() for trailing zero bit count 12 years ago			`}`
			`#endif`

avutil/mathematics: speed up av_gcd by using Stein's binary GCD algorithm This uses Stein's binary GCD algorithm: https://en.wikipedia.org/wiki/Binary_GCD_algorithm to get a roughly 4x speedup over Euclidean GCD on standard architectures with a compiler intrinsic for ctzll, and a roughly 2x speedup otherwise. At the moment, the compiler intrinsic is used on GCC and Clang due to its easy availability. Quick note regarding overflow: yes, subtractions on int64_t can, but the llabs takes care of that. The llabs is also guaranteed to be safe, with no annoying INT64_MIN business since INT64_MIN being a power of 2, is shifted down before being sent to llabs. The binary GCD needs ff_ctzll, an extension of ff_ctz for long long (int64_t). On GCC, this is provided by a built-in. On Microsoft, there is a BitScanForward64 analog of BitScanForward that should work; but I can't confirm. Apparently it is not available on 32 bit builds; so this may or may not work correctly. On Intel, per the documentation there is only an intrinsic for _bit_scan_forward and people have posted on forums regarding _bit_scan_forward64, but often their documentation is woeful. Again, I don't have it, so I can't test. As such, to be safe, for now only the GCC/Clang intrinsic is added, the rest use a compiled version based on the De-Bruijn method of Leiserson et al: http://supertech.csail.mit.edu/papers/debruijn.pdf. Tested with FATE, sample benchmark (x86-64, GCC 5.2.0, Haswell) with a START_TIMER and STOP_TIMER in libavutil/rationsl.c, followed by a make fate. aac-am00_88.err: builtin: 714 decicycles in av_gcd, 4095 runs, 1 skips de-bruijn: 1440 decicycles in av_gcd, 4096 runs, 0 skips previous: 2889 decicycles in av_gcd, 4096 runs, 0 skips Signed-off-by: Ganesh Ajjanagadde <gajjanagadde@gmail.com> Signed-off-by: Michael Niedermayer <michael@niedermayer.cc> 9 years ago			`#ifndef ff_ctzll`
			`#define ff_ctzll ff_ctzll_c`
			`/* We use the De-Bruijn method outlined in:`
			`* http://supertech.csail.mit.edu/papers/debruijn.pdf. */`
			`static av_always_inline av_const int ff_ctzll_c(long long v)`
			`{`
avutil/intmath: Change debruijn_ctz64 to use 8bit elements This reduces the memory & cache need from 256 to 64 bytes the code also seems faster with this change Signed-off-by: Michael Niedermayer <michael@niedermayer.cc> 9 years ago			`static const uint8_t debruijn_ctz64[64] = {`
avutil/mathematics: speed up av_gcd by using Stein's binary GCD algorithm This uses Stein's binary GCD algorithm: https://en.wikipedia.org/wiki/Binary_GCD_algorithm to get a roughly 4x speedup over Euclidean GCD on standard architectures with a compiler intrinsic for ctzll, and a roughly 2x speedup otherwise. At the moment, the compiler intrinsic is used on GCC and Clang due to its easy availability. Quick note regarding overflow: yes, subtractions on int64_t can, but the llabs takes care of that. The llabs is also guaranteed to be safe, with no annoying INT64_MIN business since INT64_MIN being a power of 2, is shifted down before being sent to llabs. The binary GCD needs ff_ctzll, an extension of ff_ctz for long long (int64_t). On GCC, this is provided by a built-in. On Microsoft, there is a BitScanForward64 analog of BitScanForward that should work; but I can't confirm. Apparently it is not available on 32 bit builds; so this may or may not work correctly. On Intel, per the documentation there is only an intrinsic for _bit_scan_forward and people have posted on forums regarding _bit_scan_forward64, but often their documentation is woeful. Again, I don't have it, so I can't test. As such, to be safe, for now only the GCC/Clang intrinsic is added, the rest use a compiled version based on the De-Bruijn method of Leiserson et al: http://supertech.csail.mit.edu/papers/debruijn.pdf. Tested with FATE, sample benchmark (x86-64, GCC 5.2.0, Haswell) with a START_TIMER and STOP_TIMER in libavutil/rationsl.c, followed by a make fate. aac-am00_88.err: builtin: 714 decicycles in av_gcd, 4095 runs, 1 skips de-bruijn: 1440 decicycles in av_gcd, 4096 runs, 0 skips previous: 2889 decicycles in av_gcd, 4096 runs, 0 skips Signed-off-by: Ganesh Ajjanagadde <gajjanagadde@gmail.com> Signed-off-by: Michael Niedermayer <michael@niedermayer.cc> 9 years ago			`0, 1, 2, 53, 3, 7, 54, 27, 4, 38, 41, 8, 34, 55, 48, 28,`
			`62, 5, 39, 46, 44, 42, 22, 9, 24, 35, 59, 56, 49, 18, 29, 11,`
			`63, 52, 6, 26, 37, 40, 33, 47, 61, 45, 43, 21, 23, 58, 17, 10,`
			`51, 25, 36, 32, 60, 20, 57, 16, 50, 31, 19, 15, 30, 14, 13, 12`
			`};`
avutil/intmath: fix undefined behavior in ff_ctzll_c() Signed-off-by: Michael Niedermayer <michael@niedermayer.cc> 9 years ago			`return debruijn_ctz64[(uint64_t)((v & -v) * 0x022FDD63CC95386DU) >> 58];`
avutil/mathematics: speed up av_gcd by using Stein's binary GCD algorithm This uses Stein's binary GCD algorithm: https://en.wikipedia.org/wiki/Binary_GCD_algorithm to get a roughly 4x speedup over Euclidean GCD on standard architectures with a compiler intrinsic for ctzll, and a roughly 2x speedup otherwise. At the moment, the compiler intrinsic is used on GCC and Clang due to its easy availability. Quick note regarding overflow: yes, subtractions on int64_t can, but the llabs takes care of that. The llabs is also guaranteed to be safe, with no annoying INT64_MIN business since INT64_MIN being a power of 2, is shifted down before being sent to llabs. The binary GCD needs ff_ctzll, an extension of ff_ctz for long long (int64_t). On GCC, this is provided by a built-in. On Microsoft, there is a BitScanForward64 analog of BitScanForward that should work; but I can't confirm. Apparently it is not available on 32 bit builds; so this may or may not work correctly. On Intel, per the documentation there is only an intrinsic for _bit_scan_forward and people have posted on forums regarding _bit_scan_forward64, but often their documentation is woeful. Again, I don't have it, so I can't test. As such, to be safe, for now only the GCC/Clang intrinsic is added, the rest use a compiled version based on the De-Bruijn method of Leiserson et al: http://supertech.csail.mit.edu/papers/debruijn.pdf. Tested with FATE, sample benchmark (x86-64, GCC 5.2.0, Haswell) with a START_TIMER and STOP_TIMER in libavutil/rationsl.c, followed by a make fate. aac-am00_88.err: builtin: 714 decicycles in av_gcd, 4095 runs, 1 skips de-bruijn: 1440 decicycles in av_gcd, 4096 runs, 0 skips previous: 2889 decicycles in av_gcd, 4096 runs, 0 skips Signed-off-by: Ganesh Ajjanagadde <gajjanagadde@gmail.com> Signed-off-by: Michael Niedermayer <michael@niedermayer.cc> 9 years ago			`}`
			`#endif`

doxy: provide a start page and document libavutil Introduce a basic layout, the subpages are currently left empty. Split libavutil in multiple groups as example of the structure 13 years ago			`/**`
			`* @}`
			`*/`
Optimise av_log2 with clz when available 10% faster flac decoding on x86 and ARM. Originally committed as revision 21217 to svn://svn.ffmpeg.org/ffmpeg/trunk 15 years ago			`#endif /* AVUTIL_INTMATH_H */`