@ -30,7 +30,6 @@
/***********************************
* TODOs :
* thresholds linearization after their modifications for attaining given bitrate
* try other bitrate controlling mechanism ( maybe use ratecontrol . c ? )
* control quality for quality - based output
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
@ -41,10 +40,51 @@
*/
# define PSY_3GPP_THR_SPREAD_HI 1.5f // spreading factor for low-to-hi threshold spreading (15 dB/Bark)
# define PSY_3GPP_THR_SPREAD_LOW 3.0f // spreading factor for hi-to-low threshold spreading (30 dB/Bark)
/* spreading factor for low-to-hi energy spreading, long block, > 22kbps/channel (20dB/Bark) */
# define PSY_3GPP_EN_SPREAD_HI_L1 2.0f
/* spreading factor for low-to-hi energy spreading, long block, <= 22kbps/channel (15dB/Bark) */
# define PSY_3GPP_EN_SPREAD_HI_L2 1.5f
/* spreading factor for low-to-hi energy spreading, short block (15 dB/Bark) */
# define PSY_3GPP_EN_SPREAD_HI_S 1.5f
/* spreading factor for hi-to-low energy spreading, long block (30dB/Bark) */
# define PSY_3GPP_EN_SPREAD_LOW_L 3.0f
/* spreading factor for hi-to-low energy spreading, short block (20dB/Bark) */
# define PSY_3GPP_EN_SPREAD_LOW_S 2.0f
# define PSY_3GPP_RPEMIN 0.01f
# define PSY_3GPP_RPELEV 2.0f
# define PSY_3GPP_C1 3.0f /* log2(8) */
# define PSY_3GPP_C2 1.3219281f /* log2(2.5) */
# define PSY_3GPP_C3 0.55935729f /* 1 - C2 / C1 */
# define PSY_SNR_1DB 7.9432821e-1f /* -1dB */
# define PSY_SNR_25DB 3.1622776e-3f /* -25dB */
# define PSY_3GPP_SAVE_SLOPE_L -0.46666667f
# define PSY_3GPP_SAVE_SLOPE_S -0.36363637f
# define PSY_3GPP_SAVE_ADD_L -0.84285712f
# define PSY_3GPP_SAVE_ADD_S -0.75f
# define PSY_3GPP_SPEND_SLOPE_L 0.66666669f
# define PSY_3GPP_SPEND_SLOPE_S 0.81818181f
# define PSY_3GPP_SPEND_ADD_L -0.35f
# define PSY_3GPP_SPEND_ADD_S -0.26111111f
# define PSY_3GPP_CLIP_LO_L 0.2f
# define PSY_3GPP_CLIP_LO_S 0.2f
# define PSY_3GPP_CLIP_HI_L 0.95f
# define PSY_3GPP_CLIP_HI_S 0.75f
# define PSY_3GPP_AH_THR_LONG 0.5f
# define PSY_3GPP_AH_THR_SHORT 0.63f
enum {
PSY_3GPP_AH_NONE ,
PSY_3GPP_AH_INACTIVE ,
PSY_3GPP_AH_ACTIVE
} ;
# define PSY_3GPP_BITS_TO_PE(bits) ((bits) * 1.18f)
/* LAME psy model constants */
# define PSY_LAME_FIR_LEN 21 ///< LAME psy model FIR order
# define AAC_BLOCK_SIZE_LONG 1024 ///< long block size
@ -63,6 +103,12 @@ typedef struct AacPsyBand{
float energy ; ///< band energy
float thr ; ///< energy threshold
float thr_quiet ; ///< threshold in quiet
float nz_lines ; ///< number of non-zero spectral lines
float active_lines ; ///< number of active spectral lines
float pe ; ///< perceptual entropy
float pe_const ; ///< constant part of the PE calculation
float norm_fac ; ///< normalization factor for linearization
int avoid_holes ; ///< hole avoidance flag
} AacPsyBand ;
/**
@ -97,6 +143,15 @@ typedef struct AacPsyCoeffs{
* 3 GPP TS26 .403 - inspired psychoacoustic model specific data
*/
typedef struct AacPsyContext {
int chan_bitrate ; ///< bitrate per channel
int frame_bits ; ///< average bits per frame
int fill_level ; ///< bit reservoir fill level
struct {
float min ; ///< minimum allowed PE for bit factor calculation
float max ; ///< maximum allowed PE for bit factor calculation
float previous ; ///< allowed PE of the previous frame
float correction ; ///< PE correction factor
} pe ;
AacPsyCoeffs psy_coef [ 2 ] [ 64 ] ;
AacPsyChannel * ch ;
} AacPsyContext ;
@ -235,16 +290,33 @@ static av_cold int psy_3gpp_init(FFPsyContext *ctx) {
AacPsyContext * pctx ;
float bark ;
int i , j , g , start ;
float prev , minscale , minath ;
float prev , minscale , minath , minsnr , pe_min ;
const int chan_bitrate = ctx - > avctx - > bit_rate / ctx - > avctx - > channels ;
const int bandwidth = ctx - > avctx - > cutoff ? ctx - > avctx - > cutoff : ctx - > avctx - > sample_rate / 2 ;
const float num_bark = calc_bark ( ( float ) bandwidth ) ;
ctx - > model_priv_data = av_mallocz ( sizeof ( AacPsyContext ) ) ;
pctx = ( AacPsyContext * ) ctx - > model_priv_data ;
pctx - > chan_bitrate = chan_bitrate ;
pctx - > frame_bits = chan_bitrate * AAC_BLOCK_SIZE_LONG / ctx - > avctx - > sample_rate ;
pctx - > pe . min = 8.0f * AAC_BLOCK_SIZE_LONG * bandwidth / ( ctx - > avctx - > sample_rate * 2.0f ) ;
pctx - > pe . max = 12.0f * AAC_BLOCK_SIZE_LONG * bandwidth / ( ctx - > avctx - > sample_rate * 2.0f ) ;
ctx - > bitres . size = 6144 - pctx - > frame_bits ;
ctx - > bitres . size - = ctx - > bitres . size % 8 ;
pctx - > fill_level = ctx - > bitres . size ;
minath = ath ( 3410 , ATH_ADD ) ;
for ( j = 0 ; j < 2 ; j + + ) {
AacPsyCoeffs * coeffs = pctx - > psy_coef [ j ] ;
const uint8_t * band_sizes = ctx - > bands [ j ] ;
float line_to_frequency = ctx - > avctx - > sample_rate / ( j ? 256.f : 2048.0f ) ;
float avg_chan_bits = chan_bitrate / ctx - > avctx - > sample_rate * ( j ? 128.0f : 1024.0f ) ;
/* reference encoder uses 2.4% here instead of 60% like the spec says */
float bark_pe = 0.024f * PSY_3GPP_BITS_TO_PE ( avg_chan_bits ) / num_bark ;
float en_spread_low = j ? PSY_3GPP_EN_SPREAD_LOW_S : PSY_3GPP_EN_SPREAD_LOW_L ;
/* High energy spreading for long blocks <= 22kbps/channel and short blocks are the same. */
float en_spread_hi = ( j | | ( chan_bitrate < = 22.0f ) ) ? PSY_3GPP_EN_SPREAD_HI_S : PSY_3GPP_EN_SPREAD_HI_L1 ;
i = 0 ;
prev = 0.0 ;
for ( g = 0 ; g < ctx - > num_bands [ j ] ; g + + ) {
@ -258,6 +330,11 @@ static av_cold int psy_3gpp_init(FFPsyContext *ctx) {
float bark_width = coeffs [ g + 1 ] . barks - coeffs - > barks ;
coeff - > spread_low [ 0 ] = pow ( 10.0 , - bark_width * PSY_3GPP_THR_SPREAD_LOW ) ;
coeff - > spread_hi [ 0 ] = pow ( 10.0 , - bark_width * PSY_3GPP_THR_SPREAD_HI ) ;
coeff - > spread_low [ 1 ] = pow ( 10.0 , - bark_width * en_spread_low ) ;
coeff - > spread_hi [ 1 ] = pow ( 10.0 , - bark_width * en_spread_hi ) ;
pe_min = bark_pe * bark_width ;
minsnr = pow ( 2.0f , pe_min / band_sizes [ g ] ) - 1.5f ;
coeff - > min_snr = av_clipf ( 1.0f / minsnr , PSY_SNR_25DB , PSY_SNR_1DB ) ;
}
start = 0 ;
for ( g = 0 ; g < ctx - > num_bands [ j ] ; g + + ) {
@ -385,6 +462,97 @@ static FFPsyWindowInfo psy_3gpp_window(FFPsyContext *ctx,
return wi ;
}
/* 5.6.1.2 "Calculation of Bit Demand" */
static int calc_bit_demand ( AacPsyContext * ctx , float pe , int bits , int size ,
int short_window )
{
const float bitsave_slope = short_window ? PSY_3GPP_SAVE_SLOPE_S : PSY_3GPP_SAVE_SLOPE_L ;
const float bitsave_add = short_window ? PSY_3GPP_SAVE_ADD_S : PSY_3GPP_SAVE_ADD_L ;
const float bitspend_slope = short_window ? PSY_3GPP_SPEND_SLOPE_S : PSY_3GPP_SPEND_SLOPE_L ;
const float bitspend_add = short_window ? PSY_3GPP_SPEND_ADD_S : PSY_3GPP_SPEND_ADD_L ;
const float clip_low = short_window ? PSY_3GPP_CLIP_LO_S : PSY_3GPP_CLIP_LO_L ;
const float clip_high = short_window ? PSY_3GPP_CLIP_HI_S : PSY_3GPP_CLIP_HI_L ;
float clipped_pe , bit_save , bit_spend , bit_factor , fill_level ;
ctx - > fill_level + = ctx - > frame_bits - bits ;
ctx - > fill_level = av_clip ( ctx - > fill_level , 0 , size ) ;
fill_level = av_clipf ( ( float ) ctx - > fill_level / size , clip_low , clip_high ) ;
clipped_pe = av_clipf ( pe , ctx - > pe . min , ctx - > pe . max ) ;
bit_save = ( fill_level + bitsave_add ) * bitsave_slope ;
assert ( bit_save < = 0.3f & & bit_save > = - 0.05000001f ) ;
bit_spend = ( fill_level + bitspend_add ) * bitspend_slope ;
assert ( bit_spend < = 0.5f & & bit_spend > = - 0.1f ) ;
/* The bit factor graph in the spec is obviously incorrect.
* bit_spend + ( ( bit_spend - bit_spend ) ) . . .
* The reference encoder subtracts everything from 1 , but also seems incorrect .
* 1 - bit_save + ( ( bit_spend + bit_save ) ) . . .
* Hopefully below is correct .
*/
bit_factor = 1.0f - bit_save + ( ( bit_spend - bit_save ) / ( ctx - > pe . max - ctx - > pe . min ) ) * ( clipped_pe - ctx - > pe . min ) ;
/* NOTE: The reference encoder attempts to center pe max/min around the current pe. */
ctx - > pe . max = FFMAX ( pe , ctx - > pe . max ) ;
ctx - > pe . min = FFMIN ( pe , ctx - > pe . min ) ;
return FFMIN ( ctx - > frame_bits * bit_factor , ctx - > frame_bits + size - bits ) ;
}
static float calc_pe_3gpp ( AacPsyBand * band )
{
float pe , a ;
band - > pe = 0.0f ;
band - > pe_const = 0.0f ;
band - > active_lines = 0.0f ;
if ( band - > energy > band - > thr ) {
a = log2f ( band - > energy ) ;
pe = a - log2f ( band - > thr ) ;
band - > active_lines = band - > nz_lines ;
if ( pe < PSY_3GPP_C1 ) {
pe = pe * PSY_3GPP_C3 + PSY_3GPP_C2 ;
a = a * PSY_3GPP_C3 + PSY_3GPP_C2 ;
band - > active_lines * = PSY_3GPP_C3 ;
}
band - > pe = pe * band - > nz_lines ;
band - > pe_const = a * band - > nz_lines ;
}
return band - > pe ;
}
static float calc_reduction_3gpp ( float a , float desired_pe , float pe ,
float active_lines )
{
float thr_avg , reduction ;
thr_avg = powf ( 2.0f , ( a - pe ) / ( 4.0f * active_lines ) ) ;
reduction = powf ( 2.0f , ( a - desired_pe ) / ( 4.0f * active_lines ) ) - thr_avg ;
return FFMAX ( reduction , 0.0f ) ;
}
static float calc_reduced_thr_3gpp ( AacPsyBand * band , float min_snr ,
float reduction )
{
float thr = band - > thr ;
if ( band - > energy > thr ) {
thr = powf ( thr , 0.25f ) + reduction ;
thr = powf ( thr , 4.0f ) ;
/* This deviates from the 3GPP spec to match the reference encoder.
* It performs min ( thr_reduced , max ( thr , energy / min_snr ) ) only for bands
* that have hole avoidance on ( active or inactive ) . It always reduces the
* threshold of bands with hole avoidance off .
*/
if ( thr > band - > energy * min_snr & & band - > avoid_holes ! = PSY_3GPP_AH_NONE ) {
thr = FFMAX ( band - > thr , band - > energy * min_snr ) ;
band - > avoid_holes = PSY_3GPP_AH_ACTIVE ;
}
}
return thr ;
}
/**
* Calculate band thresholds as suggested in 3 GPP TS26 .403
*/
@ -395,18 +563,27 @@ static void psy_3gpp_analyze(FFPsyContext *ctx, int channel,
AacPsyChannel * pch = & pctx - > ch [ channel ] ;
int start = 0 ;
int i , w , g ;
float desired_bits , desired_pe , delta_pe , reduction , spread_en [ 128 ] = { 0 } ;
float a = 0.0f , active_lines = 0.0f , norm_fac = 0.0f ;
float pe = pctx - > chan_bitrate > 32000 ? 0.0f : FFMAX ( 50.0f , 100.0f - pctx - > chan_bitrate * 100.0f / 32000.0f ) ;
const int num_bands = ctx - > num_bands [ wi - > num_windows = = 8 ] ;
const uint8_t * band_sizes = ctx - > bands [ wi - > num_windows = = 8 ] ;
AacPsyCoeffs * coeffs = pctx - > psy_coef [ wi - > num_windows = = 8 ] ;
const float avoid_hole_thr = wi - > num_windows = = 8 ? PSY_3GPP_AH_THR_SHORT : PSY_3GPP_AH_THR_LONG ;
//calculate energies, initial thresholds and related values - 5.4.2 "Threshold Calculation"
for ( w = 0 ; w < wi - > num_windows * 16 ; w + = 16 ) {
for ( g = 0 ; g < num_bands ; g + + ) {
AacPsyBand * band = & pch - > band [ w + g ] ;
float form_factor = 0.0f ;
band - > energy = 0.0f ;
for ( i = 0 ; i < band_sizes [ g ] ; i + + )
for ( i = 0 ; i < band_sizes [ g ] ; i + + ) {
band - > energy + = coefs [ start + i ] * coefs [ start + i ] ;
form_factor + = sqrtf ( fabs ( coefs [ start + i ] ) ) ;
}
band - > thr = band - > energy * 0.001258925f ;
band - > nz_lines = form_factor / powf ( band - > energy / band_sizes [ g ] , 0.25f ) ;
start + = band_sizes [ g ] ;
}
}
@ -414,10 +591,15 @@ static void psy_3gpp_analyze(FFPsyContext *ctx, int channel,
for ( w = 0 ; w < wi - > num_windows * 16 ; w + = 16 ) {
AacPsyBand * bands = & pch - > band [ w ] ;
//5.4.2.3 "Spreading" & 5.4.3 "Spreaded Energy Calculation"
for ( g = 1 ; g < num_bands ; g + + )
spread_en [ 0 ] = bands [ 0 ] . energy ;
for ( g = 1 ; g < num_bands ; g + + ) {
bands [ g ] . thr = FFMAX ( bands [ g ] . thr , bands [ g - 1 ] . thr * coeffs [ g ] . spread_hi [ 0 ] ) ;
for ( g = num_bands - 2 ; g > = 0 ; g - - )
spread_en [ w + g ] = FFMAX ( bands [ g ] . energy , spread_en [ w + g - 1 ] * coeffs [ g ] . spread_hi [ 1 ] ) ;
}
for ( g = num_bands - 2 ; g > = 0 ; g - - ) {
bands [ g ] . thr = FFMAX ( bands [ g ] . thr , bands [ g + 1 ] . thr * coeffs [ g ] . spread_low [ 0 ] ) ;
spread_en [ w + g ] = FFMAX ( spread_en [ w + g ] , spread_en [ w + g + 1 ] * coeffs [ g ] . spread_low [ 1 ] ) ;
}
//5.4.2.4 "Threshold in quiet"
for ( g = 0 ; g < num_bands ; g + + ) {
AacPsyBand * band = & bands [ g ] ;
@ -426,6 +608,119 @@ static void psy_3gpp_analyze(FFPsyContext *ctx, int channel,
if ( ! ( wi - > window_type [ 0 ] = = LONG_STOP_SEQUENCE | | ( wi - > window_type [ 1 ] = = LONG_START_SEQUENCE & & ! w ) ) )
band - > thr = FFMAX ( PSY_3GPP_RPEMIN * band - > thr , FFMIN ( band - > thr ,
PSY_3GPP_RPELEV * pch - > prev_band [ w + g ] . thr_quiet ) ) ;
/* 5.6.1.3.1 "Prepatory steps of the perceptual entropy calculation" */
pe + = calc_pe_3gpp ( band ) ;
a + = band - > pe_const ;
active_lines + = band - > active_lines ;
/* 5.6.1.3.3 "Selection of the bands for avoidance of holes" */
if ( spread_en [ w + g ] * avoid_hole_thr > band - > energy | | coeffs [ g ] . min_snr > 1.0f )
band - > avoid_holes = PSY_3GPP_AH_NONE ;
else
band - > avoid_holes = PSY_3GPP_AH_INACTIVE ;
}
}
/* 5.6.1.3.2 "Calculation of the desired perceptual entropy" */
ctx - > pe [ channel ] = pe ;
desired_bits = calc_bit_demand ( pctx , pe , ctx - > bitres . bits , ctx - > bitres . size , wi - > num_windows = = 8 ) ;
desired_pe = PSY_3GPP_BITS_TO_PE ( desired_bits ) ;
/* NOTE: PE correction is kept simple. During initial testing it had very
* little effect on the final bitrate . Probably a good idea to come
* back and do more testing later .
*/
if ( ctx - > bitres . bits > 0 )
desired_pe * = av_clipf ( pctx - > pe . previous / PSY_3GPP_BITS_TO_PE ( ctx - > bitres . bits ) ,
0.85f , 1.15f ) ;
pctx - > pe . previous = PSY_3GPP_BITS_TO_PE ( desired_bits ) ;
if ( desired_pe < pe ) {
/* 5.6.1.3.4 "First Estimation of the reduction value" */
for ( w = 0 ; w < wi - > num_windows * 16 ; w + = 16 ) {
reduction = calc_reduction_3gpp ( a , desired_pe , pe , active_lines ) ;
pe = 0.0f ;
a = 0.0f ;
active_lines = 0.0f ;
for ( g = 0 ; g < num_bands ; g + + ) {
AacPsyBand * band = & pch - > band [ w + g ] ;
band - > thr = calc_reduced_thr_3gpp ( band , coeffs [ g ] . min_snr , reduction ) ;
/* recalculate PE */
pe + = calc_pe_3gpp ( band ) ;
a + = band - > pe_const ;
active_lines + = band - > active_lines ;
}
}
/* 5.6.1.3.5 "Second Estimation of the reduction value" */
for ( i = 0 ; i < 2 ; i + + ) {
float pe_no_ah = 0.0f , desired_pe_no_ah ;
active_lines = a = 0.0f ;
for ( w = 0 ; w < wi - > num_windows * 16 ; w + = 16 ) {
for ( g = 0 ; g < num_bands ; g + + ) {
AacPsyBand * band = & pch - > band [ w + g ] ;
if ( band - > avoid_holes ! = PSY_3GPP_AH_ACTIVE ) {
pe_no_ah + = band - > pe ;
a + = band - > pe_const ;
active_lines + = band - > active_lines ;
}
}
}
desired_pe_no_ah = FFMAX ( desired_pe - ( pe - pe_no_ah ) , 0.0f ) ;
if ( active_lines > 0.0f )
reduction + = calc_reduction_3gpp ( a , desired_pe_no_ah , pe_no_ah , active_lines ) ;
pe = 0.0f ;
for ( w = 0 ; w < wi - > num_windows * 16 ; w + = 16 ) {
for ( g = 0 ; g < num_bands ; g + + ) {
AacPsyBand * band = & pch - > band [ w + g ] ;
if ( active_lines > 0.0f )
band - > thr = calc_reduced_thr_3gpp ( band , coeffs [ g ] . min_snr , reduction ) ;
pe + = calc_pe_3gpp ( band ) ;
band - > norm_fac = band - > active_lines / band - > thr ;
norm_fac + = band - > norm_fac ;
}
}
delta_pe = desired_pe - pe ;
if ( fabs ( delta_pe ) > 0.05f * desired_pe )
break ;
}
if ( pe < 1.15f * desired_pe ) {
/* 6.6.1.3.6 "Final threshold modification by linearization" */
norm_fac = 1.0f / norm_fac ;
for ( w = 0 ; w < wi - > num_windows * 16 ; w + = 16 ) {
for ( g = 0 ; g < num_bands ; g + + ) {
AacPsyBand * band = & pch - > band [ w + g ] ;
if ( band - > active_lines > 0.5f ) {
float delta_sfb_pe = band - > norm_fac * norm_fac * delta_pe ;
float thr = band - > thr ;
thr * = powf ( 2.0f , delta_sfb_pe / band - > active_lines ) ;
if ( thr > coeffs [ g ] . min_snr * band - > energy & & band - > avoid_holes = = PSY_3GPP_AH_INACTIVE )
thr = FFMAX ( band - > thr , coeffs [ g ] . min_snr * band - > energy ) ;
band - > thr = thr ;
}
}
}
} else {
/* 5.6.1.3.7 "Further perceptual entropy reduction" */
g = num_bands ;
while ( pe > desired_pe & & g - - ) {
for ( w = 0 ; w < wi - > num_windows * 16 ; w + = 16 ) {
AacPsyBand * band = & pch - > band [ w + g ] ;
if ( band - > avoid_holes ! = PSY_3GPP_AH_NONE & & coeffs [ g ] . min_snr < PSY_SNR_1DB ) {
coeffs [ g ] . min_snr = PSY_SNR_1DB ;
band - > thr = band - > energy * PSY_SNR_1DB ;
pe + = band - > active_lines * 1.5f - band - > pe ;
}
}
}
/* TODO: allow more holes (unused without mid/side) */
}
}