@ -64,36 +64,37 @@ __kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global con
const int step = height * width ;
__global const Dtype* in_off = in + offset ;
__global Dtype* out_off = out + offset ;
KERNEL_ARG_DTYPE scale_val ;
int head = 0 ;
const int pre_pad = ( size - 1 ) / 2 ;
const int post_pad = size - pre_pad - 1 ;
KERNEL_ARG_DTYPE accum_scale = 0 ;
float accum_scale = 0 ;
// fill the scale at [n, :, h, w]
// accumulate values
while ( head < post_pad && head < channels ) {
accum_scale += in_off[head * step] * in_off[head * step] ;
float v = in_off[head * step] ;
accum_scale += v * v ;
++head ;
}
// both add and subtract
while ( head < channels ) {
accum_scale += in_off[head * step] * in_off[head * step] ;
float v = in_off[head * step] ;
accum_scale += v * v ;
if ( head - size >= 0 ) {
accum_scale - = in_off[ ( head - size ) * step]
* in_off[ ( head - size ) * step] ;
v = in_off[ ( head - size ) * step] ;
accum_scale -= v * v ;
}
scale_val = k + accum_scale * alpha_over_size ;
out_off[ ( head - post_pad ) * step] = in_off[ ( head - post_pad ) * step] * ( Dtype ) native_powr ( scale_val, negative_beta ) ;
float scale_val = k + accum_scale * alpha_over_size ;
out_off[ ( head - post_pad ) * step] = ( Dtype ) ( ( float ) in_off[ ( head - post_pad ) * step] * native_powr ( scale_val, negative_beta ) ) ;
++head ;
}
// subtract only
while ( head < channels + post_pad ) {
if ( head - size >= 0 ) {
accum_scale - = in_off[ ( head - size ) * step]
* in_off[ ( head - size ) * step] ;
float v = in_off[ ( head - size ) * step] ;
accum_scale -= v * v ;
}
scale_val = k + accum_scale * alpha_over_size ;
out_off[ ( head - post_pad ) * step] = in_off[ ( head - post_pad ) * step] * ( Dtype ) native_powr ( scale_val, negative_beta ) ;
float scale_val = k + accum_scale * alpha_over_size ;
out_off[ ( head - post_pad ) * step] = ( Dtype ) ( ( float ) in_off[ ( head - post_pad ) * step] * native_powr ( scale_val, negative_beta ) ) ;
++head ;
}
}