|
|
|
@ -53,12 +53,8 @@ int calc_lut(__local int* smem, int val, int tid) |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
if (tid == 0) |
|
|
|
|
{ |
|
|
|
|
for (int i = 1; i < 256; ++i) |
|
|
|
|
{ |
|
|
|
|
smem[i] += smem[i - 1]; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
return smem[tid]; |
|
|
|
@ -71,69 +67,51 @@ void reduce(volatile __local int* smem, int val, int tid) |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
if (tid < 128) |
|
|
|
|
{ |
|
|
|
|
smem[tid] = val += smem[tid + 128]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
if (tid < 64) |
|
|
|
|
{ |
|
|
|
|
smem[tid] = val += smem[tid + 64]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
if (tid < 32) |
|
|
|
|
{ |
|
|
|
|
smem[tid] += smem[tid + 32]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
if (tid < 16) |
|
|
|
|
{ |
|
|
|
|
smem[tid] += smem[tid + 16]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
if (tid < 8) |
|
|
|
|
{ |
|
|
|
|
smem[tid] += smem[tid + 8]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
if (tid < 4) |
|
|
|
|
{ |
|
|
|
|
smem[tid] += smem[tid + 4]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
if (tid < 2) |
|
|
|
|
{ |
|
|
|
|
smem[tid] += smem[tid + 2]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
if (tid < 1) |
|
|
|
|
{ |
|
|
|
|
smem[256] = smem[tid] + smem[tid + 1]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
#else |
|
|
|
|
|
|
|
|
|
void reduce(__local volatile int* smem, int val, int tid) |
|
|
|
|
{ |
|
|
|
|
smem[tid] = val; |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
if (tid < 128) |
|
|
|
|
{ |
|
|
|
|
smem[tid] = val += smem[tid + 128]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
if (tid < 64) |
|
|
|
|
{ |
|
|
|
|
smem[tid] = val += smem[tid + 64]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
if (tid < 32) |
|
|
|
@ -141,12 +119,17 @@ void reduce(__local volatile int* smem, int val, int tid) |
|
|
|
|
smem[tid] += smem[tid + 32]; |
|
|
|
|
#if WAVE_SIZE < 32 |
|
|
|
|
} barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
if (tid < 16) { |
|
|
|
|
|
|
|
|
|
if (tid < 16) |
|
|
|
|
{ |
|
|
|
|
#endif |
|
|
|
|
smem[tid] += smem[tid + 16]; |
|
|
|
|
#if WAVE_SIZE < 16 |
|
|
|
|
} barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
if (tid < 8) { |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
if (tid < 8) |
|
|
|
|
{ |
|
|
|
|
#endif |
|
|
|
|
smem[tid] += smem[tid + 8]; |
|
|
|
|
smem[tid] += smem[tid + 4]; |
|
|
|
@ -159,7 +142,8 @@ void reduce(__local volatile int* smem, int val, int tid) |
|
|
|
|
__kernel void calcLut(__global __const uchar * src, __global uchar * lut, |
|
|
|
|
const int srcStep, const int dstStep, |
|
|
|
|
const int2 tileSize, const int tilesX, |
|
|
|
|
const int clipLimit, const float lutScale) |
|
|
|
|
const int clipLimit, const float lutScale, |
|
|
|
|
const int src_offset, const int dst_offset) |
|
|
|
|
{ |
|
|
|
|
__local int smem[512]; |
|
|
|
|
|
|
|
|
@ -173,25 +157,21 @@ __kernel void calcLut(__global __const uchar * src, __global uchar * lut, |
|
|
|
|
|
|
|
|
|
for (int i = get_local_id(1); i < tileSize.y; i += get_local_size(1)) |
|
|
|
|
{ |
|
|
|
|
__global const uchar* srcPtr = src + mad24( ty * tileSize.y + i, |
|
|
|
|
srcStep, tx * tileSize.x ); |
|
|
|
|
__global const uchar* srcPtr = src + mad24(ty * tileSize.y + i, srcStep, tx * tileSize.x + src_offset); |
|
|
|
|
for (int j = get_local_id(0); j < tileSize.x; j += get_local_size(0)) |
|
|
|
|
{ |
|
|
|
|
const int data = srcPtr[j]; |
|
|
|
|
atomic_inc(&smem[data]); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
int tHistVal = smem[tid]; |
|
|
|
|
|
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
if (clipLimit > 0) |
|
|
|
|
{ |
|
|
|
|
// clip histogram bar |
|
|
|
|
|
|
|
|
|
int clipped = 0; |
|
|
|
|
if (tHistVal > clipLimit) |
|
|
|
|
{ |
|
|
|
@ -200,7 +180,6 @@ __kernel void calcLut(__global __const uchar * src, __global uchar * lut, |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// find number of overall clipped samples |
|
|
|
|
|
|
|
|
|
reduce(smem, clipped, tid); |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
#ifdef CPU |
|
|
|
@ -229,7 +208,7 @@ __kernel void calcLut(__global __const uchar * src, __global uchar * lut, |
|
|
|
|
|
|
|
|
|
const int lutVal = calc_lut(smem, tHistVal, tid); |
|
|
|
|
uint ires = (uint)convert_int_rte(lutScale * lutVal); |
|
|
|
|
lut[(ty * tilesX + tx) * dstStep + tid] = |
|
|
|
|
lut[(ty * tilesX + tx) * dstStep + tid + dst_offset] = |
|
|
|
|
convert_uchar(clamp(ires, (uint)0, (uint)255)); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
@ -239,7 +218,8 @@ __kernel void transform(__global __const uchar * src, |
|
|
|
|
const int srcStep, const int dstStep, const int lutStep, |
|
|
|
|
const int cols, const int rows, |
|
|
|
|
const int2 tileSize, |
|
|
|
|
const int tilesX, const int tilesY) |
|
|
|
|
const int tilesX, const int tilesY, |
|
|
|
|
const int src_offset, const int dst_offset, int lut_offset) |
|
|
|
|
{ |
|
|
|
|
const int x = get_global_id(0); |
|
|
|
|
const int y = get_global_id(1); |
|
|
|
@ -261,15 +241,15 @@ __kernel void transform(__global __const uchar * src, |
|
|
|
|
tx1 = max(tx1, 0); |
|
|
|
|
tx2 = min(tx2, tilesX - 1); |
|
|
|
|
|
|
|
|
|
const int srcVal = src[mad24(y, srcStep, x)]; |
|
|
|
|
const int srcVal = src[mad24(y, srcStep, x + src_offset)]; |
|
|
|
|
|
|
|
|
|
float res = 0; |
|
|
|
|
|
|
|
|
|
res += lut[mad24(ty1 * tilesX + tx1, lutStep, srcVal)] * ((1.0f - xa) * (1.0f - ya)); |
|
|
|
|
res += lut[mad24(ty1 * tilesX + tx2, lutStep, srcVal)] * ((xa) * (1.0f - ya)); |
|
|
|
|
res += lut[mad24(ty2 * tilesX + tx1, lutStep, srcVal)] * ((1.0f - xa) * (ya)); |
|
|
|
|
res += lut[mad24(ty2 * tilesX + tx2, lutStep, srcVal)] * ((xa) * (ya)); |
|
|
|
|
res += lut[mad24(ty1 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (1.0f - ya)); |
|
|
|
|
res += lut[mad24(ty1 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (1.0f - ya)); |
|
|
|
|
res += lut[mad24(ty2 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (ya)); |
|
|
|
|
res += lut[mad24(ty2 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (ya)); |
|
|
|
|
|
|
|
|
|
uint ires = (uint)convert_int_rte(res); |
|
|
|
|
dst[mad24(y, dstStep, x)] = convert_uchar(clamp(ires, (uint)0, (uint)255)); |
|
|
|
|
dst[mad24(y, dstStep, x + dst_offset)] = convert_uchar(clamp(ires, (uint)0, (uint)255)); |
|
|
|
|
} |
|
|
|
|