From 9db28f332acf53eb2001249af33919b4ac742377 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 17 Feb 2014 19:27:34 +0400 Subject: [PATCH] more fixes --- .../src/fast_nlmeans_denoising_invoker.hpp | 2 + .../src/fast_nlmeans_denoising_opencl.hpp | 10 +- modules/photo/src/opencl/nlmeans.cl | 179 ++++++++++-------- 3 files changed, 104 insertions(+), 87 deletions(-) diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp index b8f5a03925..2ea09719e0 100644 --- a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp +++ b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp @@ -137,6 +137,8 @@ FastNlMeansDenoisingInvoker::FastNlMeansDenoisingInvoker( double dist = almost_dist * almost_dist2actual_dist_multiplier; int weight = cvRound(fixed_point_mult_ * std::exp(-dist / (h * h * sizeof(T)))); + printf("%d ", weight); + if (weight < WEIGHT_THRESHOLD * fixed_point_mult_) weight = 0; diff --git a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp index 404a59ee09..cd28c1489a 100644 --- a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp +++ b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp @@ -111,15 +111,17 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h, _dst.create(size, type); UMat dst = _dst.getUMat(); - Size upColSumSize(size.width, searchWindowSize * searchWindowSize * nblocksy); - Size colSumSize(nblocksx * templateWindowSize, searchWindowSize * searchWindowSize * nblocksy); + int searchWindowSizeSq = searchWindowSize * searchWindowSize; + Size upColSumSize(size.width, searchWindowSizeSq * nblocksy); + Size colSumSize(nblocksx * templateWindowSize, searchWindowSizeSq * nblocksy); UMat buffer(upColSumSize + colSumSize, CV_32SC(cn)); + srcex = srcex(Rect(Point(borderSize, borderSize), size)); k.args(ocl::KernelArg::ReadOnlyNoSize(srcex), ocl::KernelArg::WriteOnly(dst), - ocl::KernelArg::PtrReadOnly(almostDist2Weight), nblocksy, nblocksx, + ocl::KernelArg::PtrReadOnly(almostDist2Weight), ocl::KernelArg::PtrReadOnly(buffer), almostTemplateWindowSizeSqBinShift); - size_t globalsize[2] = { nblocksx, nblocksy }, localsize[2] = { CTA_SIZE, 1 }; + size_t globalsize[2] = { nblocksx * BLOCK_COLS, nblocksy * BLOCK_ROWS }, localsize[2] = { CTA_SIZE, 1 }; return k.run(2, globalsize, localsize, false); } diff --git a/modules/photo/src/opencl/nlmeans.cl b/modules/photo/src/opencl/nlmeans.cl index 1b40517abe..807e89a62f 100644 --- a/modules/photo/src/opencl/nlmeans.cl +++ b/modules/photo/src/opencl/nlmeans.cl @@ -5,6 +5,8 @@ // Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved. // Third party copyrights are property of their respective owners. +#pragma OPENCL_EXTENSION cl_amd_printf:enable + #ifdef OP_CALC_WEIGHTS __kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almostMaxDist, @@ -18,6 +20,8 @@ __kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almost float dist = almostDist * almostDist2ActualDistMultiplier; int weight = convert_int_sat_rte(fixedPointMult * exp(-dist * den)); +// printf("%d ", weight); + if (weight < WEIGHT_THRESHOLD * fixedPointMult) weight = 0; @@ -31,7 +35,7 @@ __kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almost inline int_t calcDist(uchar_t a, uchar_t b) { - int_t diff = convert_int_t(a) -convert_int_t(b); + int_t diff = convert_int_t(a) - convert_int_t(b); return diff * diff; } @@ -39,16 +43,14 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int __local int_t * dists, int y, int x, int id, __global int_t * col_dists, __global int_t * up_col_dists) { - int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2; + int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2 - TEMPLATE_SIZE2; - for (int i = 0, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE) + for (int i = id, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE) { int_t dist = (int_t)(0), value; - sx += i % SEARCH_SIZE; - sy += i / SEARCH_SIZE; - - __global const uchar_t * src_template = (__global const uchar_t *)(src + mad24(sy, src_step, mad24(cn, x, src_offset))); + __global const uchar_t * src_template = (__global const uchar_t *)(src + + mad24(sy + i / SEARCH_SIZE, src_step, mad24(cn, sx + i % SEARCH_SIZE, src_offset))); __global const uchar_t * src_current = (__global const uchar_t *)(src + mad24(y, src_step, mad24(cn, x, src_offset))); __global int_t * col_dists_current = col_dists + i * TEMPLATE_SIZE; @@ -57,7 +59,7 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int col_dists_current[j] = (int_t)(0); #pragma unroll - for (int ty = -TEMPLATE_SIZE2; ty <= TEMPLATE_SIZE2; ++ty) + for (int ty = 0; ty < TEMPLATE_SIZE; ++ty) { #pragma unroll for (int tx = -TEMPLATE_SIZE2; tx <= TEMPLATE_SIZE2; ++tx) @@ -68,78 +70,86 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int dist += value; } - src_current += src_step; - src_template += src_step; + src_current = (__global const uchar_t *)((__global const uchar *)src_current + src_step); + src_template = (__global const uchar_t *)((__global const uchar *)src_template + src_step); } dists[i] = dist; - up_col_dists[i] = col_dists[TEMPLATE_SIZE - 1]; + up_col_dists[0 + i] = col_dists[TEMPLATE_SIZE - 1]; } } +#define COND if (i == 252 && x0 == 20) + inline void calcElementInFirstRow(__global const uchar * src, int src_step, int src_offset, - __local int_t * dists, int y, int x, int id, int first, + __local int_t * dists, int y, int x0, int x, int id, int first, __global int_t * col_dists, __global int_t * up_col_dists) { x += TEMPLATE_SIZE2; + y -= TEMPLATE_SIZE2; int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2; - for (int i = 0, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE) + for (int i = id, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE) { - sx += i % SEARCH_SIZE; - sy += i / SEARCH_SIZE; - __global const uchar_t * src_current = (__global const uchar_t *)(src + mad24(y, src_step, mad24(cn, x, src_offset))); - __global const uchar_t * src_template = (__global const uchar_t *)(src + mad24(sy, src_step, mad24(cn, x, src_offset))); + __global const uchar_t * src_template = (__global const uchar_t *)(src + + mad24(sy + i / SEARCH_SIZE, src_step, mad24(cn, sx + i % SEARCH_SIZE, src_offset))); __global int_t * col_dists_current = col_dists + TEMPLATE_SIZE * i; - int_t value; - dists[id] -= col_dists_current[first]; - col_dists_current[first] = (int_t)(0); + int_t col_dist = (int_t)(0); #pragma unroll - for (int ty = -TEMPLATE_SIZE2; ty <= TEMPLATE_SIZE2; ++ty) + for (int ty = 0; ty < TEMPLATE_SIZE; ++ty) { - value = calcDist(src_current[0], src_template[0]); - col_dists_current[first] += value; + col_dist += calcDist(src_current[0], src_template[0]); +// COND printf("%d\n", calcDist(src_current[0], src_template[0])); - src_current += src_step; - src_template += src_step; + src_current = (__global const uchar_t *)((__global const uchar *)src_current + src_step); + src_template = (__global const uchar_t *)((__global const uchar *)src_template + src_step); } - dists[id] += col_dists_current[first]; - up_col_dists[id] = col_dists_current[first]; + dists[i] += col_dist - col_dists_current[first]; + col_dists_current[first] = col_dist; + up_col_dists[mad24(x0, SEARCH_SIZE_SQ, i)] = col_dist; +// COND printf("res = %d\n", col_dist); } } inline void calcElement(__global const uchar * src, int src_step, int src_offset, - __local int_t * dists, int y, int x, int id, int first, + __local int_t * dists, int y, int x0, int x, int id, int first, __global int_t * col_dists, __global int_t * up_col_dists) { - int sx_up = x + TEMPLATE_SIZE2, sy_up = y - TEMPLATE_SIZE2 - 1; - int sx_down = x + TEMPLATE_SIZE2, sy_down = y + TEMPLATE_SIZE2; + int sx = x + TEMPLATE_SIZE2; + int sy_up = y - TEMPLATE_SIZE2 - 1 /*- TEMPLATE_SIZE*/; + int sy_down = y + TEMPLATE_SIZE2; - uchar_t up_value = *(__global const uchar_t *)(src + mad24(sy_up, src_step, mad24(cn, sx_up, src_offset))); - uchar_t down_value = *(__global const uchar_t *)(src + mad24(sy_down, src_step, mad24(cn, sx_down, src_offset))); + uchar_t up_value = *(__global const uchar_t *)(src + mad24(sy_up, src_step, mad24(cn, sx, src_offset))); + uchar_t down_value = *(__global const uchar_t *)(src + mad24(sy_down, src_step, mad24(cn, sx, src_offset))); - for (int i = 0, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE) + sx -= SEARCH_SIZE2; + sy_up -= SEARCH_SIZE2; + sy_down -= SEARCH_SIZE2; + + for (int i = id, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE) { - int wx = i % SEARCH_SIZE; - int wy = i / SEARCH_SIZE; + int wx = i % SEARCH_SIZE, wy = i / SEARCH_SIZE; - sx_up += wx, sx_down += wx; - sy_up += wy, sy_down += wy; + uchar_t up_value_t = *(__global const uchar_t *)(src + mad24(sy_up + wy, src_step, mad24(cn, sx + wx, src_offset))); + uchar_t down_value_t = *(__global const uchar_t *)(src + mad24(sy_down + wy, src_step, mad24(cn, sx + wx, src_offset))); - uchar_t up_value_t = *(__global const uchar_t *)(src + mad24(sy_up, src_step, mad24(cn, sx_up, src_offset))); - uchar_t down_value_t = *(__global const uchar_t *)(src + mad24(sy_down, src_step, mad24(cn, sx_down, src_offset))); + __global int_t * col_dists_current = col_dists + mad24(i, TEMPLATE_SIZE, first); + __global int_t * up_col_dists_current = up_col_dists + mad24(x0, SEARCH_SIZE_SQ, i); - __global int_t * col_dists_current = col_dists + i * TEMPLATE_SIZE; - __global int_t * up_col_dists_current = up_col_dists + i; +// COND printf("\nres = %d\n", up_col_dists_current[0]); +// COND printf("up = %d, down = %d\n", calcDist(up_value, up_value_t), calcDist(down_value, down_value_t)); + int_t col_dist = up_col_dists_current[0] + calcDist(down_value, down_value_t) - calcDist(up_value, up_value_t); + + dists[i] += col_dist - col_dists_current[0]; + col_dists_current[0] = col_dist; + up_col_dists_current[0] = col_dist; - dists[i] -= col_dists_current[first]; - col_dists_current[first] = up_col_dists_current[id] + calcDist(down_value, down_value_t) - calcDist(up_value, up_value_t); - dists[i] += col_dists_current[first]; - up_col_dists_current[id] = col_dists_current[first]; +// COND printf("res = %d\n", up_col_dists_current[0]); +// if (up_col_dists_current[0] < 0) printf("%d %d -- %d\n", i, x0, up_col_dists_current[0]); } } @@ -147,27 +157,28 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off __local int * dists, __global const int * almostDist2Weight, __global uchar * dst, int dst_step, int dst_offset, int y, int x, int id, __local int * weights_local, - __local int * weighted_sum_local, int almostTemplateWindowSizeSqBinShift) + __local int_t * weighted_sum_local, int almostTemplateWindowSizeSqBinShift) { int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2, weights = 0; int_t weighted_sum = (int_t)(0); - for (int i = 0, size = SEARCH_SIZE_SQ; i < size; i += id) + for (int i = id, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE) { - int src_index = mad24(sy + i / SEARCH_SIZE, src_step, (i % SEARCH_SIZE + sx) * cn + src_offset); - __global const uchar_t * src_search = (__global const uchar_t *)(src + src_index); + int src_index = mad24(sy + i / SEARCH_SIZE, src_step, mad24(i % SEARCH_SIZE + sx, cn, src_offset)); + int_t src_value = convert_int_t(*(__global const uchar_t *)(src + src_index)); int almostAvgDist = dists[i] >> almostTemplateWindowSizeSqBinShift; int weight = almostDist2Weight[almostAvgDist]; weights += weight; - weighted_sum += (int_t)(weight) * convert_int_t(src_search[0]); + weighted_sum += (int_t)(weight) * src_value; } if (id >= CTA_SIZE2) { - weights_local[id - CTA_SIZE2] = weights; - weighted_sum_local[id - CTA_SIZE2] = weighted_sum; + int id2 = id - CTA_SIZE2; + weights_local[id2] = weights; + weighted_sum_local[id2] = weighted_sum; } barrier(CLK_LOCAL_MEM_FENCE); @@ -191,9 +202,9 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off if (id == 0) { - int dst_index = mad24(y, dst_step, dst_offset + x * cn); + int dst_index = mad24(y, dst_step, mad24(cn, x, dst_offset)); - int_t weights_local_0 = (int_t)(weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3]); + int_t weights_local_0 = (int_t)(1);//(int_t)(weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3]); int_t weighted_sum_local_0 = weighted_sum_local[0] + weighted_sum_local[1] + weighted_sum_local[2] + weighted_sum_local[3]; *(__global uchar_t *)(dst + dst_index) = convert_uchar_t((weighted_sum_local_0 + weights_local_0 >> 1) / weights_local_0); @@ -202,48 +213,50 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off __kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int src_offset, __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, - __global const int * almostDist2Weight, int nblocksy, int nblocksx, - __global uchar * buffer, int almostTemplateWindowSizeSqBinShift) + __global const int * almostDist2Weight, __global uchar * buffer, + int almostTemplateWindowSizeSqBinShift) { - int block_x = get_global_id(0); - int block_y = get_global_id(1); + int block_x = get_group_id(0), nblocks_x = get_num_groups(0); + int block_y = get_group_id(1); int id = get_local_id(0), first; __local int_t dists[SEARCH_SIZE_SQ], weighted_sum[CTA_SIZE2]; __local int weights[CTA_SIZE2]; - int block_data_start = mad24(block_y, nblocksx, block_x) * SEARCH_SIZE_SQ * (TEMPLATE_SIZE + BLOCK_COLS); - __global int_t * col_dists = (__global int_t *)(buffer + block_data_start * sizeof(int_t)); - __global int_t * up_col_dists = (__global int_t *)(buffer + sizeof(int_t) * (block_data_start + SEARCH_SIZE_SQ * TEMPLATE_SIZE)); + int x0 = block_x * BLOCK_COLS, x1 = min(x0 + BLOCK_COLS, dst_cols); + int y0 = block_y * BLOCK_ROWS, y1 = min(y0 + BLOCK_ROWS, dst_rows); - if (block_x < nblocksx && block_y < nblocksy) - { - int x0 = block_x * BLOCK_COLS, x1 = min(x0 + BLOCK_COLS, dst_cols); - int y0 = block_y * BLOCK_ROWS, y1 = min(y0 + BLOCK_ROWS, dst_rows); + // for each group we need SEARCH_SIZE_SQ * TEMPLATE_SIZE integer buffer for storing part column sum for current element + // and SEARCH_SIZE_SQ * BLOCK_COLS integer buffer for storing last column sum for each element of search window of up row + int block_data_start = SEARCH_SIZE_SQ * (mad24(block_y, dst_cols, x0) + mad24(block_y, nblocks_x, block_x) * TEMPLATE_SIZE); + __global int_t * col_dists = (__global int_t *)(buffer + block_data_start * sizeof(int_t)); + __global int_t * up_col_dists = col_dists + SEARCH_SIZE_SQ * TEMPLATE_SIZE; - for (int y = y0; y < y1; ++y) - for (int x = x0; x < x1; ++x) + for (int y = y0; y < y1; ++y) + for (int x = x0; x < x1; ++x) + { + barrier(CLK_LOCAL_MEM_FENCE); + if (x == x0) { - if (x == x0) - { - calcFirstElementInRow(src, src_step, src_offset, dists, y, x, id, col_dists, up_col_dists); - first = 0; - } + calcFirstElementInRow(src, src_step, src_offset, dists, y, x, id, col_dists, up_col_dists); + first = 0; + } + else + { + if (y == y0) + calcElementInFirstRow(src, src_step, src_offset, dists, y, x - x0, x, id, first, col_dists, up_col_dists); else { - if (y == y0) - calcElementInFirstRow(src, src_step, src_offset, dists, y, x, id, first, col_dists, up_col_dists); - else - { - calcElement(src, src_step, src_offset, dists, y, x, id, first, col_dists, up_col_dists); - first = (first + 1) % TEMPLATE_SIZE; - } - - convolveWindow(src, src_step, src_offset, dists, almostDist2Weight, dst, dst_step, dst_offset, - y, x, id, weights, weighted_sum, almostTemplateWindowSizeSqBinShift); + calcElement(src, src_step, src_offset, dists, y, x - x0, x, id, first, col_dists, up_col_dists); + first = (first + 1) % TEMPLATE_SIZE; } } - } + + barrier(CLK_LOCAL_MEM_FENCE); + + convolveWindow(src, src_step, src_offset, dists, almostDist2Weight, dst, dst_step, dst_offset, + y, x, id, weights, weighted_sum, almostTemplateWindowSizeSqBinShift); + } } #endif