Merge pull request #7789 from pengli:gaussian_blur
commit
d37106e008
3 changed files with 237 additions and 16 deletions
@ -0,0 +1,198 @@ |
||||
// This file is part of OpenCV project. |
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory |
||||
// of this distribution and at http://opencv.org/license.html. |
||||
|
||||
// Separable filter coefficients.
//
// KERNEL_MATRIX_X / KERNEL_MATRIX_Y are not defined in this file; they are
// presumably injected through the program build options as a sequence of
// DIG(value) entries (TODO confirm against the host code). DIG turns each
// entry into "value," so the sequence becomes a valid initializer list.
// The OP() macro reads indices 0..4 of both tables, so each table must
// provide at least five coefficients.
#define DIG(a) a,

__constant float kx[] = {
    KERNEL_MATRIX_X
};

__constant float ky[] = {
    KERNEL_MATRIX_Y
};
||||
|
||||
// OP(y, x): weighted contribution of filter tap (row y, column x) for the four
// output pixels handled by one work-item.
//
// Relies on a local `uchar4 arr[25]` being in scope at the point of use, laid
// out row-major as arr[y * 5 + x]. The four lanes are converted to float and
// scaled by the separable weights ky[y] * kx[x].
//
// Arguments are parenthesized inside the index arithmetic so that passing an
// expression (e.g. OP(r + 1, c)) still addresses the intended element.
#define OP(y, x) (convert_float4(arr[(y) * 5 + (x)]) * ky[(y)] * kx[(x)])
||||
|
||||
// FILL_ARR(s1, s2, n, e1, e2): expand source row n into its five horizontally
// shifted 4-pixel windows, arr[5 * n + 0] .. arr[5 * n + 4].
//
// Uses these names from the enclosing scope: `line` (uchar8 per row), `arr`
// (uchar4[25]), `row_s` and `row_e` (booleans).
//
//   s1, s2 - replacement values for the two pixels left of the row; used only
//            when row_s is true (s1 is the outermost pixel, s2 the inner one).
//   n      - row index, 0..4.
//   e1, e2 - replacement values for the two pixels right of the row; used only
//            when row_e is true (e1 is the inner pixel, e2 the outermost one).
//
// Window k holds lanes k..k+3 of line[n]; lanes 0-1 / 6-7 are swapped for the
// replacement values at the left / right image edge respectively.
//
// Wrapped in do { } while (0) so the macro behaves as a single statement (safe
// under an unbraced if/else); call sites keep their trailing semicolon.
#define FILL_ARR(s1, s2, n, e1, e2) \
    do { \
        arr[5 * (n) + 0] = row_s ? (uchar4)(s1, s2, line[n].s23) : (uchar4)(line[n].s0123); \
        arr[5 * (n) + 1] = row_s ? (uchar4)(s2, line[n].s234) : (uchar4)(line[n].s1234); \
        arr[5 * (n) + 2] = (uchar4)(line[n].s2345); \
        arr[5 * (n) + 3] = row_e ? (uchar4)(line[n].s345, e1) : (uchar4)(line[n].s3456); \
        arr[5 * (n) + 4] = row_e ? (uchar4)(line[n].s45, e1, e2) : (uchar4)(line[n].s4567); \
    } while (0)
||||
|
||||
// 5x5 Gaussian blur for single-channel 8-bit images; each work-item produces
// four horizontally adjacent output pixels and stores them as one uint.
//
//   src, src_step - source pixels and row stride; the stride is added directly
//                   to a uchar pointer, i.e. it is expressed in bytes.
//   dst, dst_step - destination, written as packed uint (4 pixels); dst_step is
//                   divided by 4 to obtain the row stride in uint elements.
//   rows, cols    - image size. The right-edge test is (x + 4 == cols), so cols
//                   is presumably a multiple of 4 (TODO confirm with the host).
//
// Exactly one of BORDER_CONSTANT / BORDER_REPLICATE / BORDER_REFLECT /
// BORDER_REFLECT_101 is expected to be defined at build time.
//
// line[k] holds 8 bytes of source row (y - 2 + k) starting at column (x - 2),
// so lane 2 is column x and lanes 2..5 are the four output columns.
//
// NOTE(review): the 8-byte loads are not bounds-checked against the end of the
// buffer; the host presumably guarantees enough padding - confirm.
__kernel void gaussianBlur5x5_8UC1_cols4(__global const uchar* src, int src_step,
                                         __global uint* dst, int dst_step, int rows, int cols)
{
    int x = get_global_id(0) * 4;
    int y = get_global_id(1);

    if (x >= cols || y >= rows) return;

    uchar8 line[5];
    int offset, src_index;

    // Offset of (row y - 2, column x - 2); may be negative near the top-left.
    src_index = x + (y - 2) * src_step - 2;

    // Centre row. When the offset is clamped to 0 the load starts at column 0
    // instead of column -2, so the data is shifted right by two lanes to keep
    // column x in lane 2. Lanes 0-1 are placeholders: this only happens at
    // x == 0, where FILL_ARR substitutes the border values for them.
    offset = max(0, src_index + 2 * src_step);
    line[2] = vload8(0, src + offset);
    if (offset == 0) line[2] = (uchar8)(0, 0, line[2].s0123, line[2].s45);

#if defined BORDER_CONSTANT || defined BORDER_REPLICATE
    uchar8 tmp;
#ifdef BORDER_CONSTANT
    tmp = (uchar8)0;
#elif defined BORDER_REPLICATE
    tmp = line[2];
#endif
    line[0] = line[1] = tmp;
    if (y > 1)
    {
        offset = max(0, src_index);
        line[0] = vload8(0, src + offset);
        if (offset == 0) line[0] = (uchar8)(0, 0, line[0].s0123, line[0].s45);
    }

    if (y > 0)
    {
        offset = max(0, src_index + src_step);
        line[1] = vload8(0, src + offset);
        if (offset == 0) line[1] = (uchar8)(0, 0, line[1].s0123, line[1].s45);
    }

#ifndef BORDER_CONSTANT
    // BORDER_REPLICATE: at y == 1 the row above the image (row -1) must repeat
    // row 0, which is line[1], not the centre row stored in tmp.
    if (y == 1) line[0] = line[1];
#endif

    line[3] = (y == (rows - 1)) ? tmp : vload8(0, src + src_index + 3 * src_step);
    line[4] = (y >= (rows - 2)) ? tmp : vload8(0, src + src_index + 4 * src_step);

#ifndef BORDER_CONSTANT
    // BORDER_REPLICATE: at y == rows - 2 the row below the image must repeat
    // the last row, which is line[3], not the centre row stored in tmp.
    if (y == (rows - 2)) line[4] = line[3];
#endif
#elif BORDER_REFLECT
    int t;
    // Row y - 2, mirrored about the top edge: y == 0 -> row 1, y == 1 -> row 0.
    t = (y <= 1) ? (abs(y - 1) - y + 2) : 0;
    offset = max(0, src_index + t * src_step);
    line[0] = vload8(0, src + offset);
    if (offset == 0) line[0] = (uchar8)(0, 0, line[0].s0123, line[0].s45);

    if (y == 0)
        line[1] = line[2];
    else
    {
        offset = max(0, src_index + 1 * src_step);
        line[1] = vload8(0, src + offset);
        // Shift line[1] using its own lanes (the original mixed in line[0].s45).
        if (offset == 0) line[1] = (uchar8)(0, 0, line[1].s0123, line[1].s45);
    }

    line[3] = (y == (rows - 1)) ? line[2] : vload8(0, src + src_index + 3 * src_step);

    // Row y + 2, mirrored about the bottom edge:
    // y == rows - 2 -> row rows - 1, y == rows - 1 -> row rows - 2.
    t = (y >= (rows - 2)) ? (abs(y - (rows - 1)) - (y - (rows - 2)) + 2) : 4;
    line[4] = vload8(0, src + src_index + t * src_step);
#elif BORDER_REFLECT_101
    if (y == 1)
        line[0] = line[2];
    else
    {
        offset = (y == 0) ? (src_index + 4 * src_step) : max(0, src_index);
        line[0] = vload8(0, src + offset);
        if (offset == 0) line[0] = (uchar8)(0, 0, line[0].s0123, line[0].s45);
    }

    offset = (y == 0) ? (src_index + 3 * src_step) : max(0, src_index + 1 * src_step);
    line[1] = vload8(0, src + offset);
    if (offset == 0) line[1] = (uchar8)(0, 0, line[1].s0123, line[1].s45);

    line[3] = vload8(0, src + src_index + ((y == (rows - 1)) ? 1 : 3) * src_step);
    if (y == (rows - 2))
        line[4] = line[2];
    else if (y == (rows - 1))
    {
        // Row rows + 1 reflects (without repeating the edge) onto row rows - 3,
        // the same source row already held in line[0]; the original loaded
        // row rows - 2 here.
        line[4] = line[0];
    }
    else
    {
        line[4] = vload8(0, src + src_index + 4 * src_step);
    }
#endif

    // Left / right image edge flags for this group of four pixels.
    bool row_s = (x == 0);
    bool row_e = ((x + 4) == cols);
    uchar4 arr[25];
    uchar s, e;

#ifdef BORDER_CONSTANT
    s = e = 0;

    FILL_ARR(s, s, 0, e, e);
    FILL_ARR(s, s, 1, e, e);
    FILL_ARR(s, s, 2, e, e);
    FILL_ARR(s, s, 3, e, e);
    FILL_ARR(s, s, 4, e, e);
#elif defined BORDER_REPLICATE
    // Lane 2 is the first and lane 5 the last of the four output columns.
    s = line[0].s2;
    e = line[0].s5;
    FILL_ARR(s, s, 0, e, e);

    s = line[1].s2;
    e = line[1].s5;
    FILL_ARR(s, s, 1, e, e);

    s = line[2].s2;
    e = line[2].s5;
    FILL_ARR(s, s, 2, e, e);

    s = line[3].s2;
    e = line[3].s5;
    FILL_ARR(s, s, 3, e, e);

    s = line[4].s2;
    e = line[4].s5;
    FILL_ARR(s, s, 4, e, e);
#elif BORDER_REFLECT
    uchar s1, s2;
    uchar e1, e2;

    s1 = line[0].s3;
    s2 = line[0].s2;
    e1 = line[0].s5;
    e2 = line[0].s4;
    FILL_ARR(s1, s2, 0, e1, e2);

    s1 = line[1].s3;
    s2 = line[1].s2;
    e1 = line[1].s5;
    e2 = line[1].s4;
    FILL_ARR(s1, s2, 1, e1, e2);

    s1 = line[2].s3;
    s2 = line[2].s2;
    e1 = line[2].s5;
    e2 = line[2].s4;
    FILL_ARR(s1, s2, 2, e1, e2);

    s1 = line[3].s3;
    s2 = line[3].s2;
    e1 = line[3].s5;
    e2 = line[3].s4;
    FILL_ARR(s1, s2, 3, e1, e2);

    s1 = line[4].s3;
    s2 = line[4].s2;
    e1 = line[4].s5;
    e2 = line[4].s4;
    FILL_ARR(s1, s2, 4, e1, e2);
#elif BORDER_REFLECT_101
    // The same pair serves both edges: (s, e) = (lane 4, lane 3).
    s = line[0].s4;
    e = line[0].s3;
    FILL_ARR(s, e, 0, s, e);

    s = line[1].s4;
    e = line[1].s3;
    FILL_ARR(s, e, 1, s, e);

    s = line[2].s4;
    e = line[2].s3;
    FILL_ARR(s, e, 2, s, e);

    s = line[3].s4;
    e = line[3].s3;
    FILL_ARR(s, e, 3, s, e);

    s = line[4].s4;
    e = line[4].s3;
    FILL_ARR(s, e, 4, s, e);
#endif

    // Accumulate all 25 taps; each OP term is a float4 covering the 4 outputs.
    float4 sum;
    sum = OP(0, 0) + OP(0, 1) + OP(0, 2) + OP(0, 3) + OP(0, 4) +
          OP(1, 0) + OP(1, 1) + OP(1, 2) + OP(1, 3) + OP(1, 4) +
          OP(2, 0) + OP(2, 1) + OP(2, 2) + OP(2, 3) + OP(2, 4) +
          OP(3, 0) + OP(3, 1) + OP(3, 2) + OP(3, 3) + OP(3, 4) +
          OP(4, 0) + OP(4, 1) + OP(4, 2) + OP(4, 3) + OP(4, 4);

    // Round to nearest even, saturate to 8 bits and store 4 pixels as one uint.
    int dst_index = (x / 4) + y * (dst_step / 4);
    dst[dst_index] = as_uint(convert_uchar4_sat_rte(sum));
}
Loading…
Reference in new issue