|
|
|
@ -54,15 +54,29 @@ __kernel void arithm_flip_rows(__global const uchar * srcptr, int src_step, int |
|
|
|
|
int rows, int cols, int thread_rows, int thread_cols) |
|
|
|
|
{ |
|
|
|
|
int x = get_global_id(0); |
|
|
|
|
int y = get_global_id(1); |
|
|
|
|
int y0 = get_global_id(1)*PIX_PER_WI_Y; |
|
|
|
|
|
|
|
|
|
if (x < cols && y < thread_rows) |
|
|
|
|
if (x < cols) |
|
|
|
|
{ |
|
|
|
|
T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset))); |
|
|
|
|
T src1 = loadpix(srcptr + mad24(rows - y - 1, src_step, mad24(x, TSIZE, src_offset))); |
|
|
|
|
int src_index0 = mad24(y0, src_step, mad24(x, TSIZE, src_offset)); |
|
|
|
|
int src_index1 = mad24(rows - y0 - 1, src_step, mad24(x, TSIZE, src_offset)); |
|
|
|
|
int dst_index0 = mad24(y0, dst_step, mad24(x, TSIZE, dst_offset)); |
|
|
|
|
int dst_index1 = mad24(rows - y0 - 1, dst_step, mad24(x, TSIZE, dst_offset)); |
|
|
|
|
|
|
|
|
|
storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset))); |
|
|
|
|
storepix(src0, dstptr + mad24(rows - y - 1, dst_step, mad24(x, TSIZE, dst_offset))); |
|
|
|
|
#pragma unroll |
|
|
|
|
for (int y = y0, y1 = min(thread_rows, y0 + PIX_PER_WI_Y); y < y1; ++y) |
|
|
|
|
{ |
|
|
|
|
T src0 = loadpix(srcptr + src_index0); |
|
|
|
|
T src1 = loadpix(srcptr + src_index1); |
|
|
|
|
|
|
|
|
|
storepix(src1, dstptr + dst_index0); |
|
|
|
|
storepix(src0, dstptr + dst_index1); |
|
|
|
|
|
|
|
|
|
src_index0 += src_step; |
|
|
|
|
src_index1 -= src_step; |
|
|
|
|
dst_index0 += dst_step; |
|
|
|
|
dst_index1 -= dst_step; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
@ -71,16 +85,29 @@ __kernel void arithm_flip_rows_cols(__global const uchar * srcptr, int src_step, |
|
|
|
|
int rows, int cols, int thread_rows, int thread_cols) |
|
|
|
|
{ |
|
|
|
|
int x = get_global_id(0); |
|
|
|
|
int y = get_global_id(1); |
|
|
|
|
int y0 = get_global_id(1)*PIX_PER_WI_Y; |
|
|
|
|
|
|
|
|
|
if (x < cols && y < thread_rows) |
|
|
|
|
if (x < cols) |
|
|
|
|
{ |
|
|
|
|
int x1 = cols - x - 1; |
|
|
|
|
T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset))); |
|
|
|
|
T src1 = loadpix(srcptr + mad24(rows - y - 1, src_step, mad24(x1, TSIZE, src_offset))); |
|
|
|
|
int src_index0 = mad24(y0, src_step, mad24(x, TSIZE, src_offset)); |
|
|
|
|
int src_index1 = mad24(rows - y0 - 1, src_step, mad24(cols - x - 1, TSIZE, src_offset)); |
|
|
|
|
int dst_index0 = mad24(y0, dst_step, mad24(x, TSIZE, dst_offset)); |
|
|
|
|
int dst_index1 = mad24(rows - y0 - 1, dst_step, mad24(cols - x - 1, TSIZE, dst_offset)); |
|
|
|
|
|
|
|
|
|
#pragma unroll |
|
|
|
|
for (int y = y0, y1 = min(thread_rows, y0 + PIX_PER_WI_Y); y < y1; ++y) |
|
|
|
|
{ |
|
|
|
|
T src0 = loadpix(srcptr + src_index0); |
|
|
|
|
T src1 = loadpix(srcptr + src_index1); |
|
|
|
|
|
|
|
|
|
storepix(src0, dstptr + mad24(rows - y - 1, dst_step, mad24(x1, TSIZE, dst_offset))); |
|
|
|
|
storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset))); |
|
|
|
|
storepix(src1, dstptr + dst_index0); |
|
|
|
|
storepix(src0, dstptr + dst_index1); |
|
|
|
|
|
|
|
|
|
src_index0 += src_step; |
|
|
|
|
src_index1 -= src_step; |
|
|
|
|
dst_index0 += dst_step; |
|
|
|
|
dst_index1 -= dst_step; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
@ -89,15 +116,28 @@ __kernel void arithm_flip_cols(__global const uchar * srcptr, int src_step, int |
|
|
|
|
int rows, int cols, int thread_rows, int thread_cols) |
|
|
|
|
{ |
|
|
|
|
int x = get_global_id(0); |
|
|
|
|
int y = get_global_id(1); |
|
|
|
|
int y0 = get_global_id(1)*PIX_PER_WI_Y; |
|
|
|
|
|
|
|
|
|
if (x < thread_cols && y < rows) |
|
|
|
|
if (x < thread_cols) |
|
|
|
|
{ |
|
|
|
|
int x1 = cols - x - 1; |
|
|
|
|
T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset))); |
|
|
|
|
T src1 = loadpix(srcptr + mad24(y, src_step, mad24(x1, TSIZE, src_offset))); |
|
|
|
|
int src_index0 = mad24(y0, src_step, mad24(x, TSIZE, src_offset)); |
|
|
|
|
int src_index1 = mad24(y0, src_step, mad24(cols - x - 1, TSIZE, src_offset)); |
|
|
|
|
int dst_index0 = mad24(y0, dst_step, mad24(x, TSIZE, dst_offset)); |
|
|
|
|
int dst_index1 = mad24(y0, dst_step, mad24(cols - x - 1, TSIZE, dst_offset)); |
|
|
|
|
|
|
|
|
|
#pragma unroll |
|
|
|
|
for (int y = y0, y1 = min(rows, y0 + PIX_PER_WI_Y); y < y1; ++y) |
|
|
|
|
{ |
|
|
|
|
T src0 = loadpix(srcptr + src_index0); |
|
|
|
|
T src1 = loadpix(srcptr + src_index1); |
|
|
|
|
|
|
|
|
|
storepix(src1, dstptr + dst_index0); |
|
|
|
|
storepix(src0, dstptr + dst_index1); |
|
|
|
|
|
|
|
|
|
storepix(src0, dstptr + mad24(y, dst_step, mad24(x1, TSIZE, dst_offset))); |
|
|
|
|
storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset))); |
|
|
|
|
src_index0 += src_step; |
|
|
|
|
src_index1 += src_step; |
|
|
|
|
dst_index0 += dst_step; |
|
|
|
|
dst_index1 += dst_step; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|