|
|
|
@ -43,6 +43,16 @@ |
|
|
|
|
// |
|
|
|
|
//M*/ |
|
|
|
|
|
|
|
|
|
// Pixel access helpers, selected at preprocessing time by the value of cn. |
// cn, T and T1 are not defined in this view; presumably they are supplied by the build options (TODO confirm). |
// |
// cn != 3: a pixel is accessed as a single value of type T. |
#if cn != 3 |
|
|
|
|
// Read one T from the __global address addr. |
#define loadpix(addr) *(__global const T *)(addr) |
|
|
|
|
// Write val as one T to the __global address addr. |
#define storepix(val, addr) *(__global T *)(addr) = val |
|
|
|
|
// Byte size of one pixel, as int, for use in byte-offset arithmetic. |
#define TSIZE (int)sizeof(T) |
|
|
|
|
// cn == 3: a pixel is accessed as three T1 elements through vload3/vstore3 rather than as one T. |
// NOTE(review): presumably because sizeof(T) for a 3-component vector type is not 3 * sizeof(T1) - confirm. |
#else |
|
|
|
|
// Read three T1 elements starting at addr (vector offset 0). |
#define loadpix(addr) vload3(0, (__global const T1 *)(addr)) |
|
|
|
|
// Write the three components of val as T1 elements starting at addr (vector offset 0). |
#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr)) |
|
|
|
|
// Byte size of one pixel: three T1 elements. |
#define TSIZE ((int)sizeof(T1)*3) |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
// Row stride, in elements, used to size and index the __local tile buffer; defined as TILE_DIM. |
#define LDS_STEP TILE_DIM |
|
|
|
|
|
|
|
|
|
__kernel void transpose(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols, |
|
|
|
@ -74,17 +84,16 @@ __kernel void transpose(__global const uchar * srcptr, int src_step, int src_off |
|
|
|
|
int x_index = mad24(groupId_y, TILE_DIM, lx); |
|
|
|
|
int y_index = mad24(groupId_x, TILE_DIM, ly); |
|
|
|
|
|
|
|
|
|
__local T title[TILE_DIM * LDS_STEP]; |
|
|
|
|
__local T tile[TILE_DIM * LDS_STEP]; |
|
|
|
|
|
|
|
|
|
if (x < src_cols && y < src_rows) |
|
|
|
|
{ |
|
|
|
|
int index_src = mad24(y, src_step, mad24(x, (int)sizeof(T), src_offset)); |
|
|
|
|
int index_src = mad24(y, src_step, mad24(x, TSIZE, src_offset)); |
|
|
|
|
|
|
|
|
|
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) |
|
|
|
|
if (y + i < src_rows) |
|
|
|
|
{ |
|
|
|
|
__global const T * src = (__global const T *)(srcptr + index_src); |
|
|
|
|
title[mad24(ly + i, LDS_STEP, lx)] = src[0]; |
|
|
|
|
tile[mad24(ly + i, LDS_STEP, lx)] = loadpix(srcptr + index_src); |
|
|
|
|
index_src = mad24(BLOCK_ROWS, src_step, index_src); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
@ -92,13 +101,12 @@ __kernel void transpose(__global const uchar * srcptr, int src_step, int src_off |
|
|
|
|
|
|
|
|
|
if (x_index < src_rows && y_index < src_cols) |
|
|
|
|
{ |
|
|
|
|
int index_dst = mad24(y_index, dst_step, mad24(x_index, (int)sizeof(T), dst_offset)); |
|
|
|
|
int index_dst = mad24(y_index, dst_step, mad24(x_index, TSIZE, dst_offset)); |
|
|
|
|
|
|
|
|
|
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) |
|
|
|
|
if ((y_index + i) < src_cols) |
|
|
|
|
{ |
|
|
|
|
__global T * dst = (__global T *)(dstptr + index_dst); |
|
|
|
|
dst[0] = title[mad24(lx, LDS_STEP, ly + i)]; |
|
|
|
|
storepix(tile[mad24(lx, LDS_STEP, ly + i)], dstptr + index_dst); |
|
|
|
|
index_dst = mad24(BLOCK_ROWS, dst_step, index_dst); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
@ -111,14 +119,14 @@ __kernel void transpose_inplace(__global uchar * srcptr, int src_step, int src_o |
|
|
|
|
|
|
|
|
|
if (y < src_rows && x < y) |
|
|
|
|
{ |
|
|
|
|
int src_index = mad24(y, src_step, mad24(x, (int)sizeof(T), src_offset)); |
|
|
|
|
int dst_index = mad24(x, src_step, mad24(y, (int)sizeof(T), src_offset)); |
|
|
|
|
int src_index = mad24(y, src_step, mad24(x, TSIZE, src_offset)); |
|
|
|
|
int dst_index = mad24(x, src_step, mad24(y, TSIZE, src_offset)); |
|
|
|
|
|
|
|
|
|
__global T * src = (__global T *)(srcptr + src_index); |
|
|
|
|
__global T * dst = (__global T *)(srcptr + dst_index); |
|
|
|
|
__global const uchar * src = srcptr + src_index; |
|
|
|
|
__global uchar * dst = srcptr + dst_index; |
|
|
|
|
|
|
|
|
|
T tmp = dst[0]; |
|
|
|
|
dst[0] = src[0]; |
|
|
|
|
src[0] = tmp; |
|
|
|
|
T tmp = loadpix(dst); |
|
|
|
|
storepix(loadpix(src), dst); |
|
|
|
|
storepix(tmp, src); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|