|
|
|
@ -53,7 +53,7 @@ |
|
|
|
|
#define TSIZE ((int)sizeof(T1)*3) |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
#define LDS_STEP TILE_DIM |
|
|
|
|
#define LDS_STEP (TILE_DIM + 1) |
|
|
|
|
|
|
|
|
|
__kernel void transpose(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols, |
|
|
|
|
__global uchar * dstptr, int dst_step, int dst_offset) |
|
|
|
@ -90,6 +90,7 @@ __kernel void transpose(__global const uchar * srcptr, int src_step, int src_off |
|
|
|
|
{ |
|
|
|
|
int index_src = mad24(y, src_step, mad24(x, TSIZE, src_offset)); |
|
|
|
|
|
|
|
|
|
#pragma unroll |
|
|
|
|
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) |
|
|
|
|
if (y + i < src_rows) |
|
|
|
|
{ |
|
|
|
@ -103,6 +104,7 @@ __kernel void transpose(__global const uchar * srcptr, int src_step, int src_off |
|
|
|
|
{ |
|
|
|
|
int index_dst = mad24(y_index, dst_step, mad24(x_index, TSIZE, dst_offset)); |
|
|
|
|
|
|
|
|
|
#pragma unroll |
|
|
|
|
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) |
|
|
|
|
if ((y_index + i) < src_cols) |
|
|
|
|
{ |
|
|
|
@ -115,18 +117,24 @@ __kernel void transpose(__global const uchar * srcptr, int src_step, int src_off |
|
|
|
|
__kernel void transpose_inplace(__global uchar * srcptr, int src_step, int src_offset, int src_rows) |
|
|
|
|
{ |
|
|
|
|
int x = get_global_id(0); |
|
|
|
|
int y = get_global_id(1); |
|
|
|
|
int y = get_global_id(1) * rowsPerWI; |
|
|
|
|
|
|
|
|
|
if (y < src_rows && x < y) |
|
|
|
|
if (x < y + rowsPerWI) |
|
|
|
|
{ |
|
|
|
|
int src_index = mad24(y, src_step, mad24(x, TSIZE, src_offset)); |
|
|
|
|
int dst_index = mad24(x, src_step, mad24(y, TSIZE, src_offset)); |
|
|
|
|
T tmp; |
|
|
|
|
|
|
|
|
|
__global const uchar * src = srcptr + src_index; |
|
|
|
|
__global uchar * dst = srcptr + dst_index; |
|
|
|
|
#pragma unroll |
|
|
|
|
for (int i = 0; i < rowsPerWI; ++i, ++y, src_index += src_step, dst_index += TSIZE) |
|
|
|
|
if (y < src_rows && x < y) |
|
|
|
|
{ |
|
|
|
|
__global uchar * src = srcptr + src_index; |
|
|
|
|
__global uchar * dst = srcptr + dst_index; |
|
|
|
|
|
|
|
|
|
T tmp = loadpix(dst); |
|
|
|
|
storepix(loadpix(src), dst); |
|
|
|
|
storepix(tmp, src); |
|
|
|
|
tmp = loadpix(dst); |
|
|
|
|
storepix(loadpix(src), dst); |
|
|
|
|
storepix(tmp, src); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|