mirror of https://github.com/opencv/opencv.git
Merge pull request #17764 from alalek:issue_17762
commit
09f24a851e
4 changed files with 624 additions and 107 deletions
@ -0,0 +1,95 @@ |
|||||||
|
// This file is part of OpenCV project.
|
||||||
|
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||||
|
// of this distribution and at http://opencv.org/license.html.
|
||||||
|
|
||||||
|
#include "perf_precomp.hpp" |
||||||
|
#include <opencv2/dnn/shape_utils.hpp> |
||||||
|
|
||||||
|
namespace opencv_test { |
||||||
|
|
||||||
|
struct Layer_Slice : public TestBaseWithParam<tuple<Backend, Target> > |
||||||
|
{ |
||||||
|
template<int DIMS> |
||||||
|
void test_slice(const int* inputShape, const int* begin, const int* end) |
||||||
|
{ |
||||||
|
int backendId = get<0>(GetParam()); |
||||||
|
int targetId = get<1>(GetParam()); |
||||||
|
|
||||||
|
Mat input(DIMS, inputShape, CV_32FC1, Scalar::all(0)); |
||||||
|
for (int i = 0; i < (int)input.total(); ++i) |
||||||
|
input.ptr<float>()[i] = (float)(i & 4095); |
||||||
|
|
||||||
|
std::vector<Range> range(DIMS); |
||||||
|
for (int i = 0; i < DIMS; ++i) |
||||||
|
range[i] = Range(begin[i], end[i]); |
||||||
|
|
||||||
|
Net net; |
||||||
|
LayerParams lp; |
||||||
|
lp.type = "Slice"; |
||||||
|
lp.name = "testLayer"; |
||||||
|
lp.set("begin", DictValue::arrayInt<int*>((int*)&begin[0], DIMS)); |
||||||
|
lp.set("end", DictValue::arrayInt<int*>((int*)&end[0], DIMS)); |
||||||
|
net.addLayerToPrev(lp.name, lp.type, lp); |
||||||
|
|
||||||
|
// warmup
|
||||||
|
{ |
||||||
|
net.setInput(input); |
||||||
|
net.setPreferableBackend(backendId); |
||||||
|
net.setPreferableTarget(targetId); |
||||||
|
Mat out = net.forward(); |
||||||
|
|
||||||
|
EXPECT_GT(cv::norm(out, NORM_INF), 0); |
||||||
|
#if 0 |
||||||
|
//normAssert(out, input(range));
|
||||||
|
cout << input(range).clone().reshape(1, 1) << endl; |
||||||
|
cout << out.reshape(1, 1) << endl; |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
TEST_CYCLE() |
||||||
|
{ |
||||||
|
Mat res = net.forward(); |
||||||
|
} |
||||||
|
|
||||||
|
SANITY_CHECK_NOTHING(); |
||||||
|
} |
||||||
|
}; |
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
PERF_TEST_P_(Layer_Slice, YOLOv4_tiny_1) |
||||||
|
{ |
||||||
|
const int inputShape[4] = {1, 64, 104, 104}; |
||||||
|
const int begin[] = {0, 32, 0, 0}; |
||||||
|
const int end[] = {1, 64, 104, 104}; |
||||||
|
test_slice<4>(inputShape, begin, end); |
||||||
|
} |
||||||
|
|
||||||
|
PERF_TEST_P_(Layer_Slice, YOLOv4_tiny_2) |
||||||
|
{ |
||||||
|
const int inputShape[4] = {1, 128, 52, 52}; |
||||||
|
const int begin[] = {0, 64, 0, 0}; |
||||||
|
const int end[] = {1, 128, 52, 52}; |
||||||
|
test_slice<4>(inputShape, begin, end); |
||||||
|
} |
||||||
|
|
||||||
|
PERF_TEST_P_(Layer_Slice, YOLOv4_tiny_3) |
||||||
|
{ |
||||||
|
const int inputShape[4] = {1, 256, 26, 26}; |
||||||
|
const int begin[] = {0, 128, 0, 0}; |
||||||
|
const int end[] = {1, 256, 26, 26}; |
||||||
|
test_slice<4>(inputShape, begin, end); |
||||||
|
} |
||||||
|
|
||||||
|
|
||||||
|
PERF_TEST_P_(Layer_Slice, FastNeuralStyle_eccv16) |
||||||
|
{ |
||||||
|
const int inputShape[4] = {1, 128, 80, 100}; |
||||||
|
const int begin[] = {0, 0, 2, 2}; |
||||||
|
const int end[] = {1, 128, 76, 96}; |
||||||
|
test_slice<4>(inputShape, begin, end); |
||||||
|
} |
||||||
|
|
||||||
|
INSTANTIATE_TEST_CASE_P(/**/, Layer_Slice, dnnBackendsAndTargets(false, false)); |
||||||
|
|
||||||
|
} // namespace
|
@ -1,81 +1,283 @@ |
|||||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
// This file is part of OpenCV project. |
||||||
// |
// It is subject to the license terms in the LICENSE file found in the top-level directory |
||||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
// of this distribution and at http://opencv.org/license.html. |
||||||
// |
|
||||||
// By downloading, copying, installing or using the software you agree to this license. |
// Copyright (C) 2020, Intel Corporation, all rights reserved. |
||||||
// If you do not agree to this license, do not download, install, |
|
||||||
// copy or use the software. |
|
||||||
// |
|
||||||
// |
|
||||||
// License Agreement |
|
||||||
// For Open Source Computer Vision Library |
|
||||||
// |
|
||||||
// Copyright (C) 2017, Intel Corporation, all rights reserved. |
|
||||||
// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. |
|
||||||
// Third party copyrights are property of their respective owners. |
// Third party copyrights are property of their respective owners. |
||||||
// |
|
||||||
// Redistribution and use in source and binary forms, with or without modification, |
/* |
||||||
// are permitted provided that the following conditions are met: |
Specialization constants: |
||||||
// |
- WSZ: size of OpenCL local group |
||||||
// * Redistribution's of source code must retain the above copyright notice, |
- DIMS: number of working dimensions |
||||||
// this list of conditions and the following disclaimer. |
- ELEMSIZE: element size in bytes |
||||||
// |
- DST_SZ_<i>: dst sizes |
||||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
- SRC_START_<i>: src index shift (slice .start value) |
||||||
// this list of conditions and the following disclaimer in the documentation |
- SRC_STEP_<i>: src steps (bytes) |
||||||
// and/or other materials provided with the distribution. |
- DST_STEP_<i>: dst steps (bytes), derived from DST_SZ_<i> and ELEMSIZE |
||||||
// |
- BLOCK_DIMS: number of dims for copy block (argmax(count(SRC_STEP_<i> != DST_STEP_<i>) <= 1)) |
||||||
// * The name of the copyright holders may not be used to endorse or promote products |
- BLOCK_DIMS_CONTIGUOUS (<= BLOCK_DIMS): SRC_STEP_<i> == DST_STEP_<i> for i in [0, BLOCK_DIMS_CONTIGUOUS) |
||||||
// derived from this software without specific prior written permission. |
|
||||||
// |
derived specialization constants: |
||||||
// This software is provided by the copyright holders and contributors "as is" and |
- BLOCK_SIZE: ELEMSIZE * mul(DST_SZ_<i>) for i in [0, BLOCK_DIMS) |
||||||
// any express or implied warranties, including, but not limited to, the implied |
|
||||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
- USE_COPY_1D iff BLOCK_DIMS == BLOCK_DIMS_CONTIGUOUS |
||||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
- BLOCK_COLS: |
||||||
// indirect, incidental, special, exemplary, or consequential damages |
* with USE_COPY_1D: BLOCK_SIZE |
||||||
// (including, but not limited to, procurement of substitute goods or services; |
* w/o USE_COPY_1D: ELEMSIZE * mul(DST_SZ_<i>) for i in [0, BLOCK_DIMS_CONTIGUOUS) |
||||||
// loss of use, data, or profits; or business interruption) however caused |
- BLOCK_ROWS: |
||||||
// and on any theory of liability, whether in contract, strict liability, |
* with USE_COPY_1D: N/A |
||||||
// or tort (including negligence or otherwise) arising in any way out of |
* w/o USE_COPY_1D: ELEMSIZE * mul(DST_SZ_<i>) for i in [BLOCK_DIMS_CONTIGUOUS, BLOCK_DIMS) |
||||||
// the use of this software, even if advised of the possibility of such damage. |
- BLOCK_SRC_STRIDE: |
||||||
// |
* with USE_COPY_1D: N/A |
||||||
//M*/ |
* w/o USE_COPY_1D: ELEMSIZE * mul(SRC_STEP_<i>) for i in [0, BLOCK_DIMS_CONTIGUOUS) |
||||||
|
|
||||||
#if defined(cl_khr_fp16) |
Note: SZ, STEP values are in reversed order than OpenCV Mat: |
||||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable |
- NCHW SZ: [cols, rows, channels, batch] |
||||||
#endif |
- NCHW STEP: [elemsize, cols * elemsize, rows * cols * elemsize, ...] (DIMS+1 value) |
||||||
|
|
||||||
__kernel void slice(__global const Dtype* src, |
*/ |
||||||
const int src_plane_size, |
|
||||||
const int dst_plane_size, |
/* |
||||||
const int src_cols, |
local: <WSZ, 1, 1> |
||||||
const int dst_cols, |
global: <WSZ, number_of_copy_blocks, 1> |
||||||
const int row_offset, |
*/ |
||||||
const int col_offset, |
|
||||||
__global Dtype* dst) |
#define CONCAT_(A, B) A##B |
||||||
|
#define CONCAT(A, B) CONCAT_(A, B) |
||||||
|
|
||||||
|
#define BLOCK_COLS_X4 (BLOCK_COLS / 4) |
||||||
|
#define BLOCK_COLS_X16 (BLOCK_COLS / 16) |
||||||
|
|
||||||
|
#ifdef USE_COPY_1D |
||||||
|
|
||||||
|
static inline |
||||||
|
__attribute__((always_inline)) |
||||||
|
void copy_block_1d( |
||||||
|
__global const uchar* src0, |
||||||
|
const uint src_offset, |
||||||
|
__global uchar* dst0, |
||||||
|
const uint dst_offset |
||||||
|
) |
||||||
{ |
{ |
||||||
unsigned int row_gid = get_group_id(0); |
__global const uchar* src = src0 + src_offset; |
||||||
unsigned int lid = get_local_id(0); |
__global uchar* dst = dst0 + dst_offset; |
||||||
const __global Dtype *src_read = src + row_gid * 4 * src_plane_size; |
|
||||||
__global Dtype *dst_read = dst + row_gid * 4 * dst_plane_size; |
uint processed = 0; |
||||||
Dtype4 a0, a1, a2, a3; |
|
||||||
|
#if BLOCK_COLS_X16 >= 4 |
||||||
int i = lid; |
|
||||||
while( i < dst_plane_size / 4) |
|
||||||
{ |
{ |
||||||
int row = (4 * i) / dst_cols + row_offset; |
// uchar16 x 4rows per iteration |
||||||
int col = (4 * i) % dst_cols + col_offset; |
uint i = get_local_id(0) * 16; // uchar16 |
||||||
int src_index = row * src_cols + col; |
while (i < BLOCK_COLS_X16 * 16) |
||||||
|
{ |
||||||
|
uint4 idx = (uint4)(i, i + 16 * WSZ, i + 32 * WSZ, i + 48 * WSZ); |
||||||
|
idx = select((uint4)i, idx, idx < (BLOCK_COLS_X16 * 16)); |
||||||
|
|
||||||
a0 = vload4(0, src_read + src_index); |
uchar16 a0 = vload16(0, src + idx.s0); |
||||||
a1 = vload4(0, src_read + src_index + src_plane_size); |
uchar16 a1 = vload16(0, src + idx.s1); |
||||||
a2 = vload4(0, src_read + src_index + 2 * src_plane_size); |
uchar16 a2 = vload16(0, src + idx.s2); |
||||||
a3 = vload4(0, src_read + src_index + 3 * src_plane_size); |
uchar16 a3 = vload16(0, src + idx.s3); |
||||||
|
|
||||||
vstore4(a0, i, dst_read); |
vstore16(a0, 0, dst + idx.s0); |
||||||
vstore4(a1, i, dst_read + dst_plane_size); |
vstore16(a1, 0, dst + idx.s1); |
||||||
vstore4(a2, i, dst_read + 2 * dst_plane_size); |
vstore16(a2, 0, dst + idx.s2); |
||||||
vstore4(a3, i, dst_read + 3 * dst_plane_size); |
vstore16(a3, 0, dst + idx.s3); |
||||||
|
|
||||||
i += get_local_size(0); |
i += WSZ * 16 * 4; |
||||||
|
} |
||||||
|
processed = BLOCK_COLS_X16 * 16; |
||||||
} |
} |
||||||
|
#else |
||||||
|
#define SKIP_1D_BLOCK_COLS_X16 1 |
||||||
|
#endif |
||||||
|
|
||||||
|
#if BLOCK_COLS_X4 > 0 && (defined(SKIP_1D_BLOCK_COLS_X16) || (BLOCK_COLS_X16 * 16 != BLOCK_COLS_X4 * 4)) |
||||||
|
{ |
||||||
|
// uchar4 x 4rows per iteration |
||||||
|
uint i = get_local_id(0) * 4 + processed; // uchar4 |
||||||
|
while (i < BLOCK_COLS_X4 * 4) |
||||||
|
{ |
||||||
|
uint4 idx = (uint4)(i, i + 4 * WSZ, i + 8 * WSZ, i + 12 * WSZ); |
||||||
|
idx = select((uint4)i, idx, idx < (BLOCK_COLS_X4 * 4)); |
||||||
|
|
||||||
|
uchar4 a0 = vload4(0, src + idx.s0); |
||||||
|
uchar4 a1 = vload4(0, src + idx.s1); |
||||||
|
uchar4 a2 = vload4(0, src + idx.s2); |
||||||
|
uchar4 a3 = vload4(0, src + idx.s3); |
||||||
|
|
||||||
|
vstore4(a0, 0, dst + idx.s0); |
||||||
|
vstore4(a1, 0, dst + idx.s1); |
||||||
|
vstore4(a2, 0, dst + idx.s2); |
||||||
|
vstore4(a3, 0, dst + idx.s3); |
||||||
|
|
||||||
|
i += WSZ * 4 * 4; |
||||||
|
} |
||||||
|
processed = BLOCK_COLS_X4 * 4; |
||||||
|
} |
||||||
|
#else |
||||||
|
#define SKIP_1D_BLOCK_COLS_X4 1 |
||||||
|
#endif // BLOCK_COLS_X4 > 0 |
||||||
|
|
||||||
|
#if (defined(SKIP_1D_BLOCK_COLS_X16) && defined(SKIP_1D_BLOCK_COLS_X4)) || BLOCK_COLS_X4 * 4 != BLOCK_COLS |
||||||
|
{ |
||||||
|
uint i = get_local_id(0) + processed; |
||||||
|
while (i < BLOCK_COLS) |
||||||
|
{ |
||||||
|
uchar a0 = src[i]; |
||||||
|
dst[i] = a0; |
||||||
|
|
||||||
|
i += WSZ; |
||||||
|
} |
||||||
|
} |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
#else // USE_COPY_1D |
||||||
|
|
||||||
|
static inline |
||||||
|
__attribute__((always_inline)) |
||||||
|
void copy_block_2d( |
||||||
|
__global const uchar* src0, |
||||||
|
const uint src_offset0, |
||||||
|
__global uchar* dst0, |
||||||
|
const uint dst_offset0 |
||||||
|
) |
||||||
|
{ |
||||||
|
__global const uchar* src = src0 + src_offset0; |
||||||
|
__global uchar* dst = dst0 + dst_offset0; |
||||||
|
|
||||||
|
uint i = get_local_id(0) * 4; |
||||||
|
|
||||||
|
#define BLOCK_COLS_FILL_X4 (((BLOCK_COLS + 3) / 4) * 4) |
||||||
|
#define BLOCK_SIZE_FILL_X4 (BLOCK_COLS_FILL_X4 * BLOCK_ROWS) |
||||||
|
|
||||||
|
while (i < BLOCK_SIZE_FILL_X4) |
||||||
|
{ |
||||||
|
int row = i / BLOCK_COLS_FILL_X4; |
||||||
|
int col = i % BLOCK_COLS_FILL_X4; |
||||||
|
|
||||||
|
uint src_offset = row * BLOCK_SRC_STRIDE + col; |
||||||
|
#if BLOCK_COLS_FILL_X4 == BLOCK_COLS |
||||||
|
uint dst_offset = i; |
||||||
|
#else |
||||||
|
uint dst_offset = row * BLOCK_COLS + col; |
||||||
|
#endif |
||||||
|
|
||||||
|
#if BLOCK_COLS_FILL_X4 != BLOCK_COLS |
||||||
|
if (col <= BLOCK_COLS - 4) |
||||||
|
#endif |
||||||
|
{ |
||||||
|
uchar4 a = vload4(0, src + src_offset); |
||||||
|
vstore4(a, 0, dst + dst_offset); |
||||||
|
} |
||||||
|
#if BLOCK_COLS_FILL_X4 != BLOCK_COLS |
||||||
|
else |
||||||
|
{ |
||||||
|
/* non-optimized reference code |
||||||
|
while (col < BLOCK_COLS) |
||||||
|
{ |
||||||
|
uchar a = src[src_offset]; |
||||||
|
dst[dst_offset] = a; |
||||||
|
col++; |
||||||
|
src_offset++; |
||||||
|
dst_offset++; |
||||||
|
} |
||||||
|
*/ |
||||||
|
|
||||||
|
uint4 shift = (uint4)(0, 1, 2, 3); |
||||||
|
shift = select((uint4)0, shift, col + shift < BLOCK_COLS); |
||||||
|
|
||||||
|
dst[dst_offset + shift.s0] = src[src_offset + shift.s0]; |
||||||
|
|
||||||
|
#if BLOCK_COLS_FILL_X4 - BLOCK_COLS <= 2 |
||||||
|
dst[dst_offset + shift.s1] = src[src_offset + shift.s1]; |
||||||
|
#endif |
||||||
|
#if BLOCK_COLS_FILL_X4 - BLOCK_COLS <= 1 |
||||||
|
dst[dst_offset + shift.s2] = src[src_offset + shift.s2]; |
||||||
|
#endif |
||||||
|
} |
||||||
|
#endif // BLOCK_COLS_FILL_X4 != BLOCK_COLS |
||||||
|
i += WSZ * 4; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
#endif // USE_COPY_1D |
||||||
|
|
||||||
|
__kernel void |
||||||
|
CONCAT(slice_, DIMS)( |
||||||
|
__global const uchar* src, |
||||||
|
__global uchar* dst |
||||||
|
) |
||||||
|
{ |
||||||
|
uint block_id = get_global_id(1); |
||||||
|
|
||||||
|
uint dst_offset = block_id * BLOCK_SIZE; |
||||||
|
|
||||||
|
uint src_offset = 0; |
||||||
|
|
||||||
|
#define CALC_SRC_INDEX(dim) \ |
||||||
|
{ \ |
||||||
|
uint plane_sz = CONCAT(DST_STEP_, dim) / BLOCK_SIZE; \ |
||||||
|
CONCAT(idx_, dim) = block_id / plane_sz; \ |
||||||
|
block_id = block_id - CONCAT(idx_, dim) * plane_sz; \ |
||||||
|
} |
||||||
|
#define UPDATE_SRC_OFFSET(dim) \ |
||||||
|
src_offset = mad24((uint)(CONCAT(idx_, dim) + CONCAT(SRC_START_, dim)), (uint)CONCAT(SRC_STEP_, dim), (uint)src_offset); |
||||||
|
/* |
||||||
|
if (get_global_id(0) == 0 && get_global_id(1) == 0) \ |
||||||
|
printf("(%d, %d): @%d src_offset=%d idx_dim=%d block_id=%d\n", \ |
||||||
|
get_global_id(0), get_global_id(1), \ |
||||||
|
dim, src_offset, CONCAT(idx_, dim), block_id \ |
||||||
|
); |
||||||
|
*/ |
||||||
|
|
||||||
|
#if DIMS > 5 |
||||||
|
#error "invalid configuration" |
||||||
|
#endif |
||||||
|
#if DIMS > 4 |
||||||
|
uint idx_4 = 0; |
||||||
|
#if BLOCK_DIMS <= 4 |
||||||
|
CALC_SRC_INDEX(4) |
||||||
|
#endif |
||||||
|
UPDATE_SRC_OFFSET(4) |
||||||
|
#endif |
||||||
|
#if DIMS > 3 |
||||||
|
uint idx_3 = 0; |
||||||
|
#if BLOCK_DIMS <= 3 |
||||||
|
CALC_SRC_INDEX(3) |
||||||
|
#endif |
||||||
|
UPDATE_SRC_OFFSET(3) |
||||||
|
#endif |
||||||
|
#if DIMS > 2 |
||||||
|
uint idx_2 = 0; |
||||||
|
#if BLOCK_DIMS <= 2 |
||||||
|
CALC_SRC_INDEX(2) |
||||||
|
#endif |
||||||
|
UPDATE_SRC_OFFSET(2) |
||||||
|
#endif |
||||||
|
#if DIMS > 1 |
||||||
|
uint idx_1 = 0; |
||||||
|
#if BLOCK_DIMS <= 1 |
||||||
|
CALC_SRC_INDEX(1) |
||||||
|
#endif |
||||||
|
UPDATE_SRC_OFFSET(1) |
||||||
|
#endif |
||||||
|
#if DIMS > 0 |
||||||
|
uint idx_0 = 0; |
||||||
|
UPDATE_SRC_OFFSET(0) |
||||||
|
#endif |
||||||
|
|
||||||
|
/* |
||||||
|
if (get_global_id(0) == 0) |
||||||
|
printf("(%d, %d): src_offset=%d dst_offset=%d\n", |
||||||
|
get_global_id(0), get_global_id(1), |
||||||
|
src_offset, dst_offset |
||||||
|
); |
||||||
|
*/ |
||||||
|
|
||||||
|
#ifdef USE_COPY_1D |
||||||
|
copy_block_1d(src, src_offset, dst, dst_offset); |
||||||
|
#else |
||||||
|
copy_block_2d(src, src_offset, dst, dst_offset); |
||||||
|
#endif |
||||||
} |
} |
||||||
|
Loading…
Reference in new issue