Merge pull request #12565 from dkurt:dnn_non_intel_gpu

* Remove isIntel check from deep learning layers

* Remove fp16->fp32 fallbacks where it's not necessary

* Fix Kernel::run to prevent localsize > globalsize
pull/12660/head
Dmitry Kurtaev 6 years ago committed by Alexander Alekhin
parent c8f3579f93
commit 24ab751547
  1. 150
      modules/core/include/opencv2/core/ocl.hpp
  2. 2
      modules/core/src/ocl.cpp
  3. 15
      modules/core/test/ocl/test_gemm.cpp
  4. 20
      modules/dnn/src/dnn.cpp
  5. 3
      modules/dnn/src/layers/batch_norm_layer.cpp
  6. 9
      modules/dnn/src/layers/blank_layer.cpp
  7. 9
      modules/dnn/src/layers/concat_layer.cpp
  8. 3
      modules/dnn/src/layers/convolution_layer.cpp
  9. 6
      modules/dnn/src/layers/crop_layer.cpp
  10. 3
      modules/dnn/src/layers/detection_output_layer.cpp
  11. 3
      modules/dnn/src/layers/eltwise_layer.cpp
  12. 9
      modules/dnn/src/layers/flatten_layer.cpp
  13. 3
      modules/dnn/src/layers/fully_connected_layer.cpp
  14. 3
      modules/dnn/src/layers/lrn_layer.cpp
  15. 3
      modules/dnn/src/layers/normalize_bbox_layer.cpp
  16. 16
      modules/dnn/src/layers/padding_layer.cpp
  17. 3
      modules/dnn/src/layers/permute_layer.cpp
  18. 3
      modules/dnn/src/layers/prior_box_layer.cpp
  19. 3
      modules/dnn/src/layers/region_layer.cpp
  20. 3
      modules/dnn/src/layers/reorg_layer.cpp
  21. 9
      modules/dnn/src/layers/reshape_layer.cpp
  22. 3
      modules/dnn/src/layers/shuffle_channel_layer.cpp
  23. 9
      modules/dnn/src/layers/slice_layer.cpp
  24. 3
      modules/dnn/src/layers/softmax_layer.cpp
  25. 6
      modules/dnn/src/layers/split_layer.cpp
  26. 3
      modules/dnn/src/ocl4dnn/src/ocl4dnn_lrn.cpp
  27. 2
      modules/dnn/src/opencl/prior_box.cl
  28. 4
      modules/dnn/test/test_onnx_importer.cpp
  29. 2
      modules/python/test/test_dnn.py

@ -59,7 +59,7 @@ CV_EXPORTS_W void finish();
CV_EXPORTS bool haveSVM();
class CV_EXPORTS Context;
class CV_EXPORTS Device;
class CV_EXPORTS_W_SIMPLE Device;
class CV_EXPORTS Kernel;
class CV_EXPORTS Program;
class CV_EXPORTS ProgramSource;
@ -67,14 +67,14 @@ class CV_EXPORTS Queue;
class CV_EXPORTS PlatformInfo;
class CV_EXPORTS Image2D;
class CV_EXPORTS Device
class CV_EXPORTS_W_SIMPLE Device
{
public:
Device();
CV_WRAP Device();
explicit Device(void* d);
Device(const Device& d);
Device& operator = (const Device& d);
~Device();
CV_WRAP ~Device();
void set(void* d);
@ -89,24 +89,24 @@ public:
TYPE_ALL = 0xFFFFFFFF
};
String name() const;
String extensions() const;
bool isExtensionSupported(const String& extensionName) const;
String version() const;
String vendorName() const;
String OpenCL_C_Version() const;
String OpenCLVersion() const;
int deviceVersionMajor() const;
int deviceVersionMinor() const;
String driverVersion() const;
CV_WRAP String name() const;
CV_WRAP String extensions() const;
CV_WRAP bool isExtensionSupported(const String& extensionName) const;
CV_WRAP String version() const;
CV_WRAP String vendorName() const;
CV_WRAP String OpenCL_C_Version() const;
CV_WRAP String OpenCLVersion() const;
CV_WRAP int deviceVersionMajor() const;
CV_WRAP int deviceVersionMinor() const;
CV_WRAP String driverVersion() const;
void* ptr() const;
int type() const;
CV_WRAP int type() const;
int addressBits() const;
bool available() const;
bool compilerAvailable() const;
bool linkerAvailable() const;
CV_WRAP int addressBits() const;
CV_WRAP bool available() const;
CV_WRAP bool compilerAvailable() const;
CV_WRAP bool linkerAvailable() const;
enum
{
@ -119,21 +119,21 @@ public:
FP_SOFT_FLOAT=(1 << 6),
FP_CORRECTLY_ROUNDED_DIVIDE_SQRT=(1 << 7)
};
int doubleFPConfig() const;
int singleFPConfig() const;
int halfFPConfig() const;
CV_WRAP int doubleFPConfig() const;
CV_WRAP int singleFPConfig() const;
CV_WRAP int halfFPConfig() const;
bool endianLittle() const;
bool errorCorrectionSupport() const;
CV_WRAP bool endianLittle() const;
CV_WRAP bool errorCorrectionSupport() const;
enum
{
EXEC_KERNEL=(1 << 0),
EXEC_NATIVE_KERNEL=(1 << 1)
};
int executionCapabilities() const;
CV_WRAP int executionCapabilities() const;
size_t globalMemCacheSize() const;
CV_WRAP size_t globalMemCacheSize() const;
enum
{
@ -141,38 +141,38 @@ public:
READ_ONLY_CACHE=1,
READ_WRITE_CACHE=2
};
int globalMemCacheType() const;
int globalMemCacheLineSize() const;
size_t globalMemSize() const;
CV_WRAP int globalMemCacheType() const;
CV_WRAP int globalMemCacheLineSize() const;
CV_WRAP size_t globalMemSize() const;
size_t localMemSize() const;
CV_WRAP size_t localMemSize() const;
enum
{
NO_LOCAL_MEM=0,
LOCAL_IS_LOCAL=1,
LOCAL_IS_GLOBAL=2
};
int localMemType() const;
bool hostUnifiedMemory() const;
CV_WRAP int localMemType() const;
CV_WRAP bool hostUnifiedMemory() const;
bool imageSupport() const;
CV_WRAP bool imageSupport() const;
bool imageFromBufferSupport() const;
CV_WRAP bool imageFromBufferSupport() const;
uint imagePitchAlignment() const;
uint imageBaseAddressAlignment() const;
/// deprecated, use isExtensionSupported() method (probably with "cl_khr_subgroups" value)
bool intelSubgroupsSupport() const;
CV_WRAP bool intelSubgroupsSupport() const;
size_t image2DMaxWidth() const;
size_t image2DMaxHeight() const;
CV_WRAP size_t image2DMaxWidth() const;
CV_WRAP size_t image2DMaxHeight() const;
size_t image3DMaxWidth() const;
size_t image3DMaxHeight() const;
size_t image3DMaxDepth() const;
CV_WRAP size_t image3DMaxWidth() const;
CV_WRAP size_t image3DMaxHeight() const;
CV_WRAP size_t image3DMaxDepth() const;
size_t imageMaxBufferSize() const;
size_t imageMaxArraySize() const;
CV_WRAP size_t imageMaxBufferSize() const;
CV_WRAP size_t imageMaxArraySize() const;
enum
{
@ -181,53 +181,53 @@ public:
VENDOR_INTEL=2,
VENDOR_NVIDIA=3
};
int vendorID() const;
CV_WRAP int vendorID() const;
// FIXIT
// dev.isAMD() doesn't work for OpenCL CPU devices from AMD OpenCL platform.
// This method should use platform name instead of vendor name.
// After fix restore code in arithm.cpp: ocl_compare()
inline bool isAMD() const { return vendorID() == VENDOR_AMD; }
inline bool isIntel() const { return vendorID() == VENDOR_INTEL; }
inline bool isNVidia() const { return vendorID() == VENDOR_NVIDIA; }
CV_WRAP inline bool isAMD() const { return vendorID() == VENDOR_AMD; }
CV_WRAP inline bool isIntel() const { return vendorID() == VENDOR_INTEL; }
CV_WRAP inline bool isNVidia() const { return vendorID() == VENDOR_NVIDIA; }
int maxClockFrequency() const;
int maxComputeUnits() const;
int maxConstantArgs() const;
size_t maxConstantBufferSize() const;
CV_WRAP int maxClockFrequency() const;
CV_WRAP int maxComputeUnits() const;
CV_WRAP int maxConstantArgs() const;
CV_WRAP size_t maxConstantBufferSize() const;
size_t maxMemAllocSize() const;
size_t maxParameterSize() const;
CV_WRAP size_t maxMemAllocSize() const;
CV_WRAP size_t maxParameterSize() const;
int maxReadImageArgs() const;
int maxWriteImageArgs() const;
int maxSamplers() const;
CV_WRAP int maxReadImageArgs() const;
CV_WRAP int maxWriteImageArgs() const;
CV_WRAP int maxSamplers() const;
size_t maxWorkGroupSize() const;
int maxWorkItemDims() const;
CV_WRAP size_t maxWorkGroupSize() const;
CV_WRAP int maxWorkItemDims() const;
void maxWorkItemSizes(size_t*) const;
int memBaseAddrAlign() const;
CV_WRAP int memBaseAddrAlign() const;
int nativeVectorWidthChar() const;
int nativeVectorWidthShort() const;
int nativeVectorWidthInt() const;
int nativeVectorWidthLong() const;
int nativeVectorWidthFloat() const;
int nativeVectorWidthDouble() const;
int nativeVectorWidthHalf() const;
CV_WRAP int nativeVectorWidthChar() const;
CV_WRAP int nativeVectorWidthShort() const;
CV_WRAP int nativeVectorWidthInt() const;
CV_WRAP int nativeVectorWidthLong() const;
CV_WRAP int nativeVectorWidthFloat() const;
CV_WRAP int nativeVectorWidthDouble() const;
CV_WRAP int nativeVectorWidthHalf() const;
int preferredVectorWidthChar() const;
int preferredVectorWidthShort() const;
int preferredVectorWidthInt() const;
int preferredVectorWidthLong() const;
int preferredVectorWidthFloat() const;
int preferredVectorWidthDouble() const;
int preferredVectorWidthHalf() const;
CV_WRAP int preferredVectorWidthChar() const;
CV_WRAP int preferredVectorWidthShort() const;
CV_WRAP int preferredVectorWidthInt() const;
CV_WRAP int preferredVectorWidthLong() const;
CV_WRAP int preferredVectorWidthFloat() const;
CV_WRAP int preferredVectorWidthDouble() const;
CV_WRAP int preferredVectorWidthHalf() const;
size_t printfBufferSize() const;
size_t profilingTimerResolution() const;
CV_WRAP size_t printfBufferSize() const;
CV_WRAP size_t profilingTimerResolution() const;
static const Device& getDefault();
CV_WRAP static const Device& getDefault();
protected:
struct Impl;

@ -3078,7 +3078,7 @@ bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
dims == 1 ? 64 : dims == 2 ? (i == 0 ? 256 : 8) : dims == 3 ? (8>>(int)(i>0)) : 1;
CV_Assert( val > 0 );
total *= _globalsize[i];
if (_globalsize[i] == 1)
if (_globalsize[i] == 1 && !_localsize)
val = 1;
globalsize[i] = divUp(_globalsize[i], (unsigned int)val) * val;
}

@ -145,6 +145,21 @@ OCL_INSTANTIATE_TEST_CASE_P(Core, Gemm, ::testing::Combine(
testing::Values(CV_32FC1, CV_32FC2, CV_64FC1, CV_64FC2),
Bool(), Bool(), Bool(), Bool()));
// Test for non-Intel GPUs to check CL_INVALID_WORK_GROUP_SIZE when localsize > globalsize
OCL_TEST(Gemm, small)
{
UMat A(2, 3, CV_32F), B(4, 3, CV_32F), uC(2, 4, CV_32F);
Mat C(2, 4, CV_32F);
randu(A, -1, 1);
randu(B, -1, 1);
OCL_OFF(cv::gemm(A, B, 1, noArray(), 0, C, GEMM_2_T));
OCL_ON(cv::gemm(A, B, 1, noArray(), 0, uC, GEMM_2_T));
EXPECT_LE(cvtest::norm(C, uC, cv::NORM_INF), 1e-5);
}
} } // namespace opencv_test::ocl
#endif // HAVE_OPENCL

@ -1078,12 +1078,22 @@ struct Net::Impl
}
#else
{
if (!DNN_OPENCL_ALLOW_ALL_DEVICES
&& !(ocl::Device::getDefault().isIntel() && ocl::Device::getDefault().type() == ocl::Device::TYPE_GPU) // Current implementation is only valid for Intel GPU (#11494)
)
if (!DNN_OPENCL_ALLOW_ALL_DEVICES)
{
CV_LOG_WARNING(NULL, "DNN: OpenCL target is not supported with current OpenCL device (tested with Intel GPUs only), switching to CPU.");
preferableTarget = DNN_TARGET_CPU;
// Current implementation is only valid for GPU (#11494)
if (ocl::Device::getDefault().type() != ocl::Device::TYPE_GPU)
{
CV_LOG_WARNING(NULL, "DNN: OpenCL target is not supported with current OpenCL device (tested with GPUs only), switching to CPU.");
preferableTarget = DNN_TARGET_CPU;
}
else if (preferableTarget == DNN_TARGET_OPENCL_FP16 && !ocl::Device::getDefault().isIntel())
{
CV_LOG_WARNING(NULL,
"DNN: OpenCL target with fp16 precision is not supported "
"with current OpenCL device (tested with Intel GPUs only), "
"switching to OpenCL with fp32 precision.");
preferableTarget = DNN_TARGET_OPENCL;
}
}
}
#endif

@ -230,8 +230,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
if (inputs_arr.depth() == CV_16S)

@ -95,16 +95,9 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
if (inputs_arr.depth() == CV_16S)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);

@ -237,16 +237,9 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
if (inputs_arr.depth() == CV_16S)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);

@ -1529,8 +1529,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr));
if (inputs_arr.depth() == CV_16S)

@ -137,12 +137,6 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
if (inputs_arr.depth() == CV_16S)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);

@ -415,8 +415,7 @@ public:
if (_bboxesNormalized)
{
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
}
if (inputs_arr.depth() == CV_16S)

@ -354,8 +354,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
if (inputs_arr.depth() == CV_16S)

@ -135,16 +135,9 @@ public:
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
outputs_arr.isUMatVector() &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
outputs_arr.isUMatVector(),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
if (inputs_arr.depth() == CV_16S)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);

@ -389,8 +389,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
if (inputs_arr.depth() == CV_16S)

@ -148,8 +148,7 @@ public:
CV_Assert(inputs_arr.total() == outputs_arr.total());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
if (inputs_arr.depth() == CV_16S)

@ -184,8 +184,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
if (inputs_arr.depth() == CV_16S)

@ -99,19 +99,21 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
if (inputs_arr.depth() == CV_16S)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
if (paddingType == "constant")
{
outputs[0].setTo(paddingValue);
if (inputs_arr.depth() == CV_16S)
{
std::vector<float> paddingValue_fp32(1, paddingValue);
std::vector<int16_t> paddingValue_fp16(1);
convertFp16(paddingValue_fp32, paddingValue_fp16);
outputs[0].setTo(paddingValue_fp16[0]);
}
else
outputs[0].setTo(paddingValue);
inputs[0].copyTo(outputs[0](dstRanges));
}
else if (paddingType == "reflect")

@ -304,8 +304,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
if (inputs_arr.depth() == CV_16S)

@ -402,8 +402,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
if (inputs_arr.depth() == CV_16S)

@ -196,8 +196,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
if (inputs_arr.depth() == CV_16S)

@ -160,8 +160,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
if (inputs_arr.depth() == CV_16S)

@ -233,16 +233,9 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
if (inputs_arr.depth() == CV_16S)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);

@ -92,8 +92,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
if (inputs_arr.depth() == CV_16S)

@ -239,16 +239,9 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
if (inputs_arr.depth() == CV_16S)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);

@ -187,8 +187,7 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
if (inputs_arr.depth() == CV_16S)

@ -83,12 +83,6 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
if (inputs_arr.depth() == CV_16S)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);

@ -69,9 +69,6 @@ bool OCL4DNNLRN<Dtype>::Forward(const UMat& bottom, UMat& top)
{
bool ret = true;
if (!ocl::Device::getDefault().intelSubgroupsSupport())
return false;
switch (lrn_type_)
{
case LRNParameter_NormRegion_ACROSS_CHANNELS:

@ -114,6 +114,6 @@ __kernel void clip(const int nthreads,
for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
{
Dtype4 vec = vload4(index, dst);
vstore4(clamp(vec, 0, 1), index, dst);
vstore4(clamp(vec, 0.0f, 1.0f), index, dst);
}
}

@ -295,7 +295,7 @@ TEST_P(Test_ONNX_nets, TinyYolov2)
TEST_P(Test_ONNX_nets, CNN_MNIST)
{
// output range: [-1952; 6574]
const double l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 3.82 : 4.3e-4;
const double l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 3.82 : 4.4e-4;
const double lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 13.5 : 2e-3;
testONNXModels("cnn_mnist", pb, l1, lInf);
@ -341,7 +341,7 @@ TEST_P(Test_ONNX_nets, Inception_v2)
TEST_P(Test_ONNX_nets, DenseNet121)
{
// output range: [-87; 138]
const double l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.12 : 1.88e-5;
const double l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.12 : 2.2e-5;
const double lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.74 : 1.23e-4;
testONNXModels("densenet121", pb, l1, lInf);
}

@ -95,7 +95,7 @@ if haveInfEngine:
if cv.ocl.haveOpenCL() and cv.ocl.useOpenCL():
dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_TARGET_OPENCL])
dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_TARGET_OPENCL_FP16])
if haveInfEngine: # FIXIT Check Intel iGPU only
if haveInfEngine and cv.ocl_Device.getDefault().isIntel():
dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_OPENCL])
dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_OPENCL_FP16])

Loading…
Cancel
Save