Merge remote-tracking branch 'upstream/3.4' into merge-3.4

pull/14161/head
Alexander Alekhin 6 years ago
commit 90df5e00b4
  1. 122
      3rdparty/include/opencl/1.2/CL/cl_d3d11_ext.h
  2. 8
      CMakeLists.txt
  3. 9
      cmake/OpenCVDetectOpenCL.cmake
  4. 3
      cmake/templates/cvconfig.h.in
  5. 1
      modules/core/include/opencv2/core/opencl/runtime/autogenerated/opencl_core.hpp
  6. 551
      modules/core/src/directx.cpp
  7. 3
      modules/core/src/directx.inc.hpp
  8. 1
      modules/core/src/opencl/runtime/generator/template/opencl_core.hpp.in
  9. 1
      modules/core/src/opencl/runtime/opencl_core.cpp
  10. 2
      modules/dnn/src/layers/lrn_layer.cpp
  11. 31
      modules/dnn/src/onnx/onnx_importer.cpp
  12. 4
      modules/dnn/test/test_halide_layers.cpp
  13. 1
      modules/dnn/test/test_onnx_importer.cpp
  14. 115
      modules/imgproc/src/sumpixels.avx512_skx.cpp
  15. 2
      modules/imgproc/src/sumpixels.cpp
  16. 1
      samples/opencl/opencl-opencv-interop.cpp

@ -0,0 +1,122 @@
/**********************************************************************************
* Copyright (c) 2008-2009 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
**********************************************************************************/
#ifndef __OPENCL_CL_D3D11_EXT_H
#define __OPENCL_CL_D3D11_EXT_H
#include <d3d11.h>
#include <CL/cl.h>
#include <CL/cl_platform.h>
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************
* cl_nv_d3d11_sharing */
typedef cl_uint cl_d3d11_device_source_nv;
typedef cl_uint cl_d3d11_device_set_nv;
/******************************************************************************/
// Error Codes
#define CL_INVALID_D3D11_DEVICE_NV -1006
#define CL_INVALID_D3D11_RESOURCE_NV -1007
#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_NV -1008
#define CL_D3D11_RESOURCE_NOT_ACQUIRED_NV -1009
// cl_d3d11_device_source_nv
#define CL_D3D11_DEVICE_NV 0x4019
#define CL_D3D11_DXGI_ADAPTER_NV 0x401A
// cl_d3d11_device_set_nv
#define CL_PREFERRED_DEVICES_FOR_D3D11_NV 0x401B
#define CL_ALL_DEVICES_FOR_D3D11_NV 0x401C
// cl_context_info
#define CL_CONTEXT_D3D11_DEVICE_NV 0x401D
// cl_mem_info
#define CL_MEM_D3D11_RESOURCE_NV 0x401E
// cl_image_info
#define CL_IMAGE_D3D11_SUBRESOURCE_NV 0x401F
// cl_command_type
#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_NV 0x4020
#define CL_COMMAND_RELEASE_D3D11_OBJECTS_NV 0x4021
/******************************************************************************/
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11NV_fn)(
cl_platform_id platform,
cl_d3d11_device_source_nv d3d_device_source,
void * d3d_object,
cl_d3d11_device_set_nv d3d_device_set,
cl_uint num_entries,
cl_device_id * devices,
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferNV_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Buffer * resource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DNV_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Texture2D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DNV_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Texture3D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsNV_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsNV_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
#ifdef __cplusplus
}
#endif
#endif // __OPENCL_CL_D3D11_H

@ -108,6 +108,10 @@ if(POLICY CMP0067)
cmake_policy(SET CMP0067 NEW)
endif()
if(POLICY CMP0068)
cmake_policy(SET CMP0068 NEW) # CMake 3.9+: `RPATH` settings on macOS do not affect `install_name`.
endif()
include(cmake/OpenCVUtils.cmake)
ocv_cmake_reset_hooks()
ocv_check_environment_variables(OPENCV_CMAKE_HOOKS_DIR)
@ -368,6 +372,9 @@ OCV_OPTION(WITH_OPENCLAMDBLAS "Include AMD OpenCL BLAS library support" ON
OCV_OPTION(WITH_DIRECTX "Include DirectX support" ON
VISIBLE_IF WIN32 AND NOT WINRT
VERIFY HAVE_DIRECTX)
OCV_OPTION(WITH_OPENCL_D3D11_NV "Include NVIDIA OpenCL D3D11 support" WITH_DIRECTX
VISIBLE_IF WIN32 AND NOT WINRT
VERIFY HAVE_OPENCL_D3D11_NV)
OCV_OPTION(WITH_LIBREALSENSE "Include Intel librealsense support" OFF
VISIBLE_IF NOT WITH_INTELPERC
VERIFY HAVE_LIBREALSENSE)
@ -1570,6 +1577,7 @@ if(WITH_OPENCL OR HAVE_OPENCL)
IF HAVE_OPENCL_SVM THEN "SVM"
IF HAVE_CLAMDFFT THEN "AMDFFT"
IF HAVE_CLAMDBLAS THEN "AMDBLAS"
IF HAVE_OPENCL_D3D11_NV THEN "NVD3D11"
ELSE "no extra features")
status("")
status(" OpenCL:" HAVE_OPENCL THEN "YES (${opencl_features})" ELSE "NO")

@ -2,14 +2,19 @@ set(OPENCL_FOUND ON CACHE BOOL "OpenCL library is found")
if(APPLE)
set(OPENCL_LIBRARY "-framework OpenCL" CACHE STRING "OpenCL library")
set(OPENCL_INCLUDE_DIR "" CACHE PATH "OpenCL include directory")
else(APPLE)
else()
set(OPENCL_LIBRARY "" CACHE STRING "OpenCL library")
set(OPENCL_INCLUDE_DIR "${OpenCV_SOURCE_DIR}/3rdparty/include/opencl/1.2" CACHE PATH "OpenCL include directory")
ocv_install_3rdparty_licenses(opencl-headers "${OpenCV_SOURCE_DIR}/3rdparty/include/opencl/LICENSE.txt")
endif(APPLE)
endif()
mark_as_advanced(OPENCL_INCLUDE_DIR OPENCL_LIBRARY)
if(OPENCL_FOUND)
if(WITH_OPENCL_D3D11_NV AND EXISTS "${OPENCL_INCLUDE_DIR}/CL/cl_d3d11_ext.h")
set(HAVE_OPENCL_D3D11_NV ON)
endif()
if(OPENCL_LIBRARY)
set(HAVE_OPENCL_STATIC ON)
set(OPENCL_LIBRARIES "${OPENCL_LIBRARY}")

@ -100,6 +100,9 @@
#cmakedefine HAVE_OPENCL_STATIC
#cmakedefine HAVE_OPENCL_SVM
/* NVIDIA OpenCL D3D Extensions support */
#cmakedefine HAVE_OPENCL_D3D11_NV
/* OpenEXR codec */
#cmakedefine HAVE_OPENEXR

@ -96,6 +96,7 @@
#define clWaitForEvents clWaitForEvents_
#if defined __APPLE__
#define CL_SILENCE_DEPRECATION
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>

@ -48,13 +48,13 @@
#ifdef HAVE_DIRECTX
#include <vector>
# include "directx.inc.hpp"
#include "directx.inc.hpp"
#else // HAVE_DIRECTX
#define NO_DIRECTX_SUPPORT_ERROR CV_Error(cv::Error::StsBadFunc, "OpenCV was build without DirectX support")
#endif
#ifndef HAVE_OPENCL
# define NO_OPENCL_SUPPORT_ERROR CV_Error(cv::Error::StsBadFunc, "OpenCV was build without OpenCL support")
#define NO_OPENCL_SUPPORT_ERROR CV_Error(cv::Error::StsBadFunc, "OpenCV was build without OpenCL support")
#endif // HAVE_OPENCL
namespace cv { namespace directx {
@ -168,7 +168,7 @@ int getTypeFromDXGI_FORMAT(const int iDXGI_FORMAT)
//case DXGI_FORMAT_BC7_TYPELESS:
//case DXGI_FORMAT_BC7_UNORM:
//case DXGI_FORMAT_BC7_UNORM_SRGB:
#ifdef HAVE_DIRECTX_NV12
#ifdef HAVE_DIRECTX_NV12 //D3DX11 should support DXGI_FORMAT_NV12.
case DXGI_FORMAT_NV12: return CV_8UC3;
#endif
default: break;
@ -256,75 +256,70 @@ Context& initializeContextFromD3D11Device(ID3D11Device* pD3D11Device)
CV_Error(cv::Error::OpenCLInitError, "OpenCL: No available platforms");
std::vector<cl_platform_id> platforms(numPlatforms);
status = clGetPlatformIDs(numPlatforms, &platforms[0], NULL);
if (status != CL_SUCCESS)
CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't get number of platforms");
// TODO Filter platforms by name from OPENCV_OPENCL_DEVICE
size_t exts_len;
cv::AutoBuffer<char> extensions;
bool is_support_cl_khr_d3d11_sharing = false;
#ifdef HAVE_OPENCL_D3D11_NV
bool is_support_cl_nv_d3d11_sharing = false;
#endif
for (int i = 0; i < (int)numPlatforms; i++)
{
status = clGetPlatformIDs(numPlatforms, &platforms[i], NULL);
if (status != CL_SUCCESS)
CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't get number of platforms");
status = clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 0, NULL, &exts_len);
if (status != CL_SUCCESS)
CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't get length of CL_PLATFORM_EXTENSIONS");
extensions.resize(exts_len);
status = clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, exts_len, static_cast<void*>(extensions.data()), NULL);
if (status != CL_SUCCESS)
CV_Error(cv::Error::OpenCLInitError, "OpenCL: No available CL_PLATFORM_EXTENSIONS");
if (strstr(extensions.data(), "cl_khr_d3d11_sharing"))
is_support_cl_khr_d3d11_sharing = true;
#ifdef HAVE_OPENCL_D3D11_NV
if (strstr(extensions.data(), "cl_nv_d3d11_sharing"))
is_support_cl_nv_d3d11_sharing = true;
#endif
}
#ifdef HAVE_OPENCL_D3D11_NV
if (!is_support_cl_nv_d3d11_sharing && !is_support_cl_khr_d3d11_sharing)
CV_Error(cv::Error::OpenCLInitError, "OpenCL: No supported extensions");
#else
if (!is_support_cl_khr_d3d11_sharing)
CV_Error(cv::Error::OpenCLInitError, "OpenCL: No supported extensions");
#endif
int found = -1;
cl_device_id device = NULL;
cl_uint numDevices = 0;
cl_context context = NULL;
// try with CL_PREFERRED_DEVICES_FOR_D3D11_KHR
for (int i = 0; i < (int)numPlatforms; i++)
{
clGetDeviceIDsFromD3D11KHR_fn clGetDeviceIDsFromD3D11KHR = (clGetDeviceIDsFromD3D11KHR_fn)
clGetExtensionFunctionAddressForPlatform(platforms[i], "clGetDeviceIDsFromD3D11KHR");
if (!clGetDeviceIDsFromD3D11KHR)
continue;
device = NULL;
numDevices = 0;
status = clGetDeviceIDsFromD3D11KHR(platforms[i], CL_D3D11_DEVICE_KHR, pD3D11Device,
CL_PREFERRED_DEVICES_FOR_D3D11_KHR, 1, &device, &numDevices);
if (status != CL_SUCCESS)
continue;
if (numDevices > 0)
{
cl_context_properties properties[] = {
CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[i],
CL_CONTEXT_D3D11_DEVICE_KHR, (cl_context_properties)(pD3D11Device),
CL_CONTEXT_INTEROP_USER_SYNC, CL_FALSE,
NULL, NULL
};
context = clCreateContext(properties, 1, &device, NULL, NULL, &status);
if (status != CL_SUCCESS)
{
clReleaseDevice(device);
}
else
{
found = i;
break;
}
}
}
if (found < 0)
#ifdef HAVE_OPENCL_D3D11_NV
if (is_support_cl_nv_d3d11_sharing)
{
// try with CL_ALL_DEVICES_FOR_D3D11_KHR
// try with CL_PREFERRED_DEVICES_FOR_D3D11_NV
for (int i = 0; i < (int)numPlatforms; i++)
{
clGetDeviceIDsFromD3D11KHR_fn clGetDeviceIDsFromD3D11KHR = (clGetDeviceIDsFromD3D11KHR_fn)
clGetExtensionFunctionAddressForPlatform(platforms[i], "clGetDeviceIDsFromD3D11KHR");
if (!clGetDeviceIDsFromD3D11KHR)
clGetDeviceIDsFromD3D11NV_fn clGetDeviceIDsFromD3D11NV = (clGetDeviceIDsFromD3D11NV_fn)
clGetExtensionFunctionAddressForPlatform(platforms[i], "clGetDeviceIDsFromD3D11NV");
if (!clGetDeviceIDsFromD3D11NV)
continue;
device = NULL;
numDevices = 0;
status = clGetDeviceIDsFromD3D11KHR(platforms[i], CL_D3D11_DEVICE_KHR, pD3D11Device,
CL_ALL_DEVICES_FOR_D3D11_KHR, 1, &device, &numDevices);
status = clGetDeviceIDsFromD3D11NV(platforms[i], CL_D3D11_DEVICE_NV, pD3D11Device,
CL_PREFERRED_DEVICES_FOR_D3D11_NV, 1, &device, &numDevices);
if (status != CL_SUCCESS)
continue;
if (numDevices > 0)
{
cl_context_properties properties[] = {
CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[i],
CL_CONTEXT_D3D11_DEVICE_KHR, (cl_context_properties)(pD3D11Device),
CL_CONTEXT_INTEROP_USER_SYNC, CL_FALSE,
NULL, NULL
CL_CONTEXT_D3D11_DEVICE_NV, (cl_context_properties)(pD3D11Device),
//CL_CONTEXT_INTEROP_USER_SYNC, CL_FALSE,
0
};
context = clCreateContext(properties, 1, &device, NULL, NULL, &status);
if (status != CL_SUCCESS)
{
@ -338,9 +333,127 @@ Context& initializeContextFromD3D11Device(ID3D11Device* pD3D11Device)
}
}
if (found < 0)
CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't create context for DirectX interop");
{
// try with CL_ALL_DEVICES_FOR_D3D11_NV
for (int i = 0; i < (int)numPlatforms; i++)
{
clGetDeviceIDsFromD3D11NV_fn clGetDeviceIDsFromD3D11NV = (clGetDeviceIDsFromD3D11NV_fn)
clGetExtensionFunctionAddressForPlatform(platforms[i], "clGetDeviceIDsFromD3D11NV");
if (!clGetDeviceIDsFromD3D11NV)
continue;
device = NULL;
numDevices = 0;
status = clGetDeviceIDsFromD3D11NV(platforms[i], CL_D3D11_DEVICE_NV, pD3D11Device,
CL_ALL_DEVICES_FOR_D3D11_NV, 1, &device, &numDevices);
if (status != CL_SUCCESS)
continue;
if (numDevices > 0)
{
cl_context_properties properties[] = {
CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[i],
CL_CONTEXT_D3D11_DEVICE_NV, (cl_context_properties)(pD3D11Device),
//CL_CONTEXT_INTEROP_USER_SYNC, CL_FALSE,
0
};
context = clCreateContext(properties, 1, &device, NULL, NULL, &status);
if (status != CL_SUCCESS)
{
clReleaseDevice(device);
}
else
{
found = i;
break;
}
}
}
}
}
#endif
if (is_support_cl_khr_d3d11_sharing)
{
if (found < 0)
{
// try with CL_PREFERRED_DEVICES_FOR_D3D11_KHR
for (int i = 0; i < (int)numPlatforms; i++)
{
clGetDeviceIDsFromD3D11KHR_fn clGetDeviceIDsFromD3D11KHR = (clGetDeviceIDsFromD3D11KHR_fn)
clGetExtensionFunctionAddressForPlatform(platforms[i], "clGetDeviceIDsFromD3D11KHR");
if (!clGetDeviceIDsFromD3D11KHR)
continue;
device = NULL;
numDevices = 0;
status = clGetDeviceIDsFromD3D11KHR(platforms[i], CL_D3D11_DEVICE_KHR, pD3D11Device,
CL_PREFERRED_DEVICES_FOR_D3D11_KHR, 1, &device, &numDevices);
if (status != CL_SUCCESS)
continue;
if (numDevices > 0)
{
cl_context_properties properties[] = {
CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[i],
CL_CONTEXT_D3D11_DEVICE_KHR, (cl_context_properties)(pD3D11Device),
CL_CONTEXT_INTEROP_USER_SYNC, CL_FALSE,
NULL, NULL
};
context = clCreateContext(properties, 1, &device, NULL, NULL, &status);
if (status != CL_SUCCESS)
{
clReleaseDevice(device);
}
else
{
found = i;
break;
}
}
}
}
if (found < 0)
{
// try with CL_ALL_DEVICES_FOR_D3D11_KHR
for (int i = 0; i < (int)numPlatforms; i++)
{
clGetDeviceIDsFromD3D11KHR_fn clGetDeviceIDsFromD3D11KHR = (clGetDeviceIDsFromD3D11KHR_fn)
clGetExtensionFunctionAddressForPlatform(platforms[i], "clGetDeviceIDsFromD3D11KHR");
if (!clGetDeviceIDsFromD3D11KHR)
continue;
device = NULL;
numDevices = 0;
status = clGetDeviceIDsFromD3D11KHR(platforms[i], CL_D3D11_DEVICE_KHR, pD3D11Device,
CL_ALL_DEVICES_FOR_D3D11_KHR, 1, &device, &numDevices);
if (status != CL_SUCCESS)
continue;
if (numDevices > 0)
{
cl_context_properties properties[] = {
CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[i],
CL_CONTEXT_D3D11_DEVICE_KHR, (cl_context_properties)(pD3D11Device),
CL_CONTEXT_INTEROP_USER_SYNC, CL_FALSE,
NULL, NULL
};
context = clCreateContext(properties, 1, &device, NULL, NULL, &status);
if (status != CL_SUCCESS)
{
clReleaseDevice(device);
}
else
{
found = i;
break;
}
}
}
}
}
if (found < 0)
{
CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't create context for DirectX interop");
}
Context& ctx = Context::getDefault(false);
initializeContextFromHandle(ctx, platforms[found], context, device);
@ -679,29 +792,85 @@ Context& initializeContextFromDirect3DDevice9(IDirect3DDevice9* pDirect3DDevice9
} // namespace cv::ocl
#if defined(HAVE_DIRECTX) && defined(HAVE_OPENCL)
#ifdef HAVE_OPENCL_D3D11_NV
clCreateFromD3D11Texture2DNV_fn clCreateFromD3D11Texture2DNV = NULL;
clEnqueueAcquireD3D11ObjectsNV_fn clEnqueueAcquireD3D11ObjectsNV = NULL;
clEnqueueReleaseD3D11ObjectsNV_fn clEnqueueReleaseD3D11ObjectsNV = NULL;
#endif
clCreateFromD3D11Texture2DKHR_fn clCreateFromD3D11Texture2DKHR = NULL;
clEnqueueAcquireD3D11ObjectsKHR_fn clEnqueueAcquireD3D11ObjectsKHR = NULL;
clEnqueueReleaseD3D11ObjectsKHR_fn clEnqueueReleaseD3D11ObjectsKHR = NULL;
static void __OpenCLinitializeD3D11()
static bool __OpenCLinitializeD3D11()
{
using namespace cv::ocl;
static cl_platform_id initializedPlatform = NULL;
cl_platform_id platform = (cl_platform_id)Platform::getDefault().ptr();
if (initializedPlatform != platform)
bool useCLNVEXT = false;
size_t exts_len;
cl_int status = clGetPlatformInfo(platform, CL_PLATFORM_EXTENSIONS, 0, NULL, &exts_len);
if (status != CL_SUCCESS)
CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't get length of CL_PLATFORM_EXTENSIONS");
cv::AutoBuffer<char> extensions(exts_len);
status = clGetPlatformInfo(platform, CL_PLATFORM_EXTENSIONS, exts_len, static_cast<void*>(extensions.data()), NULL);
if (status != CL_SUCCESS)
CV_Error(cv::Error::OpenCLInitError, "OpenCL: No available CL_PLATFORM_EXTENSIONS");
bool is_support_cl_khr_d3d11_sharing = false;
if (strstr(extensions.data(), "cl_khr_d3d11_sharing"))
is_support_cl_khr_d3d11_sharing = true;
#ifdef HAVE_OPENCL_D3D11_NV
bool is_support_cl_nv_d3d11_sharing = false;
if (strstr(extensions.data(), "cl_nv_d3d11_sharing"))
is_support_cl_nv_d3d11_sharing = true;
if (!is_support_cl_nv_d3d11_sharing && !is_support_cl_khr_d3d11_sharing)
CV_Error(cv::Error::OpenCLInitError, "OpenCL: No supported extensions");
#else
if (!is_support_cl_khr_d3d11_sharing)
CV_Error(cv::Error::OpenCLInitError, "OpenCL: No supported extensions");
#endif
#ifdef HAVE_OPENCL_D3D11_NV
if (is_support_cl_nv_d3d11_sharing)
{
clCreateFromD3D11Texture2DKHR = (clCreateFromD3D11Texture2DKHR_fn)
clGetExtensionFunctionAddressForPlatform(platform, "clCreateFromD3D11Texture2DKHR");
clEnqueueAcquireD3D11ObjectsKHR = (clEnqueueAcquireD3D11ObjectsKHR_fn)
clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueAcquireD3D11ObjectsKHR");
clEnqueueReleaseD3D11ObjectsKHR = (clEnqueueReleaseD3D11ObjectsKHR_fn)
clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueReleaseD3D11ObjectsKHR");
initializedPlatform = platform;
if (initializedPlatform != platform)
{
clCreateFromD3D11Texture2DNV = (clCreateFromD3D11Texture2DNV_fn)
clGetExtensionFunctionAddressForPlatform(platform, "clCreateFromD3D11Texture2DNV");
clEnqueueAcquireD3D11ObjectsNV = (clEnqueueAcquireD3D11ObjectsNV_fn)
clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueAcquireD3D11ObjectsNV");
clEnqueueReleaseD3D11ObjectsNV = (clEnqueueReleaseD3D11ObjectsNV_fn)
clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueReleaseD3D11ObjectsNV");
initializedPlatform = platform;
}
if (clCreateFromD3D11Texture2DNV && clEnqueueAcquireD3D11ObjectsNV && clEnqueueReleaseD3D11ObjectsNV)
{
useCLNVEXT = true;
}
}
if (!clCreateFromD3D11Texture2DKHR || !clEnqueueAcquireD3D11ObjectsKHR || !clEnqueueReleaseD3D11ObjectsKHR)
else
#endif
{
CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't find functions for D3D11");
if (is_support_cl_khr_d3d11_sharing)
{
if (initializedPlatform != platform)
{
clCreateFromD3D11Texture2DKHR = (clCreateFromD3D11Texture2DKHR_fn)
clGetExtensionFunctionAddressForPlatform(platform, "clCreateFromD3D11Texture2DKHR");
clEnqueueAcquireD3D11ObjectsKHR = (clEnqueueAcquireD3D11ObjectsKHR_fn)
clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueAcquireD3D11ObjectsKHR");
clEnqueueReleaseD3D11ObjectsKHR = (clEnqueueReleaseD3D11ObjectsKHR_fn)
clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueReleaseD3D11ObjectsKHR");
initializedPlatform = platform;
}
if (!clCreateFromD3D11Texture2DKHR || !clEnqueueAcquireD3D11ObjectsKHR || !clEnqueueReleaseD3D11ObjectsKHR)
{
CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't find functions for D3D11");
}
}
}
return useCLNVEXT;
}
#endif // defined(HAVE_DIRECTX) && defined(HAVE_OPENCL)
@ -762,14 +931,9 @@ bool ocl_convert_bgr_to_nv12(
namespace directx {
void convertToD3D11Texture2D(InputArray src, ID3D11Texture2D* pD3D11Texture2D)
#if defined(HAVE_DIRECTX) && defined(HAVE_OPENCL)
static void __convertToD3D11Texture2DKHR(InputArray src, ID3D11Texture2D* pD3D11Texture2D)
{
CV_UNUSED(src); CV_UNUSED(pD3D11Texture2D);
#if !defined(HAVE_DIRECTX)
NO_DIRECTX_SUPPORT_ERROR;
#elif defined(HAVE_OPENCL)
__OpenCLinitializeD3D11();
D3D11_TEXTURE2D_DESC desc = { 0 };
pD3D11Texture2D->GetDesc(&desc);
@ -797,7 +961,6 @@ void convertToD3D11Texture2D(InputArray src, ID3D11Texture2D* pD3D11Texture2D)
#ifdef HAVE_DIRECTX_NV12
cl_mem clImageUV = 0;
#endif
clImage = clCreateFromD3D11Texture2DKHR(context, CL_MEM_WRITE_ONLY, pD3D11Texture2D, 0, &status);
if (status != CL_SUCCESS)
CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clCreateFromD3D11Texture2DKHR failed");
@ -863,22 +1026,108 @@ void convertToD3D11Texture2D(InputArray src, ID3D11Texture2D* pD3D11Texture2D)
CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clReleaseMem failed");
}
#endif
}
#endif
#else
// TODO memcpy
NO_OPENCL_SUPPORT_ERROR;
#if defined(HAVE_OPENCL_D3D11_NV)
static void __convertToD3D11Texture2DNV(InputArray src, ID3D11Texture2D* pD3D11Texture2D)
{
D3D11_TEXTURE2D_DESC desc = { 0 };
pD3D11Texture2D->GetDesc(&desc);
int srcType = src.type();
int textureType = getTypeFromDXGI_FORMAT(desc.Format);
CV_Assert(textureType == srcType);
Size srcSize = src.size();
CV_Assert(srcSize.width == (int)desc.Width && srcSize.height == (int)desc.Height);
UMat u = src.getUMat();
// TODO Add support for roi
CV_Assert(u.offset == 0);
CV_Assert(u.isContinuous());
cl_mem clBuffer = (cl_mem)u.handle(ACCESS_READ);
using namespace cv::ocl;
Context& ctx = Context::getDefault();
cl_context context = (cl_context)ctx.ptr();
cl_int status = 0;
cl_mem clImage = 0;
#ifdef HAVE_DIRECTX_NV12
cl_mem clImageUV = 0;
#endif
}
clImage = clCreateFromD3D11Texture2DNV(context, CL_MEM_WRITE_ONLY, pD3D11Texture2D, 0, &status);
if (status != CL_SUCCESS)
CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clCreateFromD3D11Texture2DNV failed");
#ifdef HAVE_DIRECTX_NV12
if (DXGI_FORMAT_NV12 == desc.Format)
{
clImageUV = clCreateFromD3D11Texture2DNV(context, CL_MEM_WRITE_ONLY, pD3D11Texture2D, 1, &status);
if (status != CL_SUCCESS)
CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clCreateFromD3D11Texture2DNV failed");
}
#endif
cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
status = clEnqueueAcquireD3D11ObjectsNV(q, 1, &clImage, 0, NULL, NULL);
if (status != CL_SUCCESS)
CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueAcquireD3D11ObjectsNV failed");
void convertFromD3D11Texture2D(ID3D11Texture2D* pD3D11Texture2D, OutputArray dst)
{
CV_UNUSED(pD3D11Texture2D); CV_UNUSED(dst);
#if !defined(HAVE_DIRECTX)
NO_DIRECTX_SUPPORT_ERROR;
#elif defined(HAVE_OPENCL)
__OpenCLinitializeD3D11();
#ifdef HAVE_DIRECTX_NV12
if(DXGI_FORMAT_NV12 == desc.Format)
{
status = clEnqueueAcquireD3D11ObjectsNV(q, 1, &clImageUV, 0, NULL, NULL);
if (status != CL_SUCCESS)
CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueAcquireD3D11ObjectsNV failed");
if(!ocl::ocl_convert_bgr_to_nv12(clBuffer, (int)u.step[0], u.cols, u.rows, clImage, clImageUV))
CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: ocl_convert_bgr_to_nv12 failed");
status = clEnqueueReleaseD3D11ObjectsNV(q, 1, &clImageUV, 0, NULL, NULL);
if (status != CL_SUCCESS)
CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueReleaseD3D11ObjectsNV failed");
}
else
#endif
{
size_t offset = 0; // TODO
size_t origin[3] = { 0, 0, 0 };
size_t region[3] = { (size_t)u.cols, (size_t)u.rows, 1 };
status = clEnqueueCopyBufferToImage(q, clBuffer, clImage, offset, origin, region, 0, NULL, NULL);
if (status != CL_SUCCESS)
CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueCopyBufferToImage failed");
}
status = clEnqueueReleaseD3D11ObjectsNV(q, 1, &clImage, 0, NULL, NULL);
if (status != CL_SUCCESS)
CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueReleaseD3D11ObjectsNV failed");
status = clFinish(q); // TODO Use events
if (status != CL_SUCCESS)
CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clFinish failed");
status = clReleaseMemObject(clImage); // TODO RAII
if (status != CL_SUCCESS)
CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clReleaseMem failed");
#ifdef HAVE_DIRECTX_NV12
if(DXGI_FORMAT_NV12 == desc.Format)
{
status = clReleaseMemObject(clImageUV);
if (status != CL_SUCCESS)
CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clReleaseMem failed");
}
#endif
}
#endif
#if defined(HAVE_DIRECTX) && defined(HAVE_OPENCL)
static void __convertFromD3D11Texture2DKHR(ID3D11Texture2D* pD3D11Texture2D, OutputArray dst)
{
D3D11_TEXTURE2D_DESC desc = { 0 };
pD3D11Texture2D->GetDesc(&desc);
@ -968,10 +1217,144 @@ void convertFromD3D11Texture2D(ID3D11Texture2D* pD3D11Texture2D, OutputArray dst
CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clReleaseMem failed");
}
#endif
}
#endif
#if defined(HAVE_OPENCL_D3D11_NV)
static void __convertFromD3D11Texture2DNV(ID3D11Texture2D* pD3D11Texture2D, OutputArray dst)
{
D3D11_TEXTURE2D_DESC desc = { 0 };
pD3D11Texture2D->GetDesc(&desc);
int textureType = getTypeFromDXGI_FORMAT(desc.Format);
CV_Assert(textureType >= 0);
// TODO Need to specify ACCESS_WRITE here somehow to prevent useless data copying!
dst.create(Size(desc.Width, desc.Height), textureType);
UMat u = dst.getUMat();
// TODO Add support for roi
CV_Assert(u.offset == 0);
CV_Assert(u.isContinuous());
cl_mem clBuffer = (cl_mem)u.handle(ACCESS_READ);
using namespace cv::ocl;
Context& ctx = Context::getDefault();
cl_context context = (cl_context)ctx.ptr();
cl_int status = 0;
cl_mem clImage = 0;
clImage = clCreateFromD3D11Texture2DNV(context, CL_MEM_READ_ONLY, pD3D11Texture2D, 0, &status);
if (status != CL_SUCCESS)
CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clCreateFromD3D11Texture2DNV failed");
#ifdef HAVE_DIRECTX_NV12
cl_mem clImageUV = 0;
if(DXGI_FORMAT_NV12 == desc.Format)
{
clImageUV = clCreateFromD3D11Texture2DNV(context, CL_MEM_READ_ONLY, pD3D11Texture2D, 1, &status);
if (status != CL_SUCCESS)
CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clCreateFromD3D11Texture2DNV failed");
}
#endif
cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
status = clEnqueueAcquireD3D11ObjectsNV(q, 1, &clImage, 0, NULL, NULL);
if (status != CL_SUCCESS)
CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueAcquireD3D11ObjectsNV failed");
#ifdef HAVE_DIRECTX_NV12
if (DXGI_FORMAT::DXGI_FORMAT_NV12 == desc.Format)
{
status = clEnqueueAcquireD3D11ObjectsNV(q, 1, &clImageUV, 0, NULL, NULL);
if (status != CL_SUCCESS)
CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueAcquireD3D11ObjectsNV failed");
if (!ocl::ocl_convert_nv12_to_bgr(clImage, clImageUV, clBuffer, (int)u.step[0], u.cols, u.rows))
CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: ocl_convert_nv12_to_bgr failed");
status = clEnqueueReleaseD3D11ObjectsNV(q, 1, &clImageUV, 0, NULL, NULL);
if (status != CL_SUCCESS)
CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueReleaseD3D11ObjectsNV failed");
}
else
#endif
{
size_t offset = 0; // TODO
size_t origin[3] = { 0, 0, 0 };
size_t region[3] = { (size_t)u.cols, (size_t)u.rows, 1 };
status = clEnqueueCopyImageToBuffer(q, clImage, clBuffer, origin, region, offset, 0, NULL, NULL);
if (status != CL_SUCCESS)
CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueCopyImageToBuffer failed");
}
status = clEnqueueReleaseD3D11ObjectsNV(q, 1, &clImage, 0, NULL, NULL);
if (status != CL_SUCCESS)
CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueReleaseD3D11ObjectsNV failed");
status = clFinish(q); // TODO Use events
if (status != CL_SUCCESS)
CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clFinish failed");
status = clReleaseMemObject(clImage); // TODO RAII
if (status != CL_SUCCESS)
CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clReleaseMem failed");
#ifdef HAVE_DIRECTX_NV12
if(DXGI_FORMAT_NV12 == desc.Format)
{
status = clReleaseMemObject(clImageUV);
if (status != CL_SUCCESS)
CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clReleaseMem failed");
}
#endif
}
#endif
void convertToD3D11Texture2D(InputArray src, ID3D11Texture2D* pD3D11Texture2D)
{
CV_UNUSED(src); CV_UNUSED(pD3D11Texture2D);
#if !defined(HAVE_DIRECTX)
NO_DIRECTX_SUPPORT_ERROR;
#elif !defined(HAVE_OPENCL)
NO_OPENCL_SUPPORT_ERROR;
#else
// TODO memcpy
bool useCLNVEXT = __OpenCLinitializeD3D11();
if(!useCLNVEXT){
__convertToD3D11Texture2DKHR(src,pD3D11Texture2D);
}
#ifdef HAVE_OPENCL_D3D11_NV
else
{
__convertToD3D11Texture2DNV(src,pD3D11Texture2D);
}
#endif
#endif
}
void convertFromD3D11Texture2D(ID3D11Texture2D* pD3D11Texture2D, OutputArray dst)
{
CV_UNUSED(pD3D11Texture2D); CV_UNUSED(dst);
#if !defined(HAVE_DIRECTX)
NO_DIRECTX_SUPPORT_ERROR;
#elif !defined(HAVE_OPENCL)
NO_OPENCL_SUPPORT_ERROR;
#else
bool useCLNVEXT = __OpenCLinitializeD3D11();
if(!useCLNVEXT){
__convertFromD3D11Texture2DKHR(pD3D11Texture2D,dst);
}
#ifdef HAVE_OPENCL_D3D11_NV
else
{
__convertFromD3D11Texture2DNV(pD3D11Texture2D,dst);
}
#endif
#endif
}

@ -48,6 +48,9 @@
#include "opencv2/core/opencl/runtime/opencl_core.hpp"
#include <CL/cl_d3d11.h>
#ifdef HAVE_OPENCL_D3D11_NV
#include <CL/cl_d3d11_ext.h>
#endif
#include <CL/cl_d3d10.h>
#include <CL/cl_dx9_media_sharing.h>
#endif // HAVE_OPENCL

@ -5,6 +5,7 @@
@CL_REMAP_ORIGIN@
#if defined __APPLE__
#define CL_SILENCE_DEPRECATION
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>

@ -47,6 +47,7 @@
#if defined(HAVE_OPENCL_STATIC)
#if defined __APPLE__
#define CL_SILENCE_DEPRECATION
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>

@ -92,7 +92,7 @@ public:
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
return (bias == 1) && (preferableTarget != DNN_TARGET_MYRIAD || type == SPATIAL_NRM);
return bias == 1;
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_HALIDE ||
(backendId == DNN_BACKEND_VKCOM && haveVulkan() && (size % 2 == 1) && (type == CHANNEL_NRM));

@ -591,6 +591,37 @@ void ONNXImporter::populateNet(Net dstNet)
}
layerParams.set("num_output", layerParams.blobs[0].size[1] * layerParams.get<int>("group", 1));
layerParams.set("bias_term", node_proto.input_size() == 3);
if (layerParams.has("output_shape"))
{
const DictValue& outShape = layerParams.get("output_shape");
if (outShape.size() != 4)
CV_Error(Error::StsNotImplemented, "Output shape must have 4 elements.");
const int strideY = layerParams.get<int>("stride_h", 1);
const int strideX = layerParams.get<int>("stride_w", 1);
const int outH = outShape.getIntValue(2);
const int outW = outShape.getIntValue(3);
if (layerParams.get<String>("pad_mode") == "SAME")
{
layerParams.set("adj_w", (outW - 1) % strideX);
layerParams.set("adj_h", (outH - 1) % strideY);
}
else if (layerParams.get<String>("pad_mode") == "VALID")
{
if (!layerParams.has("kernel_h") || !layerParams.has("kernel_w"))
CV_Error(Error::StsNotImplemented,
"Required attributes 'kernel_h' and 'kernel_w' are not present.");
int kernelH = layerParams.get<int>("kernel_h");
int kernelW = layerParams.get<int>("kernel_w");
layerParams.set("adj_w", (outW - kernelW) % strideX);
layerParams.set("adj_h", (outH - kernelH) % strideY);
}
}
}
else if (layer_type == "Transpose")
{

@ -228,6 +228,10 @@ TEST_P(LRN, Accuracy)
Backend backendId = get<0>(get<5>(GetParam()));
Target targetId = get<1>(get<5>(GetParam()));
if ((inSize.width == 5 || inSize.height == 5) && targetId == DNN_TARGET_MYRIAD &&
nrmType == "ACROSS_CHANNELS")
throw SkipTestException("This test case is disabled");
LayerParams lp;
lp.set("norm_region", nrmType);
lp.set("local_size", localSize);

@ -73,6 +73,7 @@ TEST_P(Test_ONNX_layers, Deconvolution)
testONNXModels("deconvolution");
testONNXModels("two_deconvolution");
testONNXModels("deconvolution_group");
testONNXModels("deconvolution_output_shape");
}
TEST_P(Test_ONNX_layers, Dropout)

@ -13,7 +13,6 @@ namespace { // Anonymous namespace to avoid exposing the implementation classes
// NOTE: Look at the bottom of the file for the entry-point function for external callers
//
// TODO: Add support for 1 channel input (WIP: currently hitting hardware glassjaw)
template<size_t num_channels> class IntegralCalculator;
template<size_t num_channels>
@ -191,51 +190,55 @@ public:
}
// The calculate_integral function referenced here must be implemented in the templated derivatives
// because the algorithm depends heavily on the number of channels in the image
// This is the incomplete definition (just the prototype) here.
//
static CV_ALWAYS_INLINE
__m512d calculate_integral(__m512i src_longs, const __m512d above_values, __m512i &accumulator);
static CV_ALWAYS_INLINE
__m512i read_64_bytes(const __m512i *srcs, __mmask64 data_mask) {
__m512i read_64_bytes(const __m512i *srcs, const __mmask64 data_mask) {
return _mm512_maskz_loadu_epi8(data_mask, srcs);
}
static CV_ALWAYS_INLINE
__m128i extract_lower_16bytes(__m512i src_64byte_chunk) {
__m128i extract_lower_16bytes(const __m512i src_64byte_chunk) {
return _mm512_extracti64x2_epi64(src_64byte_chunk, 0x0);
}
static CV_ALWAYS_INLINE
__m512i convert_lower_8bytes_to_longs(__m128i src_16bytes) {
__m512i convert_lower_8bytes_to_longs(const __m128i src_16bytes) {
return _mm512_cvtepu8_epi64(src_16bytes);
}
static CV_ALWAYS_INLINE
__m512i square_m512(__m512i src_longs) {
__m512i square_m512(const __m512i src_longs) {
return _mm512_mullo_epi64(src_longs, src_longs);
}
static CV_ALWAYS_INLINE
__m128i shift_right_8_bytes(__m128i src_16bytes) {
__m128i shift_right_8_bytes(const __m128i src_16bytes) {
return _mm_maskz_compress_epi64(2, src_16bytes);
}
static CV_ALWAYS_INLINE
__m512i shift_right_16_bytes(__m512i src_64byte_chunk) {
__m512i shift_right_16_bytes(const __m512i src_64byte_chunk) {
return _mm512_maskz_compress_epi64(0xFC, src_64byte_chunk);
}
static CV_ALWAYS_INLINE
__m512i m512_hadd(const __m512i a){
return _mm512_add_epi64(_mm512_maskz_compress_epi64(0xAA, a), _mm512_maskz_compress_epi64(0x55, a));
}
// The calculate_integral function referenced here must be implemented in the templated derivatives
// because the algorithm depends heavily on the number of channels in the image
// This is the incomplete definition (just the prototype) here.
//
static CV_ALWAYS_INLINE
__m512d calculate_integral(const __m512i src_longs, const __m512d above_values, __m512i &accumulator);
};
@ -246,7 +249,7 @@ public:
//
// The function prototype that needs to be implemented is:
//
// __m512d calculate_integral(__m512i src_longs, const __m512d above_values, __m512i &accumulator){ ... }
// __m512d calculate_integral(const __m512i src_longs, const __m512d above_values, __m512i &accumulator){ ... }
//
// Description of parameters:
// INPUTS:
@ -265,12 +268,72 @@ public:
// Below here is the channel specific implementation
//
//========================================
// 1 Channel Integral Implementation
//========================================
template<>
CV_ALWAYS_INLINE
__m512d IntegralCalculator < 1 > ::calculate_integral(const __m512i src_longs, const __m512d above_values, __m512i &accumulator)
{
// One channel support is implemented differently than 2, 3, or 4 channel
// One channel support has more horizontal operations that cannot be made vertical without losing performance
// The logical operations needed look like:
// Vertical LANES : |7|6|5|4|3|2|1|0|
// src_longs : |H|G|F|E|D|C|B|A|
// shift_by_1 : + |G|F|E|D|C|B|A| |
// shift_by_2 : + |F|E|D|C|B|A| | |
// shift_by_3 : + |E|D|C|B|A| | | |
// shift_by_4 : + |D|C|B|A| | | | |
// shift_by_5 : + |C|B|A| | | | | |
// shift_by_6 : + |B|A| | | | | | |
// shift_by_7 : + |A| | | | | | | |
// carry_over_idxs : + |7|7|7|7|7|7|7|7| (index position of result from previous iteration)
// = integral
//
// If we do this vertically we end up losing performance because of the number of operations. We will instead
// do a horizontal add tree to create the vertical sections we need as a tree
// Vertical Lanes: | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 |
// src_longs: | H | G | F | E | D | C | B | A |
// horiz_sum_1: | | | | | G+H | E+F | C+D | A+B |
// horiz_sum_2: | | | | | | | EFGH | ABCD |
//
const __m512i horiz_sum_1 = m512_hadd(src_longs); // indexes for the permutes below (3,2,1,0) = (GH, EF, CD, AB)
const __m512i horiz_sum_2 = m512_hadd(horiz_sum_1); // indexes for the permutes below (9, 8) = (EFGH, ABCD)
// Then we can use the partial sums by looking at the vertical stacks above and realize that, for example
// ABCD appears vertically in lanes 7, 6, 5, 4, and 3 so we will permute the values so that all partial products
// appear in the right lanes. and sum them up along with the carry over value from the accumulator. So we setup
// the lanes like:
// Vertical Lanes: | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 |
// s1 : | 0 | G | 0 | E | 0 | C | 0 | A |
// s2 : | ABCD | ABCD | ABCD | ABCD | ABCD | AB | AB | 0 |
// s3 : | EFGH | EF | EF | 0 | 0 | 0 | 0 | 0 |
// +------+------+------+------+------+------+------+------+
// sum : | A..H | A..G | A..F | A..E | A..D | A..C | A..B | A | Integral :-)
//
const __m512i s1 = _mm512_maskz_mov_epi64(0x55, src_longs); // 0 G 0 E 0 D 0 C 0 A
const __m512i s2 = _mm512_permutex2var_epi64(horiz_sum_1, _mm512_set_epi64(8,8,8,8,8,0,0,4), horiz_sum_2);
const __m512i s3 = _mm512_permutex2var_epi64(horiz_sum_1, _mm512_set_epi64(9,2,2,4,4,4,4,4), horiz_sum_2);
// Now we use the rolling sum from the previous iteration from accumulator and replicate it into carry_over
// And sum everything up into the accumulator
//
const __m512i carry_over = _mm512_permutex2var_epi64(accumulator, _mm512_set_epi64(7,7,7,7,7,7,7,7), accumulator);
accumulator = _mm512_add_epi64(_mm512_add_epi64(s2, s3), _mm512_add_epi64(carry_over, s1));
// Convert to double precision and store
//
__m512d integral_pd = _mm512_add_pd(_mm512_cvtepu64_pd(accumulator), above_values);
return integral_pd;
}
//========================================
// 2 Channel Integral Implementation
//========================================
template<>
CV_ALWAYS_INLINE
__m512d IntegralCalculator < 2 > ::calculate_integral(__m512i src_longs, const __m512d above_values, __m512i &accumulator)
__m512d IntegralCalculator < 2 > ::calculate_integral(const __m512i src_longs, const __m512d above_values, __m512i &accumulator)
{
__m512i carryover_idxs = _mm512_set_epi64(7, 6, 7, 6, 7, 6, 7, 6);
@ -300,12 +363,13 @@ __m512d IntegralCalculator < 2 > ::calculate_integral(__m512i src_longs, const _
return integral_pd;
}
//========================================
// 3 Channel Integral Implementation
//========================================
template<>
CV_ALWAYS_INLINE
__m512d IntegralCalculator < 3 > ::calculate_integral(__m512i src_longs, const __m512d above_values, __m512i &accumulator)
__m512d IntegralCalculator < 3 > ::calculate_integral(const __m512i src_longs, const __m512d above_values, __m512i &accumulator)
{
__m512i carryover_idxs = _mm512_set_epi64(6, 5, 7, 6, 5, 7, 6, 5);
@ -338,7 +402,7 @@ __m512d IntegralCalculator < 3 > ::calculate_integral(__m512i src_longs, const _
//========================================
template<>
CV_ALWAYS_INLINE
__m512d IntegralCalculator < 4 > ::calculate_integral(__m512i src_longs, const __m512d above_values, __m512i &accumulator)
__m512d IntegralCalculator < 4 > ::calculate_integral(const __m512i src_longs, const __m512d above_values, __m512i &accumulator)
{
__m512i carryover_idxs = _mm512_set_epi64(7, 6, 5, 4, 7, 6, 5, 4);
@ -376,18 +440,23 @@ void calculate_integral_avx512(const uchar *src, size_t _srcstep,
int width, int height, int cn)
{
switch(cn){
case 1: {
IntegralCalculator< 1 > calculator;
calculator.calculate_integral_avx512(src, _srcstep, sum, _sumstep, sqsum, _sqsumstep, width, height);
break;
}
case 2: {
IntegralCalculator<2> calculator;
IntegralCalculator< 2 > calculator;
calculator.calculate_integral_avx512(src, _srcstep, sum, _sumstep, sqsum, _sqsumstep, width, height);
break;
}
case 3: {
IntegralCalculator<3> calculator;
IntegralCalculator< 3 > calculator;
calculator.calculate_integral_avx512(src, _srcstep, sum, _sumstep, sqsum, _sqsumstep, width, height);
break;
}
case 4: {
IntegralCalculator<4> calculator;
IntegralCalculator< 4 > calculator;
calculator.calculate_integral_avx512(src, _srcstep, sum, _sumstep, sqsum, _sqsumstep, width, height);
}
}

@ -77,7 +77,7 @@ struct Integral_SIMD<uchar, double, double> {
#if CV_TRY_AVX512_SKX
CV_UNUSED(_tiltedstep);
// TODO: Add support for 1 channel input (WIP)
if (CV_CPU_HAS_SUPPORT_AVX512_SKX && !tilted && ((cn >= 2) && (cn <= 4))){
if (CV_CPU_HAS_SUPPORT_AVX512_SKX && !tilted && (cn <= 4)){
opt_AVX512_SKX::calculate_integral_avx512(src, _srcstep, sum, _sumstep,
sqsum, _sqsumstep, width, height, cn);
return true;

@ -19,6 +19,7 @@
#define CL_USE_DEPRECATED_OPENCL_2_0_APIS // eliminate build warning
#ifdef __APPLE__
#define CL_SILENCE_DEPRECATION
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>

Loading…
Cancel
Save