diff --git a/modules/core/include/opencv2/core/ocl.hpp b/modules/core/include/opencv2/core/ocl.hpp index 2cd5cac030..415ec2be79 100644 --- a/modules/core/include/opencv2/core/ocl.hpp +++ b/modules/core/include/opencv2/core/ocl.hpp @@ -665,6 +665,7 @@ CV_EXPORTS const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf) CV_EXPORTS const char* typeToStr(int t); CV_EXPORTS const char* memopTypeToStr(int t); CV_EXPORTS const char* vecopTypeToStr(int t); +CV_EXPORTS const char* getOpenCLErrorString(int errorCode); CV_EXPORTS String kernelToStr(InputArray _kernel, int ddepth = -1, const char * name = NULL); CV_EXPORTS void getPlatfomsInfo(std::vector& platform_info); @@ -731,6 +732,21 @@ protected: Impl* p; }; +class CV_EXPORTS Timer +{ +public: + Timer(const Queue& q); + ~Timer(); + void start(); + void stop(); + float milliSeconds(); + float microSeconds(); + float seconds(); + +protected: + struct Impl; + Impl* p; +}; CV_EXPORTS MatAllocator* getOpenCLAllocator(); diff --git a/modules/core/include/opencv2/core/utils/configuration.private.hpp b/modules/core/include/opencv2/core/utils/configuration.private.hpp new file mode 100644 index 0000000000..fa1b045178 --- /dev/null +++ b/modules/core/include/opencv2/core/utils/configuration.private.hpp @@ -0,0 +1,16 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#ifndef OPENCV_CONFIGURATION_PRIVATE_HPP +#define OPENCV_CONFIGURATION_PRIVATE_HPP + +namespace cv { namespace utils { + +CV_EXPORTS bool getConfigurationParameterBool(const char* name, bool defaultValue); +CV_EXPORTS size_t getConfigurationParameterSizeT(const char* name, size_t defaultValue); +CV_EXPORTS cv::String getConfigurationParameterString(const char* name, const char* defaultValue); + +}} // namespace + +#endif // OPENCV_CONFIGURATION_PRIVATE_HPP diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp index 8c24a04aea..60dca8bead 100644 --- a/modules/core/src/ocl.cpp +++ b/modules/core/src/ocl.cpp @@ -51,7 +51,10 @@ #include #endif +#include + #include "opencv2/core/ocl_genbase.hpp" +#include "opencl_kernels_core.hpp" #define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG 0 #define CV_OPENCL_SHOW_RUN_ERRORS 0 @@ -4718,6 +4721,102 @@ const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf) return buf; } +const char* getOpenCLErrorString(int errorCode) +{ + switch (errorCode) + { + case 0: return "CL_SUCCESS"; + case -1: return "CL_DEVICE_NOT_FOUND"; + case -2: return "CL_DEVICE_NOT_AVAILABLE"; + case -3: return "CL_COMPILER_NOT_AVAILABLE"; + case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + case -5: return "CL_OUT_OF_RESOURCES"; + case -6: return "CL_OUT_OF_HOST_MEMORY"; + case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE"; + case -8: return "CL_MEM_COPY_OVERLAP"; + case -9: return "CL_IMAGE_FORMAT_MISMATCH"; + case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; + case -11: return "CL_BUILD_PROGRAM_FAILURE"; + case -12: return "CL_MAP_FAILURE"; + case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; + case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; + case -15: return "CL_COMPILE_PROGRAM_FAILURE"; + case -16: return "CL_LINKER_NOT_AVAILABLE"; + case -17: return "CL_LINK_PROGRAM_FAILURE"; + case -18: return "CL_DEVICE_PARTITION_FAILED"; + case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; + case -30: return "CL_INVALID_VALUE"; + 
case -31: return "CL_INVALID_DEVICE_TYPE"; + case -32: return "CL_INVALID_PLATFORM"; + case -33: return "CL_INVALID_DEVICE"; + case -34: return "CL_INVALID_CONTEXT"; + case -35: return "CL_INVALID_QUEUE_PROPERTIES"; + case -36: return "CL_INVALID_COMMAND_QUEUE"; + case -37: return "CL_INVALID_HOST_PTR"; + case -38: return "CL_INVALID_MEM_OBJECT"; + case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; + case -40: return "CL_INVALID_IMAGE_SIZE"; + case -41: return "CL_INVALID_SAMPLER"; + case -42: return "CL_INVALID_BINARY"; + case -43: return "CL_INVALID_BUILD_OPTIONS"; + case -44: return "CL_INVALID_PROGRAM"; + case -45: return "CL_INVALID_PROGRAM_EXECUTABLE"; + case -46: return "CL_INVALID_KERNEL_NAME"; + case -47: return "CL_INVALID_KERNEL_DEFINITION"; + case -48: return "CL_INVALID_KERNEL"; + case -49: return "CL_INVALID_ARG_INDEX"; + case -50: return "CL_INVALID_ARG_VALUE"; + case -51: return "CL_INVALID_ARG_SIZE"; + case -52: return "CL_INVALID_KERNEL_ARGS"; + case -53: return "CL_INVALID_WORK_DIMENSION"; + case -54: return "CL_INVALID_WORK_GROUP_SIZE"; + case -55: return "CL_INVALID_WORK_ITEM_SIZE"; + case -56: return "CL_INVALID_GLOBAL_OFFSET"; + case -57: return "CL_INVALID_EVENT_WAIT_LIST"; + case -58: return "CL_INVALID_EVENT"; + case -59: return "CL_INVALID_OPERATION"; + case -60: return "CL_INVALID_GL_OBJECT"; + case -61: return "CL_INVALID_BUFFER_SIZE"; + case -62: return "CL_INVALID_MIP_LEVEL"; + case -63: return "CL_INVALID_GLOBAL_WORK_SIZE"; + case -64: return "CL_INVALID_PROPERTY"; + case -65: return "CL_INVALID_IMAGE_DESCRIPTOR"; + case -66: return "CL_INVALID_COMPILER_OPTIONS"; + case -67: return "CL_INVALID_LINKER_OPTIONS"; + case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT"; + case -69: return "CL_INVALID_PIPE_SIZE"; + case -70: return "CL_INVALID_DEVICE_QUEUE"; + case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR"; + case -1001: return "CL_PLATFORM_NOT_FOUND_KHR"; + case -1002: return "CL_INVALID_D3D10_DEVICE_KHR"; + case -1003: 
return "CL_INVALID_D3D10_RESOURCE_KHR"; + case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR"; + case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR"; + case -1024: return "clBLAS: Functionality is not implemented"; + case -1023: return "clBLAS: Library is not initialized yet"; + case -1022: return "clBLAS: Matrix A is not a valid memory object"; + case -1021: return "clBLAS: Matrix B is not a valid memory object"; + case -1020: return "clBLAS: Matrix C is not a valid memory object"; + case -1019: return "clBLAS: Vector X is not a valid memory object"; + case -1018: return "clBLAS: Vector Y is not a valid memory object"; + case -1017: return "clBLAS: An input dimension (M:N:K) is invalid"; + case -1016: return "clBLAS: Leading dimension A must not be less than the " + "size of the first dimension"; + case -1015: return "clBLAS: Leading dimension B must not be less than the " + "size of the second dimension"; + case -1014: return "clBLAS: Leading dimension C must not be less than the " + "size of the third dimension"; + case -1013: return "clBLAS: The increment for a vector X must not be 0"; + case -1012: return "clBLAS: The increment for a vector Y must not be 0"; + case -1011: return "clBLAS: The memory object for Matrix A is too small"; + case -1010: return "clBLAS: The memory object for Matrix B is too small"; + case -1009: return "clBLAS: The memory object for Matrix C is too small"; + case -1008: return "clBLAS: The memory object for Vector X is too small"; + case -1007: return "clBLAS: The memory object for Vector Y is too small"; + default: return "Unknown OpenCL error"; + } +} + template static std::string kerToStr(const Mat & k) { @@ -5134,4 +5233,175 @@ bool internal::isCLBuffer(UMat& u) return true; } +struct Timer::Impl +{ + const Queue queue; + + Impl(const Queue& q) + : queue(q) + , initted_(false) + , running_(false) + , has_run_at_least_once_(false) + { + init(); + } + + ~Impl() + { + clWaitForEvents(1, &start_gpu_cl_); + 
clWaitForEvents(1, &stop_gpu_cl_); + clReleaseEvent(start_gpu_cl_); + clReleaseEvent(stop_gpu_cl_); + } + + void start() + { +#ifdef HAVE_OPENCL + if (!running()) + { + clWaitForEvents(1, &start_gpu_cl_); + clReleaseEvent(start_gpu_cl_); + ocl::Kernel kernel("null_kernel_float", ocl::core::benchmark_oclsrc); + float arg = 0; + clSetKernelArg((cl_kernel)kernel.ptr(), 0, sizeof(arg), &arg); + clEnqueueTask((cl_command_queue)queue.ptr(), (cl_kernel)kernel.ptr(), 0, + NULL, &start_gpu_cl_); + clFinish((cl_command_queue)queue.ptr()); + running_ = true; + has_run_at_least_once_ = true; + } +#endif + } + + void stop() + { +#ifdef HAVE_OPENCL + if (running()) + { + clWaitForEvents(1, &stop_gpu_cl_); + clReleaseEvent(stop_gpu_cl_); + ocl::Kernel kernel("null_kernel_float", ocl::core::benchmark_oclsrc); + float arg = 0; + clSetKernelArg((cl_kernel)kernel.ptr(), 0, sizeof(arg), &arg); + clEnqueueTask((cl_command_queue)queue.ptr(), (cl_kernel)kernel.ptr(), 0, + NULL, &stop_gpu_cl_); + clFinish((cl_command_queue)queue.ptr()); + running_ = false; + } +#endif + } + + float microSeconds() + { +#ifdef HAVE_OPENCL + if (!has_run_at_least_once()) + { + return 0; + } + if (running()) + { + stop(); + } + cl_ulong startTime, stopTime; + clWaitForEvents(1, &stop_gpu_cl_); + clGetEventProfilingInfo(start_gpu_cl_, CL_PROFILING_COMMAND_END, + sizeof startTime, &startTime, NULL); + clGetEventProfilingInfo(stop_gpu_cl_, CL_PROFILING_COMMAND_START, + sizeof stopTime, &stopTime, NULL); + double us = static_cast(stopTime - startTime) / 1000.0; + elapsed_microseconds_ = static_cast(us); + return elapsed_microseconds_; +#else + return 0; +#endif + } + + float milliSeconds() + { +#ifdef HAVE_OPENCL + if (!has_run_at_least_once()) + { + return 0; + } + if (running()) + { + stop(); + } + cl_ulong startTime = 0, stopTime = 0; + clGetEventProfilingInfo(start_gpu_cl_, CL_PROFILING_COMMAND_END, + sizeof startTime, &startTime, NULL); + clGetEventProfilingInfo(stop_gpu_cl_, CL_PROFILING_COMMAND_START, + 
sizeof stopTime, &stopTime, NULL); + double ms = static_cast(stopTime - startTime) / 1000000.0; + elapsed_milliseconds_ = static_cast(ms); + return elapsed_milliseconds_; +#else + return 0; +#endif + } + + float seconds() + { + return milliSeconds() / 1000.f; + } + + void init() + { + CV_Assert(queue.getImpl() && queue.getImpl()->isProfilingQueue_); + if (!initted()) + { + start_gpu_cl_ = 0; + stop_gpu_cl_ = 0; + initted_ = true; + } + } + + inline bool initted() { return initted_; } + inline bool running() { return running_; } + inline bool has_run_at_least_once() { return has_run_at_least_once_; } + + bool initted_; + bool running_; + bool has_run_at_least_once_; + float elapsed_milliseconds_; + float elapsed_microseconds_; + cl_event start_gpu_cl_; + cl_event stop_gpu_cl_; +}; + +Timer::Timer(const Queue& q) +{ + p = new Impl(q); +} + +Timer::~Timer() +{ + if(p) + { + delete p; + p = 0; + } +} + +void Timer::start() +{ + if(p) + p->start(); +} + +void Timer::stop() +{ + if(p) + p->stop(); +} + +float Timer::microSeconds() +{ return p ? p->microSeconds() : 0; } + +float Timer::milliSeconds() +{ return p ? p->milliSeconds() : 0; } + +float Timer::seconds() +{ return p ? p->seconds() : 0; } + }} diff --git a/modules/core/src/opencl/benchmark.cl b/modules/core/src/opencl/benchmark.cl new file mode 100644 index 0000000000..22acb93afd --- /dev/null +++ b/modules/core/src/opencl/benchmark.cl @@ -0,0 +1,45 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. 
+// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +__kernel void null_kernel_float(float arg) { + float out = arg; +} diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp index d38e20d34c..bb4ae6cf18 100644 --- a/modules/core/src/precomp.hpp +++ b/modules/core/src/precomp.hpp @@ -297,12 +297,6 @@ TLSData& getCoreTlsData(); #define CL_RUNTIME_EXPORT #endif -namespace utils { -bool getConfigurationParameterBool(const char* name, bool defaultValue); -size_t getConfigurationParameterSizeT(const char* name, size_t defaultValue); -cv::String getConfigurationParameterString(const char* name, const char* defaultValue); -} - extern bool __termination; // skip some cleanups, because process is terminating // (for example, if ExitProcess() was already called) diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index 2ec150ecfd..0a4110ec12 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -44,6 +44,7 @@ #include "precomp.hpp" #include +#include #include namespace cv { diff --git a/modules/core/src/trace.cpp b/modules/core/src/trace.cpp index d9153642d7..230510625e 100644 --- a/modules/core/src/trace.cpp +++ b/modules/core/src/trace.cpp @@ -6,6 +6,7 @@ #include #include +#include #include // va_start diff --git a/modules/core/src/umatrix.cpp b/modules/core/src/umatrix.cpp index 1be5fc6123..84cebdba09 100644 --- a/modules/core/src/umatrix.cpp +++ b/modules/core/src/umatrix.cpp @@ -267,19 +267,22 @@ UMat Mat::getUMat(int accessFlags, UMatUsageFlags usageFlags) const UMat hdr; if(!data) return hdr; - Size wholeSize; - Point ofs; - locateROI(wholeSize, ofs); - Size sz(cols, rows); - if (ofs.x != 0 || ofs.y != 0) + if (data != datastart) { - Mat src = *this; - int dtop = ofs.y; - int dbottom = wholeSize.height - src.rows - ofs.y; - int dleft = ofs.x; - int dright = wholeSize.width - src.cols - ofs.x; - src.adjustROI(dtop, dbottom, dleft, dright); - return src.getUMat(accessFlags, usageFlags)(cv::Rect(ofs.x, ofs.y, sz.width, sz.height)); + Size 
wholeSize; + Point ofs; + locateROI(wholeSize, ofs); + Size sz(cols, rows); + if (ofs.x != 0 || ofs.y != 0) + { + Mat src = *this; + int dtop = ofs.y; + int dbottom = wholeSize.height - src.rows - ofs.y; + int dleft = ofs.x; + int dright = wholeSize.width - src.cols - ofs.x; + src.adjustROI(dtop, dbottom, dleft, dright); + return src.getUMat(accessFlags, usageFlags)(cv::Rect(ofs.x, ofs.y, sz.width, sz.height)); + } } CV_Assert(data == datastart); diff --git a/modules/dnn/CMakeLists.txt b/modules/dnn/CMakeLists.txt index d0bc3324d2..866f544e8c 100644 --- a/modules/dnn/CMakeLists.txt +++ b/modules/dnn/CMakeLists.txt @@ -21,6 +21,8 @@ ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-shadow -Wno-parentheses -Wmaybe-uninit ) ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4701 /wd4100) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src/ocl4dnn/include ${OPENCL_INCLUDE_DIRS}) + if(MSVC) add_definitions( -D_CRT_SECURE_NO_WARNINGS=1 ) ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4244 /wd4267 /wd4018 /wd4355 /wd4800 /wd4251 /wd4996 /wd4146 diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp index bd796691cc..cb015d4eba 100644 --- a/modules/dnn/include/opencv2/dnn/dnn.hpp +++ b/modules/dnn/include/opencv2/dnn/dnn.hpp @@ -297,6 +297,7 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN CV_PROP String name; //!< Name of the layer instance, can be used for logging or other internal purposes. CV_PROP String type; //!< Type name which was used for creating layer by layer factory. + CV_PROP int preferableTarget; //!< prefer target for layer forwarding Layer(); explicit Layer(const LayerParams ¶ms); //!< Initializes only #name, #type and #blobs fields. 
diff --git a/modules/dnn/perf/opencl/perf_convolution.cpp b/modules/dnn/perf/opencl/perf_convolution.cpp new file mode 100644 index 0000000000..362057919a --- /dev/null +++ b/modules/dnn/perf/opencl/perf_convolution.cpp @@ -0,0 +1,118 @@ +#include "../perf_precomp.hpp" +#include "opencv2/ts/ocl_perf.hpp" +#include + +#ifdef HAVE_OPENCL + +namespace cvtest +{ +namespace ocl +{ + +using std::tr1::tuple; +using std::tr1::get; +using std::tr1::make_tuple; +using std::make_pair; +using namespace perf; +using namespace testing; +using namespace cv; +using namespace cv::dnn; + +enum {STRIDE_OFF = 1, STRIDE_ON = 2}; +CV_ENUM(StrideSize, STRIDE_OFF, STRIDE_ON); + +enum {GROUP_OFF = 1, GROUP_2 = 2}; +CV_ENUM(GroupSize, GROUP_OFF, GROUP_2); + +//Squared Size +#define SSZ(n) cv::Size(n, n) + +typedef std::pair InpShapeNumOut; +typedef tuple ConvParam; //kernel_size, inp shape, groups, stride +typedef TestBaseWithParam ConvolutionPerfTest; + +static inline MatShape blobShape(int count, int nplanes, int height, int width) +{ + int data[] = {count, nplanes, height, width}; + return MatShape(data, data+4); +} + +OCL_PERF_TEST_P( ConvolutionPerfTest, perf, Combine( + Values(Size(1, 1), Size(3, 3), Size(5, 5), Size(11, 11)), + Values(make_pair(blobShape(1, 4, 224, 224), 64), + make_pair(blobShape(1, 64, 112, 122), 128), + make_pair(blobShape(1, 256, 28, 28), 512)), + GroupSize::all(), + StrideSize::all()) +) +{ + RNG rng(0); + + ConvParam params = GetParam(); + int ksz = get<0>(params).width; + MatShape inpShape = get<1>(params).first; + int outCn = get<1>(params).second; + int groups = get<2>(params); + int stride = (ksz >= 11) ? 
4 : (int)get<3>(params); + + int inpCn = inpShape[1]; + int wgtSize[] = { outCn, inpCn/groups, ksz, ksz }; + int biasSize[] = { outCn, 1, 1, 1 }; + const int wtype = CV_32F; + Mat wgtBlob(4, wgtSize, wtype), biasBlob(4, biasSize, wtype); + Mat inpBlob(4, &inpShape[0], wtype); + rng.fill(biasBlob, RNG::UNIFORM, -1, +1); + rng.fill(wgtBlob, RNG::UNIFORM, -1, +1); + rng.fill(inpBlob, RNG::UNIFORM, -1, +1); + + LayerParams lp; + lp.set("num_output", outCn); + lp.set("group", groups); + lp.set("stride", stride); + lp.set("kernel_size", ksz); + lp.blobs.reserve(2); + lp.blobs.push_back(wgtBlob); + lp.blobs.push_back(biasBlob); + + std::vector inpBlobs(1, &inpBlob); + std::vector outBlobs, internalBlobs; + + cv::setNumThreads(cv::getNumberOfCPUs()); + + Ptr layer = cv::dnn::LayerFactory::createLayerInstance("Convolution", lp); + std::vector inputShapes(1, shape(inpBlob)), outShapes, internals; + layer->getMemoryShapes(inputShapes, 0, outShapes, internals); + for (int i = 0; i < outShapes.size(); i++) + { + outBlobs.push_back(Mat(outShapes[i], CV_32F)); + } + for (int i = 0; i < internals.size(); i++) + { + internalBlobs.push_back(Mat()); + if (total(internals[i])) + internalBlobs.back().create(internals[i], CV_32F); + } + + layer->finalize(inpBlobs, outBlobs); + layer->preferableTarget = DNN_TARGET_OPENCL; + + Mat inpBlob2D = inpBlob.reshape(1, outCn); + Mat wgtBlob2D = wgtBlob.reshape(1, outCn*(inpCn/groups)); + Mat outBlob2D = outBlobs[0].reshape(1, outBlobs[0].size[0]); + declare.in(inpBlob2D, wgtBlob2D, WARMUP_RNG).out(outBlob2D).tbb_threads(cv::getNumThreads()); + + // warmup + layer->forward(inpBlobs, outBlobs, internalBlobs); + + TEST_CYCLE() + { + layer->forward(inpBlobs, outBlobs, internalBlobs); + } + + SANITY_CHECK_NOTHING(); +} + +} +} + +#endif diff --git a/modules/dnn/perf/perf_net.cpp b/modules/dnn/perf/perf_net.cpp index 55f5ce69e6..990470f655 100644 --- a/modules/dnn/perf/perf_net.cpp +++ b/modules/dnn/perf/perf_net.cpp @@ -40,7 +40,7 @@ public: if 
(backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL) { -#if 0 //defined(HAVE_OPENCL) +#if defined(HAVE_OPENCL) if (!cv::ocl::useOpenCL()) #endif { diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index 64bb85f042..424e8425a4 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -875,7 +875,7 @@ struct Net::Impl if (preferableBackend == DNN_BACKEND_DEFAULT) { - CV_Assert(preferableTarget == DNN_TARGET_CPU); + CV_Assert(preferableTarget == DNN_TARGET_CPU || preferableTarget == DNN_TARGET_OPENCL); return; } @@ -1000,6 +1000,7 @@ struct Net::Impl Ptr layerPtr = ld.getLayerInstance(); { layerPtr->finalize(ld.inputBlobs, ld.outputBlobs); + layerPtr->preferableTarget = preferableTarget; #if 0 std::cout << "\toutputs:"; size_t noutputs = ld.outputBlobs.size(); @@ -1026,7 +1027,7 @@ struct Net::Impl void fuseLayers(const std::vector& blobsToKeep_) { - if( !fusion || preferableBackend == DNN_BACKEND_HALIDE ) + if( !fusion || !(preferableBackend == DNN_BACKEND_DEFAULT && preferableTarget == DNN_TARGET_CPU)) return; CV_TRACE_FUNCTION(); @@ -1236,7 +1237,6 @@ struct Net::Impl } layersTimings.resize(lastLayerId + 1, 0); - fuseLayers(blobsToKeep_); } @@ -1402,7 +1402,7 @@ struct Net::Impl } else { - CV_Assert(preferableTarget == DNN_TARGET_CPU); + CV_Assert(preferableTarget == DNN_TARGET_CPU || preferableTarget == DNN_TARGET_OPENCL); } return ld.outputBlobs[pin.oid]; } @@ -1963,12 +1963,12 @@ int64 Net::getPerfProfile(std::vector& timings) Importer::~Importer() {} -Layer::Layer() {} +Layer::Layer() { preferableTarget = DNN_TARGET_CPU; } Layer::Layer(const LayerParams ¶ms) : blobs(params.blobs), name(params.name), type(params.type) { - + preferableTarget = DNN_TARGET_CPU; } void Layer::setParamsFrom(const LayerParams ¶ms) diff --git a/modules/dnn/src/layers/concat_layer.cpp b/modules/dnn/src/layers/concat_layer.cpp index 67d82c2eb0..6833b0468b 100644 --- a/modules/dnn/src/layers/concat_layer.cpp +++ b/modules/dnn/src/layers/concat_layer.cpp 
@@ -43,6 +43,7 @@ #include "../precomp.hpp" #include "layers_common.hpp" #include "op_halide.hpp" +#include "opencl_kernels_dnn.hpp" namespace cv { @@ -174,11 +175,62 @@ public: } }; +#ifdef HAVE_OPENCL + bool forward_ocl(std::vector &inputs, std::vector &outputs, std::vector &internals) + { + CV_TRACE_FUNCTION(); + CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + + int cAxis = clamp(axis, inputs[0]->dims); + if (!(cAxis == 1 && outputs[0].dims == 4 && !padding)) + return false; + + int bottom_concat_axis; + int concat_size = inputs[0]->size[2] * inputs[0]->size[3]; + int top_concat_axis = outputs[0].size[1]; + int offset_concat_axis = 0; + UMat inpMat, outMat; + outMat = outputs[0].getUMat(ACCESS_WRITE); + + ocl::Kernel kernel; + String buildopt = String("-DDtype=") + ocl::typeToStr(inputs[0]->type()) + String(" "); + if (!kernel.create("concat", ocl::dnn::concat_oclsrc, buildopt)) + return false; + + for (size_t i = 0; i < inputs.size(); i++) + { + inpMat = inputs[i]->getUMat(ACCESS_READ); + bottom_concat_axis = inputs[i]->size[1]; + size_t nthreads = inputs[i]->total(); + + kernel.set(0, (int)nthreads); + kernel.set(1, ocl::KernelArg::PtrReadOnly(inpMat)); + kernel.set(2, (int)inputs[i]->size[0]); + kernel.set(3, (int)concat_size); + kernel.set(4, (int)top_concat_axis); + kernel.set(5, (int)bottom_concat_axis); + kernel.set(6, (int)offset_concat_axis); + kernel.set(7, ocl::KernelArg::PtrWriteOnly(outMat)); + + if (!kernel.run(1, &nthreads, NULL, false)) + return false; + + offset_concat_axis += bottom_concat_axis; + } + + return true; + } +#endif + void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) { CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), + forward_ocl(inputs, outputs, internals)) + int cAxis = clamp(axis, inputs[0]->dims); Mat& outMat = outputs[0]; diff --git 
a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 8440662367..129b874ea0 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -47,6 +47,10 @@ #include "opencv2/core/hal/intrin.hpp" #include +#ifdef HAVE_OPENCL +using namespace cv::dnn::ocl4dnn; +#endif + namespace cv { namespace dnn @@ -150,6 +154,11 @@ public: Ptr bnorm; Ptr scaleLayer; +#ifdef HAVE_OPENCL + Ptr > convolutionOp; + std::vector umat_blobs; +#endif + MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const { Size out(outShape[3], outShape[2]); @@ -636,6 +645,42 @@ public: } }; +#ifdef HAVE_OPENCL + bool forward_ocl(std::vector &inputs, std::vector &outputs, std::vector &internals) + { + int group = inputs[0]->size[1] / umat_blobs[0].size[1]; + + if (convolutionOp.empty()) + { + OCL4DNNConvConfig config; + config.in_shape = shape(*inputs[0]); + config.out_shape = shape(outputs[0]); + config.kernel = kernel; + config.pad = pad; + config.stride = stride; + config.dilation = dilation; + config.group = group; + config.bias_term = (hasBias()) ? true : false; + + convolutionOp = Ptr >(new OCL4DNNConvSpatial(config)); + } + + for (size_t ii = 0; ii < outputs.size(); ii++) + { + UMat inpMat, outMat; + inpMat = inputs[ii]->getUMat(ACCESS_READ); + outMat = outputs[ii].getUMat(ACCESS_WRITE); + + int batch_size = inpMat.size[0]; + + if (!convolutionOp->Forward(inpMat, umat_blobs[0], hasBias() ? 
umat_blobs[1] : UMat(), + outMat, batch_size)) + return false; + } + return true; + } +#endif + void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) { CV_TRACE_FUNCTION(); @@ -649,6 +694,10 @@ public: int ngroups = inputs[0]->size[1]/blobs[0].size[1]; CV_Assert(outputs[0].size[1] % ngroups == 0); + CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), + forward_ocl(inputs, outputs, internals)) + int k, outCn = blobs[0].size[0]; if( weightsMat.empty() ) @@ -1203,8 +1252,17 @@ static void initConvDeconvLayerFromCaffe(Ptr l, const Laye Ptr ConvolutionLayer::create(const LayerParams ¶ms) { - Ptr l(new ConvolutionLayerImpl); + ConvolutionLayerImpl* conv_ptr = new ConvolutionLayerImpl; + Ptr l(conv_ptr); initConvDeconvLayerFromCaffe(l, params); + +#ifdef HAVE_OPENCL + size_t n = params.blobs.size(); + conv_ptr->umat_blobs.resize(n); + for (int i = 0; i < n; i++) + conv_ptr->umat_blobs[i] = params.blobs[i].getUMat(ACCESS_READ); +#endif + return l; } diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp index dee3fbb825..027eda4cc2 100644 --- a/modules/dnn/src/layers/elementwise_layers.cpp +++ b/modules/dnn/src/layers/elementwise_layers.cpp @@ -41,9 +41,12 @@ //M*/ #include "../precomp.hpp" +#include "layers_common.hpp" #include "op_halide.hpp" #include "opencv2/imgproc.hpp" #include +#include "opencl_kernels_dnn.hpp" +#include namespace cv { @@ -158,6 +161,10 @@ public: { CV_TRACE_FUNCTION(); + CV_OCL_RUN((this->preferableTarget == DNN_TARGET_OPENCL) && + OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), + func.applyOCL(inputs, outputs, internals)) + for (size_t i = 0; i < inputs.size(); i++) { const Mat &src = *inputs[i]; @@ -191,6 +198,13 @@ public: bool run_parallel; }; +#ifdef HAVE_OPENCL +static String oclGetTMacro(const UMat &m) +{ + return String("-DT=") + ocl::typeToStr(m.type()) + String(" "); +} +#endif + struct 
ReLUFunctor { typedef ReLULayer Layer; @@ -230,6 +244,46 @@ struct ReLUFunctor } } +#ifdef HAVE_OPENCL + bool initKernel(ocl::Kernel &ker, const UMat &src) const + { + const char *buildoptSlope = (slope == 0) ? "-DRELU_NO_SLOPE" : ""; + String buildopt = oclGetTMacro(src) + buildoptSlope; + + if (!ker.create("ReLUForward", ocl::dnn::activations_oclsrc, buildopt)) + return false; + + if (slope != 0) + ker.set(3, (float)slope); + + return true; + } + + bool applyOCL(std::vector &inputs, std::vector &outputs, std::vector &internals) + { + size_t wgSize = ocl::Device::getDefault().maxWorkGroupSize(); + + for (size_t i = 0; i < inputs.size(); i++) + { + UMat src, dst; + inputs[i]->copyTo(src); + dst = outputs[i].getUMat(ACCESS_WRITE); + CV_Assert(src.isContinuous() && dst.isContinuous() && !src.offset && !dst.offset); + + ocl::Kernel ker; + CV_Assert(initKernel(ker, src)); + ker.set(0, (int)src.total()); + ker.set(1, ocl::KernelArg::PtrReadOnly(src)); + ker.set(2, ocl::KernelArg::PtrWriteOnly(dst)); + + size_t gSize = src.total(); + CV_Assert(ker.run(1, &gSize, &wgSize, false)); + } + + return true; + } +#endif + #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { @@ -293,6 +347,14 @@ struct ReLU6Functor } } +#ifdef HAVE_OPENCL + bool applyOCL(std::vector &inputs, std::vector &outputs, std::vector &internals) + { + // TODO: implement OCL version + return false; + } +#endif + #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { @@ -320,6 +382,14 @@ struct TanHFunctor } } +#ifdef HAVE_OPENCL + bool applyOCL(std::vector &inputs, std::vector &outputs, std::vector &internals) + { + // TODO: implement OCL version + return false; + } +#endif + #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { @@ -347,6 +417,14 @@ struct SigmoidFunctor } } +#ifdef HAVE_OPENCL + bool applyOCL(std::vector &inputs, std::vector &outputs, std::vector &internals) + { + // TODO: implement OCL version + 
return false; + } +#endif + #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { @@ -376,6 +454,14 @@ struct ELUFunctor } } +#ifdef HAVE_OPENCL + bool applyOCL(std::vector &inputs, std::vector &outputs, std::vector &internals) + { + // TODO: implement OCL version + return false; + } +#endif + #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { @@ -403,6 +489,14 @@ struct AbsValFunctor } } +#ifdef HAVE_OPENCL + bool applyOCL(std::vector &inputs, std::vector &outputs, std::vector &internals) + { + // TODO: implement OCL version + return false; + } +#endif + #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { @@ -430,6 +524,14 @@ struct BNLLFunctor } } +#ifdef HAVE_OPENCL + bool applyOCL(std::vector &inputs, std::vector &outputs, std::vector &internals) + { + // TODO: implement OCL version + return false; + } +#endif + #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { @@ -479,6 +581,14 @@ struct PowerFunctor } } +#ifdef HAVE_OPENCL + bool applyOCL(std::vector &inputs, std::vector &outputs, std::vector &internals) + { + // TODO: implement OCL version + return false; + } +#endif + #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { @@ -524,18 +634,18 @@ struct ChannelsPReLUFunctor v_float32x4 s4 = v_setall_f32(s), z = v_setzero_f32(); for( ; i <= len - 16; i += 16 ) { - v_float32x4 x0 = v_load(ptr + i); - v_float32x4 x1 = v_load(ptr + i + 4); - v_float32x4 x2 = v_load(ptr + i + 8); - v_float32x4 x3 = v_load(ptr + i + 12); + v_float32x4 x0 = v_load(srcptr + i); + v_float32x4 x1 = v_load(srcptr + i + 4); + v_float32x4 x2 = v_load(srcptr + i + 8); + v_float32x4 x3 = v_load(srcptr + i + 12); x0 = v_select(x0 >= z, x0, x0*s4); x1 = v_select(x1 >= z, x1, x1*s4); x2 = v_select(x2 >= z, x2, x2*s4); x3 = v_select(x3 >= z, x3, x3*s4); - v_store(ptr + i, x0); - v_store(ptr + i + 4, x1); - v_store(ptr + i + 8, x2); - 
v_store(ptr + i + 12, x3); + v_store(dstptr + i, x0); + v_store(dstptr + i + 4, x1); + v_store(dstptr + i + 8, x2); + v_store(dstptr + i + 12, x3); } #endif for( ; i < len; i++ ) @@ -546,6 +656,14 @@ struct ChannelsPReLUFunctor } } +#ifdef HAVE_OPENCL + bool applyOCL(std::vector &inputs, std::vector &outputs, std::vector &internals) + { + // TODO: implement OCL version + return false; + } +#endif + #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp index 9bec3b086f..7893a2f83a 100644 --- a/modules/dnn/src/layers/fully_connected_layer.cpp +++ b/modules/dnn/src/layers/fully_connected_layer.cpp @@ -43,8 +43,13 @@ #include "../precomp.hpp" #include "layers_common.hpp" #include "op_halide.hpp" +#include "opencl_kernels_dnn.hpp" #include +#ifdef HAVE_OPENCL +using namespace cv::dnn::ocl4dnn; +#endif + namespace cv { namespace dnn @@ -55,6 +60,11 @@ class FullyConnectedLayerImpl : public InnerProductLayer public: enum { VEC_ALIGN = 8 }; +#ifdef HAVE_OPENCL + Ptr > innerProductOp; + std::vector umat_blobs; +#endif + FullyConnectedLayerImpl(const LayerParams& params) { setParamsFrom(params); @@ -84,6 +94,12 @@ public: biasMat = blobs[1] = blobs[1].reshape(1, 1); else biasMat = Mat::zeros(1, numOutput, weightsMat.type()); + +#ifdef HAVE_OPENCL + size_t n = blobs.size(); + umat_blobs.resize(n); + for (int i = 0; i < n; i++) umat_blobs[i] = blobs[i].getUMat(ACCESS_READ); +#endif } bool getMemoryShapes(const std::vector &inputs, @@ -238,11 +254,78 @@ public: bool useAVX2; }; +#ifdef HAVE_OPENCL + bool forward_ocl(std::vector &input, std::vector &output) + { + int axisCan = clamp(axis, input[0]->dims); + int numOutput = blobs[0].size[0]; + int innerSize = blobs[0].size[1]; + int outerSize = input[0]->total(0, axisCan); + bool ret = true; + + if (innerProductOp.empty()) + { + OCL4DNNInnerProductConfig config; + config.num_output = 
numOutput; + config.bias_term = bias; + config.M = outerSize; + config.K = innerSize; + + innerProductOp = Ptr >(new OCL4DNNInnerProduct(config)); + } + + UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type()); + for (size_t i = 0; i < input.size(); i++) + { + UMat srcMat, dstMat; + srcMat = input[i]->getUMat(ACCESS_READ); + dstMat = output[i].getUMat(ACCESS_WRITE); + dstMat.setTo(0.0f); + + if (!innerProductOp->Forward(srcMat, umat_blobs[0], (bias) ? umat_blobs[1] : UMat(), dstMat)) + { + ret = false; + break; + } + + if (bias && (outerSize > 1)) + { + UMat& biases = umat_blobs[1]; + cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0); + } + } + + if (ret) return true; + + UMat& weights = umat_blobs[0]; + for (size_t i = 0; i < input.size(); i++) + { + UMat srcMat, dstMat; + srcMat = input[i]->reshape(1, outerSize).getUMat(ACCESS_READ); + dstMat = output[i].reshape(1, outerSize).getUMat(ACCESS_WRITE); + + cv::gemm(srcMat, weights, 1, noArray(), 0, dstMat, GEMM_2_T); + + if (bias) + { + UMat& biases = umat_blobs[1]; + cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0); + } + } + + return true; + } +#endif + void forward(std::vector &input, std::vector &output, std::vector &) { CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), + forward_ocl(input, output)) + int axisCan = clamp(axis, input[0]->dims); int outerSize = input[0]->total(0, axisCan); diff --git a/modules/dnn/src/layers/layers_common.hpp b/modules/dnn/src/layers/layers_common.hpp index 46170e9109..ed8add94ff 100644 --- a/modules/dnn/src/layers/layers_common.hpp +++ b/modules/dnn/src/layers/layers_common.hpp @@ -51,6 +51,10 @@ #include "layers/layers_common.simd_declarations.hpp" #undef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY +#ifdef HAVE_OPENCL +#include "ocl4dnn.hpp" +#endif + namespace cv { namespace dnn diff --git a/modules/dnn/src/layers/lrn_layer.cpp 
b/modules/dnn/src/layers/lrn_layer.cpp index aa7a7cbf4d..62dde95e90 100644 --- a/modules/dnn/src/layers/lrn_layer.cpp +++ b/modules/dnn/src/layers/lrn_layer.cpp @@ -46,8 +46,13 @@ #include "opencv2/imgproc.hpp" #include "opencv2/dnn/shape_utils.hpp" #include "opencv2/core/hal/hal.hpp" +#include "opencl_kernels_dnn.hpp" #include +#ifdef HAVE_OPENCL +using namespace cv::dnn::ocl4dnn; +#endif + namespace cv { namespace dnn @@ -78,18 +83,64 @@ public: normBySize = params.get("norm_by_size", true); } +#ifdef HAVE_OPENCL + Ptr > lrnOp; +#endif + virtual bool supportBackend(int backendId) { return backendId == DNN_BACKEND_DEFAULT || backendId == DNN_BACKEND_HALIDE && haveHalide(); } +#ifdef HAVE_OPENCL + bool forward_ocl(std::vector &inputs, std::vector &outputs, std::vector &internals) + { + if (lrnOp.empty()) + { + OCL4DNNLRNConfig config; + config.lrn_type = type == CHANNEL_NRM ? + LRNParameter_NormRegion_ACROSS_CHANNELS : + LRNParameter_NormRegion_WITHIN_CHANNEL; + + CHECK_EQ(size % 2, 1)<< "LRN only supports odd values for local_size"; + config.local_size = size; + config.alpha = alpha; + config.beta = beta; + config.k = bias; + CHECK_EQ(4, inputs[0]->dims) << "Input must have 4 axes, " + << "corresponding to (num, channels, height, width)"; + config.batch_size = inputs[0]->size[0]; + config.channels = inputs[0]->size[1]; + config.height = inputs[0]->size[2]; + config.width = inputs[0]->size[3]; + config.norm_by_size = normBySize; + + lrnOp = Ptr >(new OCL4DNNLRN(config)); + } + + UMat inpMat, outMat; + inpMat = inputs[0]->getUMat(ACCESS_READ); + outMat = outputs[0].getUMat(ACCESS_WRITE); + + if (!lrnOp->Forward(inpMat, outMat)) + return false; + + return true; + } +#endif + void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) { CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_Assert(inputs.size() == outputs.size()); + + CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + 
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), + forward_ocl(inputs, outputs, internals)) + for (int i = 0; i < inputs.size(); i++) { CV_Assert(inputs[i]->dims == 4); diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp index b54b52d7fc..c27315ba26 100644 --- a/modules/dnn/src/layers/pooling_layer.cpp +++ b/modules/dnn/src/layers/pooling_layer.cpp @@ -44,10 +44,14 @@ #include "layers_common.hpp" #include "opencv2/core/hal/intrin.hpp" #include "op_halide.hpp" +#include "opencl_kernels_dnn.hpp" #include #include using std::max; using std::min; +#ifdef HAVE_OPENCL +using namespace cv::dnn::ocl4dnn; +#endif namespace cv { @@ -81,6 +85,10 @@ public: ceilMode = params.get("ceil_mode", true); } +#ifdef HAVE_OPENCL + Ptr > poolOp; +#endif + void finalize(const std::vector &inputs, std::vector &outputs) { CV_Assert(inputs.size() == 1); @@ -104,11 +112,59 @@ public: type == PoolingLayer::AVE && !pad.width && !pad.height); } +#ifdef HAVE_OPENCL + bool forward_ocl(std::vector &inputs, std::vector &outputs, std::vector &internals) + { + if (poolOp.empty()) + { + OCL4DNNPoolConfig config; + + config.in_shape = shape(*inputs[0]); + config.out_shape = shape(outputs[0]); + config.kernel = kernel; + config.pad = pad; + config.stride = stride; + config.channels = inputs[0]->size[1]; + config.pool_method = type == MAX ? LIBDNN_POOLING_METHOD_MAX : + (type == AVE ? 
LIBDNN_POOLING_METHOD_AVE : + LIBDNN_POOLING_METHOD_STO); + poolOp = Ptr >(new OCL4DNNPool(config)); + } + + for (size_t ii = 0; ii < inputs.size(); ii++) + { + UMat inpMat, outMat, maskMat; + + inpMat = inputs[ii]->getUMat(ACCESS_READ); + + if (type == MAX) + { + outMat = outputs[2 * ii].getUMat(ACCESS_WRITE); + maskMat = outputs[2 * ii + 1].getUMat(ACCESS_WRITE); + } else { + outMat = outputs[ii].getUMat(ACCESS_WRITE); + maskMat = UMat(); + } + + CV_Assert(inpMat.offset == 0 && outMat.offset == 0); + + if (!poolOp->Forward(inpMat, outMat, maskMat)) + return false; + } + + return true; + } +#endif + void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) { CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), + forward_ocl(inputs, outputs, internals)) + for (size_t ii = 0; ii < inputs.size(); ii++) { switch (type) diff --git a/modules/dnn/src/layers/softmax_layer.cpp b/modules/dnn/src/layers/softmax_layer.cpp index 828557da49..fd14e29a05 100644 --- a/modules/dnn/src/layers/softmax_layer.cpp +++ b/modules/dnn/src/layers/softmax_layer.cpp @@ -43,9 +43,13 @@ #include "../precomp.hpp" #include "layers_common.hpp" #include "op_halide.hpp" +#include "opencl_kernels_dnn.hpp" #include #include using std::max; +#ifdef HAVE_OPENCL +using namespace cv::dnn::ocl4dnn; +#endif namespace cv { @@ -63,6 +67,10 @@ public: setParamsFrom(params); } +#ifdef HAVE_OPENCL + Ptr > softmaxOp; +#endif + bool getMemoryShapes(const std::vector &inputs, const int requiredOutputs, std::vector &outputs, @@ -82,11 +90,91 @@ public: backendId == DNN_BACKEND_HALIDE && haveHalide() && axisRaw == 1; } +#ifdef HAVE_OPENCL + bool forward_ocl(std::vector &inputs, std::vector &outputs, std::vector &internals) + { + if (softmaxOp.empty()) + { + OCL4DNNSoftmaxConfig config; + + config.in_shape = shape(*inputs[0]); + config.axis = axisRaw; + 
config.channels = inputs[0]->size[axisRaw]; + + softmaxOp = Ptr >(new OCL4DNNSoftmax(config)); + } + + UMat srcMat, dstMat; + srcMat = inputs[0]->getUMat(ACCESS_READ); + dstMat = outputs[0].getUMat(ACCESS_WRITE); + + if (!logSoftMax && softmaxOp->Forward(srcMat, dstMat)) + return true; + + const Mat &src = *inputs[0]; + UMat bufMat = internals[0].getUMat(ACCESS_WRITE); + srcMat.copyTo(dstMat); + + int axis = clamp(axisRaw, src.dims); + size_t outerSize = src.total(0, axis); + size_t channels = src.size[axis]; + size_t innerSize = src.total(axis + 1); + + String buildOpts = String("-DT=") + ocl::typeToStr(src.type()); + ocl::Kernel kmax, ksub, ksum, kdiv; + + if (!kmax.create("kernel_channel_max", ocl::dnn::softmax_oclsrc, buildOpts)) + return false; + + if (!ksub.create("kernel_channel_subtract", ocl::dnn::softmax_oclsrc, buildOpts)) + return false; + + if (!ksum.create("kernel_channel_sum", ocl::dnn::softmax_oclsrc, buildOpts)) + return false; + + if (logSoftMax) buildOpts += " -DLOG_SOFTMAX "; + if (!kdiv.create("kernel_channel_div", ocl::dnn::softmax_oclsrc, buildOpts)) + return false; + + size_t wgSize = ocl::Device::getDefault().maxWorkGroupSize(); + size_t bufSize = internals[0].total(); + size_t totalSize = src.total(); + + kmax.args((int)outerSize, (int)channels, (int)innerSize, + ocl::KernelArg::PtrReadOnly(dstMat), ocl::KernelArg::PtrReadWrite(bufMat)); + if (!kmax.run(1, &bufSize, &wgSize, false)) + return false; + + ksub.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize, + ocl::KernelArg::PtrReadOnly(bufMat), ocl::KernelArg::PtrReadWrite(dstMat)); + if (!ksub.run(1, &totalSize, &wgSize, false)) + return false; + + cv::exp(dstMat, dstMat); + + ksum.args((int)outerSize, (int)channels, (int)innerSize, + ocl::KernelArg::PtrReadOnly(dstMat), ocl::KernelArg::PtrReadWrite(bufMat)); + if (!ksum.run(1, &bufSize, &wgSize, false)) + return false; + + kdiv.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize, + 
ocl::KernelArg::PtrReadOnly(bufMat), ocl::KernelArg::PtrReadWrite(dstMat)); + if (!kdiv.run(1, &totalSize, &wgSize, false)) + return false; + + return true; + } +#endif + void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) { CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), + forward_ocl(inputs, outputs, internals)) + const Mat &src = *inputs[0]; Mat &dst = outputs[0]; diff --git a/modules/dnn/src/ocl4dnn/include/common.hpp b/modules/dnn/src/ocl4dnn/include/common.hpp new file mode 100644 index 0000000000..41466429b0 --- /dev/null +++ b/modules/dnn/src/ocl4dnn/include/common.hpp @@ -0,0 +1,62 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef _OPENCV_LIBDNN_COMMON_HPP_ +#define _OPENCV_LIBDNN_COMMON_HPP_ +#include "../../precomp.hpp" +#include "../../caffe/glog_emulator.hpp" +#include + +#ifdef HAVE_OPENCL + +// Macro to select the single (_float) or double (_double) precision kernel +#define CL_KERNEL_SELECT(kernel) kernel "_float" + +#define OCL_CHECK(condition) \ + do { \ + cl_int error = (condition); \ + CHECK_EQ(error, CL_SUCCESS) << " " << cv::ocl::getOpenCLErrorString(error); \ + } while (0) + +bool clOptionSupport(cv::String option); + +#endif // HAVE_OPENCL +#endif diff --git a/modules/dnn/src/ocl4dnn/include/default_kernel_config.hpp b/modules/dnn/src/ocl4dnn/include/default_kernel_config.hpp new file mode 100644 index 0000000000..df3e321e31 --- /dev/null +++ b/modules/dnn/src/ocl4dnn/include/default_kernel_config.hpp @@ -0,0 +1,854 @@ +#ifndef _OPENCV_OCL4DNN_DEFAULT_KERNEL_CONFIG_HPP_ +#define _OPENCV_OCL4DNN_DEFAULT_KERNEL_CONFIG_HPP_ +const char *default_kernel_config_intel[] = { +// Below is the information for OpenCL 
based on which these configurations tuned +/******************************************************************************* +Number of platforms 1 + Platform Name Intel(R) OpenCL + Platform Vendor Intel(R) Corporation + Platform Version OpenCL 2.0 + Platform Profile FULL_PROFILE + Platform Extensions cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_device_side_avc_motion_estimation cl_intel_driver_diagnostics cl_intel_media_block_io cl_intel_motion_estimation cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_required_subgroup_size cl_intel_subgroups cl_intel_subgroups_short cl_intel_va_api_media_sharing cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_fp16 cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_spir cl_khr_subgroups + Platform Extensions function suffix INTEL + + Platform Name Intel(R) OpenCL +Number of devices 1 + Device Name Intel(R) HD Graphics + Device Vendor Intel(R) Corporation + Device Vendor ID 0x8086 + Device Version OpenCL 2.0 + Driver Version r4.1.61547 + Device OpenCL C Version OpenCL C 2.0 + Device Type GPU + Device Profile FULL_PROFILE + Max compute units 72 + Max clock frequency 950MHz + Device Partition (core) + Max number of sub-devices 0 + Supported partition types by (0x7FE000000000) + Max work item dimensions 3 + Max work item sizes 256x256x256 + Max work group size 256 + Preferred work group size multiple 32 + Preferred / native vector sizes + char 16 / 16 + short 8 / 8 + int 4 / 4 + long 1 / 1 + half 8 / 8 (cl_khr_fp16) + float 1 / 1 + double 1 / 1 (cl_khr_fp64) + Half-precision Floating-point support (cl_khr_fp16) + Denormals Yes + Infinity and NANs Yes + Round to nearest Yes + Round to zero Yes + Round to infinity Yes + IEEE754-2008 fused multiply-add Yes + Support is emulated in software No + 
Correctly-rounded divide and sqrt operations No + Single-precision Floating-point support (core) + Denormals Yes + Infinity and NANs Yes + Round to nearest Yes + Round to zero Yes + Round to infinity Yes + IEEE754-2008 fused multiply-add Yes + Support is emulated in software No + Correctly-rounded divide and sqrt operations Yes + Double-precision Floating-point support (cl_khr_fp64) + Denormals Yes + Infinity and NANs Yes + Round to nearest Yes + Round to zero Yes + Round to infinity Yes + IEEE754-2008 fused multiply-add Yes + Support is emulated in software No + Correctly-rounded divide and sqrt operations No + Address bits 64, Little-Endian + Global memory size 26887677543 (25.04GiB) + Error Correction support No + Max memory allocation 4294959103 (4GiB) + Unified memory for Host and Device Yes + Shared Virtual Memory (SVM) capabilities (core) + Coarse-grained buffer sharing Yes + Fine-grained buffer sharing No + Fine-grained system sharing No + Atomics No + Minimum alignment for any data type 128 bytes + Alignment of base address 1024 bits (128 bytes) + Preferred alignment for atomics + SVM 64 bytes + Global 64 bytes + Local 64 bytes + Max size for global variable 65536 (64KiB) + Preferred total size of global vars 4294959103 (4GiB) + Global Memory cache type Read/Write + Global Memory cache size 1572864 + Global Memory cache line 64 bytes + Image support Yes + Max number of samplers per kernel 16 + Max size for 1D images from buffer 268434943 pixels + Max 1D or 2D image array size 2048 images + Base address alignment for 2D image buffers 4 bytes + Pitch alignment for 2D image buffers 4 bytes + Max 2D image size 16384x16384 pixels + Max 3D image size 16384x16384x2048 pixels + Max number of read image args 128 + Max number of write image args 128 + Max number of read/write image args 128 + Max number of pipe args 16 + Max active pipe reservations 1 + Max pipe packet size 1024 + Local memory type Local + Local memory size 65536 (64KiB) + Max constant buffer size 
4294959103 (4GiB) + Max number of constant args 8 + Max size of kernel argument 1024 + Queue properties (on host) + Out-of-order execution Yes + Profiling Yes + Queue properties (on device) + Out-of-order execution Yes + Profiling Yes + Preferred size 131072 (128KiB) + Max size 67108864 (64MiB) + Max queues on device 1 + Max events on device 1024 + Prefer user sync for interop Yes + Profiling timer resolution 83ns + Execution capabilities + Run OpenCL kernels Yes + Run native kernels No + SPIR versions 1.2 + printf() buffer size 4194304 (4MiB) + Built-in kernels block_motion_estimate_intel;block_advanced_motion_estimate_check_intel;block_advanced_motion_estimate_bidirectional_check_intel + Motion Estimation accelerator version (Intel) 2 + Device Available Yes + Compiler Available Yes + Linker Available Yes + Device Extensions cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_device_side_avc_motion_estimation cl_intel_driver_diagnostics cl_intel_media_block_io cl_intel_motion_estimation cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_required_subgroup_size cl_intel_subgroups cl_intel_subgroups_short cl_intel_va_api_media_sharing cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_fp16 cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_spir cl_khr_subgroups + +NULL platform behavior + clGetPlatformInfo(NULL, CL_PLATFORM_NAME, ...) No platform + clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, ...) No platform + clCreateContext(NULL, ...) [default] No platform + clCreateContext(NULL, ...) 
[other] Success [INTEL] + clCreateContextFromType(NULL, CL_DEVICE_TYPE_CPU) No platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU) No platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_ACCELERATOR) No platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_CUSTOM) No platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_ALL) No platform +********************************************************************************/ +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M128","4 6 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32","12 2 8 2 1 1 8 1 0 ", +"EU72_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num1_M64","1 8 32 5 1 8 1 1 0 ", +"EU72_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128","4 2 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn128_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M64","1 8 32 5 1 8 1 1 0 ", +"EU72_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num2_M192","2 7 16 2 1 1 16 1 0 ", +"EU72_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M48","4 3 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32","4 6 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M96","1 8 32 5 1 8 1 1 0 ", +"EU72_k11x7_cn3_g1_s3x4_d1x1_b1_in64x64_p3x2_num1_M64","4 1 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64","8 3 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M32","4 6 8 2 1 1 8 1 0 ", +"EU72_k3x3_cn4_g1_s1x1_d1x1_b1_in256x256_p1x1_num1_M4","14 1 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn3_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M4","4 4 8 2 1 1 8 1 0 ", +"EU72_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M128","4 2 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M192","1 8 32 5 1 8 1 1 0 ", +"EU72_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M208","2 6 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M32","8 3 8 2 1 1 8 1 0 ", 
+"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M384","1 8 32 5 1 8 1 1 0 ", +"EU72_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M320","2 5 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160","8 3 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M256","1 8 32 5 1 8 1 1 0 ", +"EU72_k5x1_cn32_g1_s1x1_d1x1_b0_in64x64_p2x0_num1_M32","4 6 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn16_g1_s1x1_d1x1_b0_in256x256_p0x0_num1_M4","12 2 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64","2 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M16","8 3 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn32_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M128","1 16 32 5 1 16 1 1 0 ", +"EU72_k3x3_cn32_g1_s1x1_d2x2_b1_in64x64_p2x2_num1_M32","3 6 16 2 1 1 16 1 0 ", +"EU72_k3x3_cn32_g1_s1x1_d16x16_b1_in64x64_p16x16_num1_M32","1 16 32 5 1 16 1 1 0 ", +"EU72_k1x1_cn128_g1_s1x1_d1x1_b0_in32x32_p0x0_num1_M512","2 8 32 5 1 8 1 1 0 ", +"EU72_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M384","2 7 16 2 1 1 16 1 0 ", +"EU72_k5x4_cn6_g3_s3x2_d1x1_b1_in128x80_p1x0_num2_M4","1 1 1 4 1 1 1 0 1 ", +"EU72_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num2_M96","4 5 16 2 1 1 16 1 0 ", +"EU72_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M192","10 2 16 2 1 1 16 1 0 ", +"EU72_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M192","6 4 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn4_g1_s1x1_d1x1_b0_in256x256_p0x0_num1_M16","2 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M96","8 3 8 2 1 1 8 1 0 ", +"EU72_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M32","8 1 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M64","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M384","4 7 8 2 1 1 8 1 0 ", +"EU72_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256","2 6 16 2 1 1 16 1 0 ", +"EU72_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128","6 4 16 2 1 1 16 1 0 ", +"EU72_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M64","4 4 
16 2 1 1 16 1 0 ", +"EU72_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M48","4 3 16 2 1 1 16 1 0 ", +"EU72_k3x3_cn3_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M5","2 3 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M24","8 2 8 2 1 1 8 1 0 ", +"EU72_k3x3_cn128_g1_s1x1_d1x1_b0_in32x32_p1x1_num1_M128","1 8 32 5 1 8 1 1 0 ", +"EU72_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num2_M128","2 7 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn128_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M32","1 16 32 5 1 16 1 1 0 ", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M112","8 2 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M64","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M160","1 8 32 5 1 8 1 1 0 ", +"EU72_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128","4 3 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num2_M64","1 16 32 5 1 16 1 1 0 ", +"EU72_k1x1_cn64_g1_s1x1_d1x1_b0_in128x128_p0x0_num1_M16","2 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M144","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128","8 2 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn16_g1_s1x1_d1x1_b0_in128x128_p0x0_num1_M64","1 16 32 5 1 16 1 1 0 ", +"EU72_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M224","2 7 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256","4 6 8 2 1 1 8 1 0 ", +"EU72_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M96","4 3 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn512_g1_s2x2_d1x1_b0_in32x32_p0x0_num1_M256","1 8 32 5 1 8 1 1 0 ", +"EU72_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num2_M192","10 2 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64","12 2 8 2 1 1 8 1 0 ", +"EU72_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M128","2 5 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M48","4 6 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M48","1 8 32 5 1 8 1 1 0 ", 
+"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256","8 3 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M64","2 8 32 5 1 8 1 1 0 ", +"EU72_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M288","2 5 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M128","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn1024_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M256","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M96","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn512_g1_s2x2_d1x1_b0_in32x32_p0x0_num1_M1024","1 16 32 5 1 16 1 1 0 ", +"EU72_k1x1_cn2048_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M512","4 6 8 2 1 1 8 1 0 ", +"EU72_k3x3_cn512_g1_s1x1_d1x1_b0_in16x16_p1x1_num1_M512","2 5 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M16","8 2 8 2 1 1 8 1 0 ", +"EU72_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64","4 2 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M128","8 3 8 2 1 1 8 1 0 ", +"EU72_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M288","2 7 16 2 1 1 16 1 0 ", +"EU72_k3x3_cn16_g1_s1x1_d1x1_b1_in128x128_p1x1_num1_M16","2 5 16 2 1 1 16 1 0 ", +"EU72_k3x3_cn32_g1_s1x1_d8x8_b1_in64x64_p8x8_num1_M32","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn64_g1_s1x1_d1x1_b0_in128x128_p0x0_num1_M4","8 3 8 2 1 1 8 1 0 ", +"EU72_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M256","2 7 16 2 1 1 16 1 0 ", +"EU72_k3x3_cn256_g1_s1x1_d1x1_b0_in16x16_p1x1_num1_M256","2 5 16 2 1 1 16 1 0 ", +"EU72_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M224","2 5 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128","1 8 32 5 1 8 1 1 0 ", +"EU72_k2x2_cn16_g1_s2x2_d1x1_b0_in256x256_p0x0_num1_M16","6 4 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192","4 6 8 2 1 1 8 1 0 ", 
+"EU72_k1x1_cn1024_g1_s2x2_d1x1_b0_in16x16_p0x0_num1_M512","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M160","1 8 32 5 1 8 1 1 0 ", +"EU72_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M192","2 5 16 2 1 1 16 1 0 ", +"EU72_k5x5_cn96_g2_s1x1_d1x1_b1_in32x32_p2x2_num1_M128","4 3 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32","8 2 8 2 1 1 8 1 0 ", +"EU72_k2x2_cn64_g1_s2x2_d1x1_b0_in128x128_p0x0_num1_M32","8 3 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn64_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M256","1 16 32 5 1 16 1 1 0 ", +"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32","12 2 8 2 1 1 8 1 0 ", +"EU72_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num2_M32","4 2 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M16","12 1 8 2 1 1 8 1 0 ", +"EU72_k11x11_cn3_g1_s4x4_d1x1_b1_in224x224_p0x0_num1_M96","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M64","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M256","4 7 8 2 1 1 8 1 0 ", +"EU72_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384","2 5 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M16","12 1 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160","8 3 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn256_g1_s2x2_d1x1_b0_in64x64_p0x0_num1_M512","1 16 32 5 1 16 1 1 0 ", +"EU72_k1x1_cn128_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M16","2 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M192","4 6 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M96","1 16 32 5 1 16 1 1 0 ", +"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32","12 1 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M64","12 2 8 2 1 1 8 1 0 ", +"EU72_k3x3_cn256_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384","2 7 16 2 1 1 16 1 0 ", +"EU72_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64","4 2 16 2 1 1 16 1 0 ", 
+"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24","12 1 8 2 1 1 8 1 0 ", +"EU72_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M128","4 2 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M160","4 6 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M144","1 8 32 5 1 8 1 1 0 ", +"EU72_k3x3_cn32_g1_s1x1_d4x4_b1_in64x64_p4x4_num1_M32","1 8 32 5 1 8 1 1 0 ", +"EU72_k3x3_cn3_g1_s2x2_d1x1_b1_in256x256_p1x1_num1_M13","1 1 1 4 1 1 1 0 1 ", +"EU72_k3x3_cn32_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M32","6 4 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M32","1 8 32 5 1 8 1 1 0 ", +"EU72_k3x3_cn64_g1_s1x1_d1x1_b0_in64x64_p1x1_num1_M64","2 7 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M1024","2 8 32 5 1 8 1 1 0 ", +"EU72_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M320","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x5_cn32_g1_s1x1_d1x1_b1_in64x64_p0x2_num1_M32","4 6 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn64_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M64","1 16 32 5 1 16 1 1 0 ", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160","4 6 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b0_in32x32_p0x0_num1_M128","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64","8 3 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64","12 2 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M128","2 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M32","8 3 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M32","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M112","1 8 32 5 1 8 1 1 0 ", +"EU72_k4x4_cn3_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M2","1 3 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn1024_g1_s2x2_d1x1_b0_in16x16_p0x0_num1_M2048","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn256_g1_s2x2_d1x1_b0_in64x64_p0x0_num1_M128","1 8 32 5 1 8 1 1 0 ", +"EU72_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num2_M64","1 8 32 5 1 8 1 
1 0 ", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M2048","1 8 32 5 1 8 1 1 0 ", +"EU72_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M64","8 1 16 2 1 1 16 1 0 ", +"EU72_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M208","2 7 16 2 1 1 16 1 0 ", +// Below is the information for OpenCL based on which these configurations tuned +/******************************************************************************* +Number of platforms 1 + Platform Name Intel(R) OpenCL + Platform Vendor Intel(R) Corporation + Platform Version OpenCL 2.0 + Platform Profile FULL_PROFILE + Platform Extensions cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_driver_diagnostics cl_intel_motion_estimation cl_intel_packed_yuv cl_intel_required_subgroup_size cl_intel_subgroups cl_intel_subgroups_short cl_intel_va_api_media_sharing cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_fp16 cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_spir cl_khr_subgroups + Platform Extensions function suffix INTEL + + Platform Name Intel(R) OpenCL +Number of devices 1 + Device Name Intel(R) HD Graphics + Device Vendor Intel(R) Corporation + Device Vendor ID 0x8086 + Device Version OpenCL 2.0 + Driver Version 16.5.56875 + Device OpenCL C Version OpenCL C 2.0 ( using IGC ) + Device Type GPU + Device Profile FULL_PROFILE + Max compute units 48 + Max clock frequency 950MHz + Device Partition (core) + Max number of sub-devices 0 + Supported partition types by (0x7F4B00000000) + Max work item dimensions 3 + Max work item sizes 256x256x256 + Max work group size 256 + Preferred work group size multiple 32 + Preferred / native vector sizes + char 16 / 16 + short 8 / 8 + int 4 / 4 + long 1 / 1 + half 8 / 8 (cl_khr_fp16) + float 1 / 1 + double 1 / 1 (cl_khr_fp64) + Half-precision 
Floating-point support (cl_khr_fp16) + Denormals Yes + Infinity and NANs Yes + Round to nearest Yes + Round to zero Yes + Round to infinity Yes + IEEE754-2008 fused multiply-add Yes + Support is emulated in software No + Correctly-rounded divide and sqrt operations No + Single-precision Floating-point support (core) + Denormals Yes + Infinity and NANs Yes + Round to nearest Yes + Round to zero Yes + Round to infinity Yes + IEEE754-2008 fused multiply-add Yes + Support is emulated in software No + Correctly-rounded divide and sqrt operations Yes + Double-precision Floating-point support (cl_khr_fp64) + Denormals Yes + Infinity and NANs Yes + Round to nearest Yes + Round to zero Yes + Round to infinity Yes + IEEE754-2008 fused multiply-add Yes + Support is emulated in software No + Correctly-rounded divide and sqrt operations No + Address bits 64, Little-Endian + Global memory size 13361912218 (12.44GiB) + Error Correction support No + Max memory allocation 4294959103 (4GiB) + Unified memory for Host and Device Yes + Shared Virtual Memory (SVM) capabilities (core) + Coarse-grained buffer sharing Yes + Fine-grained buffer sharing No + Fine-grained system sharing No + Atomics No + Minimum alignment for any data type 128 bytes + Alignment of base address 1024 bits (128 bytes) + Preferred alignment for atomics + SVM 64 bytes + Global 64 bytes + Local 64 bytes + Max size for global variable 65536 (64KiB) + Preferred total size of global vars 4294959103 (4GiB) + Global Memory cache type Read/Write + Global Memory cache size 1048576 + Global Memory cache line 64 bytes + Image support Yes + Max number of samplers per kernel 16 + Max size for 1D images from buffer 268434943 pixels + Max 1D or 2D image array size 2048 images + Base address alignment for 2D image buffers 4 bytes + Pitch alignment for 2D image buffers 4 bytes + Max 2D image size 16384x16384 pixels + Max 3D image size 16384x16384x2048 pixels + Max number of read image args 128 + Max number of write image args 128 
+ Max number of read/write image args 128 + Max number of pipe args 16 + Max active pipe reservations 1 + Max pipe packet size 1024 + Local memory type Local + Local memory size 65536 (64KiB) + Max constant buffer size 4294959103 (4GiB) + Max number of constant args 8 + Max size of kernel argument 1024 + Queue properties (on host) + Out-of-order execution Yes + Profiling Yes + Queue properties (on device) + Out-of-order execution Yes + Profiling Yes + Preferred size 131072 (128KiB) + Max size 67108864 (64MiB) + Max queues on device 1 + Max events on device 1024 + Prefer user sync for interop Yes + Profiling timer resolution 83ns + Execution capabilities + Run OpenCL kernels Yes + Run native kernels No + SPIR versions 1.2 + printf() buffer size 4194304 (4MiB) + Built-in kernels block_motion_estimate_intel;block_advanced_motion_estimate_check_intel;block_advanced_motion_estimate_bidirectional_check_intel + Motion Estimation accelerator version (Intel) 2 + Device Available Yes + Compiler Available Yes + Linker Available Yes + Device Extensions cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_driver_diagnostics cl_intel_motion_estimation cl_intel_packed_yuv cl_intel_required_subgroup_size cl_intel_subgroups cl_intel_subgroups_short cl_intel_va_api_media_sharing cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_fp16 cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_spir cl_khr_subgroups + +NULL platform behavior + clGetPlatformInfo(NULL, CL_PLATFORM_NAME, ...) No platform + clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, ...) No platform + clCreateContext(NULL, ...) [default] No platform + clCreateContext(NULL, ...) 
[other] Success [INTEL] + clCreateContextFromType(NULL, CL_DEVICE_TYPE_CPU) No platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU) No platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_ACCELERATOR) No platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_CUSTOM) No platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_ALL) No platform +********************************************************************************/ +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M32","8 3 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64","8 2 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn32_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M128","1 16 32 5 1 16 1 1 0 ", +"EU48_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M32","8 1 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M144","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M96","1 16 32 5 1 16 1 1 0 ", +"EU48_k3x3_cn128_g1_s1x1_d1x1_b0_in32x32_p1x1_num1_M128","6 4 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128","2 8 32 5 1 8 1 1 0 ", +"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64","8 1 16 2 1 1 16 1 0 ", +"EU48_k2x2_cn16_g1_s2x2_d1x1_b0_in256x256_p0x0_num1_M16","2 7 16 2 1 1 16 1 0 ", +"EU48_k3x3_cn4_g1_s1x1_d1x1_b1_in256x256_p1x1_num1_M4","6 4 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn128_g1_s1x1_d1x1_b0_in32x32_p0x0_num1_M512","2 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M112","8 3 8 2 1 1 8 1 0 ", +"EU48_k3x3_cn512_g1_s1x1_d1x1_b0_in16x16_p1x1_num1_M512","2 7 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M64","2 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M384","4 6 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M16","8 2 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M96","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M1024","1 8 32 5 1 8 1 1 0 ", 
+"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M192","4 7 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn128_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M64","1 8 32 5 1 8 1 1 0 ", +"EU48_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M320","2 7 16 2 1 1 16 1 0 ", +"EU48_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num1_M64","1 8 32 5 1 8 1 1 0 ", +"EU48_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M48","4 2 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M256","2 8 32 5 1 8 1 1 0 ", +"EU48_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M192","2 8 16 2 1 1 16 1 0 ", +"EU48_k11x11_cn3_g1_s4x4_d1x1_b1_in224x224_p0x0_num1_M96","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M112","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32","12 1 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn512_g1_s2x2_d1x1_b0_in32x32_p0x0_num1_M256","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128","12 2 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M64","8 2 8 2 1 1 8 1 0 ", +"EU48_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num2_M192","2 7 16 2 1 1 16 1 0 ", +"EU48_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256","2 5 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn16_g1_s1x1_d1x1_b0_in256x256_p0x0_num1_M4","8 3 8 2 1 1 8 1 0 ", +"EU48_k1x5_cn32_g1_s1x1_d1x1_b1_in64x64_p0x2_num1_M32","4 7 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256","4 7 8 2 1 1 8 1 0 ", +"EU48_k3x3_cn3_g1_s2x2_d1x1_b1_in256x256_p1x1_num1_M13","1 1 1 4 1 1 1 0 1 ", +"EU48_k11x7_cn3_g1_s3x4_d1x1_b1_in64x64_p3x2_num1_M64","4 1 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M96","8 3 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn128_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M16","1 8 32 5 1 8 1 1 0 ", +"EU48_k3x3_cn32_g1_s1x1_d2x2_b1_in64x64_p2x2_num1_M32","3 3 16 2 1 1 16 1 0 ", +"EU48_k3x3_cn32_g1_s1x1_d8x8_b1_in64x64_p8x8_num1_M32","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M96","2 8 32 5 1 8 
1 1 0 ", +"EU48_k2x2_cn64_g1_s2x2_d1x1_b0_in128x128_p0x0_num1_M32","4 4 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128","4 3 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b0_in32x32_p0x0_num1_M128","2 8 32 5 1 8 1 1 0 ", +"EU48_k3x3_cn16_g1_s1x1_d1x1_b1_in128x128_p1x1_num1_M16","2 7 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn4_g1_s1x1_d1x1_b0_in256x256_p0x0_num1_M16","2 8 32 5 1 8 1 1 0 ", +"EU48_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M128","6 2 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn3_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M4","4 2 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M144","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M160","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M384","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn256_g1_s2x2_d1x1_b0_in64x64_p0x0_num1_M128","1 16 32 5 1 16 1 1 0 ", +"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M192","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn1024_g1_s2x2_d1x1_b0_in16x16_p0x0_num1_M2048","1 16 32 5 1 16 1 1 0 ", +"EU48_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M384","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M128","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn16_g1_s1x1_d1x1_b0_in128x128_p0x0_num1_M64","1 16 32 5 1 16 1 1 0 ", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M32","4 7 8 2 1 1 8 1 0 ", +"EU48_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M192","2 5 16 2 1 1 16 1 0 ", +"EU48_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128","6 4 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M32","8 3 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M64","12 2 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M64","2 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn2048_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M512","4 7 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64","12 2 8 2 1 1 8 1 0 ", 
+"EU48_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M224","1 8 32 5 1 8 1 1 0 ", +"EU48_k3x3_cn256_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384","2 7 16 2 1 1 16 1 0 ", +"EU48_k3x3_cn32_g1_s1x1_d4x4_b1_in64x64_p4x4_num1_M32","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256","1 8 32 5 1 8 1 1 0 ", +"EU48_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384","2 4 16 2 1 1 16 1 0 ", +"EU48_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M288","2 4 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M48","4 6 8 2 1 1 8 1 0 ", +"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M64","8 1 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M160","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160","12 2 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M64","2 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192","4 6 8 2 1 1 8 1 0 ", +"EU48_k5x5_cn96_g2_s1x1_d1x1_b1_in32x32_p2x2_num1_M128","4 5 16 2 1 1 16 1 0 ", +"EU48_k3x3_cn256_g1_s1x1_d1x1_b0_in16x16_p1x1_num1_M256","2 6 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32","8 3 8 2 1 1 8 1 0 ", +"EU48_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M48","4 2 16 2 1 1 16 1 0 ", +"EU48_k3x3_cn64_g1_s1x1_d1x1_b0_in64x64_p1x1_num1_M64","10 2 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160","4 6 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32","4 5 8 2 1 1 8 1 0 ", +"EU48_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M208","2 5 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M256","4 6 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M2048","2 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M48","4 6 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn64_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M256","1 16 32 5 1 16 1 1 0 ", +"EU48_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M224","2 7 16 
2 1 1 16 1 0 ", +"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M32","1 8 32 5 1 8 1 1 0 ", +"EU48_k5x1_cn32_g1_s1x1_d1x1_b0_in64x64_p2x0_num1_M32","2 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn64_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M64","1 8 32 5 1 8 1 1 0 ", +"EU48_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M288","2 7 16 2 1 1 16 1 0 ", +"EU48_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M192","2 7 16 2 1 1 16 1 0 ", +"EU48_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num2_M32","4 3 16 2 1 1 16 1 0 ", +"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num2_M96","4 2 16 2 1 1 16 1 0 ", +"EU48_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M208","2 5 16 2 1 1 16 1 0 ", +"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M96","4 2 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24","12 1 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn64_g1_s1x1_d1x1_b0_in128x128_p0x0_num1_M16","4 7 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn256_g1_s2x2_d1x1_b0_in64x64_p0x0_num1_M512","2 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn1024_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M256","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M128","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M32","1 8 32 5 1 8 1 1 0 ", +"EU48_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M320","2 8 16 2 1 1 16 1 0 ", +"EU48_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num2_M192","6 4 16 2 1 1 16 1 0 ", +"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M128","4 3 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M160","4 6 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M128","1 8 32 5 1 8 1 1 0 ", +"EU48_k3x3_cn3_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M5","2 3 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32","8 3 8 2 1 1 8 1 0 ", 
+"EU48_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num2_M64","1 16 32 5 1 16 1 1 0 ", +"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128","8 2 16 2 1 1 16 1 0 ", +"EU48_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num2_M64","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M128","4 6 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn128_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M32","1 16 32 5 1 16 1 1 0 ", +"EU48_k1x1_cn1024_g1_s2x2_d1x1_b0_in16x16_p0x0_num1_M512","1 8 32 5 1 8 1 1 0 ", +"EU48_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M64","4 4 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32","8 3 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160","12 2 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64","8 3 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn512_g1_s2x2_d1x1_b0_in32x32_p0x0_num1_M1024","1 8 32 5 1 8 1 1 0 ", +"EU48_k5x4_cn6_g3_s3x2_d1x1_b1_in128x80_p1x0_num2_M4","1 1 1 4 1 1 1 0 1 ", +"EU48_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M256","2 7 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M24","8 2 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M16","12 1 8 2 1 1 8 1 0 ", +"EU48_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num2_M128","10 2 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64","1 16 32 5 1 16 1 1 0 ", +"EU48_k3x3_cn32_g1_s1x1_d16x16_b1_in64x64_p16x16_num1_M32","1 16 32 5 1 16 1 1 0 ", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128","4 7 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M16","12 2 8 2 1 1 8 1 0 ", +"EU48_k4x4_cn3_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M2","1 4 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn64_g1_s1x1_d1x1_b0_in128x128_p0x0_num1_M4","8 2 8 2 1 1 8 1 0 ", +"EU48_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64","4 2 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M16","1 8 32 5 1 8 1 1 0 ", +"EU48_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M128","4 2 16 2 1 
1 16 1 0 ", +"EU48_k3x3_cn32_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M32","2 8 16 2 1 1 16 1 0 ", +"EU48_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128","4 2 16 2 1 1 16 1 0 ", +// Below is the information for OpenCL based on which these configurations tuned +/******************************************************************************* +Number of platforms 1 + Platform Name Intel(R) OpenCL + Platform Vendor Intel(R) Corporation + Platform Version OpenCL 2.0 + Platform Profile FULL_PROFILE + Platform Extensions cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_device_side_avc_motion_estimation cl_intel_driver_diagnostics cl_intel_media_block_io cl_intel_motion_estimation cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_required_subgroup_size cl_intel_subgroups cl_intel_subgroups_short cl_intel_va_api_media_sharing cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_fp16 cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_spir cl_khr_subgroups + Platform Extensions function suffix INTEL + + Platform Name Intel(R) OpenCL +Number of devices 1 + Device Name Intel(R) HD Graphics + Device Vendor Intel(R) Corporation + Device Vendor ID 0x8086 + Device Version OpenCL 2.0 + Driver Version 16.5.59288 + Device OpenCL C Version OpenCL C 2.0 + Device Type GPU + Device Profile FULL_PROFILE + Max compute units 24 + Max clock frequency 1050MHz + Device Partition (core) + Max number of sub-devices 0 + Supported partition types by (0x7F5100000000) + Max work item dimensions 3 + Max work item sizes 256x256x256 + Max work group size 256 + Preferred work group size multiple 32 + Preferred / native vector sizes + char 16 / 16 + short 8 / 8 + int 4 / 4 + long 1 / 1 + half 8 / 8 (cl_khr_fp16) + float 1 / 1 + double 1 / 1 (cl_khr_fp64) + Half-precision 
Floating-point support (cl_khr_fp16) + Denormals Yes + Infinity and NANs Yes + Round to nearest Yes + Round to zero Yes + Round to infinity Yes + IEEE754-2008 fused multiply-add Yes + Support is emulated in software No + Correctly-rounded divide and sqrt operations No + Single-precision Floating-point support (core) + Denormals Yes + Infinity and NANs Yes + Round to nearest Yes + Round to zero Yes + Round to infinity Yes + IEEE754-2008 fused multiply-add Yes + Support is emulated in software No + Correctly-rounded divide and sqrt operations Yes + Double-precision Floating-point support (cl_khr_fp64) + Denormals Yes + Infinity and NANs Yes + Round to nearest Yes + Round to zero Yes + Round to infinity Yes + IEEE754-2008 fused multiply-add Yes + Support is emulated in software No + Correctly-rounded divide and sqrt operations No + Address bits 64, Little-Endian + Global memory size 6588802663 (6.136GiB) + Error Correction support No + Max memory allocation 3294401331 (3.068GiB) + Unified memory for Host and Device Yes + Shared Virtual Memory (SVM) capabilities (core) + Coarse-grained buffer sharing Yes + Fine-grained buffer sharing No + Fine-grained system sharing No + Atomics No + Minimum alignment for any data type 128 bytes + Alignment of base address 1024 bits (128 bytes) + Preferred alignment for atomics + SVM 64 bytes + Global 64 bytes + Local 64 bytes + Max size for global variable 65536 (64KiB) + Preferred total size of global vars 3294401331 (3.068GiB) + Global Memory cache type Read/Write + Global Memory cache size 524288 + Global Memory cache line 64 bytes + Image support Yes + Max number of samplers per kernel 16 + Max size for 1D images from buffer 205900083 pixels + Max 1D or 2D image array size 2048 images + Base address alignment for 2D image buffers 4 bytes + Pitch alignment for 2D image buffers 4 bytes + Max 2D image size 16384x16384 pixels + Max 3D image size 16384x16384x2048 pixels + Max number of read image args 128 + Max number of write image 
args 128 + Max number of read/write image args 128 + Max number of pipe args 16 + Max active pipe reservations 1 + Max pipe packet size 1024 + Local memory type Local + Local memory size 65536 (64KiB) + Max constant buffer size 3294401331 (3.068GiB) + Max number of constant args 8 + Max size of kernel argument 1024 + Queue properties (on host) + Out-of-order execution Yes + Profiling Yes + Queue properties (on device) + Out-of-order execution Yes + Profiling Yes + Preferred size 131072 (128KiB) + Max size 67108864 (64MiB) + Max queues on device 1 + Max events on device 1024 + Prefer user sync for interop Yes + Profiling timer resolution 83ns + Execution capabilities + Run OpenCL kernels Yes + Run native kernels No + SPIR versions 1.2 + printf() buffer size 4194304 (4MiB) + Built-in kernels block_motion_estimate_intel;block_advanced_motion_estimate_check_intel;block_advanced_motion_estimate_bidirectional_check_intel + Motion Estimation accelerator version (Intel) 2 + Device Available Yes + Compiler Available Yes + Linker Available Yes + Device Extensions cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_device_side_avc_motion_estimation cl_intel_driver_diagnostics cl_intel_media_block_io cl_intel_motion_estimation cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_required_subgroup_size cl_intel_subgroups cl_intel_subgroups_short cl_intel_va_api_media_sharing cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_fp16 cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_spir cl_khr_subgroups + +NULL platform behavior + clGetPlatformInfo(NULL, CL_PLATFORM_NAME, ...) No platform + clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, ...) No platform + clCreateContext(NULL, ...) [default] No platform + clCreateContext(NULL, ...) 
[other] Success [INTEL] + clCreateContextFromType(NULL, CL_DEVICE_TYPE_CPU) No platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU) No platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_ACCELERATOR) No platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_CUSTOM) No platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_ALL) No platform +********************************************************************************/ +"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64","2 8 32 5 1 8 1 1 0 ", +"EU24_k5x1_cn32_g1_s1x1_d1x1_b0_in64x64_p2x0_num1_M32","4 6 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128","1 8 32 5 1 8 1 1 0 ", +"EU24_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128","4 2 16 2 1 1 16 1 0 ", +"EU24_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M224","2 5 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32","1 8 32 5 1 8 1 1 0 ", +"EU24_k2x2_cn16_g1_s2x2_d1x1_b0_in256x256_p0x0_num1_M16","1 8 32 5 1 8 1 1 0 ", +"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M128","4 3 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M64","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M256","1 8 32 5 1 8 1 1 0 ", +"EU24_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M384","1 8 32 5 1 8 1 1 0 ", +"EU24_k3x3_cn256_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384","2 7 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn2048_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M512","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M16","2 8 32 5 1 8 1 1 0 ", +"EU24_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M128","2 7 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192","1 8 32 5 1 8 1 1 0 ", +"EU24_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M224","2 7 16 2 1 1 16 1 0 ", +"EU24_k3x3_cn32_g1_s1x1_d8x8_b1_in64x64_p8x8_num1_M32","1 8 32 5 1 8 1 1 0 ", 
+"EU24_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M208","2 7 16 2 1 1 16 1 0 ", +"EU24_k11x11_cn3_g1_s4x4_d1x1_b1_in224x224_p0x0_num1_M96","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160","1 8 32 5 1 8 1 1 0 ", +"EU24_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num2_M64","1 8 32 5 1 8 1 1 0 ", +"EU24_k3x3_cn32_g1_s1x1_d2x2_b1_in64x64_p2x2_num1_M32","3 3 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn128_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M64","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24","8 3 8 2 1 1 8 1 0 ", +"EU24_k3x3_cn128_g1_s1x1_d1x1_b0_in32x32_p1x1_num1_M128","6 4 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M144","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn1024_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M256","2 8 32 5 1 8 1 1 0 ", +"EU24_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M208","2 7 16 2 1 1 16 1 0 ", +"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128","4 3 16 2 1 1 16 1 0 ", +"EU24_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M48","4 2 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M2048","4 7 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn128_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M16","2 8 32 5 1 8 1 1 0 ", +"EU24_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M192","6 4 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M128","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M1024","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn32_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M128","1 16 32 5 1 16 1 1 0 ", +"EU24_k1x1_cn4_g1_s1x1_d1x1_b0_in256x256_p0x0_num1_M16","1 8 32 5 1 8 1 1 0 ", +"EU24_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384","2 7 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn128_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M32","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M32","4 6 8 2 1 1 8 1 0 ", 
+"EU24_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M128","4 4 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M128","2 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32","8 2 8 2 1 1 8 1 0 ", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M64","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn1024_g1_s2x2_d1x1_b0_in16x16_p0x0_num1_M2048","1 16 32 5 1 16 1 1 0 ", +"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M64","4 3 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M384","1 8 32 5 1 8 1 1 0 ", +"EU24_k5x4_cn6_g3_s3x2_d1x1_b1_in128x80_p1x0_num2_M4","1 1 1 4 1 1 1 0 1 ", +"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num2_M192","6 4 16 2 1 1 16 1 0 ", +"EU24_k3x3_cn256_g1_s1x1_d1x1_b0_in16x16_p1x1_num1_M256","2 7 16 2 1 1 16 1 0 ", +"EU24_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M320","2 8 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M160","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M64","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn512_g1_s2x2_d1x1_b0_in32x32_p0x0_num1_M256","2 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num2_M64","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256","2 8 32 5 1 8 1 1 0 ", +"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256","2 5 16 2 1 1 16 1 0 ", +"EU24_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M64","4 3 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M16","8 3 8 2 1 1 8 1 0 ", +"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M128","2 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M112","2 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn64_g1_s1x1_d1x1_b0_in128x128_p0x0_num1_M16","2 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M96","1 8 
32 5 1 8 1 1 0 ", +"EU24_k1x1_cn64_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M256","2 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M128","1 8 32 5 1 8 1 1 0 ", +"EU24_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num2_M32","4 2 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M96","8 3 8 2 1 1 8 1 0 ", +"EU24_k3x3_cn16_g1_s1x1_d1x1_b1_in128x128_p1x1_num1_M16","6 3 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M112","1 8 32 5 1 8 1 1 0 ", +"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num2_M96","4 3 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32","8 2 8 2 1 1 8 1 0 ", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M192","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M160","1 8 32 5 1 8 1 1 0 ", +"EU24_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M288","2 8 16 2 1 1 16 1 0 ", +"EU24_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M288","2 7 16 2 1 1 16 1 0 ", +"EU24_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num1_M64","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b0_in32x32_p0x0_num1_M128","1 8 32 5 1 8 1 1 0 ", +"EU24_k3x3_cn4_g1_s1x1_d1x1_b1_in256x256_p1x1_num1_M4","10 2 8 2 1 1 8 1 0 ", +"EU24_k3x3_cn32_g1_s1x1_d16x16_b1_in64x64_p16x16_num1_M32","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M16","8 2 8 2 1 1 8 1 0 ", +"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64","1 16 32 5 1 16 1 1 0 ", +"EU24_k1x5_cn32_g1_s1x1_d1x1_b1_in64x64_p0x2_num1_M32","4 7 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M32","8 3 8 2 1 1 8 1 0 ", +"EU24_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M192","2 7 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32","4 6 8 2 1 1 8 1 0 ", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128","4 6 8 2 1 1 8 1 0 ", +"EU24_k3x3_cn32_g1_s1x1_d4x4_b1_in64x64_p4x4_num1_M32","1 8 32 5 1 8 1 1 0 ", 
+"EU24_k2x2_cn64_g1_s2x2_d1x1_b0_in128x128_p0x0_num1_M32","2 8 32 5 1 8 1 1 0 ", +"EU24_k5x5_cn96_g2_s1x1_d1x1_b1_in32x32_p2x2_num1_M128","4 3 16 2 1 1 16 1 0 ", +"EU24_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M48","8 1 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn16_g1_s1x1_d1x1_b0_in256x256_p0x0_num1_M4","8 3 8 2 1 1 8 1 0 ", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M256","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M144","2 8 32 5 1 8 1 1 0 ", +"EU24_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128","6 4 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M32","2 8 32 5 1 8 1 1 0 ", +"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M192","2 7 16 2 1 1 16 1 0 ", +"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64","4 2 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M160","1 8 32 5 1 8 1 1 0 ", +"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M96","4 4 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160","4 6 8 2 1 1 8 1 0 ", +"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M32","2 8 32 5 1 8 1 1 0 ", +"EU24_k3x3_cn32_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M32","2 8 16 2 1 1 16 1 0 ", +"EU24_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num2_M128","10 2 16 2 1 1 16 1 0 ", +"EU24_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M320","2 7 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M32","8 3 8 2 1 1 8 1 0 ", +"EU24_k3x3_cn64_g1_s1x1_d1x1_b0_in64x64_p1x1_num1_M64","2 8 16 2 1 1 16 1 0 ", +"EU24_k3x3_cn3_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M5","2 3 8 2 1 1 8 1 0 ", +"EU24_k1x1_cn16_g1_s1x1_d1x1_b0_in128x128_p0x0_num1_M64","1 16 32 5 1 16 1 1 0 ", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M48","4 6 8 2 1 1 8 1 0 ", +"EU24_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64","4 2 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn64_g1_s1x1_d1x1_b0_in128x128_p0x0_num1_M4","8 2 8 2 1 1 8 1 0 ", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64","8 2 8 2 1 1 8 
1 0 ", +"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M96","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn64_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M64","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M192","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M48","4 6 8 2 1 1 8 1 0 ", +"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M256","2 7 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn3_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M4","4 4 16 2 1 1 16 1 0 ", +"EU24_k4x4_cn3_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M2","1 3 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M96","1 8 32 5 1 8 1 1 0 ", +"EU24_k3x3_cn512_g1_s1x1_d1x1_b0_in16x16_p1x1_num1_M512","2 7 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn256_g1_s2x2_d1x1_b0_in64x64_p0x0_num1_M128","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn512_g1_s2x2_d1x1_b0_in32x32_p0x0_num1_M1024","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M64","1 8 32 5 1 8 1 1 0 ", +"EU24_k11x7_cn3_g1_s3x4_d1x1_b1_in64x64_p3x2_num1_M64","4 1 16 2 1 1 16 1 0 ", +"EU24_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num2_M192","6 4 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M64","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64","1 16 32 5 1 16 1 1 0 ", +"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M16","8 3 8 2 1 1 8 1 0 ", +"EU24_k1x1_cn128_g1_s1x1_d1x1_b0_in32x32_p0x0_num1_M512","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn1024_g1_s2x2_d1x1_b0_in16x16_p0x0_num1_M512","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M384","4 7 8 2 1 1 8 1 0 ", +"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64","8 3 8 2 1 1 8 1 0 ", +"EU24_k3x3_cn3_g1_s2x2_d1x1_b1_in256x256_p1x1_num1_M13","1 1 1 4 1 1 1 0 1 ", +"EU24_k1x1_cn256_g1_s2x2_d1x1_b0_in64x64_p0x0_num1_M512","2 8 32 
5 1 8 1 1 0 ", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M24","8 3 8 2 1 1 8 1 0 ", +"EU24_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M32","4 3 16 2 1 1 16 1 0 ", +}; +#endif diff --git a/modules/dnn/src/ocl4dnn/include/math_functions.hpp b/modules/dnn/src/ocl4dnn/include/math_functions.hpp new file mode 100644 index 0000000000..cac860490f --- /dev/null +++ b/modules/dnn/src/ocl4dnn/include/math_functions.hpp @@ -0,0 +1,90 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. 
+// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef _OPENCV_GREENTEA_MATH_FUNCTIONS_HPP_ +#define _OPENCV_GREENTEA_MATH_FUNCTIONS_HPP_ +#include "../../precomp.hpp" +#include "common.hpp" + +namespace cv +{ +namespace dnn +{ +namespace ocl4dnn +{ + +#ifdef HAVE_OPENCL +enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113}; + +template +bool ocl4dnnGEMMCommon(const CBLAS_TRANSPOSE TransB, + const int32_t M, const int32_t N, const int32_t K, + const UMat A, const UMat B, + const UMat B_image, UMat C, + const size_t max_image_size); + +template +ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset, + bool is_matrix_a, bool transpose, + bool padding, int padded_height, + int padded_width, int height, + int width, int ld); + +template +bool ocl4dnnGEMV(const CBLAS_TRANSPOSE TransA, + const int32_t M, const int32_t N, const Dtype alpha, + const UMat A, const int32_t offA, const UMat x, + const int32_t offx, const Dtype beta, UMat y, + const int32_t offy); + +template +bool ocl4dnnAXPY(const int32_t N, const Dtype alpha, + const UMat x, const int32_t offx, UMat y, + const int32_t offy); + +#endif // HAVE_OPENCL + +} // namespace ocl4dnn +} // namespace dnn +} // namespce cv + 
+#endif diff --git a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp new file mode 100644 index 0000000000..603c0aade0 --- /dev/null +++ b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp @@ -0,0 +1,473 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef _OPENCV_LIBDNN_HPP_ +#define _OPENCV_LIBDNN_HPP_ +#include "../../precomp.hpp" +#include +#include +#include +#include +#include +#include "common.hpp" + +namespace cv { namespace dnn { namespace ocl4dnn { +#ifdef HAVE_OPENCL + +struct OCL4DNNConvConfig +{ + OCL4DNNConvConfig() : + kernel(1, 1), + pad(0, 0), + stride(1, 1), + dilation(1, 1), + group(1), + bias_term(false) + {} + MatShape in_shape; + MatShape out_shape; + Size kernel; + Size pad; + Size stride; + Size dilation; + int group; // = 1; + bool bias_term; // = false; +}; + + +template +class OCL4DNNConvSpatial +{ + public: + explicit OCL4DNNConvSpatial(OCL4DNNConvConfig config); + ~OCL4DNNConvSpatial(); + bool Forward(const UMat& bottom_data, const UMat& weight, + const UMat& bias, + UMat& top_data, int32_t batch_size); + + private: + struct kernelConfig + { + std::string kernelName; + float executionTime; + size_t local_work_size[3]; + size_t global_work_size[3]; + int32_t workItem_output[3]; + bool verified; + bool tested; + bool swizzle_weights; + bool use_null_local; + int32_t kernelType; + + kernelConfig() + {} + + kernelConfig(const std::string& name, const size_t* global_size, const size_t* local_size, + const int32_t* workItem, + bool swizzle, + int32_t type = 0) + : executionTime(0) + { + kernelName = name; + for (int32_t x = 0; x < 3; x++) + { + local_work_size[x] = local_size ? 
local_size[x] : 1; + global_work_size[x] = global_size[x]; + workItem_output[x] = workItem[x]; + } + swizzle_weights = swizzle; + use_null_local = local_size == NULL; + verified = false; + tested = false; + kernelType = type; + } + }; + + struct tunerParam + { + int kernelType; + int blockWidth; + int blockHeight; + int blockDepth; + + tunerParam(int type, int w, int h, int d) + { + kernelType = type; + blockWidth = w; + blockHeight= h; + blockDepth = d; + } + }; + + inline void addDef(const char* name) + { + options_ << " -D " << name; + } + + inline void addDef(const char* name, const int value) + { + options_ << " -D " << name << "=" << value; + } + + inline void addDef(const char* name, const float value) + { + options_ << " -D " << name << "=(float)" << value; + } + + inline void addDef(const char* name, const double value) + { + options_ << " -D " << name << "=(double)" << value; + } + + inline void addDef(const char* name, const char* value) + { + options_ << " -D " << name << "=" << value; + } + + void useFirstAvailable(const UMat &bottom, + UMat &top, + const UMat &weight, + const UMat &bias, + int32_t numImages, + UMat &verifyTop); + void setupKernel(); + void collectCommonInformation(); + void setupKernelDetails(int32_t kernelType, + int32_t blockM, + int32_t blockK, + int32_t blockN); + + ocl::Program compileKernel(); + typedef std::map phash_t; + phash_t phash; + void calculateBenchmark(const UMat &bottom, UMat &verifyTop, + const UMat &weight, const UMat &bias, + int32_t numImages); + + + void setupConvolution(const UMat &bottom, + UMat &top, + const UMat &weight, + const UMat &bias, + int32_t numImags, + UMat &verifyTop); + bool createConvolutionKernel(int32_t kernelType, + int32_t blockWidth, + int32_t blockHeight, + int32_t blockDepth); + bool setupIDLF(int32_t blockWidth, + int32_t blockHeight, + int32_t blockDepth); + bool createBasicKernel(int32_t blockWidth, + int32_t blockHeight, + int32_t blockDepth); + bool createGEMMLikeConvKernel(int32_t 
blockWidth, + int32_t blockHeight, + int32_t blockDepth); + void CreateSubBuffer(const UMat& buffer, UMat& sub_buffer, + int32_t offset, int32_t size, bool write_only); + bool convolve(const UMat &bottom, UMat &top, + const UMat &weight, const UMat &bias, + int32_t numImages, + kernelConfig* config, + const cv::ocl::Queue& queue); + float timedConvolve(const UMat &bottom, UMat &top, + const UMat &weight, const UMat &bias, + int32_t numImages, kernelConfig* config); + + bool verifyResult(const UMat &bottom, + UMat &top, + const UMat &weight, + const UMat &bias, + int32_t numImages, + kernelConfig* config, + UMat &verifyTop); + + bool swizzleWeight(const UMat &weight, + int32_t swizzled_factor, + bool interleave = false); + + void generateKey(); + std::string generateSpecificKey(int32_t type, int32_t blockWidth, + int32_t blockHeight, + int32_t blockDepth); + void cacheTunedConfig(); + bool loadTunedConfig(); + + void saveTunedConfig(); + bool loadCachedConfig(); + + void unloadProgram(const std::string& kernelName); + void prepareKernel(const UMat &bottom, UMat &top, + const UMat &weight, const UMat &bias, + int32_t numImages); + bool setupKernelByConfig(int x, int y, int z, int type, + int lx, int ly, int lz, + bool swizzle, bool nullLocal); + void generateTunerItems(std::vector< cv::Ptr > &tunerItems); + + int32_t group_; + bool bias_term_; + UMat swizzled_weights_umat; + + int32_t bottom_index_; + int32_t output_h_; + int32_t output_w_; + int32_t kernel_h_; + int32_t kernel_w_; + int32_t height_; + int32_t width_; + int32_t pad_h_; + int32_t pad_w_; + int32_t stride_h_; + int32_t stride_w_; + int32_t dilation_h_; + int32_t dilation_w_; + + /// M_ is the channel dimension of the output for a single group, which is the + /// leading dimension of the filter matrix. 
+ int32_t M_; + + bool tuned_; + std::string key_, key_sanitized_; + std::string short_key_; + std::string kernel_name_; + std::string cache_path_; + bool use_cache_path_; // true if cache_path_ directory exists + bool force_auto_tuning_; + int32_t kernel_index_; + std::vector< cv::Ptr > kernelQueue; + cv::Ptr bestKernelConfig; + + int32_t bottom_dim_; + int32_t top_dim_; + int32_t num_; + int32_t channels_; + int32_t num_output_; + + int32_t kernelType_; + int32_t blockM_; + int32_t blockK_; + int32_t blockN_; + std::stringstream options_; + cv::ocl::ProgramSource src_; + int32_t prev_kernel_type_; +}; + +typedef enum { + LIBDNN_POOLING_METHOD_MAX = 0, + LIBDNN_POOLING_METHOD_AVE = 1, + LIBDNN_POOLING_METHOD_STO = 2 +} ocl4dnnPoolingMethod_t; + +struct OCL4DNNPoolConfig +{ + OCL4DNNPoolConfig() : + kernel(1, 1), + pad(0, 0), + stride(1, 1), + dilation(1, 1), + channels(0), + pool_method(LIBDNN_POOLING_METHOD_MAX), + global_pooling(false) + {} + MatShape in_shape; + MatShape out_shape; + Size kernel; + Size pad; + Size stride; + Size dilation; + + int channels; + ocl4dnnPoolingMethod_t pool_method; // = LIBDNN_POOLING_METHOD_MAX; + bool global_pooling; // = false; +}; + +template +class OCL4DNNPool +{ + public: + explicit OCL4DNNPool(OCL4DNNPoolConfig config); + ~OCL4DNNPool(); + bool Forward(const UMat& bottom_data, + UMat& top_data, + UMat& top_mask); + private: + UMat mask_idx_; + + // Pooling parameters + std::vector pad_; + std::vector stride_; + std::vector kernel_shape_; + std::vector im_in_shape_; + std::vector im_out_shape_; + + ocl4dnnPoolingMethod_t pool_method_; + int32_t count_; + int32_t batch_size_; + int32_t channels_; + int32_t kernel_h_; + int32_t kernel_w_; + int32_t stride_h_; + int32_t stride_w_; + int32_t pad_h_; + int32_t pad_w_; + int32_t height_; + int32_t width_; + int32_t pooled_height_; + int32_t pooled_width_; +}; + +struct OCL4DNNInnerProductConfig +{ + OCL4DNNInnerProductConfig() : + num_output(0), M(0), K(0), + bias_term(false), 
transpose(false), phase_test(true) + {} + int num_output; + int M; + int K; + bool bias_term; + bool transpose; // = false; + bool phase_test; // = true; +}; + +template +class OCL4DNNInnerProduct +{ + public: + explicit OCL4DNNInnerProduct(OCL4DNNInnerProductConfig config); + ~OCL4DNNInnerProduct(); + bool Forward(const UMat& bottom_data, + const UMat& weight, + const UMat& bias, + UMat& top_data); + private: + OCL4DNNInnerProductConfig config_; + int32_t axis_; + int32_t num_output_; + int32_t M_; + int32_t N_; + int32_t K_; + bool bias_term_; + bool transpose_; + bool image_copied_; + bool phase_test_; +}; + +typedef enum { + LRNParameter_NormRegion_ACROSS_CHANNELS = 0, + LRNParameter_NormRegion_WITHIN_CHANNEL = 1 +} LRNParameter_NormRegion_WITHIN_CHANNEL_t; + +struct OCL4DNNLRNConfig +{ + OCL4DNNLRNConfig() : + phase_test(true) + {} + MatShape in_shape; + LRNParameter_NormRegion_WITHIN_CHANNEL_t lrn_type; + bool phase_test; // = true; + int local_size; + float alpha; + float beta; + float k; + bool norm_by_size; + int32_t batch_size; + int32_t channels; + int32_t height; + int32_t width; +}; + +template +class OCL4DNNLRN +{ + public: + explicit OCL4DNNLRN(OCL4DNNLRNConfig config); + bool Forward(const UMat& bottom_data, UMat& top_data); + + private: + bool crossChannelForward(const UMat& bottom_data, UMat& top_data); + LRNParameter_NormRegion_WITHIN_CHANNEL_t lrn_type_; + bool phase_test_; + int32_t size_; + Dtype alpha_; + Dtype beta_; + Dtype k_; + int32_t num_; + int32_t channels_; + int32_t height_; + int32_t width_; + bool norm_by_size_; +}; + +struct OCL4DNNSoftmaxConfig +{ + OCL4DNNSoftmaxConfig() + {} + MatShape in_shape; + int axis; + int channels; +}; + +template +class OCL4DNNSoftmax +{ + public: + explicit OCL4DNNSoftmax(OCL4DNNSoftmaxConfig config); + ~OCL4DNNSoftmax(); + bool Forward(const UMat& bottom_data, UMat& top_data); + + private: + int32_t softmax_axis_; + int32_t inner_num_; + int32_t outer_num_; + int32_t channels_; + int32_t count_; + 
bool use_slm_; + UMat scale_data_; +}; +#endif // HAVE_OPENCL +} // namespace ocl4dnn +} // namespace dnn +} // namespce cv +#endif diff --git a/modules/dnn/src/ocl4dnn/src/common.cpp b/modules/dnn/src/ocl4dnn/src/common.cpp new file mode 100644 index 0000000000..5a18c41110 --- /dev/null +++ b/modules/dnn/src/ocl4dnn/src/common.cpp @@ -0,0 +1,57 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "../../precomp.hpp" +#include "common.hpp" +#include "opencl_kernels_dnn.hpp" + +using namespace cv; + +#ifdef HAVE_OPENCL +bool clOptionSupport(cv::String option) +{ + cv::String errmsg; + ocl::Program program = ocl::Context::getDefault().getProg(ocl::dnn::dummy_oclsrc, option, errmsg); + return program.ptr() ? true : false; +} + +#endif // HAVE_OPENCL diff --git a/modules/dnn/src/ocl4dnn/src/math_functions.cpp b/modules/dnn/src/ocl4dnn/src/math_functions.cpp new file mode 100644 index 0000000000..42b35572aa --- /dev/null +++ b/modules/dnn/src/ocl4dnn/src/math_functions.cpp @@ -0,0 +1,538 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. 
+// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "../../precomp.hpp" +#include "common.hpp" +#include "math_functions.hpp" +#include +#include "opencl_kernels_dnn.hpp" + +namespace cv +{ +namespace dnn +{ +namespace ocl4dnn +{ + +#ifdef HAVE_OPENCL +// Create and copy buffer to image for GEMM's matrix A and B. +// Will return image to caller if the input image is NULL. Otherwise, +// will use the image directly. It's caller's responsibility to +// release the created image. 
+template +ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset, + bool is_matrix_a, bool transpose, + bool padding, int padded_height, + int padded_width, int height, + int width, int ld) +{ + ocl::Context ctx = ocl::Context::getDefault(); + ocl::Queue queue = ocl::Queue::getDefault(); + ocl::Image2D image; + + if (!is_matrix_a && transpose) + { + if (ld == width) + { + image = ocl::Image2D(buffer); + } else { + // For matrix B with transpose, we need to handle them differently. + // As we can't use the sub group block read to get a row easily, + // we have to use CL_FLOAT type with read_imagef to get the row. + UMat mat(height, width, CV_32FC1); + image = ocl::Image2D(mat); + + ocl::Kernel oclk_gemm_copy("gemm_buffer_copy_image_transpose_float", ocl::dnn::gemm_image_oclsrc); + + size_t global_copy[2]; + global_copy[0] = width; + global_copy[1] = height; + oclk_gemm_copy.set(0, ocl::KernelArg::PtrReadOnly(buffer)); + oclk_gemm_copy.set(1, image); + oclk_gemm_copy.set(2, offset); + oclk_gemm_copy.set(3, width); + oclk_gemm_copy.set(4, height); + oclk_gemm_copy.set(5, ld); + oclk_gemm_copy.run(2, global_copy, NULL, false); + } + } else { + if (!padding) + { + // copy without padding. 
+ image = ocl::Image2D(buffer); + } else { + UMat mat(padded_height, padded_width, CV_8UC4); + image = ocl::Image2D(mat); + + ocl::Kernel oclk_gemm_copy("gemm_buffer_copy_image_no_transpose_float", + ocl::dnn::gemm_image_oclsrc); + + size_t global_copy[2]; + global_copy[0] = padded_width; + global_copy[1] = padded_height; + + oclk_gemm_copy.set(0, ocl::KernelArg::PtrReadOnly(buffer)); + oclk_gemm_copy.set(1, image); + oclk_gemm_copy.set(2, offset); + oclk_gemm_copy.set(3, width); + oclk_gemm_copy.set(4, height); + oclk_gemm_copy.set(5, ld); + + oclk_gemm_copy.run(2, global_copy, NULL, false); + } + } + + return image; +} + +template +ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset, + bool is_matrix_a, bool transpose, + bool padding, int padded_height, + int padded_width, int height, + int width, int ld); + +enum gemm_type_t +{ + GEMM_TYPE_NONE = 0, + GEMM_TYPE_FAST_IMAGE_32_1, + GEMM_TYPE_FAST_IMAGE_32_2, + GEMM_TYPE_FAST_IMAGE_B_IMAGE, + GEMM_TYPE_MAX +}; + +template +static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int32_t M, + const int32_t N, const int32_t K, const Dtype alpha, + const UMat A, const int32_t offA, const UMat B, + const int32_t offB, const Dtype beta, UMat C, + const int32_t offC, bool is_image_a, bool is_image_b, + enum gemm_type_t gemm_type, + const size_t max_image_size) +{ + CHECK_EQ(gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 || gemm_type == GEMM_TYPE_FAST_IMAGE_32_2 || + gemm_type == GEMM_TYPE_FAST_IMAGE_B_IMAGE, true) << "Invalid fast image gemm type." << std::endl; + + if (is_image_a) + { + CHECK_EQ(offA, 0) << "Invalid input image offset." << std::endl; + return false; + } + + if (is_image_b) + { + CHECK_EQ(offB, 0) << "Invalid input image offset." << std::endl; + return false; + } + + int widthA = (TransA == CblasNoTrans) ? K : M; + int heightA = (TransA == CblasNoTrans) ? M : K; + int widthB = (TransB == CblasNoTrans) ? N : K; + int heightB = (TransB == CblasNoTrans) ? 
K : N; + + int ldA = widthA; + int ldB = widthB; + int ldC = N; + + int A_start_x = 0, A_start_y = 0, B_start_x = 0; + int B_start_y = 0, C_start_x = 0, C_start_y = 0; + int blocksize = 1024; + if (gemm_type == GEMM_TYPE_FAST_IMAGE_B_IMAGE) + blocksize = max_image_size; + int blockA_width = blocksize; + int blockA_height = blocksize; + int blockB_width = blocksize; + int blockB_height = blocksize; + int blockC_width = blocksize; + int blockC_height = blocksize; + + int use_buffer_indicator = 8; + // To fix the edge problem casued by the sub group block read. + // we have to pad the image if it's not multiple of tile. + // just padding one line is enough as the sub group block read + // will clamp to edge according to the spec. + + ocl::Context ctx = ocl::Context::getDefault(); + ocl::Queue queue = ocl::Queue::getDefault(); + + ocl::Image2D ImA; + ocl::Image2D ImB; + + std::string kernel_name("gemm_"); + if (gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 || gemm_type == GEMM_TYPE_FAST_IMAGE_B_IMAGE) + kernel_name += "32_1_"; + else + kernel_name += "32_2_"; + + if (TransA == CblasNoTrans) + kernel_name += "N"; + else + kernel_name += "T"; + + if (TransB == CblasNoTrans) + { + kernel_name += "N_"; + } else { + kernel_name += "T_"; + if (is_image_b || (K % use_buffer_indicator != 0)) + { + kernel_name += "SCALAR_"; + } else { + kernel_name += "BUFFER_"; + } + } + + if (alpha == 1) + kernel_name += "1_"; + else + kernel_name += "0_"; + + if (beta == 0) + kernel_name += "0"; + else + kernel_name += "1"; + + kernel_name += "_float"; + + ocl::Kernel oclk_gemm_float(kernel_name.c_str(), ocl::dnn::gemm_image_oclsrc); + if (oclk_gemm_float.empty()) + return false; + + while (C_start_y < M) + { + blockC_width = std::min(static_cast(N) - C_start_x, blocksize); + blockC_height = std::min(static_cast(M) - C_start_y, blocksize); + + int isFirstColBlock = 1; + for (int k = 0; k < K; k += blocksize) + { + blockA_width = std::min(widthA - A_start_x, blocksize); + blockA_height = 
std::min(heightA - A_start_y, blocksize); + blockB_width = std::min(widthB - B_start_x, blocksize); + blockB_height = std::min(heightB - B_start_y, blocksize); + int block_Ksize = std::min(static_cast(K) - k, blocksize); + + int padded_k = block_Ksize + ((block_Ksize & 7) ? (8 - (block_Ksize & 7)) : 0); + int imageA_w = (TransA == CblasNoTrans) ? padded_k : blockA_width; + int imageA_h = (TransA == CblasNoTrans) ? blockA_height : padded_k; + int imageB_w = (TransB == CblasNoTrans) ? blockB_width : padded_k; + int imageB_h = (TransB == CblasNoTrans) ? padded_k : blockB_height; + + int blockA_offset = offA + A_start_y * ldA + A_start_x; + int blockB_offset = offB + B_start_y * ldB + B_start_x; + int blockC_offset = offC + C_start_y * ldC + C_start_x; + if (TransB == CblasNoTrans) + { + bool padding_A = false; + bool padding_B = false; + + if (!is_image_a && !is_image_b) + { + if (M * K < N * K) + padding_B = true; + else + padding_A = true; + } + + if (!is_image_a) + { + ImA = ocl4dnnGEMMCopyBufferToImage(A, blockA_offset, + true, TransA != CblasNoTrans, + padding_A, imageA_h, imageA_w, + blockA_height, blockA_width, ldA); + } + if (!is_image_b) + { + ImB = ocl4dnnGEMMCopyBufferToImage(B, blockB_offset, + false, false, + padding_B, imageB_h, imageB_w, + blockB_height, blockB_width, ldB); + } + } else { + // We will use normal read_imagef to read image B when B has transpose. + // thus we don't need to pad image A at all. 
+ if (!is_image_a) + { + bool padding; + padding = !is_image_b; + ImA = ocl4dnnGEMMCopyBufferToImage(A, blockA_offset, + true, TransA != CblasNoTrans, + padding, imageA_h, imageA_w, + blockA_height, blockA_width, ldA); + } + + if (!is_image_b && (K % use_buffer_indicator != 0)) + { + ImB = ocl4dnnGEMMCopyBufferToImage(B, blockB_offset, + false, true, false, imageB_h, imageB_w, + blockB_height, blockB_width, ldB); + } + } + + size_t global[2]; + if (gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 || gemm_type == GEMM_TYPE_FAST_IMAGE_B_IMAGE) + { + global[0] = (size_t)( blockC_width + 7 ) & ~7; + } else { + global[0] = (size_t)( (blockC_width / 2 ) + 7 ) ^ ~7; + } + global[1] = (size_t)(blockC_height + 31) / 32; + + size_t local[2]; + local[0] = 8; + local[1] = 1; + + cl_uint arg_idx = 0; + if (is_image_a) + oclk_gemm_float.set(arg_idx++, ocl::KernelArg::PtrReadOnly(A)); + else + oclk_gemm_float.set(arg_idx++, ImA); + + if (TransB == CblasNoTrans || is_image_b || (K % use_buffer_indicator != 0)) + { + if (is_image_b) + oclk_gemm_float.set(arg_idx++, ocl::KernelArg::PtrReadOnly(B)); + else + oclk_gemm_float.set(arg_idx++, ImB); + } else { + oclk_gemm_float.set(arg_idx++, ocl::KernelArg::PtrReadOnly(B)); + oclk_gemm_float.set(arg_idx++, blockB_offset); + oclk_gemm_float.set(arg_idx++, ldB); + } + oclk_gemm_float.set(arg_idx++, ocl::KernelArg::PtrWriteOnly(C)); + oclk_gemm_float.set(arg_idx++, blockC_offset); + oclk_gemm_float.set(arg_idx++, blockC_height); + oclk_gemm_float.set(arg_idx++, blockC_width); + oclk_gemm_float.set(arg_idx++, ldC); + oclk_gemm_float.set(arg_idx++, alpha); + oclk_gemm_float.set(arg_idx++, beta); + oclk_gemm_float.set(arg_idx++, padded_k); + if (TransB != CblasNoTrans) + oclk_gemm_float.set(arg_idx++, block_Ksize); + oclk_gemm_float.set(arg_idx++, isFirstColBlock); + + if (!oclk_gemm_float.run(2, global, local, false)) + return false; + + if (TransA == CblasNoTrans) + A_start_x += blockA_width; + else + A_start_y += blockA_height; + + if (TransB == 
CblasNoTrans) + B_start_y += blockB_height; + else + B_start_x += blockB_width; + + isFirstColBlock = 0; + } + + C_start_x += blockC_width; + if (TransA == CblasNoTrans) + A_start_x = 0; + else + A_start_y = 0; + if (TransB == CblasNoTrans) + { + B_start_x += blockB_width; + B_start_y = 0; + } else { + B_start_y += blockB_height; + B_start_x = 0; + } + if (C_start_x >= N) + { + C_start_x = 0; + B_start_x = 0; + B_start_y = 0; + C_start_y += blockC_height; + if (TransA == CblasNoTrans) + A_start_y += blockA_height; + else + A_start_x += blockA_width; + } + } + + return true; +} + +template +bool ocl4dnnGEMMCommon(const CBLAS_TRANSPOSE TransB, + const int32_t M, const int32_t N, const int32_t K, + const UMat A, const UMat B, + const UMat B_image, UMat C, + const size_t max_image_size) +{ + gemm_type_t gemm_type = GEMM_TYPE_FAST_IMAGE_32_1; + + if (gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 || + gemm_type == GEMM_TYPE_FAST_IMAGE_32_2) + { + return ocl4dnnFastImageGEMM(CblasNoTrans, TransB, M, N, K, + (Dtype)1., A, 0, B, 0, (Dtype)0., C, + 0, false, false, gemm_type, max_image_size); + } + else if (gemm_type == GEMM_TYPE_FAST_IMAGE_B_IMAGE) + { + return ocl4dnnFastImageGEMM(CblasNoTrans, TransB, M, N, K, + (Dtype)1., A, 0, B_image, 0, (Dtype)0., C, + 0, false, true, + GEMM_TYPE_FAST_IMAGE_B_IMAGE, + max_image_size); + } + return false; +} + +template bool ocl4dnnGEMMCommon(const CBLAS_TRANSPOSE TransB, + const int32_t M, const int32_t N, const int32_t K, + const UMat A, const UMat B, + const UMat B_image, UMat C, + const size_t max_image_size); + +template +bool ocl4dnnGEMV(const CBLAS_TRANSPOSE TransA, + const int32_t M, const int32_t N, const Dtype alpha, + const UMat A, const int32_t offA, const UMat x, + const int32_t offx, const Dtype beta, UMat y, + const int32_t offy) +{ + return false; +} + +template<> +bool ocl4dnnGEMV(const CBLAS_TRANSPOSE TransA, + const int32_t M, const int32_t N, const float alpha, + const UMat A, const int32_t offA, const UMat x, + const 
int32_t offx, const float beta, UMat y, + const int32_t offy) +{ + ocl::Queue queue = ocl::Queue::getDefault(); + bool ret = false; + + if (TransA == CblasNoTrans) + { + ocl::Kernel k(CL_KERNEL_SELECT("matvec_mul4"), cv::ocl::dnn::matvec_mul_oclsrc); + if (k.empty()) + return false; + + uint row_size = M; + uint col_size = N; + size_t localsize[] = { 128 }; + size_t globalsize[] = { row_size / 4 * localsize[0] }; + + uint argId = 0; + k.set(argId++, ocl::KernelArg::PtrReadOnly(A)); + k.set(argId++, offA); + k.set(argId++, cl_uint(col_size)); + k.set(argId++, cl_uint(col_size%4)); + k.set(argId++, ocl::KernelArg::PtrReadOnly(x)); + k.set(argId++, offx); + k.set(argId++, alpha); + k.set(argId++, beta); + k.set(argId++, ocl::KernelArg::PtrWriteOnly(y)); + k.set(argId++, offy); + k.set(argId++, NULL, localsize[0] * sizeof(cl_float4)); + + ret = k.run(1, globalsize, localsize, false); + + if ((row_size % 4) != 0 && ret) + { + ocl::Kernel k_1(CL_KERNEL_SELECT("matvec_mul1"), cv::ocl::dnn::matvec_mul_oclsrc); + size_t localsize[] = { 128 }; + size_t globalsize[] = { row_size % 4 * localsize[0] }; + uint row_offset = row_size - (row_size % 4); + + uint argId = 0; + k_1.set(argId++, ocl::KernelArg::PtrReadOnly(A)); + k_1.set(argId++, offA); + k_1.set(argId++, cl_uint(col_size)); + k_1.set(argId++, cl_uint(row_offset)); + k_1.set(argId++, cl_uint(col_size%4)); + k_1.set(argId++, ocl::KernelArg::PtrReadOnly(x)); + k_1.set(argId++, offx); + k_1.set(argId++, alpha); + k_1.set(argId++, beta); + k_1.set(argId++, ocl::KernelArg::PtrWriteOnly(y)); + k_1.set(argId++, offy); + k_1.set(argId++, NULL, localsize[0] * sizeof(cl_float)); + + ret = k_1.run(1, globalsize, localsize, false); + } + } + return ret; +} + +template +bool ocl4dnnAXPY(const int32_t N, const Dtype alpha, + const UMat X, const int32_t offX, UMat Y, + const int32_t offY) +{ + ocl::Context ctx = ocl::Context::getDefault(); + + ocl::Kernel oclk_axpy(CL_KERNEL_SELECT("axpy"), cv::ocl::dnn::math_oclsrc); + if 
(oclk_axpy.empty()) + return false; + + size_t global[] = { 128 * 128 }; + size_t local[] = { 128 }; + + cl_uint argIdx = 0; + oclk_axpy.set(argIdx++, N); + oclk_axpy.set(argIdx++, alpha); + oclk_axpy.set(argIdx++, ocl::KernelArg::PtrReadOnly(X)); + oclk_axpy.set(argIdx++, offX); + oclk_axpy.set(argIdx++, ocl::KernelArg::PtrWriteOnly(Y)); + oclk_axpy.set(argIdx++, offY); + + return oclk_axpy.run(1, global, local, false); +} + +template bool ocl4dnnAXPY(const int32_t N, const float alpha, + const UMat X, const int32_t offX, + UMat Y, const int32_t offY); + +#endif // HAVE_OPENCL + +} // namespace ocl4dnn +} // namespace dnn +} // namespce cv diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp new file mode 100644 index 0000000000..13d5afb165 --- /dev/null +++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp @@ -0,0 +1,1568 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#include "../../precomp.hpp" + +#include + +#include +#include +#include +#include +#include +#include "common.hpp" +#include "ocl4dnn.hpp" +#include "opencl_kernels_dnn.hpp" +#include "math_functions.hpp" +#include "default_kernel_config.hpp" + +#if defined WIN32 || defined _WIN32 +#include +#include +#endif + +#ifdef HAVE_OPENCL +namespace cv { namespace dnn { namespace ocl4dnn { +static cv::Mutex kernelConfigMutex; +typedef std::map kernel_hash_t; +static kernel_hash_t kernelConfigMap; +static bool defaultConfigLoaded = false; + +template +OCL4DNNConvSpatial::OCL4DNNConvSpatial(OCL4DNNConvConfig config) +{ + bias_term_ = config.bias_term; + int dims = config.in_shape.size(); + int spatial_dims = 2; + + channels_ = config.in_shape[dims - spatial_dims - 1]; + num_output_ = config.out_shape[dims - spatial_dims - 1]; + group_ = config.group; + + prev_kernel_type_ = -1; + tuned_ = false; + + // assumption: spatial dimension is 2. + kernel_h_ = config.kernel.height; + kernel_w_ = config.kernel.width; + pad_h_ = config.pad.height; + pad_w_ = config.pad.width; + stride_h_ = config.stride.height; + stride_w_ = config.stride.width; + dilation_h_ = config.dilation.height; + dilation_w_ = config.dilation.width; + M_ = num_output_ / group_; + height_ = config.in_shape[dims - spatial_dims + 0]; + width_ = config.in_shape[dims - spatial_dims + 1]; + output_h_ = config.out_shape[dims - spatial_dims + 0]; + output_w_ = config.out_shape[dims - spatial_dims + 1]; + bottom_dim_ = channels_ * width_ * height_; + top_dim_ = num_output_ * output_w_ * output_h_; + + cache_path_ = utils::getConfigurationParameterString("OPENCV_OCL4DNN_CONFIG_PATH", ""); + + use_cache_path_ = false; + if (!cache_path_.empty()) + { +#if defined _WIN32 + struct _stat file_stat; + use_cache_path_ = _stat(cache_path_.c_str(), &file_stat) == 0 && + ((_S_IFDIR & file_stat.st_mode) != 0); +#else + struct stat file_stat; + use_cache_path_ = stat(cache_path_.c_str(), &file_stat) == 0 && + 
S_ISDIR(file_stat.st_mode); +#endif + if (!use_cache_path_) + { + static int warn_ = 0; + if (!warn_) + { + std::cerr + << "OpenCV(ocl4dnn): Kernel configuration cache directory doesn't exist: " << cache_path_ << std::endl + << std::endl; + warn_ = true; + } + } + } + + force_auto_tuning_ = + (use_cache_path_ && !utils::getConfigurationParameterBool("OPENCV_OCL4DNN_DISABLE_AUTO_TUNING", false)) + || utils::getConfigurationParameterBool("OPENCV_OCL4DNN_FORCE_AUTO_TUNING", false); +} + +template +OCL4DNNConvSpatial::~OCL4DNNConvSpatial() +{ + if (!swizzled_weights_umat.empty()) { + swizzled_weights_umat.release(); + } +} + +template +void OCL4DNNConvSpatial::collectCommonInformation() +{ + addDef("Dtype", "float"); + addDef("Dtype2", "float2"); + addDef("Dtype4", "float4"); + addDef("Dtype8", "float8"); + addDef("Dtype16", "float16"); + addDef("as_Dtype", "as_float"); + addDef("as_Dtype2", "as_float2"); + addDef("as_Dtype4", "as_float4"); + addDef("as_Dtype8", "as_float8"); + addDef("Dtype_ID", (int)CV_32F); + addDef("Dtype_SIZE", (int)sizeof(Dtype)); +} + +typedef enum { + KERNEL_TYPE_INTEL_IDLF = 2, + KERNEL_TYPE_BASIC = 4, + KERNEL_TYPE_GEMM_LIKE = 5 +} ocl4dnnConvSpatialKernelType_t; + +template +void OCL4DNNConvSpatial::setupKernelDetails(int32_t kernelType, + int32_t blockM, + int32_t blockK, + int32_t blockN) +{ + std::string kernelUKey; + int32_t simd_size; + + if (kernelType == KERNEL_TYPE_INTEL_IDLF) { + simd_size = blockN; + kernelUKey = generateSpecificKey(KERNEL_TYPE_INTEL_IDLF, blockM, blockK, 1); + + // kernel name + kernel_name_ = "IDLF_"; + kernel_name_ += kernelUKey; + if (simd_size == 16) + kernel_name_ += "_SIMD16"; + else + kernel_name_ += "_SIMD8"; + + // options + options_ << " -cl-fast-relaxed-math -D KERNEL_IDLF -D convolve_simd=" << kernel_name_; + if (clOptionSupport("-cl-no-subgroup-ifp")) + options_ << " -cl-no-subgroup-ifp "; + + // defs + int32_t output_width = output_w_; + int32_t output_height = output_h_; + int32_t output_block_width 
= blockM; + int32_t output_block_height = blockK; + const int32_t last_block_width = (output_width % output_block_width == 0) ? + output_block_width : output_width % output_block_width; + const int32_t last_block_height = (output_height % output_block_height == 0) ? + output_block_height : output_height % output_block_height; + int tile_x = alignSize((output_block_width - 1) * stride_w_ + kernel_w_ * dilation_w_, 4); + int tile_y = (output_block_height -1) * stride_h_ + kernel_h_ * dilation_h_; + int tile_y_stride = (4 * simd_size) / tile_x; + int invec_size = divUp(tile_y, tile_y_stride); + + addDef("SIMD_SIZE", simd_size); + addDef("filter_qualifier", "__global"); + addDef("OUT_BLOCK_WIDTH", output_block_width); + addDef("OUT_BLOCK_HEIGHT", output_block_height); + addDef("LAST_BLOCK_WIDTH", last_block_width); + addDef("LAST_BLOCK_HEIGHT", last_block_height); + addDef("INPUT_DEPTH", channels_ / group_); + addDef("TOTAL_INPUT_DEPTH_SIZE", channels_); + addDef("TOTAL_OUTPUT_DEPTH", num_output_); + addDef("INPUT_START_X", 0); + addDef("INPUT_START_Y", 0); + addDef("INPUT_START_Z", 0); + addDef("NUM_FILTERS", M_); + addDef("OUT_BUFF_OFFSET", 0); + addDef("TILE_X", tile_x); + addDef("TILE_Y", tile_y); + addDef("TILE_Y_STRIDE", tile_y_stride); + addDef("INVEC_SIZE", invec_size); + addDef("ALIGNED_NUM_FILTERS", (int)alignSize(M_, simd_size)); + addDef("OUT_BLOCK_SIZE", (output_block_width*output_block_height)); + addDef("APPLY_BIAS", bias_term_); + + src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc; + } + else if (kernelType == KERNEL_TYPE_BASIC) + { + addDef("KERNEL_BASIC"); + + kernelUKey = generateSpecificKey(KERNEL_TYPE_BASIC, blockM, blockK, blockN); + kernel_name_ = "BASIC_"; + kernel_name_ += kernelUKey; + + // opts + options_ << " -cl-fast-relaxed-math -D ConvolveBasic=" << kernel_name_; + if (clOptionSupport("-cl-no-subgroup-ifp")) + options_ << " -cl-no-subgroup-ifp "; + + // defs + addDef("CHANNELS", channels_ / group_); + addDef("APPLY_BIAS", bias_term_); + 
addDef("OUTPUT_Z", M_); + addDef("ZPAR", 1); + + src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc; + } + else if (kernelType == KERNEL_TYPE_GEMM_LIKE) + { + simd_size = blockK; + kernelUKey = generateSpecificKey(KERNEL_TYPE_GEMM_LIKE, blockM, blockK, blockN); + + kernel_name_ = "U_GEMM_LIKE_CONV_"; + kernel_name_ += kernelUKey.c_str(); + kernel_name_ += (blockK == 8) ? "_SIMD8" : "_SIMD16"; + std::stringstream kernelDef; + kernelDef << "GEMM_LIKE_CONV_" << blockN << "_" << blockM; + if (blockK == 16) + kernelDef << "_SIMD16"; + + // Build list of options and defines + options_ << " -cl-fast-relaxed-math " << " -D " << kernelDef.str() + << " -D Conv_Interleaved=" << kernel_name_.c_str(); + options_ << " -cl-mad-enable"; + if (clOptionSupport("-cl-no-subgroup-ifp")) + options_ << " -cl-no-subgroup-ifp "; + + addDef("INPUT_DEPTH", channels_); + addDef("WIDTH1", M_); + addDef("OUT_PADDING_LEFT", 0); + addDef("OUT_PADDING_HEIGHT", 0); + addDef("OUT_DEPTH", M_); + addDef("NUM_BATCHES", num_); + addDef("DY", blockM); + addDef("DX", blockN); + addDef("KERNEL_WIDTH_DIV2", kernel_w_ / 2); + addDef("KERNEL_SLICE_DIV2", (kernel_w_ * kernel_h_) / 2); + addDef("TILE_N_LAST", M_ % 32); + addDef("TILE_N_LAST_DIV8", (M_ % 32) / 8); + addDef("APPLY_BIAS", bias_term_); + src_ = ocl::dnn::conv_layer_spatial_oclsrc; + } +} + +template +void OCL4DNNConvSpatial::setupKernel() +{ + collectCommonInformation(); + + addDef("KERNEL_WIDTH", kernel_w_); + addDef("KERNEL_HEIGHT" , kernel_h_); + addDef("STRIDE_X", stride_w_); + addDef("STRIDE_Y", stride_h_); + addDef("DILATION_X", dilation_w_); + addDef("DILATION_Y", dilation_h_); + if (kernelType_ != KERNEL_TYPE_BASIC) + { + addDef("INPUT_PAD_W", pad_w_); + addDef("INPUT_PAD_H", pad_h_); + } + + setupKernelDetails(kernelType_, blockM_, blockK_, blockN_); +} + +template +bool OCL4DNNConvSpatial::Forward(const UMat& bottom, + const UMat& weight, + const UMat& bias, + UMat& top, + int32_t numImages) +{ + num_ = numImages; + + prepareKernel(bottom, 
top, weight, bias, numImages); + return convolve(bottom, top, weight, bias, numImages, bestKernelConfig, cv::ocl::Queue::getDefault()); +} + +template +void OCL4DNNConvSpatial::calculateBenchmark(const UMat &bottom, UMat &verifyTop, + const UMat &weight, const UMat &bias, + int32_t numImages) +{ + options_.str(""); options_.clear(); // clear contents and state flags + createBasicKernel(1, 1, 1); + kernel_index_ = kernelQueue.size() - 1; + convolve(bottom, verifyTop, weight, bias, numImages, kernelQueue[kernel_index_], cv::ocl::Queue::getDefault()); + CV_Assert(phash.find(kernelQueue[kernel_index_]->kernelName) != phash.end()); + //unloadProgram(kernelQueue[kernel_index_]->kernelName); + kernelQueue.pop_back(); + return; +} + +#define dbg +#ifdef dbg +#define dbgPrint(x) (x) +#else +#define dbgPrint(x) +#endif + +// For large enough input size, we do not need to tune kernels for different +// size. The reason is with large input size, there will be enough work items +// to feed al the EUs. +// FIXME for the gemm like convolution, switch back to eaxct image size. + +#define TUNING_SIZE(x) ((x) > 256 ? 256 : (alignSize(x, 16))) + +template +void OCL4DNNConvSpatial::generateKey() +{ + std::stringstream keyBuilder; + // FIXME: to support fuse? 
+ keyBuilder << "k" << kernel_w_ << "x" << kernel_h_ << "_" + << "cn" << channels_ << "_" + << "g" << group_ << "_" + << "s" << stride_w_ << "x" << stride_h_ << "_" + << "d" << dilation_w_ << "x" << dilation_h_ << "_" + << "b" << bias_term_ << "_" + << "in" << TUNING_SIZE(width_) << "x" << TUNING_SIZE(height_) << "_" + << "p" << pad_w_ << "x" << pad_h_ << "_" + << "num" << num_ << "_" + << "M" << M_; + + key_ = ocl::Device::getDefault().vendorName() + "_EU" + cv::format("%d", ocl::Device::getDefault().maxComputeUnits()) + "_" + keyBuilder.str(); + key_sanitized_ = key_; + for (size_t i = 0; i < key_sanitized_.size(); i++) + { + char c = key_sanitized_[i]; + if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_')) + { + key_sanitized_[i] = '_'; + } + } + // TODO add hash? + // key_sanitized_ = key_sanitized_ + cv::format("_%08llx", crc64((uchar*)key_.c_str(), key_.size())); + short_key_ = keyBuilder.str(); +} + +template +std::string OCL4DNNConvSpatial::generateSpecificKey(int32_t type, int32_t blockWidth, + int32_t blockHeight, int32_t blockDepth) +{ + std::stringstream keyBuilder; + keyBuilder << short_key_ + << "_" << type + << "_" << blockWidth + << "_" << blockHeight + << "_" << blockDepth; + return keyBuilder.str(); +} + +template +void interleaveMatrix(Dtype* mem_dst, const Dtype *mem, + int r, int c, int interleavedRows, int nonInterleavedRows, + int blockWidth, int rowAlignment ) +{ + CHECK_EQ(interleavedRows % 2, 0) << + "interleaveMatrix only supports even values for interleavedRows."; + + size_t memSize = r * c * sizeof(float); + size_t dstSize = memSize * + (interleavedRows + nonInterleavedRows * 2) / + (interleavedRows + nonInterleavedRows); + memset(mem_dst, 0, dstSize); // NOLINT + + const int xStride = blockWidth; + const int yStride = c * 2; + const Dtype *pSrc = mem; + Dtype* pDst = mem_dst; + for (int y = 0; y < r;) { + for (int rows = 0; rows < interleavedRows; rows += 2) { + if ( y >= r ) break; + if ((c % 
xStride) == 0) { + for (int x = 0; x < c / xStride; x++) { + memcpy(pDst + x * xStride * 2, // NOLINT + pSrc + x * xStride, xStride * sizeof(Dtype)); + memcpy(pDst + x * xStride * 2 + xStride, // NOLINT + pSrc + x * xStride + c, xStride * sizeof(Dtype)); + } + } else { + const int count = c / xStride; + int x = 0; + for (; x < count - 1; x++) { + memcpy(pDst + x * xStride * 2, // NOLINT + pSrc + x * xStride, xStride * sizeof(Dtype)); + memcpy(pDst + x * xStride * 2 + xStride, // NOLINT + pSrc + x * xStride + c, xStride * sizeof(Dtype)); + } + memcpy(pDst + x * xStride * 2, // NOLINT + pSrc + x * xStride, xStride * sizeof(Dtype)); + } + pSrc += yStride; + pDst += yStride; + y += 2; + } + + for (int rows = 0; rows < nonInterleavedRows; rows++) { + if (y >= r) break; + const int stride = rowAlignment; + int remaining = c; + for (int x = 0; x < c; x += stride) { + if (remaining >= stride) { + memcpy(pDst + x * 2, pSrc + x, stride * sizeof(Dtype)); // NOLINT + remaining -=stride; + } else { + memcpy(pDst + x * 2, pSrc + x, remaining * sizeof(Dtype)); // NOLINT + } + } + pSrc += yStride / 2; + pDst += yStride; + y++; + } + } +} + +template +bool OCL4DNNConvSpatial::swizzleWeight(const UMat &weight, + int32_t swizzled_factor, + bool interleave) +{ + // Simply skip the weight swizzle if we already got a swizzled_weights_ + // in test phase and not in auto tuning + // This requires we always call convolve again with the winner configuration + // during the auto tuning stage. 
+ if (tuned_ && !swizzled_weights_umat.empty()) + return true; + + if (swizzled_weights_umat.empty()) + swizzled_weights_umat.create(1, (int)alignSize(num_output_, 16) * channels_ * + kernel_h_ * (int)alignSize(kernel_w_, 2), CV_32FC1); + + ocl::Queue queue = ocl::Queue::getDefault(); + if (!interleave) { + cl_uint argIdx = 0; + int32_t channels = channels_ / group_; + + ocl::Kernel oclk_copy_weight(CL_KERNEL_SELECT("copyWeightsSwizzled"), + cv::ocl::dnn::conv_spatial_helper_oclsrc); + if (oclk_copy_weight.empty()) + return false; + + oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); + oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat)); + oclk_copy_weight.set(argIdx++, kernel_w_); + oclk_copy_weight.set(argIdx++, kernel_h_); + oclk_copy_weight.set(argIdx++, channels); + oclk_copy_weight.set(argIdx++, num_output_); + oclk_copy_weight.set(argIdx++, swizzled_factor); + + size_t global_work_size_copy[3] = { + (size_t) (alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_), 1, 1 }; + + if (!oclk_copy_weight.run(3, global_work_size_copy, NULL, false)) + { + std::cout << "Swizzle kernel run failed." << std::endl; + return false; + } + } else { + // assumption: kernel dimesion is 2 + Mat weightMat = weight.getMat(ACCESS_READ); + Dtype* cpu_weight = (Dtype *)weightMat.ptr(); + Mat swizzledWeightMat = swizzled_weights_umat.getMat(ACCESS_WRITE); + Dtype* cpu_swizzled_weight = (Dtype *)swizzledWeightMat.ptr(); + + int interleavedRows = (kernel_w_ / 2) * 2; + int nonInterleavedRows = kernel_w_ % 2; + int blockWidth = swizzled_factor; // should equal to simd size. 
+ int rowAlignment = 32; + size_t interleaved_filter_size = M_ * kernel_w_ * kernel_h_ * channels_ * sizeof(Dtype); + Dtype * tmpSwizzledWeight = reinterpret_cast(malloc(interleaved_filter_size)); + CHECK_EQ(tmpSwizzledWeight != NULL, true) << "Failed to allocate temporary swizzled weight"; + for (int od = 0; od < M_; od++) + for (int id = 0; id < channels_; id++) + for (int r = 0; r < kernel_h_; r++) + for (int c = 0; c < kernel_w_; c++) + tmpSwizzledWeight[((id * kernel_h_ + r)* kernel_w_ + c) * M_ + od] = + cpu_weight[((od * channels_ + id) * kernel_h_ + r)*kernel_w_+c]; + interleaveMatrix(cpu_swizzled_weight, + tmpSwizzledWeight, + kernel_w_ * kernel_h_ * channels_, M_, + interleavedRows, + nonInterleavedRows, + blockWidth, + rowAlignment); + free(tmpSwizzledWeight); + } + return true; +} + +template<> +bool OCL4DNNConvSpatial::createBasicKernel(int32_t blockWidth, + int32_t blockHeight, int32_t blockDepth) +{ + kernelType_ = KERNEL_TYPE_BASIC; + blockM_ = blockWidth; + blockK_ = blockHeight; + blockN_ = blockDepth; + setupKernel(); + + ocl::Program program = compileKernel(); + if (program.ptr()) + { + int32_t workItemOutput[3] = { 1, 1, 1 }; + size_t globalSize[3] = { (size_t)output_w_, (size_t)output_h_, (size_t)M_ }; + kernelQueue.push_back(makePtr(kernel_name_, &globalSize[0], (const size_t*)NULL, &workItemOutput[0], + false, KERNEL_TYPE_BASIC)); + return true; + } + else + return false; +} + +template<> +void OCL4DNNConvSpatial::CreateSubBuffer(const UMat& buffer, UMat& sub_buffer, + int32_t offset, int32_t size, bool write_only) +{ + cl_mem sub_mem; + cl_buffer_region region; + cl_int err; + + region.origin = offset * sizeof(float); + region.size = size * sizeof(float); + sub_mem = clCreateSubBuffer((cl_mem)buffer.handle(ACCESS_READ), + write_only ? CL_MEM_WRITE_ONLY : CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); + if (err) + { + std::cout << "Failed to create sub buffer." 
<< std::endl; + return; + } + + int step = sizeof(float), rows = size, cols = 1; + ocl::convertFromBuffer(sub_mem, step, rows, cols, CV_32FC1, sub_buffer); + + //decrease ocl mem refcount + clReleaseMemObject(sub_mem); +} + +template<> +bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, + const UMat &weight, const UMat &bias, + int32_t numImages, kernelConfig* config, + const cv::ocl::Queue& queue) +{ + ocl::Program program; + phash_t::iterator it = phash.find(config->kernelName); + if (it != phash.end()) + program = it->second; + else + return false; + + int32_t bias_offset; + + if (config->kernelType == KERNEL_TYPE_INTEL_IDLF) { + if (!swizzleWeight(weight, config->workItem_output[2], false)) + return false; + size_t total_bottom_size = bottom_dim_ * numImages; + size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_; + size_t total_bias_size = M_ * group_; + size_t total_top_size = top_dim_ * numImages; + for (int32_t g = 0; g < group_; ++g) { + bias_offset = M_ * g; + int32_t image_offset = width_ * height_ * (channels_ / group_) * g; + int32_t output_image_offset = output_w_ * output_h_ * M_ * g; + int32_t kernel_offset = kernel_h_ * kernel_w_ * (channels_ / group_) * M_ * g; + + ocl::Kernel kernel(config->kernelName.c_str(), program); + if (kernel.empty()) + return false; + + cl_uint argIdx = 0; + + UMat img_buffer; + if (image_offset) + { + CreateSubBuffer(bottom, img_buffer, image_offset, + total_bottom_size - image_offset, false); + if (img_buffer.empty()) + return false; + + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(img_buffer)); + } + else + { + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); + } + + UMat kernel_buffer; + if (kernel_offset) + { + CreateSubBuffer(swizzled_weights_umat, kernel_buffer, kernel_offset, + total_kernel_size - kernel_offset, false); + if (kernel_buffer.empty()) + return false; + + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(kernel_buffer)); + } + else + { + kernel.set(argIdx++, 
ocl::KernelArg::PtrReadOnly(swizzled_weights_umat)); + } + + UMat bias_buffer; + if (bias_term_) + { + if (bias_offset) + { + CreateSubBuffer(bias, bias_buffer, bias_offset, + total_bias_size - bias_offset, false); + if (bias_buffer.empty()) + return false; + + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias_buffer)); + } + else + { + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias)); + } + } + + UMat out_buffer; + if (output_image_offset) + { + CreateSubBuffer(top, out_buffer, output_image_offset, + total_top_size - output_image_offset, true); + if (out_buffer.empty()) + return false; + + kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer)); + } + else + { + kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); + } + + kernel.set(argIdx++, (uint16_t)width_); + kernel.set(argIdx++, (uint16_t)height_); + kernel.set(argIdx++, (uint16_t)output_w_); + kernel.set(argIdx++, (uint16_t)output_h_); + if (!kernel.run(3, config->global_work_size, config->local_work_size, false)) + { + std::cout << "IDLF kernel run failed." 
<< std::endl; + return false; + } + } + } else if (config->kernelType == KERNEL_TYPE_GEMM_LIKE) { + if (!swizzleWeight(weight, config->workItem_output[1], true)) + return false; + size_t total_bottom_size = bottom_dim_ * numImages; + size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_; + size_t total_bias_size = M_ * group_; + size_t total_top_size = top_dim_ * numImages; + for (int32_t g = 0; g < group_; ++g) { + bias_offset = M_ * g; + int32_t image_offset = width_ * height_ * (channels_ / group_) * g; + int32_t output_image_offset = output_w_ * output_h_ * M_ * g; + int32_t kernel_offset = kernel_h_ * kernel_w_ * (channels_ / group_) * M_ * g; + + ocl::Kernel kernel(config->kernelName.c_str(), program); + if (kernel.empty()) + return false; + + cl_uint argIdx = 0; + + UMat img_buffer; + if (image_offset) + { + CreateSubBuffer(bottom, img_buffer, image_offset, + total_bottom_size - image_offset, false); + if (img_buffer.empty()) + return false; + + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(img_buffer)); + } + else + { + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); + } + + UMat kernel_buffer; + if (kernel_offset) + { + CreateSubBuffer(swizzled_weights_umat, kernel_buffer, kernel_offset, + total_kernel_size - kernel_offset, false); + if (kernel_buffer.empty()) + return false; + + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(kernel_buffer)); + } + else + { + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat)); + } + + UMat bias_buffer; + if (bias_term_) + { + if (bias_offset) + { + CreateSubBuffer(bias, bias_buffer, bias_offset, + total_bias_size - bias_offset, false); + if (bias_buffer.empty()) + return false; + + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias_buffer)); + } + else + { + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias)); + } + } + + UMat out_buffer; + if (output_image_offset) + { + CreateSubBuffer(top, out_buffer, output_image_offset, + total_top_size - output_image_offset, 
true); + if (out_buffer.empty()) + return false; + + kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer)); + } + else + { + kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); + } + + kernel.set(argIdx++, (uint16_t)width_); + kernel.set(argIdx++, (uint16_t)height_); + kernel.set(argIdx++, (uint16_t)output_w_); + kernel.set(argIdx++, (uint16_t)output_h_); + + int out_pitch_y = output_w_ * output_h_; + int out_pitch_z = out_pitch_y * M_; + int aligned_input_size = height_ * width_ * channels_ / group_; + int slice_pitch = width_ * height_; + kernel.set(argIdx++, (uint32_t)out_pitch_y); + kernel.set(argIdx++, (uint32_t)out_pitch_z); + kernel.set(argIdx++, (uint32_t)aligned_input_size); + kernel.set(argIdx++, (uint32_t)slice_pitch); + + int blockM = config->workItem_output[0]; + int blockK = config->workItem_output[1]; + int blockN = config->workItem_output[2]; + int alignedFilterWidth = alignSize(M_, blockN); + int alignedExpandHeight = alignSize(output_w_ * output_h_, blockM); + int globalWorkSizeDX = blockN; + int globalWorkSizeDY = blockM; + size_t sgemm_m = alignedExpandHeight; + size_t sgemm_n = alignedFilterWidth; + size_t gx = divUp(sgemm_n, globalWorkSizeDX); + size_t gy = divUp(sgemm_m, globalWorkSizeDY); + gy = alignSize(gy, blockK); + size_t global_size[3] = { gx, gy, config->global_work_size[2] }; + + if (!kernel.run(3, global_size, config->local_work_size, false)) + { + std::cout << "GEMM like kernel run failed." 
<< std::endl; + return false; + } + } + } else { + for (int32_t n = 0; n < numImages; ++n) { + for (int32_t g = 0; g < group_; ++g) { + bias_offset = M_ * g; + int32_t image_offset = n * bottom_dim_ + + width_ * height_ * (channels_ / group_) * g; + int32_t output_image_offset = n * top_dim_ + + output_w_ * output_h_ * M_ * g; + + cl_uint argIdx = 0; + int32_t kernel_offset = kernel_h_ * kernel_w_ * (channels_ / group_) * M_ * g; + + ocl::Kernel kernel(config->kernelName.c_str(), program); + if (kernel.empty()) + return false; + + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); + kernel.set(argIdx++, image_offset); + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); + kernel.set(argIdx++, kernel_offset); + if (bias_term_) + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias)); + else + kernel.set(argIdx++, (void *)NULL); + kernel.set(argIdx++, bias_offset); + kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); + kernel.set(argIdx++, output_image_offset); + kernel.set(argIdx++, (uint16_t)width_); + kernel.set(argIdx++, (uint16_t)height_); + kernel.set(argIdx++, (uint16_t)output_w_); + kernel.set(argIdx++, (uint16_t)output_h_); + kernel.set(argIdx++, (uint16_t)pad_w_); + kernel.set(argIdx++, (uint16_t)pad_h_); + if (!kernel.run(3, config->global_work_size, + (config->use_null_local) ? NULL : config->local_work_size, + false)) + { + std::cout << "Basic kernel run failed." << std::endl; + return false; + } + } + } + } + + return true; +} + +template<> +float OCL4DNNConvSpatial::timedConvolve(const UMat &bottom, UMat &top, + const UMat &weight, const UMat &bias, + int32_t numImages, kernelConfig* config) +{ + cv::ocl::Queue profilingQueue; + try + { + profilingQueue = cv::ocl::Queue::getDefault().getProfilingQueue(); + } + catch (const cv::Exception&) + { + static int warn_ = 0; + if (!warn_) + { + std::cout << "OpenCV(ocl4dnn): Can't create OpenCL profiling queue for auto-tuning." 
<< std::endl; + warn_ = true; + } + return 1e6; + } + + // warm up. + bool saved_tuned = tuned_; + tuned_ = false; + convolve(bottom, top, weight, bias, numImages, config, profilingQueue); + + cv::ocl::Timer timer(profilingQueue); + timer.start(); + bool res = true;; + dbgPrint(std::cout << "Benchmarking kernel: " << config->kernelName << std::endl); + tuned_ = true; + int loop_cnt = 4; + for (int i = 0; i < loop_cnt; i++) { + res = convolve(bottom, top, weight, bias, numImages, config, profilingQueue); + if (!res) + break; + } + tuned_ = saved_tuned; + timer.stop(); + if (!res) { + config->tested = true; + config->verified = false; + return 1e5; + } + + float elapsedTime = timer.milliSeconds() / loop_cnt; + #ifdef dbg + double out_w = output_w_; + double out_h = output_h_; + double out_z = M_; + double k_w = kernel_w_; + double k_h = kernel_h_; + double k_z = channels_; + double totalFlops = ((k_w*k_h*k_z -1)*2)*(out_w*out_h*out_z)*num_; + std::cout << "\tEstimated Gflops:" << ((totalFlops/1000)/1000)/1000 + << std::endl; + std::cout << "\tEstimated GFLOPS/S: " << (((totalFlops/1000)/1000)/1000)*(1000.0/elapsedTime) + << std::endl; + #if 0 + std::cout << "Estimated utilization: " << + ((((totalFlops/1000)/1000)/1000)*(1000.0/elapsedTime))/880.0 + << std::endl; + #endif + #endif + return elapsedTime; +} + +template<> +bool OCL4DNNConvSpatial::verifyResult(const UMat &bottom, + UMat &top, + const UMat &weight, + const UMat &bias, + int32_t numImages, + kernelConfig* config, + UMat &verifyTop) +{ + + uint32_t verificationFail = 0; + + if (config->verified) + return true; + else if (config->tested) + return false; + + int32_t sz[4] = {numImages, num_output_, output_h_, output_w_}; + top.zeros(4, sz, CV_32FC1); + bool saved_tuned = tuned_; + tuned_ = false; + convolve(bottom, top, weight, bias, numImages, config, cv::ocl::Queue::getDefault()); + tuned_ = saved_tuned; + + float *data = (float *)top.getMat(ACCESS_READ).ptr(); + float *verify_data = (float 
*)verifyTop.getMat(ACCESS_READ).ptr(); + + for (int32_t n = 0; n < num_; ++n) { + for (int32_t g = 0; g < group_; ++g) { + int32_t output_image_offset = n * top_dim_ + output_w_ * output_h_ * M_ * g; + for (int out_ch = 0; out_ch < M_ && !verificationFail; out_ch++) + for (int h = 0; h < output_h_ && !verificationFail; h++) + for (int w = 0; w < output_w_; w++) { + size_t offset = output_image_offset + out_ch * output_w_ * output_h_ + h * output_w_ + w; + if (fabs(data[offset] - verify_data[offset]) > 0.1 * fabs(verify_data[offset]) && + !(fabs(verify_data[offset]) < 1.e-3 && + fabs(data[offset] - verify_data[offset]) < 1.e-4)) + { + dbgPrint(printf("test verification failed @ image %d group %d" + "out_ch %d h %d w %d got %G expected %G\n", + n, g, out_ch, h, w, data[offset], verify_data[offset])); + verificationFail = 1; + goto out; + } + } + } + } +out: + if (verificationFail == 1) + return false; + else + return true; +} + +template +void OCL4DNNConvSpatial::unloadProgram(const std::string& kernelName) +{ + ocl::Program program; + phash_t::iterator it = phash.find(kernelName); + if (it != phash.end()) + { + program = it->second; + it->second = ocl::Program(); + } + else + return; + + ocl::Context ctx = ocl::Context::getDefault(); + ctx.unloadProg(program); +} + +template +ocl::Program OCL4DNNConvSpatial::compileKernel() +{ + phash_t::iterator it = phash.find(kernel_name_); + if (it != phash.end()) + { + return it->second; + } + + String errmsg; + ocl::Context ctx = ocl::Context::getDefault(); + std::string options = options_.str(); + CV_Assert(options.size() != 0); + ocl::Program program = ctx.getProg(src_, options, errmsg); + + phash.insert(std::pair(kernel_name_, program)); + if (!program.ptr()) + { + std::cout << "Failed to compile kernel: " << kernel_name_ + << ", buildflags: " << options + << ", errmsg: " << errmsg << std::endl; + } + return program; +} + +template<> +bool OCL4DNNConvSpatial::createGEMMLikeConvKernel(int32_t blockM, + int32_t blockK, + 
+                                                             int32_t blockN)
+{
+    int32_t simd_size = blockK;
+
+    int workItemOutput[3] = { blockM, blockK, blockN };
+    size_t gx = (size_t)divUp(M_, blockN);
+    size_t gy = (size_t)divUp(output_w_ * output_h_, blockM);
+    gy = alignSize(gy, simd_size);
+    size_t gz = num_;
+    size_t global_size[3] = { gx, gy, gz };
+    size_t local_size[3] = { 1, static_cast<size_t>(simd_size), 1 };
+
+    kernelType_ = KERNEL_TYPE_GEMM_LIKE;
+    blockM_ = blockM;
+    blockK_ = blockK;
+    blockN_ = blockN;
+    setupKernel();
+
+    ocl::Program program = compileKernel();
+    if (program.ptr())
+    {
+        size_t workgroupSize_used;
+        ocl::Kernel kernel(kernel_name_.c_str(), program);
+        if (kernel.empty())
+            return false;
+
+        // The GEMM-like kernel source is generated for a fixed SIMD width;
+        // reject the build if the compiler selected a different one.
+        workgroupSize_used = kernel.preferedWorkGroupSizeMultiple();
+        if (workgroupSize_used != simd_size)
+        {
+            std::cerr << "OpenCV(ocl4dnn): The OpenCL compiler chose a simd size (" << workgroupSize_used << ") that " << std::endl;
+            std::cerr << "                 does not equal the size (" << simd_size << ") kernel source required." << std::endl;
+            std::cerr << "                 Skip this kernel " << kernel_name_ << std::endl;
+            unloadProgram(kernel_name_);
+            return false;
+        }
+        else
+        {
+            kernelQueue.push_back(makePtr<kernelConfig>(kernel_name_, &global_size[0], &local_size[0], &workItemOutput[0],
+                                                        true, KERNEL_TYPE_GEMM_LIKE));
+            return true;
+        }
+    }
+    else
+        return false;
+}
+
+// Build and validate an IDLF (Intel subgroup) convolution kernel for the
+// given output block shape and SIMD width.
+template<>
+bool OCL4DNNConvSpatial<float>::setupIDLF(int32_t blockWidth,
+                                          int32_t blockHeight,
+                                          int32_t simd_size)
+{
+    int32_t workItemOutput[3] = { blockWidth, blockHeight, simd_size };
+    const int32_t num_output_maps = M_;
+    int32_t output_width = output_w_;
+    int32_t output_height = output_h_;
+    int32_t output_block_width = blockWidth;
+    int32_t output_block_height = blockHeight;
+    int32_t num_batches = num_;
+
+    size_t global_size[3] = {
+        (size_t)divUp(output_width, output_block_width),
+        (size_t)divUp(output_height, output_block_height),
+        (size_t)num_batches * alignSize(num_output_maps, simd_size) };
+    size_t local_size[3] = { 1, 1, static_cast<size_t>(simd_size) };
+
+    kernelType_ = KERNEL_TYPE_INTEL_IDLF;
+    blockM_ = blockWidth;
+    blockK_ = blockHeight;
+    blockN_ = simd_size;
+
+    setupKernel();
+
+    ocl::Program program = compileKernel();
+    if (program.ptr())
+    {
+        size_t workgroupSize_used;
+        ocl::Kernel kernel(kernel_name_.c_str(), program);
+        if (kernel.empty())
+            return false;
+
+        workgroupSize_used = kernel.preferedWorkGroupSizeMultiple();
+        if (workgroupSize_used != simd_size)
+        {
+            std::cerr << "OpenCV(ocl4dnn): The OpenCL compiler chose a simd size (" << workgroupSize_used << ") that " << std::endl;
+            std::cerr << "                 does not equal the size (" << simd_size << ") kernel source required." << std::endl;
+            std::cerr << "                 Skip this kernel " << kernel_name_ << std::endl;
+            unloadProgram(kernel_name_);
+            return false;
+        }
+        else
+        {
+            kernelQueue.push_back(makePtr<kernelConfig>(kernel_name_, &global_size[0], &local_size[0], &workItemOutput[0],
+                                                        true, KERNEL_TYPE_INTEL_IDLF));
+            return true;
+        }
+    }
+    else
+        return false;
+}
+
+// Dispatch kernel creation by tuner kernel type (basic / IDLF / GEMM-like).
+template<>
+bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
+                                                        int32_t blockWidth,
+                                                        int32_t blockHeight,
+                                                        int32_t blockDepth)
+{
+    kernelType_ = kernelType;
+    options_.str(""); options_.clear(); // clear contents and state flags
+    src_ = ocl::ProgramSource();
+
+    if (kernelType == KERNEL_TYPE_INTEL_IDLF)
+        return setupIDLF(blockWidth, blockHeight, blockDepth);
+    else if (kernelType == KERNEL_TYPE_BASIC)
+        return createBasicKernel(blockWidth, blockHeight, blockDepth);
+    else if (kernelType == KERNEL_TYPE_GEMM_LIKE)
+        return createGEMMLikeConvKernel(blockWidth, blockHeight, blockDepth);
+    else
+        CV_Assert(0 && "Internal error");
+    return false;
+}
+
+// Enumerate candidate kernel configurations (block shapes / SIMD widths)
+// for the auto-tuner, pruned by the current convolution geometry.
+template<>
+void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems)
+{
+    if (ocl::Device::getDefault().intelSubgroupsSupport()) {
+        /* IDLF kernels are using Intel specific extension which make
+           them intel only. */
+        // Generates static key_
+        int max_compute_units = ocl::Device::getDefault().maxComputeUnits();
+        int kernelCnt = 0;
+        if (group_ == 1 && ((M_ % 8 == 0) && (M_ % 32 != 24))) {
+            tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, 1, 8, 32));
+            tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, 2, 8, 32));
+
+            if (kernel_w_ < 4 && M_ % 32 == 0)
+                tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, 1, 16, 32));
+        }
+
+        for (int simd_size = 8; simd_size <= 16; simd_size += 8) {
+            if (simd_size == 8 && !((group_ == 1 || M_ % 8 == 0)))
+                continue;
+            if (simd_size == 16 && !(group_ == 1 || M_ % 16 == 0))
+                continue;
+            const int width_max = 14, height_max = 8, block_size_max = 32;
+            for (uint32_t width = width_max; width > 0; width--) {
+                int candidate = 0;
+                if (width > output_w_)
+                    continue;
+                for (uint32_t height = height_max; height > 0; height--) {
+                    if (width * height > block_size_max || height > output_h_)
+                        continue;
+                    // Only when the work items count is less than the device
+                    // max work items or the M_ is less than 16, we will tune
+                    // for simd 8.
+                    if (simd_size == 8 &&
+                        M_ >= 16 &&
+                        ((num_ * M_ * output_w_ * output_h_ / static_cast<float>(width * height)) >=
+                        max_compute_units * 7 * 16))
+                        continue;
+                    int actual_tile_x = kernel_w_ * dilation_w_ + (width - 1) * stride_w_;
+                    int tile_x = alignSize(actual_tile_x, 4);
+                    int tile_y = kernel_h_ * dilation_h_ + (height - 1) * stride_h_;
+                    if (tile_x > (4 * simd_size))
+                        continue;
+                    // If actual_tile_x is multiple of 4, we may waste some IO bandwidth.
+                    // This could reduce 75% tuning candidates. It has slightly performance
+                    // impact for the final tuning result, less than 2% for most cases.
+                    if (actual_tile_x % 4 != 0)
+                        continue;
+                    if ((width * height + divUp(tile_x * tile_y, simd_size)) > block_size_max)
+                        continue;
+                    int tile_y_stride = (4 * simd_size) / tile_x;
+
+                    if (divUp(tile_y, tile_y_stride) < 4) {
+                        tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_INTEL_IDLF, width, height, simd_size));
+                        candidate++;
+                    }
+                    if (candidate >= 4 && height == 2)
+                        break;
+                }
+                kernelCnt += candidate;
+                if (kernelCnt >= 12 && width == 2)
+                    break;
+            }
+        }
+    }
+}
+
+// Pick the first candidate kernel that passes result verification,
+// release the rest, and mark the layer as tuned.
+template<>
+void OCL4DNNConvSpatial<float>::useFirstAvailable(const UMat &bottom,
+                                                  UMat &top,
+                                                  const UMat &weight,
+                                                  const UMat &bias,
+                                                  int32_t numImages,
+                                                  UMat &verifyTop)
+{
+    std::vector< cv::Ptr<tunerParam> > tunerItems;
+    generateTunerItems(tunerItems);
+    tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_BASIC, 1, 1, 1));
+
+    for (int i = 0; i < tunerItems.size(); i++) {
+        if (createConvolutionKernel(tunerItems[i]->kernelType,
+                                    tunerItems[i]->blockWidth,
+                                    tunerItems[i]->blockHeight,
+                                    tunerItems[i]->blockDepth)) {
+            int kernelIdx = kernelQueue.size() - 1;
+            if (verifyResult(bottom, top, weight, bias, numImages, kernelQueue[kernelIdx], verifyTop)) {
+                bestKernelConfig = kernelQueue[kernelIdx];
+                if (bestKernelConfig->kernelType != KERNEL_TYPE_INTEL_IDLF &&
+                    bestKernelConfig->kernelType != KERNEL_TYPE_GEMM_LIKE)
+                    if (!swizzled_weights_umat.empty())
+                        swizzled_weights_umat.release();
+
+                for (int32_t j = 0; j < kernelIdx; j++) {
+                    CV_Assert(phash.find(kernelQueue[j]->kernelName) != phash.end());
+                    unloadProgram(kernelQueue[j]->kernelName);
+                }
+                kernelQueue.clear();
+                tuned_ = true;
+                break;
+            }
+        }
+    }
+}
+
+// Serialize the winning configuration into the in-memory config map.
+template<>
+void OCL4DNNConvSpatial<float>::cacheTunedConfig()
+{
+    if (tuned_)
+    {
+        cv::AutoLock lock(kernelConfigMutex);
+        std::stringstream outputKernel;
+        outputKernel << bestKernelConfig->workItem_output[0] << " "
+                     << bestKernelConfig->workItem_output[1] << " "
+                     << bestKernelConfig->workItem_output[2] << " "
+                     << bestKernelConfig->kernelType << " "
+                     << bestKernelConfig->local_work_size[0] << " "
+                     <<
+                        bestKernelConfig->local_work_size[1] << " "
+                     << bestKernelConfig->local_work_size[2] << " "
+                     << bestKernelConfig->swizzle_weights << " "
+                     << bestKernelConfig->use_null_local << " ";
+        kernelConfigMap.insert(std::pair<std::string, std::string>(key_, outputKernel.str()));
+    }
+}
+
+// Full auto-tune: time every candidate kernel, then verify them from
+// fastest to slowest and keep the fastest one that produces correct output.
+template<>
+void OCL4DNNConvSpatial<float>::setupConvolution(const UMat &bottom,
+                                                 UMat &top,
+                                                 const UMat &weight,
+                                                 const UMat &bias,
+                                                 int32_t numImages,
+                                                 UMat &verifyTop)
+{
+    std::vector< cv::Ptr<tunerParam> > tunerItems;
+
+    generateTunerItems(tunerItems);
+    for (int i = 0; i < tunerItems.size(); i++)
+        createConvolutionKernel(tunerItems[i]->kernelType,
+                                tunerItems[i]->blockWidth,
+                                tunerItems[i]->blockHeight,
+                                tunerItems[i]->blockDepth);
+
+    for (int32_t x = 0; x < kernelQueue.size(); x++) {
+        kernelQueue[x]->executionTime = timedConvolve(bottom, top, weight, bias, numImages,
+                                                      kernelQueue[x]);
+        #ifdef TEST_ALL_KERNELS
+        if (kernelQueue[x]->tested == false) {
+            bool verified = verifyResult(bottom, top, weight, bias, numImages, kernelQueue[x], verifyTop);
+            if (verified == false) {
+                dbgPrint(std::cout << "Kernel "
+                         << kernelQueue[x]->kernelName
+                         << " failed verification" << std::endl);
+                dbgPrint(std::cout << "kernelQueue[x]->workItem_output[0]: "
+                         << kernelQueue[x]->workItem_output[0] << " "
+                         << "kernelQueue[x]->workItem_output[1]: "
+                         << kernelQueue[x]->workItem_output[1] << " "
+                         << "kernelQueue[x]->workItem_output[2]: "
+                         << kernelQueue[x]->workItem_output[2] << " "
+                         << "kernelQueue[x]->kernelType: "
+                         << kernelQueue[x]->kernelType << " "
+                         << "kernelQueue[x]->global_work_size[0]: "
+                         << kernelQueue[x]->global_work_size[0] << " "
+                         << "kernelQueue[x]->global_work_size[1]: "
+                         << kernelQueue[x]->global_work_size[1] << " "
+                         << "kernelQueue[x]->global_work_size[2]: "
+                         << kernelQueue[x]->global_work_size[2] << " "
+                         << "kernelQueue[x]->local_work_size[0]: "
+                         << kernelQueue[x]->local_work_size[0] << " "
+                         << "kernelQueue[x]->local_work_size[1]: "
+                         << kernelQueue[x]->local_work_size[1] << " "
+                         << "kernelQueue[x]->local_work_size[2]: "
+                         << kernelQueue[x]->local_work_size[2] << " "
+                         << kernelQueue[x]->swizzle_weights << " "
+                         << kernelQueue[x]->use_null_local << std::endl);
+            } else {
+                dbgPrint(std::cout << "Kernel "
+                         << kernelQueue[x]->kernelName
+                         << " pass verification" << std::endl);
+            }
+        }
+        #endif
+    }
+    int32_t failures = 0;
+    bool verification = false;
+    if (kernelQueue.size()) {
+        while (failures < kernelQueue.size()) {
+            int32_t fastestKernel = -1;
+            float fastestTime = std::numeric_limits<float>::infinity();
+
+            for (int32_t x = 0; x < kernelQueue.size(); x++) {
+                if (kernelQueue[x]->executionTime < fastestTime &&
+                    kernelQueue[x]->tested == false) {
+                    fastestKernel = x;
+                    fastestTime = kernelQueue[x]->executionTime;
+                }
+            }
+            if (fastestKernel < 0) break;
+            // Test fastest kernel
+            bool verified = verifyResult(bottom, top, weight, bias, numImages, kernelQueue[fastestKernel], verifyTop);
+            if (verified == true) {
+                kernelQueue[fastestKernel]->verified = true;
+                kernel_index_ = fastestKernel;
+                verification = true;
+                break;
+            } else {
+                kernelQueue[fastestKernel]->tested = true;
+                dbgPrint(std::cout << "Kernel " <<
+                         kernelQueue[fastestKernel]->kernelName <<
+                         " failed verification" << std::endl);
+                failures++;
+            }
+        }
+    }
+    if (verification) {
+        dbgPrint(std::cout << "Kernel <" << kernelQueue[kernel_index_]->kernelName <<
+                 "> passed verification" << std::endl);
+        dbgPrint(std::cout << "Convolution Time:" << kernelQueue[kernel_index_]->executionTime << std::endl);
+    } else {
+        dbgPrint(std::cout << "fallback to basic kernel" << std::endl);
+        options_.str(""); options_.clear(); // clear contents and state flags
+        createBasicKernel(1, 1, 1);
+        kernel_index_ = kernelQueue.size() - 1;
+    }
+    this->bestKernelConfig = kernelQueue[kernel_index_];
+
+
+    if (bestKernelConfig->kernelType != KERNEL_TYPE_INTEL_IDLF && bestKernelConfig->kernelType != KERNEL_TYPE_GEMM_LIKE)
+        if (!swizzled_weights_umat.empty())
+            swizzled_weights_umat.release();
+
+    for (int32_t x = 0; x < kernelQueue.size(); x++) {
+        if (x != kernel_index_) {
+            CV_Assert(phash.find(kernelQueue[x]->kernelName) != phash.end());
+            unloadProgram(kernelQueue[x]->kernelName);
+        }
+    }
+    kernelQueue.clear();
+    tuned_ = true;
+    saveTunedConfig();
+}
+
+// Persist the tuned configuration to the on-disk cache directory
+// (no-op when caching is disabled or no cache path is configured).
+template<typename Dtype>
+void OCL4DNNConvSpatial<Dtype>::saveTunedConfig()
+{
+    CV_Assert(tuned_);
+    if (!use_cache_path_ || cache_path_.empty())
+        return;
+
+    std::string outputFile;
+    outputFile = cache_path_ + "/" + key_sanitized_;
+    std::ofstream outputKernel;
+    outputKernel.open(outputFile.c_str());
+    outputKernel << bestKernelConfig->workItem_output[0] << " "
+                 << bestKernelConfig->workItem_output[1] << " "
+                 << bestKernelConfig->workItem_output[2] << " "
+                 << bestKernelConfig->kernelType << " "
+                 << bestKernelConfig->local_work_size[0] << " "
+                 << bestKernelConfig->local_work_size[1] << " "
+                 << bestKernelConfig->local_work_size[2] << " "
+                 << bestKernelConfig->swizzle_weights << " "
+                 << bestKernelConfig->use_null_local << " ";
+    outputKernel.close();
+}
+
+// Select a kernel for the current shape key: reuse the current one,
+// else try the in-memory cache, then the on-disk cache, then tune.
+template<typename Dtype>
+void OCL4DNNConvSpatial<Dtype>::prepareKernel(const UMat &bottom, UMat &top,
+                                              const UMat &weight, const UMat &bias,
+                                              int32_t numImages)
+{
+    std::string previous_key = key_;
+
+    generateKey();
+    if (key_.compare(previous_key) == 0 && bestKernelConfig != NULL)
+        return;
+
+    if (bestKernelConfig)
+    {
+        prev_kernel_type_ = bestKernelConfig->kernelType;
+        CV_Assert(phash.find(bestKernelConfig->kernelName) != phash.end());
+        phash.erase(bestKernelConfig->kernelName);
+        bestKernelConfig.release();
+    }
+
+    if (loadCachedConfig()) // check in-memory cache
+        return;
+    if (loadTunedConfig()) // check external storage
+        return;
+
+    UMat benchData(1, numImages * top_dim_, CV_32FC1);
+    if (force_auto_tuning_)
+    {
+        calculateBenchmark(bottom, benchData, weight, bias, numImages);
+        setupConvolution(bottom, top, weight, bias, numImages, benchData);
+    }
+    else
+    {
+        calculateBenchmark(bottom, benchData, weight, bias, numImages);
+        useFirstAvailable(bottom, top, weight, bias, numImages,
+                          benchData);
+    }
+    cacheTunedConfig();
+}
+
+// Look the shape key up in the in-memory configuration map, seeding the
+// map once with the built-in Intel defaults.
+template<typename Dtype>
+bool OCL4DNNConvSpatial<Dtype>::loadCachedConfig()
+{
+    cv::AutoLock lock(kernelConfigMutex);
+    if (!defaultConfigLoaded)
+    {
+        const size_t numConfigs = sizeof(default_kernel_config_intel)/sizeof(default_kernel_config_intel[0])/2;
+        for (size_t i = 0; i < numConfigs; i++)
+        {
+            std::pair<std::string, std::string> entry(
+                std::string("Intel(R) Corporation_") + default_kernel_config_intel[2 * i],
+                default_kernel_config_intel[2 * i + 1]);
+            kernelConfigMap.insert(entry);
+        }
+        defaultConfigLoaded = true;
+    }
+
+    kernel_hash_t::iterator it = kernelConfigMap.find(key_);
+    if (it != kernelConfigMap.end())
+    {
+        int32_t x, y, z, type, lx, ly, lz;
+        bool swizzle, nullLocal;
+        std::stringstream cachedKernel(it->second);
+        if (cachedKernel)
+        {
+            cachedKernel >> x;
+            cachedKernel >> y;
+            cachedKernel >> z;
+            cachedKernel >> type;
+            cachedKernel >> lx;
+            cachedKernel >> ly;
+            cachedKernel >> lz;
+            cachedKernel >> swizzle;
+            cachedKernel >> nullLocal;
+            if (setupKernelByConfig(x, y, z, type, lx, ly, lz, swizzle, nullLocal)) {
+                tuned_ = true;
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+
+// Rebuild the kernel described by a cached configuration tuple.
+template<typename Dtype>
+bool OCL4DNNConvSpatial<Dtype>::setupKernelByConfig(int x, int y, int z, int type,
+                                                    int lx, int ly, int lz,
+                                                    bool swizzle, bool nullLocal)
+{
+    if (type == KERNEL_TYPE_INTEL_IDLF)
+    {
+        if (z == 1)
+            z = 16;
+        CHECK_EQ(z == 16 || z == 8, true) << "invalid SIMD size" << std::endl;
+    }
+    kernelQueue.clear();
+    createConvolutionKernel(type, x, y, z);
+    if (kernelQueue.size() != 1) {
+        std::cerr << "Failed setup kernel by config:"
+                  << " x = " << x
+                  << " y = " << y
+                  << " z = " << z
+                  << " type = " << type
+                  << std::endl;
+        return false;
+    }
+    bestKernelConfig = kernelQueue[0];
+    kernelQueue.clear();
+    bestKernelConfig->local_work_size[0] = lx;
+    bestKernelConfig->local_work_size[1] = ly;
+    bestKernelConfig->local_work_size[2] = lz;
+    bestKernelConfig->swizzle_weights = swizzle;
+    bestKernelConfig->use_null_local = nullLocal;
+    // If kernel type changed to type 2 or 4, we need to reset the swizzled
+    // weights pointer to invalidate the previous swizzled weights data.
+    if (prev_kernel_type_ != bestKernelConfig->kernelType &&
+        (bestKernelConfig->kernelType == KERNEL_TYPE_INTEL_IDLF ||
+         bestKernelConfig->kernelType == KERNEL_TYPE_GEMM_LIKE))
+    {
+        if (!swizzled_weights_umat.empty())
+            swizzled_weights_umat.release();
+    }
+    return true;
+}
+
+// Load a tuned configuration for the current shape key from the on-disk
+// cache file, warning once when no cache directory is configured.
+template<typename Dtype>
+bool OCL4DNNConvSpatial<Dtype>::loadTunedConfig()
+{
+    if (!use_cache_path_)
+    {
+        if (cache_path_.empty() && !force_auto_tuning_)
+        {
+            static int warn_ = 0;
+            if (!warn_)
+            {
+                std::cout << "OpenCV(ocl4dnn): consider to specify kernel configuration cache directory " << std::endl
+                          << "                 via OPENCV_OCL4DNN_CONFIG_PATH parameter." << std::endl;
+                warn_ = true;
+            }
+        }
+        return false;
+    }
+
+    int32_t x, y, z, type, lx, ly, lz;
+    bool swizzle, nullLocal;
+
+    // Find cached kernel configuration from file
+    std::string cacheFile = cache_path_ + "/" + key_sanitized_;
+    std::ifstream cachedKernel(cacheFile.c_str());
+    if (cachedKernel)
+    {
+        cachedKernel >> x;
+        cachedKernel >> y;
+        cachedKernel >> z;
+        cachedKernel >> type;
+        cachedKernel >> lx;
+        cachedKernel >> ly;
+        cachedKernel >> lz;
+        cachedKernel >> swizzle;
+        cachedKernel >> nullLocal;
+        if (setupKernelByConfig(x, y, z, type, lx, ly, lz, swizzle, nullLocal)) {
+            tuned_ = true;
+            return true;
+        }
+    }
+    return false;
+}
+
+template class OCL4DNNConvSpatial<float>;
+} // namespace ocl4dnn
+}
+}
+#endif // HAVE_OPENCL
diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_inner_product.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_inner_product.cpp
new file mode 100644
index 0000000000..b6c1df9908
--- /dev/null
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_inner_product.cpp
@@ -0,0 +1,108 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#include "../../precomp.hpp" +#include "common.hpp" +#include "ocl4dnn.hpp" +#include "math_functions.hpp" + +#ifdef HAVE_OPENCL +namespace cv { namespace dnn { namespace ocl4dnn { +template +OCL4DNNInnerProduct::OCL4DNNInnerProduct(OCL4DNNInnerProductConfig config) +{ + bias_term_ = config.bias_term; + transpose_ = config.transpose; + N_ = num_output_ = config.num_output; + M_ = config.M; + K_ = config.K; + phase_test_ = config.phase_test; + image_copied_ = false; +} + +template +OCL4DNNInnerProduct::~OCL4DNNInnerProduct() +{ +} + +template +bool OCL4DNNInnerProduct::Forward(const UMat& bottom, + const UMat& weight, + const UMat& bias, + UMat& top) +{ + bool ret; + + if (M_ == 1) + { + ret = ocl4dnnGEMV(CblasNoTrans, N_, K_, (Dtype) 1., + weight, 0, bottom, 0, (Dtype) 0., top, 0); + + if (bias_term_ && ret) + ret = ocl4dnnAXPY(N_, 1, bias, 0, top, 0); + + return ret; + } + else + { + ret = false; + size_t max_image_size = std::min(ocl::Device::getDefault().image2DMaxWidth(), + ocl::Device::getDefault().image2DMaxHeight()); + if (M_ <= max_image_size && + N_ <= max_image_size && + K_ <= max_image_size && + cv::traits::Depth::value == CV_32F && + ocl::Device::getDefault().intelSubgroupsSupport()) + { + ret = ocl4dnnGEMMCommon(transpose_ ? CblasNoTrans : CblasTrans, + M_, N_, K_, bottom, weight, UMat(), top, + max_image_size); + } + return ret; + } +} + +template class OCL4DNNInnerProduct; +} // namespace ocl4dnn +} +} +#endif // HAVE_OPENCL diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_lrn.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_lrn.cpp new file mode 100644 index 0000000000..6cc65b7189 --- /dev/null +++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_lrn.cpp @@ -0,0 +1,126 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. 
+// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+//
+//M*/
+
+#include "../../precomp.hpp"
+#include "common.hpp"
+#include "ocl4dnn.hpp"
+#include "opencl_kernels_dnn.hpp"
+
+#ifdef HAVE_OPENCL
+namespace cv { namespace dnn { namespace ocl4dnn {
+template<typename Dtype>
+OCL4DNNLRN<Dtype>::OCL4DNNLRN(OCL4DNNLRNConfig config)
+{
+    lrn_type_ = config.lrn_type;
+    phase_test_ = config.phase_test;
+    size_ = config.local_size;
+    CHECK_EQ(size_ % 2, 1)<< "LRN only supports odd values for local_size";
+    alpha_ = config.alpha;
+    beta_ = config.beta;
+    k_ = config.k;
+    norm_by_size_ = config.norm_by_size;
+    num_ = config.batch_size;
+    channels_ = config.channels;
+    height_ = config.height;
+    width_ = config.width;
+}
+
+// Dispatch by normalization region; only ACROSS_CHANNELS is implemented,
+// and only on devices with Intel subgroup support.
+template<typename Dtype>
+bool OCL4DNNLRN<Dtype>::Forward(const UMat& bottom, UMat& top)
+{
+    bool ret = true;
+
+    if (!ocl::Device::getDefault().intelSubgroupsSupport())
+        return false;
+
+    switch (lrn_type_)
+    {
+    case LRNParameter_NormRegion_ACROSS_CHANNELS:
+        ret = crossChannelForward(bottom, top);
+        break;
+    case LRNParameter_NormRegion_WITHIN_CHANNEL:
+        //TODO
+        //WithinChannelForward(bottom_data, top_data);
+        ret = false;
+        break;
+    default:
+        ret = false;
+        LOG(FATAL)<< "Unknown normalization region.";
+    }
+    return ret;
+}
+
+template<typename Dtype>
+bool OCL4DNNLRN<Dtype>::crossChannelForward(const UMat& bottom, UMat& top)
+{
+    ocl::Queue queue = ocl::Queue::getDefault();
+    CHECK_EQ(phase_test_, true) << "Only support forward inference.";
+
+    cl_uint argIdx = 0;
+    int32_t n_threads = num_ * height_ * width_;
+    size_t global_work_size_[1] = {(size_t)n_threads};
+    String opts = clOptionSupport("-cl-no-subgroup-ifp") ? " -cl-no-subgroup-ifp " : "";
+    ocl::Kernel oclk_lrn_fill;
+    if (!oclk_lrn_fill.create(CL_KERNEL_SELECT("lrn_full_no_scale"), ocl::dnn::ocl4dnn_lrn_oclsrc, opts))
+        return false;
+
+    oclk_lrn_fill.set(argIdx++, n_threads);
+    oclk_lrn_fill.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
+    oclk_lrn_fill.set(argIdx++, num_);
+    oclk_lrn_fill.set(argIdx++, channels_);
+    oclk_lrn_fill.set(argIdx++, height_);
+    oclk_lrn_fill.set(argIdx++, width_);
+    oclk_lrn_fill.set(argIdx++, size_);
+    int size_norm_factor = norm_by_size_ ? size_ : 1;
+    oclk_lrn_fill.set(argIdx++, alpha_ / size_norm_factor);
+    oclk_lrn_fill.set(argIdx++, k_);
+    oclk_lrn_fill.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+    oclk_lrn_fill.set(argIdx++, -beta_);
+
+    return oclk_lrn_fill.run(1, global_work_size_, NULL, false);
+}
+
+template class OCL4DNNLRN<float>;
+} // namespace ocl4dnn
+}
+}
+#endif // HAVE_OPENCL
diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp
new file mode 100644
index 0000000000..e0bdf71e67
--- /dev/null
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp
@@ -0,0 +1,213 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "../../precomp.hpp"
+#include <string>
+#include <vector>
+#include "common.hpp"
+#include "ocl4dnn.hpp"
+#include "opencl_kernels_dnn.hpp"
+
+#ifdef HAVE_OPENCL
+namespace cv { namespace dnn { namespace ocl4dnn {
+template<typename Dtype>
+OCL4DNNPool<Dtype>::OCL4DNNPool(OCL4DNNPoolConfig config)
+{
+    int dims = config.in_shape.size();
+    int spatial_dims = 2;
+
+    batch_size_ = config.in_shape[0];
+    channels_ = config.channels;
+    pool_method_ = config.pool_method;
+
+    for (int i = 0; i < spatial_dims; ++i)
+    {
+        kernel_shape_.push_back(i == 0 ? config.kernel.height : config.kernel.width);
+        pad_.push_back(i == 0 ? config.pad.height : config.pad.width);
+        stride_.push_back(i == 0 ? config.stride.height : config.stride.width);
+        im_in_shape_.push_back(config.in_shape[dims - spatial_dims + i]);
+        im_out_shape_.push_back(config.out_shape[dims - spatial_dims + i]);
+    }
+
+    kernel_h_ = kernel_shape_[0];
+    kernel_w_ = kernel_shape_[1];
+    stride_h_ = stride_[0];
+    stride_w_ = stride_[1];
+    pad_h_ = pad_[0];
+    pad_w_ = pad_[1];
+    height_ = im_in_shape_[0];
+    width_ = im_in_shape_[1];
+    pooled_height_ = im_out_shape_[0];
+    pooled_width_ = im_out_shape_[1];
+
+    count_ = 1;
+    for (int i = 0; i < config.out_shape.size(); ++i)
+    {
+        count_ *= config.out_shape[i];
+    }
+}
+
+template<typename Dtype>
+OCL4DNNPool<Dtype>::~OCL4DNNPool()
+{
+    mask_idx_.release();
+}
+
+// Pooling forward: dispatch to the max / average / stochastic OpenCL kernel.
+template<typename Dtype>
+bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
+                                 UMat& top,
+                                 UMat& top_mask)
+{
+    bool ret = true;
+    ocl::Queue queue = ocl::Queue::getDefault();
+    size_t global[] = { 128 * 128 };
+    size_t local[] = { 128 };
+    cl_uint argIdx = 0;
+
+    // support 2D case
+    switch (pool_method_)
+    {
+    case LIBDNN_POOLING_METHOD_MAX:
+        {
+            if (top_mask.empty() && mask_idx_.empty())
+            {
+                mask_idx_.create(1, count_, CV_32FC1);
+            }
+            ocl::Kernel oclk_max_pool_forward(CL_KERNEL_SELECT("max_pool_forward"),
+                                              cv::ocl::dnn::ocl4dnn_pooling_oclsrc);
+
+            if (oclk_max_pool_forward.empty())
+                return false;
+
+            argIdx = 0;
+            oclk_max_pool_forward.set(argIdx++, count_);
+            oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
+            oclk_max_pool_forward.set(argIdx++, batch_size_);
+            oclk_max_pool_forward.set(argIdx++, channels_);
+            oclk_max_pool_forward.set(argIdx++, height_);
+            oclk_max_pool_forward.set(argIdx++, width_);
+            oclk_max_pool_forward.set(argIdx++, pooled_height_);
+            oclk_max_pool_forward.set(argIdx++, pooled_width_);
+            oclk_max_pool_forward.set(argIdx++, kernel_h_);
+            oclk_max_pool_forward.set(argIdx++, kernel_w_);
+            oclk_max_pool_forward.set(argIdx++, stride_h_);
+            oclk_max_pool_forward.set(argIdx++, stride_w_);
+            oclk_max_pool_forward.set(argIdx++, pad_h_);
+            oclk_max_pool_forward.set(argIdx++, pad_w_);
+            oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+            oclk_max_pool_forward.set(argIdx++, mask_idx_.empty() ? 0 : 1);
+            if (mask_idx_.empty())
+                oclk_max_pool_forward.set(argIdx++, (void *)NULL);
+            else
+                oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(mask_idx_));
+            oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top_mask));
+
+            ret = oclk_max_pool_forward.run(1, global, local, false);
+        }
+        break;
+    case LIBDNN_POOLING_METHOD_AVE:
+        {
+            ocl::Kernel oclk_ave_pool_forward(CL_KERNEL_SELECT("ave_pool_forward"),
+                                              cv::ocl::dnn::ocl4dnn_pooling_oclsrc);
+
+            if (oclk_ave_pool_forward.empty())
+                return false;
+
+            argIdx = 0;
+            oclk_ave_pool_forward.set(argIdx++, count_);
+            oclk_ave_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
+            oclk_ave_pool_forward.set(argIdx++, batch_size_);
+            oclk_ave_pool_forward.set(argIdx++, channels_);
+            oclk_ave_pool_forward.set(argIdx++, height_);
+            oclk_ave_pool_forward.set(argIdx++, width_);
+            oclk_ave_pool_forward.set(argIdx++, pooled_height_);
+            oclk_ave_pool_forward.set(argIdx++, pooled_width_);
+            oclk_ave_pool_forward.set(argIdx++, kernel_h_);
+            oclk_ave_pool_forward.set(argIdx++, kernel_w_);
+            oclk_ave_pool_forward.set(argIdx++, stride_h_);
+            oclk_ave_pool_forward.set(argIdx++, stride_w_);
+            oclk_ave_pool_forward.set(argIdx++, pad_h_);
+            oclk_ave_pool_forward.set(argIdx++, pad_w_);
+            oclk_ave_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+
+            ret = oclk_ave_pool_forward.run(1, global, local, false);
+        }
+        break;
+    case LIBDNN_POOLING_METHOD_STO:
+        {
+            ocl::Kernel oclk_sto_pool_forward(CL_KERNEL_SELECT("sto_pool_forward_test"),
+                                              cv::ocl::dnn::ocl4dnn_pooling_oclsrc);
+
+            if (oclk_sto_pool_forward.empty())
+                return false;
+
+            argIdx = 0;
+            oclk_sto_pool_forward.set(argIdx++, count_);
+            oclk_sto_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
+            oclk_sto_pool_forward.set(argIdx++, batch_size_);
+            oclk_sto_pool_forward.set(argIdx++, channels_);
+            oclk_sto_pool_forward.set(argIdx++, height_);
+            oclk_sto_pool_forward.set(argIdx++, width_);
+            oclk_sto_pool_forward.set(argIdx++, pooled_height_);
+            oclk_sto_pool_forward.set(argIdx++, pooled_width_);
+            oclk_sto_pool_forward.set(argIdx++, kernel_h_);
+            oclk_sto_pool_forward.set(argIdx++, kernel_w_);
+            oclk_sto_pool_forward.set(argIdx++, stride_h_);
+            oclk_sto_pool_forward.set(argIdx++, stride_w_);
+            oclk_sto_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+
+            ret = oclk_sto_pool_forward.run(1, global, local, false);
+        }
+        break;
+    default:
+        {
+            ret = false;
+            LOG(FATAL)<< "Unknown pooling method.";
+        }
+    }
+    return ret;
+}
+
+template class OCL4DNNPool<float>;
+} // namespace ocl4dnn
+}
+}
+#endif // HAVE_OPENCL
diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_softmax.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_softmax.cpp
new file mode 100644
index 0000000000..e4802d2dff
--- /dev/null
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_softmax.cpp
@@ -0,0 +1,135 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#include "../../precomp.hpp" +#include +#include "common.hpp" +#include "ocl4dnn.hpp" +#include "opencl_kernels_dnn.hpp" + +#ifdef HAVE_OPENCL +namespace cv { namespace dnn { namespace ocl4dnn { +template +OCL4DNNSoftmax::OCL4DNNSoftmax(OCL4DNNSoftmaxConfig config) +{ + softmax_axis_ = config.axis; + channels_ = config.channels; + + inner_num_ = 1; + outer_num_ = 1; + count_ = 1; + int32_t scale_sz = 1; + for (int32_t i = softmax_axis_ + 1; i < config.in_shape.size(); i++) + inner_num_ *= config.in_shape[i]; + use_slm_ = (config.in_shape[softmax_axis_] * inner_num_ + inner_num_ * 17) <= 8192; + for (int32_t i = 0; i < softmax_axis_; i++) + outer_num_ *= config.in_shape[i]; + count_ = inner_num_ + outer_num_; + + std::vector scale_dims = config.in_shape; + scale_dims[softmax_axis_] = use_slm_ ? 1 : 17; + for (int32_t i = 0; i < scale_dims.size(); i++) + scale_sz *= scale_dims[i]; + + scale_data_.create(1, scale_sz, CV_32FC1); +} + +template +OCL4DNNSoftmax::~OCL4DNNSoftmax() +{ + scale_data_.release(); +} + +template +bool OCL4DNNSoftmax::Forward(const UMat& bottom, UMat& top) +{ + bool ret = false; + ocl::Queue queue = ocl::Queue::getDefault(); + bool intel_subgroup = ocl::Device::getDefault().intelSubgroupsSupport(); + if (intel_subgroup && inner_num_ < 128) + { + String opts = clOptionSupport("-cl-no-subgroup-ifp") ? 
" -cl-no-subgroup-ifp " : ""; + String kname; + ocl::Kernel oclk_softmax_forward_kernel; + + if (use_slm_) + kname = CL_KERNEL_SELECT("softmax_forward_slm"); + else + kname = CL_KERNEL_SELECT("softmax_forward"); + + if (!oclk_softmax_forward_kernel.create(kname.c_str(), ocl::dnn::softmax_loss_oclsrc, opts)) + return false; + + size_t global_size[] = { 256, (size_t)outer_num_, 1 }; + size_t local_size[] = { 256, 1, 1 }; + cl_uint argIdx = 0; + + if (use_slm_) + { + oclk_softmax_forward_kernel.set(argIdx++, outer_num_); + oclk_softmax_forward_kernel.set(argIdx++, channels_); + oclk_softmax_forward_kernel.set(argIdx++, inner_num_); + oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(scale_data_)); + oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); + oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); + oclk_softmax_forward_kernel.set(argIdx++, NULL, channels_ * inner_num_* sizeof(Dtype)); + oclk_softmax_forward_kernel.set(argIdx++, NULL, inner_num_* sizeof(Dtype)); + oclk_softmax_forward_kernel.set(argIdx++, NULL, 16 * inner_num_* sizeof(Dtype)); + } + else + { + oclk_softmax_forward_kernel.set(argIdx++, outer_num_); + oclk_softmax_forward_kernel.set(argIdx++, channels_); + oclk_softmax_forward_kernel.set(argIdx++, inner_num_); + oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(scale_data_)); + oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); + oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); + } + ret = oclk_softmax_forward_kernel.run(3, global_size, local_size, false); + } + return ret; +} + +template class OCL4DNNSoftmax; +} // namespace ocl4dnn +} +} +#endif // HAVE_OPENCL diff --git a/modules/dnn/src/opencl/activations.cl b/modules/dnn/src/opencl/activations.cl index b98e52f674..0649f2e577 100644 --- a/modules/dnn/src/opencl/activations.cl +++ b/modules/dnn/src/opencl/activations.cl @@ -1,3 +1,45 @@ 
+/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + __kernel void ReLUForward(const int count, __global const T* in, __global T* out #ifndef RELU_NO_SLOPE , T negative_slope diff --git a/modules/dnn/src/opencl/batchnorm.cl b/modules/dnn/src/opencl/batchnorm.cl new file mode 100644 index 0000000000..3f9401c52e --- /dev/null +++ b/modules/dnn/src/opencl/batchnorm.cl @@ -0,0 +1,26 @@ + +__kernel void batchnorm(__global const T *src, int src_offset, + __global const float *meanMat, + float varMeanScale, + __global const float *invStdMat, + __global const float *weight, + __global const float *bias, + int hasWeight, int hasBias, + int width, int height, int channel, + __global T *dst, int dst_offset) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int c = get_global_id(2); + + if (x >= width || y >= height || c >= channel) + return; + + float mean = meanMat[c] * varMeanScale; + float invstd = invStdMat[c]; + float w = hasWeight ? weight[c] : 1; + float b = hasBias ? 
bias[c] : 0; + int index = y * width + x + c * width * height; + T val = (src[index + src_offset] - mean) * w * invstd + b; + dst[index + dst_offset] = val; +} diff --git a/modules/dnn/src/opencl/benchmark.cl b/modules/dnn/src/opencl/benchmark.cl new file mode 100644 index 0000000000..22acb93afd --- /dev/null +++ b/modules/dnn/src/opencl/benchmark.cl @@ -0,0 +1,45 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +__kernel void null_kernel_float(float arg) { + float out = arg; +} diff --git a/modules/dnn/src/opencl/concat.cl b/modules/dnn/src/opencl/concat.cl new file mode 100644 index 0000000000..041e6ac740 --- /dev/null +++ b/modules/dnn/src/opencl/concat.cl @@ -0,0 +1,60 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +__kernel void concat(const int nthreads, + __global const Dtype* in_data, + const int num_concats, + const int concat_size, + const int top_concat_axis, + const int bottom_concat_axis, + const int offset_concat_axis, + __global Dtype* out_data) { + + for (int index = get_global_id(0); index < nthreads; + index += get_global_size(0)) { + const int total_concat_size = concat_size * bottom_concat_axis; + const int concat_num = index / total_concat_size; + const int concat_index = index % total_concat_size; + const int top_index = concat_index + + (concat_num * top_concat_axis + offset_concat_axis) * concat_size; + out_data[top_index] = in_data[index]; + } +} diff --git a/modules/dnn/src/opencl/conv_layer_spatial.cl b/modules/dnn/src/opencl/conv_layer_spatial.cl new file mode 100644 index 0000000000..a7bca1d6f0 --- /dev/null +++ b/modules/dnn/src/opencl/conv_layer_spatial.cl @@ -0,0 +1,1670 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ 
BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if APPLY_BIAS +#define BIAS_KERNEL_ARG __global Dtype * biases_base, +#else +#define BIAS_KERNEL_ARG +#endif + +#define ACTIVATION_FUNCTION(_dst_, _offset_, _data_) do { (_dst_)[(_offset_)] = (_data_);} while(0) + +#define __CAT(x, y) x##y +#define CAT(x, y) __CAT(x, y) +#define LOOP0(VAR, STMT) +#define LOOP1(VAR, STMT) (STMT); (VAR)++; +#define LOOP2(VAR, STMT) LOOP1(VAR, STMT); (STMT); (VAR)++; +#define LOOP3(VAR, STMT) LOOP2(VAR, STMT); (STMT); (VAR)++; +#define LOOP4(VAR, STMT) LOOP3(VAR, STMT); (STMT); (VAR)++; +#define LOOP5(VAR, STMT) LOOP4(VAR, STMT); (STMT); (VAR)++; +#define LOOP6(VAR, STMT) LOOP5(VAR, STMT); (STMT); (VAR)++; +#define LOOP7(VAR, STMT) LOOP6(VAR, STMT); (STMT); (VAR)++; +#define LOOP8(VAR, STMT) LOOP7(VAR, STMT); (STMT); (VAR)++; +#define LOOP9(VAR, STMT) LOOP8(VAR, STMT); (STMT); (VAR)++; +#define LOOP10(VAR, STMT) LOOP9(VAR, STMT); (STMT); (VAR)++; +#define LOOP11(VAR, STMT) LOOP10(VAR, STMT); (STMT); (VAR)++; +#define LOOP12(VAR, STMT) LOOP11(VAR, STMT); (STMT); (VAR)++; +#define LOOP13(VAR, STMT) LOOP12(VAR, STMT); (STMT); (VAR)++; +#define LOOP14(VAR, STMT) LOOP13(VAR, STMT); (STMT); (VAR)++; +#define LOOP15(VAR, STMT) LOOP14(VAR, STMT); (STMT); (VAR)++; +#define LOOP16(VAR, STMT) LOOP15(VAR, STMT); (STMT); (VAR)++; +#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT)) + +#if defined(convolve_simd) || defined(Conv_Interleaved) +#if Dtype_SIZE == 4 +#define INT_TYPE uint +#define 
INT_TYPE2 uint2 +#define INT_TYPE4 uint4 +#define INT_TYPE8 uint8 +#define SUB_GROUP_BLOCK_READ2 intel_sub_group_block_read2 +#define SUB_GROUP_BLOCK_READ4 intel_sub_group_block_read4 +#define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read8 +#define SUB_GROUP_BLOCK_READ intel_sub_group_block_read +#else +#error "Unsupported type" +#endif +#endif + +#ifdef KERNEL_BASIC + +__kernel void ConvolveBasic( + __global Dtype* image_data, + int image_offset, + __global Dtype* kernel_data, + int kernel_offset, + __global Dtype* bias, + const int bias_offset, + __global Dtype* convolved_image, + const int convolved_image_offset, + const ushort input_width, + const ushort input_height, + const ushort output_width, + const ushort output_height, + const ushort pad_w, + const ushort pad_h +) +{ + const int outputX = get_global_id(0); + const int outputY = get_global_id(1); + const int kernelNum = get_global_id(2) * ZPAR; + if (outputX < output_width && outputY < output_height) + { + Dtype sum[ZPAR]; + for (int kern = 0; kern < ZPAR; kern++) + { + sum[kern] = 0.0f; + } + const int org_y = outputY * STRIDE_Y - pad_h; + const int org_x = outputX * STRIDE_X - pad_w; + const int currentKernelOffset = kernel_offset + kernelNum*KERNEL_HEIGHT*KERNEL_WIDTH*CHANNELS; +#if APPLY_BIAS + const int biasIndex = bias_offset + kernelNum; +#endif + const int local_image_offset = org_y * input_width + org_x; + const int imageSize = input_width * input_height; + __global Dtype* image_dataPtr = (image_data + (image_offset + local_image_offset)); + __global Dtype* kernel_dataPtr = (kernel_data + (currentKernelOffset)); + for (int c = 0; c < CHANNELS; c++) + { + for (int y = 0; y < KERNEL_HEIGHT; y++) + { + for (int x = 0; x < KERNEL_WIDTH; x++) + { + int y_ = org_y + y * DILATION_Y; + int x_ = org_x + x * DILATION_X; + if (!(y_ >= 0 && y_ < input_height && x_ >= 0 && x_ < input_width)) + { + continue; + } + for (int kern = 0; kern < ZPAR; kern++) + { + sum[kern] += image_dataPtr[x * DILATION_X] * 
kernel_dataPtr[kern*KERNEL_HEIGHT*KERNEL_WIDTH*CHANNELS + x]; + } + } + image_dataPtr += input_width * DILATION_Y; + kernel_dataPtr += KERNEL_WIDTH; + } + image_dataPtr += imageSize - input_width*KERNEL_HEIGHT*DILATION_Y; + } + + for (int kern = 0; kern < ZPAR; kern++) + { + if (kernelNum + kern < OUTPUT_Z) + { + int offset = convolved_image_offset + (kernelNum+kern)*output_height*output_width + outputY*output_width + outputX; +#if APPLY_BIAS + ACTIVATION_FUNCTION(convolved_image, offset, sum[kern] + bias[biasIndex + kern]); +#else + ACTIVATION_FUNCTION(convolved_image, offset, sum[kern]); +#endif + } + } + } +} + +#elif defined KERNEL_IDLF + +#if TYPE == TYPE_HALF +#define VLOAD4(_v, _p) do { (_v).s0 = *(_p); (_v).s1 = *(_p + 1); (_v).s2 = *(_p + 2); (_v).s3 = *(_p + 3); } while(0) +#else +#define VLOAD4(_v, _p) do { _v = vload4(0, _p); } while(0) +#endif + +// Each work-item computes a OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT region of one output map. +// Each work-group (which will be mapped to 1 SIMD16/SIMD8 EU thread) will compute 16/8 different feature maps, but each feature map is for the same region of the imput image. +// NDRange: (output_width+pad)/ OUT_BLOCK_WIDTH, (output_height+pad)/OUT_BLOCK_HEIGHT, NUM_FILTERS/OUT_BLOCK_DEPTH + +// NOTE: for beignet this reqd_work_group_size does not guarantee that SIMD16 mode will be used, the compiler could choose to use two SIMD8 threads, and if that happens the code will break. 
+#ifndef __BEIGNET__ +__attribute__((reqd_work_group_size(1, 1, SIMD_SIZE))) +__attribute__((intel_reqd_sub_group_size(SIMD_SIZE))) +#endif +__kernel void +convolve_simd( + __global Dtype* inputs_base, + filter_qualifier Dtype* weights_base, + BIAS_KERNEL_ARG + __global Dtype* outputs_base, + const ushort input_width, + const ushort input_height, + const ushort output_width, + const ushort output_height) +{ + __global Dtype* outputs = outputs_base; + __global Dtype* inputs = inputs_base; + filter_qualifier Dtype* weights = weights_base; + unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH; // oc = Output Column + unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT;// or = Output Row + unsigned int fm = get_global_id(2);// fm = Feature Map = od = Output Depth + unsigned int fmg = get_group_id(2); + unsigned int lid = get_local_id(2); + + Dtype out[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT]; + + int in_addr; + + // find weights adress of given neuron (lid is index) + unsigned int weight_addr = (fmg % (ALIGNED_NUM_FILTERS/SIMD_SIZE)) * INPUT_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE + lid; + + for(int i=0;i= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + 3 >= INPUT_PAD_W && curr_x < input_width + INPUT_PAD_W) { + if (curr_x < INPUT_PAD_W) { + in_buf.in_vec[reg].s0 = 0; + if (curr_x + 1 >= INPUT_PAD_W) + in_buf.in_vec[reg].s1 = *(inputs + in_offset + 1); + else + in_buf.in_vec[reg].s1 = 0; + if (curr_x + 2 >= INPUT_PAD_W) + in_buf.in_vec[reg].s2 = *(inputs + in_offset + 2); + else + in_buf.in_vec[reg].s2 = 0; + in_buf.in_vec[reg].s3 = *(inputs + in_offset + 3); + } else { + VLOAD4(in_buf.in_vec[reg], inputs + in_offset); + if (curr_x + 1 >= input_width + INPUT_PAD_W) + in_buf.in_vec[reg].s1 = 0; + if (curr_x + 2 >= input_width + INPUT_PAD_W) + in_buf.in_vec[reg].s2 = 0; + if (curr_x + 3 >= input_width + INPUT_PAD_W) + in_buf.in_vec[reg].s3 = 0; + } + } else { + in_buf.in_vec[reg] = 0; + } + curr_y += TILE_Y_STRIDE; +#else + 
VLOAD4(in_buf.in_vec[reg], inputs + in_offset); +#endif + } + in_offset += input_width * TILE_Y_STRIDE; + }); + in_addr += input_height * input_width; +#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 + curr_y = saved_y; +#endif + +#if KERNEL_WIDTH * KERNEL_HEIGHT != 1 +#define WEIGHT_PREF 8 +#else +#define WEIGHT_PREF 1 +#endif + union { + Dtype w[WEIGHT_PREF]; +#if KERNEL_WIDTH * KERNEL_HEIGHT != 1 + INT_TYPE8 ui8; +#endif + } weight_buf; + int w_idx=0; + + unsigned int orig_weight_addr = weight_addr; +#if KERNEL_WIDTH * KERNEL_HEIGHT != 1 + weight_buf.ui8 = SUB_GROUP_BLOCK_READ8((__global INT_TYPE *)&weights[weight_addr]); + weight_addr += SIMD_SIZE * WEIGHT_PREF; +#else + weight_buf.w[0] = as_Dtype(SUB_GROUP_BLOCK_READ((__global INT_TYPE *)&weights[weight_addr])); + weight_addr += SIMD_SIZE * 1; +#endif + +#define BLOCK_IN(n) sub_group_broadcast( in_buf.in_array[((n)%4) + ((n) / (TILE_Y_STRIDE * TILE_X)) * 4], (((n) % (TILE_Y_STRIDE * TILE_X))/4)) + + int kr = 0; // kr = Kernel Row + LOOP(KERNEL_HEIGHT, kr,// LOOP is a macro that unrolls the loop. + { + int kc = 0; // kc = Kernel Column + LOOP(KERNEL_WIDTH, kc, + { + for(int br=0; br < OUT_BLOCK_HEIGHT; br++) { + for(int bc=0; bc < OUT_BLOCK_WIDTH; bc++) { + Dtype input = BLOCK_IN((br * STRIDE_Y + kr * DILATION_Y) * TILE_X + bc * STRIDE_X + kc * DILATION_X); + out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_buf.w[w_idx % WEIGHT_PREF], input, out[br * OUT_BLOCK_WIDTH + bc]); + } + } +#if KERNEL_WIDTH * KERNEL_HEIGHT > WEIGHT_PREF + // We assume KERNEL_W is equal to KERNEL_H here. + if ((w_idx + 1) % WEIGHT_PREF == 0 + #if KERNEL_WIDTH * KERNEL_HEIGHT % 8 != 0 + && ((w_idx + 1) <= (KERNEL_WIDTH * KERNEL_HEIGHT - WEIGHT_PREF)) + #endif + ) { + weight_buf.ui8 = SUB_GROUP_BLOCK_READ8((__global INT_TYPE *)&weights[weight_addr]); + weight_addr += SIMD_SIZE * WEIGHT_PREF; // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details. 
+ } + #if KERNEL_WIDTH*KERNEL_HEIGHT % 8 == 0 + // need to do nothing + #else + else if ((w_idx + 1) % WEIGHT_PREF == 0 && ((w_idx + 1) > (KERNEL_WIDTH * KERNEL_HEIGHT - WEIGHT_PREF))) + #if KERNEL_WIDTH * KERNEL_HEIGHT % 8 == 1 + weight_buf.w[0] = weights[weight_addr]; + #elif KERNEL_WIDTH * KERNEL_HEIGHT % 8 == 2 + weight_buf.ui8.s01 = SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)&weights[weight_addr]); + #elif KERNEL_WIDTH * KERNEL_HEIGHT % 8 <= 4 + weight_buf.ui8.s0123 = SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)&weights[weight_addr]); + #else + weight_buf.ui8 = SUB_GROUP_BLOCK_READ8((__global INT_TYPE *)&weights[weight_addr]); + #endif + #endif +#endif + ++w_idx; + }); + }); + weight_addr = orig_weight_addr + KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE; + + } + // dead code to work around possible compiler bug. + if (ALIGNED_NUM_FILTERS != NUM_FILTERS && fm > 0xfffffffeul) { + outputs[0] = BLOCK_IN(fm % SIMD_SIZE); + } + fm = fm % ALIGNED_NUM_FILTERS; + + if ((ALIGNED_NUM_FILTERS == NUM_FILTERS || fm < NUM_FILTERS)) { + unsigned int out_addr = OUT_BUFF_OFFSET + ( num_in_batch * TOTAL_OUTPUT_DEPTH + fm ) * output_width * output_height; + out_addr += or * output_width + oc; + // we need this address calculation for biases because we support views and batching +#if APPLY_BIAS + Dtype bias = biases_base[fm]; +#else + Dtype bias = 0; +#endif + for(unsigned int r = 0; r < OUT_BLOCK_HEIGHT; r++) { + if (r + or >= output_height) break; + for(unsigned int c = 0; c < OUT_BLOCK_WIDTH; c++) { + if (c + oc >= output_width) break; + // this does a scattered write to SIMD_SIZE different feature maps, so that data within one map is contiguous, thus ready for input to next layer. 
+ ACTIVATION_FUNCTION(outputs, out_addr + r * output_width + c, bias + out[r * OUT_BLOCK_WIDTH + c]); + + } + } + } +} + +#else // KERNEL_GEMM_LIKE + +#if APPLY_BIAS +// Dtype bias[4]; +#define SUBGROUP_GET_BIAS(k, i) intel_sub_group_shuffle(bias[k], i) +#else +#define SUBGROUP_GET_BIAS(k, i) ((Dtype)0) +#endif + +#ifdef Conv_Interleaved +typedef struct float1 { float s0; } float1; +typedef struct float5 { float s0; float s1; float s2; float s3; float s4; } float5; +typedef struct float6 { float s0; float s1; float s2; float s3; float s4; float s5; } float6; +typedef struct float7 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; } float7; +typedef struct float9 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; float s7; float s8; } float9; +typedef struct float10 { float s0; float s1; float s2; float s3; float s4; float s5; + float s6; float s7; float s8; float s9;} float10; +typedef struct float11 { float s0; float s1; float s2; float s3; float s4; float s5; + float s6; float s7; float s8; float s9; float sa;} float11; +typedef struct float12 { float s0; float s1; float s2; float s3; float s4; float s5; + float s6; float s7; float s8; float s9; float sa; float sb; } float12; +typedef struct float13 { float s0; float s1; float s2; float s3; float s4; float s5; + float s6; float s7; float s8; float s9; float sa; float sb; float sc;} float13; +typedef struct float14 { float s0; float s1; float s2; float s3; float s4; float s5; + float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; } float14; +typedef struct float15 { float s0; float s1; float s2; float s3; float s4; float s5; + float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; float se; } float15; +typedef struct float0 { float s0; } float0; //never used but makes compiler happy. 
+ +#define OUT_PITCH_X output_width +#define ROW_PITCH input_width + +#define GEMM_LIKE_KERNEL_ARGS \ + const __global Dtype *src0, \ + const __global Dtype *src1, \ + BIAS_KERNEL_ARG \ + __global Dtype *dst, \ + const ushort input_width, \ + const ushort input_height, \ + const ushort output_width, \ + const ushort output_height, \ + const int out_pitch_y, \ + const int out_pitch_z, \ + const int aligned_input_size, \ + const int slice_pitch +#endif + +#ifdef GEMM_LIKE_CONV_32_1 +////////////////////////////////////////////////////////////////////////////// +// Conv_Interleaved_32_1_flex +// +// Convolution: each workitem computes 1 patch x 32 filters worth of output +// data. Kernel's inner loop works on a single tile consisting of one +// row from each patch and the filter data corresponding to that row. Filter +// matrix is interleaved to reduce GRF bank conflicts. Patches are walked +// by rows and then by slices. Relies on sub_group extension for block +// reads and SIMD broadcast. Allows flexible sizing of TILE width (TILE_N) +// by dynamically selecting one of two code paths: one uses TILE_N = 32 and +// the other uses TILE_N = 8, 16, or 24. 
#define TILE_M 1
#define TILE_K KERNEL_WIDTH
#define TILE_N 32

// Beignet's compiler rejects the Intel required-subgroup-size attribute,
// so it is only emitted for other (Intel) OpenCL implementations.
#ifndef __BEIGNET__
__attribute__((intel_reqd_sub_group_size(8)))
#endif
// GEMM-like convolution, TILE_M = 1 variant: each workitem accumulates one
// output pixel for up to TILE_N (32) filters.  Results live in four Dtype8
// accumulators (blockC00..blockC30), i.e. 4 x 8 = 32 output channels per
// subgroup lane.  Filter data is read with Intel subgroup block reads and
// broadcast across the 8-lane subgroup inside DOT_PRODUCT_8.
// NOTE: GEMM_LIKE_KERNEL_ARGS, LOOP, CAT, SUB_GROUP_BLOCK_READ*, INT_TYPE,
// ACTIVATION_FUNCTION and SUBGROUP_GET_BIAS are macros defined earlier in
// this file (not visible in this chunk).
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{
    const int group_x = get_group_id(0);
    const int group_y = get_group_id(1);
    const int global_x = get_global_id(0);
    const int global_y = get_global_id(1);
    const int global_z = get_global_id(2);
    int interleaved_y;
    int kernel_y;
    int kernel_idx;

// Multiply one input value (_rowA) against the 8 filter values held by the
// 8 lanes of the subgroup (broadcast from colB) and accumulate into _result.
#define DOT_PRODUCT_8( _result, _rowA, colB ) \
    { \
        _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); \
        _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); \
        _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); \
        _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); \
        _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); \
        _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); \
        _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); \
        _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); \
    }
    // Vector type wide enough for one kernel row, e.g. Dtype3 for KERNEL_WIDTH==3.
    typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;

    // True for all threads if filter_width is multiple of TILE_N
    // else, true for all but right-most column of threads.
    if( TILE_N_LAST == 0 || global_x < WIDTH1 / TILE_N )
    {
        // Result ctile (*dst) is M rows x N columns
        // LWG size is 1x8. Thus each thread calculates 8*M rows x N cols of ctile.
        Dtype8 blockC00 = 0.f;
        Dtype8 blockC10 = 0.f;
        Dtype8 blockC20 = 0.f;
        Dtype8 blockC30 = 0.f;

        // Src0 (patch input) is directly used as atile.
        // Each work item points to the start of a different patch.
        // atile is M rows x K columns.
        int curr_x = ( global_y % output_width ) * STRIDE_X;
        int curr_y = ( global_y / output_width ) * STRIDE_Y;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
        // Remember the starting row so it can be restored for each filter row pass.
        int saved_y = curr_y;
#endif
        const __global Dtype *src0_read = src0
         + aligned_input_size * global_z            // batch offset
         + (curr_y - INPUT_PAD_H) * ROW_PITCH       // y offset
         + (curr_x - INPUT_PAD_W);                  // x offset

        // Src1 (filter) is directly used as btile.
        // It starts at the top of src1 and walks down.
        // btile is K rows x N columns.
        const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);

        // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
        // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
        // and KERNEL_WIDTH/2 rows of interleaved filter.
        int patch_depth = 0;
        do
        {
            int patch_row = 0;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
            curr_y = saved_y;
#endif

            do
            {
                // Load atile and btile.
                // Kernel data is partially interleaved. Every 2 rows are interleaved at Dtype8 granularity.
                // The exception is that if KERNEL_WIDTH is odd the last row is not interleaved. The non
                // interleaved row is padded with zero to ensure same size as interleaved rows. This
                // interleaving is done to ensure 0% GDR bank conflicts. For example, this is how the
                // kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.
                // (0, 0) (8, 0) (16, 0) (24, 0) ...       (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) ..
                // (0, 1) (8, 1) (16, 1) (24, 1) ...  =>   (0, 2) (8, 2) (16, 2) (24, 2) ...
                // (0, 2) (8, 2) (16, 2) (24, 2) ...       ...
                // ...
                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;

#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1
                // Fast path: no padding/dilation, one contiguous vector load per patch row.
                Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ];
                Dtype* pblockA00 = (Dtype*)(&blockA00);
#else
                // Padded/dilated path: gather element-by-element, substituting 0
                // for out-of-bounds taps.
                Dtype_t blockA00;
                Dtype* pblockA00 = (Dtype*)(&blockA00);
                int pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                    if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
                        pblockA00[pos] = src0_read[pos * DILATION_X];
                    else
                        pblockA00[pos] = 0;
                })
                curr_y += DILATION_Y;
#endif
                src0_read += (ROW_PITCH * DILATION_Y);

                Dtype blockB00[KERNEL_WIDTH*4];
                Dtype8* p8BlockB00 = (Dtype8*)blockB00;
                Dtype4* p4BlockB00 = (Dtype4*)blockB00;
                Dtype* pBlockB00 = (Dtype* )blockB00;

                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    p8BlockB00[interleaved_y] = as_Dtype8( SUB_GROUP_BLOCK_READ8( (const __global INT_TYPE *)src1_read ) );
                    src1_read += WIDTH1 * 2;
                } )
                if ( kernel_width_is_odd )
                {
                    // Last (zero-padded, non-interleaved) filter row for odd widths.
                    p4BlockB00[KERNEL_WIDTH - 1] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE *)src1_read ) );
                    src1_read += WIDTH1 * 2;
                }

                // Perform MADs
                kernel_idx = 0;
                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    kernel_y = interleaved_y * 2;
                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                } )
                kernel_y = interleaved_y * 2;
                if ( kernel_width_is_odd )
                {
                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                }
            }

            //while( ++patch_row < 1 ); //debug
            while( ++patch_row < KERNEL_HEIGHT );

            src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y); // reset to start of next slice of patch
        }
        //while ( ++patch_depth < 1 ); //debug
        while ( ++patch_depth < INPUT_DEPTH );

        // Dst resembles a cube of width x height x (output channel * batches). Each tile writes:
        // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used.
        int out_offset = global_z * out_pitch_z                                       // batch offset
         + ( group_x * TILE_N ) * out_pitch_y                                         // channel offset
         + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset
         + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT;               // x offset

        __global Dtype *out = dst + out_offset;
#if APPLY_BIAS
        // One bias value per 8-channel group; lanes read 4 values with a block read.
        Dtype bias[4];
        Dtype4 *bias_vec;
        bias_vec = (Dtype4*)bias;
        *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
#endif
#ifdef FUSED_CONV_CHANNEL_RELU
        // Per-channel ReLU slope, fetched the same way as the bias.
        Dtype slope[4];
        Dtype4 *slope_vec;
        slope_vec = (Dtype4*)slope;
        *slope_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)negative_slope_base + group_x * TILE_N));
        Dtype negative_slope;
#endif
        if (global_y * TILE_M < output_width * output_height )
        {
            for (int i = 0; i < 8; i++)
            {
#ifdef FUSED_CONV_CHANNEL_RELU
                negative_slope = intel_sub_group_shuffle(slope[0], i);
#endif
                ACTIVATION_FUNCTION(dst, out_offset + ( 0 + i ) * out_pitch_y, blockC00[i] + SUBGROUP_GET_BIAS(0, i));

#ifdef FUSED_CONV_CHANNEL_RELU
                negative_slope = intel_sub_group_shuffle(slope[1], i);
#endif
                ACTIVATION_FUNCTION(dst, out_offset + ( 8 + i ) * out_pitch_y, blockC10[i] + SUBGROUP_GET_BIAS(1, i));
#ifdef FUSED_CONV_CHANNEL_RELU
                negative_slope = intel_sub_group_shuffle(slope[2], i);
#endif
                ACTIVATION_FUNCTION(dst, out_offset + ( 16 + i ) * out_pitch_y, blockC20[i] + SUBGROUP_GET_BIAS(2, i));
#ifdef FUSED_CONV_CHANNEL_RELU
                negative_slope = intel_sub_group_shuffle(slope[3], i);
#endif
                ACTIVATION_FUNCTION(dst, out_offset + ( 24 + i ) * out_pitch_y, blockC30[i] + SUBGROUP_GET_BIAS(3, i));
            }
        }
    }
#if TILE_N_LAST > 0
    else
    {
        // Right-most column of workitems: handles the tail of WIDTH1 that is
        // not a multiple of TILE_N, using TILE_N_LAST_DIV8 accumulators.

        // Result ctile (*dst) is M rows x N columns
        // LWG size is 1x8. Thus each thread calculates 8*M rows x N cols of ctile.
        int i = 0;
        Dtype8 blockC[TILE_N_LAST_DIV8];
        LOOP(TILE_N_LAST_DIV8, i,
        {
            blockC[i] = 0.f;
        } )

        // Src0 (patch input) is directly used as atile.
        // Each work item points to the start of a different patch.
        // atile is M rows x K columns.
        int curr_x = ( global_y % output_width ) * STRIDE_X;
        int curr_y = ( global_y / output_width ) * STRIDE_Y;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
        int saved_y = curr_y;
#endif
        const __global Dtype *src0_read = src0
         + aligned_input_size * global_z            // batch offset
         + (curr_y - INPUT_PAD_H) * ROW_PITCH       // y offset
         + (curr_x - INPUT_PAD_W);                  // x offset

        // Src1 (filter) is directly used as btile.
        // It starts at the top of src1 and walks down.
        // btile is K rows x N columns.
        const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);

        // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
        // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
        // and KERNEL_WIDTH/2 rows of interleaved filter.
        int patch_depth = 0;
        do
        {
            int patch_row = 0;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
            curr_y = saved_y;
#endif
            do
            {
                // Load atile and interleaved btile.
                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1
                Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ];
                Dtype* pblockA00 = (Dtype*)(&blockA00);
#else
                Dtype_t blockA00;
                Dtype* pblockA00 = (Dtype*)(&blockA00);
                int pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                    if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
                        pblockA00[pos] = src0_read[pos * DILATION_X];
                    else
                        pblockA00[pos] = 0;
                })
                curr_y += DILATION_Y;
#endif
                src0_read += (ROW_PITCH * DILATION_Y);
                Dtype blockB[KERNEL_WIDTH * TILE_N_LAST_DIV8];

                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
#if TILE_N_LAST_DIV8 == 1
                    Dtype2* p2BlockB = (Dtype2* )blockB;
                    p2BlockB[interleaved_y] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 2
                    Dtype4* p4BlockB = (Dtype4* )blockB;
                    p4BlockB[interleaved_y] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 3
                    //TODO: broken. No block_read6
                    Dtype6* p6BlockB = (Dtype6* )blockB;
                    (*((Dtype8*)(&p6BlockB[interleaved_y]))).s0123 = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
                    (*((Dtype8*)(&p6BlockB[interleaved_y]))).s45 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)(src1_read + 4 * 8) ) );
#endif
                    src1_read += WIDTH1 * 2;
                } )
                if ( kernel_width_is_odd )
                {
#if TILE_N_LAST_DIV8 == 1
                    Dtype* pBlockB = (Dtype* )blockB;
                    pBlockB[KERNEL_WIDTH - 1] = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 2
                    Dtype2* p2BlockB = (Dtype2* )blockB;
                    p2BlockB[KERNEL_WIDTH - 1] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 3
                    Dtype3* p3BlockB = (Dtype3* )blockB;
                    p3BlockB[KERNEL_WIDTH - 1].s01 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
                    p3BlockB[KERNEL_WIDTH - 1].s2 = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*) (src1_read + 2 * 8) ) );
#endif
                    src1_read += WIDTH1 * 2;
                }

                // Perform MADs
                Dtype* pBlockB = (Dtype*)blockB;
                kernel_idx = 0;
                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    kernel_y = interleaved_y * 2;
                    DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 2
                    DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 3
                    DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
#endif
#endif
                } )
                kernel_y = interleaved_y * 2;
                if ( kernel_width_is_odd )
                {
                    DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 2
                    DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 3
                    DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
#endif
#endif
                }
            }

            //while( ++patch_row < 1 ); //debug
            while( ++patch_row < KERNEL_HEIGHT );

            src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch
        }
        //while ( ++patch_depth < 1 ); //debug
        while ( ++patch_depth < INPUT_DEPTH );

        // Dst resembles a cube of width x height x (output channel * batches). Each tile writes:
        // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used.
        int out_offset = global_z * out_pitch_z                                       // batch offset
         + ( group_x * TILE_N ) * out_pitch_y                                         // channel offset
         + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset
         + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT;               // x offset
        __global Dtype *out = dst + out_offset;
#if APPLY_BIAS
        Dtype bias[4];
        Dtype4 *bias_vec;
        bias_vec = (Dtype4*)bias;
        *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
#endif

#ifdef FUSED_CONV_CHANNEL_RELU
        Dtype slope[4];
        Dtype4 *slope_vec;
        slope_vec = (Dtype4*)slope;
        *slope_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)negative_slope_base + group_x * TILE_N));
        Dtype negative_slope;
#endif

        if (global_y * TILE_M < output_width * output_height )
        {
            for (int i = 0; i < 8; i++)
            {
                // TILE_N_LAST_DIV8 is a compile-time constant, so the unused
                // branches below are eliminated by the OpenCL compiler.
                if ( TILE_N_LAST_DIV8 > 0 )
                {
#ifdef FUSED_CONV_CHANNEL_RELU
                    negative_slope = intel_sub_group_shuffle(slope[0], i);
#endif
                    ACTIVATION_FUNCTION(dst, out_offset + ( 0+i) * out_pitch_y, blockC[0][i] + SUBGROUP_GET_BIAS(0, i));
                }
                if ( TILE_N_LAST_DIV8 > 1 )
                {
#ifdef FUSED_CONV_CHANNEL_RELU
                    negative_slope = intel_sub_group_shuffle(slope[1], i);
#endif
                    ACTIVATION_FUNCTION(dst, out_offset + ( 8+i) * out_pitch_y, blockC[1][i] + SUBGROUP_GET_BIAS(1, i));
                }
                if ( TILE_N_LAST_DIV8 > 2 )
                {
#ifdef FUSED_CONV_CHANNEL_RELU
                    negative_slope = intel_sub_group_shuffle(slope[2], i);
#endif
                    ACTIVATION_FUNCTION(dst, out_offset + (16+i) * out_pitch_y, blockC[2][i] + SUBGROUP_GET_BIAS(2, i));
                }
                if ( TILE_N_LAST_DIV8 > 3 )
                {

#ifdef FUSED_CONV_CHANNEL_RELU
                    negative_slope = intel_sub_group_shuffle(slope[3], i);
#endif
                    ACTIVATION_FUNCTION(dst, out_offset + (24+i) * out_pitch_y, blockC[3][i] + SUBGROUP_GET_BIAS(3, i));
                }
            }
        }
    }
#endif
}
#endif
#ifdef GEMM_LIKE_CONV_32_2

//////////////////////////////////////////////////////////////////////////////
// Conv_Interleaved_32_2_flex
//
// Convolution: each workitem computes 1 patch x 32 filters worth of output
// data.  Kernel's inner loop works on a single tile consisting of one
// row from each patch and the filter data corresponding to that row.  Filter
// matrix is interleaved to reduce GRF bank conflicts.  Patches are walked
// by rows and then by slices.  Relies on sub_group extension for block
// reads and SIMD broadcast.  Allows flexible sizing of TILE width (TILE_N)
// by dynamically selecting one of two code paths: one uses TILE_N = 32 and
// the other uses TILE_N = 8, 16, or 24.
#define TILE_M 2
#define TILE_K KERNEL_WIDTH
#define TILE_N 32

// Beignet's compiler rejects the Intel required-subgroup-size attribute.
#ifndef __BEIGNET__
__attribute__((intel_reqd_sub_group_size(8)))
#endif
// TILE_M = 2 variant: each workitem accumulates TWO output pixels
// (suffix 0 and 1 throughout: curr_x0/curr_x1, src0_read0/src0_read1,
// blockC*0/blockC*1) against the same 32 filters, amortizing the filter
// block reads across both pixels.
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{
    const int group_x = get_group_id(0);
    const int group_y = get_group_id(1);
    const int global_x = get_global_id(0);
    const int global_y = get_global_id(1);
    const int global_z = get_global_id(2);
    int interleaved_y;
    int kernel_y;
    int kernel_idx;

// Multiply one input value (_rowA) against the 8 filter values broadcast
// from the subgroup lanes (colB) and accumulate into _result.
#define DOT_PRODUCT_8( _result, _rowA, colB ) \
    { \
        _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); \
        _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); \
        _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); \
        _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); \
        _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); \
        _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); \
        _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); \
        _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); \
    }
    typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;

    // True for all threads if filter_width is multiple of TILE_N
    // else, true for all but right-most column of threads.
    if( TILE_N_LAST == 0 || global_x < WIDTH1 / TILE_N )
    {
        // Result ctile (*dst) is M rows x N columns
        // LWG size is 1x8. Thus each thread calculates 8*M rows x N cols of ctile.
        Dtype8 blockC00 = 0.f;
        Dtype8 blockC10 = 0.f;
        Dtype8 blockC20 = 0.f;
        Dtype8 blockC30 = 0.f;
        Dtype8 blockC01 = 0.f;
        Dtype8 blockC11 = 0.f;
        Dtype8 blockC21 = 0.f;
        Dtype8 blockC31 = 0.f;

        // Src0 (patch input) is directly used as atile.
        // Each work item points to the start of a different patch.
        // atile is M rows x K columns.
        int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X;
        int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;
        int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;
        int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
        int saved_y0 = curr_y0;
        int saved_y1 = curr_y1;
#endif
        const __global Dtype *src0_read0 = src0
         + aligned_input_size * global_z            // batch offset
         + (curr_y0 - INPUT_PAD_H) * ROW_PITCH      // y offset
         + curr_x0 - INPUT_PAD_W;                   // x offset
        const __global Dtype *src0_read1 = src0
         + aligned_input_size * global_z            // batch offset
         + (curr_y1 - INPUT_PAD_H) * ROW_PITCH      // y offset
         + curr_x1 - INPUT_PAD_W;                   // x offset

        // Src1 (filter) is directly used as btile.
        // It starts at the top of src1 and walks down.
        // btile is K rows x N columns.
        const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);

        // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
        // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
        // and KERNEL_WIDTH/2 rows of interleaved filter.
        int patch_depth = 0;
        do
        {
            int patch_row = 0;
            do
            {
                // Load atile and btile.
                // Kernel data is partially interleaved. Every 2 rows are interleaved at Dtype8 granularity.
                // The exception is that if KERNEL_WIDTH is odd the last row is not interleaved. The non
                // interleaved row is padded with zero to ensure same size as interleaved rows. This
                // interleaving is done to ensure 0% GDR bank conflicts. For example, this is how the
                // kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.
                // (0, 0) (8, 0) (16, 0) (24, 0) ...       (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) ..
                // (0, 1) (8, 1) (16, 1) (24, 1) ...  =>   (0, 2) (8, 2) (16, 2) (24, 2) ...
                // (0, 2) (8, 2) (16, 2) (24, 2) ...       ...
                // ...
                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1
                // Fast path: one contiguous vector load per patch row, per pixel.
                Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH;
                Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH;
                Dtype* pblockA00 = (Dtype*)(&blockA00);
                Dtype* pblockA01 = (Dtype*)(&blockA01);
#else
                // Padded/dilated path: gather taps for both pixels, zero-filling
                // anything that falls outside the (padded) input.
                Dtype_t blockA00;
                Dtype* pblockA00 = (Dtype*)(&blockA00);
                int pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                    if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)
                        pblockA00[pos] = src0_read0[pos * DILATION_X];
                    else
                        pblockA00[pos] = 0;
                })
                curr_y0 += DILATION_Y;
                Dtype_t blockA01;
                Dtype* pblockA01 = (Dtype*)(&blockA01);
                pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                    if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)
                        pblockA01[pos] = src0_read1[pos * DILATION_X];
                    else
                        pblockA01[pos] = 0;
                })
                curr_y1 += DILATION_Y;
                src0_read0 += (ROW_PITCH * DILATION_Y);
                src0_read1 += (ROW_PITCH * DILATION_Y);
#endif
                Dtype blockB00[KERNEL_WIDTH*4];
                Dtype8* p8BlockB00 = (Dtype8*)blockB00;
                Dtype4* p4BlockB00 = (Dtype4*)blockB00;
                Dtype* pBlockB00 = (Dtype* )blockB00;

                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    p8BlockB00[interleaved_y] = as_Dtype8( SUB_GROUP_BLOCK_READ8( (const __global INT_TYPE*)src1_read ) );
                    src1_read += WIDTH1 * 2;
                } )
                if ( kernel_width_is_odd )
                {
                    p4BlockB00[KERNEL_WIDTH - 1] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
                    src1_read += WIDTH1 * 2;
                }
                // Perform MADs
                // Each filter value is used twice (once per output pixel) before
                // kernel_idx advances.
                kernel_idx = 0;
                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    kernel_y = interleaved_y * 2;
                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC01, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC01, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC11, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC11, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC21, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC21, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC31, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC31, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                } )
                if ( kernel_width_is_odd )
                {
                    kernel_y = interleaved_y * 2;
                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC01, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC11, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC21, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC31, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                }
            }

            //while( ++patch_row < 1 ); //debug
            while( ++patch_row < KERNEL_HEIGHT );
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1
            // Restore starting rows for the next input slice.
            curr_y0 = saved_y0;
            curr_y1 = saved_y1;
#endif
            src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch
            src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
        }
        //while ( ++patch_depth < 1 ); //debug
        while ( ++patch_depth < INPUT_DEPTH );

        // Dst resembles a cube of width x height x (output channel * batches). Each tile writes:
        // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used.
        int out0_offset = global_z * out_pitch_z                                           // batch offset
         + ( group_x * TILE_N ) * out_pitch_y                                              // channel offset
         + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
         + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT;                // x offset
        int out1_offset = global_z * out_pitch_z                                           // batch offset
         + ( group_x * TILE_N ) * out_pitch_y                                              // channel offset
         + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
         + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT;                // x offset

#if APPLY_BIAS
        Dtype bias[4];
        Dtype4 *bias_vec;
        bias_vec = (Dtype4*)bias;
        *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
#endif

#ifdef FUSED_CONV_CHANNEL_RELU
        Dtype slope[4];
        Dtype4 *slope_vec;
        slope_vec = (Dtype4*)slope;
        *slope_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)negative_slope_base + group_x * TILE_N));
        Dtype negative_slope;
#endif

        // Each of the two output pixels is bounds-checked separately since the
        // second one may fall past the end of the output plane.
        if( global_y * TILE_M < output_width * output_height )
        {
            for( int i = 0; i < 8; i++ )
            {
#ifdef FUSED_CONV_CHANNEL_RELU
                negative_slope = intel_sub_group_shuffle(slope[0], i);
#endif
                ACTIVATION_FUNCTION(dst, out0_offset + ( 0+i) * out_pitch_y, blockC00[i] + SUBGROUP_GET_BIAS(0, i));
#ifdef FUSED_CONV_CHANNEL_RELU
                negative_slope = intel_sub_group_shuffle(slope[1], i);
#endif
                ACTIVATION_FUNCTION(dst, out0_offset + ( 8+i) * out_pitch_y, blockC10[i] + SUBGROUP_GET_BIAS(1, i));
#ifdef FUSED_CONV_CHANNEL_RELU
                negative_slope = intel_sub_group_shuffle(slope[2], i);
#endif
                ACTIVATION_FUNCTION(dst, out0_offset + (16+i) * out_pitch_y, blockC20[i] + SUBGROUP_GET_BIAS(2, i));
#ifdef FUSED_CONV_CHANNEL_RELU
                negative_slope = intel_sub_group_shuffle(slope[3], i);
#endif
                ACTIVATION_FUNCTION(dst, out0_offset + (24+i) * out_pitch_y, blockC30[i] + SUBGROUP_GET_BIAS(3, i));
            }
        }
        if( global_y * TILE_M + 1 < output_width * output_height )
        {
            for( int i = 0; i < 8; i++ )
            {

#ifdef FUSED_CONV_CHANNEL_RELU
                negative_slope = intel_sub_group_shuffle(slope[0], i);
#endif
                ACTIVATION_FUNCTION(dst, out1_offset + ( 0+i) * out_pitch_y, blockC01[i] + SUBGROUP_GET_BIAS(0, i));

#ifdef FUSED_CONV_CHANNEL_RELU
                negative_slope = intel_sub_group_shuffle(slope[1], i);
#endif
                ACTIVATION_FUNCTION(dst, out1_offset + ( 8+i) * out_pitch_y, blockC11[i] + SUBGROUP_GET_BIAS(1, i));

#ifdef FUSED_CONV_CHANNEL_RELU
                negative_slope = intel_sub_group_shuffle(slope[2], i);
#endif
                ACTIVATION_FUNCTION(dst, out1_offset + (16+i) * out_pitch_y, blockC21[i] + SUBGROUP_GET_BIAS(2, i));

#ifdef FUSED_CONV_CHANNEL_RELU
                negative_slope = intel_sub_group_shuffle(slope[3], i);
#endif
                ACTIVATION_FUNCTION(dst, out1_offset + (24+i) * out_pitch_y, blockC31[i] + SUBGROUP_GET_BIAS(3, i));
            }
        }
    }
#if TILE_N_LAST > 0
    else
    {

        // Result ctile (*dst) is M rows x N columns
        // LWG size is 1x8. Thus each thread calculates 8*M rows x N cols of ctile.
        int i = 0;
        Dtype8 blockC0[TILE_N_LAST_DIV8];
        Dtype8 blockC1[TILE_N_LAST_DIV8];
        LOOP(TILE_N_LAST_DIV8, i,
        {
            blockC0[i] = 0.f;
            blockC1[i] = 0.f;
        } )

        // Src0 (patch input) is directly used as atile.
        // Each work item points to the start of a different patch.
        // atile is M rows x K columns.
        int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X;
        int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;
        int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;
        int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
        int saved_y0 = curr_y0;
        int saved_y1 = curr_y1;
#endif
        const __global Dtype *src0_read0 = src0
         + aligned_input_size * global_z            // batch offset
         + (curr_y0 - INPUT_PAD_H) * ROW_PITCH      // y offset
         + curr_x0 - INPUT_PAD_W;                   // x offset
        const __global Dtype *src0_read1 = src0
         + aligned_input_size * global_z            // batch offset
         + (curr_y1 - INPUT_PAD_H) * ROW_PITCH      // y offset
         + curr_x1 - INPUT_PAD_W;                   // x offset

        // Src1 (filter) is directly used as btile.
        // It starts at the top of src1 and walks down.
        // btile is K rows x N columns.
        const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);

        // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
        // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
        // and KERNEL_WIDTH/2 rows of interleaved filter.
        int patch_depth = 0;
        do
        {
            int patch_row = 0;
            do
            {
                // Load atile and interleaved btile.
                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1
                Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH;
                Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH;
                Dtype* pblockA00 = (Dtype*)(&blockA00);
                Dtype* pblockA01 = (Dtype*)(&blockA01);
#else
                Dtype_t blockA00;
                Dtype* pblockA00 = (Dtype*)(&blockA00);
                int pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                    if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)
                        pblockA00[pos] = src0_read0[pos * DILATION_X];
                    else
                        pblockA00[pos] = 0;
                })
                curr_y0 += DILATION_Y;
                Dtype_t blockA01;
                Dtype* pblockA01 = (Dtype*)(&blockA01);
                pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                    if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)
                        pblockA01[pos] = src0_read1[pos * DILATION_X];
                    else
                        pblockA01[pos] = 0;
                })
                curr_y1 += DILATION_Y;
                src0_read0 += (ROW_PITCH * DILATION_Y);
                src0_read1 += (ROW_PITCH * DILATION_Y);
#endif
                Dtype blockB[KERNEL_WIDTH * TILE_N_LAST_DIV8];

                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
#if TILE_N_LAST_DIV8 == 1
                    Dtype2* p2BlockB = (Dtype2* )blockB;
                    p2BlockB[interleaved_y] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 2
                    Dtype4* p4BlockB = (Dtype4* )blockB;
                    p4BlockB[interleaved_y] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 3
                    //TODO: broken. No block_read6
                    Dtype6* p6BlockB = (Dtype6* )blockB;
                    (*((Dtype8*)(&p6BlockB[interleaved_y]))).s0123 = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
                    (*((Dtype8*)(&p6BlockB[interleaved_y]))).s45 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)(src1_read + 4 * 8) ) );
#endif
                    src1_read += WIDTH1 * 2;
                } )
                if ( kernel_width_is_odd )
                {
#if TILE_N_LAST_DIV8 == 1
                    Dtype* pBlockB = (Dtype* )blockB;
                    pBlockB[KERNEL_WIDTH - 1] = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 2
                    Dtype2* p2BlockB = (Dtype2* )blockB;
                    p2BlockB[KERNEL_WIDTH - 1] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 3
                    Dtype3* p3BlockB = (Dtype3* )blockB;
                    p3BlockB[KERNEL_WIDTH - 1].s01 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
                    // NOTE(review): this reads at src1_read + 8, while the TILE_M=1
                    // variant's matching TILE_N_LAST_DIV8==3 path reads at
                    // src1_read + 2 * 8 -- confirm which offset is intended (this
                    // path is also flagged broken above).
                    p3BlockB[KERNEL_WIDTH - 1].s2 = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*) (src1_read + 8) ) );
#endif
                    src1_read += WIDTH1 * 2;
                }

                // Perform MADs
                Dtype* pBlockB = (Dtype*)blockB;
                kernel_idx = 0;
                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    kernel_y = interleaved_y * 2;
                    DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y    ], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y + 1], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 2
                    DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y    ], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y + 1], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 3
                    DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y    ], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y + 1], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
#endif
#endif
                } )
                kernel_y = interleaved_y * 2;
                if ( kernel_width_is_odd )
                {
                    DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 2
                    DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 3
                    DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
#endif
#endif
                }
            }

            //while( ++patch_row < 1 ); //debug
            while( ++patch_row < KERNEL_HEIGHT );
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1
            curr_y0 = saved_y0;
            curr_y1 = saved_y1;
#endif
            src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch
            src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
        }
        //while ( ++patch_depth < 1 ); //debug
        while ( ++patch_depth < INPUT_DEPTH );

        // Dst resembles a cube of width x height x (output channel * batches). Each tile writes:
        // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used.
        int out0_offset = global_z * out_pitch_z                                           // batch offset
         + ( group_x * TILE_N ) * out_pitch_y                                              // channel offset
         + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
         + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT;                // x offset
        int out1_offset = global_z * out_pitch_z                                           // batch offset
         + ( group_x * TILE_N ) * out_pitch_y                                              // channel offset
         + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
         + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT;                // x offset
        __global Dtype *out1 = dst + out1_offset;

#if APPLY_BIAS
        Dtype bias[4];
        Dtype4 *bias_vec;
        bias_vec = (Dtype4*)bias;
        *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
#endif
#ifdef FUSED_CONV_CHANNEL_RELU
        Dtype slope[4];
        Dtype4 *slope_vec;
        slope_vec = (Dtype4*)slope;
        *slope_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)negative_slope_base + group_x * TILE_N));
        Dtype negative_slope;
#endif
        if( global_y * TILE_M < output_width * output_height )
        {
            for( int i = 0; i < 8; i++ )
            {
                if ( TILE_N_LAST_DIV8 > 0 )
                {

#ifdef FUSED_CONV_CHANNEL_RELU
                    negative_slope = intel_sub_group_shuffle(slope[0], i);
#endif
                    ACTIVATION_FUNCTION(dst, out0_offset + ( 0+i) * out_pitch_y, blockC0[0][i] + SUBGROUP_GET_BIAS(0, i));
                }
                if ( TILE_N_LAST_DIV8 > 1 )
                {
#ifdef FUSED_CONV_CHANNEL_RELU
                    negative_slope = intel_sub_group_shuffle(slope[1], i);
#endif
                    ACTIVATION_FUNCTION(dst, out0_offset + ( 8+i) * out_pitch_y, blockC0[1][i] + SUBGROUP_GET_BIAS(1, i));
                }
                if ( TILE_N_LAST_DIV8 > 2 )
                {
#ifdef FUSED_CONV_CHANNEL_RELU
                    negative_slope = intel_sub_group_shuffle(slope[2], i);
#endif
                    ACTIVATION_FUNCTION(dst, out0_offset + (16+i) * out_pitch_y, blockC0[2][i] + SUBGROUP_GET_BIAS(2, i));
                }
                if ( TILE_N_LAST_DIV8 > 3 )
                {
#ifdef FUSED_CONV_CHANNEL_RELU
                    negative_slope = intel_sub_group_shuffle(slope[3], i);
#endif
                    ACTIVATION_FUNCTION(dst, out0_offset + (24+i) * out_pitch_y, blockC0[3][i] + SUBGROUP_GET_BIAS(3, i));
                }
            }
        }
        if( global_y * TILE_M + 1 < output_width * output_height )
        {
            for( int i = 0; i < 8; i++ )
            {
                if ( TILE_N_LAST_DIV8 > 0 )
                {
#ifdef FUSED_CONV_CHANNEL_RELU
                    negative_slope = intel_sub_group_shuffle(slope[0], i);
#endif
                    ACTIVATION_FUNCTION(dst, out1_offset + ( 0+i) * out_pitch_y, blockC1[0][i] + SUBGROUP_GET_BIAS(0, i));
                }
                if ( TILE_N_LAST_DIV8 > 1 )
                {
#ifdef FUSED_CONV_CHANNEL_RELU
                    negative_slope = intel_sub_group_shuffle(slope[1], i);
#endif
                    ACTIVATION_FUNCTION(dst, out1_offset + ( 8+i) * out_pitch_y, blockC1[1][i] + SUBGROUP_GET_BIAS(1, i));
                }
                if ( TILE_N_LAST_DIV8 > 2 )
                {
#ifdef FUSED_CONV_CHANNEL_RELU
                    negative_slope = intel_sub_group_shuffle(slope[2], i);
#endif
                    ACTIVATION_FUNCTION(dst, out1_offset + (16+i) * out_pitch_y, blockC1[2][i] + SUBGROUP_GET_BIAS(2, i));
                }
                if ( TILE_N_LAST_DIV8 > 3 )
                {
#ifdef FUSED_CONV_CHANNEL_RELU
                    negative_slope = intel_sub_group_shuffle(slope[3], i);
#endif
                    ACTIVATION_FUNCTION(dst, out1_offset + (24+i) * out_pitch_y, blockC1[3][i] + SUBGROUP_GET_BIAS(3, i));
                }
            }
        }
    }
#endif
}
#endif

#if defined(GEMM_LIKE_CONV_32_2_SIMD16) || defined(GEMM_LIKE_CONV_32_1_SIMD16)
#ifdef FUSED_CONV_CHANNEL_RELU
#define INTERLEAVED_SIMD16_OUTPUT(_out_, _offset_,  _m_) do {\
        if (global_y * TILE_M < output_width * output_height ) \
        { \
            if ( ( OUT_DEPTH % TILE_N ) == 0 ) {\
                for (int i = 0; i < 16; i++) \
                { \
                    negative_slope = intel_sub_group_shuffle(slope[0], i); \
                    ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i)); \
                    negative_slope = intel_sub_group_shuffle(slope[1], i); \
                    ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_ [i] + SUBGROUP_GET_BIAS(1, i)); \
                } \
            } \
            else if( ( OUT_DEPTH % 16 ) == 0 ) { \
                if ( ( global_x + 1 ) <
get_global_size(0) ) { \ + for ( int i = 0; i < 16; i++ ) \ + { \ + negative_slope = intel_sub_group_shuffle(slope[0], i); \ + ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i)); \ + negative_slope = intel_sub_group_shuffle(slope[1], i); \ + ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_ [i] + SUBGROUP_GET_BIAS(1, i)); \ + } \ + } \ + else { \ + for (int i = 0; i < 16; i++) \ + { \ + negative_slope = intel_sub_group_shuffle(slope[0], i); \ + ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i)); \ + } \ + } \ + } \ + else { \ + if ( ( global_x + 1 ) < get_global_size(0) ) \ + { \ + for ( int i = 0; i < 16; i++ ) \ + { \ + negative_slope = intel_sub_group_shuffle(slope[0], i); \ + ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i)); \ + negative_slope = intel_sub_group_shuffle(slope[1], i); \ + ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_[i] + SUBGROUP_GET_BIAS(1, i)); \ + } \ + } \ + else { \ + if ( (OUT_DEPTH % TILE_N) > 16 ) { \ + for (int i = 0; i < 16 ; i++) \ + { \ + negative_slope = intel_sub_group_shuffle(slope[0], i); \ + ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i)); \ + } \ + for (int i = 0; i < OUT_DEPTH % 16 ; i++) \ + { \ + negative_slope = intel_sub_group_shuffle(slope[1], i); \ + ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_[i] + SUBGROUP_GET_BIAS(1, i)); \ + } \ + } \ + else { \ + for (int i = 0; i < OUT_DEPTH % 16 ; i++) \ + { \ + negative_slope = intel_sub_group_shuffle(slope[0], i); \ + ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i)); \ + } \ + } \ + } \ + } \ + } \ + }while(0) +#else +#define INTERLEAVED_SIMD16_OUTPUT(_out_, _offset_, _m_) do {\ + if (global_y * TILE_M < output_width * 
output_height ) \ + { \ + if ( ( OUT_DEPTH % TILE_N ) == 0 ) {\ + for (int i = 0; i < 16; i++) \ + { \ + ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i)); \ + ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_ [i] + SUBGROUP_GET_BIAS(1, i)); \ + } \ + } \ + else if( ( OUT_DEPTH % 16 ) == 0 ) { \ + if ( ( global_x + 1 ) < get_global_size(0) ) { \ + for ( int i = 0; i < 16; i++ ) \ + { \ + ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i)); \ + ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_ [i] + SUBGROUP_GET_BIAS(1, i)); \ + } \ + } \ + else { \ + for (int i = 0; i < 16; i++) \ + { \ + ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i)); \ + } \ + } \ + } \ + else { \ + if ( ( global_x + 1 ) < get_global_size(0) ) \ + { \ + for ( int i = 0; i < 16; i++ ) \ + { \ + ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i)); \ + ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_[i] + SUBGROUP_GET_BIAS(1, i)); \ + } \ + } \ + else { \ + if ( (OUT_DEPTH % TILE_N) > 16 ) { \ + for (int i = 0; i < 16 ; i++) \ + { \ + ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i)); \ + } \ + for (int i = 0; i < OUT_DEPTH % 16 ; i++) \ + { \ + ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_[i] + SUBGROUP_GET_BIAS(1, i)); \ + } \ + } \ + else { \ + for (int i = 0; i < OUT_DEPTH % 16 ; i++) \ + { \ + ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i)); \ + } \ + } \ + } \ + } \ + } \ + }while(0) +#endif +#endif + +#ifdef GEMM_LIKE_CONV_32_1_SIMD16 +#define TILE_M 1 +#define TILE_K KERNEL_WIDTH +#define TILE_N 32 + +#ifndef __BEIGNET__ +__attribute__((intel_reqd_sub_group_size(16))) 
+#endif +__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) +{ + const int group_x = get_group_id(0); + const int group_y = get_group_id(1); + const int global_x = get_global_id(0); + const int global_y = get_global_id(1); + const int global_z = get_global_id(2); + int interleaved_y; + int kernel_y; + int kernel_idx; + + // Result ctile (*dst) is M rows x N columns + // LWG size is 1x16. Thus each thread calculates 16*M rows x N cols of ctile. + Dtype16 blockC00 = 0.f; + Dtype16 blockC10 = 0.f; + + // Src0 (patch input) is directly used as atile. + // Each work item points to the start of a different patch. + // atile is M rows x K columns. + int curr_x = ( global_y % output_width ) * STRIDE_X; + int curr_y = ( global_y / output_width ) * STRIDE_Y; +#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 + int saved_y = curr_y; +#endif + const __global Dtype *src0_read = src0 + + aligned_input_size * global_z // batch offset + + (curr_y - INPUT_PAD_H) * ROW_PITCH // y offset + + curr_x - INPUT_PAD_W; // x offset + const __global Dtype *src0_read_orig = src0_read; + + // Src1 (filter) is directly used as btile. + // It starts at the top of src1 and walks down. + // btile is K rows x N columns. 
+ const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2 ); + +#define DOT_PRODUCT_16( _result, _rowA, colB ) \ + { \ + _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); \ + _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); \ + _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); \ + _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); \ + _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); \ + _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); \ + _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); \ + _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); \ + _result.s8 = mad( _rowA, sub_group_broadcast( colB, 8 ), _result.s8 ); \ + _result.s9 = mad( _rowA, sub_group_broadcast( colB, 9 ), _result.s9 ); \ + _result.sa = mad( _rowA, sub_group_broadcast( colB, 10 ), _result.sa ); \ + _result.sb = mad( _rowA, sub_group_broadcast( colB, 11 ), _result.sb ); \ + _result.sc = mad( _rowA, sub_group_broadcast( colB, 12 ), _result.sc ); \ + _result.sd = mad( _rowA, sub_group_broadcast( colB, 13 ), _result.sd ); \ + _result.se = mad( _rowA, sub_group_broadcast( colB, 14 ), _result.se ); \ + _result.sf = mad( _rowA, sub_group_broadcast( colB, 15 ), _result.sf ); \ + } + typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t; + // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1. + // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch + // and KERNEL_WIDTH/2 rows of interleaved filter. + int patch_depth = 0; +#ifndef __BEIGNET__ + __attribute__((opencl_unroll_hint(1))) +#endif + do + { + int patch_row = 0; +#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 + curr_y = saved_y; +#endif +#ifndef __BEIGNET__ + __attribute__((opencl_unroll_hint(1))) +#endif + do + { + // Load atile and btile. + // Kernel data is partially interleaved. Every 2 rows are interleaved at Dtype16 granularity. 
+ // The exception is that if KERNEL_WIDTH is odd the last row is not interleaved. The non + // interleaved row is padded with zero to ensure same size as interleaved rows. This + // interleaving is done to ensure 0% GDR bank conflicts. For example, this is how the + // kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3. + // (0, 0) (16, 0) (32, 0) (48, 0) ... (0, 0) ( 0, 1) (16, 0) ( 0, 1) (32, 0) (0, 1) (48, 0) ... + // (0, 1) (16, 1) (32, 1) (48, 1) ... => (0, 2) (16, 2) (32, 2) (48, 2) ... + // (0, 2) (16, 2) (32, 2) (48, 2) ... ... + // ... + const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; + +#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 + Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ]; + Dtype* pblockA00 = (Dtype*)(&blockA00); +#else + Dtype_t blockA00; + Dtype* pblockA00 = (Dtype*)(&blockA00); + int pos = 0; + LOOP(KERNEL_WIDTH, pos, + { + if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W) + pblockA00[pos] = src0_read[pos * DILATION_X]; + else + pblockA00[pos] = 0; + }) + curr_y += DILATION_Y; +#endif + src0_read += ROW_PITCH * DILATION_Y; + INT_TYPE blockB00[KERNEL_WIDTH * 2]; + INT_TYPE4* p4BlockB00 = (INT_TYPE4*)blockB00; + INT_TYPE2* p2BlockB00 = (INT_TYPE2*)blockB00; + Dtype* pBlockB00 = (Dtype*)blockB00; + interleaved_y = 0; + LOOP(KERNEL_WIDTH_DIV2, interleaved_y, + { + p4BlockB00[interleaved_y] = SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ); + src1_read += WIDTH1 * 2; + } ) + if ( kernel_width_is_odd ) + { + p2BlockB00[KERNEL_WIDTH - 1] = SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ); + src1_read += WIDTH1 * 2; + } + + // Perform MADs + kernel_idx = 0; + interleaved_y = 0; + LOOP(KERNEL_WIDTH_DIV2, interleaved_y, + { + kernel_y = interleaved_y * 2; + DOT_PRODUCT_16( blockC00, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); 
kernel_idx++; + DOT_PRODUCT_16( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++; + DOT_PRODUCT_16( blockC10, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++; + DOT_PRODUCT_16( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++; + } ) + if ( kernel_width_is_odd ) + { + kernel_y = interleaved_y * 2; + DOT_PRODUCT_16( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++; + DOT_PRODUCT_16( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++; + } + } + + //while( ++patch_row < 1 ); //debug + while( ++patch_row < KERNEL_HEIGHT ); + + src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch + } + //while ( ++patch_depth < 1 ); //debug + while ( ++patch_depth < INPUT_DEPTH ); + + // Dst resembles a cube of width x height x (output channel * batches). Each tile writes: + // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. 
+ int out_offset = global_z * out_pitch_z // batch offset + + ( group_x * TILE_N ) * out_pitch_y // channel offset + + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset + + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset + __global Dtype *out = dst + out_offset; + +#if APPLY_BIAS + Dtype bias[2]; + Dtype2 *bias_vec; + bias_vec = (Dtype2*)bias; + *bias_vec = as_Dtype2(SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)biases_base + group_x * TILE_N)); +#endif +#ifdef FUSED_CONV_CHANNEL_RELU + Dtype slope[2]; + Dtype2 *slope_vec; + slope_vec = (Dtype2*)slope; + *slope_vec = as_Dtype2(SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)negative_slope_base + group_x * TILE_N)); + Dtype negative_slope; +#endif + + INTERLEAVED_SIMD16_OUTPUT(dst, out_offset, 0); +} +#endif +#endif // KERNEL_BASIC/IDLF/GEMM_LIKE diff --git a/modules/dnn/src/opencl/conv_spatial_helper.cl b/modules/dnn/src/opencl/conv_spatial_helper.cl new file mode 100644 index 0000000000..9d5a89f7b1 --- /dev/null +++ b/modules/dnn/src/opencl/conv_spatial_helper.cl @@ -0,0 +1,73 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#define CONCAT(A,B) A##_##B +#define TEMPLATE(name,type) CONCAT(name,type) +#define Dtype float + +__kernel void TEMPLATE(copyWeightsSwizzled, Dtype) + (__global Dtype* weightIn, + __global Dtype* weightOut, + const int kernel_w, + const int kernel_h, + const int channels, + const int outputs, + const int swizzleFactor) { + + unsigned int sX = get_global_id(0); + + //Original location + + //Output location + int outputSublayer = channels / swizzleFactor; + int outputSublayerIndex = channels % swizzleFactor; + + int filter = sX / (kernel_w*kernel_h*channels); + int kernel_X = sX % kernel_w; + int kernel_Y = (sX / kernel_w) % kernel_h; + int kernel_C = (sX / (kernel_w * kernel_h)) % channels; + + int FP = filter / swizzleFactor; + int F1 = filter % swizzleFactor; + + weightOut[FP*(kernel_w*kernel_h*channels*swizzleFactor) + kernel_C*(kernel_w*kernel_h*swizzleFactor) + kernel_Y*(kernel_w*swizzleFactor) + kernel_X*swizzleFactor + F1] + = weightIn[filter*(kernel_w*kernel_h*channels) + kernel_C*(kernel_w*kernel_h) + kernel_Y*kernel_w + kernel_X]; +} diff --git a/modules/dnn/src/opencl/dummy.cl b/modules/dnn/src/opencl/dummy.cl new file mode 100644 index 0000000000..6a55938244 --- /dev/null +++ b/modules/dnn/src/opencl/dummy.cl @@ -0,0 +1,43 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Third party copyrights are property of their respective owners. 
+// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +__kernel void dummy_kernel() +{ +} diff --git a/modules/dnn/src/opencl/gemm_image.cl b/modules/dnn/src/opencl/gemm_image.cl new file mode 100644 index 0000000000..37ae523a21 --- /dev/null +++ b/modules/dnn/src/opencl/gemm_image.cl @@ -0,0 +1,635 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
+// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#define CONCAT(A,B) A##_##B +#define TEMPLATE(name,type) CONCAT(name,type) + +// Types used for parameters, offset computations and so on +#define int_tp int +#define uint_tp unsigned int + +#define Dtype float +#define Dtype2 float2 +#define Dtype4 float4 +#define Dtype8 float8 + +#define as_Dtype as_float +#define as_Dtype2 as_float2 +#define as_Dtype4 as_float4 +#define as_Dtype8 as_float8 + +#define KERNEL_ARG_DTYPE float + +#if defined(cl_intel_subgroups) +#pragma OPENCL EXTENSION cl_intel_subgroups : enable +#endif + +#define TILE_M 32 +#define TILE_K 8 + +// common block to calculate (alpha * AxB + beta * C) and output to destination image. + +#define SUBGROUP_BLOCK_READ8( __image, __coord ) intel_sub_group_block_read8( __image, __coord ) +#define SHUFFLE_TYPE2(val) val +#define SHUFFLE_TYPE8(val) val +#define READ_IMAGE(__image, __coord) read_imagef(__image, sampler, __coord) +#define SIZE_OF_ELEMENT sizeof(uint) +#define SIMD_SIZE_GEMM 8 +#define TILE_N 8 + +//#define USE_IMAGE_C +#ifdef USE_IMAGE_C +#define BLOCKC_READ8( _C, _coordC ) as_Dtype8( intel_sub_group_block_read8( _C, _coordC ) ) +#define BLOCKC_WRITE8( _C, _coordC, _val ) intel_sub_group_block_write8( _C, _coordC, as_uint8( _val ) ) +#define MATC_PARAMETER __read_only image2d_t C, __write_only image2d_t dst +#define GEMM_OUTPUT(ALPHA1, BETA_NOT0) GEMM_OUTPUT_EXT(ALPHA1, BETA_NOT0, C, dst, sizeof(uint)) +#else +#define BLOCKC_READ8( _C, _coordC ) \ + (Dtype8) ( (_coordC.x + get_local_id(0) < N && _coordC.y < M) ? _C[ _coordC.y * ldc + _coordC.x + get_local_id(0) ] : 0, \ + (_coordC.x + get_local_id(0) < N && _coordC.y + 1 < M) ? _C[ ( _coordC.y + 1 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \ + (_coordC.x + get_local_id(0) < N && _coordC.y + 2 < M) ? _C[ ( _coordC.y + 2 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \ + (_coordC.x + get_local_id(0) < N && _coordC.y + 3 < M) ? 
_C[ ( _coordC.y + 3 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \ + (_coordC.x + get_local_id(0) < N && _coordC.y + 4 < M) ? _C[ ( _coordC.y + 4 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \ + (_coordC.x + get_local_id(0) < N && _coordC.y + 5 < M) ? _C[ ( _coordC.y + 5 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \ + (_coordC.x + get_local_id(0) < N && _coordC.y + 6 < M) ? _C[ ( _coordC.y + 6 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \ + (_coordC.x + get_local_id(0) < N && _coordC.y + 7 < M) ? _C[ ( _coordC.y + 7 ) * ldc + _coordC.x + get_local_id(0) ] : 0) + +#define BLOCKC_WRITE8( _C, _coordC, _val) do {\ + if (_coordC.x + get_local_id(0) < N) { \ + if (_coordC.y < M) \ + _C[ _coordC.y * ldc + _coordC.x + get_local_id(0) ] = _val.s0; \ + if (_coordC.y + 1 < M) \ + _C[ ( _coordC.y + 1 )* ldc + _coordC.x + get_local_id(0) ] = _val.s1; \ + if (_coordC.y + 2 < M) \ + _C[ ( _coordC.y + 2 )* ldc + _coordC.x + get_local_id(0) ] = _val.s2; \ + if (_coordC.y + 3 < M) \ + _C[ ( _coordC.y + 3 )* ldc + _coordC.x + get_local_id(0) ] = _val.s3; \ + if (_coordC.y + 4 < M) \ + _C[ ( _coordC.y + 4 )* ldc + _coordC.x + get_local_id(0) ] = _val.s4; \ + if (_coordC.y + 5 < M) \ + _C[ ( _coordC.y + 5 )* ldc + _coordC.x + get_local_id(0) ] = _val.s5; \ + if (_coordC.y + 6 < M) \ + _C[ ( _coordC.y + 6 )* ldc + _coordC.x + get_local_id(0) ] = _val.s6; \ + if (_coordC.y + 7 < M) \ + _C[ ( _coordC.y + 7 )* ldc + _coordC.x + get_local_id(0) ] = _val.s7; \ + }} while(0) +#define MATC_PARAMETER __global Dtype * C, const int offC, const int M, const int N, const int ldc +#define GEMM_OUTPUT(ALPHA1, BETA_NOT0) GEMM_OUTPUT_EXT(ALPHA1, BETA_NOT0, (C + offC), (C + offC), 1) +#endif + +#define GEMM_OUTPUT_EXT(ALPHA1, BETA_NOT0, _C, _dst, _C_step) \ + int2 coordDst = (int2)( ( group_x * TILE_N ) * _C_step, ( group_y * TILE_M ) ); \ + int2 coordC = coordDst; \ + Dtype8 blockC00; \ + Dtype8 blockC01; \ + Dtype8 blockC02; \ + Dtype8 blockC03; \ + if (BETA_NOT0) { \ + blockC00 = isFirstColBlock ? 
BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \ + blockC01 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \ + blockC02 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \ + blockC03 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); \ + if (!ALPHA1) { \ + blockC00 = mad(blockAxB00, (Dtype8)alpha, blockC00); \ + blockC01 = mad(blockAxB01, (Dtype8)alpha, blockC01); \ + blockC02 = mad(blockAxB02, (Dtype8)alpha, blockC02); \ + blockC03 = mad(blockAxB03, (Dtype8)alpha, blockC03); \ + } else { \ + blockC00 += blockAxB00; \ + blockC01 += blockAxB01; \ + blockC02 += blockAxB02; \ + blockC03 += blockAxB03; \ + } \ + } else { \ + blockC00 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \ + blockC01 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \ + blockC02 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \ + blockC03 = isFirstColBlock ? 
BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); \ + if (!ALPHA1) { \ + blockC00 = mad(blockAxB00, (Dtype8)alpha, blockC00); \ + blockC01 = mad(blockAxB01, (Dtype8)alpha, blockC01); \ + blockC02 = mad(blockAxB02, (Dtype8)alpha, blockC02); \ + blockC03 = mad(blockAxB03, (Dtype8)alpha, blockC03); \ + } else { \ + blockC00 += blockAxB00; \ + blockC01 += blockAxB01; \ + blockC02 += blockAxB02; \ + blockC03 += blockAxB03; \ + } \ + } \ + BLOCKC_WRITE8( _dst, coordDst, blockC00 ); coordDst.y += 8; \ + BLOCKC_WRITE8( _dst, coordDst, blockC01 ); coordDst.y += 8; \ + BLOCKC_WRITE8( _dst, coordDst, blockC02 ); coordDst.y += 8; \ + BLOCKC_WRITE8( _dst, coordDst, blockC03 ); + +// Get the specified column of the block +#define TRANSPOSE_BLOCK_8( _block, _col ) \ + (Dtype8)( intel_sub_group_shuffle( _block.s0, _col ), \ + intel_sub_group_shuffle( _block.s1, _col ), \ + intel_sub_group_shuffle( _block.s2, _col ), \ + intel_sub_group_shuffle( _block.s3, _col ), \ + intel_sub_group_shuffle( _block.s4, _col ), \ + intel_sub_group_shuffle( _block.s5, _col ), \ + intel_sub_group_shuffle( _block.s6, _col ), \ + intel_sub_group_shuffle( _block.s7, _col ) ); + +// A's column block multiplied by B's row block. 
+#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB ) \ + { \ + const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \ + const Dtype8 acol1 = TRANSPOSE_BLOCK_8( _blockA, 1 ); \ + const Dtype8 acol2 = TRANSPOSE_BLOCK_8( _blockA, 2 ); \ + const Dtype8 acol3 = TRANSPOSE_BLOCK_8( _blockA, 3 ); \ + const Dtype8 acol4 = TRANSPOSE_BLOCK_8( _blockA, 4 ); \ + const Dtype8 acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 ); \ + const Dtype8 acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 ); \ + const Dtype8 acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 ); \ + _result = mad( (Dtype8)(_blockB.s0), acol0, _result ); \ + _result = mad( (Dtype8)(_blockB.s1), acol1, _result ); \ + _result = mad( (Dtype8)(_blockB.s2), acol2, _result ); \ + _result = mad( (Dtype8)(_blockB.s3), acol3, _result ); \ + _result = mad( (Dtype8)(_blockB.s4), acol4, _result ); \ + _result = mad( (Dtype8)(_blockB.s5), acol5, _result ); \ + _result = mad( (Dtype8)(_blockB.s6), acol6, _result ); \ + _result = mad( (Dtype8)(_blockB.s7), acol7, _result ); \ + } + +#define GEMM_NN(ALPHA1, BETA_NOT0) \ +__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \ +__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \ +__kernel void TEMPLATE(gemm_32_1_NN_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \ + __read_only image2d_t A, \ + __read_only image2d_t B, \ + MATC_PARAMETER, \ + KERNEL_ARG_DTYPE alpha_in, \ + KERNEL_ARG_DTYPE beta_in, \ + int width0, \ + int isFirstColBlock) \ +{ \ + const Dtype alpha = (Dtype)alpha_in; \ + const Dtype beta = (Dtype)beta_in; \ + const int group_x = get_group_id(0); \ + const int group_y = get_group_id(1); \ + Dtype8 blockAxB00 = 0.0f; \ + Dtype8 blockAxB01 = 0.0f; \ + Dtype8 blockAxB02 = 0.0f; \ + Dtype8 blockAxB03 = 0.0f; \ + int2 coordA = (int2)( 0, group_y * TILE_M ); \ + int2 coordB = (int2)( ( group_x * TILE_N ) * SIZE_OF_ELEMENT, 0 ); \ + do \ + { \ + int2 coordBTemp = coordB; \ + Dtype8 blockB00 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) ); coordB.y += TILE_K; \ + int2 coordATemp = coordA; \ 
+ Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \ + Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \ + Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \ + Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.x += TILE_K * SIZE_OF_ELEMENT; \ + MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00 ); \ + MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00 ); \ + MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00 ); \ + MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00 ); \ + } \ + while( coordB.y < width0 ); \ + GEMM_OUTPUT(ALPHA1, BETA_NOT0); \ +} + +GEMM_NN(1, 0) // ALPHA == 1, BETA == 0 +GEMM_NN(1, 1) // ALPHA == 1, BETA != 0 +GEMM_NN(0, 0) // ALPHA != 1, BETA == 0 +GEMM_NN(0, 1) // ALPHA != 1, BETA != 0 + +#undef TRANSPOSE_BLOCK_8 +#undef MULTIPLY_BLOCKS_8x8 +#undef GEMM_NN + +// replicate the first row to column block. +#define TRANSPOSE_BLOCK_8(_vec, _col) \ + (Dtype8)( intel_sub_group_shuffle(_vec, _col + 0), \ + intel_sub_group_shuffle(_vec, _col + 1), \ + intel_sub_group_shuffle(_vec, _col + 2), \ + intel_sub_group_shuffle(_vec, _col + 3), \ + intel_sub_group_shuffle(_vec, _col + 4), \ + intel_sub_group_shuffle(_vec, _col + 5), \ + intel_sub_group_shuffle(_vec, _col + 6), \ + intel_sub_group_shuffle(_vec, _col + 7) ) + +#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB, _col ) \ + { \ + _result = mad( (Dtype8)(_blockB.s0), TRANSPOSE_BLOCK_8(_blockA.s0, _col), _result ); \ + _result = mad( (Dtype8)(_blockB.s1), TRANSPOSE_BLOCK_8(_blockA.s1, _col), _result ); \ + _result = mad( (Dtype8)(_blockB.s2), TRANSPOSE_BLOCK_8(_blockA.s2, _col), _result ); \ + _result = mad( (Dtype8)(_blockB.s3), TRANSPOSE_BLOCK_8(_blockA.s3, _col), _result ); \ + _result = mad( (Dtype8)(_blockB.s4), TRANSPOSE_BLOCK_8(_blockA.s4, _col), _result ); \ + _result = mad( (Dtype8)(_blockB.s5), TRANSPOSE_BLOCK_8(_blockA.s5, _col), _result 
); \ + _result = mad( (Dtype8)(_blockB.s6), TRANSPOSE_BLOCK_8(_blockA.s6, _col), _result ); \ + _result = mad( (Dtype8)(_blockB.s7), TRANSPOSE_BLOCK_8(_blockA.s7, _col), _result ); \ + } + +#define GEMM_TN(ALPHA1, BETA_NOT0) \ +__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \ +__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \ +__kernel void TEMPLATE(gemm_32_1_TN_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \ + __read_only image2d_t A, \ + __read_only image2d_t B, \ + MATC_PARAMETER, \ + KERNEL_ARG_DTYPE alpha_in, \ + KERNEL_ARG_DTYPE beta_in, \ + int width0, \ + int isFirstColBlock) \ +{ \ + const Dtype alpha = (Dtype)alpha_in; \ + const Dtype beta = (Dtype)beta_in; \ + const int group_x = get_group_id(0);\ + const int group_y = get_group_id(1);\ + Dtype8 blockAxB00 = 0.0f;\ + Dtype8 blockAxB01 = 0.0f;\ + Dtype8 blockAxB02 = 0.0f;\ + Dtype8 blockAxB03 = 0.0f;\ + int2 coordA = (int2)( group_y * TILE_M * SIZE_OF_ELEMENT, 0 );\ + int2 coordB = (int2)( ( group_x * TILE_N ) * SIZE_OF_ELEMENT, 0 );\ + do\ + {\ + int2 coordBTemp = coordB;\ + Dtype8 blockB00 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) ); coordB.y += TILE_K;\ + int2 coordATemp = coordA;\ + Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 8 * SIZE_OF_ELEMENT;\ + Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 8 * SIZE_OF_ELEMENT;\ + Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 8 * SIZE_OF_ELEMENT;\ + Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.y += TILE_K;\ + MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, 0 ); \ + MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00, 0 ); \ + MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00, 0 ); \ + MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00, 0 ); \ + } \ + while( coordB.y < width0 ); \ + GEMM_OUTPUT(ALPHA1, BETA_NOT0); \ +} + +GEMM_TN(1, 0) // ALPHA == 1, BETA == 0 +GEMM_TN(1, 1) // ALPHA 
== 1, BETA != 0 +GEMM_TN(0, 0) // ALPHA != 1, BETA == 0 +GEMM_TN(0, 1) // ALPHA != 1, BETA != 0 + +#undef MULTIPLY_BLOCKS_8x8 +#undef TRANSPOSE_BLOCK_8 +#undef GEMM_TN + +// The same as GEMM_NN +#define TRANSPOSE_BLOCK_8( _block, _col ) \ + (Dtype8)( intel_sub_group_shuffle( _block.s0, _col), \ + intel_sub_group_shuffle( _block.s1, _col), \ + intel_sub_group_shuffle( _block.s2, _col), \ + intel_sub_group_shuffle( _block.s3, _col), \ + intel_sub_group_shuffle( _block.s4, _col), \ + intel_sub_group_shuffle( _block.s5, _col), \ + intel_sub_group_shuffle( _block.s6, _col), \ + intel_sub_group_shuffle( _block.s7, _col) ) + +#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB ) \ + { \ + const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \ + const Dtype8 acol1 = TRANSPOSE_BLOCK_8( _blockA, 1 ); \ + const Dtype8 acol2 = TRANSPOSE_BLOCK_8( _blockA, 2 ); \ + const Dtype8 acol3 = TRANSPOSE_BLOCK_8( _blockA, 3 ); \ + const Dtype8 acol4 = TRANSPOSE_BLOCK_8( _blockA, 4 ); \ + const Dtype8 acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 ); \ + const Dtype8 acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 ); \ + const Dtype8 acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 ); \ + _result = mad( (Dtype8)_blockB.s0, acol0, _result ); \ + _result = mad( (Dtype8)_blockB.s1, acol1, _result ); \ + _result = mad( (Dtype8)_blockB.s2, acol2, _result ); \ + _result = mad( (Dtype8)_blockB.s3, acol3, _result ); \ + _result = mad( (Dtype8)_blockB.s4, acol4, _result ); \ + _result = mad( (Dtype8)_blockB.s5, acol5, _result ); \ + _result = mad( (Dtype8)_blockB.s6, acol6, _result ); \ + _result = mad( (Dtype8)_blockB.s7, acol7, _result ); \ + } + +#define GEMM_NT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \ +__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \ +__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \ +__kernel void TEMPLATE(gemm_32_1_NT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \ + __read_only image2d_t A, \ + MATB_PARAMETER, \ + MATC_PARAMETER, \ + KERNEL_ARG_DTYPE alpha_in, \ + 
KERNEL_ARG_DTYPE beta_in, \ + int padded_k, \ + int k, \ + int isFirstColBlock) \ +{ \ + const Dtype alpha = (Dtype)alpha_in; \ + const Dtype beta = (Dtype)beta_in; \ + const int group_x = get_group_id(0); \ + const int group_y = get_group_id(1); \ + Dtype8 blockAxB00 = 0.0f; \ + Dtype8 blockAxB01 = 0.0f; \ + Dtype8 blockAxB02 = 0.0f; \ + Dtype8 blockAxB03 = 0.0f; \ + int2 coordA = (int2)( 0, group_y * TILE_M ); \ + int2 coordB = (int2)( 0, ( group_x * TILE_N )); \ + const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \ + do \ + { \ + Dtype8 blockB00; \ + BLOCKB_READ8(blockB00, B, coordB); \ + int2 coordATemp = coordA; \ + Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \ + Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \ + Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \ + Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.x += TILE_K * SIZE_OF_ELEMENT; \ + MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00 ); \ + MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00 ); \ + MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00 ); \ + MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00 ); \ + } \ + while( coordB.x < padded_k / VECSIZE ); \ + GEMM_OUTPUT(ALPHA1, BETA_NOT0); \ +} + +#define BLOCKB_READ8(_blockb, _B, _coordB) \ + int2 _coordBTemp = _coordB; \ + _coordBTemp.y += get_local_id(0); \ + _blockb.s0123 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s4567 = READ_IMAGE(_B, _coordBTemp); _coordB.x += 2; + +#define MATB_PARAMETER __read_only image2d_t B + +GEMM_NT(1, 0, VEC4, 4) // ALPHA == 1, BETA == 0 +GEMM_NT(1, 1, VEC4, 4) // ALPHA == 1, BETA != 0 +GEMM_NT(0, 0, VEC4, 4) // ALPHA != 1, BETA == 0 +GEMM_NT(0, 1, VEC4, 4) // ALPHA != 1, BETA != 0 +#undef BLOCKB_READ8 +#undef MATB_PARAMETER + +#define BLOCKB_READ8(_blockb, _B, _coordB) \ + int2 _coordBTemp = 
_coordB; \ + _coordBTemp.y += get_local_id(0); \ + const __global Dtype *B_read = (__global Dtype *)(_B + (_coordBTemp.y * ldb) + _coordBTemp.x + offB); \ + _blockb = vload8(0, B_read); \ + _coordB.x += TILE_K; + +#define MATB_PARAMETER __global Dtype *B, int offB, int ldb + +GEMM_NT(1, 0, BUFFER, 1) // ALPHA == 1, BETA == 0 +GEMM_NT(1, 1, BUFFER, 1) // ALPHA == 1, BETA != 0 +GEMM_NT(0, 0, BUFFER, 1) // ALPHA != 1, BETA == 0 +GEMM_NT(0, 1, BUFFER, 1) // ALPHA != 1, BETA != 0 +#undef BLOCKB_READ8 +#undef MATB_PARAMETER + +#define BLOCKB_READ8(_blockb, _B, _coordB) \ + int2 _coordBTemp = _coordB; \ + _coordBTemp.y += get_local_id(0); \ + Dtype4 temp; \ + temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s0 = temp.s0; \ + temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s1 = temp.s0; \ + temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s2 = temp.s0; \ + temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s3 = temp.s0; \ + temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s4 = temp.s0; \ + temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s5 = temp.s0; \ + temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s6 = temp.s0; \ + temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s7 = temp.s0; \ + _coordB.x += 8; + +#define MATB_PARAMETER __read_only image2d_t B + +GEMM_NT(1, 0, SCALAR, 1) // ALPHA == 1, BETA == 0 +GEMM_NT(1, 1, SCALAR, 1) // ALPHA == 1, BETA != 0 +GEMM_NT(0, 0, SCALAR, 1) // ALPHA != 1, BETA == 0 +GEMM_NT(0, 1, SCALAR, 1) // ALPHA != 1, BETA != 0 +#undef BLOCKB_READ8 +#undef MATB_PARAMETER + +#undef MULTIPLY_BLOCKS_8x8 +#undef TRANSPOSE_BLOCK_8 +#undef GEMM_NT + +//The same as GEMM_TN. 
+#define TRANSPOSE_BLOCK_8(_vec, _col) \ + (Dtype8)( intel_sub_group_shuffle(_vec, _col + 0), \ + intel_sub_group_shuffle(_vec, _col + 1), \ + intel_sub_group_shuffle(_vec, _col + 2), \ + intel_sub_group_shuffle(_vec, _col + 3), \ + intel_sub_group_shuffle(_vec, _col + 4), \ + intel_sub_group_shuffle(_vec, _col + 5), \ + intel_sub_group_shuffle(_vec, _col + 6), \ + intel_sub_group_shuffle(_vec, _col + 7) ); + +#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB, _col ) \ + { \ + const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA.s0, _col ); \ + const Dtype8 acol1 = TRANSPOSE_BLOCK_8( _blockA.s1, _col ); \ + const Dtype8 acol2 = TRANSPOSE_BLOCK_8( _blockA.s2, _col ); \ + const Dtype8 acol3 = TRANSPOSE_BLOCK_8( _blockA.s3, _col ); \ + const Dtype8 acol4 = TRANSPOSE_BLOCK_8( _blockA.s4, _col ); \ + const Dtype8 acol5 = TRANSPOSE_BLOCK_8( _blockA.s5, _col ); \ + const Dtype8 acol6 = TRANSPOSE_BLOCK_8( _blockA.s6, _col ); \ + const Dtype8 acol7 = TRANSPOSE_BLOCK_8( _blockA.s7, _col ); \ + _result = mad( (Dtype8)_blockB.s0, acol0, _result ); \ + _result = mad( (Dtype8)_blockB.s1, acol1, _result ); \ + _result = mad( (Dtype8)_blockB.s2, acol2, _result ); \ + _result = mad( (Dtype8)_blockB.s3, acol3, _result ); \ + _result = mad( (Dtype8)_blockB.s4, acol4, _result ); \ + _result = mad( (Dtype8)_blockB.s5, acol5, _result ); \ + _result = mad( (Dtype8)_blockB.s6, acol6, _result ); \ + _result = mad( (Dtype8)_blockB.s7, acol7, _result ); \ + } + +#define GEMM_TT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \ +__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \ +__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \ +__kernel void TEMPLATE(gemm_32_1_TT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \ + __read_only image2d_t A, \ + MATB_PARAMETER, \ + MATC_PARAMETER, \ + KERNEL_ARG_DTYPE alpha_in, \ + KERNEL_ARG_DTYPE beta_in, \ + int padded_k, \ + int k, \ + int isFirstColBlock) \ +{ \ + const Dtype alpha = (Dtype)alpha_in; \ + const Dtype beta = (Dtype)beta_in; 
\ + const int group_x = get_group_id(0); \ + const int group_y = get_group_id(1); \ + Dtype8 blockAxB00 = 0.0f; \ + Dtype8 blockAxB01 = 0.0f; \ + Dtype8 blockAxB02 = 0.0f; \ + Dtype8 blockAxB03 = 0.0f; \ + int2 coordA = (int2)( group_y * TILE_M * SIZE_OF_ELEMENT, 0 ); \ + int2 coordB = (int2)( 0, ( group_x * TILE_N )); \ + const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \ + do \ + { \ + Dtype8 blockB00; \ + BLOCKB_READ8(blockB00, B, coordB); \ + int2 coordATemp = coordA; \ + Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 8 * SIZE_OF_ELEMENT; \ + Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 8 * SIZE_OF_ELEMENT; \ + Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 8 * SIZE_OF_ELEMENT; \ + Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.y += TILE_K; \ + MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00 , blockB00, 0 ); \ + MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01 , blockB00, 0 ); \ + MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02 , blockB00, 0 ); \ + MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03 , blockB00, 0 ); \ + } \ + while( coordB.x < padded_k / VECSIZE ); \ + GEMM_OUTPUT(ALPHA1, BETA_NOT0);\ +} + +#define BLOCKB_READ8(_blockb, _B, _coordB) \ + int2 _coordBTemp = _coordB; \ + _coordBTemp.y += get_local_id(0); \ + _blockb.s0123 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s4567 = READ_IMAGE(_B, _coordBTemp); _coordB.x += 2; + +#define MATB_PARAMETER __read_only image2d_t B + +GEMM_TT(1, 0, VEC4, 4) // ALPHA == 1, BETA == 0 +GEMM_TT(1, 1, VEC4, 4) // ALPHA == 1, BETA != 0 +GEMM_TT(0, 0, VEC4, 4) // ALPHA != 1, BETA == 0 +GEMM_TT(0, 1, VEC4, 4) // ALPHA != 1, BETA != 0 +#undef BLOCKB_READ8 +#undef MATB_PARAMETER + +#define BLOCKB_READ8(_blockb, _B, _coordB) \ + int2 _coordBTemp = _coordB; \ + _coordBTemp.y += get_local_id(0); \ + const __global Dtype *B_read = (__global 
Dtype *)(_B + (_coordBTemp.y * k) + _coordBTemp.x + offB); \ + _blockb = vload8(0, B_read); \ + _coordB.x += TILE_K; + +#define MATB_PARAMETER __global Dtype *B, int offB, int ldb + +GEMM_TT(1, 0, BUFFER, 1) // ALPHA == 1, BETA == 0 +GEMM_TT(1, 1, BUFFER, 1) // ALPHA == 1, BETA != 0 +GEMM_TT(0, 0, BUFFER, 1) // ALPHA != 1, BETA == 0 +GEMM_TT(0, 1, BUFFER, 1) // ALPHA != 1, BETA != 0 +#undef BLOCKB_READ8 +#undef MATB_PARAMETER + +#define BLOCKB_READ8(_blockb, _B, _coordB) \ + int2 _coordBTemp = _coordB; \ + _coordBTemp.y += get_local_id(0); \ + Dtype4 temp; \ + temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s0 = temp.s0; \ + temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s1 = temp.s0; \ + temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s2 = temp.s0; \ + temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s3 = temp.s0; \ + temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s4 = temp.s0; \ + temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s5 = temp.s0; \ + temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s6 = temp.s0; \ + temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s7 = temp.s0; \ + _coordB.x += 8; + +#define MATB_PARAMETER __read_only image2d_t B + +GEMM_TT(1, 0, SCALAR, 1) // ALPHA == 1, BETA == 0 +GEMM_TT(1, 1, SCALAR, 1) // ALPHA == 1, BETA != 0 +GEMM_TT(0, 0, SCALAR, 1) // ALPHA != 1, BETA == 0 +GEMM_TT(0, 1, SCALAR, 1) // ALPHA != 1, BETA != 0 +#undef BLOCKB_READ8 +#undef MATB_PARAMETER + +#undef MULTIPLY_BLOCKS_8x8 +#undef TRANSPOSE_BLOCK_8 +#undef GEMM_TT + +#undef TILE_M +#undef TILE_K +#undef TILE_N +#undef SUBGROUP_BLOCK_READ8 +#undef READ_IMAGE +#undef SIZE_OF_ELEMENT + +__kernel void TEMPLATE(gemm_buffer_copy_image_transpose,Dtype)( + __global Dtype* A, + __write_only image2d_t ImA, + int offA, + int width, + int height, + int ldA) +{ + const int gidx = get_global_id(0); + const int gidy = get_global_id(1); 
+ int2 coord_dst = (int2)(gidx, gidy); + __global Dtype* A_off = A + offA; + Dtype srcA = A_off[gidy * ldA + gidx]; + write_imagef(ImA, coord_dst, (Dtype4)srcA); +} + +__kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose,Dtype)( + __global Dtype* A, + __write_only image2d_t ImA, + int offA, + int width, + int height, + int ldA) +{ + const int gidx = get_global_id(0); + const int gidy = get_global_id(1); + int2 coord_dst = (int2)(gidx, gidy); + if (gidx >= width || gidy >= height) { + write_imageui(ImA, coord_dst, (uint4)0); + return; + } + __global Dtype* A_off = A + offA; + uint4 srcA = convert_uint4(as_uchar4(A_off[gidy * ldA + gidx])); + write_imageui(ImA, coord_dst, srcA); +} diff --git a/modules/dnn/src/opencl/math.cl b/modules/dnn/src/opencl/math.cl new file mode 100644 index 0000000000..b8f4eff010 --- /dev/null +++ b/modules/dnn/src/opencl/math.cl @@ -0,0 +1,55 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#define CONCAT(A,B) A##_##B +#define TEMPLATE(name,type) CONCAT(name,type) +#define Dtype float + +__kernel void TEMPLATE(axpy,Dtype)(const int n, const Dtype alpha, __global const Dtype* x, + const int offx, __global Dtype* y, + const int offy) { + for (int index = get_global_id(0); index < n; index += get_global_size(0)) { + Dtype src = x[offx + index]; + Dtype dst = y[offy + index]; + y[offy + index] = alpha * src + dst; + } +} diff --git a/modules/dnn/src/opencl/matvec_mul.cl b/modules/dnn/src/opencl/matvec_mul.cl new file mode 100644 index 0000000000..0dabd62c54 --- /dev/null +++ b/modules/dnn/src/opencl/matvec_mul.cl @@ -0,0 +1,191 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. 
+// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#define CONCAT(A,B) A##_##B +#define TEMPLATE(name,type) CONCAT(name,type) +#define Dtype float + +__kernel void TEMPLATE(matvec_mul4,Dtype)( + __global const float * A, + int offA, + unsigned int A_col_size, + unsigned int trail_item, + __global const float * v, + int offv, + float alpha, + float beta, + __global float4 * result, + int offr, + __local float4 * work) +{ + unsigned int row_gid = get_group_id(0); + unsigned int lid = get_local_id(0); + const __global float *src0_read = A + row_gid * 4 * A_col_size + offA; + const __global float *src1_read = v + offv; + result = (__global float4*)((__global float*)result + offr); + float4 dot0 = (float4)(0.f); + float4 dot1 = (float4)(0.f); + float4 dot2 = (float4)(0.f); + float4 dot3 = (float4)(0.f); + + unsigned int i = lid; + while( i < A_col_size / 4) { + const float4 a0 = vload4(i, src0_read); + const float4 a1 = vload4(i, src0_read + A_col_size); + const float4 a2 = vload4(i, src0_read + 2 * A_col_size); + const float4 a3 = vload4(i, src0_read + 3 * A_col_size); + + const float4 b0 = vload4(i, src1_read); + + dot0 += a0 * b0; + dot1 += a1 * b0; + dot2 += a2 * b0; + dot3 += a3 * b0; + + i += get_local_size(0); + } 
+ + work[lid].s0 = dot0.x + dot0.y + dot0.z + dot0.w; + work[lid].s1 = dot1.x + dot1.y + dot1.z + dot1.w; + work[lid].s2 = dot2.x + dot2.y + dot2.z + dot2.w; + work[lid].s3 = dot3.x + dot3.y + dot3.z + dot3.w; + + if(i == A_col_size / 4) + { + if(trail_item != 0) + { + const __global float *src0_trail = src0_read + i * 4; + const __global float *src1_trail = src1_read + i * 4; + for(unsigned int i = 0; i < trail_item; ++i) { + const float at0 = src0_trail[i]; + const float at1 = src0_trail[i + A_col_size]; + const float at2 = src0_trail[i + 2 * A_col_size]; + const float at3 = src0_trail[i + 3 * A_col_size]; + + const float bt = src1_trail[i]; + + work[lid].s0 += at0 * bt; + work[lid].s1 += at1 * bt; + work[lid].s2 += at2 * bt; + work[lid].s3 += at3 * bt; + } + } + + } + + for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1) { + barrier(CLK_LOCAL_MEM_FENCE); + if(lid < stride) + work[lid] += work[lid+stride]; + } + if(lid == 0) { + if(beta == (Dtype)0) + result[row_gid] = alpha * work[0]; + else + result[row_gid] = alpha * work[0] + beta * result[row_gid]; + } +} + +/* This kernel used for the trailing rows when row_of_A %4 !=0 */ +__kernel void TEMPLATE(matvec_mul1,Dtype)( + __global const float * A, + int offA, + unsigned int A_col_size, + unsigned int row_offset, + unsigned int trail_item, + __global const float * v, + int offv, + float alpha, + float beta, + __global float * result, + int offr, + __local float * work) +{ + unsigned int row_gid = get_group_id(0); + unsigned int lid = get_local_id(0); + + const __global float *src0_read = A + (row_offset + row_gid) * A_col_size + offA; + const __global float *src1_read = v + + offv; + result = result + offr; + float4 dot0 = (float4)(0.f); + + unsigned int i = lid; + while( i < A_col_size / 4) + { + const float4 a0 = vload4(i, src0_read); + const float4 b0 = vload4(i, src1_read); + + dot0 += a0 * b0; + i += get_local_size(0); + } + + work[lid] = dot0.x + dot0.y + dot0.z + dot0.w; + + if(i == 
A_col_size / 4) + { + if(trail_item != 0) + { + const __global float *src0_trail = src0_read + i * 4; + const __global float *src1_trail = src1_read + i * 4; + for(unsigned int i = 0; i < trail_item; ++i) { + const float at0 = src0_trail[i]; + const float bt = src1_trail[i]; + + work[lid] += at0 * bt; + } + } + + } + for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1) { + barrier(CLK_LOCAL_MEM_FENCE); + if(lid < stride) + work[lid] += work[lid+stride]; + } + + if(lid == 0) { + if(beta == (Dtype)0) { + result[row_gid+row_offset] = alpha * work[0]; + } else { + result[row_gid+row_offset] *= beta; + result[row_gid+row_offset] += alpha * work[0]; + } + } +} diff --git a/modules/dnn/src/opencl/ocl4dnn_lrn.cl b/modules/dnn/src/opencl/ocl4dnn_lrn.cl new file mode 100644 index 0000000000..58477cef0c --- /dev/null +++ b/modules/dnn/src/opencl/ocl4dnn_lrn.cl @@ -0,0 +1,96 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#define CONCAT(A,B) A##_##B +#define TEMPLATE(name,type) CONCAT(name,type) +#define Dtype float + +__kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global const Dtype* in, + const int num, const int channels, + const int height, const int width, const int size, + const Dtype alpha_over_size, const Dtype k, + __global Dtype* const out, + const Dtype negative_beta) { + for (int index = get_global_id(0); index < nthreads; + index += get_global_size(0)) { + // find out the local offset + const int w = index % width; + const int h = (index / width) % height; + const int n = index / width / height; + const int offset = (n * channels * height + h) * width + w; + const int step = height * width; + __global const Dtype* in_off = in + offset; + __global Dtype* out_off = out + offset; + Dtype scale_val; + int head = 0; + const int pre_pad = (size - 1) / 2; + const int post_pad = size - pre_pad - 1; + Dtype accum_scale = 0; + // fill the scale at [n, :, h, w] + // accumulate values + while (head < post_pad && head < channels) { + accum_scale += in_off[head * step] * in_off[head * step]; + ++head; + } + // both add and subtract + while (head < channels) { + accum_scale += in_off[head * step] * in_off[head * step]; + if (head - size >= 0) { + accum_scale -= in_off[(head - size) * step] + * in_off[(head - size) * step]; + } + scale_val = k + accum_scale * alpha_over_size; + out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((float)scale_val, (float)negative_beta); + ++head; + } + // subtract only + while (head < channels + post_pad) { + if (head - size >= 0) { + accum_scale -= in_off[(head - size) * step] + * in_off[(head - size) * step]; + } + scale_val = k + accum_scale * alpha_over_size; + out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((float)scale_val, (float)negative_beta); + ++head; + } + } +} diff --git a/modules/dnn/src/opencl/ocl4dnn_pooling.cl 
b/modules/dnn/src/opencl/ocl4dnn_pooling.cl new file mode 100644 index 0000000000..326d5bc0d6 --- /dev/null +++ b/modules/dnn/src/opencl/ocl4dnn_pooling.cl @@ -0,0 +1,177 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#define CONCAT(A,B) A##_##B +#define TEMPLATE(name,type) CONCAT(name,type) +#define Dtype float + +void TEMPLATE(max_pool_forward_impl, Dtype)( + const int nthreads, __global const Dtype* bottom_data, const int num, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, const int pad_h, + const int pad_w, + __global Dtype* top_data, + const int use_mask, __global int* mask, __global Dtype* top_mask, bool no_mask) +{ + for (int index = get_global_id(0); index < nthreads; + index += get_global_size(0)) + { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int c = (index / pooled_width / pooled_height) % channels; + const int n = index / pooled_width / pooled_height / channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + const int hend = min(hstart + kernel_h, height); + const int wend = min(wstart + kernel_w, width); + hstart = max(hstart, (int)0); + wstart = max(wstart, (int)0); + Dtype maxval = -FLT_MAX; + int maxidx = -1; + __global const Dtype* bottom_slice = bottom_data + + (n * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + if (bottom_slice[h * width + w] > maxval) { + maxidx = h * width + w; + maxval = bottom_slice[maxidx]; + } 
+ } + } + top_data[index] = maxval; + if (!no_mask) { + if (use_mask == 1) { + mask[index] = maxidx; + } else { + top_mask[index] = maxidx; + } + } + } +} + +__kernel void TEMPLATE(max_pool_forward, Dtype)( + const int nthreads, __global const Dtype* bottom_data, const int num, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, const int pad_h, + const int pad_w, + __global Dtype* top_data, + const int use_mask, __global int* mask, __global Dtype* top_mask) +{ + TEMPLATE(max_pool_forward_impl, Dtype)( + nthreads, bottom_data, num, channels, height, width, + pooled_height, pooled_width, kernel_h, + kernel_w, stride_h, stride_w, pad_h, pad_w, top_data, use_mask, mask, top_mask, false + ); +} + +__kernel void TEMPLATE(ave_pool_forward, Dtype)( + const int nthreads, __global const Dtype* const bottom_data, const int num, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, const int pad_h, + const int pad_w, __global Dtype* top_data) +{ + for (int index = get_global_id(0); index < nthreads; + index += get_global_size(0)) + { + { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int c = (index / pooled_width / pooled_height) % channels; + const int n = index / pooled_width / pooled_height / channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = min(hstart + kernel_h, height + pad_h); + int wend = min(wstart + kernel_w, width + pad_w); + const int pool_size = (hend - hstart) * (wend - wstart); + hstart = max(hstart, (int)0); + wstart = max(wstart, (int)0); + hend = min(hend, height); + wend = min(wend, width); + Dtype aveval = 0; + __global const Dtype* bottom_slice = bottom_data + + (n * channels + c) * 
height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + aveval += bottom_slice[h * width + w]; + } + } + top_data[index] = aveval / pool_size; + } + } +} + +__kernel void TEMPLATE(sto_pool_forward_test,Dtype)( + const int nthreads, __global const Dtype* const bottom_data, const int num, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, + __global Dtype* top_data) +{ + for (int index = get_global_id(0); index < nthreads; + index += get_global_size(0)) + { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int c = (index / pooled_width / pooled_height) % channels; + const int n = index / pooled_width / pooled_height / channels; + const int hstart = ph * stride_h; + const int hend = min(hstart + kernel_h, height); + const int wstart = pw * stride_w; + const int wend = min(wstart + kernel_w, width); + // We set cumsum to be 0 to avoid divide-by-zero problems + Dtype cumsum = FLT_MIN; + Dtype cumvalues = 0.; + __global const Dtype* bottom_slice = bottom_data + + (n * channels + c) * height * width; + // First pass: get sum + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + cumsum += bottom_slice[h * width + w]; + cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w]; + } + } + top_data[index] = cumvalues / cumsum; + } +} diff --git a/modules/dnn/src/opencl/softmax.cl b/modules/dnn/src/opencl/softmax.cl index e9fcadce39..54cf489501 100644 --- a/modules/dnn/src/opencl/softmax.cl +++ b/modules/dnn/src/opencl/softmax.cl @@ -70,6 +70,10 @@ __kernel void kernel_channel_div(const int count, if(index < count) { int n = index / channels / spatial_dim; int s = index % spatial_dim; - data[index] /= channel_sum[n * spatial_dim + s]; + T v = data[index] / channel_sum[n * spatial_dim + s]; +#ifdef 
LOG_SOFTMAX + v = log(v); +#endif + data[index] = v; } -} \ No newline at end of file +} diff --git a/modules/dnn/src/opencl/softmax_loss.cl b/modules/dnn/src/opencl/softmax_loss.cl new file mode 100644 index 0000000000..d30b32bc69 --- /dev/null +++ b/modules/dnn/src/opencl/softmax_loss.cl @@ -0,0 +1,182 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#define CONCAT(A,B) A##_##B +#define TEMPLATE(name,type) CONCAT(name,type) +#define Dtype float + +#if defined(cl_intel_subgroups) +#pragma OPENCL EXTENSION cl_intel_subgroups : enable +#endif + +__kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int channels, + const int spatial_dim, + __global Dtype* scale, + __global const Dtype* data, + __global Dtype* out, + __local Dtype *out_tmp, + __local Dtype *scale_tmp, + __local Dtype *group_tmp) { + + int n = get_global_id(1); + for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index += + get_global_size(0), ++s) { + float maxval = -FLT_MAX; + for (int c = get_global_id(0); c < channels; c += get_global_size(0)) { + Dtype tmp = data[(n * channels + c) * spatial_dim + s]; + maxval = max((Dtype)tmp, (Dtype)maxval); + } + maxval = sub_group_reduce_max(maxval * 100000); + //if (get_sub_group_local_id() == 0) + group_tmp[get_sub_group_id() * spatial_dim + s] = maxval; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + for (int index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index += + get_global_size(0)) { + int s = index / get_max_sub_group_size(); + Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]); + //if (get_sub_group_local_id() == 0) + scale_tmp[s] = maxval / 100000; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + for (int index = get_global_id(0); index < channels * 
spatial_dim; + index += get_global_size(0)) { + int s = index % spatial_dim; + out_tmp[index] = exp(data[n * channels * spatial_dim + index] - scale_tmp[s]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index += + get_global_size(0), ++s) { + Dtype sum = 0; + for (int c = get_global_id(0); c < channels; c += get_global_size(0)) { + sum += out_tmp[c * spatial_dim + s]; + } + sum = sub_group_reduce_add(sum * 100000); + group_tmp[get_sub_group_id() * spatial_dim + s] = sum; + } + barrier(CLK_LOCAL_MEM_FENCE); + + for (int index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index += + get_global_size(0)) { + int s = index / get_max_sub_group_size(); + Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]); + //if (get_sub_group_local_id() == 0) + scale_tmp[s] = sum / 100000; + } + barrier(CLK_LOCAL_MEM_FENCE); + + for (int index = get_global_id(0); index < channels * spatial_dim; + index += get_global_size(0)) { + int s = index % spatial_dim; + out[n * channels * spatial_dim + index] = out_tmp[index] / scale_tmp[s]; + } +} + +__kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels, + const int spatial_dim, + __global Dtype* scale, + __global const Dtype* data, + __global Dtype* out) { + + int n = get_global_id(1); + __global Dtype *group_tmp = scale + spatial_dim * num + n * get_max_sub_group_size() * spatial_dim; + for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index += + get_global_size(0), ++s) { + float maxval = -FLT_MAX; + for (int c = get_global_id(0); c < channels; c += get_global_size(0)) { + Dtype tmp = data[(n * channels + c) * spatial_dim + s]; + maxval = max((Dtype)tmp, (Dtype)maxval); + } + maxval = sub_group_reduce_max(maxval * 100000); + //if (get_sub_group_local_id() == 0) + group_tmp[get_sub_group_id() * spatial_dim + s] = maxval; + } + 
barrier(CLK_GLOBAL_MEM_FENCE); + + for (int index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index += + get_global_size(0)) { + int s = index / get_max_sub_group_size(); + Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]); + //if (get_sub_group_local_id() == 0) + scale[n * spatial_dim + s] = maxval / 100000; + } + + barrier(CLK_GLOBAL_MEM_FENCE); + + for (int index = get_global_id(0); index < channels * spatial_dim; + index += get_global_size(0)) { + int s = index % spatial_dim; + out[n * channels * spatial_dim + index] = exp(data[n * channels * spatial_dim + index] - scale[n * spatial_dim + s]); + } + barrier(CLK_GLOBAL_MEM_FENCE); + + for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index += + get_global_size(0), ++s) { + Dtype sum = 0; + for (int c = get_global_id(0); c < channels; c += get_global_size(0)) { + sum += out[n * channels * spatial_dim + c * spatial_dim + s]; + } + sum = sub_group_reduce_add(sum * 100000); + group_tmp[get_sub_group_id() * spatial_dim + s] = sum; + } + barrier(CLK_GLOBAL_MEM_FENCE); + + for (int index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index += + get_global_size(0)) { + int s = index / get_max_sub_group_size(); + Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]); + //if (get_sub_group_local_id() == 0) + scale[n * spatial_dim + s] = sum / 100000; + } + barrier(CLK_GLOBAL_MEM_FENCE); + + for (int index = get_global_id(0); index < channels * spatial_dim; + index += get_global_size(0)) { + int s = index % spatial_dim; + out[n * channels * spatial_dim + index] /= scale[n * spatial_dim + s]; + } +} diff --git a/modules/dnn/src/precomp.hpp b/modules/dnn/src/precomp.hpp index 9383a08e33..e860598283 100644 --- a/modules/dnn/src/precomp.hpp +++ b/modules/dnn/src/precomp.hpp @@ -40,6 +40,8 @@ //M*/ #include +#include +#include #include #include // int32_t (MSVS 2010-2013) 
#include "cvconfig.h" diff --git a/modules/dnn/test/test_googlenet.cpp b/modules/dnn/test/test_googlenet.cpp index c83c1a063a..1bd3e51ef4 100644 --- a/modules/dnn/test/test_googlenet.cpp +++ b/modules/dnn/test/test_googlenet.cpp @@ -73,6 +73,26 @@ TEST(Reproducibility_GoogLeNet, Accuracy) normAssert(out, ref); } +OCL_TEST(Reproducibility_GoogLeNet, Accuracy) +{ + Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt", false), + findDataFile("dnn/bvlc_googlenet.caffemodel", false)); + + net.setPreferableBackend(DNN_BACKEND_DEFAULT); + net.setPreferableTarget(DNN_TARGET_OPENCL); + + std::vector inpMats; + inpMats.push_back( imread(_tf("googlenet_0.png")) ); + inpMats.push_back( imread(_tf("googlenet_1.png")) ); + ASSERT_TRUE(!inpMats[0].empty() && !inpMats[1].empty()); + + net.setInput(blobFromImages(inpMats, 1.0f, Size(), Scalar(), false), "data"); + Mat out = net.forward("prob"); + + Mat ref = blobFromNPY(_tf("googlenet_prob.npy")); + normAssert(out, ref); +} + TEST(IntermediateBlobs_GoogLeNet, Accuracy) { Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt", false), @@ -99,6 +119,35 @@ TEST(IntermediateBlobs_GoogLeNet, Accuracy) } } +OCL_TEST(IntermediateBlobs_GoogLeNet, Accuracy) +{ + Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt", false), + findDataFile("dnn/bvlc_googlenet.caffemodel", false)); + + net.setPreferableBackend(DNN_BACKEND_DEFAULT); + net.setPreferableTarget(DNN_TARGET_OPENCL); + + std::vector blobsNames; + blobsNames.push_back("conv1/7x7_s2"); + blobsNames.push_back("conv1/relu_7x7"); + blobsNames.push_back("inception_4c/1x1"); + blobsNames.push_back("inception_4c/relu_1x1"); + std::vector outs; + Mat in = blobFromImage(imread(_tf("googlenet_0.png")), 1.0f, Size(), Scalar(), false); + net.setInput(in, "data"); + net.forward(outs, blobsNames); + CV_Assert(outs.size() == blobsNames.size()); + + for (int i = 0; i < blobsNames.size(); i++) + { + std::string filename = blobsNames[i]; + std::replace( 
filename.begin(), filename.end(), '/', '#'); + Mat ref = blobFromNPY(_tf("googlenet_" + filename + ".npy")); + + normAssert(outs[i], ref, "", 1E-4, 1E-2); + } +} + TEST(SeveralCalls_GoogLeNet, Accuracy) { Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt", false), @@ -128,4 +177,36 @@ TEST(SeveralCalls_GoogLeNet, Accuracy) normAssert(outs[0], ref, "", 1E-4, 1E-2); } +OCL_TEST(SeveralCalls_GoogLeNet, Accuracy) +{ + Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt", false), + findDataFile("dnn/bvlc_googlenet.caffemodel", false)); + + net.setPreferableBackend(DNN_BACKEND_DEFAULT); + net.setPreferableTarget(DNN_TARGET_OPENCL); + + std::vector inpMats; + inpMats.push_back( imread(_tf("googlenet_0.png")) ); + inpMats.push_back( imread(_tf("googlenet_1.png")) ); + ASSERT_TRUE(!inpMats[0].empty() && !inpMats[1].empty()); + + net.setInput(blobFromImages(inpMats, 1.0f, Size(), Scalar(), false), "data"); + Mat out = net.forward(); + + Mat ref = blobFromNPY(_tf("googlenet_prob.npy")); + normAssert(out, ref); + + std::vector blobsNames; + blobsNames.push_back("conv1/7x7_s2"); + std::vector outs; + Mat in = blobFromImage(inpMats[0], 1.0f, Size(), Scalar(), false); + net.setInput(in, "data"); + net.forward(outs, blobsNames); + CV_Assert(outs.size() == blobsNames.size()); + + ref = blobFromNPY(_tf("googlenet_conv1#7x7_s2.npy")); + + normAssert(outs[0], ref, "", 1E-4, 1E-2); +} + } diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp index e3aeb7e9be..27c460c031 100644 --- a/modules/dnn/test/test_layers.cpp +++ b/modules/dnn/test/test_layers.cpp @@ -98,7 +98,8 @@ void runLayer(Ptr layer, std::vector &inpBlobs, std::vector &ou } -void testLayerUsingCaffeModels(String basename, bool useCaffeModel = false, bool useCommonInputBlob = true) +void testLayerUsingCaffeModels(String basename, int targetId = DNN_TARGET_CPU, + bool useCaffeModel = false, bool useCommonInputBlob = true) { String prototxt = _tf(basename + 
".prototxt"); String caffemodel = _tf(basename + ".caffemodel"); @@ -111,6 +112,9 @@ void testLayerUsingCaffeModels(String basename, bool useCaffeModel = false, bool Net net = readNetFromCaffe(prototxt, (useCaffeModel) ? caffemodel : String()); ASSERT_FALSE(net.empty()); + net.setPreferableBackend(DNN_BACKEND_DEFAULT); + net.setPreferableTarget(targetId); + Mat inp = blobFromNPY(inpfile); Mat ref = blobFromNPY(outfile); @@ -122,47 +126,82 @@ void testLayerUsingCaffeModels(String basename, bool useCaffeModel = false, bool TEST(Layer_Test_Softmax, Accuracy) { - testLayerUsingCaffeModels("layer_softmax"); + testLayerUsingCaffeModels("layer_softmax"); +} + +OCL_TEST(Layer_Test_Softmax, Accuracy) +{ + testLayerUsingCaffeModels("layer_softmax", DNN_TARGET_OPENCL); } TEST(Layer_Test_LRN_spatial, Accuracy) { - testLayerUsingCaffeModels("layer_lrn_spatial"); + testLayerUsingCaffeModels("layer_lrn_spatial"); +} + +OCL_TEST(Layer_Test_LRN_spatial, Accuracy) +{ + testLayerUsingCaffeModels("layer_lrn_spatial", DNN_TARGET_OPENCL); } TEST(Layer_Test_LRN_channels, Accuracy) { - testLayerUsingCaffeModels("layer_lrn_channels"); + testLayerUsingCaffeModels("layer_lrn_channels"); +} + +OCL_TEST(Layer_Test_LRN_channels, Accuracy) +{ + testLayerUsingCaffeModels("layer_lrn_channels", DNN_TARGET_OPENCL); } TEST(Layer_Test_Convolution, Accuracy) { - testLayerUsingCaffeModels("layer_convolution", true); + testLayerUsingCaffeModels("layer_convolution", DNN_TARGET_CPU, true); +} + +OCL_TEST(Layer_Test_Convolution, Accuracy) +{ + testLayerUsingCaffeModels("layer_convolution", DNN_TARGET_OPENCL, true); } TEST(Layer_Test_DeConvolution, Accuracy) { - testLayerUsingCaffeModels("layer_deconvolution", true, false); + testLayerUsingCaffeModels("layer_deconvolution", DNN_TARGET_CPU, true, false); } TEST(Layer_Test_InnerProduct, Accuracy) { - testLayerUsingCaffeModels("layer_inner_product", true); + testLayerUsingCaffeModels("layer_inner_product", DNN_TARGET_CPU, true); +} + 
+OCL_TEST(Layer_Test_InnerProduct, Accuracy) +{ + testLayerUsingCaffeModels("layer_inner_product", DNN_TARGET_OPENCL, true); } TEST(Layer_Test_Pooling_max, Accuracy) { - testLayerUsingCaffeModels("layer_pooling_max"); + testLayerUsingCaffeModels("layer_pooling_max"); +} + +OCL_TEST(Layer_Test_Pooling_max, Accuracy) +{ + testLayerUsingCaffeModels("layer_pooling_max", DNN_TARGET_OPENCL); } TEST(Layer_Test_Pooling_ave, Accuracy) { - testLayerUsingCaffeModels("layer_pooling_ave"); + testLayerUsingCaffeModels("layer_pooling_ave"); +} + +OCL_TEST(Layer_Test_Pooling_ave, Accuracy) +{ + testLayerUsingCaffeModels("layer_pooling_ave", DNN_TARGET_OPENCL); } TEST(Layer_Test_MVN, Accuracy) { - testLayerUsingCaffeModels("layer_mvn"); + testLayerUsingCaffeModels("layer_mvn"); } void testReshape(const MatShape& inputShape, const MatShape& targetShape, @@ -207,22 +246,32 @@ TEST(Layer_Test_Reshape, Accuracy) TEST(Layer_Test_BatchNorm, Accuracy) { - testLayerUsingCaffeModels("layer_batch_norm", true); + testLayerUsingCaffeModels("layer_batch_norm", DNN_TARGET_CPU, true); } TEST(Layer_Test_ReLU, Accuracy) { - testLayerUsingCaffeModels("layer_relu"); + testLayerUsingCaffeModels("layer_relu"); +} + +OCL_TEST(Layer_Test_ReLU, Accuracy) +{ + testLayerUsingCaffeModels("layer_relu", DNN_TARGET_OPENCL); } TEST(Layer_Test_Dropout, Accuracy) { - testLayerUsingCaffeModels("layer_dropout"); + testLayerUsingCaffeModels("layer_dropout"); } TEST(Layer_Test_Concat, Accuracy) { - testLayerUsingCaffeModels("layer_concat"); + testLayerUsingCaffeModels("layer_concat"); +} + +OCL_TEST(Layer_Test_Concat, Accuracy) +{ + testLayerUsingCaffeModels("layer_concat", DNN_TARGET_OPENCL); } //template diff --git a/modules/dnn/test/test_torch_importer.cpp b/modules/dnn/test/test_torch_importer.cpp index ec20ef077e..d83c203a28 100644 --- a/modules/dnn/test/test_torch_importer.cpp +++ b/modules/dnn/test/test_torch_importer.cpp @@ -44,6 +44,7 @@ #include "test_precomp.hpp" #include "npy_blob.hpp" #include +#include 
namespace cvtest { @@ -70,7 +71,7 @@ TEST(Torch_Importer, simple_read) ASSERT_FALSE(net.empty()); } -static void runTorchNet(String prefix, String outLayerName = "", +static void runTorchNet(String prefix, int targetId = DNN_TARGET_CPU, String outLayerName = "", bool check2ndBlob = false, bool isBinary = false) { String suffix = (isBinary) ? ".dat" : ".txt"; @@ -78,6 +79,9 @@ static void runTorchNet(String prefix, String outLayerName = "", Net net = readNetFromTorch(_tf(prefix + "_net" + suffix), isBinary); ASSERT_FALSE(net.empty()); + net.setPreferableBackend(DNN_BACKEND_DEFAULT); + net.setPreferableTarget(targetId); + Mat inp, outRef; ASSERT_NO_THROW( inp = readTorchBlob(_tf(prefix + "_input" + suffix), isBinary) ); ASSERT_NO_THROW( outRef = readTorchBlob(_tf(prefix + "_output" + suffix), isBinary) ); @@ -103,9 +107,19 @@ TEST(Torch_Importer, run_convolution) runTorchNet("net_conv"); } +OCL_TEST(Torch_Importer, run_convolution) +{ + runTorchNet("net_conv", DNN_TARGET_OPENCL); +} + TEST(Torch_Importer, run_pool_max) { - runTorchNet("net_pool_max", "", true); + runTorchNet("net_pool_max", DNN_TARGET_CPU, "", true); +} + +OCL_TEST(Torch_Importer, run_pool_max) +{ + runTorchNet("net_pool_max", DNN_TARGET_OPENCL, "", true); } TEST(Torch_Importer, run_pool_ave) @@ -113,12 +127,17 @@ TEST(Torch_Importer, run_pool_ave) runTorchNet("net_pool_ave"); } +OCL_TEST(Torch_Importer, run_pool_ave) +{ + runTorchNet("net_pool_ave", DNN_TARGET_OPENCL); +} + TEST(Torch_Importer, run_reshape) { runTorchNet("net_reshape"); runTorchNet("net_reshape_batch"); runTorchNet("net_reshape_single_sample"); - runTorchNet("net_reshape_channels", "", false, true); + runTorchNet("net_reshape_channels", DNN_TARGET_CPU, "", false, true); } TEST(Torch_Importer, run_linear) @@ -128,13 +147,19 @@ TEST(Torch_Importer, run_linear) TEST(Torch_Importer, run_paralel) { - runTorchNet("net_parallel", "l5_torchMerge"); + runTorchNet("net_parallel", DNN_TARGET_CPU, "l5_torchMerge"); } TEST(Torch_Importer, 
run_concat) { - runTorchNet("net_concat", "l5_torchMerge"); - runTorchNet("net_depth_concat", "", false, true); + runTorchNet("net_concat", DNN_TARGET_CPU, "l5_torchMerge"); + runTorchNet("net_depth_concat", DNN_TARGET_CPU, "", false, true); +} + +OCL_TEST(Torch_Importer, run_concat) +{ + runTorchNet("net_concat", DNN_TARGET_OPENCL, "l5_torchMerge"); + runTorchNet("net_depth_concat", DNN_TARGET_OPENCL, "", false, true); } TEST(Torch_Importer, run_deconv) @@ -163,37 +188,49 @@ TEST(Torch_Importer, net_softmax) runTorchNet("net_softmax_spatial"); } +OCL_TEST(Torch_Importer, net_softmax) +{ + runTorchNet("net_softmax", DNN_TARGET_OPENCL); + runTorchNet("net_softmax_spatial", DNN_TARGET_OPENCL); +} + TEST(Torch_Importer, net_logsoftmax) { runTorchNet("net_logsoftmax"); runTorchNet("net_logsoftmax_spatial"); } +OCL_TEST(Torch_Importer, net_logsoftmax) +{ + runTorchNet("net_logsoftmax", DNN_TARGET_OPENCL); + runTorchNet("net_logsoftmax_spatial", DNN_TARGET_OPENCL); +} + TEST(Torch_Importer, net_lp_pooling) { - runTorchNet("net_lp_pooling_square", "", false, true); - runTorchNet("net_lp_pooling_power", "", false, true); + runTorchNet("net_lp_pooling_square", DNN_TARGET_CPU, "", false, true); + runTorchNet("net_lp_pooling_power", DNN_TARGET_CPU, "", false, true); } TEST(Torch_Importer, net_conv_gemm_lrn) { - runTorchNet("net_conv_gemm_lrn", "", false, true); + runTorchNet("net_conv_gemm_lrn", DNN_TARGET_CPU, "", false, true); } TEST(Torch_Importer, net_inception_block) { - runTorchNet("net_inception_block", "", false, true); + runTorchNet("net_inception_block", DNN_TARGET_CPU, "", false, true); } TEST(Torch_Importer, net_normalize) { - runTorchNet("net_normalize", "", false, true); + runTorchNet("net_normalize", DNN_TARGET_CPU, "", false, true); } TEST(Torch_Importer, net_padding) { - runTorchNet("net_padding", "", false, true); - runTorchNet("net_spatial_zero_padding", "", false, true); + runTorchNet("net_padding", DNN_TARGET_CPU, "", false, true); + 
 runTorchNet("net_spatial_zero_padding", DNN_TARGET_CPU, "", false, true); } TEST(Torch_Importer, ENet_accuracy) @@ -245,6 +282,62 @@ TEST(Torch_Importer, OpenFace_accuracy) normAssert(out, outRef); } +OCL_TEST(Torch_Importer, OpenFace_accuracy) +{ + const string model = findDataFile("dnn/openface_nn4.small2.v1.t7", false); + Net net = readNetFromTorch(model); + + net.setPreferableBackend(DNN_BACKEND_DEFAULT); + net.setPreferableTarget(DNN_TARGET_OPENCL); + + Mat sample = imread(findDataFile("cv/shared/lena.png", false)); + Mat sampleF32(sample.size(), CV_32FC3); + sample.convertTo(sampleF32, sampleF32.type()); + sampleF32 /= 255; + resize(sampleF32, sampleF32, Size(96, 96), 0, 0, INTER_NEAREST); + + Mat inputBlob = blobFromImage(sampleF32); + + net.setInput(inputBlob); + Mat out = net.forward(); + + Mat outRef = readTorchBlob(_tf("net_openface_output.dat"), true); + normAssert(out, outRef); +} + +OCL_TEST(Torch_Importer, ENet_accuracy) +{ + Net net; + { + const string model = findDataFile("dnn/Enet-model-best.net", false); + Ptr importer = createTorchImporter(model, true); + ASSERT_TRUE(importer != NULL); + importer->populateNet(net); + } + + net.setPreferableBackend(DNN_BACKEND_DEFAULT); + net.setPreferableTarget(DNN_TARGET_OPENCL); + + Mat sample = imread(_tf("street.png", false)); + Mat inputBlob = blobFromImage(sample, 1./255); + + net.setInput(inputBlob, ""); + Mat out = net.forward(); + Mat ref = blobFromNPY(_tf("torch_enet_prob.npy", false)); + // Due to numerical instability in Pooling-Unpooling layers (indexes jittering) + // thresholds for ENet must be changed. Accuracy of results was checked on + // Cityscapes dataset and difference in mIOU with Torch is 10E-4% + normAssert(ref, out, "", 0.00044, 0.44); + + const int N = 3; + for (int i = 0; i < N; i++) + { + net.setInput(inputBlob, ""); + Mat out = net.forward(); + normAssert(ref, out, "", 0.00044, 0.44); + } +} + } #endif