diff --git a/3rdparty/carotene/hal/tegra_hal.hpp b/3rdparty/carotene/hal/tegra_hal.hpp index e6a320992e..b91f862d1a 100644 --- a/3rdparty/carotene/hal/tegra_hal.hpp +++ b/3rdparty/carotene/hal/tegra_hal.hpp @@ -1531,7 +1531,7 @@ class TegraCvtColor_##name##_Invoker : public cv::ParallelLoopBody \ public: \ TegraCvtColor_##name##_Invoker(const uchar * src_data_, size_t src_step_, uchar * dst_data_, size_t dst_step_, int width_, int height_) : \ cv::ParallelLoopBody(), src_data(src_data_), src_step(src_step_), dst_data(dst_data_), dst_step(dst_step_), width(width_), height(height_) {} \ - virtual void operator()(const cv::Range& range) const \ + virtual void operator()(const cv::Range& range) const CV_OVERRIDE \ { \ CAROTENE_NS::func(CAROTENE_NS::Size2D(width, range.end-range.start), __VA_ARGS__); \ } \ diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt index 95e98e6e08..260a08fab6 100644 --- a/apps/CMakeLists.txt +++ b/apps/CMakeLists.txt @@ -1,6 +1,39 @@ add_definitions(-D__OPENCV_BUILD=1) add_definitions(-D__OPENCV_APPS=1) +# Unified function for creating OpenCV applications: +# ocv_add_application(tgt [MODULES [ ...]] SRCS [ ...]) +function(ocv_add_application the_target) + cmake_parse_arguments(APP "" "" "MODULES;SRCS" ${ARGN}) + ocv_check_dependencies(${APP_MODULES}) + if(NOT OCV_DEPENDENCIES_FOUND) + return() + endif() + + project(${the_target}) + ocv_target_include_modules_recurse(${the_target} ${APP_MODULES}) + ocv_target_include_directories(${the_target} PRIVATE "${OpenCV_SOURCE_DIR}/include/opencv") + ocv_add_executable(${the_target} ${APP_SRCS}) + ocv_target_link_libraries(${the_target} ${APP_MODULES}) + set_target_properties(${the_target} PROPERTIES + DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}" + ARCHIVE_OUTPUT_DIRECTORY ${LIBRARY_OUTPUT_PATH} + RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH} + OUTPUT_NAME "${the_target}") + + if(ENABLE_SOLUTION_FOLDERS) + set_target_properties(${the_target} PROPERTIES FOLDER "applications") + endif() + + if(INSTALL_CREATE_DISTRIB) + if(BUILD_SHARED_LIBS) + install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} CONFIGURATIONS Release COMPONENT dev) + endif() + else() + install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT dev) + endif() +endfunction() + link_libraries(${OPENCV_LINKER_LIBS}) macro(ocv_add_app directory) diff --git a/apps/annotation/CMakeLists.txt b/apps/annotation/CMakeLists.txt index 9288e86b42..a30846db41 100644 --- a/apps/annotation/CMakeLists.txt +++ b/apps/annotation/CMakeLists.txt @@ -1,36 +1,3 @@ -SET(OPENCV_ANNOTATION_DEPS opencv_core opencv_highgui opencv_imgproc opencv_imgcodecs opencv_videoio) -ocv_check_dependencies(${OPENCV_ANNOTATION_DEPS}) - -if(NOT OCV_DEPENDENCIES_FOUND) - return() -endif() - -project(annotation) -set(the_target opencv_annotation) - -ocv_target_include_directories(${the_target} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" "${OpenCV_SOURCE_DIR}/include/opencv") -ocv_target_include_modules_recurse(${the_target} ${OPENCV_ANNOTATION_DEPS}) - -file(GLOB SRCS *.cpp) - -set(annotation_files ${SRCS}) -ocv_add_executable(${the_target} ${annotation_files}) -ocv_target_link_libraries(${the_target} ${OPENCV_ANNOTATION_DEPS}) - -set_target_properties(${the_target} PROPERTIES - DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}" - ARCHIVE_OUTPUT_DIRECTORY ${LIBRARY_OUTPUT_PATH} - RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH} - OUTPUT_NAME "opencv_annotation") - -if(ENABLE_SOLUTION_FOLDERS) - set_target_properties(${the_target} PROPERTIES FOLDER "applications") -endif() - -if(INSTALL_CREATE_DISTRIB) - if(BUILD_SHARED_LIBS) - install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} CONFIGURATIONS Release COMPONENT dev) - endif() -else() - install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT dev) -endif() +ocv_add_application(opencv_annotation + MODULES opencv_core opencv_highgui opencv_imgproc opencv_imgcodecs opencv_videoio + SRCS opencv_annotation.cpp) diff --git a/apps/createsamples/CMakeLists.txt b/apps/createsamples/CMakeLists.txt index a285c69e41..7fb2b679c2 100644 --- a/apps/createsamples/CMakeLists.txt +++ b/apps/createsamples/CMakeLists.txt @@ -1,38 +1,4 @@ -set(OPENCV_CREATESAMPLES_DEPS opencv_core opencv_imgproc opencv_objdetect opencv_imgcodecs opencv_highgui opencv_calib3d opencv_features2d opencv_videoio) -ocv_check_dependencies(${OPENCV_CREATESAMPLES_DEPS}) - -if(NOT OCV_DEPENDENCIES_FOUND) - return() -endif() - -project(createsamples) -set(the_target opencv_createsamples) - -ocv_target_include_directories(${the_target} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" "${OpenCV_SOURCE_DIR}/include/opencv") -ocv_target_include_modules_recurse(${the_target} ${OPENCV_CREATESAMPLES_DEPS}) - file(GLOB SRCS *.cpp) -file(GLOB HDRS *.h*) - -set(createsamples_files ${SRCS} ${HDRS}) - -ocv_add_executable(${the_target} ${createsamples_files}) -ocv_target_link_libraries(${the_target} ${OPENCV_CREATESAMPLES_DEPS}) - -set_target_properties(${the_target} PROPERTIES - DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}" - ARCHIVE_OUTPUT_DIRECTORY ${LIBRARY_OUTPUT_PATH} - RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH} - OUTPUT_NAME "opencv_createsamples") - -if(ENABLE_SOLUTION_FOLDERS) - set_target_properties(${the_target} PROPERTIES FOLDER "applications") -endif() - -if(INSTALL_CREATE_DISTRIB) - if(BUILD_SHARED_LIBS) - install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} CONFIGURATIONS Release COMPONENT dev) - endif() -else() - install(TARGETS ${the_target} OPTIONAL RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT dev) -endif() +ocv_add_application(opencv_createsamples + MODULES opencv_core opencv_imgproc opencv_objdetect opencv_imgcodecs opencv_highgui opencv_calib3d opencv_features2d opencv_videoio + SRCS ${SRCS}) diff --git a/apps/interactive-calibration/CMakeLists.txt b/apps/interactive-calibration/CMakeLists.txt index 4a1a7446a8..dacbb13c79 100644 --- a/apps/interactive-calibration/CMakeLists.txt +++ b/apps/interactive-calibration/CMakeLists.txt @@ -1,41 +1,6 @@ -set(OPENCV_INTERACTIVECALIBRATION_DEPS opencv_core opencv_imgproc opencv_features2d opencv_highgui opencv_calib3d opencv_videoio) +set(DEPS opencv_core opencv_imgproc opencv_features2d opencv_highgui opencv_calib3d opencv_videoio) if(${BUILD_opencv_aruco}) - list(APPEND OPENCV_INTERACTIVECALIBRATION_DEPS opencv_aruco) + list(APPEND DEPS opencv_aruco) endif() -ocv_check_dependencies(${OPENCV_INTERACTIVECALIBRATION_DEPS}) - -if(NOT OCV_DEPENDENCIES_FOUND) - return() -endif() - -project(interactive-calibration) -set(the_target opencv_interactive-calibration) - -ocv_target_include_directories(${the_target} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" "${OpenCV_SOURCE_DIR}/include/opencv") -ocv_target_include_modules_recurse(${the_target} ${OPENCV_INTERACTIVECALIBRATION_DEPS}) - file(GLOB SRCS *.cpp) -file(GLOB HDRS *.h*) - -set(interactive-calibration_files ${SRCS} ${HDRS}) - -ocv_add_executable(${the_target} ${interactive-calibration_files}) -ocv_target_link_libraries(${the_target} ${OPENCV_INTERACTIVECALIBRATION_DEPS}) - -set_target_properties(${the_target} PROPERTIES - DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}" - ARCHIVE_OUTPUT_DIRECTORY ${LIBRARY_OUTPUT_PATH} - RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH} - OUTPUT_NAME "opencv_interactive-calibration") - -if(ENABLE_SOLUTION_FOLDERS) - set_target_properties(${the_target} PROPERTIES FOLDER "applications") -endif() - -if(INSTALL_CREATE_DISTRIB) - if(BUILD_SHARED_LIBS) - install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} CONFIGURATIONS Release COMPONENT dev) - endif() -else() - install(TARGETS ${the_target} OPTIONAL RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT dev) -endif() +ocv_add_application(opencv_interactive-calibration MODULES ${DEPS} SRCS ${SRCS}) diff --git a/apps/traincascade/CMakeLists.txt b/apps/traincascade/CMakeLists.txt index 96b9781067..ef80ce8b2f 100644 --- a/apps/traincascade/CMakeLists.txt +++ b/apps/traincascade/CMakeLists.txt @@ -1,42 +1,5 @@ -set(OPENCV_TRAINCASCADE_DEPS opencv_core opencv_imgproc opencv_objdetect opencv_imgcodecs opencv_highgui opencv_calib3d opencv_features2d) -ocv_check_dependencies(${OPENCV_TRAINCASCADE_DEPS}) - -if(NOT OCV_DEPENDENCIES_FOUND) - return() -endif() - -project(traincascade) -set(the_target opencv_traincascade) - -ocv_warnings_disable(CMAKE_CXX_FLAGS -Woverloaded-virtual - -Winconsistent-missing-override -Wsuggest-override -) - -ocv_target_include_directories(${the_target} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" "${OpenCV_SOURCE_DIR}/include/opencv") -ocv_target_include_modules_recurse(${the_target} ${OPENCV_TRAINCASCADE_DEPS}) - +ocv_warnings_disable(CMAKE_CXX_FLAGS -Woverloaded-virtual -Winconsistent-missing-override -Wsuggest-override) file(GLOB SRCS *.cpp) -file(GLOB HDRS *.h*) - -set(traincascade_files ${SRCS} ${HDRS}) - -ocv_add_executable(${the_target} ${traincascade_files}) -ocv_target_link_libraries(${the_target} ${OPENCV_TRAINCASCADE_DEPS}) - -set_target_properties(${the_target} PROPERTIES - DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}" - ARCHIVE_OUTPUT_DIRECTORY ${LIBRARY_OUTPUT_PATH} - RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH} - OUTPUT_NAME "opencv_traincascade") - -if(ENABLE_SOLUTION_FOLDERS) - set_target_properties(${the_target} PROPERTIES FOLDER "applications") -endif() - -if(INSTALL_CREATE_DISTRIB) - if(BUILD_SHARED_LIBS) - install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} CONFIGURATIONS Release COMPONENT dev) - endif() -else() - install(TARGETS ${the_target} OPTIONAL RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT dev) -endif() +ocv_add_application(opencv_traincascade + MODULES opencv_core opencv_imgproc opencv_objdetect opencv_imgcodecs opencv_highgui opencv_calib3d opencv_features2d + SRCS ${SRCS}) diff --git a/apps/version/CMakeLists.txt b/apps/version/CMakeLists.txt index cc4abb33aa..89e739b1ba 100644 --- a/apps/version/CMakeLists.txt +++ b/apps/version/CMakeLists.txt @@ -1,49 +1,5 @@ -set(OPENCV_APPLICATION_DEPS opencv_core) -ocv_check_dependencies(${OPENCV_APPLICATION_DEPS}) -if(NOT OCV_DEPENDENCIES_FOUND) - return() -endif() - -project(opencv_version) -set(the_target opencv_version) -ocv_target_include_modules_recurse(${the_target} ${OPENCV_APPLICATION_DEPS}) -ocv_add_executable(${the_target} opencv_version.cpp) -ocv_target_link_libraries(${the_target} ${OPENCV_APPLICATION_DEPS}) - -set_target_properties(${the_target} PROPERTIES - DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}" - RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH} - OUTPUT_NAME "opencv_version") - -set_target_properties(${the_target} PROPERTIES FOLDER "applications") - -if(INSTALL_CREATE_DISTRIB) - if(BUILD_SHARED_LIBS) - install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} CONFIGURATIONS Release COMPONENT libs) - endif() -else() - install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT libs) -endif() - +ocv_add_application(opencv_version MODULES opencv_core SRCS opencv_version.cpp) if(WIN32) - project(opencv_version_win32) - set(the_target opencv_version_win32) - ocv_target_include_modules_recurse(${the_target} ${OPENCV_APPLICATION_DEPS}) - ocv_add_executable(${the_target} opencv_version.cpp) - ocv_target_link_libraries(${the_target} ${OPENCV_APPLICATION_DEPS}) - target_compile_definitions(${the_target} PRIVATE "OPENCV_WIN32_API=1") - set_target_properties(${the_target} PROPERTIES - DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}" - RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH} - OUTPUT_NAME "opencv_version_win32") - - set_target_properties(${the_target} PROPERTIES FOLDER "applications") - - if(INSTALL_CREATE_DISTRIB) - if(BUILD_SHARED_LIBS) - install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} CONFIGURATIONS Release COMPONENT libs) - endif() - else() - install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT libs) - endif() + ocv_add_application(opencv_version_win32 MODULES opencv_core SRCS opencv_version.cpp) + target_compile_definitions(opencv_version_win32 PRIVATE "OPENCV_WIN32_API=1") endif() diff --git a/apps/visualisation/CMakeLists.txt b/apps/visualisation/CMakeLists.txt index 6f748103e4..eaddf776ec 100644 --- a/apps/visualisation/CMakeLists.txt +++ b/apps/visualisation/CMakeLists.txt @@ -1,36 +1,3 @@ -SET(OPENCV_VISUALISATION_DEPS opencv_core opencv_highgui opencv_imgproc opencv_videoio opencv_imgcodecs) -ocv_check_dependencies(${OPENCV_VISUALISATION_DEPS}) - -if(NOT OCV_DEPENDENCIES_FOUND) - return() -endif() - -project(visualisation) -set(the_target opencv_visualisation) - -ocv_target_include_directories(${the_target} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" "${OpenCV_SOURCE_DIR}/include/opencv") -ocv_target_include_modules_recurse(${the_target} ${OPENCV_VISUALISATION_DEPS}) - -file(GLOB SRCS *.cpp) - -set(visualisation_files ${SRCS}) -ocv_add_executable(${the_target} ${visualisation_files}) -ocv_target_link_libraries(${the_target} ${OPENCV_VISUALISATION_DEPS}) - -set_target_properties(${the_target} PROPERTIES - DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}" - ARCHIVE_OUTPUT_DIRECTORY ${LIBRARY_OUTPUT_PATH} - RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH} - OUTPUT_NAME "opencv_visualisation") - -if(ENABLE_SOLUTION_FOLDERS) - set_target_properties(${the_target} PROPERTIES FOLDER "applications") -endif() - -if(INSTALL_CREATE_DISTRIB) - if(BUILD_SHARED_LIBS) - install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} CONFIGURATIONS Release COMPONENT dev) - endif() -else() - install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT dev) -endif() +ocv_add_application(opencv_visualisation + MODULES opencv_core opencv_highgui opencv_imgproc opencv_videoio opencv_imgcodecs + SRCS opencv_visualisation.cpp) diff --git a/modules/dnn/include/opencv2/dnn/all_layers.hpp b/modules/dnn/include/opencv2/dnn/all_layers.hpp index 55b85a0b56..9ba180c7d1 100644 --- a/modules/dnn/include/opencv2/dnn/all_layers.hpp +++ b/modules/dnn/include/opencv2/dnn/all_layers.hpp @@ -361,6 +361,23 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN static Ptr create(const LayerParams& params); }; + /** + * Permute channels of 4-dimensional input blob. + * @param group Number of groups to split input channels and pick in turns + * into output blob. + * + * \f[ groupSize = \frac{number\ of\ channels}{group} \f] + * \f[ output(n, c, h, w) = input(n, groupSize \times (c \% group) + \lfloor \frac{c}{group} \rfloor, h, w) \f] + * Read more at https://arxiv.org/pdf/1707.01083.pdf + */ + class CV_EXPORTS ShuffleChannelLayer : public Layer + { + public: + static Ptr create(const LayerParams& params); + + int group; + }; + /** * @brief Adds extra values for specific axes. * @param paddings Vector of paddings in format @@ -575,6 +592,17 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN static Ptr create(const LayerParams& params); }; + /** + * @brief Bilinear resize layer from https://github.com/cdmh/deeplab-public + * + * It differs from @ref ResizeLayer in output shape and resize scales computations. + */ + class CV_EXPORTS InterpLayer : public Layer + { + public: + static Ptr create(const LayerParams& params); + }; + class CV_EXPORTS ProposalLayer : public Layer { public: diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index 84967ced96..6a7c9d5a6a 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -988,52 +988,26 @@ struct Net::Impl ld.inputBlobsId[inNum] = from; } - static void splitPin(const String &pinAlias, String &layerName, String &outName) - { - size_t delimPos = pinAlias.find('.'); - layerName = pinAlias.substr(0, delimPos); - outName = (delimPos == String::npos) ? String() : pinAlias.substr(delimPos + 1); - } - int resolvePinOutputName(LayerData &ld, const String &outName) { if (outName.empty()) return 0; - - if (std::isdigit(outName[0])) - { - char *lastChar; - long inum = std::strtol(outName.c_str(), &lastChar, 10); - - if (*lastChar == 0) - { - CV_Assert(inum == (int)inum); - return (int)inum; - } - } - return ld.getLayerInstance()->outputNameToIndex(outName); } - LayerPin getPinByAlias(const String &pinAlias) + LayerPin getPinByAlias(const String &layerName) { LayerPin pin; - String layerName, outName; - splitPin(pinAlias, layerName, outName); - pin.lid = (layerName.empty()) ? 0 : getLayerId(layerName); if (pin.lid >= 0) - pin.oid = resolvePinOutputName(getLayerData(pin.lid), outName); + pin.oid = resolvePinOutputName(getLayerData(pin.lid), layerName); return pin; } - std::vector getLayerOutPins(const String &pinAlias) + std::vector getLayerOutPins(const String &layerName) { - String layerName, outName; - splitPin(pinAlias, layerName, outName); - int lid = (layerName.empty()) ? 0 : getLayerId(layerName); std::vector pins; @@ -2044,12 +2018,6 @@ int Net::addLayer(const String &name, const String &type, LayerParams ¶ms) { CV_TRACE_FUNCTION(); - if (name.find('.') != String::npos) - { - CV_Error(Error::StsBadArg, "Added layer name \"" + name + "\" must not contain dot symbol"); - return -1; - } - if (impl->getLayerId(name) >= 0) { CV_Error(Error::StsBadArg, "Layer \"" + name + "\" already into net"); @@ -2689,7 +2657,7 @@ int Layer::inputNameToIndex(String) int Layer::outputNameToIndex(const String&) { - return -1; + return 0; } bool Layer::supportBackend(int backendId) diff --git a/modules/dnn/src/init.cpp b/modules/dnn/src/init.cpp index e5c3a279e5..8db0828e62 100644 --- a/modules/dnn/src/init.cpp +++ b/modules/dnn/src/init.cpp @@ -84,6 +84,7 @@ void initializeLayerFactory() CV_DNN_REGISTER_LAYER_CLASS(Reshape, ReshapeLayer); CV_DNN_REGISTER_LAYER_CLASS(Flatten, FlattenLayer); CV_DNN_REGISTER_LAYER_CLASS(Resize, ResizeLayer); + CV_DNN_REGISTER_LAYER_CLASS(Interp, InterpLayer); CV_DNN_REGISTER_LAYER_CLASS(CropAndResize, CropAndResizeLayer); CV_DNN_REGISTER_LAYER_CLASS(Convolution, ConvolutionLayer); @@ -115,6 +116,7 @@ void initializeLayerFactory() CV_DNN_REGISTER_LAYER_CLASS(Crop, CropLayer); CV_DNN_REGISTER_LAYER_CLASS(Eltwise, EltwiseLayer); CV_DNN_REGISTER_LAYER_CLASS(Permute, PermuteLayer); + CV_DNN_REGISTER_LAYER_CLASS(ShuffleChannel, ShuffleChannelLayer); CV_DNN_REGISTER_LAYER_CLASS(PriorBox, PriorBoxLayer); CV_DNN_REGISTER_LAYER_CLASS(PriorBoxClustered, PriorBoxLayer); CV_DNN_REGISTER_LAYER_CLASS(Reorg, ReorgLayer); diff --git a/modules/dnn/src/layers/crop_and_resize_layer.cpp b/modules/dnn/src/layers/crop_and_resize_layer.cpp index a9bca1f04b..ad2280f30c 100644 --- a/modules/dnn/src/layers/crop_and_resize_layer.cpp +++ b/modules/dnn/src/layers/crop_and_resize_layer.cpp @@ -1,3 +1,9 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2018, Intel Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. #include "../precomp.hpp" #include "layers_common.hpp" diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp index 5152d60269..499f672918 100644 --- a/modules/dnn/src/layers/fully_connected_layer.cpp +++ b/modules/dnn/src/layers/fully_connected_layer.cpp @@ -310,7 +310,6 @@ public: innerProductOp = Ptr >(new OCL4DNNInnerProduct(config)); } - UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type()); for (size_t i = 0; i < inputs.size(); i++) { MatShape inshape, outshape; @@ -320,7 +319,6 @@ public: UMat srcMat, dstMat; srcMat = inputs[i].reshape(1, inshape.size(), &inshape[0]); dstMat = outputs[i].reshape(1, outshape.size(), &outshape[0]); - dstMat.setTo(0.0f); if (!innerProductOp->Forward(srcMat, (use_half) ? half_blobs[0] : umat_blobs[0], (bias) ? (use_half ? half_blobs[1] : umat_blobs[1]) : UMat(), @@ -332,6 +330,7 @@ public: if (!use_half && bias && (outerSize > 1)) { + UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type()); UMat& biases = umat_blobs[1]; cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0); } @@ -354,6 +353,7 @@ public: if (bias) { + UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type()); UMat& biases = umat_blobs[1]; cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0); } diff --git a/modules/dnn/src/layers/resize_layer.cpp b/modules/dnn/src/layers/resize_layer.cpp index 26aa311c25..358ee8dd99 100644 --- a/modules/dnn/src/layers/resize_layer.cpp +++ b/modules/dnn/src/layers/resize_layer.cpp @@ -11,7 +11,7 @@ namespace cv { namespace dnn { -class ResizeLayerImpl CV_FINAL : public ResizeLayer +class ResizeLayerImpl : public ResizeLayer { public: ResizeLayerImpl(const LayerParams& params) @@ -33,7 +33,7 @@ public: interpolation = params.get("interpolation"); CV_Assert(interpolation == "nearest" || interpolation == "bilinear"); - alignCorners = params.get("align_corners", false); + bool alignCorners = params.get("align_corners", false); if (alignCorners) CV_Error(Error::StsNotImplemented, "Resize with align_corners=true is not implemented"); } @@ -66,6 +66,8 @@ public: outHeight = outputs[0].size[2]; outWidth = outputs[0].size[3]; } + scaleHeight = static_cast(inputs[0]->size[2]) / outHeight; + scaleWidth = static_cast(inputs[0]->size[3]) / outWidth; } void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE @@ -103,8 +105,6 @@ public: const int inpWidth = inp.size[3]; const int inpSpatialSize = inpHeight * inpWidth; const int outSpatialSize = outHeight * outWidth; - const float heightScale = static_cast(inpHeight) / (outHeight); - const float widthScale = static_cast(inpWidth) / (outWidth); const int numPlanes = inp.size[0] * inp.size[1]; CV_Assert(inp.isContinuous(), out.isContinuous()); @@ -112,13 +112,13 @@ public: Mat outPlanes = out.reshape(1, numPlanes * outHeight); for (int y = 0; y < outHeight; ++y) { - float input_y = y * heightScale; + float input_y = y * scaleHeight; int y0 = static_cast(input_y); const float* inpData_row0 = inpPlanes.ptr(y0); const float* inpData_row1 = inpPlanes.ptr(std::min(y0 + 1, inpHeight - 1)); for (int x = 0; x < outWidth; ++x) { - float input_x = x * widthScale; + float input_x = x * scaleWidth; int x0 = static_cast(input_x); int x1 = std::min(x0 + 1, inpWidth - 1); @@ -162,10 +162,10 @@ public: return Ptr(); } -private: +protected: int outWidth, outHeight, zoomFactorWidth, zoomFactorHeight; String interpolation; - bool alignCorners; + float scaleWidth, scaleHeight; }; @@ -174,5 +174,44 @@ Ptr ResizeLayer::create(const LayerParams& params) return Ptr(new ResizeLayerImpl(params)); } +class InterpLayerImpl CV_FINAL : public ResizeLayerImpl +{ +public: + InterpLayerImpl(const LayerParams& params) : ResizeLayerImpl(params) {} + + bool getMemoryShapes(const std::vector &inputs, + const int requiredOutputs, + std::vector &outputs, + std::vector &internals) const CV_OVERRIDE + { + CV_Assert(inputs.size() == 1, inputs[0].size() == 4); + outputs.resize(1, inputs[0]); + outputs[0][2] = outHeight > 0 ? outHeight : (1 + zoomFactorHeight * (outputs[0][2] - 1)); + outputs[0][3] = outWidth > 0 ? outWidth : (1 + zoomFactorWidth * (outputs[0][3] - 1)); + // We can work in-place (do nothing) if input shape == output shape. + return (outputs[0][2] == inputs[0][2]) && (outputs[0][3] == inputs[0][3]); + } + + virtual void finalize(const std::vector& inputs, std::vector &outputs) CV_OVERRIDE + { + if (!outWidth && !outHeight) + { + outHeight = outputs[0].size[2]; + outWidth = outputs[0].size[3]; + } + int inpHeight = inputs[0]->size[2]; + int inpWidth = inputs[0]->size[3]; + scaleHeight = (outHeight > 1) ? (static_cast(inpHeight - 1) / (outHeight - 1)) : 0.f; + scaleWidth = (outWidth > 1) ? (static_cast(inpWidth - 1) / (outWidth - 1)) : 0.f; + } +}; + +Ptr InterpLayer::create(const LayerParams& params) +{ + LayerParams lp(params); + lp.set("interpolation", "bilinear"); + return Ptr(new InterpLayerImpl(lp)); +} + } // namespace dnn } // namespace cv diff --git a/modules/dnn/src/layers/shuffle_channel_layer.cpp b/modules/dnn/src/layers/shuffle_channel_layer.cpp new file mode 100644 index 0000000000..6c69d773a4 --- /dev/null +++ b/modules/dnn/src/layers/shuffle_channel_layer.cpp @@ -0,0 +1,104 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2018, Intel Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +#include "../precomp.hpp" + +namespace cv { namespace dnn { + +class ShuffleChannelLayerImpl CV_FINAL : public ShuffleChannelLayer +{ +public: + ShuffleChannelLayerImpl(const LayerParams& params) + { + group = params.get("group", 1); + } + + bool getMemoryShapes(const std::vector &inputs, + const int requiredOutputs, + std::vector &outputs, + std::vector &internals) const CV_OVERRIDE + { + CV_Assert(inputs.size() == 1 && inputs[0].size() == 4); + CV_Assert(inputs[0][1] % group == 0); + Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals); + return group == 1; + } + + virtual void finalize(const std::vector& inputs, std::vector &outputs) CV_OVERRIDE + { + if (group != 1) + { + LayerParams lp; + float order[] = {0, 2, 1, 3}; + lp.set("order", DictValue::arrayInt(&order[0], 4)); + permute = PermuteLayer::create(lp); + + Mat inp = *inputs[0]; + Mat out = outputs[0]; + + permuteInpShape.resize(4); + permuteInpShape[0] = inp.size[0]; + permuteInpShape[1] = group; + permuteInpShape[2] = inp.size[1] / group; + permuteInpShape[3] = inp.size[2]*inp.size[3]; + + permuteOutShape.resize(4); + permuteOutShape[0] = permuteInpShape[0]; + permuteOutShape[1] = permuteInpShape[2]; + permuteOutShape[2] = permuteInpShape[1]; + permuteOutShape[3] = permuteInpShape[3]; + + inp = inp.reshape(1, permuteInpShape); + out = out.reshape(1, permuteOutShape); + + std::vector permuteInputs(1, &inp); + std::vector permuteOutputs(1, out); + permute->finalize(permuteInputs, permuteOutputs); + } + } + + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE + { + CV_TRACE_FUNCTION(); + CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + + Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); + } + + void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) CV_OVERRIDE + { + CV_TRACE_FUNCTION(); + CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + + Mat inp = *inputs[0]; + Mat out = outputs[0]; + if (inp.data != out.data) + { + if (!permute.empty()) + { + inp = inp.reshape(1, permuteInpShape); + out = out.reshape(1, permuteOutShape); + std::vector permuteInputs(1, &inp); + std::vector permuteOutputs(1, out); + permute->forward(permuteInputs, permuteOutputs, internals); + } + else + inp.copyTo(out); + } + } + +private: + Ptr permute; + std::vector permuteInpShape, permuteOutShape; +}; + +Ptr ShuffleChannelLayer::create(const LayerParams& params) +{ + return Ptr(new ShuffleChannelLayerImpl(params)); +} + +} // namespace dnn +} // namespace cv diff --git a/modules/dnn/src/ocl4dnn/include/default_kernel_config.hpp b/modules/dnn/src/ocl4dnn/include/default_kernel_config.hpp index 09e0c27473..a25b2bf0d5 100644 --- a/modules/dnn/src/ocl4dnn/include/default_kernel_config.hpp +++ b/modules/dnn/src/ocl4dnn/include/default_kernel_config.hpp @@ -1,23 +1,24 @@ #ifndef _OPENCV_OCL4DNN_DEFAULT_KERNEL_CONFIG_HPP_ #define _OPENCV_OCL4DNN_DEFAULT_KERNEL_CONFIG_HPP_ -const char *default_kernel_config_intel[] = { +const char *default_kernel_config_intel_fp32[] = { // Below is the information for OpenCL based on which these configurations tuned /******************************************************************************* Number of platforms 1 - Platform Name Intel(R) OpenCL + Platform Name Intel(R) OpenCL HD Graphics Platform Vendor Intel(R) Corporation - Platform Version OpenCL 2.0 + Platform Version OpenCL 2.1 Platform Profile FULL_PROFILE - Platform Extensions cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_device_side_avc_motion_estimation cl_intel_driver_diagnostics cl_intel_media_block_io cl_intel_motion_estimation cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_required_subgroup_size cl_intel_subgroups cl_intel_subgroups_short cl_intel_va_api_media_sharing cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_fp16 cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_spir cl_khr_subgroups + Platform Extensions cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_fp16 cl_khr_depth_images cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_intel_subgroups cl_intel_required_subgroup_size cl_intel_subgroups_short cl_khr_spir cl_intel_accelerator cl_intel_media_block_io cl_intel_driver_diagnostics cl_intel_device_side_avc_motion_estimation cl_khr_priority_hints cl_khr_throttle_hints cl_khr_create_command_queue cl_khr_subgroups cl_khr_il_program cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_fp64 cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_motion_estimation cl_intel_advanced_motion_estimation + Platform Host timer resolution 1ns Platform Extensions function suffix INTEL - Platform Name Intel(R) OpenCL + Platform Name Intel(R) OpenCL HD Graphics Number of devices 1 - Device Name Intel(R) HD Graphics + Device Name Intel(R) Gen9 HD Graphics NEO Device Vendor Intel(R) Corporation Device Vendor ID 0x8086 - Device Version OpenCL 2.0 - Driver Version r4.1.61547 + Device Version OpenCL 2.1 NEO + Driver Version 2018ww15-010713 Device OpenCL C Version OpenCL C 2.0 Device Type GPU Device Profile FULL_PROFILE @@ -25,11 +26,12 @@ Number of devices 1 Max clock frequency 950MHz Device Partition (core) Max number of sub-devices 0 - Supported partition types by (0x7F2F00000000) + Supported partition types None Max work item dimensions 3 Max work item sizes 256x256x256 Max work group size 256 Preferred work group size multiple 32 + Max sub-groups per work group 32 Preferred / native vector sizes char 16 / 16 short 8 / 8 @@ -66,15 +68,15 @@ Number of devices 1 Support is emulated in software No Correctly-rounded divide and sqrt operations No Address bits 64, Little-Endian - Global memory size 26888119911 (25.04GiB) + Global memory size 26892222464 (25.05GiB) Error Correction support No - Max memory allocation 4294959103 (4GiB) + Max memory allocation 4294959104 (4GiB) Unified memory for Host and Device Yes Shared Virtual Memory (SVM) capabilities (core) Coarse-grained buffer sharing Yes - Fine-grained buffer sharing Yes + Fine-grained buffer sharing No Fine-grained system sharing No - Atomics Yes + Atomics No Minimum alignment for any data type 128 bytes Alignment of base address 1024 bits (128 bytes) Preferred alignment for atomics @@ -82,13 +84,13 @@ Number of devices 1 Global 64 bytes Local 64 bytes Max size for global variable 65536 (64KiB) - Preferred total size of global vars 4294959103 (4GiB) + Preferred total size of global vars 4294959104 (4GiB) Global Memory cache type Read/Write Global Memory cache size 1572864 Global Memory cache line 64 bytes Image support Yes Max number of samplers per kernel 16 - Max size for 1D images from buffer 268434943 pixels + Max size for 1D images from buffer 268434944 pixels Max 1D or 2D image array size 2048 images Base address alignment for 2D image buffers 4 bytes Pitch alignment for 2D image buffers 4 bytes @@ -102,7 +104,7 @@ Number of devices 1 Max pipe packet size 1024 Local memory type Local Local memory size 65536 (64KiB) - Max constant buffer size 4294959103 (4GiB) + Max constant buffer size 4294959104 (4GiB) Max number of constant args 8 Max size of kernel argument 1024 Queue properties (on host) @@ -120,114 +122,171 @@ Number of devices 1 Execution capabilities Run OpenCL kernels Yes Run native kernels No + Sub-group independent forward progress Yes + IL version SPIR-V_1.0 SPIR versions 1.2 printf() buffer size 4194304 (4MiB) - Built-in kernels block_motion_estimate_intel;block_advanced_motion_estimate_check_intel;block_advanced_motion_estimate_bidirectional_check_intel + Built-in kernels block_motion_estimate_intel;block_advanced_motion_estimate_check_intel;block_advanced_motion_estimate_bidirectional_check_intel; Motion Estimation accelerator version (Intel) 2 Device Available Yes Compiler Available Yes Linker Available Yes - Device Extensions cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_device_side_avc_motion_estimation cl_intel_driver_diagnostics cl_intel_media_block_io cl_intel_motion_estimation cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_required_subgroup_size cl_intel_subgroups cl_intel_subgroups_short cl_intel_va_api_media_sharing cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_fp16 cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_spir cl_khr_subgroups + Device Extensions cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_fp16 cl_khr_depth_images cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_intel_subgroups cl_intel_required_subgroup_size cl_intel_subgroups_short cl_khr_spir cl_intel_accelerator cl_intel_media_block_io cl_intel_driver_diagnostics cl_intel_device_side_avc_motion_estimation cl_khr_priority_hints cl_khr_throttle_hints cl_khr_create_command_queue cl_khr_subgroups cl_khr_il_program cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_fp64 cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_motion_estimation cl_intel_advanced_motion_estimation NULL platform behavior - clGetPlatformInfo(NULL, CL_PLATFORM_NAME, ...) No platform - clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, ...) No platform - clCreateContext(NULL, ...) [default] No platform - clCreateContext(NULL, ...) [other] Success [INTEL] - clCreateContextFromType(NULL, CL_DEVICE_TYPE_CPU) No platform - clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU) No platform - clCreateContextFromType(NULL, CL_DEVICE_TYPE_ACCELERATOR) No platform - clCreateContextFromType(NULL, CL_DEVICE_TYPE_CUSTOM) No platform - clCreateContextFromType(NULL, CL_DEVICE_TYPE_ALL) No platform + clGetPlatformInfo(NULL, CL_PLATFORM_NAME, ...) Intel(R) OpenCL HD Graphics + clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, ...) Success [INTEL] + clCreateContext(NULL, ...) [default] Success [INTEL] + clCreateContextFromType(NULL, CL_DEVICE_TYPE_CPU) No devices found in platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU) Success (1) + Platform Name Intel(R) OpenCL HD Graphics + Device Name Intel(R) Gen9 HD Graphics NEO + clCreateContextFromType(NULL, CL_DEVICE_TYPE_ACCELERATOR) No devices found in platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_CUSTOM) No devices found in platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_ALL) Success (1) + Platform Name Intel(R) OpenCL HD Graphics + Device Name Intel(R) Gen9 HD Graphics NEO + +ICD loader properties + ICD loader Name OpenCL ICD Loader + ICD loader Vendor OCL Icd free software + ICD loader Version 2.2.8 + ICD loader Profile OpenCL 1.2 + NOTE: your OpenCL library declares to support OpenCL 1.2, + but it seems to support up to OpenCL 2.1 too. ********************************************************************************/ -"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M112_activ1_eltwise0","7 2 8 2 1 1 8 1 0 ", -"EU72_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256_activ1_eltwise0","5 5 16 2 1 1 16 1 0 ", -"EU72_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128_activ1_eltwise0","2 4 16 2 1 1 16 1 0 ", -"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0","8 3 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0","6 3 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128_activ1_eltwise0","14 2 16 2 1 1 16 1 0 ", -"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0","7 2 8 2 1 1 8 1 0 ", -"EU72_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384_activ1_eltwise0","2 4 16 2 1 1 16 1 0 ", -"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M96_activ1_eltwise0","14 1 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU72_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128_activ1_eltwise0","2 10 16 2 1 1 16 1 0 ", -"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64_activ1_eltwise0","4 6 8 2 1 1 8 1 0 ", -"EU72_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128_activ1_eltwise0","2 2 16 2 1 1 16 1 0 ", -"EU72_k1x1_cn256_g1_s2x2_d1x1_b1_in64x64_p0x0_num1_M128_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192_activ1_eltwise0","14 1 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M256_activ1_eltwise1","1 16 32 5 1 16 1 1 0 ", -"EU72_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M320_activ1_eltwise0","2 5 16 2 1 1 16 1 0 ", -"EU72_k11x11_cn3_g1_s4x4_d1x1_b1_in240x240_p0x0_num1_M96_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0","3 7 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M16_activ1_eltwise0","6 2 8 2 1 1 8 1 0 ", -"EU72_k3x3_cn256_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384_activ1_eltwise0","2 5 16 2 1 1 16 1 0 ", -"EU72_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64_activ1_eltwise0","5 2 16 2 1 1 16 1 0 ", -"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M16_activ1_eltwise0","7 2 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M48_activ1_eltwise0","7 2 8 2 1 1 8 1 0 ", -"EU72_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M192_activ1_eltwise0","2 10 16 2 1 1 16 1 0 ", -"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M384_activ1_eltwise0","2 7 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32_activ1_eltwise0","14 1 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0","2 7 8 2 1 1 8 1 0 ", -"EU72_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M224_activ1_eltwise0","4 2 16 2 1 1 16 1 0 ", -"EU72_k1x1_cn2048_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M512_activ1_eltwise0","7 3 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0","7 2 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0","2 6 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M256_activ0_eltwise0","1 16 32 5 1 16 1 1 0 ", -"EU72_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M96_activ1_eltwise0","4 3 16 2 1 1 16 1 0 ", -"EU72_k3x3_cn256_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256_activ1_eltwise0","5 2 16 2 1 1 16 1 0 ", -"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M144_activ1_eltwise0","7 3 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU72_k1x1_cn512_g1_s2x2_d1x1_b1_in32x32_p0x0_num1_M256_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU72_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num1_M64_activ1_eltwise0","4 3 16 2 1 1 16 1 0 ", -"EU72_k1x1_cn512_g1_s2x2_d1x1_b1_in32x32_p0x0_num1_M1024_activ0_eltwise0","1 16 32 5 1 16 1 1 0 ", -"EU72_k1x1_cn256_g1_s2x2_d1x1_b1_in64x64_p0x0_num1_M512_activ0_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU72_k1x1_cn1024_g1_s2x2_d1x1_b1_in16x16_p0x0_num1_M2048_activ0_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU72_k1x1_cn128_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M512_activ1_eltwise1","1 16 32 5 1 16 1 1 0 ", -"EU72_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64_activ1_eltwise0","4 3 16 2 1 1 16 1 0 ", -"EU72_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M128_activ1_eltwise0","2 5 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M1024_activ1_eltwise1","1 16 32 5 1 16 1 1 0 ", -"EU72_k1x1_cn1024_g1_s2x2_d1x1_b1_in16x16_p0x0_num1_M512_activ1_eltwise0","7 2 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24_activ1_eltwise0","9 2 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64_activ1_eltwise0","14 2 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0","14 1 8 2 1 1 8 1 0 ", -"EU72_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M288_activ1_eltwise0","2 4 16 2 1 1 16 1 0 ", -"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M96_activ1_eltwise0","14 2 16 2 1 1 16 1 0 ", -"EU72_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128_activ1_eltwise0","3 6 16 2 1 1 16 1 0 ", -"EU72_k5x5_cn96_g2_s1x1_d1x1_b1_in32x32_p2x2_num1_M128_activ1_eltwise0","4 3 16 2 1 1 16 1 0 ", -"EU72_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M64_activ1_eltwise0","7 3 16 2 1 1 16 1 0 ", -"EU72_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M192_activ1_eltwise0","7 3 16 2 1 1 16 1 0 ", -"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0","5 2 8 2 1 1 8 1 0 ", -"EU72_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64_activ1_eltwise0","3 2 16 2 1 1 16 1 0 ", -"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128_activ1_eltwise0","7 4 8 2 1 1 8 1 0 ", -"EU72_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M32_activ1_eltwise0","4 2 16 2 1 1 16 1 0 ", -"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192_activ1_eltwise0","7 2 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0","14 1 8 2 1 1 8 1 0 ", -"EU72_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M48_activ1_eltwise0","4 2 16 2 1 1 16 1 0 ", -"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0","5 2 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0","8 3 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0","7 3 8 2 1 1 8 1 0 ", -"EU72_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M208_activ1_eltwise0","3 2 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32_activ1_eltwise0","10 2 8 2 1 1 8 1 0 ", -"EU72_k3x3_cn512_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M512_activ1_eltwise0","2 3 16 2 1 1 16 1 0 ", -"EU72_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M192_activ1_eltwise0","5 2 8 2 1 1 8 1 0 ", -"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M2048_activ1_eltwise1","2 8 32 5 1 8 1 1 0 ", + +"EU72_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M1024_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU72_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M1024_activ5_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU72_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M126_activ0_eltwise0_FP32", "6 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24_activ0_eltwise0_FP32", "2 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0_FP32", "8 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ5_eltwise0_FP32", "5 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M546_activ0_eltwise0_FP32", "10 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn128_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M126_activ0_eltwise0_FP32", "1 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn128_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24_activ0_eltwise0_FP32", "1 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn128_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M546_activ0_eltwise0_FP32", "1 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn128_g1_s1x1_d1x1_b1_in48x48_p0x0_num1_M256_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU72_k1x1_cn128_g1_s1x1_d1x1_b1_in48x48_p0x0_num1_M256_activ5_eltwise0_FP32", "12 2 16 2 1 1 16 1 0", +"EU72_k1x1_cn128_g1_s1x1_d1x1_b1_in80x80_p0x0_num1_M128_activ1_eltwise0_FP32", "2 8 32 5 1 8 1 1 0", +"EU72_k1x1_cn128_g1_s1x1_d1x1_b1_in80x80_p0x0_num1_M128_activ5_eltwise0_FP32", "8 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M16_activ1_eltwise0_FP32", "8 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32_activ1_eltwise0_FP32", "8 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64_activ1_eltwise0_FP32", "8 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M96_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M126_activ0_eltwise0_FP32", "1 3 8 2 1 1 8 1 0", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP32", "2 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ5_eltwise0_FP32", "2 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24_activ0_eltwise0_FP32", "1 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M546_activ0_eltwise0_FP32", "3 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0_FP32", "1 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ5_eltwise0_FP32", "2 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32_activ1_eltwise0_FP32", "8 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M512_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M512_activ5_eltwise0_FP32", "10 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64_activ1_eltwise0_FP32", "8 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in48x48_p0x0_num1_M256_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in48x48_p0x0_num1_M256_activ5_eltwise0_FP32", "8 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn32_g1_s1x1_d1x1_b1_in160x160_p0x0_num1_M64_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU72_k1x1_cn32_g1_s1x1_d1x1_b1_in160x160_p0x0_num1_M64_activ5_eltwise0_FP32", "1 16 32 5 1 16 1 1 0", +"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M16_activ1_eltwise0_FP32", "5 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192_activ1_eltwise0_FP32", "8 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0_FP32", "7 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M96_activ1_eltwise0_FP32", "7 2 8 2 1 1 8 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M1024_activ1_eltwise0_FP32", "10 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M1024_activ5_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M112_activ1_eltwise0_FP32", "8 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M126_activ0_eltwise0_FP32", "5 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP32", "8 2 8 2 1 1 8 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ5_eltwise0_FP32", "5 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M144_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0_FP32", "8 2 8 2 1 1 8 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24_activ0_eltwise0_FP32", "3 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24_activ1_eltwise0_FP32", "4 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0_FP32", "7 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M546_activ0_eltwise0_FP32", "3 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0_FP32", "8 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M12_activ0_eltwise0_FP32", "6 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M273_activ0_eltwise0_FP32", "10 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M512_activ1_eltwise0_FP32", "10 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M512_activ5_eltwise0_FP32", "10 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M63_activ0_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP32", "8 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0_FP32", "8 2 8 2 1 1 8 1 0", +"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0_FP32", "8 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU72_k1x1_cn64_g1_s1x1_d1x1_b1_in80x80_p0x0_num1_M128_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU72_k1x1_cn64_g1_s1x1_d1x1_b1_in80x80_p0x0_num1_M128_activ5_eltwise0_FP32", "2 8 32 5 1 8 1 1 0", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP32", "7 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0_FP32", "7 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192_activ1_eltwise0_FP32", "7 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0_FP32", "7 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0_FP32", "5 1 8 2 1 1 8 1 0", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M384_activ1_eltwise0_FP32", "7 2 8 2 1 1 8 1 0", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M48_activ1_eltwise0_FP32", "5 1 8 2 1 1 8 1 0", +"EU72_k3x3_cn1024_g1024_s1x1_d1x1_b1_in16x16_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn1024_g1024_s1x1_d1x1_b1_in16x16_p1x1_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M224_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU72_k3x3_cn128_g128_s1x1_d1x1_b1_in80x80_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn128_g128_s1x1_d1x1_b1_in80x80_p1x1_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn128_g128_s2x2_d1x1_b1_in80x80_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn128_g128_s2x2_d1x1_b1_in80x80_p1x1_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256_activ1_eltwise0_FP32", "14 2 16 2 1 1 16 1 0", +"EU72_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M192_activ1_eltwise0_FP32", "14 2 16 2 1 1 16 1 0", +"EU72_k3x3_cn128_g1_s2x2_d1x1_b1_in16x16_p0x0_num1_M256_activ5_eltwise0_FP32", "3 1 8 2 1 1 8 1 0", +"EU72_k3x3_cn128_g1_s2x2_d1x1_b1_in16x16_p1x1_num1_M256_activ1_eltwise0_FP32", "1 1 16 2 1 1 16 1 0", +"EU72_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M288_activ1_eltwise0_FP32", "14 2 16 2 1 1 16 1 0", +"EU72_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M320_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU72_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU72_k3x3_cn256_g1_s2x2_d1x1_b1_in16x16_p0x0_num1_M512_activ5_eltwise0_FP32", "3 1 16 2 1 1 16 1 0", +"EU72_k3x3_cn256_g1_s2x2_d1x1_b1_in16x16_p1x1_num1_M512_activ1_eltwise0_FP32", "5 1 16 2 1 1 16 1 0", +"EU72_k3x3_cn256_g256_s1x1_d1x1_b1_in48x48_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn256_g256_s1x1_d1x1_b1_in48x48_p1x1_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn256_g256_s2x2_d1x1_b1_in48x48_p0x0_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn256_g256_s2x2_d1x1_b1_in48x48_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn32_g32_s1x1_d1x1_b1_in160x160_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn32_g32_s1x1_d1x1_b1_in160x160_p1x1_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn3_g1_s2x2_d1x1_b1_in256x256_p0x0_num1_M32_activ5_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU72_k3x3_cn3_g1_s2x2_d1x1_b1_in256x256_p1x1_num1_M32_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU72_k3x3_cn512_g512_s1x1_d1x1_b1_in32x32_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn512_g512_s1x1_d1x1_b1_in32x32_p1x1_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn512_g512_s2x2_d1x1_b1_in32x32_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn512_g512_s2x2_d1x1_b1_in32x32_p1x1_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M192_activ1_eltwise0_FP32", "14 2 16 2 1 1 16 1 0", +"EU72_k3x3_cn64_g1_s2x2_d1x1_b1_in16x16_p0x0_num1_M128_activ5_eltwise0_FP32", "1 1 8 2 1 1 8 1 0", +"EU72_k3x3_cn64_g1_s2x2_d1x1_b1_in16x16_p1x1_num1_M128_activ1_eltwise0_FP32", "1 1 16 2 1 1 16 1 0", +"EU72_k3x3_cn64_g64_s2x2_d1x1_b1_in160x160_p0x0_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn64_g64_s2x2_d1x1_b1_in160x160_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU72_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M208_activ1_eltwise0_FP32", "7 2 16 2 1 1 16 1 0", +"EU72_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128_activ1_eltwise0_FP32", "7 2 16 2 1 1 16 1 0", +"EU72_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M48_activ1_eltwise0_FP32", "4 2 8 2 1 1 8 1 0", +"EU72_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M32_activ1_eltwise0_FP32", "6 1 16 2 1 1 16 1 0", +"EU72_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64_activ1_eltwise0_FP32", "5 1 16 2 1 1 16 1 0", +"EU72_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128_activ1_eltwise0_FP32", "5 1 16 2 1 1 16 1 0", +"EU72_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64_activ1_eltwise0_FP32", "6 1 16 2 1 1 16 1 0", +"EU72_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M96_activ1_eltwise0_FP32", "7 2 16 2 1 1 16 1 0", +"EU72_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128_activ1_eltwise0_FP32", "4 1 16 2 1 1 16 1 0", +"EU72_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num1_M64_activ1_eltwise0_FP32", "4 3 16 2 1 1 16 1 0", // Below is the information for OpenCL based on which these configurations tuned /******************************************************************************* Number of platforms 1 - Platform Name Intel(R) OpenCL + Platform Name Intel(R) OpenCL HD Graphics Platform Vendor Intel(R) Corporation - Platform Version OpenCL 2.0 + Platform Version OpenCL 2.1 Platform Profile FULL_PROFILE - Platform Extensions cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_device_side_avc_motion_estimation cl_intel_driver_diagnostics cl_intel_media_block_io cl_intel_motion_estimation cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_required_subgroup_size cl_intel_subgroups cl_intel_subgroups_short cl_intel_va_api_media_sharing cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_fp16 cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_spir cl_khr_subgroups + Platform Extensions cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_fp16 cl_khr_depth_images cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_intel_subgroups cl_intel_required_subgroup_size cl_intel_subgroups_short cl_khr_spir cl_intel_accelerator cl_intel_media_block_io cl_intel_driver_diagnostics cl_intel_device_side_avc_motion_estimation cl_khr_priority_hints cl_khr_throttle_hints cl_khr_create_command_queue cl_khr_subgroups cl_khr_il_program cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_fp64 cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_motion_estimation cl_intel_advanced_motion_estimation + Platform Host timer resolution 1ns Platform Extensions function suffix INTEL - Platform Name Intel(R) OpenCL + Platform Name Intel(R) OpenCL HD Graphics Number of devices 1 - Device Name Intel(R) HD Graphics + Device Name Intel(R) Gen9 HD Graphics NEO Device Vendor Intel(R) Corporation Device Vendor ID 0x8086 - Device Version OpenCL 2.0 - Driver Version r4.1.61547 + Device Version OpenCL 2.1 NEO + Driver Version 18.21.10858 Device OpenCL C Version OpenCL C 2.0 Device Type GPU Device Profile FULL_PROFILE @@ -235,11 +294,12 @@ Number of devices 1 Max clock frequency 950MHz Device Partition (core) Max number of sub-devices 0 - Supported partition types by (0x7F2200000000) + Supported partition types None Max work item dimensions 3 Max work item sizes 256x256x256 Max work group size 256 Preferred work group size multiple 32 + Max sub-groups per work group 32 Preferred / native vector sizes char 16 / 16 short 8 / 8 @@ -276,9 +336,9 @@ Number of devices 1 Support is emulated in software No Correctly-rounded divide and sqrt operations No Address bits 64, Little-Endian - Global memory size 13361912218 (12.44GiB) + Global memory size 13364170752 (12.45GiB) Error Correction support No - Max memory allocation 4294959103 (4GiB) + Max memory allocation 4294959104 (4GiB) Unified memory for Host and Device Yes Shared Virtual Memory (SVM) capabilities (core) Coarse-grained buffer sharing Yes @@ -292,13 +352,13 @@ Number of devices 1 Global 64 bytes Local 64 bytes Max size for global variable 65536 (64KiB) - Preferred total size of global vars 4294959103 (4GiB) + Preferred total size of global vars 4294959104 (4GiB) Global Memory cache type Read/Write Global Memory cache size 1048576 Global Memory cache line 64 bytes Image support Yes Max number of samplers per kernel 16 - Max size for 1D images from buffer 268434943 pixels + Max size for 1D images from buffer 268434944 pixels Max 1D or 2D image array size 2048 images Base address alignment for 2D image buffers 4 bytes Pitch alignment for 2D image buffers 4 bytes @@ -312,7 +372,7 @@ Number of devices 1 Max pipe packet size 1024 Local memory type Local Local memory size 65536 (64KiB) - Max constant buffer size 4294959103 (4GiB) + Max constant buffer size 4294959104 (4GiB) Max number of constant args 8 Max size of kernel argument 1024 Queue properties (on host) @@ -330,14 +390,16 @@ Number of devices 1 Execution capabilities Run OpenCL kernels Yes Run native kernels No + Sub-group independent forward progress Yes + IL version SPIR-V_1.0 SPIR versions 1.2 printf() buffer size 4194304 (4MiB) - Built-in kernels block_motion_estimate_intel;block_advanced_motion_estimate_check_intel;block_advanced_motion_estimate_bidirectional_check_intel + Built-in kernels block_motion_estimate_intel;block_advanced_motion_estimate_check_intel;block_advanced_motion_estimate_bidirectional_check_intel; Motion Estimation accelerator version (Intel) 2 Device Available Yes Compiler Available Yes Linker Available Yes - Device Extensions cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_device_side_avc_motion_estimation cl_intel_driver_diagnostics cl_intel_media_block_io cl_intel_motion_estimation cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_required_subgroup_size cl_intel_subgroups cl_intel_subgroups_short cl_intel_va_api_media_sharing cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_fp16 cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_spir cl_khr_subgroups + Device Extensions cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_fp16 cl_khr_depth_images cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_intel_subgroups cl_intel_required_subgroup_size cl_intel_subgroups_short cl_khr_spir cl_intel_accelerator cl_intel_media_block_io cl_intel_driver_diagnostics cl_intel_device_side_avc_motion_estimation cl_khr_priority_hints cl_khr_throttle_hints cl_khr_create_command_queue cl_khr_subgroups cl_khr_il_program cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_fp64 cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_motion_estimation cl_intel_advanced_motion_estimation NULL platform behavior clGetPlatformInfo(NULL, CL_PLATFORM_NAME, ...) No platform @@ -350,106 +412,155 @@ NULL platform behavior clCreateContextFromType(NULL, CL_DEVICE_TYPE_CUSTOM) No platform clCreateContextFromType(NULL, CL_DEVICE_TYPE_ALL) No platform ********************************************************************************/ -"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M144_activ1_eltwise0","8 3 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0","13 1 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M1024_activ1_eltwise1","1 8 32 5 1 8 1 1 0 ", -"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M96_activ1_eltwise0","4 4 16 2 1 1 16 1 0 ", -"EU48_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M128_activ1_eltwise0","2 5 8 2 1 1 8 1 0 ", -"EU48_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64_activ1_eltwise0","4 2 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn512_g1_s2x2_d1x1_b1_in32x32_p0x0_num1_M1024_activ0_eltwise0","1 16 32 5 1 16 1 1 0 ", -"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32_activ1_eltwise0","7 4 8 2 1 1 8 1 0 ", -"EU48_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M64_activ1_eltwise0","7 3 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M16_activ1_eltwise0","7 2 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn128_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M512_activ1_eltwise1","1 16 32 5 1 16 1 1 0 ", -"EU48_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M192_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU48_k3x3_cn512_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M512_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128_activ1_eltwise0","14 2 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M96_activ1_eltwise0","14 2 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn256_g1_s2x2_d1x1_b1_in64x64_p0x0_num1_M512_activ0_eltwise0","1 16 32 5 1 16 1 1 0 ", -"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M112_activ1_eltwise0","8 3 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0","5 2 8 2 1 1 8 1 0 ", -"EU48_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M224_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU48_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128_activ1_eltwise0","2 10 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M48_activ1_eltwise0","5 2 8 2 1 1 8 1 0 ", -"EU48_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M320_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn256_g1_s2x2_d1x1_b1_in64x64_p0x0_num1_M128_activ1_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128_activ1_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU48_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M256_activ0_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0","7 2 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0","7 2 8 2 1 1 8 1 0 ", -"EU48_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M32_activ1_eltwise0","4 4 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M384_activ1_eltwise0","7 3 8 2 1 1 8 1 0 ", -"EU48_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M288_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M256_activ1_eltwise1","1 16 32 5 1 16 1 1 0 ", -"EU48_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU48_k1x1_cn2048_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M512_activ1_eltwise0","3 7 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192_activ1_eltwise0","7 2 8 2 1 1 8 1 0 ", -"EU48_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128_activ1_eltwise0","4 3 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24_activ1_eltwise0","5 2 8 2 1 1 8 1 0 ", -"EU48_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384_activ1_eltwise0","2 4 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0","7 2 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M16_activ1_eltwise0","6 2 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64_activ1_eltwise0","9 3 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn512_g1_s2x2_d1x1_b1_in32x32_p0x0_num1_M256_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU48_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M192_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64_activ1_eltwise0","7 4 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU48_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0","8 3 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192_activ1_eltwise0","8 3 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0","7 3 8 2 1 1 8 1 0 ", -"EU48_k3x3_cn256_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384_activ1_eltwise0","7 3 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0","5 2 8 2 1 1 8 1 0 ", -"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64_activ1_eltwise0","3 2 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0","8 3 8 2 1 1 8 1 0 ", -"EU48_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M48_activ1_eltwise0","4 2 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32_activ1_eltwise0","7 3 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M2048_activ1_eltwise1","2 8 32 5 1 8 1 1 0 ", -"EU48_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128_activ1_eltwise0","2 10 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn1024_g1_s2x2_d1x1_b1_in16x16_p0x0_num1_M512_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU48_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M208_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M96_activ1_eltwise0","7 3 8 2 1 1 8 1 0 ", -"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128_activ1_eltwise0","2 3 16 2 1 1 16 1 0 ", -"EU48_k3x3_cn256_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256_activ1_eltwise0","3 5 16 2 1 1 16 1 0 ", -"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0","7 2 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU48_k1x1_cn1024_g1_s2x2_d1x1_b1_in16x16_p0x0_num1_M2048_activ0_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0","7 3 8 2 1 1 8 1 0 ", -"EU48_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num1_M64_activ1_eltwise0","4 3 16 2 1 1 16 1 0 ", -"EU48_k11x11_cn3_g1_s4x4_d1x1_b1_in240x240_p0x0_num1_M96_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU48_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M192_activ1_eltwise0","7 3 16 2 1 1 16 1 0 ", -"EU48_k5x5_cn96_g2_s1x1_d1x1_b1_in32x32_p2x2_num1_M128_activ1_eltwise0","3 7 8 2 1 1 8 1 0 ", -"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64_activ1_eltwise0","7 4 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M1024_activ1_eltwise0_FP32", "2 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M1024_activ5_eltwise0_FP32", "2 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M126_activ0_eltwise0_FP32", "10 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24_activ0_eltwise0_FP32", "5 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0_FP32", "10 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ5_eltwise0_FP32", "10 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M546_activ0_eltwise0_FP32", "10 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn128_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M126_activ0_eltwise0_FP32", "1 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn128_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24_activ0_eltwise0_FP32", "1 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn128_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M546_activ0_eltwise0_FP32", "1 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn128_g1_s1x1_d1x1_b1_in48x48_p0x0_num1_M256_activ1_eltwise0_FP32", "2 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn128_g1_s1x1_d1x1_b1_in48x48_p0x0_num1_M256_activ5_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn128_g1_s1x1_d1x1_b1_in80x80_p0x0_num1_M128_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn128_g1_s1x1_d1x1_b1_in80x80_p0x0_num1_M128_activ5_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M16_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M96_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M126_activ0_eltwise0_FP32", "3 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP32", "3 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ5_eltwise0_FP32", "3 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24_activ0_eltwise0_FP32", "1 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M546_activ0_eltwise0_FP32", "3 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0_FP32", "2 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ5_eltwise0_FP32", "1 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M512_activ1_eltwise0_FP32", "2 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M512_activ5_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in48x48_p0x0_num1_M256_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in48x48_p0x0_num1_M256_activ5_eltwise0_FP32", "2 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn32_g1_s1x1_d1x1_b1_in160x160_p0x0_num1_M64_activ1_eltwise0_FP32", "1 16 32 5 1 16 1 1 0", +"EU48_k1x1_cn32_g1_s1x1_d1x1_b1_in160x160_p0x0_num1_M64_activ5_eltwise0_FP32", "1 16 32 5 1 16 1 1 0", +"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M16_activ1_eltwise0_FP32", "8 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M96_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M1024_activ1_eltwise0_FP32", "10 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M1024_activ5_eltwise0_FP32", "2 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M112_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M126_activ0_eltwise0_FP32", "5 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ5_eltwise0_FP32", "5 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M144_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24_activ0_eltwise0_FP32", "4 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24_activ1_eltwise0_FP32", "8 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0_FP32", "8 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M546_activ0_eltwise0_FP32", "5 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M12_activ0_eltwise0_FP32", "9 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M273_activ0_eltwise0_FP32", "10 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M512_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M512_activ5_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M63_activ0_eltwise0_FP32", "10 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn64_g1_s1x1_d1x1_b1_in80x80_p0x0_num1_M128_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn64_g1_s1x1_d1x1_b1_in80x80_p0x0_num1_M128_activ5_eltwise0_FP32", "2 8 32 5 1 8 1 1 0", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP32", "7 2 8 2 1 1 8 1 0", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0_FP32", "4 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M384_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M48_activ1_eltwise0_FP32", "4 1 8 2 1 1 8 1 0", +"EU48_k3x3_cn1024_g1024_s1x1_d1x1_b1_in16x16_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn1024_g1024_s1x1_d1x1_b1_in16x16_p1x1_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M224_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn128_g128_s1x1_d1x1_b1_in80x80_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn128_g128_s1x1_d1x1_b1_in80x80_p1x1_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn128_g128_s2x2_d1x1_b1_in80x80_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn128_g128_s2x2_d1x1_b1_in80x80_p1x1_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M192_activ1_eltwise0_FP32", "14 2 16 2 1 1 16 1 0", +"EU48_k3x3_cn128_g1_s2x2_d1x1_b1_in16x16_p0x0_num1_M256_activ5_eltwise0_FP32", "2 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn128_g1_s2x2_d1x1_b1_in16x16_p1x1_num1_M256_activ1_eltwise0_FP32", "3 1 8 2 1 1 8 1 0", +"EU48_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M288_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M320_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn256_g1_s2x2_d1x1_b1_in16x16_p0x0_num1_M512_activ5_eltwise0_FP32", "5 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn256_g1_s2x2_d1x1_b1_in16x16_p1x1_num1_M512_activ1_eltwise0_FP32", "5 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn256_g256_s1x1_d1x1_b1_in48x48_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn256_g256_s1x1_d1x1_b1_in48x48_p1x1_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn256_g256_s2x2_d1x1_b1_in48x48_p0x0_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn256_g256_s2x2_d1x1_b1_in48x48_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn32_g32_s1x1_d1x1_b1_in160x160_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn32_g32_s1x1_d1x1_b1_in160x160_p1x1_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn3_g1_s2x2_d1x1_b1_in256x256_p0x0_num1_M32_activ5_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k3x3_cn3_g1_s2x2_d1x1_b1_in256x256_p1x1_num1_M32_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k3x3_cn512_g512_s1x1_d1x1_b1_in32x32_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn512_g512_s1x1_d1x1_b1_in32x32_p1x1_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn512_g512_s2x2_d1x1_b1_in32x32_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn512_g512_s2x2_d1x1_b1_in32x32_p1x1_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M192_activ1_eltwise0_FP32", "14 2 16 2 1 1 16 1 0", +"EU48_k3x3_cn64_g1_s2x2_d1x1_b1_in16x16_p0x0_num1_M128_activ5_eltwise0_FP32", "1 1 8 2 1 1 8 1 0", +"EU48_k3x3_cn64_g1_s2x2_d1x1_b1_in16x16_p1x1_num1_M128_activ1_eltwise0_FP32", "1 1 8 2 1 1 8 1 0", +"EU48_k3x3_cn64_g64_s2x2_d1x1_b1_in160x160_p0x0_num1_M1_activ5_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn64_g64_s2x2_d1x1_b1_in160x160_p1x1_num1_M1_activ1_eltwise0_FP32", "1 1 1 6 1 1 1 0 0", +"EU48_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M208_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128_activ1_eltwise0_FP32", "14 2 16 2 1 1 16 1 0", +"EU48_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M48_activ1_eltwise0_FP32", "4 2 8 2 1 1 8 1 0", +"EU48_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M32_activ1_eltwise0_FP32", "4 3 8 2 1 1 8 1 0", +"EU48_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64_activ1_eltwise0_FP32", "4 1 16 2 1 1 16 1 0", +"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64_activ1_eltwise0_FP32", "4 2 8 2 1 1 8 1 0", +"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M96_activ1_eltwise0_FP32", "4 7 8 2 1 1 8 1 0", +"EU48_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128_activ1_eltwise0_FP32", "4 1 16 2 1 1 16 1 0", +"EU48_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num1_M64_activ1_eltwise0_FP32", "5 4 16 2 1 1 16 1 0", +"EU48_k11x11_cn3_g1_s4x4_d1x1_b1_in240x240_p0x0_num1_M96_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU48_k3x3_cn256_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384_activ1_eltwise0_FP32", "13 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M128_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M192_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU48_k5x5_cn96_g2_s1x1_d1x1_b1_in32x32_p2x2_num1_M128_activ1_eltwise0_FP32", "3 9 8 2 1 1 8 1 0", // Below is the information for OpenCL based on which these configurations tuned /******************************************************************************* Number of platforms 1 - Platform Name Intel(R) OpenCL + Platform Name Intel(R) OpenCL HD Graphics Platform Vendor Intel(R) Corporation - Platform Version OpenCL 2.0 + Platform Version OpenCL 2.1 Platform Profile FULL_PROFILE - Platform Extensions cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_device_side_avc_motion_estimation cl_intel_driver_diagnostics cl_intel_media_block_io cl_intel_motion_estimation cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_required_subgroup_size cl_intel_subgroups cl_intel_subgroups_short cl_intel_va_api_media_sharing cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_fp16 cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_spir cl_khr_subgroups + Platform Extensions cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_fp16 cl_khr_depth_images cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_intel_subgroups cl_intel_required_subgroup_size cl_intel_subgroups_short cl_khr_spir cl_intel_accelerator cl_intel_media_block_io cl_intel_driver_diagnostics cl_intel_device_side_avc_motion_estimation cl_khr_priority_hints cl_khr_throttle_hints cl_khr_create_command_queue cl_khr_fp64 cl_khr_subgroups cl_khr_il_program cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_motion_estimation cl_intel_advanced_motion_estimation cl_intel_va_api_media_sharing + Platform Host timer resolution 1ns Platform Extensions function suffix INTEL - Platform Name Intel(R) OpenCL + Platform Name Intel(R) OpenCL HD Graphics Number of devices 1 - Device Name Intel(R) HD Graphics + Device Name Intel(R) Gen9 HD Graphics NEO Device Vendor Intel(R) Corporation Device Vendor ID 0x8086 - Device Version OpenCL 2.0 - Driver Version 16.5.59288 + Device Version OpenCL 2.1 NEO + Driver Version 18.23.10915 Device OpenCL C Version OpenCL C 2.0 Device Type GPU Device Profile FULL_PROFILE Max compute units 24 - Max clock frequency 1050MHz + Max clock frequency 1150MHz Device Partition (core) Max number of sub-devices 0 - Supported partition types by (0x7FC300000000) + Supported partition types None Max work item dimensions 3 Max work item sizes 256x256x256 Max work group size 256 Preferred work group size multiple 32 + Max sub-groups per work group 32 Preferred / native vector sizes char 16 / 16 short 8 / 8 @@ -486,9 +597,9 @@ Number of devices 1 Support is emulated in software No Correctly-rounded divide and sqrt operations No Address bits 64, Little-Endian - Global memory size 6588809216 (6.136GiB) + Global memory size 6575288320 (6.124GiB) Error Correction support No - Max memory allocation 3294404608 (3.068GiB) + Max memory allocation 3287644160 (3.062GiB) Unified memory for Host and Device Yes Shared Virtual Memory (SVM) capabilities (core) Coarse-grained buffer sharing Yes @@ -502,13 +613,241 @@ Number of devices 1 Global 64 bytes Local 64 bytes Max size for global variable 65536 (64KiB) - Preferred total size of global vars 3294404608 (3.068GiB) + Preferred total size of global vars 3287644160 (3.062GiB) Global Memory cache type Read/Write Global Memory cache size 524288 Global Memory cache line 64 bytes Image support Yes Max number of samplers per kernel 16 - Max size for 1D images from buffer 205900288 pixels + Max size for 1D images from buffer 205477760 pixels + Max 1D or 2D image array size 2048 images + Base address alignment for 2D image buffers 4 bytes + Pitch alignment for 2D image buffers 4 bytes + Max 2D image size 16384x16384 pixels + Max 3D image size 16384x16384x2048 pixels + Max number of read image args 128 + Max number of write image args 128 + Max number of read/write image args 128 + Max number of pipe args 16 + Max active pipe reservations 1 + Max pipe packet size 1024 + Local memory type Local + Local memory size 65536 (64KiB) + Max constant buffer size 3287644160 (3.062GiB) + Max number of constant args 8 + Max size of kernel argument 1024 + Queue properties (on host) + Out-of-order execution Yes + Profiling Yes + Queue properties (on device) + Out-of-order execution Yes + Profiling Yes + Preferred size 131072 (128KiB) + Max size 67108864 (64MiB) + Max queues on device 1 + Max events on device 1024 + Prefer user sync for interop Yes + Profiling timer resolution 83ns + Execution capabilities + Run OpenCL kernels Yes + Run native kernels No + Sub-group independent forward progress Yes + IL version SPIR-V_1.0 + SPIR versions 1.2 + printf() buffer size 4194304 (4MiB) + Built-in kernels block_motion_estimate_intel;block_advanced_motion_estimate_check_intel;block_advanced_motion_estimate_bidirectional_check_intel; + Motion Estimation accelerator version (Intel) 2 + Device Available Yes + Compiler Available Yes + Linker Available Yes + Device Extensions cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_fp16 cl_khr_depth_images cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_intel_subgroups cl_intel_required_subgroup_size cl_intel_subgroups_short cl_khr_spir cl_intel_accelerator cl_intel_media_block_io cl_intel_driver_diagnostics cl_intel_device_side_avc_motion_estimation cl_khr_priority_hints cl_khr_throttle_hints cl_khr_create_command_queue cl_khr_fp64 cl_khr_subgroups cl_khr_il_program cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_motion_estimation cl_intel_advanced_motion_estimation cl_intel_va_api_media_sharing + +NULL platform behavior + clGetPlatformInfo(NULL, CL_PLATFORM_NAME, ...) Intel(R) OpenCL HD Graphics + clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, ...) Success [INTEL] + clCreateContext(NULL, ...) [default] Success [INTEL] + clCreateContextFromType(NULL, CL_DEVICE_TYPE_CPU) No devices found in platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU) Success (1) + Platform Name Intel(R) OpenCL HD Graphics + Device Name Intel(R) Gen9 HD Graphics NEO + clCreateContextFromType(NULL, CL_DEVICE_TYPE_ACCELERATOR) No devices found in platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_CUSTOM) No devices found in platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_ALL) Success (1) + Platform Name Intel(R) OpenCL HD Graphics + Device Name Intel(R) Gen9 HD Graphics NEO + +ICD loader properties + ICD loader Name OpenCL ICD Loader + ICD loader Vendor OCL Icd free software + ICD loader Version 2.2.8 + ICD loader Profile OpenCL 1.2 + NOTE: your OpenCL library declares to support OpenCL 1.2, + but it seems to support up to OpenCL 2.1 too. +********************************************************************************/ +"EU24_k11x11_cn3_g1_s4x4_d1x1_b1_in240x240_p0x0_num1_M96_activ1_eltwise0_FP32", "2 5 16 2 1 1 16 1 0", +"EU24_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn1024_g1_s2x2_d1x1_b1_in16x16_p0x0_num1_M2048_activ0_eltwise0_FP32", "7 4 16 2 1 1 16 1 0", +"EU24_k1x1_cn1024_g1_s2x2_d1x1_b1_in16x16_p0x0_num1_M512_activ1_eltwise0_FP32", "7 2 16 2 1 1 16 1 0", +"EU24_k1x1_cn128_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M512_activ1_eltwise1_FP32", "2 8 32 5 1 8 1 1 0", +"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M16_activ1_eltwise0_FP32", "10 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64_activ1_eltwise0_FP32", "10 2 16 2 1 1 16 1 0", +"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M96_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn2048_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M512_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M1024_activ1_eltwise1_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128_activ1_eltwise0_FP32", "14 2 16 2 1 1 16 1 0", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32_activ1_eltwise0_FP32", "10 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn256_g1_s2x2_d1x1_b1_in64x64_p0x0_num1_M128_activ1_eltwise0_FP32", "1 8 32 5 1 8 1 1 0", +"EU24_k1x1_cn256_g1_s2x2_d1x1_b1_in64x64_p0x0_num1_M512_activ0_eltwise0_FP32", "2 8 32 5 1 8 1 1 0", +"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M16_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0_FP32", "8 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M96_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M112_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP32", "8 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M144_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M2048_activ1_eltwise1_FP32", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s2x2_d1x1_b1_in32x32_p0x0_num1_M1024_activ0_eltwise0_FP32", "2 8 32 5 1 8 1 1 0", +"EU24_k1x1_cn512_g1_s2x2_d1x1_b1_in32x32_p0x0_num1_M256_activ1_eltwise0_FP32", "7 3 16 2 1 1 16 1 0", +"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M256_activ0_eltwise0_FP32", "2 8 32 5 1 8 1 1 0", +"EU24_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M256_activ1_eltwise1_FP32", "1 16 32 5 1 16 1 1 0", +"EU24_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64_activ1_eltwise0_FP32", "2 8 32 5 1 8 1 1 0", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192_activ1_eltwise0_FP32", "7 2 16 2 1 1 16 1 0", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0_FP32", "7 1 8 2 1 1 8 1 0", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M384_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M48_activ1_eltwise0_FP32", "4 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M224_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M192_activ1_eltwise0_FP32", "14 2 16 2 1 1 16 1 0", +"EU24_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M288_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M320_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn256_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn256_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384_activ1_eltwise0_FP32", "13 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M128_activ1_eltwise0_FP32", "13 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M192_activ1_eltwise0_FP32", "13 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn512_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M512_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M192_activ1_eltwise0_FP32", "14 2 16 2 1 1 16 1 0", +"EU24_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M64_activ1_eltwise0_FP32", "14 2 16 2 1 1 16 1 0", +"EU24_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M208_activ1_eltwise0_FP32", "14 2 16 2 1 1 16 1 0", +"EU24_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128_activ1_eltwise0_FP32", "14 1 16 2 1 1 16 1 0", +"EU24_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M48_activ1_eltwise0_FP32", "5 1 16 2 1 1 16 1 0", +"EU24_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M32_activ1_eltwise0_FP32", "4 3 8 2 1 1 8 1 0", +"EU24_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64_activ1_eltwise0_FP32", "5 1 16 2 1 1 16 1 0", +"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128_activ1_eltwise0_FP32", "4 1 16 2 1 1 16 1 0", +"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64_activ1_eltwise0_FP32", "7 1 16 2 1 1 16 1 0", +"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M96_activ1_eltwise0_FP32", "7 2 16 2 1 1 16 1 0", +"EU24_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128_activ1_eltwise0_FP32", "3 7 8 2 1 1 8 1 0", +"EU24_k5x5_cn96_g2_s1x1_d1x1_b1_in32x32_p2x2_num1_M128_activ1_eltwise0_FP32", "9 3 16 2 1 1 16 1 0", +"EU24_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num1_M64_activ1_eltwise0_FP32", "4 4 16 2 1 1 16 1 0", +}; + +const char *default_kernel_config_intel_fp16[] = { +// Below is the information for OpenCL based on which these configurations tuned +/******************************************************************************* +Number of platforms 1 + Platform Name Intel(R) OpenCL HD Graphics + Platform Vendor Intel(R) Corporation + Platform Version OpenCL 2.1 + Platform Profile FULL_PROFILE + Platform Extensions cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_fp16 cl_khr_depth_images cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_intel_subgroups cl_intel_required_subgroup_size cl_intel_subgroups_short cl_khr_spir cl_intel_accelerator cl_intel_media_block_io cl_intel_driver_diagnostics cl_intel_device_side_avc_motion_estimation cl_khr_priority_hints cl_khr_throttle_hints cl_khr_create_command_queue cl_khr_subgroups cl_khr_il_program cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_fp64 cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_motion_estimation cl_intel_advanced_motion_estimation + Platform Host timer resolution 1ns + Platform Extensions function suffix INTEL + + Platform Name Intel(R) OpenCL HD Graphics +Number of devices 1 + Device Name Intel(R) Gen9 HD Graphics NEO + Device Vendor Intel(R) Corporation + Device Vendor ID 0x8086 + Device Version OpenCL 2.1 NEO + Driver Version 18.21.10858 + Device OpenCL C Version OpenCL C 2.0 + Device Type GPU + Device Profile FULL_PROFILE + Max compute units 48 + Max clock frequency 950MHz + Device Partition (core) + Max number of sub-devices 0 + Supported partition types None + Max work item dimensions 3 + Max work item sizes 256x256x256 + Max work group size 256 + Preferred work group size multiple 32 + Max sub-groups per work group 32 + Preferred / native vector sizes + char 16 / 16 + short 8 / 8 + int 4 / 4 + long 1 / 1 + half 8 / 8 (cl_khr_fp16) + float 1 / 1 + double 1 / 1 (cl_khr_fp64) + Half-precision Floating-point support (cl_khr_fp16) + Denormals Yes + Infinity and NANs Yes + Round to nearest Yes + Round to zero Yes + Round to infinity Yes + IEEE754-2008 fused multiply-add Yes + Support is emulated in software No + Correctly-rounded divide and sqrt operations No + Single-precision Floating-point support (core) + Denormals Yes + Infinity and NANs Yes + Round to nearest Yes + Round to zero Yes + Round to infinity Yes + IEEE754-2008 fused multiply-add Yes + Support is emulated in software No + Correctly-rounded divide and sqrt operations Yes + Double-precision Floating-point support (cl_khr_fp64) + Denormals Yes + Infinity and NANs Yes + Round to nearest Yes + Round to zero Yes + Round to infinity Yes + IEEE754-2008 fused multiply-add Yes + Support is emulated in software No + Correctly-rounded divide and sqrt operations No + Address bits 64, Little-Endian + Global memory size 13364170752 (12.45GiB) + Error Correction support No + Max memory allocation 4294959104 (4GiB) + Unified memory for Host and Device Yes + Shared Virtual Memory (SVM) capabilities (core) + Coarse-grained buffer sharing Yes + Fine-grained buffer sharing No + Fine-grained system sharing No + Atomics No + Minimum alignment for any data type 128 bytes + Alignment of base address 1024 bits (128 bytes) + Preferred alignment for atomics + SVM 64 bytes + Global 64 bytes + Local 64 bytes + Max size for global variable 65536 (64KiB) + Preferred total size of global vars 4294959104 (4GiB) + Global Memory cache type Read/Write + Global Memory cache size 1048576 + Global Memory cache line 64 bytes + Image support Yes + Max number of samplers per kernel 16 + Max size for 1D images from buffer 268434944 pixels Max 1D or 2D image array size 2048 images Base address alignment for 2D image buffers 4 bytes Pitch alignment for 2D image buffers 4 bytes @@ -522,7 +861,7 @@ Number of devices 1 Max pipe packet size 1024 Local memory type Local Local memory size 65536 (64KiB) - Max constant buffer size 3294404608 (3.068GiB) + Max constant buffer size 4294959104 (4GiB) Max number of constant args 8 Max size of kernel argument 1024 Queue properties (on host) @@ -540,14 +879,16 @@ Number of devices 1 Execution capabilities Run OpenCL kernels Yes Run native kernels No + Sub-group independent forward progress Yes + IL version SPIR-V_1.0 SPIR versions 1.2 printf() buffer size 4194304 (4MiB) - Built-in kernels block_motion_estimate_intel;block_advanced_motion_estimate_check_intel;block_advanced_motion_estimate_bidirectional_check_intel + Built-in kernels block_motion_estimate_intel;block_advanced_motion_estimate_check_intel;block_advanced_motion_estimate_bidirectional_check_intel; Motion Estimation accelerator version (Intel) 2 Device Available Yes Compiler Available Yes Linker Available Yes - Device Extensions cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_device_side_avc_motion_estimation cl_intel_driver_diagnostics cl_intel_media_block_io cl_intel_motion_estimation cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_required_subgroup_size cl_intel_subgroups cl_intel_subgroups_short cl_intel_va_api_media_sharing cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_fp16 cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_spir cl_khr_subgroups + Device Extensions cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_fp16 cl_khr_depth_images cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_intel_subgroups cl_intel_required_subgroup_size cl_intel_subgroups_short cl_khr_spir cl_intel_accelerator cl_intel_media_block_io cl_intel_driver_diagnostics cl_intel_device_side_avc_motion_estimation cl_khr_priority_hints cl_khr_throttle_hints cl_khr_create_command_queue cl_khr_subgroups cl_khr_il_program cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_fp64 cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_motion_estimation cl_intel_advanced_motion_estimation NULL platform behavior clGetPlatformInfo(NULL, CL_PLATFORM_NAME, ...) No platform @@ -560,153 +901,282 @@ NULL platform behavior clCreateContextFromType(NULL, CL_DEVICE_TYPE_CUSTOM) No platform clCreateContextFromType(NULL, CL_DEVICE_TYPE_ALL) No platform ********************************************************************************/ -"EU24_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M384_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn512_g1_s2x2_d1x1_b1_in32x32_p0x0_num1_M256_activ1_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24_activ1_eltwise0","8 3 8 2 1 1 8 1 0 ", -"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M128_activ1_eltwise0","4 4 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M112_activ1_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M144_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M192_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x5_cn32_g1_s1x1_d1x1_b1_in64x64_p0x2_num1_M32_activ2_eltwise0","8 3 16 2 1 1 16 1 0 ", -"EU24_k2x2_cn16_g1_s2x2_d1x1_b1_in256x256_p0x0_num1_M16_activ2_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M64_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64_activ1_eltwise0","4 4 16 2 1 1 16 1 0 ", -"EU24_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num2_M64_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn128_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M16_activ2_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU24_k3x3_cn3_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M5_activ0_eltwise0","2 4 8 2 1 1 8 1 0 ", -"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num2_M96_activ1_eltwise0","4 4 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M192_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k3x3_cn32_g1_s1x1_d2x2_b1_in64x64_p2x2_num1_M32_activ2_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64_activ1_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M224_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k3x3_cn32_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M32_activ2_eltwise0","10 2 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M16_activ1_eltwise0","12 2 8 2 1 1 8 1 0 ", -"EU24_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M128_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M96_activ1_eltwise0","4 4 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M192_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M256_activ0_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU24_k3x3_cn4_g1_s1x1_d1x1_b1_in256x256_p1x1_num1_M4_activ2_eltwise0","2 8 16 2 1 1 16 1 0 ", -"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128_activ1_eltwise0","4 2 16 2 1 1 16 1 0 ", -"EU24_k3x3_cn256_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k2x2_cn64_g1_s2x2_d1x1_b1_in128x128_p0x0_num1_M32_activ2_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M208_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M1024_activ1_eltwise1","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M256_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn2048_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M512_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M48_activ1_eltwise0","4 6 8 2 1 1 8 1 0 ", -"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M256_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k3x3_cn32_g1_s1x1_d16x16_b1_in64x64_p16x16_num1_M32_activ2_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M384_activ1_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn4_g1_s1x1_d1x1_b1_in256x256_p0x0_num1_M16_activ0_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0","8 2 8 2 1 1 8 1 0 ", -"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M16_activ1_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn64_g1_s1x1_d1x1_b1_in128x128_p0x0_num1_M16_activ0_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M96_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M128_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M32_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M160_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M32_activ1_eltwise0","4 4 16 2 1 1 16 1 0 ", -"EU24_k5x4_cn6_g3_s3x2_d1x1_b1_in128x80_p1x0_num2_M4_activ0_eltwise0","1 1 1 4 1 1 1 0 1 ", -"EU24_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M64_activ1_eltwise0","4 4 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M160_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k3x3_cn16_g1_s1x1_d1x1_b1_in128x128_p1x1_num1_M16_activ2_eltwise0","2 4 16 2 1 1 16 1 0 ", -"EU24_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M288_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128_activ1_eltwise0","2 6 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ0_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn64_g1_s1x1_d1x1_b1_in128x128_p0x0_num1_M4_activ2_eltwise0","8 3 8 2 1 1 8 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M64_activ1_eltwise0","4 2 16 2 1 1 16 1 0 ", -"EU24_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num1_M64_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn256_g1_s2x2_d1x1_b1_in64x64_p0x0_num1_M128_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn3_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M4_activ0_eltwise0","4 4 8 2 1 1 8 1 0 ", -"EU24_k11x7_cn3_g1_s3x4_d1x1_b1_in64x64_p3x2_num1_M64_activ0_eltwise0","4 1 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M160_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn64_g1_s1x1_d1x1_b1_in128x128_p0x0_num1_M16_activ2_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M320_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k5x5_cn96_g2_s1x1_d1x1_b1_in32x32_p2x2_num1_M128_activ1_eltwise0","4 3 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M16_activ1_eltwise0","12 2 8 2 1 1 8 1 0 ", -"EU24_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num1_M64_activ0_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M384_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M32_activ1_eltwise0","4 7 8 2 1 1 8 1 0 ", -"EU24_k1x1_cn1024_g1_s2x2_d1x1_b1_in16x16_p0x0_num1_M512_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M48_activ1_eltwise0","4 2 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn16_g1_s1x1_d1x1_b1_in128x128_p0x0_num1_M64_activ3_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num2_M192_activ1_eltwise0","14 2 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32_activ1_eltwise0","12 2 8 2 1 1 8 1 0 ", -"EU24_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M192_activ1_eltwise0","14 2 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M16_activ1_eltwise0","8 3 8 2 1 1 8 1 0 ", -"EU24_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num2_M128_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M96_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M64_activ1_eltwise0","2 8 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn128_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M512_activ1_eltwise1","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M48_activ1_eltwise0","4 6 8 2 1 1 8 1 0 ", -"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0","4 6 8 2 1 1 8 1 0 ", -"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M32_activ1_eltwise0","4 6 8 2 1 1 8 1 0 ", -"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num2_M192_activ1_eltwise0","14 2 16 2 1 1 16 1 0 ", -"EU24_k3x3_cn32_g1_s1x1_d8x8_b1_in64x64_p8x8_num1_M32_activ2_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M24_activ1_eltwise0","12 2 8 2 1 1 8 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0","4 6 8 2 1 1 8 1 0 ", -"EU24_k1x1_cn512_g1_s2x2_d1x1_b1_in32x32_p0x0_num1_M1024_activ0_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M32_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn16_g1_s1x1_d1x1_b1_in256x256_p0x0_num1_M4_activ2_eltwise0","12 2 8 2 1 1 8 1 0 ", -"EU24_k1x1_cn128_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M32_activ2_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn32_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M128_activ3_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn16_g1_s1x1_d1x1_b1_in128x128_p0x0_num1_M64_activ0_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M96_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M64_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k3x3_cn512_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M512_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M64_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M144_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M208_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M96_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k11x11_cn3_g1_s4x4_d1x1_b1_in224x224_p0x0_num1_M96_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64_activ1_eltwise0","4 3 16 2 1 1 16 1 0 ", -"EU24_k5x1_cn32_g1_s1x1_d1x1_b0_in64x64_p2x0_num1_M32_activ0_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k4x4_cn3_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M2_activ0_eltwise0","1 3 8 2 1 1 8 1 0 ", -"EU24_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M224_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn1024_g1_s2x2_d1x1_b1_in16x16_p0x0_num1_M2048_activ0_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU24_k3x3_cn3_g1_s2x2_d1x1_b1_in256x256_p1x1_num1_M13_activ0_eltwise0","1 1 1 4 1 1 1 0 1 ", -"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M128_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num2_M64_activ1_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32_activ1_eltwise0","12 2 8 2 1 1 8 1 0 ", -"EU24_k1x1_cn128_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64_activ0_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M128_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M128_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M112_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M128_activ1_eltwise0","4 2 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M256_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M256_activ1_eltwise1","1 16 32 5 1 16 1 1 0 ", -"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M288_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k3x3_cn256_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num2_M32_activ1_eltwise0","4 2 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M64_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128_activ1_eltwise0","4 3 8 2 1 1 8 1 0 ", -"EU24_k1x1_cn256_g1_s2x2_d1x1_b1_in64x64_p0x0_num1_M512_activ0_eltwise0","2 8 32 5 1 8 1 1 0 ", -"EU24_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M48_activ1_eltwise0","4 4 16 2 1 1 16 1 0 ", -"EU24_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M320_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", -"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M32_activ1_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k3x3_cn32_g1_s1x1_d4x4_b1_in64x64_p4x4_num1_M32_activ2_eltwise0","1 8 32 5 1 8 1 1 0 ", -"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M2048_activ1_eltwise1","4 7 16 2 1 1 16 1 0 ", -"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M192_activ1_eltwise0","2 7 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M16_activ1_eltwise0_FP16", "11 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M96_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M16_activ1_eltwise0_FP16", "8 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M96_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M112_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M144_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24_activ1_eltwise0_FP16", "7 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0_FP16", "8 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0_FP16", "6 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64_activ1_eltwise0_FP16", "1 16 32 5 1 16 1 1 0", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0_FP16", "4 1 8 2 1 1 8 1 0", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M384_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M48_activ1_eltwise0_FP16", "7 1 8 2 1 1 8 1 0", +"EU48_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M224_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M192_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU48_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M288_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M320_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M192_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU48_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M208_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU48_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M48_activ1_eltwise0_FP16", "5 1 16 2 1 1 16 1 0", +"EU48_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M32_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU48_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64_activ1_eltwise0_FP16", "5 1 16 2 1 1 16 1 0", +"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128_activ1_eltwise0_FP16", "8 1 16 2 1 1 16 1 0", +"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M96_activ1_eltwise0_FP16", "10 2 16 2 1 1 16 1 0", +"EU48_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128_activ1_eltwise0_FP16", "5 1 16 2 1 1 16 1 0", +"EU48_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num1_M64_activ1_eltwise0_FP16", "5 6 16 2 1 1 16 1 0", +"EU48_k11x11_cn3_g1_s4x4_d1x1_b1_in240x240_p0x0_num1_M96_activ1_eltwise0_FP16", "2 8 16 2 1 1 16 1 0", +"EU48_k3x3_cn256_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384_activ1_eltwise0_FP16", "13 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M128_activ1_eltwise0_FP16", "13 1 16 2 1 1 16 1 0", +"EU48_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M192_activ1_eltwise0_FP16", "13 1 16 2 1 1 16 1 0", +"EU48_k5x5_cn96_g2_s1x1_d1x1_b1_in32x32_p2x2_num1_M128_activ1_eltwise0_FP16", "9 2 16 2 1 1 16 1 0", +// Below is the information for OpenCL based on which these configurations tuned +/******************************************************************************* +Number of platforms 1 + Platform Name Intel(R) OpenCL HD Graphics + Platform Vendor Intel(R) Corporation + Platform Version OpenCL 2.1 + Platform Profile FULL_PROFILE + Platform Extensions cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_fp16 cl_khr_depth_images cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_intel_subgroups cl_intel_required_subgroup_size cl_intel_subgroups_short cl_khr_spir cl_intel_accelerator cl_intel_media_block_io cl_intel_driver_diagnostics cl_intel_device_side_avc_motion_estimation cl_khr_priority_hints cl_khr_throttle_hints cl_khr_create_command_queue cl_khr_fp64 cl_khr_subgroups cl_khr_il_program cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_motion_estimation cl_intel_advanced_motion_estimation cl_intel_va_api_media_sharing + Platform Host timer resolution 1ns + Platform Extensions function suffix INTEL + + Platform Name Intel(R) OpenCL HD Graphics +Number of devices 1 + Device Name Intel(R) Gen9 HD Graphics NEO + Device Vendor Intel(R) Corporation + Device Vendor ID 0x8086 + Device Version OpenCL 2.1 NEO + Driver Version 18.23.10915 + Device OpenCL C Version OpenCL C 2.0 + Device Type GPU + Device Profile FULL_PROFILE + Max compute units 24 + Max clock frequency 1150MHz + Device Partition (core) + Max number of sub-devices 0 + Supported partition types None + Max work item dimensions 3 + Max work item sizes 256x256x256 + Max work group size 256 + Preferred work group size multiple 32 + Max sub-groups per work group 32 + Preferred / native vector sizes + char 16 / 16 + short 8 / 8 + int 4 / 4 + long 1 / 1 + half 8 / 8 (cl_khr_fp16) + float 1 / 1 + double 1 / 1 (cl_khr_fp64) + Half-precision Floating-point support (cl_khr_fp16) + Denormals Yes + Infinity and NANs Yes + Round to nearest Yes + Round to zero Yes + Round to infinity Yes + IEEE754-2008 fused multiply-add Yes + Support is emulated in software No + Correctly-rounded divide and sqrt operations No + Single-precision Floating-point support (core) + Denormals Yes + Infinity and NANs Yes + Round to nearest Yes + Round to zero Yes + Round to infinity Yes + IEEE754-2008 fused multiply-add Yes + Support is emulated in software No + Correctly-rounded divide and sqrt operations Yes + Double-precision Floating-point support (cl_khr_fp64) + Denormals Yes + Infinity and NANs Yes + Round to nearest Yes + Round to zero Yes + Round to infinity Yes + IEEE754-2008 fused multiply-add Yes + Support is emulated in software No + Correctly-rounded divide and sqrt operations No + Address bits 64, Little-Endian + Global memory size 6575288320 (6.124GiB) + Error Correction support No + Max memory allocation 3287644160 (3.062GiB) + Unified memory for Host and Device Yes + Shared Virtual Memory (SVM) capabilities (core) + Coarse-grained buffer sharing Yes + Fine-grained buffer sharing No + Fine-grained system sharing No + Atomics No + Minimum alignment for any data type 128 bytes + Alignment of base address 1024 bits (128 bytes) + Preferred alignment for atomics + SVM 64 bytes + Global 64 bytes + Local 64 bytes + Max size for global variable 65536 (64KiB) + Preferred total size of global vars 3287644160 (3.062GiB) + Global Memory cache type Read/Write + Global Memory cache size 524288 + Global Memory cache line 64 bytes + Image support Yes + Max number of samplers per kernel 16 + Max size for 1D images from buffer 205477760 pixels + Max 1D or 2D image array size 2048 images + Base address alignment for 2D image buffers 4 bytes + Pitch alignment for 2D image buffers 4 bytes + Max 2D image size 16384x16384 pixels + Max 3D image size 16384x16384x2048 pixels + Max number of read image args 128 + Max number of write image args 128 + Max number of read/write image args 128 + Max number of pipe args 16 + Max active pipe reservations 1 + Max pipe packet size 1024 + Local memory type Local + Local memory size 65536 (64KiB) + Max constant buffer size 3287644160 (3.062GiB) + Max number of constant args 8 + Max size of kernel argument 1024 + Queue properties (on host) + Out-of-order execution Yes + Profiling Yes + Queue properties (on device) + Out-of-order execution Yes + Profiling Yes + Preferred size 131072 (128KiB) + Max size 67108864 (64MiB) + Max queues on device 1 + Max events on device 1024 + Prefer user sync for interop Yes + Profiling timer resolution 83ns + Execution capabilities + Run OpenCL kernels Yes + Run native kernels No + Sub-group independent forward progress Yes + IL version SPIR-V_1.0 + SPIR versions 1.2 + printf() buffer size 4194304 (4MiB) + Built-in kernels block_motion_estimate_intel;block_advanced_motion_estimate_check_intel;block_advanced_motion_estimate_bidirectional_check_intel; + Motion Estimation accelerator version (Intel) 2 + Device Available Yes + Compiler Available Yes + Linker Available Yes + Device Extensions cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_fp16 cl_khr_depth_images cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_intel_subgroups cl_intel_required_subgroup_size cl_intel_subgroups_short cl_khr_spir cl_intel_accelerator cl_intel_media_block_io cl_intel_driver_diagnostics cl_intel_device_side_avc_motion_estimation cl_khr_priority_hints cl_khr_throttle_hints cl_khr_create_command_queue cl_khr_fp64 cl_khr_subgroups cl_khr_il_program cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_motion_estimation cl_intel_advanced_motion_estimation cl_intel_va_api_media_sharing + +NULL platform behavior + clGetPlatformInfo(NULL, CL_PLATFORM_NAME, ...) Intel(R) OpenCL HD Graphics + clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, ...) Success [INTEL] + clCreateContext(NULL, ...) [default] Success [INTEL] + clCreateContextFromType(NULL, CL_DEVICE_TYPE_CPU) No devices found in platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU) Success (1) + Platform Name Intel(R) OpenCL HD Graphics + Device Name Intel(R) Gen9 HD Graphics NEO + clCreateContextFromType(NULL, CL_DEVICE_TYPE_ACCELERATOR) No devices found in platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_CUSTOM) No devices found in platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_ALL) Success (1) + Platform Name Intel(R) OpenCL HD Graphics + Device Name Intel(R) Gen9 HD Graphics NEO + +ICD loader properties + ICD loader Name OpenCL ICD Loader + ICD loader Vendor OCL Icd free software + ICD loader Version 2.2.8 + ICD loader Profile OpenCL 1.2 + NOTE: your OpenCL library declares to support OpenCL 1.2, + but it seems to support up to OpenCL 2.1 too. +********************************************************************************/ +"EU24_k11x11_cn3_g1_s4x4_d1x1_b1_in240x240_p0x0_num1_M96_activ1_eltwise0_FP16", "2 7 16 2 1 1 16 1 0", +"EU24_k1x1_cn1024_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU24_k1x1_cn1024_g1_s2x2_d1x1_b1_in16x16_p0x0_num1_M2048_activ0_eltwise0_FP16", "7 4 16 2 1 1 16 1 0", +"EU24_k1x1_cn1024_g1_s2x2_d1x1_b1_in16x16_p0x0_num1_M512_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn128_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M512_activ1_eltwise1_FP16", "1 16 32 5 1 16 1 1 0", +"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M16_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64_activ1_eltwise0_FP16", "10 3 16 2 1 1 16 1 0", +"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M96_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU24_k1x1_cn2048_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M512_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M1024_activ1_eltwise1_FP16", "14 2 16 2 1 1 16 1 0", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32_activ1_eltwise0_FP16", "10 2 16 2 1 1 16 1 0", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU24_k1x1_cn256_g1_s2x2_d1x1_b1_in64x64_p0x0_num1_M128_activ1_eltwise0_FP16", "8 4 16 2 1 1 16 1 0", +"EU24_k1x1_cn256_g1_s2x2_d1x1_b1_in64x64_p0x0_num1_M512_activ0_eltwise0_FP16", "1 16 32 5 1 16 1 1 0", +"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M16_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M96_activ1_eltwise0_FP16", "7 2 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M112_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M144_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M2048_activ1_eltwise1_FP16", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24_activ1_eltwise0_FP16", "10 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0_FP16", "9 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s2x2_d1x1_b1_in32x32_p0x0_num1_M1024_activ0_eltwise0_FP16", "8 3 16 2 1 1 16 1 0", +"EU24_k1x1_cn512_g1_s2x2_d1x1_b1_in32x32_p0x0_num1_M256_activ1_eltwise0_FP16", "7 3 16 2 1 1 16 1 0", +"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0_FP16", "8 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M256_activ0_eltwise0_FP16", "1 16 32 5 1 16 1 1 0", +"EU24_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M256_activ1_eltwise1_FP16", "1 16 32 5 1 16 1 1 0", +"EU24_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64_activ1_eltwise0_FP16", "1 16 32 5 1 16 1 1 0", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32_activ1_eltwise0_FP16", "6 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M384_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M48_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M224_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128_activ1_eltwise0_FP16", "10 2 16 2 1 1 16 1 0", +"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M192_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU24_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M288_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M320_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU24_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn256_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU24_k3x3_cn256_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384_activ1_eltwise0_FP16", "13 2 16 2 1 1 16 1 0", +"EU24_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M128_activ1_eltwise0_FP16", "13 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M192_activ1_eltwise0_FP16", "13 1 16 2 1 1 16 1 0", +"EU24_k3x3_cn512_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M512_activ1_eltwise0_FP16", "7 2 16 2 1 1 16 1 0", +"EU24_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M192_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU24_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M64_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU24_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M208_activ1_eltwise0_FP16", "14 2 16 2 1 1 16 1 0", +"EU24_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128_activ1_eltwise0_FP16", "14 1 16 2 1 1 16 1 0", +"EU24_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M48_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU24_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M32_activ1_eltwise0_FP16", "7 2 16 2 1 1 16 1 0", +"EU24_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128_activ1_eltwise0_FP16", "7 2 16 2 1 1 16 1 0", +"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64_activ1_eltwise0_FP16", "7 1 16 2 1 1 16 1 0", +"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M96_activ1_eltwise0_FP16", "7 2 16 2 1 1 16 1 0", +"EU24_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128_activ1_eltwise0_FP16", "4 1 16 2 1 1 16 1 0", +"EU24_k5x5_cn96_g2_s1x1_d1x1_b1_in32x32_p2x2_num1_M128_activ1_eltwise0_FP16", "7 3 16 2 1 1 16 1 0", +"EU24_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num1_M64_activ1_eltwise0_FP16", "4 7 16 2 1 1 16 1 0", }; #endif diff --git a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp index e0ce77e27a..f3a26a3e6d 100644 --- a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp +++ b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp @@ -306,6 +306,7 @@ class OCL4DNNConvSpatial std::string kernel_name_; std::string cache_path_; bool use_cache_path_; // true if cache_path_ directory exists + bool run_auto_tuning_; bool force_auto_tuning_; int32_t kernel_index_; std::vector< cv::Ptr > kernelQueue; diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp index 159319425e..034f8d3e7d 100644 --- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp +++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp @@ -55,6 +55,7 @@ #include "../include/math_functions.hpp" #include "../include/default_kernel_config.hpp" #include "opencv2/dnn/shape_utils.hpp" +#include "opencv2/core/utils/logger.hpp" #if defined WIN32 || defined _WIN32 #include @@ -67,6 +68,69 @@ typedef std::map kernel_hash_t; static kernel_hash_t kernelConfigMap; static bool defaultConfigLoaded = false; +static std::string sanitize(const std::string& s) +{ + std::string s_ = s; + for (size_t i = 0; i < s_.size(); i++) + { + char c = s_[i]; + if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_')) + { + s_[i] = '_'; + } + } + // TODO add hash? + // s_ = s_ + cv::format("_%08llx", crc64((uchar*)s.c_str(), s.size())); + return s_; +} + +static void initializeGlobalBuiltinConfigurations(const std::string& cache_path) +{ + CV_Assert(defaultConfigLoaded == false); + CV_Assert(kernelConfigMap.empty()); + + /* fp32 config */ + size_t numConfigs = sizeof(default_kernel_config_intel_fp32) / + sizeof(default_kernel_config_intel_fp32[0]) / 2; + for (size_t i = 0; i < numConfigs; i++) + { + std::string key = std::string("Intel(R) Corporation_") + default_kernel_config_intel_fp32[2 * i]; + if (!cache_path.empty()) + { + std::string cacheFile = cache_path + sanitize(key); + std::ifstream cachedKernel(cacheFile.c_str()); + if (cachedKernel) + continue; // external configuration found, skip builtin + } + std::pair entry( + key, + default_kernel_config_intel_fp32[2 * i + 1]); + kernelConfigMap.insert(entry); + } + + /* fp16 config */ + numConfigs = sizeof(default_kernel_config_intel_fp16) / + sizeof(default_kernel_config_intel_fp16[0]) / 2; + for (size_t i = 0; i < numConfigs; i++) + { + std::string key = std::string("Intel(R) Corporation_") + default_kernel_config_intel_fp16[2 * i]; + if (!cache_path.empty()) + { + std::string cacheFile = cache_path + sanitize(key); + std::ifstream cachedKernel(cacheFile.c_str()); + if (cachedKernel) + continue; // external configuration found, skip builtin + } + std::pair entry( + key, + default_kernel_config_intel_fp16[2 * i + 1]); + kernelConfigMap.insert(entry); + } + + defaultConfigLoaded = true; +} + + template OCL4DNNConvSpatial::OCL4DNNConvSpatial(OCL4DNNConvConfig config) { @@ -139,9 +203,8 @@ OCL4DNNConvSpatial::OCL4DNNConvSpatial(OCL4DNNConvConfig config) } } - force_auto_tuning_ = - (use_cache_path_ && !utils::getConfigurationParameterBool("OPENCV_OCL4DNN_DISABLE_AUTO_TUNING", false)) - || utils::getConfigurationParameterBool("OPENCV_OCL4DNN_FORCE_AUTO_TUNING", false); + run_auto_tuning_ = use_cache_path_ && !utils::getConfigurationParameterBool("OPENCV_OCL4DNN_DISABLE_AUTO_TUNING", false); + force_auto_tuning_ = utils::getConfigurationParameterBool("OPENCV_OCL4DNN_FORCE_AUTO_TUNING", false); } template @@ -272,40 +335,38 @@ void OCL4DNNConvSpatial::setupKernelDetails(int32_t kernelType, // options options_ << " -cl-fast-relaxed-math -D KERNEL_IDLF -D convolve_simd=" << kernel_name_; + options_ << " -cl-mad-enable"; if (clOptionSupport("-cl-no-subgroup-ifp")) options_ << " -cl-no-subgroup-ifp "; // defs - int32_t output_width = output_w_; - int32_t output_height = output_h_; int32_t output_block_width = blockM; int32_t output_block_height = blockK; - const int32_t last_block_width = (output_width % output_block_width == 0) ? - output_block_width : output_width % output_block_width; - const int32_t last_block_height = (output_height % output_block_height == 0) ? - output_block_height : output_height % output_block_height; - int tile_x = alignSize((output_block_width - 1) * stride_w_ + kernel_w_ * dilation_w_, 4); - int tile_y = (output_block_height -1) * stride_h_ + kernel_h_ * dilation_h_; - int tile_y_stride = (4 * simd_size) / tile_x; - int invec_size = divUp(tile_y, tile_y_stride); + int tile_x = (output_block_width - 1) * stride_w_ + kernel_w_ * dilation_w_; + int tile_y = (output_block_height - 1) * stride_h_ + kernel_h_ * dilation_h_; + int invec_size = tile_y; addDef("SIMD_SIZE", simd_size); - addDef("filter_qualifier", "__global"); addDef("OUT_BLOCK_WIDTH", output_block_width); addDef("OUT_BLOCK_HEIGHT", output_block_height); - addDef("LAST_BLOCK_WIDTH", last_block_width); - addDef("LAST_BLOCK_HEIGHT", last_block_height); addDef("INPUT_DEPTH", channels_ / group_); addDef("TOTAL_INPUT_DEPTH_SIZE", channels_); addDef("TOTAL_OUTPUT_DEPTH", num_output_); addDef("NUM_FILTERS", M_); addDef("TILE_X", tile_x); addDef("TILE_Y", tile_y); - addDef("TILE_Y_STRIDE", tile_y_stride); addDef("INVEC_SIZE", invec_size); addDef("ALIGNED_NUM_FILTERS", (int)alignSize(M_, simd_size)); addDef("OUT_BLOCK_SIZE", (output_block_width*output_block_height)); addDef("APPLY_BIAS", bias_term_); + addDef("WEIGHT_PREF", ((kernel_w_ * kernel_h_) == 1) ? 1 : 8); + addDef("INPUT_PITCH", (width_ * height_)); + addDef("OUTPUT_PITCH", (output_w_ * output_h_)); + addDef("LEFT_FILTERS", ((int)alignSize(M_, simd_size) - M_)); + addDef("INPUT_WIDTH", width_); + addDef("INPUT_HEIGHT", height_); + addDef("FILTERS_IN_GROUP", ((int)alignSize(M_, simd_size) / simd_size)); + setFusionDefine(fused_activ_, fused_eltwise_); src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc; @@ -528,13 +589,6 @@ void OCL4DNNConvSpatial::calculateBenchmark(const UMat &bottom, UMat &ver return; } -#define dbg -#ifdef dbg -#define dbgPrint(x) (x) -#else -#define dbgPrint(x) -#endif - // For large enough input size, we do not need to tune kernels for different // size. The reason is with large input size, there will be enough work items // to feed al the EUs. @@ -545,6 +599,7 @@ void OCL4DNNConvSpatial::calculateBenchmark(const UMat &bottom, UMat &ver template void OCL4DNNConvSpatial::generateKey() { + std::string precision = (use_half_) ? "FP16" : "FP32"; std::stringstream keyBuilder; // FIXME: to support fuse? keyBuilder << "k" << kernel_w_ << "x" << kernel_h_ << "_" @@ -558,21 +613,12 @@ void OCL4DNNConvSpatial::generateKey() << "num" << num_ << "_" << "M" << M_ << "_" << "activ" << fused_activ_ << "_" - << "eltwise" << fused_eltwise_; + << "eltwise" << fused_eltwise_ << "_" + << precision; key_ = ocl::Device::getDefault().vendorName() + "_EU" + cv::format("%d", ocl::Device::getDefault().maxComputeUnits()) + "_" + keyBuilder.str(); - key_sanitized_ = key_; - for (size_t i = 0; i < key_sanitized_.size(); i++) - { - char c = key_sanitized_[i]; - if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_')) - { - key_sanitized_[i] = '_'; - } - } - // TODO add hash? - // key_sanitized_ = key_sanitized_ + cv::format("_%08llx", crc64((uchar*)key_.c_str(), key_.size())); + key_sanitized_ = sanitize(key_); short_key_ = keyBuilder.str(); } @@ -587,11 +633,6 @@ std::string OCL4DNNConvSpatial::generateSpecificKey(int32_t type, int32_t << "_" << blockHeight << "_" << blockDepth; - if (!use_half_) - keyBuilder << "_float"; - else - keyBuilder << "_half"; - return keyBuilder.str(); } @@ -1135,7 +1176,7 @@ float OCL4DNNConvSpatial::timedConvolve(const UMat &bottom, UMat &top, cv::ocl::Timer timer(queue); timer.start(); bool res = true;; - dbgPrint(std::cout << "Benchmarking kernel: " << config->kernelName << std::endl); + CV_LOG_INFO(NULL, "Benchmarking kernel: " << config->kernelName); tuned_ = true; int loop_cnt = 4; for (int i = 0; i < loop_cnt; i++) { @@ -1152,7 +1193,6 @@ float OCL4DNNConvSpatial::timedConvolve(const UMat &bottom, UMat &top, } float elapsedTime = timer.durationNS() * 1e-6 / loop_cnt; - #ifdef dbg double out_w = output_w_; double out_h = output_h_; double out_z = M_; @@ -1160,16 +1200,8 @@ float OCL4DNNConvSpatial::timedConvolve(const UMat &bottom, UMat &top, double k_h = kernel_h_; double k_z = channels_; double totalFlops = ((k_w*k_h*k_z -1)*2)*(out_w*out_h*out_z)*num_; - std::cout << "\tEstimated Gflops:" << (totalFlops * 1e-9) - << std::endl; - std::cout << "\tEstimated GFLOPS/S: " << ((totalFlops * 1e-9)*(1000.0/elapsedTime)) - << std::endl; - #if 0 - std::cout << "Estimated utilization: " << - ((((totalFlops/1000)/1000)/1000)*(1000.0/elapsedTime))/880.0 - << std::endl; - #endif - #endif + CV_LOG_INFO(NULL, "\tEstimated Gflops:" << (totalFlops * 1e-9)); + CV_LOG_INFO(NULL, "\tEstimated GFLOPS/S: " << ((totalFlops * 1e-9)*(1000.0/elapsedTime))); return elapsedTime; } @@ -1225,18 +1257,18 @@ bool OCL4DNNConvSpatial::verifyResult(const UMat &bottom, if (use_half_ && error_factor > 0.1 * fabs(verify_data[offset]) && error_factor > 0.04 && !(fabs(verify_data[offset]) < 1.e-3 && error_factor < 1.e-4)) { - dbgPrint(printf("test verification failed @ image %d group %d" - "out_ch %d h %d w %d got %G expected %G\n", - n, g, out_ch, h, w, data[offset], verify_data[offset])); + CV_LOG_ERROR(NULL, "test verification failed @ image " << n << " group " << g + << " out_ch " << out_ch << " h " << h << " w " << w + << " got " << data[offset] << " expected " << verify_data[offset]); verificationFail = 1; goto out; } else if (!use_half_ && error_factor > 0.1 * fabs(verify_data[offset]) && !(fabs(verify_data[offset]) < 1.e-3 && error_factor < 1.e-4)) { - dbgPrint(printf("test verification failed @ image %d group %d" - "out_ch %d h %d w %d got %G expected %G\n", - n, g, out_ch, h, w, data[offset], verify_data[offset])); + CV_LOG_ERROR(NULL, "test verification failed @ image " << n << " group " << g + << " out_ch " << out_ch << " h " << h << " w " << w + << " got " << data[offset] << " expected " << verify_data[offset]); verificationFail = 1; goto out; } @@ -1517,17 +1549,11 @@ void OCL4DNNConvSpatial::generate_idlf_tuneritems(std::vector< cv::Ptr (4 * simd_size)) - return; - - if ((blockM * blockK + divUp(tile_x * tile_y, simd_size)) > block_size_max) + int tile_x = alignSize(actual_tile_x, simd_size); + if (tile_x > simd_size) return; - int tile_y_stride = (4 * simd_size) / tile_x; - int invec_size = divUp(tile_y, tile_y_stride); - if (invec_size > 4) + if (blockM * blockK > block_size_max) return; tunerItems.push_back(makePtr(KERNEL_TYPE_INTEL_IDLF, blockM, blockK, simd_size)); @@ -1570,11 +1596,7 @@ void OCL4DNNConvSpatial::generateTunerItems(std::vector< cv::Ptr 0; height--) { generate_idlf_tuneritems(tunerItems, width, height, simd_size); - if (tunerItems.size() >= 8 && height == 2) - break; } - if (tunerItems.size() >= 12 && width == 2) - break; } } } @@ -1661,35 +1683,31 @@ void OCL4DNNConvSpatial::setupConvolution(const UMat &bottom, if (kernelQueue[x]->tested == false) { bool verified = verifyResult(bottom, top, weight, bias, numImages, kernelQueue[x], verifyTop); if (verified == false) { - dbgPrint(std::cout << "Kernel " - << kernelQueue[x]->kernelName - << " failed verification" << std::endl); - dbgPrint(std::cout << "kernelQueue[x]->workItem_output[0]: " - << kernelQueue[x]->workItem_output[0] << " " - << "kernelQueue[x]->workItem_output[1]: " - << kernelQueue[x]->workItem_output[1] << " " - << "kernelQueue[x]->workItem_output[2]: " - << kernelQueue[x]->workItem_output[2] << " " - << "kernelQueue[x]->kernelType: " - << kernelQueue[x]->kernelType << " " - << "kernelQueue[x]->global_work_size[0]: " - << kernelQueue[x]->global_work_size[0] << " " - << "kernelQueue[x]->global_work_size[1]: " - << kernelQueue[x]->global_work_size[1] << " " - << "kernelQueue[x]->global_work_size[2]: " - << kernelQueue[x]->global_work_size[2] << " " - << "kernelQueue[x]->local_work_size[0]: " - << kernelQueue[x]->local_work_size[0] << " " - << "kernelQueue[x]->local_work_size[1]: " - << kernelQueue[x]->local_work_size[1] << " " - << "kernelQueue[x]->local_work_size[2]: " - << kernelQueue[x]->local_work_size[2] << " " - << kernelQueue[x]->swizzle_weights << " " - << kernelQueue[x]->use_null_local << std::endl); + CV_LOG_ERROR(NULL, "Kernel " << kernelQueue[x]->kernelName << " failed verification"); + CV_LOG_ERROR(NULL, "kernelQueue[x]->workItem_output[0]: " + << kernelQueue[x]->workItem_output[0] << " " + << "kernelQueue[x]->workItem_output[1]: " + << kernelQueue[x]->workItem_output[1] << " " + << "kernelQueue[x]->workItem_output[2]: " + << kernelQueue[x]->workItem_output[2] << " " + << "kernelQueue[x]->kernelType: " + << kernelQueue[x]->kernelType << " " + << "kernelQueue[x]->global_work_size[0]: " + << kernelQueue[x]->global_work_size[0] << " " + << "kernelQueue[x]->global_work_size[1]: " + << kernelQueue[x]->global_work_size[1] << " " + << "kernelQueue[x]->global_work_size[2]: " + << kernelQueue[x]->global_work_size[2] << " " + << "kernelQueue[x]->local_work_size[0]: " + << kernelQueue[x]->local_work_size[0] << " " + << "kernelQueue[x]->local_work_size[1]: " + << kernelQueue[x]->local_work_size[1] << " " + << "kernelQueue[x]->local_work_size[2]: " + << kernelQueue[x]->local_work_size[2] << " " + << kernelQueue[x]->swizzle_weights << " " + << kernelQueue[x]->use_null_local); } else { - dbgPrint(std::cout << "Kernel " - << kernelQueue[x]->kernelName - << " pass verification" << std::endl); + CV_LOG_INFO(NULL, "Kernel " << kernelQueue[x]->kernelName << " pass verification"); } } #endif @@ -1718,19 +1736,28 @@ void OCL4DNNConvSpatial::setupConvolution(const UMat &bottom, break; } else { kernelQueue[fastestKernel]->tested = true; - dbgPrint(std::cout << "Kernel " << - kernelQueue[fastestKernel]->kernelName << - " failed verification" << std::endl); + CV_LOG_ERROR(NULL, "Kernel " << kernelQueue[fastestKernel]->kernelName << + " failed verification"); failures++; } } } if (verification) { - dbgPrint(std::cout << "Kernel <" << kernelQueue[kernel_index_]->kernelName << - "> passed verification" << std::endl); - dbgPrint(std::cout << "Convolution Time:" << kernelQueue[kernel_index_]->executionTime << std::endl); + CV_LOG_INFO(NULL, "Kernel <" << kernelQueue[kernel_index_]->kernelName << + "> passed verification"); + CV_LOG_INFO(NULL, "Convolution Time:" << kernelQueue[kernel_index_]->executionTime); + double out_w = output_w_; + double out_h = output_h_; + double out_z = M_; + double k_w = kernel_w_; + double k_h = kernel_h_; + double k_z = channels_; + float elapsedTime = kernelQueue[kernel_index_]->executionTime; + double totalFlops = ((k_w*k_h*k_z -1)*2)*(out_w*out_h*out_z)*num_; + CV_LOG_INFO(NULL, "\tEstimated Gflops:" << (totalFlops * 1e-9)); + CV_LOG_INFO(NULL, "\tEstimated GFLOPS/S: " << ((totalFlops * 1e-9)*(1000.0/elapsedTime))); } else { - dbgPrint(std::cout << "fallback to basic kernel" << std::endl); + CV_LOG_INFO(NULL, "fallback to basic kernel"); options_.str(""); options_.clear(); // clear contents and state flags createBasicKernel(1, 1, 1); kernel_index_ = kernelQueue.size() - 1; @@ -1798,14 +1825,14 @@ void OCL4DNNConvSpatial::prepareKernel(const UMat &bottom, UMat &top, if (loadCachedConfig()) // check in-memory cache return; - if (loadTunedConfig()) // check external storage + if (loadTunedConfig()) // check external storage return; UMat benchData(1, numImages * top_dim_, (use_half_) ? CV_16SC1 : CV_32FC1); calculateBenchmark(bottom, benchData, (use_half_) ? weights_half : weight, bias, numImages); - if (force_auto_tuning_) + if (run_auto_tuning_ || force_auto_tuning_) { setupConvolution(bottom, top, weight, bias, numImages, benchData); } @@ -1820,18 +1847,8 @@ template bool OCL4DNNConvSpatial::loadCachedConfig() { cv::AutoLock lock(kernelConfigMutex); - if (!defaultConfigLoaded) - { - const size_t numConfigs = sizeof(default_kernel_config_intel)/sizeof(default_kernel_config_intel[0])/2; - for (size_t i = 0; i < numConfigs; i++) - { - std::pair entry( - std::string("Intel(R) Corporation_") + default_kernel_config_intel[2 * i], - default_kernel_config_intel[2 * i + 1]); - kernelConfigMap.insert(entry); - } - defaultConfigLoaded = true; - } + if (!defaultConfigLoaded && !force_auto_tuning_) + initializeGlobalBuiltinConfigurations((use_cache_path_ && !cache_path_.empty()) ? (cache_path_ + '/') : std::string()); kernel_hash_t::iterator it = kernelConfigMap.find(key_); if (it != kernelConfigMap.end()) @@ -1904,9 +1921,12 @@ bool OCL4DNNConvSpatial::setupKernelByConfig(int x, int y, int z, int typ template bool OCL4DNNConvSpatial::loadTunedConfig() { + if (force_auto_tuning_) + return false; // don't load results from external storage + if (!use_cache_path_) { - if (cache_path_.empty() && !force_auto_tuning_) + if (cache_path_.empty()) { static int warn_ = 0; if (!warn_) diff --git a/modules/dnn/src/opencl/conv_layer_spatial.cl b/modules/dnn/src/opencl/conv_layer_spatial.cl index 621ab6f620..dc7b047fe5 100644 --- a/modules/dnn/src/opencl/conv_layer_spatial.cl +++ b/modules/dnn/src/opencl/conv_layer_spatial.cl @@ -206,8 +206,6 @@ __kernel void ConvolveBasic( #elif defined KERNEL_IDLF -#define VLOAD4(_v, _p) do { _v = vload4(0, _p); } while(0) - // Each work-item computes a OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT region of one output map. // Each work-group (which will be mapped to 1 SIMD16/SIMD8 EU thread) will compute 16/8 different feature maps, but each feature map is for the same region of the input image. // NDRange: (output_width+pad)/ OUT_BLOCK_WIDTH, (output_height+pad)/OUT_BLOCK_HEIGHT, NUM_FILTERS/OUT_BLOCK_DEPTH @@ -219,190 +217,123 @@ __kernel void convolve_simd( ELTWISE_DATA_ARG FUSED_ARG - __global Dtype* inputs_base, - filter_qualifier Dtype* weights_base, + __global Dtype* inputs, + __global Dtype* weights, BIAS_KERNEL_ARG - __global Dtype* outputs_base, + __global Dtype* outputs, const ushort input_width, const ushort input_height, const ushort output_width, const ushort output_height) { - __global Dtype* outputs = outputs_base; - __global Dtype* inputs = inputs_base; - filter_qualifier Dtype* weights = weights_base; unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH; // oc = Output Column - unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT;// or = Output Row - unsigned int fm = get_global_id(2);// fm = Feature Map = od = Output Depth + unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row + unsigned int fm = get_global_id(2); // fm = Feature Map = od = Output Depth unsigned int fmg = get_group_id(2); unsigned int lid = get_local_id(2); - Dtype out[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT]; - - int in_addr; + Dtype out[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT] = { 0.0f }; // find weights address of given neuron (lid is index) - unsigned int weight_addr = (fmg % (ALIGNED_NUM_FILTERS/SIMD_SIZE)) * INPUT_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE + lid; + unsigned int weight_addr = (fmg % FILTERS_IN_GROUP) * + INPUT_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE + lid; - for(int i=0;i= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + 3 >= INPUT_PAD_W && curr_x < input_width + INPUT_PAD_W) { - if (curr_x < INPUT_PAD_W) { - in_buf.in_vec[reg].s0 = 0; - if (curr_x + 1 >= INPUT_PAD_W && curr_x + 1 < input_width + INPUT_PAD_W) - in_buf.in_vec[reg].s1 = *(inputs + in_offset + 1); - else - in_buf.in_vec[reg].s1 = 0; - if (curr_x + 2 >= INPUT_PAD_W && curr_x + 2 < input_width + INPUT_PAD_W) - in_buf.in_vec[reg].s2 = *(inputs + in_offset + 2); - else - in_buf.in_vec[reg].s2 = 0; - if (curr_x + 3 < input_width + INPUT_PAD_W) - in_buf.in_vec[reg].s3 = *(inputs + in_offset + 3); - else - in_buf.in_vec[reg].s3 = 0; - } else { - VLOAD4(in_buf.in_vec[reg], inputs + in_offset); - if (curr_x + 1 >= input_width + INPUT_PAD_W) - in_buf.in_vec[reg].s1 = 0; - if (curr_x + 2 >= input_width + INPUT_PAD_W) - in_buf.in_vec[reg].s2 = 0; - if (curr_x + 3 >= input_width + INPUT_PAD_W) - in_buf.in_vec[reg].s3 = 0; - } - } else { - in_buf.in_vec[reg] = 0; + if (!(curr_y >= INPUT_PAD_H && curr_y < INPUT_HEIGHT + INPUT_PAD_H && + curr_x >= INPUT_PAD_W && curr_x < INPUT_WIDTH + INPUT_PAD_W)) + { + in_buf[reg] = 0; } - curr_y += TILE_Y_STRIDE; -#else - VLOAD4(in_buf.in_vec[reg], inputs + in_offset); #endif - } - in_offset += input_width * TILE_Y_STRIDE; - }); - in_addr += input_height * input_width; + curr_y += 1; + in_offset += INPUT_WIDTH; + } + + in_addr += INPUT_PITCH; + #if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 curr_y = saved_y; #endif -#if KERNEL_WIDTH * KERNEL_HEIGHT != 1 -#define WEIGHT_PREF 8 -#else -#define WEIGHT_PREF 1 -#endif - union { - Dtype w[WEIGHT_PREF]; -#if KERNEL_WIDTH * KERNEL_HEIGHT != 1 - INT_TYPE8 ui8; -#endif - } weight_buf; + Dtype weight_buf[WEIGHT_PREF]; int w_idx=0; - unsigned int orig_weight_addr = weight_addr; -#if KERNEL_WIDTH * KERNEL_HEIGHT != 1 - weight_buf.ui8 = SUB_GROUP_BLOCK_READ8((__global INT_TYPE *)&weights[weight_addr]); - weight_addr += SIMD_SIZE * WEIGHT_PREF; -#else - weight_buf.w[0] = as_Dtype(SUB_GROUP_BLOCK_READ((__global INT_TYPE *)&weights[weight_addr])); - weight_addr += SIMD_SIZE * 1; -#endif + for (int i = 0; i < WEIGHT_PREF; i++) + { + weight_buf[i] = weights[weight_addr]; + weight_addr += SIMD_SIZE; + } -#define BLOCK_IN(n) sub_group_broadcast( in_buf.in_array[((n)%4) + ((n) / (TILE_Y_STRIDE * TILE_X)) * 4], (((n) % (TILE_Y_STRIDE * TILE_X))/4)) +#define BLOCK_IN(n, c) intel_sub_group_shuffle(in_buf[n], (c)) int kr = 0; // kr = Kernel Row LOOP(KERNEL_HEIGHT, kr,// LOOP is a macro that unrolls the loop. + { + int kc = 0; // kc = Kernel Column + LOOP(KERNEL_WIDTH, kc, { - int kc = 0; // kc = Kernel Column - LOOP(KERNEL_WIDTH, kc, - { - for(int br=0; br < OUT_BLOCK_HEIGHT; br++) { - for(int bc=0; bc < OUT_BLOCK_WIDTH; bc++) { - Dtype input = BLOCK_IN((br * STRIDE_Y + kr * DILATION_Y) * TILE_X + bc * STRIDE_X + kc * DILATION_X); - out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_buf.w[w_idx % WEIGHT_PREF], input, out[br * OUT_BLOCK_WIDTH + bc]); - } - } -#if KERNEL_WIDTH * KERNEL_HEIGHT > WEIGHT_PREF - // We assume KERNEL_W is equal to KERNEL_H here. - if ((w_idx + 1) % WEIGHT_PREF == 0 - #if KERNEL_WIDTH * KERNEL_HEIGHT % 8 != 0 - && ((w_idx + 1) <= (KERNEL_WIDTH * KERNEL_HEIGHT - WEIGHT_PREF)) - #endif - ) { - weight_buf.ui8 = SUB_GROUP_BLOCK_READ8((__global INT_TYPE *)&weights[weight_addr]); - weight_addr += SIMD_SIZE * WEIGHT_PREF; // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details. + for (int br=0; br < OUT_BLOCK_HEIGHT; br++) + { + for(int bc=0; bc < OUT_BLOCK_WIDTH; bc++) + { + Dtype input = BLOCK_IN((br * STRIDE_Y + kr * DILATION_Y), bc * STRIDE_X + kc * DILATION_X); + out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_buf[w_idx % WEIGHT_PREF], input, out[br * OUT_BLOCK_WIDTH + bc]); } - #if KERNEL_WIDTH*KERNEL_HEIGHT % 8 == 0 - // need to do nothing - #else - else if ((w_idx + 1) % WEIGHT_PREF == 0 && ((w_idx + 1) > (KERNEL_WIDTH * KERNEL_HEIGHT - WEIGHT_PREF))) - #if KERNEL_WIDTH * KERNEL_HEIGHT % 8 == 1 - weight_buf.w[0] = weights[weight_addr]; - #elif KERNEL_WIDTH * KERNEL_HEIGHT % 8 == 2 - weight_buf.ui8.s01 = SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)&weights[weight_addr]); - #elif KERNEL_WIDTH * KERNEL_HEIGHT % 8 <= 4 - weight_buf.ui8.s0123 = SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)&weights[weight_addr]); - #else - weight_buf.ui8 = SUB_GROUP_BLOCK_READ8((__global INT_TYPE *)&weights[weight_addr]); - #endif - #endif -#endif - ++w_idx; - }); + } + weight_buf[w_idx % WEIGHT_PREF] = weights[weight_addr]; + weight_addr += SIMD_SIZE; + ++w_idx; }); - weight_addr = orig_weight_addr + KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE; - - } - // dead code to work around possible compiler bug. - if (ALIGNED_NUM_FILTERS != NUM_FILTERS && fm > 0xfffffffeul) { - outputs[0] = BLOCK_IN(fm % SIMD_SIZE); + }); + weight_addr -= WEIGHT_PREF * SIMD_SIZE; } + fm = fm % ALIGNED_NUM_FILTERS; - if ((ALIGNED_NUM_FILTERS == NUM_FILTERS || fm < NUM_FILTERS)) { - unsigned int out_addr = ( num_in_batch * TOTAL_OUTPUT_DEPTH + fm ) * output_width * output_height; - out_addr += or * output_width + oc; - // we need this address calculation for biases because we support views and batching +#if LEFT_FILTERS > 0 + if (fm < NUM_FILTERS) +#endif + { + unsigned int out_addr = (num_in_batch * TOTAL_OUTPUT_DEPTH + fm) * OUTPUT_PITCH; + out_addr += or * output_width + oc; + // we need this address calculation for biases because we support views and batching #if APPLY_BIAS - Dtype bias = biases_base[fm]; + Dtype bias = biases_base[fm]; #else - Dtype bias = 0; + Dtype bias = 0; #endif - for(unsigned int r = 0; r < OUT_BLOCK_HEIGHT; r++) { + + for(unsigned int r = 0; r < OUT_BLOCK_HEIGHT; r++) + { if (r + or >= output_height) break; - for(unsigned int c = 0; c < OUT_BLOCK_WIDTH; c++) { + for(unsigned int c = 0; c < OUT_BLOCK_WIDTH; c++) + { if (c + oc >= output_width) break; - // this does a scattered write to SIMD_SIZE different feature maps, so that data within one map is contiguous, thus ready for input to next layer. + // this does a scattered write to SIMD_SIZE different feature maps, + // so that data within one map is contiguous, thus ready for input to next layer. ACTIVATION_FUNCTION(outputs, out_addr + r * output_width + c, bias + out[r * OUT_BLOCK_WIDTH + c], fm); - } } } diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp index 111f354fe4..3de7f61c5d 100644 --- a/modules/dnn/test/test_layers.cpp +++ b/modules/dnn/test/test_layers.cpp @@ -1137,11 +1137,95 @@ private: int outWidth, outHeight, zoomFactor; }; -TEST(Layer_Test_Interp, Accuracy) +TEST(Layer_Test_Interp_custom, Accuracy) { CV_DNN_REGISTER_LAYER_CLASS(Interp, InterpLayer); testLayerUsingCaffeModels("layer_interp", DNN_TARGET_CPU, false, false); LayerFactory::unregisterLayer("Interp"); } +TEST(Layer_Test_Interp, Accuracy) +{ + testLayerUsingCaffeModels("layer_interp", DNN_TARGET_CPU, false, false); +} + +TEST(Layer_Test_PoolingIndices, Accuracy) +{ + Net net; + + LayerParams lp; + lp.set("pool", "max"); + lp.set("kernel_w", 2); + lp.set("kernel_h", 2); + lp.set("stride_w", 2); + lp.set("stride_h", 2); + lp.set("pad_w", 0); + lp.set("pad_h", 0); + lp.name = "testLayer.name"; // This test also checks that OpenCV lets use names with dots. + lp.type = "Pooling"; + net.addLayerToPrev(lp.name, lp.type, lp); + + Mat inp(10, 10, CV_8U); + randu(inp, 0, 255); + + Mat maxValues(5, 5, CV_32F, Scalar(-1)), indices(5, 5, CV_32F, Scalar(-1)); + for (int y = 0; y < 10; ++y) + { + int dstY = y / 2; + for (int x = 0; x < 10; ++x) + { + int dstX = x / 2; + uint8_t val = inp.at(y, x); + if ((float)inp.at(y, x) > maxValues.at(dstY, dstX)) + { + maxValues.at(dstY, dstX) = val; + indices.at(dstY, dstX) = y * 10 + x; + } + } + } + net.setInput(blobFromImage(inp)); + + std::vector outputs; + net.forward(outputs, lp.name); + normAssert(maxValues, outputs[0].reshape(1, 5)); + normAssert(indices, outputs[1].reshape(1, 5)); +} + +typedef testing::TestWithParam > Layer_Test_ShuffleChannel; +TEST_P(Layer_Test_ShuffleChannel, Accuracy) +{ + Vec4i inpShapeVec = get<0>(GetParam()); + int group = get<1>(GetParam()); + ASSERT_EQ(inpShapeVec[1] % group, 0); + const int groupSize = inpShapeVec[1] / group; + + Net net; + LayerParams lp; + lp.set("group", group); + lp.type = "ShuffleChannel"; + lp.name = "testLayer"; + net.addLayerToPrev(lp.name, lp.type, lp); + + const int inpShape[] = {inpShapeVec[0], inpShapeVec[1], inpShapeVec[2], inpShapeVec[3]}; + Mat inp(4, inpShape, CV_32F); + randu(inp, 0, 255); + + net.setInput(inp); + Mat out = net.forward(); + + for (int n = 0; n < inpShapeVec[0]; ++n) + { + for (int c = 0; c < inpShapeVec[1]; ++c) + { + Mat outChannel = getPlane(out, n, c); + Mat inpChannel = getPlane(inp, n, groupSize * (c % group) + c / group); + normAssert(outChannel, inpChannel); + } + } +} +INSTANTIATE_TEST_CASE_P(/**/, Layer_Test_ShuffleChannel, Combine( +/*input shape*/ Values(Vec4i(1, 6, 5, 7), Vec4i(3, 12, 1, 4)), +/*group*/ Values(1, 2, 3, 6) +)); + }} // namespace diff --git a/modules/dnn/test/test_torch_importer.cpp b/modules/dnn/test/test_torch_importer.cpp index a8c1d15503..5fe3fe121b 100644 --- a/modules/dnn/test/test_torch_importer.cpp +++ b/modules/dnn/test/test_torch_importer.cpp @@ -87,7 +87,7 @@ static void runTorchNet(String prefix, int targetId = DNN_TARGET_CPU, String out if (outLayerName.empty()) outLayerName = net.getLayerNames().back(); - net.setInput(inp, "0"); + net.setInput(inp); std::vector outBlobs; net.forward(outBlobs, outLayerName); normAssert(outRef, outBlobs[0]); diff --git a/modules/videoio/src/cap_openni2.cpp b/modules/videoio/src/cap_openni2.cpp index b4a7808363..9a67a417f6 100644 --- a/modules/videoio/src/cap_openni2.cpp +++ b/modules/videoio/src/cap_openni2.cpp @@ -70,6 +70,35 @@ #include "PS1080.h" /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +static cv::Mutex initOpenNI2Mutex; + +struct OpenNI2Initializer +{ +public: + static void init() + { + cv::AutoLock al(initOpenNI2Mutex); + static OpenNI2Initializer initializer; + } + +private: + OpenNI2Initializer() + { + // Initialize and configure the context. + openni::Status status = openni::OpenNI::initialize(); + if (status != openni::STATUS_OK) + { + CV_Error(CV_StsError, std::string("Failed to initialize:") + openni::OpenNI::getExtendedError()); + } + } + + ~OpenNI2Initializer() + { + openni::OpenNI::shutdown(); + } +}; + class CvCapture_OpenNI2 : public CvCapture { public: @@ -107,6 +136,8 @@ protected: static openni::VideoMode defaultStreamOutputMode(int stream); + CvCapture_OpenNI2(int index, const char * filename); + IplImage* retrieveDepthMap(); IplImage* retrievePointCloudMap(); IplImage* retrieveDisparityMap(); @@ -116,8 +147,8 @@ protected: IplImage* retrieveGrayImage(); IplImage* retrieveIrImage(); - openni::Status toggleStream(int stream, bool toggle); - bool readCamerasParams(); + void toggleStream(int stream, bool toggle); + void readCamerasParams(); double getDepthGeneratorProperty(int propIdx) const; bool setDepthGeneratorProperty(int propIdx, double propVal); @@ -131,12 +162,11 @@ protected: // OpenNI context openni::Device device; bool isContextOpened; - openni::Recorder recorder; // Data generators with its metadata - openni::VideoStream streams[CV_MAX_NUM_STREAMS]; - openni::VideoFrameRef streamFrames[CV_MAX_NUM_STREAMS]; - cv::Mat streamImages[CV_MAX_NUM_STREAMS]; + std::vector streams; + std::vector streamFrames; + std::vector streamImages; int maxBufferSize, maxTimeDuration; // for approx sync bool isCircleBuffer; @@ -191,80 +221,103 @@ openni::VideoMode CvCapture_OpenNI2::defaultStreamOutputMode(int stream) return mode; } -CvCapture_OpenNI2::CvCapture_OpenNI2( int index ) -{ - const char* deviceURI = openni::ANY_DEVICE; - openni::Status status; - int deviceType = DEVICE_DEFAULT; - - noSampleValue = shadowValue = 0; - - isContextOpened = false; - maxBufferSize = DEFAULT_MAX_BUFFER_SIZE; - isCircleBuffer = DEFAULT_IS_CIRCLE_BUFFER; - maxTimeDuration = DEFAULT_MAX_TIME_DURATION; - - if( index >= 10 ) - { - deviceType = index / 10; - index %= 10; - } +CvCapture_OpenNI2::CvCapture_OpenNI2(int index) : + CvCapture_OpenNI2(index, nullptr) +{ } + +CvCapture_OpenNI2::CvCapture_OpenNI2(const char * filename) : + CvCapture_OpenNI2(-1, filename) +{ } + +CvCapture_OpenNI2::CvCapture_OpenNI2(int index, const char * filename) : + device(), + isContextOpened(false), + streams(CV_MAX_NUM_STREAMS), + streamFrames(CV_MAX_NUM_STREAMS), + streamImages(CV_MAX_NUM_STREAMS), + maxBufferSize(DEFAULT_MAX_BUFFER_SIZE), + maxTimeDuration(DEFAULT_MAX_TIME_DURATION), + isCircleBuffer(DEFAULT_IS_CIRCLE_BUFFER), + baseline(0), + depthFocalLength_VGA(0), + shadowValue(0), + noSampleValue(0), + outputMaps(outputMapsTypesCount) +{ // Initialize and configure the context. - status = openni::OpenNI::initialize(); + OpenNI2Initializer::init(); - if (status != openni::STATUS_OK) + const char* deviceURI = openni::ANY_DEVICE; + bool needColor = true; + bool needIR = true; + if (index >= 0) { - CV_Error(CV_StsError, cv::format("Failed to initialize:", openni::OpenNI::getExtendedError())); - return; - } + int deviceType = DEVICE_DEFAULT; + if (index >= 10) + { + deviceType = index / 10; + index %= 10; + } + // Asus XTION and Occipital Structure Sensor do not have an image generator + needColor = (deviceType != DEVICE_ASUS_XTION); - // find appropriate device URI - openni::Array ldevs; - if (index > 0) - { - openni::OpenNI::enumerateDevices(&ldevs); - deviceURI = ldevs[index].getUri(); + // find appropriate device URI + openni::Array ldevs; + if (index > 0) + { + openni::OpenNI::enumerateDevices(&ldevs); + if (index < ldevs.getSize()) + deviceURI = ldevs[index].getUri(); + else + { + CV_Error(CV_StsError, "OpenCVKinect2: Device index exceeds the number of available OpenNI devices"); + } + } } - - status = device.open(deviceURI); - if( status != openni::STATUS_OK ) + else { - CV_Error(CV_StsError, cv::format("OpenCVKinect: Device open failed see: %s\n", openni::OpenNI::getExtendedError())); - openni::OpenNI::shutdown(); - return; + deviceURI = filename; } - status = toggleStream(CV_DEPTH_STREAM, true); - // Asus XTION and Occipital Structure Sensor do not have an image generator - if (deviceType != DEVICE_ASUS_XTION) - status = openni::Status(status | toggleStream(CV_COLOR_STREAM, true)); + openni::Status status; + status = device.open(deviceURI); if (status != openni::STATUS_OK) { - openni::OpenNI::shutdown(); - return; + CV_Error(CV_StsError, std::string("OpenCVKinect2: Failed to open device: ") + openni::OpenNI::getExtendedError()); } - if (!readCamerasParams()) - { - CV_Error(CV_StsError, cv::format("CvCapture_OpenNI2::CvCapture_OpenNI2 : Could not read cameras parameters\n")); - return; - } + toggleStream(CV_DEPTH_STREAM, true); + if (needColor) + toggleStream(CV_COLOR_STREAM, true); + if (needIR) + toggleStream(CV_IR_STREAM, true); + setProperty(CV_CAP_PROP_OPENNI_REGISTRATION, 1.0); - outputMaps.resize( outputMapsTypesCount ); + // default for Kinect2 camera + setProperty(CV_CAP_PROP_OPENNI2_MIRROR, 0.0); isContextOpened = true; +} - setProperty(CV_CAP_PROP_OPENNI_REGISTRATION, 1.0); +CvCapture_OpenNI2::~CvCapture_OpenNI2() +{ + for (size_t i = 0; i < streams.size(); ++i) + { + streamFrames[i].release(); + streams[i].stop(); + streams[i].destroy(); + } + device.close(); } -openni::Status CvCapture_OpenNI2::toggleStream(int stream, bool toggle) +void CvCapture_OpenNI2::toggleStream(int stream, bool toggle) { openni::Status status; // for logging - static const char* stream_names[CV_MAX_NUM_STREAMS] = { + static const std::string stream_names[CV_MAX_NUM_STREAMS] = { "depth", "color", "IR" @@ -280,140 +333,92 @@ openni::Status CvCapture_OpenNI2::toggleStream(int stream, bool toggle) { // already opened if (streams[stream].isValid()) - return openni::STATUS_OK; + return; // open stream status = streams[stream].create(device, stream_sensor_types[stream]); if (status == openni::STATUS_OK) { - // set video mode - status = streams[stream].setVideoMode(defaultStreamOutputMode(stream)); // xn::DepthGenerator supports VGA only! (Jan 2011) - if (status != openni::STATUS_OK) + // try to set up default stream mode (if available) + const openni::Array& vm = streams[stream].getSensorInfo().getSupportedVideoModes(); + openni::VideoMode dm = defaultStreamOutputMode(stream); + for (int i = 0; i < vm.getSize(); i++) { - CV_Error(CV_StsError, cv::format("CvCapture_OpenNI2::CvCapture_OpenNI2 : Couldn't set %s stream output mode: %s\n", - stream_names[stream], - openni::OpenNI::getExtendedError())); - streams[stream].destroy(); - return status; + if (vm[i].getPixelFormat() == dm.getPixelFormat() && + vm[i].getResolutionX() == dm.getResolutionX() && + vm[i].getResolutionY() == dm.getResolutionY() && + vm[i].getFps() == dm.getFps()) + { + status = streams[stream].setVideoMode(defaultStreamOutputMode(stream)); + if (status != openni::STATUS_OK) + { + streams[stream].destroy(); + CV_Error(CV_StsError, std::string("OpenCVKinect2 : Couldn't set ") + + stream_names[stream] + std::string(" stream output mode: ") + + std::string(openni::OpenNI::getExtendedError())); + } + } } // start stream status = streams[stream].start(); if (status != openni::STATUS_OK) { - CV_Error(CV_StsError, cv::format("CvCapture_OpenNI2::CvCapture_OpenNI2 : Couldn't start %s stream: %s\n", - stream_names[stream], - openni::OpenNI::getExtendedError())); streams[stream].destroy(); - return status; + CV_Error(CV_StsError, std::string("CvCapture_OpenNI2::CvCapture_OpenNI2 : Couldn't start ") + + stream_names[stream] + std::string(" stream: ") + + std::string(openni::OpenNI::getExtendedError())); } } else { - CV_Error(CV_StsError, cv::format("CvCapture_OpenNI2::CvCapture_OpenNI2 : Couldn't find %s stream:: %s\n", - stream_names[stream], - openni::OpenNI::getExtendedError())); - return status; + CV_Error(CV_StsError, std::string("CvCapture_OpenNI2::CvCapture_OpenNI2 : Couldn't find ") + + stream_names[stream] + " stream: " + + std::string(openni::OpenNI::getExtendedError())); } } else if (streams[stream].isValid()) // want to close stream { - streams[stream].stop(); - streams[stream].destroy(); - } - - return openni::STATUS_OK; -} - -CvCapture_OpenNI2::CvCapture_OpenNI2(const char * filename) -{ - openni::Status status; - - isContextOpened = false; - maxBufferSize = DEFAULT_MAX_BUFFER_SIZE; - isCircleBuffer = DEFAULT_IS_CIRCLE_BUFFER; - maxTimeDuration = DEFAULT_MAX_TIME_DURATION; - - // Initialize and configure the context. - status = openni::OpenNI::initialize(); - - if (status != openni::STATUS_OK) - { - CV_Error(CV_StsError, cv::format("Failed to initialize:", openni::OpenNI::getExtendedError())); - return; - } + //FIX for libfreenect2 + //which stops the whole device when stopping only one stream - // Open file - status = device.open(filename); - if( status != openni::STATUS_OK ) - { - CV_Error(CV_StsError, cv::format("CvCapture_OpenNI2::CvCapture_OpenNI2 : Failed to open input file (%s): %s\n", filename, openni::OpenNI::getExtendedError())); - return; + //streams[stream].stop(); + //streams[stream].destroy(); } - - status = openni::Status(toggleStream(CV_DEPTH_STREAM, true) | toggleStream(CV_COLOR_STREAM, true)); - if (status != openni::STATUS_OK) - { - openni::OpenNI::shutdown(); - return; - } - - if( !readCamerasParams() ) - { - CV_Error(CV_StsError, cv::format("CvCapture_OpenNI2::CvCapture_OpenNI2 : Could not read cameras parameters\n")); - return; - } - - outputMaps.resize( outputMapsTypesCount ); - - isContextOpened = true; } -CvCapture_OpenNI2::~CvCapture_OpenNI2() -{ - for (int i = 0; i < CV_MAX_NUM_STREAMS; ++i) - { - streamFrames[i].release(); - streams[i].stop(); - streams[i].destroy(); - } - device.close(); - openni::OpenNI::shutdown(); -} -bool CvCapture_OpenNI2::readCamerasParams() +void CvCapture_OpenNI2::readCamerasParams() { double pixelSize = 0; if (streams[CV_DEPTH_STREAM].getProperty(XN_STREAM_PROPERTY_ZERO_PLANE_PIXEL_SIZE, &pixelSize) != openni::STATUS_OK) { - CV_Error(CV_StsError, cv::format("CvCapture_OpenNI2::readCamerasParams : Could not read pixel size!\n")); - return false; + CV_Error(CV_StsError, "CvCapture_OpenNI2::readCamerasParams : Could not read pixel size!" + + std::string(openni::OpenNI::getExtendedError())); } // pixel size @ VGA = pixel size @ SXGA x 2 pixelSize *= 2.0; // in mm // focal length of IR camera in pixels for VGA resolution - int zeroPlanDistance; // in mm - if (streams[CV_DEPTH_STREAM].getProperty(XN_STREAM_PROPERTY_ZERO_PLANE_DISTANCE, &zeroPlanDistance) != openni::STATUS_OK) + unsigned long long zeroPlaneDistance; // in mm + if (streams[CV_DEPTH_STREAM].getProperty(XN_STREAM_PROPERTY_ZERO_PLANE_DISTANCE, &zeroPlaneDistance) != openni::STATUS_OK) { - CV_Error(CV_StsError, cv::format("CvCapture_OpenNI2::readCamerasParams : Could not read virtual plane distance!\n")); - return false; + CV_Error(CV_StsError, "CvCapture_OpenNI2::readCamerasParams : Could not read virtual plane distance!" + + std::string(openni::OpenNI::getExtendedError())); } if (streams[CV_DEPTH_STREAM].getProperty(XN_STREAM_PROPERTY_EMITTER_DCMOS_DISTANCE, &baseline) != openni::STATUS_OK) { - CV_Error(CV_StsError, cv::format("CvCapture_OpenNI2::readCamerasParams : Could not read base line!\n")); - return false; + CV_Error(CV_StsError, "CvCapture_OpenNI2::readCamerasParams : Could not read base line!" + + std::string(openni::OpenNI::getExtendedError())); } // baseline from cm -> mm baseline *= 10; // focal length from mm -> pixels (valid for 640x480) - depthFocalLength_VGA = (int)((double)zeroPlanDistance / (double)pixelSize); - - return true; + depthFocalLength_VGA = (int)((double)zeroPlaneDistance / (double)pixelSize); } double CvCapture_OpenNI2::getProperty( int propIdx ) const @@ -500,7 +505,7 @@ double CvCapture_OpenNI2::getCommonProperty( int propIdx ) const break; } default : - CV_Error( CV_StsBadArg, cv::format("Such parameter (propIdx=%d) isn't supported for getting.\n", propIdx) ); + CV_Error( CV_StsBadArg, cv::format("Such parameter (propIdx=%d) isn't supported for getting.", propIdx) ); } return propValue; @@ -525,14 +530,20 @@ bool CvCapture_OpenNI2::setCommonProperty( int propIdx, double propValue ) // There is a set of properties that correspond to depth generator by default // (is they are pass without particular generator flag). case CV_CAP_PROP_OPENNI_REGISTRATION: - isSet = setDepthGeneratorProperty( propIdx, propValue ); + isSet = setDepthGeneratorProperty(propIdx, propValue); break; case CV_CAP_PROP_OPENNI2_SYNC: isSet = device.setDepthColorSyncEnabled(propValue > 0.0) == openni::STATUS_OK; break; + case CV_CAP_PROP_FRAME_WIDTH: + case CV_CAP_PROP_FRAME_HEIGHT: + case CV_CAP_PROP_AUTOFOCUS: + isSet = false; + break; + default: - CV_Error( CV_StsBadArg, cv::format("Such parameter (propIdx=%d) isn't supported for setting.\n", propIdx) ); + CV_Error(CV_StsBadArg, cv::format("Such parameter (propIdx=%d) isn't supported for setting.", propIdx)); } return isSet; @@ -565,9 +576,13 @@ double CvCapture_OpenNI2::getDepthGeneratorProperty( int propIdx ) const propValue = streams[CV_DEPTH_STREAM].getMaxPixelValue(); break; case CV_CAP_PROP_OPENNI_BASELINE : + if(baseline <= 0) + const_cast(this)->readCamerasParams(); propValue = baseline; break; case CV_CAP_PROP_OPENNI_FOCAL_LENGTH : + if(depthFocalLength_VGA <= 0) + const_cast(this)->readCamerasParams(); propValue = (double)depthFocalLength_VGA; break; case CV_CAP_PROP_OPENNI_REGISTRATION : @@ -580,7 +595,7 @@ double CvCapture_OpenNI2::getDepthGeneratorProperty( int propIdx ) const propValue = streamFrames[CV_DEPTH_STREAM].getFrameIndex(); break; default : - CV_Error( CV_StsBadArg, cv::format("Depth generator does not support such parameter (propIdx=%d) for getting.\n", propIdx) ); + CV_Error( CV_StsBadArg, cv::format("Depth generator does not support such parameter (propIdx=%d) for getting.", propIdx) ); } return propValue; @@ -594,7 +609,10 @@ bool CvCapture_OpenNI2::setDepthGeneratorProperty( int propIdx, double propValue { case CV_CAP_PROP_OPENNI_GENERATOR_PRESENT: if (isContextOpened) - isSet = toggleStream(CV_DEPTH_STREAM, propValue > 0.0) == openni::STATUS_OK; + { + toggleStream(CV_DEPTH_STREAM, propValue > 0.0); + isSet = true; + } break; case CV_CAP_PROP_OPENNI_REGISTRATION: { @@ -612,12 +630,13 @@ bool CvCapture_OpenNI2::setDepthGeneratorProperty( int propIdx, double propValue { openni::Status status = device.setImageRegistrationMode(mode); if( status != openni::STATUS_OK ) - CV_Error(CV_StsError, cv::format("CvCapture_OpenNI2::setDepthGeneratorProperty : %s\n", openni::OpenNI::getExtendedError())); + CV_Error(CV_StsError, std::string("CvCapture_OpenNI2::setDepthGeneratorProperty: ") + + std::string(openni::OpenNI::getExtendedError())); else isSet = true; } else - CV_Error(CV_StsError, cv::format("CvCapture_OpenNI2::setDepthGeneratorProperty : Unsupported viewpoint.\n")); + CV_Error(CV_StsError, "CvCapture_OpenNI2::setDepthGeneratorProperty: Unsupported viewpoint."); } else isSet = true; @@ -627,14 +646,15 @@ bool CvCapture_OpenNI2::setDepthGeneratorProperty( int propIdx, double propValue { openni::Status status = device.setImageRegistrationMode(openni::IMAGE_REGISTRATION_OFF); if( status != openni::STATUS_OK ) - CV_Error(CV_StsError, cv::format("CvCapture_OpenNI2::setDepthGeneratorProperty : %s\n", openni::OpenNI::getExtendedError())); + CV_Error(CV_StsError, std::string("CvCapture_OpenNI2::setDepthGeneratorProperty: ") + + std::string(openni::OpenNI::getExtendedError())); else isSet = true; } } break; default: - CV_Error( CV_StsBadArg, cv::format("Depth generator does not support such parameter (propIdx=%d) for setting.\n", propIdx) ); + CV_Error( CV_StsBadArg, cv::format("Depth generator does not support such parameter (propIdx=%d) for setting.", propIdx) ); } return isSet; @@ -668,7 +688,7 @@ double CvCapture_OpenNI2::getImageGeneratorProperty( int propIdx ) const propValue = (double)streamFrames[CV_COLOR_STREAM].getFrameIndex(); break; default : - CV_Error( CV_StsBadArg, cv::format("Image generator does not support such parameter (propIdx=%d) for getting.\n", propIdx) ); + CV_Error( CV_StsBadArg, cv::format("Image generator does not support such parameter (propIdx=%d) for getting.", propIdx) ); } return propValue; @@ -682,7 +702,10 @@ bool CvCapture_OpenNI2::setImageGeneratorProperty(int propIdx, double propValue) { case CV_CAP_PROP_OPENNI_GENERATOR_PRESENT: if (isContextOpened) - isSet = toggleStream(CV_COLOR_STREAM, propValue > 0.0) == openni::STATUS_OK; + { + toggleStream(CV_COLOR_STREAM, propValue > 0.0); + isSet = true; + } break; case CV_CAP_PROP_OPENNI_OUTPUT_MODE : { @@ -713,18 +736,19 @@ bool CvCapture_OpenNI2::setImageGeneratorProperty(int propIdx, double propValue) mode.setFps(60); break; default : - CV_Error( CV_StsBadArg, "Unsupported image generator output mode.\n"); + CV_Error( CV_StsBadArg, "Unsupported image generator output mode."); } openni::Status status = streams[CV_COLOR_STREAM].setVideoMode( mode ); if( status != openni::STATUS_OK ) - CV_Error(CV_StsError, cv::format("CvCapture_OpenNI2::setImageGeneratorProperty : %s\n", openni::OpenNI::getExtendedError())); + CV_Error(CV_StsError, std::string("CvCapture_OpenNI2::setImageGeneratorProperty: ") + + std::string(openni::OpenNI::getExtendedError())); else isSet = true; break; } default: - CV_Error( CV_StsBadArg, cv::format("Image generator does not support such parameter (propIdx=%d) for setting.\n", propIdx) ); + CV_Error( CV_StsBadArg, cv::format("Image generator does not support such parameter (propIdx=%d) for setting.", propIdx) ); } return isSet; @@ -758,7 +782,7 @@ double CvCapture_OpenNI2::getIrGeneratorProperty(int propIdx) const propValue = (double)streamFrames[CV_IR_STREAM].getFrameIndex(); break; default: - CV_Error(CV_StsBadArg, cv::format("Image generator does not support such parameter (propIdx=%d) for getting.\n", propIdx)); + CV_Error(CV_StsBadArg, cv::format("Image generator does not support such parameter (propIdx=%d) for getting.", propIdx)); } return propValue; @@ -772,7 +796,10 @@ bool CvCapture_OpenNI2::setIrGeneratorProperty(int propIdx, double propValue) { case CV_CAP_PROP_OPENNI_GENERATOR_PRESENT: if (isContextOpened) - isSet = toggleStream(CV_IR_STREAM, propValue > 0.0) == openni::STATUS_OK; + { + toggleStream(CV_IR_STREAM, propValue > 0.0); + isSet = true; + } break; case CV_CAP_PROP_OPENNI_OUTPUT_MODE: { @@ -803,18 +830,19 @@ bool CvCapture_OpenNI2::setIrGeneratorProperty(int propIdx, double propValue) mode.setFps(60); break; default: - CV_Error(CV_StsBadArg, "Unsupported image generator output mode.\n"); + CV_Error(CV_StsBadArg, "Unsupported image generator output mode."); } openni::Status status = streams[CV_IR_STREAM].setVideoMode(mode); if (status != openni::STATUS_OK) - CV_Error(CV_StsError, cv::format("CvCapture_OpenNI2::setImageGeneratorProperty : %s\n", openni::OpenNI::getExtendedError())); + CV_Error(CV_StsError, std::string("CvCapture_OpenNI2::setImageGeneratorProperty: ") + + std::string(openni::OpenNI::getExtendedError())); else isSet = true; break; } default: - CV_Error(CV_StsBadArg, cv::format("Image generator does not support such parameter (propIdx=%d) for setting.\n", propIdx)); + CV_Error(CV_StsBadArg, cv::format("Image generator does not support such parameter (propIdx=%d) for setting.", propIdx)); } return isSet; @@ -931,10 +959,12 @@ IplImage* CvCapture_OpenNI2::retrieveDisparityMap() if (!streamFrames[CV_DEPTH_STREAM].isValid()) return 0; + readCamerasParams(); + cv::Mat disp32; computeDisparity_32F(streamFrames[CV_DEPTH_STREAM], disp32, baseline, depthFocalLength_VGA, noSampleValue, shadowValue); - disp32.convertTo( outputMaps[CV_CAP_OPENNI_DISPARITY_MAP].mat, CV_8UC1 ); + disp32.convertTo(outputMaps[CV_CAP_OPENNI_DISPARITY_MAP].mat, CV_8UC1); return outputMaps[CV_CAP_OPENNI_DISPARITY_MAP].getIplImagePtr(); } @@ -944,6 +974,8 @@ IplImage* CvCapture_OpenNI2::retrieveDisparityMap_32F() if (!streamFrames[CV_DEPTH_STREAM].isValid()) return 0; + readCamerasParams(); + computeDisparity_32F(streamFrames[CV_DEPTH_STREAM], outputMaps[CV_CAP_OPENNI_DISPARITY_MAP_32F].mat, baseline, depthFocalLength_VGA, noSampleValue, shadowValue); return outputMaps[CV_CAP_OPENNI_DISPARITY_MAP_32F].getIplImagePtr(); @@ -966,7 +998,7 @@ inline void getBGRImageFromMetaData( const openni::VideoFrameRef& imageMetaData, { cv::Mat bufferImage; if( imageMetaData.getVideoMode().getPixelFormat() != openni::PIXEL_FORMAT_RGB888 ) - CV_Error( CV_StsUnsupportedFormat, "Unsupported format of grabbed image\n" ); + CV_Error( CV_StsUnsupportedFormat, "Unsupported format of grabbed image." ); bgrImage.create(imageMetaData.getHeight(), imageMetaData.getWidth(), CV_8UC3); bufferImage.create(imageMetaData.getHeight(), imageMetaData.getWidth(), CV_8UC3); @@ -989,7 +1021,7 @@ inline void getGrayImageFromMetaData(const openni::VideoFrameRef& imageMetaData, } else { - CV_Error(CV_StsUnsupportedFormat, "Unsupported format of grabbed image\n"); + CV_Error(CV_StsUnsupportedFormat, "Unsupported format of grabbed image."); } } diff --git a/samples/cpp/videocapture_basic.cpp b/samples/cpp/videocapture_basic.cpp index 6f46715458..75992f6a0c 100644 --- a/samples/cpp/videocapture_basic.cpp +++ b/samples/cpp/videocapture_basic.cpp @@ -20,7 +20,7 @@ int main(int, char**) //--- INITIALIZE VIDEOCAPTURE VideoCapture cap; // open the default camera using default API - cap.open(0); + // cap.open(0); // OR advance usage: select any API backend int deviceID = 0; // 0 = open default camera int apiID = cv::CAP_ANY; // 0 = autodetect default API diff --git a/samples/data/dnn/object_detection_classes_coco.txt b/samples/data/dnn/object_detection_classes_coco.txt index 75aa546f48..10ecf0b455 100644 --- a/samples/data/dnn/object_detection_classes_coco.txt +++ b/samples/data/dnn/object_detection_classes_coco.txt @@ -9,7 +9,7 @@ truck boat traffic light fire hydrant - +street sign stop sign parking meter bench @@ -23,11 +23,11 @@ elephant bear zebra giraffe - +hat backpack umbrella - - +shoe +eye glasses handbag tie suitcase @@ -42,7 +42,7 @@ skateboard surfboard tennis racket bottle - +plate wine glass cup fork @@ -63,12 +63,12 @@ chair couch potted plant bed - +mirror dining table - - +window +desk toilet - +door tv laptop mouse @@ -80,7 +80,7 @@ oven toaster sink refrigerator - +blender book clock vase diff --git a/samples/data/dnn/object_detection_classes_yolov3.txt b/samples/data/dnn/object_detection_classes_yolov3.txt new file mode 100644 index 0000000000..941cb4e139 --- /dev/null +++ b/samples/data/dnn/object_detection_classes_yolov3.txt @@ -0,0 +1,80 @@ +person +bicycle +car +motorcycle +airplane +bus +train +truck +boat +traffic light +fire hydrant +stop sign +parking meter +bench +bird +cat +dog +horse +sheep +cow +elephant +bear +zebra +giraffe +backpack +umbrella +handbag +tie +suitcase +frisbee +skis +snowboard +sports ball +kite +baseball bat +baseball glove +skateboard +surfboard +tennis racket +bottle +wine glass +cup +fork +knife +spoon +bowl +banana +apple +sandwich +orange +broccoli +carrot +hot dog +pizza +donut +cake +chair +couch +potted plant +bed +dining table +toilet +tv +laptop +mouse +remote +keyboard +cell phone +microwave +oven +toaster +sink +refrigerator +book +clock +vase +scissors +teddy bear +hair drier +toothbrush diff --git a/samples/opencl/opencl-opencv-interop.cpp b/samples/opencl/opencl-opencv-interop.cpp index c6630ea45b..d2961af777 100644 --- a/samples/opencl/opencl-opencv-interop.cpp +++ b/samples/opencl/opencl-opencv-interop.cpp @@ -14,6 +14,8 @@ #include #include +#define CL_USE_DEPRECATED_OPENCL_1_1_APIS +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS #define CL_USE_DEPRECATED_OPENCL_2_0_APIS // eliminate build warning #ifdef __APPLE__