Merge pull request #24120 from dkurt:actualize_dnn_links

OCL_FP16 MatMul with large batch

* Workaround FP16 MatMul with large batch

* Fix OCL reinitialization

* Higher thresholds for INT8 quantization

* Try fix gemm_buffer_NT for half (columns)

* Fix GEMM by rows

* Add batch dimension to InnerProduct layer test

* Fix Test_ONNX_conformance.Layer_Test/test_basic_conv_with_padding

* Batch 16

* Replace all vload4

* Version suffix for MobileNetSSD_deploy Caffe model
pull/24167/head
Dmitry Kurtaev 1 year ago committed by GitHub
parent 8d1c73a912
commit 8ad5eb521a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 6
      modules/dnn/misc/python/test/test_dnn.py
  2. 4
      modules/dnn/perf/perf_caffe.cpp
  3. 2
      modules/dnn/perf/perf_net.cpp
  4. 4
      modules/dnn/src/layers/convolution_layer.cpp
  5. 38
      modules/dnn/src/opencl/gemm_buffer.cl
  6. 4
      modules/dnn/test/test_backends.cpp
  7. 4
      modules/dnn/test/test_caffe_importer.cpp
  8. 18
      modules/dnn/test/test_halide_layers.cpp
  9. 6
      modules/dnn/test/test_int8_layers.cpp
  10. 10
      modules/dnn/test/test_model.cpp

@@ -191,10 +191,10 @@ class dnn_test(NewOpenCVTests):
def test_model(self):
img_path = self.find_dnn_file("dnn/street.png")
weights = self.find_dnn_file("dnn/MobileNetSSD_deploy.caffemodel", required=False)
config = self.find_dnn_file("dnn/MobileNetSSD_deploy.prototxt", required=False)
weights = self.find_dnn_file("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", required=False)
config = self.find_dnn_file("dnn/MobileNetSSD_deploy_19e3ec3.prototxt", required=False)
if weights is None or config is None:
raise unittest.SkipTest("Missing DNN test files (dnn/MobileNetSSD_deploy.{prototxt/caffemodel}). Verify OPENCV_DNN_TEST_DATA_PATH configuration parameter.")
raise unittest.SkipTest("Missing DNN test files (dnn/MobileNetSSD_deploy_19e3ec3.{prototxt/caffemodel}). Verify OPENCV_DNN_TEST_DATA_PATH configuration parameter.")
frame = cv.imread(img_path)
model = cv.dnn_DetectionModel(weights, config)

@@ -101,8 +101,8 @@ PERF_TEST(SqueezeNet_v1_1_caffe, CaffePerfTest)
PERF_TEST(MobileNet_SSD, CaffePerfTest)
{
caffe::Net<float>* net = initNet("dnn/MobileNetSSD_deploy.prototxt",
"dnn/MobileNetSSD_deploy.caffemodel");
caffe::Net<float>* net = initNet("dnn/MobileNetSSD_deploy_19e3ec3.prototxt",
"dnn/MobileNetSSD_deploy_19e3ec3.caffemodel");
TEST_CYCLE() net->Forward();
SANITY_CHECK_NOTHING();
}

@@ -141,7 +141,7 @@ PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_Caffe)
{
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt", "",
processNet("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", "dnn/MobileNetSSD_deploy_19e3ec3.prototxt", "",
Mat(cv::Size(300, 300), CV_32FC3));
}

@@ -1069,7 +1069,7 @@ public:
config.pads = pads;
config.stride = stride;
config.dilation = dilation;
if (inputs[0].dims != 4 && inputs[0].dims != umat_blobs[0].dims)
if (inputs[0].dims != 4 && inputs[0].dims != (blobs.empty() ? umat_blobs[0].dims : blobs[0].dims))
{
static bool bypassCheck = utils::getConfigurationParameterBool("OPENCV_OCL4DNN_CONVOLUTION_IGNORE_INPUT_DIMS_4_CHECK", false);
if (!bypassCheck)
@@ -1081,7 +1081,7 @@ public:
return false;
}
}
config.group = inputs[0].size[1] / umat_blobs[0].size[1];
config.group = inputs[0].size[1] / (blobs.empty() ? umat_blobs[0].size[1] : blobs[0].size[1]);
if (config.group < 1) // config.group == 0 causes div by zero in ocl4dnn code
{
CV_LOG_WARNING(NULL, "DNN/OpenCL: Unsupported config.group=" << config.group

@@ -453,14 +453,14 @@ __kernel void TEMPLATE(gemm_buffer_NT, Dtype)(
int w;
for(int b_tile = 0; b_tile < K; b_tile += SLM_BLOCK) {
barrier(CLK_LOCAL_MEM_FENCE);
vstore4(vload4(0, (__global float *)(src1_read0 + mad24(0, K, local_index))), 0, (__local float *)(slm_brow + mad24(0, SLM_BLOCK, local_index)));
vstore4(vload4(0, (__global float *)(src1_read0 + mad24(1, K, local_index))), 0, (__local float *)(slm_brow + mad24(1, SLM_BLOCK, local_index)));
vstore4(vload4(0, (__global float *)(src1_read0 + mad24(2, K, local_index))), 0, (__local float *)(slm_brow + mad24(2, SLM_BLOCK, local_index)));
vstore4(vload4(0, (__global float *)(src1_read0 + mad24(3, K, local_index))), 0, (__local float *)(slm_brow + mad24(3, SLM_BLOCK, local_index)));
vstore4(vload4(0, (__global float *)(src1_read0 + mad24(4, K, local_index))), 0, (__local float *)(slm_brow + mad24(4, SLM_BLOCK, local_index)));
vstore4(vload4(0, (__global float *)(src1_read0 + mad24(5, K, local_index))), 0, (__local float *)(slm_brow + mad24(5, SLM_BLOCK, local_index)));
vstore4(vload4(0, (__global float *)(src1_read0 + mad24(6, K, local_index))), 0, (__local float *)(slm_brow + mad24(6, SLM_BLOCK, local_index)));
vstore4(vload4(0, (__global float *)(src1_read0 + mad24(7, K, local_index))), 0, (__local float *)(slm_brow + mad24(7, SLM_BLOCK, local_index)));
vstore8(vload8(0, src1_read0 + mad24(0, K, local_index)), 0, slm_brow + mad24(0, SLM_BLOCK, local_index));
vstore8(vload8(0, src1_read0 + mad24(1, K, local_index)), 0, slm_brow + mad24(1, SLM_BLOCK, local_index));
vstore8(vload8(0, src1_read0 + mad24(2, K, local_index)), 0, slm_brow + mad24(2, SLM_BLOCK, local_index));
vstore8(vload8(0, src1_read0 + mad24(3, K, local_index)), 0, slm_brow + mad24(3, SLM_BLOCK, local_index));
vstore8(vload8(0, src1_read0 + mad24(4, K, local_index)), 0, slm_brow + mad24(4, SLM_BLOCK, local_index));
vstore8(vload8(0, src1_read0 + mad24(5, K, local_index)), 0, slm_brow + mad24(5, SLM_BLOCK, local_index));
vstore8(vload8(0, src1_read0 + mad24(6, K, local_index)), 0, slm_brow + mad24(6, SLM_BLOCK, local_index));
vstore8(vload8(0, src1_read0 + mad24(7, K, local_index)), 0, slm_brow + mad24(7, SLM_BLOCK, local_index));
barrier(CLK_LOCAL_MEM_FENCE);
slm_brow0 = slm_brow + local_x * (TILE_K / 8);
@@ -469,17 +469,17 @@ __kernel void TEMPLATE(gemm_buffer_NT, Dtype)(
while( w + TILE_K <= end_w ) {
Dtype8 arow;
brow0 = as_half8(vload4(0, (__local float *)(slm_brow0 + 0 * SLM_BLOCK)));
brow1 = as_half8(vload4(0, (__local float *)(slm_brow0 + 1 * SLM_BLOCK)));
brow2 = as_half8(vload4(0, (__local float *)(slm_brow0 + 2 * SLM_BLOCK)));
brow3 = as_half8(vload4(0, (__local float *)(slm_brow0 + 3 * SLM_BLOCK)));
brow4 = as_half8(vload4(0, (__local float *)(slm_brow0 + 4 * SLM_BLOCK)));
brow5 = as_half8(vload4(0, (__local float *)(slm_brow0 + 5 * SLM_BLOCK)));
brow6 = as_half8(vload4(0, (__local float *)(slm_brow0 + 6 * SLM_BLOCK)));
brow7 = as_half8(vload4(0, (__local float *)(slm_brow0 + 7 * SLM_BLOCK)));
brow0 = vload8(0, slm_brow0 + 0 * SLM_BLOCK);
brow1 = vload8(0, slm_brow0 + 1 * SLM_BLOCK);
brow2 = vload8(0, slm_brow0 + 2 * SLM_BLOCK);
brow3 = vload8(0, slm_brow0 + 3 * SLM_BLOCK);
brow4 = vload8(0, slm_brow0 + 4 * SLM_BLOCK);
brow5 = vload8(0, slm_brow0 + 5 * SLM_BLOCK);
brow6 = vload8(0, slm_brow0 + 6 * SLM_BLOCK);
brow7 = vload8(0, slm_brow0 + 7 * SLM_BLOCK);
#define MM_DOT_PRODUCT( _row, _dot ) \
arow = as_half8(vload4(0, (__global float *)(src0_read + _row * K))); \
arow = vload8(0, src0_read + _row * K); \
_dot = mad( (Dtype8)(arow.s0), (Dtype8)(brow0.s0, brow1.s0, brow2.s0, brow3.s0, brow4.s0, brow5.s0, brow6.s0, brow7.s0), _dot ); \
_dot = mad( (Dtype8)(arow.s1), (Dtype8)(brow0.s1, brow1.s1, brow2.s1, brow3.s1, brow4.s1, brow5.s1, brow6.s1, brow7.s1), _dot ); \
_dot = mad( (Dtype8)(arow.s2), (Dtype8)(brow0.s2, brow1.s2, brow2.s2, brow3.s2, brow4.s2, brow5.s2, brow6.s2, brow7.s2), _dot ); \
@@ -510,7 +510,7 @@ __kernel void TEMPLATE(gemm_buffer_NT, Dtype)(
Dtype8 arow;
#define READ_BROW(_brow, _row) \
_brow = as_half8(vload4(0, (__local float *)(slm_brow0 + _row * SLM_BLOCK))); \
_brow = vload8(0, slm_brow0 + _row * SLM_BLOCK); \
_brow.s0 = (mad24(local_x, 8, w) < K) ? _brow.s0 : 0.0f; \
_brow.s1 = (mad24(local_x, 8, w + 1) < K) ? _brow.s1 : 0.0f; \
_brow.s2 = (mad24(local_x, 8, w + 2) < K) ? _brow.s2 : 0.0f; \
@@ -532,7 +532,7 @@ __kernel void TEMPLATE(gemm_buffer_NT, Dtype)(
#undef READ_BROW
#define MM_DOT_PRODUCT( _row, _dot ) \
arow = as_half8(vload4(0, (__global float *)(src0_read + _row * K))); \
arow = vload8(0, src0_read + _row * K); \
arow.s0 = (mad24(local_x, 8, w) < K) ? arow.s0 : 0.0f; \
arow.s1 = (mad24(local_x, 8, w + 1) < K) ? arow.s1 : 0.0f; \
arow.s2 = (mad24(local_x, 8, w + 2) < K) ? arow.s2 : 0.0f; \

@@ -194,7 +194,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe)
float scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 1.5e-2 : 0.0;
float iouDiff = (target == DNN_TARGET_MYRIAD) ? 0.063 : 0.0;
float detectionConfThresh = (target == DNN_TARGET_MYRIAD) ? 0.262 : FLT_MIN;
processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt",
processNet("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", "dnn/MobileNetSSD_deploy_19e3ec3.prototxt",
inp, "detection_out", "", scoreDiff, iouDiff, detectionConfThresh);
expectNoFallbacksFromIE(net);
}
@@ -237,7 +237,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe_Different_Width_Height)
scoreDiff = 0.03;
iouDiff = 0.08;
}
processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt",
processNet("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", "dnn/MobileNetSSD_deploy_19e3ec3.prototxt",
inp, "detection_out", "", scoreDiff, iouDiff);
expectNoFallbacksFromIE(net);
}

@@ -290,8 +290,8 @@ TEST(Reproducibility_SSD, Accuracy)
typedef testing::TestWithParam<tuple<Backend, Target> > Reproducibility_MobileNet_SSD;
TEST_P(Reproducibility_MobileNet_SSD, Accuracy)
{
const string proto = findDataFile("dnn/MobileNetSSD_deploy.prototxt", false);
const string model = findDataFile("dnn/MobileNetSSD_deploy.caffemodel", false);
const string proto = findDataFile("dnn/MobileNetSSD_deploy_19e3ec3.prototxt", false);
const string model = findDataFile("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", false);
Net net = readNetFromCaffe(proto, model);
int backendId = get<0>(GetParam());
int targetId = get<1>(GetParam());

@@ -407,15 +407,16 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, MaxPooling, Combine(
////////////////////////////////////////////////////////////////////////////////
// Fully-connected
////////////////////////////////////////////////////////////////////////////////
typedef TestWithParam<tuple<int, Size, int, bool, tuple<Backend, Target> > > FullyConnected;
typedef TestWithParam<tuple<int, int, Size, int, bool, tuple<Backend, Target> > > FullyConnected;
TEST_P(FullyConnected, Accuracy)
{
int inChannels = get<0>(GetParam());
Size inSize = get<1>(GetParam());
int outChannels = get<2>(GetParam());
bool hasBias = get<3>(GetParam());
Backend backendId = get<0>(get<4>(GetParam()));
Target targetId = get<1>(get<4>(GetParam()));
int batch = get<0>(GetParam());
int inChannels = get<1>(GetParam());
Size inSize = get<2>(GetParam());
int outChannels = get<3>(GetParam());
bool hasBias = get<4>(GetParam());
Backend backendId = get<0>(get<5>(GetParam()));
Target targetId = get<1>(get<5>(GetParam()));
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2021040000)
if ((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 ||
backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && (targetId == DNN_TARGET_OPENCL_FP16 ||
@@ -439,7 +440,7 @@ TEST_P(FullyConnected, Accuracy)
lp.type = "InnerProduct";
lp.name = "testLayer";
int sz[] = {1, inChannels, inSize.height, inSize.width};
int sz[] = {batch, inChannels, inSize.height, inSize.width};
Mat input(4, &sz[0], CV_32F);
double l1 = 0.0;
@@ -467,6 +468,7 @@ TEST_P(FullyConnected, Accuracy)
}
INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, FullyConnected, Combine(
/*batch*/ Values(1, 2, 4, 8, 16),
/*in channels*/ Values(3, 4),
/*in size*/ Values(Size(5, 4), Size(4, 5), Size(1, 1)),
/*out channels*/ Values(3, 4),

@@ -878,14 +878,14 @@ TEST_P(Test_Int8_nets, MobileNet_SSD)
if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel())
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
Net net = readNetFromCaffe(findDataFile("dnn/MobileNetSSD_deploy.prototxt", false),
findDataFile("dnn/MobileNetSSD_deploy.caffemodel", false));
Net net = readNetFromCaffe(findDataFile("dnn/MobileNetSSD_deploy_19e3ec3.prototxt", false),
findDataFile("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", false));
Mat inp = imread(_tf("street.png"));
Mat blob = blobFromImage(inp, 1.0 / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
Mat ref = blobFromNPY(_tf("mobilenet_ssd_caffe_out.npy"));
float confThreshold = FLT_MIN, scoreDiff = 0.059, iouDiff = 0.11;
float confThreshold = FLT_MIN, scoreDiff = 0.084, iouDiff = 0.43;
testDetectionNet(net, blob, ref, confThreshold, scoreDiff, iouDiff);
}

@@ -490,8 +490,8 @@ TEST_P(Test_Model, DetectionMobilenetSSD)
refBoxes.emplace_back(left, top, width, height);
}
std::string weights_file = _tf("MobileNetSSD_deploy.caffemodel", false);
std::string config_file = _tf("MobileNetSSD_deploy.prototxt");
std::string weights_file = _tf("MobileNetSSD_deploy_19e3ec3.caffemodel", false);
std::string config_file = _tf("MobileNetSSD_deploy_19e3ec3.prototxt");
Scalar mean = Scalar(127.5, 127.5, 127.5);
double scale = 1.0 / 127.5;
@@ -511,7 +511,7 @@ TEST_P(Test_Model, DetectionMobilenetSSD)
}
else if (target == DNN_TARGET_CUDA_FP16)
{
scoreDiff = 0.0021;
scoreDiff = 0.0028;
iouDiff = 1e-2;
}
float confThreshold = FLT_MIN;
@@ -595,8 +595,8 @@ TEST_P(Test_Model, Detection_normalized)
std::vector<float> refConfidences = {0.999222f};
std::vector<Rect2d> refBoxes = {Rect2d(0, 4, 227, 222)};
std::string weights_file = _tf("MobileNetSSD_deploy.caffemodel", false);
std::string config_file = _tf("MobileNetSSD_deploy.prototxt");
std::string weights_file = _tf("MobileNetSSD_deploy_19e3ec3.caffemodel", false);
std::string config_file = _tf("MobileNetSSD_deploy_19e3ec3.prototxt");
Scalar mean = Scalar(127.5, 127.5, 127.5);
double scale = 1.0 / 127.5;

Loading…
Cancel
Save