dnn(ocl): avoid mess FP16/FP32 in convolution layer

4 years ago · c240355cc6
parent 1bfc75ac23
commit c240355cc6
6 changed files with 77 additions and 61 deletions
--- a/modules/core/src/convert.dispatch.cpp
+++ b/modules/core/src/convert.dispatch.cpp
@ -138,7 +138,7 @@ static bool ocl_convertFp16( InputArray _src, OutputArray _dst, int sdepth, int
                              sdepth == CV_32F ? "half" : "float",
                              rowsPerWI,
                              sdepth == CV_32F ? " -D FLOAT_TO_HALF " : "");
-    ocl::Kernel k("convertFp16", ocl::core::halfconvert_oclsrc, build_opt);
+    ocl::Kernel k(sdepth == CV_32F ? "convertFp16_FP32_to_FP16" : "convertFp16_FP16_to_FP32", ocl::core::halfconvert_oclsrc, build_opt);
    if (k.empty())
        return false;
--- a/modules/core/src/opencl/halfconvert.cl
+++ b/modules/core/src/opencl/halfconvert.cl
@ -47,8 +47,17 @@
 #endif
 #endif
-__kernel void convertFp16(__global const uchar * srcptr, int src_step, int src_offset,
+__kernel void
-                          __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols)
+#ifdef FLOAT_TO_HALF
    convertFp16_FP32_to_FP16
 #else
    convertFp16_FP16_to_FP32
 #endif
 (
    __global const uchar * srcptr, int src_step, int src_offset,
    __global uchar * dstptr, int dst_step, int dst_offset,
    int dst_rows, int dst_cols
 )
 {
    int x = get_global_id(0);
    int y0 = get_global_id(1) * rowsPerWI;
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@ -1461,16 +1461,7 @@ public:
            umat_blobs.resize(n);
            for (size_t i = 0; i < n; i++)
            {
-                if (use_half)
+                inputs[i + 1].copyTo(umat_blobs[i]);
                {
                    Mat matFP32;
                    convertFp16(inputs[i + 1], matFP32);
                    matFP32.copyTo(umat_blobs[i]);
                }
                else
                {
                    inputs[i + 1].copyTo(umat_blobs[i]);
                }
            }
            inputs.resize(1);
        }
@ -1481,7 +1472,10 @@ public:
            umat_blobs.resize(n);
            for (size_t i = 0; i < n; i++)
            {
-                blobs[i].copyTo(umat_blobs[i]);
+                if (use_half)
                    convertFp16(blobs[i], umat_blobs[i]);
                else
                    blobs[i].copyTo(umat_blobs[i]);
            }
        }
@ -1537,14 +1531,20 @@ public:
        if (fusedWeights)
        {
-            weightsMat.copyTo(umat_blobs[0]);
+            if (use_half)
                convertFp16(weightsMat, umat_blobs[0]);
            else
                weightsMat.copyTo(umat_blobs[0]);
            fusedWeights = false;
        }
        if (fusedBias)
        {
            if ( umat_blobs.size() < 2 )
                umat_blobs.resize(2);
-            umat_blobs[1] = UMat(biasvec, true);
+            if (use_half)
                convertFp16(Mat(biasvec, true), umat_blobs[1]);
            else
                Mat(biasvec, true).copyTo(umat_blobs[1]);
            convolutionOp->setBias(true);
            fusedBias = false;
        }
--- a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
+++ b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
@ -274,8 +274,6 @@ class OCL4DNNConvSpatial
        int32_t group_;
        bool bias_term_;
        UMat swizzled_weights_umat;
        UMat weights_half;
        UMat bias_half;
        UMat bottom_data2_;
        int32_t bottom_index_;
--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
@ -588,16 +588,16 @@ bool OCL4DNNConvSpatial<Dtype>::Forward(const UMat& bottom,
        fused_eltwise_ = false;
    }
-    if (use_half_ && bias_half.empty() && !bias.empty())
+    if (use_half_ && !bias.empty())
-        convertFp16(bias, bias_half);
+        CV_CheckTypeEQ(bias.type(), CV_16SC1, "");
-    if (use_half_ && weights_half.empty())
+    if (use_half_)
-        convertFp16(weight, weights_half);
+        CV_CheckTypeEQ(weight.type(), CV_16SC1, "");
-    prepareKernel(bottom, top, weight, (use_half_) ? bias_half : bias, numImages);
+    prepareKernel(bottom, top, weight, bias, numImages);
    if (bestKernelConfig.empty())
        return false;
-    return convolve(bottom, top, weight, (use_half_) ? bias_half : bias, numImages, bestKernelConfig);
+    return convolve(bottom, top, weight, bias, numImages, bestKernelConfig);
 }
 template<typename Dtype>
@ -744,29 +744,26 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
                                     kernel_h_ * (int)alignSize(kernel_w_, 2),
                                     (use_half_) ? CV_16SC1 : CV_32FC1);
    UMat swizzled_weights_tmp;
    if (use_half_)
        swizzled_weights_tmp.create(shape(swizzled_weights_umat), CV_32F);
    if (!interleave) {
        cl_uint argIdx = 0;
        int32_t channels = channels_ / group_;
-        ocl::Kernel oclk_copy_weight(CL_KERNEL_SELECT("copyWeightsSwizzled"),
+        ocl::Kernel oclk_copy_weight(
-                                     cv::ocl::dnn::conv_spatial_helper_oclsrc);
+            use_half_ ? "copyWeightsSwizzled_half" : "copyWeightsSwizzled_float",
            cv::ocl::dnn::conv_spatial_helper_oclsrc,
            use_half_ ? "-DHALF_SUPPORT=1 -DDtype=half" : "-DDtype=float"
        );
        if (oclk_copy_weight.empty())
            return false;
-        oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
+        oclk_copy_weight.args(
-        if (use_half_)
+            ocl::KernelArg::PtrReadOnly(weight),
-            oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_tmp));
+            ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat),
-        else
+            kernel_w_,
-            oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat));
+            kernel_h_,
-        oclk_copy_weight.set(argIdx++, kernel_w_);
+            channels,
-        oclk_copy_weight.set(argIdx++, kernel_h_);
+            num_output_,
-        oclk_copy_weight.set(argIdx++, channels);
+            swizzled_factor
-        oclk_copy_weight.set(argIdx++, num_output_);
+        );
        oclk_copy_weight.set(argIdx++, swizzled_factor);
        size_t global_work_size_copy[3] = {
            (size_t) (alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_), 1, 1 };
@ -778,13 +775,24 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
        }
    } else {
        // assumption: kernel dimension is 2
-        Mat weightMat = weight.getMat(ACCESS_READ);
+        Mat weightMat;
        Dtype* cpu_weight = (Dtype *)weightMat.ptr<float>();
        Mat swizzledWeightMat;
        UMat weight_tmp; // FP32 in half mode, TODO implement FP16 repack
        if (use_half_)
-            swizzledWeightMat = swizzled_weights_tmp.getMat(ACCESS_WRITE);
+        {
            CV_CheckTypeEQ(weight.type(), CV_16SC1, "");
            convertFp16(weight, weight_tmp);
            weightMat = weight_tmp.getMat(ACCESS_READ);
            swizzledWeightMat.create(shape(swizzled_weights_umat), CV_32F);
        }
        else
        {
            weightMat = weight.getMat(ACCESS_READ);
            swizzledWeightMat = swizzled_weights_umat.getMat(ACCESS_WRITE);
        }
        CV_CheckTypeEQ(weightMat.type(), CV_32FC1, "");
        Dtype* cpu_weight = (Dtype *)weightMat.ptr<float>();
        Dtype* cpu_swizzled_weight = (Dtype *)swizzledWeightMat.ptr<float>();
        int interleavedRows = (kernel_w_ / 2) * 2;
@ -792,26 +800,28 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
        int blockWidth = swizzled_factor;  // should equal to simd size.
        int rowAlignment = 32;
        size_t interleaved_filter_size = M_ * kernel_w_ * kernel_h_ * channels_ * sizeof(Dtype);
-        Dtype * tmpSwizzledWeight = reinterpret_cast<Dtype*>(malloc(interleaved_filter_size));
+        cv::AutoBuffer<Dtype, 0> tmpSwizzledWeight(interleaved_filter_size);
        CHECK_EQ(tmpSwizzledWeight != NULL, true) << "Failed to allocate temporary swizzled weight";
        for (int od = 0; od < M_; od++)
            for (int id = 0; id < channels_; id++)
                for (int r = 0; r < kernel_h_; r++)
                    for (int c = 0; c < kernel_w_; c++)
                        tmpSwizzledWeight[((id * kernel_h_ + r)* kernel_w_ + c) * M_ + od] =
                            cpu_weight[((od * channels_ + id) * kernel_h_ + r)*kernel_w_+c];
        interleaveMatrix(cpu_swizzled_weight,
-                         tmpSwizzledWeight,
+                         tmpSwizzledWeight.data(),
                         kernel_w_ * kernel_h_ * channels_, M_,
                         interleavedRows,
                         nonInterleavedRows,
                         blockWidth,
                         rowAlignment);
        free(tmpSwizzledWeight);
    }
-    if (use_half_)
+        // unmap OpenCL buffers
-        convertFp16(swizzled_weights_tmp, swizzled_weights_umat);
+        weightMat.release();
        if (use_half_)
            convertFp16(swizzledWeightMat, swizzled_weights_umat);
    }
    return true;
 }
@ -1104,10 +1114,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
        cl_uint argIdx = 0;
        setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
        kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
-        if (use_half_)
+        kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
            kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weights_half));
        else
            kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
        if (bias_term_)
            kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
        kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
@ -1148,10 +1155,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
                setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
                kernel.set(argIdx++, image_offset);
-                if (use_half_)
+                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
                    kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weights_half));
                else
                    kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
                kernel.set(argIdx++, kernel_offset);
                if (bias_term_)
                    kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
@ -1956,7 +1960,7 @@ void OCL4DNNConvSpatial<Dtype>::prepareKernel(const UMat &bottom, UMat &top,
    UMat benchData(1, numImages * top_dim_, (use_half_) ? CV_16SC1 : CV_32FC1);
-    calculateBenchmark(bottom, benchData, (use_half_) ? weights_half : weight, bias, numImages);
+    calculateBenchmark(bottom, benchData, weight, bias, numImages);
    if (run_auto_tuning_ || force_auto_tuning_)
    {
--- a/modules/dnn/src/opencl/conv_spatial_helper.cl
+++ b/modules/dnn/src/opencl/conv_spatial_helper.cl
@ -39,9 +39,14 @@
 //
 //M*/
 #ifdef HALF_SUPPORT
 #ifdef cl_khr_fp16
 #pragma OPENCL EXTENSION cl_khr_fp16:enable
 #endif
 #endif
 #define CONCAT(A,B) A##_##B
 #define TEMPLATE(name,type) CONCAT(name,type)
 #define Dtype float
 __kernel void TEMPLATE(copyWeightsSwizzled, Dtype)
    (__global Dtype* weightIn,