From bf9e9b81ac2abbdd1a25ab5a589ae727a169c38d Mon Sep 17 00:00:00 2001
From: Vitaliy Lyudvichenko
Date: Wed, 27 Jul 2016 19:22:16 +0300
Subject: [PATCH] Add OpenCL implementation and public interface for Softmax layer

---
 .../dnn/include/opencv2/dnn/all_layers.hpp |   7 +
 modules/dnn/src/init.cpp                   |   2 +-
 modules/dnn/src/layers/pooling_layer.cpp   |   2 +-
 modules/dnn/src/layers/softmax_layer.cpp   | 218 ++++++++++++------
 modules/dnn/src/layers/softmax_layer.hpp   |  32 ++-
 modules/dnn/src/opencl/softmax.cl          |  75 ++++++
 modules/dnn/test/test_layers.cpp           |   2 +-
 7 files changed, 259 insertions(+), 79 deletions(-)
 create mode 100644 modules/dnn/src/opencl/softmax.cl

diff --git a/modules/dnn/include/opencv2/dnn/all_layers.hpp b/modules/dnn/include/opencv2/dnn/all_layers.hpp
index 91273fad8..1e3d1fc7b 100644
--- a/modules/dnn/include/opencv2/dnn/all_layers.hpp
+++ b/modules/dnn/include/opencv2/dnn/all_layers.hpp
@@ -260,6 +260,13 @@ namespace dnn
         static Ptr<PoolingLayer> create(int type = MAX, Size kernel = Size(2, 2), Size stride = Size(1, 1), Size pad = Size(0, 0));
     };
 
+    class CV_EXPORTS_W SoftmaxLayer : public Layer
+    {
+    public:
+
+        static Ptr<SoftmaxLayer> create(int axis = 1);
+    };
+
 //! @}
 //! @}
 
diff --git a/modules/dnn/src/init.cpp b/modules/dnn/src/init.cpp
index 58e6e0d3f..eac3eb8a5 100644
--- a/modules/dnn/src/init.cpp
+++ b/modules/dnn/src/init.cpp
@@ -77,7 +77,7 @@ void initModule()
         return;
 
     REG_RUNTIME_LAYER_CLASS(Slice, SliceLayer)
-    REG_RUNTIME_LAYER_CLASS(Softmax, SoftMaxLayer)
+    REG_STATIC_LAYER_FUNC(Softmax, createSoftmaxLayerFromCaffe)
     REG_RUNTIME_LAYER_CLASS(Split, SplitLayer)
     REG_RUNTIME_LAYER_CLASS(Reshape, ReshapeLayer)
     REG_STATIC_LAYER_FUNC(Flatten, createFlattenLayer)
diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp
index 1906cd58b..cc32aaf40 100644
--- a/modules/dnn/src/layers/pooling_layer.cpp
+++ b/modules/dnn/src/layers/pooling_layer.cpp
@@ -269,7 +269,7 @@ Ptr<PoolingLayer> PoolingLayer::create(int type, Size kernel, Size stride, Size pad)
 Ptr<Layer> createPoolingLayerFromCaffe(LayerParams &params)
 {
     int type;
-    Size kernel, pad, stride;
+    Size kernel, stride, pad;
 
     if (params.has("pool"))
     {
diff --git a/modules/dnn/src/layers/softmax_layer.cpp b/modules/dnn/src/layers/softmax_layer.cpp
index e90a590ec..4fbedaf63 100644
--- a/modules/dnn/src/layers/softmax_layer.cpp
+++ b/modules/dnn/src/layers/softmax_layer.cpp
@@ -42,6 +42,8 @@
 #include "../precomp.hpp"
 #include "layers_common.hpp"
 #include "softmax_layer.hpp"
+#include <opencv2/core/ocl.hpp>
+#include "modules/dnn/opencl_kernels_dnn.hpp"
 #include <algorithm>
 #include <stdlib.h>
 using std::max;
@@ -50,95 +52,179 @@ namespace cv
 {
 namespace dnn
 {
-    //TODO: set default axis number to 1, and add custom shape length in FullyConnected
-    SoftMaxLayer::SoftMaxLayer(LayerParams &params) : Layer(params)
+
+SoftMaxLayerImpl::SoftMaxLayerImpl(int axis)
+{
+    axisRaw = axis;
+}
+
+void SoftMaxLayerImpl::allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
+{
+    CV_Assert(inputs.size() == 1);
+    axis = inputs[0]->canonicalAxis(axisRaw);
+
+    useOpenCL = ocl::useOpenCL();
+
+    BlobShape shape = inputs[0]->shape();
+    outerSize = shape.total(0, axis);
+    channels = shape[axis];
+    innerSize = shape.total(axis + 1);
+
+    int allocFlag = (useOpenCL) ? Blob::ALLOC_UMAT : Blob::ALLOC_MAT;
+    shape[axis] = 1;
+    buf.create(shape, inputs[0]->type(), allocFlag);
+
+    outputs.resize(1);
+    outputs[0].create(inputs[0]->shape(), inputs[0]->type(), allocFlag);
+}
+
+void SoftMaxLayerImpl::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
+{
+    Blob &src = *inputs[0];
+    Blob &dst = outputs[0];
+
+    if (!useOpenCL)
+        forward_cpu(src, dst);
+    else
     {
-        //hotfix!!!
-        axis_ = params.get("axis", 1);
+        CV_Assert(forward_ocl(src, dst));
     }
+}
 
-    void SoftMaxLayer::allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
-    {
-        CV_Assert(inputs.size() == 1);
-        axis = inputs[0]->canonicalAxis(axis_);
+#ifdef HAVE_OPENCL
+bool SoftMaxLayerImpl::forward_ocl(Blob &src, Blob &dst)
+{
+    const UMat &srcMat = src.umatRefConst();
+    UMat &dstMat = dst.umatRef();
+    srcMat.copyTo(dstMat);
+    UMat &bufMat = buf.umatRef();
+    CV_Assert(dstMat.offset == 0);
 
-        BlobShape shape = inputs[0]->shape();
-        outputs.resize(1);
-        outputs[0].create(shape);
+    String buildOpts = String("-DT=") + ocl::typeToStr(src.type());
+    ocl::Kernel kmax, ksub, ksum, kdiv;
 
-        shape[axis] = 1;
-        maxAggregator.create(shape);
-    }
+    if (!kmax.create("kernel_channel_max", ocl::dnn::softmax_oclsrc, buildOpts))
+        return false;
 
-    void SoftMaxLayer::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
-    {
-        Blob &src = *inputs[0];
-        Blob &dst = outputs[0];
+    if (!ksub.create("kernel_channel_subtract", ocl::dnn::softmax_oclsrc, buildOpts))
+        return false;
 
-        float *srcPtr = src.ptrf();
-        float *dstPtr = dst.ptrf();
-        float *bufPtr = maxAggregator.ptrf();
+    if (!ksum.create("kernel_channel_sum", ocl::dnn::softmax_oclsrc, buildOpts))
+        return false;
 
-        size_t outerSize = src.total(0, axis);
-        size_t channels = src.size(axis);
-        size_t innerSize = src.total(axis + 1);
+    if (!kdiv.create("kernel_channel_div", ocl::dnn::softmax_oclsrc, buildOpts))
+        return false;
 
-        size_t outerStep = src.total(axis);
-        size_t cnStep = src.total(axis + 1);
+    size_t wgSize = ocl::Device::getDefault().maxWorkGroupSize();
+    size_t bufSize = buf.total();
+    size_t totalSize = src.total();
 
-        //compute max along axis
-        for (size_t outerDim = 0; outerDim < outerSize; outerDim++)
-        {
-            size_t srcOffset = outerDim * outerStep;
-            size_t bufOffset = outerDim * cnStep;
+    kmax.args((int)outerSize, (int)channels, (int)innerSize,
+              ocl::KernelArg::PtrReadOnly(dstMat), ocl::KernelArg::PtrReadWrite(bufMat));
+    if (!kmax.run(1, &bufSize, &wgSize, true))
+        return false;
+
+    ksub.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize,
+              ocl::KernelArg::PtrReadOnly(bufMat), ocl::KernelArg::PtrReadWrite(dstMat));
+    if (!ksub.run(1, &totalSize, &wgSize, true))
+        return false;
+
+    cv::exp(dstMat, dstMat);
 
-            memcpy(bufPtr + bufOffset, srcPtr + srcOffset, innerSize * sizeof(float));
+    ksum.args((int)outerSize, (int)channels, (int)innerSize,
+              ocl::KernelArg::PtrReadOnly(dstMat), ocl::KernelArg::PtrReadWrite(bufMat));
+    if (!ksum.run(1, &bufSize, &wgSize, true))
+        return false;
 
-            for (size_t cnDim = 1; cnDim < channels; cnDim++)
-            {
-                for (size_t i = 0; i < innerSize; i++)
-                    bufPtr[bufOffset + i] = std::max(bufPtr[bufOffset + i], srcPtr[srcOffset + cnDim * cnStep + i]);
-            }
+    kdiv.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize,
+              ocl::KernelArg::PtrReadOnly(bufMat), ocl::KernelArg::PtrReadWrite(dstMat));
+    if (!kdiv.run(1, &totalSize, &wgSize, true))
+        return false;
+
+    return true;
+}
+#else
+bool SoftMaxLayerImpl::forward_ocl(Blob&, Blob&)
+{
+    return false;
+}
+#endif
+
+void SoftMaxLayerImpl::forward_cpu(Blob &src, Blob &dst)
+{
+    CV_Assert(src.type() == CV_32F);
+
+    float *srcPtr = src.ptrf();
+    float *dstPtr = dst.ptrf();
+    float *bufPtr = buf.ptrf();
+
+    size_t outerStep = src.total(axis);
+    size_t cnStep = src.total(axis + 1);
+
+    //compute max along axis
+    for (size_t outerDim = 0; outerDim < outerSize; outerDim++)
+    {
+        size_t srcOffset = outerDim * outerStep;
+        size_t bufOffset = outerDim * cnStep;
+
+        memcpy(bufPtr + bufOffset, srcPtr + srcOffset, innerSize * sizeof(float));
+
+        for (size_t cnDim = 1; cnDim < channels; cnDim++)
+        {
+            for (size_t i = 0; i < innerSize; i++)
+                bufPtr[bufOffset + i] = std::max(bufPtr[bufOffset + i], srcPtr[srcOffset + cnDim * cnStep + i]);
         }
+    }
+
+    //subtract max
+    for (size_t outerDim = 0; outerDim < outerSize; outerDim++)
+    {
+        size_t srcOffset = outerDim * outerStep;
+        size_t bufOffset = outerDim * cnStep;
-        //subtract max
-        for (size_t outerDim = 0; outerDim < outerSize; outerDim++)
+        for (size_t cnDim = 0; cnDim < channels; cnDim++)
         {
-            size_t srcOffset = outerDim * outerStep;
-            size_t bufOffset = outerDim * cnStep;
-
-            for (size_t cnDim = 0; cnDim < channels; cnDim++)
-            {
-                for (size_t i = 0; i < innerSize; i++)
-                    dstPtr[srcOffset + cnDim * cnStep + i] = srcPtr[srcOffset + cnDim * cnStep + i] - bufPtr[bufOffset + i];
-            }
+            for (size_t i = 0; i < innerSize; i++)
+                dstPtr[srcOffset + cnDim * cnStep + i] = srcPtr[srcOffset + cnDim * cnStep + i] - bufPtr[bufOffset + i];
         }
+    }
 
-        cv::exp(dst.matRef(), dst.matRef());
+    cv::exp(dst.matRef(), dst.matRef());
 
-        for (size_t outerDim = 0; outerDim < outerSize; outerDim++)
+    for (size_t outerDim = 0; outerDim < outerSize; outerDim++)
+    {
+        size_t srcOffset = outerDim * outerStep;
+        size_t bufOffset = outerDim * cnStep;
+
+        //sum exp along axis
+        for (size_t i = 0; i < innerSize; i++)
+            bufPtr[bufOffset + i] = 0.f;
+
+        for (size_t cnDim = 0; cnDim < channels; cnDim++)
         {
-            size_t srcOffset = outerDim * outerStep;
-            size_t bufOffset = outerDim * cnStep;
+            for (size_t i = 0; i < innerSize; i++)
+                bufPtr[bufOffset + i] += dstPtr[srcOffset + cnDim * cnStep + i];
+        }
 
-            //sum exp along axis
+        //divide by computed sum
+        for (size_t cnDim = 0; cnDim < channels; cnDim++)
+        {
             for (size_t i = 0; i < innerSize; i++)
-                bufPtr[bufOffset + i] = 0.f;
-
-            for (size_t cnDim = 0; cnDim < channels; cnDim++)
-            {
-                for (size_t i = 0; i < innerSize; i++)
-                    bufPtr[bufOffset + i] += dstPtr[srcOffset + cnDim * cnStep + i];
-            }
-
-            //divide by computed sum
-            for (size_t cnDim = 0; cnDim < channels; cnDim++)
-            {
-                for (size_t i = 0; i < innerSize; i++)
-                    dstPtr[srcOffset + cnDim * cnStep + i] /= bufPtr[bufOffset + i];
-            }
+                dstPtr[srcOffset + cnDim * cnStep + i] /= bufPtr[bufOffset + i];
         }
     }
+}
+
+Ptr<SoftmaxLayer> SoftmaxLayer::create(int axis)
+{
+    return Ptr<SoftmaxLayer>(new SoftMaxLayerImpl(axis));
+}
+
+Ptr<Layer> createSoftmaxLayerFromCaffe(LayerParams &params)
+{
+    int axis = params.get("axis", 1);
+    return Ptr<Layer>(SoftmaxLayer::create(axis));
+}
 }
 }
diff --git a/modules/dnn/src/layers/softmax_layer.hpp b/modules/dnn/src/layers/softmax_layer.hpp
index 5b55794ae..5803ee8eb 100644
--- a/modules/dnn/src/layers/softmax_layer.hpp
+++ b/modules/dnn/src/layers/softmax_layer.hpp
@@ -42,21 +42,33 @@
 #ifndef __OPENCV_DNN_LAYERS_SOFTMAX_LAYER_HPP__
 #define __OPENCV_DNN_LAYERS_SOFTMAX_LAYER_HPP__
 #include "../precomp.hpp"
+#include <opencv2/dnn/all_layers.hpp>
 
 namespace cv
 {
 namespace dnn
 {
-    class SoftMaxLayer : public Layer
-    {
-        int axis_, axis;
-        Blob maxAggregator;
-
-    public:
-        SoftMaxLayer(LayerParams &params);
-        void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
-        void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
-    };
+
+class SoftMaxLayerImpl : public SoftmaxLayer
+{
+    int axis, axisRaw;
+    Blob buf;
+    bool useOpenCL;
+    size_t outerSize, channels, innerSize;
+
+    bool forward_ocl(Blob &src, Blob &dst);
+    void forward_cpu(Blob &src, Blob &dst);
+
+public:
+
+    SoftMaxLayerImpl(int axis = 1);
+    void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
+    void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
+};
+
+Ptr<Layer> createSoftmaxLayerFromCaffe(LayerParams &params);
+
 }
 }
 #endif
diff --git a/modules/dnn/src/opencl/softmax.cl b/modules/dnn/src/opencl/softmax.cl
new file mode 100644
index 000000000..e9fcadce3
--- /dev/null
+++ b/modules/dnn/src/opencl/softmax.cl
@@ -0,0 +1,75 @@
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation and/or
+ *    other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/ + +__kernel void kernel_channel_max(const int num, const int channels, + const int spatial_dim, __global const T* data, __global T* out) { + int index = get_global_id(0); + if(index < num * spatial_dim) { + int n = index / spatial_dim; + int s = index % spatial_dim; + T maxval = -FLT_MAX; + for (int c = 0; c < channels; ++c) { + maxval = max(data[(n * channels + c) * spatial_dim + s], maxval); + } + out[index] = maxval; + } +} + +__kernel void kernel_channel_subtract(const int count, + const int num, const int channels, + const int spatial_dim, __global const T* channel_max, __global T* data) { + int index = get_global_id(0); + if(index < count) { + int n = index / channels / spatial_dim; + int s = index % spatial_dim; + data[index] -= channel_max[n * spatial_dim + s]; + } +} + +__kernel void kernel_channel_sum(const int num, const int channels, + const int spatial_dim, __global const T* data, __global T* channel_sum) { + int index = get_global_id(0); + if(index < num * spatial_dim) { + int n = index / spatial_dim; + int s = index % spatial_dim; + T sum = 0; + for (int c = 0; c < channels; ++c) { + sum += data[(n * channels + c) * spatial_dim + s]; + } + channel_sum[index] = sum; + } +} + +__kernel void kernel_channel_div(const int count, + const int num, const int channels, + const int spatial_dim, __global const T* channel_sum, __global T* data) { + int index = get_global_id(0); + if(index < count) { + int n = index / channels / spatial_dim; + int s = index % spatial_dim; + data[index] /= channel_sum[n * spatial_dim + s]; + } +} \ No newline at end of file diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp index 9958076e7..94d8945cf 100644 --- a/modules/dnn/test/test_layers.cpp +++ b/modules/dnn/test/test_layers.cpp @@ -87,7 +87,7 @@ void testLayerUsingCaffeModels(String basename, bool useCaffeModel = false, bool TEST(Layer_Test_Softmax, Accuracy) { - testLayerUsingCaffeModels("layer_softmax"); + OCL_OFF(testLayerUsingCaffeModels("layer_softmax")); } OCL_TEST(Layer_Test_Softmax, Accuracy) {