multi-threaded scatter and refactor perf

pull/24813/head
fengyuentau 1 year ago
parent a8ec658611
commit 63cde0b90d
  1. modules/dnn/perf/perf_layer.cpp (115 changed lines)
  2. modules/dnn/src/layers/scatter_layer.cpp (92 changed lines)

@@ -258,76 +258,71 @@ PERF_TEST_P_(Layer_Slice, FastNeuralStyle_eccv16)
test_slice<4>(inputShape, begin, end);
}
struct Layer_Scatter : public TestBaseWithParam<tuple<Backend, Target> >
{
void test_layer(const std::vector<int>& shape, const String reduction = "none", int axis = 0)
{
int backendId = get<0>(GetParam());
int targetId = get<1>(GetParam());
Mat data(shape, CV_32FC1);
Mat indices(shape, CV_32FC1);
Mat updates(shape, CV_32FC1);
Scalar mean = 0.f;
Scalar std = 1.f;
randn(data, mean, std);
randu(indices, 0, shape[axis]);
randn(updates, mean, std);
indices.convertTo(indices, CV_32SC1, 1, -1);
using Layer_Scatter = TestBaseWithParam<tuple<std::vector<int>, std::string, int, tuple<Backend, Target>>>;
PERF_TEST_P_(Layer_Scatter, scatter) {
std::vector<int> shape = get<0>(GetParam());
std::string reduction = get<1>(GetParam());
int axis = get<2>(GetParam());
int backend_id = get<0>(get<3>(GetParam()));
int target_id = get<1>(get<3>(GetParam()));
Net net;
LayerParams lp;
lp.type = "Scatter";
lp.name = "testLayer";
lp.set("reduction", reduction);
lp.set("axis", axis);
Mat data(shape, CV_32FC1);
Mat indices(shape, CV_32FC1);
Mat updates(shape, CV_32FC1);
int id = net.addLayerToPrev(lp.name, lp.type, lp);
net.connect(0, 0, id, 0);
net.connect(0, 1, id, 1);
net.connect(0, 2, id, 2);
randn(data, 0.f, 1.f);
randu(indices, 0, shape[axis]);
randn(updates, 0.f, 1.f);
// warmup
{
std::vector<String> inpNames(3);
inpNames[0] = "data";
inpNames[1] = "indices";
inpNames[2] = "updates";
net.setInputsNames(inpNames);
net.setInput(data, inpNames[0]);
net.setInput(indices, inpNames[1]);
net.setInput(updates, inpNames[2]);
indices.convertTo(indices, CV_32SC1, 1, -1);
net.setPreferableBackend(backendId);
net.setPreferableTarget(targetId);
Mat out = net.forward();
}
Net net;
LayerParams lp;
lp.type = "Scatter";
lp.name = "testLayer";
lp.set("reduction", reduction);
lp.set("axis", axis);
TEST_CYCLE()
{
Mat res = net.forward();
}
int id = net.addLayerToPrev(lp.name, lp.type, lp);
net.connect(0, 0, id, 0);
net.connect(0, 1, id, 1);
net.connect(0, 2, id, 2);
SANITY_CHECK_NOTHING();
// warmup
{
std::vector<String> input_names{"data", "indices", "updates"};
net.setInputsNames(input_names);
net.setInput(data, input_names[0]);
net.setInput(indices, input_names[1]);
net.setInput(updates, input_names[2]);
net.setPreferableBackend(backend_id);
net.setPreferableTarget(target_id);
Mat out = net.forward();
}
int N = 8;
int C = 256;
int H = 128;
int W = 100;
};
// perf
TEST_CYCLE()
{
Mat res = net.forward();
}
PERF_TEST_P_(Layer_Scatter, DISABLED_Scatter)
{
test_layer({N, C, H, W});
SANITY_CHECK_NOTHING();
}
PERF_TEST_P_(Layer_Scatter, DISABLED_Scatter_add)
{
test_layer({N, C, H, W}, "add");
}
INSTANTIATE_TEST_CASE_P(/**/, Layer_Scatter, Combine(
Values(std::vector<int>{2, 128, 64, 50}),
Values(std::string("none"), std::string("add")),
Values(0), // use Values(0, 1, 2, 3) for more details
dnnBackendsAndTargets(/* withInferenceEngine= */ false,
/* withHalide= */ false,
/* withCpuOCV= */ true,
/* withVkCom= */ false,
/* withCUDA= */ false,
/* withNgraph= */ false,
/* withWebnn= */ false,
/* withCann= */ false) // only test on CPU
));
struct Layer_ScatterND : public TestBaseWithParam<tuple<Backend, Target> >
{
@@ -800,7 +795,7 @@ INSTANTIATE_TEST_CASE_P(/**/, Layer_NaryEltwise, testing::Values(std::make_tuple
#ifdef HAVE_CUDA
INSTANTIATE_TEST_CASE_P(CUDA, Layer_NaryEltwise, testing::Values(std::make_tuple(DNN_BACKEND_CUDA, DNN_TARGET_CUDA)));
#endif
INSTANTIATE_TEST_CASE_P(/**/, Layer_Scatter, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
// INSTANTIATE_TEST_CASE_P(/**/, Layer_Scatter, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
INSTANTIATE_TEST_CASE_P(/**/, Layer_ScatterND, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNorm, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNormExpanded, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
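The refactor above folds the old fixed-shape DISABLED_Scatter / DISABLED_Scatter_add cases into a single value-parameterized test whose shape, reduction and axis come from the parameter tuple, instantiated through Combine. Below is a minimal sketch of that instantiation pattern in plain GoogleTest, outside OpenCV's perf framework; the ScatterPerfSketch / ScatterSweep names and the check body are illustrative only.

// Plain-GoogleTest sketch of the Combine/Values parameterization used by the
// refactored perf test; link against gtest_main to run it.
#include <gtest/gtest.h>
#include <string>
#include <tuple>
#include <vector>

using ScatterParams = std::tuple<std::vector<int>, std::string, int>;  // shape, reduction, axis

class ScatterPerfSketch : public ::testing::TestWithParam<ScatterParams> {};

TEST_P(ScatterPerfSketch, expands_combinations)
{
    const std::vector<int>& shape = std::get<0>(GetParam());
    const std::string& reduction  = std::get<1>(GetParam());
    const int axis                = std::get<2>(GetParam());

    // Stand-in for the real benchmark body: just sanity-check the parameters.
    ASSERT_LT(axis, static_cast<int>(shape.size()));
    ASSERT_TRUE(reduction == "none" || reduction == "add");
}

// Every combination of the three generators becomes its own test instance,
// so extra shapes or axes are added by editing the lists instead of writing
// more DISABLED_* bodies.
INSTANTIATE_TEST_CASE_P(ScatterSweep, ScatterPerfSketch, ::testing::Combine(
    ::testing::Values(std::vector<int>{2, 128, 64, 50}),
    ::testing::Values(std::string("none"), std::string("add")),
    ::testing::Values(0)  // extend to 0, 1, 2, 3 to sweep more axes
));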

@@ -81,59 +81,63 @@ public:
}
template<typename T, typename Functor>
void forward_impl(const Functor& rd, const Mat& data, const Mat& indices, const Mat& updates, Mat& out)
void forward_impl(const Functor& reduce_operation, const Mat& input_mat, const Mat& indices_mat, const Mat& updates_mat, Mat& output_mat)
{
data.copyTo(out);
input_mat.copyTo(output_mat);
const int ndims = data.dims;
const int* shape = data.size.p;
const size_t* step = data.step.p;
const int ndims = input_mat.dims;
const auto &input_mat_shape = shape(input_mat);
std::vector<size_t> input_mat_step(ndims);
const int* ind_shape = indices.size.p;
const size_t* ind_step = indices.step.p;
const auto &indices_mat_shape = shape(indices_mat);
// const auto &indices_mat_step = indices_mat.step;
std::vector<size_t> indices_mat_step(ndims);
size_t inp_offset = 0;
size_t ind_offset = 0;
const T* p_index = indices.ptr<const T>();
const T* p_update = updates.ptr<const T>();
T* p_out = out.ptr<T>();
size_t total = indices.total();
for (int i = 0; i < ndims; i++) {
input_mat_step[i] = static_cast<size_t>(input_mat.step.p[i] / sizeof(T));
indices_mat_step[i] = static_cast<size_t>(indices_mat.step.p[i] / sizeof(T));
}
int j, offset_at_idx, index;
size_t t, idx;
for (size_t i = 0; i < total; i++)
{
t = i;
inp_offset = 0;
ind_offset = 0;
int offset_at_axis = 0;
for (j = ndims - 1; j >= 0; j--)
{
idx = t / ind_shape[j];
offset_at_idx = (int)(t - idx * ind_shape[j]);
ind_offset += offset_at_idx * ind_step[j];
inp_offset += offset_at_idx * step[j];
t = idx;
if (j == axis)
{
offset_at_axis = offset_at_idx * step[j];
const T* indices = indices_mat.ptr<const T>();
const T* updates = updates_mat.ptr<const T>();
T* output = output_mat.ptr<T>();
auto fn = [&](const Range &r) {
size_t input_offset = 0, indices_offset = 0;
int indices_index, index;
size_t axis_offset, tmp_index, j_index;
for (int i = r.start; i < r.end; i++) {
input_offset = 0;
indices_offset = 0;
indices_index = i;
axis_offset = 0;
for (int j = ndims - 1; j >= 0; j--) {
tmp_index = indices_index / indices_mat_shape[j];
j_index = (size_t)(indices_index - tmp_index * indices_mat_shape[j]);
input_offset += j_index * input_mat_step[j];
indices_offset += j_index * indices_mat_step[j];
indices_index = tmp_index;
if (j == axis) {
axis_offset = j_index * input_mat_step[j];
}
}
}
ind_offset /= sizeof(T);
// get index and overwrite current indices
const T* tmp_p_index = p_index + ind_offset;
index = (int)(*tmp_p_index);
CV_Assert(index < shape[axis] && index > -shape[axis]);
// get index and overwrite current indices
index = static_cast<int>(*(indices + indices_offset));
index = (index + input_mat_shape[axis]) % input_mat_shape[axis];
CV_Assert(index < input_mat_shape[axis] && index >= 0);
input_offset = input_offset - axis_offset + index * input_mat_step[axis];
inp_offset = inp_offset - offset_at_axis + ((index + shape[axis]) % shape[axis]) * step[axis];
inp_offset /= sizeof(T);
const T* update = updates + indices_offset;
T* y = output + input_offset;
*y = reduce_operation(*y, *update);
}
};
const T* tmp_p_update = p_update + ind_offset;
T* tmp_p_out = p_out + inp_offset;
*tmp_p_out = rd(*tmp_p_out, *tmp_p_update);
}
size_t total = indices_mat.total();
double nstripes = (size_t)total * ndims * (1 / 1024.0);
parallel_for_(Range(0, total), fn, nstripes);
}
template<typename... Args>
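
The rewritten forward_impl above computes the per-element input/indices offsets inside a lambda and hands the flat index range to cv::parallel_for_, with the nstripes hint scaled as total * ndims / 1024 so small tensors stay effectively single-threaded. A minimal standalone sketch of that parallel_for_ pattern follows; it is not taken from the patch, and the buffer size and per-element work are illustrative.

// Standalone sketch of the cv::parallel_for_ pattern used by the new kernel:
// the Range is split into stripes and the lambda runs on sub-ranges from the
// OpenCV thread pool.
#include <opencv2/core.hpp>
#include <opencv2/core/utility.hpp>
#include <vector>

int main()
{
    const int total = 1 << 20;
    std::vector<float> output(total, 0.f);

    auto fn = [&](const cv::Range& r) {
        // Each stripe touches a disjoint slice of `output` in this toy example,
        // so no synchronization is needed.
        for (int i = r.start; i < r.end; i++)
            output[i] += 1.f;
    };

    // A larger nstripes hint splits the range more finely; deriving it from the
    // element count (mirroring total * ndims / 1024 above) keeps small inputs
    // on a single thread.
    double nstripes = total * (1 / 1024.0);
    cv::parallel_for_(cv::Range(0, total), fn, nstripes);

    return 0;
}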
