From 8a799aa89a81ec75b5261d6abc180f4c56393052 Mon Sep 17 00:00:00 2001
From: Alexey Spizhevoy <no@email>
Date: Mon, 3 Oct 2011 14:05:52 +0000
Subject: [PATCH] Updated optimal block size estimation for the convolve()
 function

---
 modules/gpu/perf/perf_imgproc.cpp       | 12 +++++++-----
 modules/gpu/perf/perf_utility.hpp       |  1 +
 modules/gpu/src/imgproc.cpp             | 11 ++++++++---
 samples/gpu/performance/performance.cpp | 21 ++++++++++++++-------
 samples/gpu/performance/performance.h   |  8 +++++++-
 5 files changed, 37 insertions(+), 16 deletions(-)

diff --git a/modules/gpu/perf/perf_imgproc.cpp b/modules/gpu/perf/perf_imgproc.cpp
index f239edb725..81dd559543 100644
--- a/modules/gpu/perf/perf_imgproc.cpp
+++ b/modules/gpu/perf/perf_imgproc.cpp
@@ -735,16 +735,18 @@ PERF_TEST_P(DevInfo_Size, dft, testing::Combine(testing::ValuesIn(devices()),
     SANITY_CHECK(dst_host);
 }
 
-PERF_TEST_P(DevInfo_Size, convolve, testing::Combine(testing::ValuesIn(devices()),
-                                                testing::Values(GPU_TYPICAL_MAT_SIZES)))
+PERF_TEST_P(DevInfo_Int_Int, convolve, testing::Combine(testing::ValuesIn(devices()),
+                                                     testing::Values(512, 1024, 1536, 2048, 2560, 3072, 3584),
+                                                     testing::Values(27, 32, 64)))
 {
     DeviceInfo devInfo = std::tr1::get<0>(GetParam());
-    Size size = std::tr1::get<1>(GetParam());
+    int image_size = std::tr1::get<1>(GetParam());
+    int templ_size = std::tr1::get<2>(GetParam());
 
     setDevice(devInfo.deviceID());
 
-    Mat image_host(size, CV_32FC1);
-    Mat templ_host(size, CV_32FC1);
+    Mat image_host(image_size, image_size, CV_32FC1);
+    Mat templ_host(templ_size, templ_size, CV_32FC1);
 
     declare.in(image_host, templ_host, WARMUP_RNG);
 
diff --git a/modules/gpu/perf/perf_utility.hpp b/modules/gpu/perf/perf_utility.hpp
index a57e3671b2..17f9418136 100644
--- a/modules/gpu/perf/perf_utility.hpp
+++ b/modules/gpu/perf/perf_utility.hpp
@@ -32,6 +32,7 @@ struct CvtColorInfo
 
 typedef TestBaseWithParam<DeviceInfo> DevInfo;
 typedef TestBaseWithParam< std::tr1::tuple<DeviceInfo, Size> > DevInfo_Size;
+typedef TestBaseWithParam< std::tr1::tuple<DeviceInfo, int, int> > DevInfo_Int_Int;
 typedef TestBaseWithParam< std::tr1::tuple<DeviceInfo, MatType> > DevInfo_MatType;
 typedef TestBaseWithParam< std::tr1::tuple<DeviceInfo, Size, MatType> > DevInfo_Size_MatType;
 typedef TestBaseWithParam< std::tr1::tuple<DeviceInfo, Size, MatType, MatType> > DevInfo_Size_MatType_MatType;
diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp
index 8b86ce61f1..47b09986a4 100644
--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@@ -1546,18 +1546,23 @@ void cv::gpu::ConvolveBuf::create(Size image_size, Size templ_size)
 Size cv::gpu::ConvolveBuf::estimateBlockSize(Size result_size, Size templ_size)
 {
     int scale = 40;
-    Size bsize_min(1024, 1024);
+    Size bsize_min(512, 512);
 
     // Check whether we use Fermi generation or newer GPU
     if (DeviceInfo().majorVersion() >= 2)
     {
-        bsize_min.width = 2048;
-        bsize_min.height = 2048;
+        bsize_min.width = 1024;
+        bsize_min.height = 1024;
     }
 
     Size bsize(std::max(templ_size.width * scale, bsize_min.width),
                std::max(templ_size.height * scale, bsize_min.height));
 
+    int blocks_per_row = (result_size.width + bsize.width - 1) / bsize.width;
+    int blocks_per_col = (result_size.height + bsize.height - 1) / bsize.height;
+    bsize.width = (result_size.width + blocks_per_row - 1) / blocks_per_row;
+    bsize.height = (result_size.height + blocks_per_col - 1) / blocks_per_col;
+
     bsize.width = std::min(bsize.width, result_size.width);
     bsize.height = std::min(bsize.height, result_size.height);
     return bsize;
diff --git a/samples/gpu/performance/performance.cpp b/samples/gpu/performance/performance.cpp
index b9bbc85c11..6b1619e84b 100644
--- a/samples/gpu/performance/performance.cpp
+++ b/samples/gpu/performance/performance.cpp
@@ -8,9 +8,15 @@ using namespace cv;
 
 void TestSystem::run()
 {
-    // Run test initializers
-    vector<Runnable*>::iterator it = inits_.begin();
-    for (; it != inits_.end(); ++it)
+    if (is_list_mode_)
+    {
+        for (vector<Runnable*>::iterator it = tests_.begin(); it != tests_.end(); ++it)
+            cout << (*it)->name() << endl;
+        return;
+    }
+
+    // Run test initializers
+    for (vector<Runnable*>::iterator it = inits_.begin(); it != inits_.end(); ++it)
     {
         if ((*it)->name().find(test_filter_, 0) != string::npos)
             (*it)->run();
@@ -19,8 +25,7 @@ void TestSystem::run()
     printHeading();
 
     // Run tests
-    it = tests_.begin();
-    for (; it != tests_.end(); ++it)
+    for (vector<Runnable*>::iterator it = tests_.begin(); it != tests_.end(); ++it)
     {
         try
         {
@@ -145,13 +150,15 @@ int main(int argc, char** argv)
         string key = argv[i];
         if (key == "--help")
         {
-            cout << "Usage: performance_gpu [--filter <test_filter>] [--working-dir <working_dir_with_slash>]\n";
+            cout << "Usage: performance_gpu [--ls] [--filter <test_filter>] [--workdir <working_dir_with_slash>]\n";
             return 0;
         }
         if (key == "--filter" && i + 1 < argc)
             TestSystem::instance().setTestFilter(argv[++i]);
-        else if (key == "--working-dir" && i + 1 < argc)
+        else if (key == "--workdir" && i + 1 < argc)
             TestSystem::instance().setWorkingDir(argv[++i]);
+        else if (key == "--ls")
+            TestSystem::instance().setListMode(true);
         else 
         {
             cout << "Unknown parameter: '" << key << "'" << endl;
diff --git a/samples/gpu/performance/performance.h b/samples/gpu/performance/performance.h
index 0031950475..007309bcf0 100644
--- a/samples/gpu/performance/performance.h
+++ b/samples/gpu/performance/performance.h
@@ -68,10 +68,14 @@ public:
         cur_subtest_is_empty_ = false;
     }
 
+    bool isListMode() const { return is_list_mode_; }
+    void setListMode(bool value) { is_list_mode_ = value; }
+
 private:
     TestSystem(): cur_subtest_is_empty_(true), cpu_elapsed_(0),
                   gpu_elapsed_(0), speedup_total_(0.0),
-                  num_subtests_called_(0) {}
+                  num_subtests_called_(0),
+                  is_list_mode_(false) {}
 
     void finishCurrentSubtest();
     void resetCurrentSubtest() 
@@ -100,6 +104,8 @@ private:
 
     double speedup_total_;
     int num_subtests_called_;
+
+    bool is_list_mode_;
 };