fixed block size calculation in SURF_GPU (fasthessian_gpu and nonmaxonly_gpu kernels)

14 years ago · 924670d32c
parent d03b89f163
commit 924670d32c
4 changed files with 37 additions and 25 deletions
--- a/modules/gpu/src/cuda/surf.cu
+++ b/modules/gpu/src/cuda/surf.cu
@ -237,20 +237,31 @@ namespace cv { namespace gpu { namespace surf

 	        hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x] = result;
        }
-    }   
-
-    void fasthessian_gpu(PtrStepf hessianBuffer, int nIntervals, int x_size, int y_size)
+    }
+    
+    dim3 calcBlockSize(int nIntervals)
    {
-        dim3 threads;
-        threads.x = 16;
-        threads.y = 8;
+        int threadsPerBlock = 512;
+        
+        dim3 threads;        
        threads.z = nIntervals;
+        threadsPerBlock /= nIntervals;
+        if (threadsPerBlock >= 48)
+            threads.x = 16;
+        else
+            threads.x = 8;
+        threadsPerBlock /= threads.x;
+        threads.y = threadsPerBlock;
+        
+        return threads;
+    }

+    void fasthessian_gpu(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads)
+    {
        dim3 grid;
        grid.x = divUp(x_size, threads.x);
        grid.y = divUp(y_size, threads.y);
-        grid.z = 1;
-
+        
  	    fasthessian<<<grid, threads>>>(hessianBuffer);

        cudaSafeCall( cudaThreadSynchronize() );
@ -370,17 +381,11 @@ namespace cv { namespace gpu { namespace surf
    }

    void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter, 
-        int nIntervals, int x_size, int y_size, bool use_mask)
+        int x_size, int y_size, bool use_mask, const dim3& threads)
    {
-        dim3 threads;
-        threads.x = 16;
-        threads.y = 8;
-        threads.z = nIntervals;
-
        dim3 grid;
        grid.x = divUp(x_size, threads.x - 2);
        grid.y = divUp(y_size, threads.y - 2);
-        grid.z = 1;

        const size_t smem_size = threads.x * threads.y * threads.z * sizeof(float);

@ -565,8 +570,6 @@ namespace cv { namespace gpu { namespace surf
    
        dim3 grid;
        grid.x = maxCounter;
-        grid.y = 1; 
-        grid.z = 1;

        DeviceReference<unsigned int> featureCounterWrapper(featureCounter);
    
@ -624,6 +627,7 @@ namespace cv { namespace gpu { namespace surf
 	    // - SURF says to only use a circle, but the branching logic would slow it down
 	    // - Gaussian weighting should reduce the effects of the outer points anyway
        if (tid2 < 169)
+
        {
 	        dx -=     texLookups[threadIdx.x    ][threadIdx.y    ];
 	        dx += 2.f*texLookups[threadIdx.x + 2][threadIdx.y    ];
@ -709,8 +713,6 @@ namespace cv { namespace gpu { namespace surf

        dim3 grid;
        grid.x = nFeatures;
-        grid.y = 1;
-        grid.z = 1;

        find_orientation<<<grid, threads>>>(features);
        cudaSafeCall( cudaThreadSynchronize() );
--- a/modules/gpu/src/surf.cpp
+++ b/modules/gpu/src/surf.cpp
@ -61,11 +61,13 @@ void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint
 #else /* !defined (HAVE_CUDA) */

 namespace cv { namespace gpu { namespace surf
-{    
-    void fasthessian_gpu(PtrStepf hessianBuffer, int nIntervals, int x_size, int y_size);
+{
+    dim3 calcBlockSize(int nIntervals);
+    
+    void fasthessian_gpu(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads);
    
    void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter, 
-        int nIntervals, int x_size, int y_size, bool use_mask);
+        int x_size, int y_size, bool use_mask, const dim3& threads);
    
    void fh_interp_extremum_gpu(PtrStepf hessianBuffer, const int4* maxPosBuffer, unsigned int maxCounter, 
        KeyPoint_GPU* featuresBuffer, unsigned int& featureCounter);
@ -103,7 +105,7 @@ namespace
        {
            CV_Assert(!img.empty() && img.type() == CV_8UC1);
            CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
-            CV_Assert(nOctaves > 0 && nIntervals > 2);
+            CV_Assert(nOctaves > 0 && nIntervals > 2 && nIntervals < 22);
            CV_Assert(DeviceInfo().has(ATOMICS));

            max_features = static_cast<int>(img.size().area() * featuresRatio);
@ -168,6 +170,7 @@ namespace

        void detectKeypoints(GpuMat& keypoints)
        {
+            dim3 threads = calcBlockSize(nIntervals);
            for(int octave = 0; octave < nOctaves; ++octave)
            {
                int step = initialStep * (1 << octave);
@ -189,12 +192,12 @@ namespace
                uploadConstant("cv::gpu::surf::c_border", border);
                uploadConstant("cv::gpu::surf::c_step",   step);

-                fasthessian_gpu(hessianBuffer, nIntervals, x_size, y_size);
+                fasthessian_gpu(hessianBuffer, x_size, y_size, threads);

                // Reset the candidate count.
                maxCounter = 0;

-                nonmaxonly_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter, nIntervals, x_size, y_size, use_mask); 
+                nonmaxonly_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter, x_size, y_size, use_mask, threads); 
                
                maxCounter = std::min(maxCounter, static_cast<unsigned int>(max_candidates));

--- a/samples/gpu/surf_keypoint_matcher.cpp
+++ b/samples/gpu/surf_keypoint_matcher.cpp
@ -38,6 +38,9 @@ int main(int argc, char* argv[])
    GpuMat descriptors1GPU, descriptors2GPU;
    surf(img1, GpuMat(), keypoints1GPU, descriptors1GPU);
    surf(img2, GpuMat(), keypoints2GPU, descriptors2GPU);
+    
+    cout << "FOUND " << keypoints1GPU.cols << " keypoints on first image" << endl;
+    cout << "FOUND " << keypoints2GPU.cols << " keypoints on second image" << endl;

    // matching descriptors
    BruteForceMatcher_GPU< L2<float> > matcher;
@ -57,6 +60,8 @@ int main(int argc, char* argv[])
    // drawing the results
    Mat img_matches;
    drawMatches(img1, keypoints1, img2, keypoints2, matches, img_matches);
+    
+    namedWindow("matches", 0);
    imshow("matches", img_matches);
    waitKey(0);

--- a/tests/gpu/src/features2d.cpp
+++ b/tests/gpu/src/features2d.cpp
@ -149,12 +149,14 @@ void CV_GPU_SURFTest::compareKeypointSets(const vector<KeyPoint>& validKeypoints
        assert(minDist >= 0);
        if (!isSimilarKeypoints(validKeypoints[v], calcKeypoints[nearestIdx]))
        {
+            ts->printf(CvTS::LOG, "Bad keypoints accuracy.\n");
            ts->set_failed_test_info( CvTS::FAIL_BAD_ACCURACY );
            return;
        }

        if (norm(validDescriptors.row(v), calcDescriptors.row(nearestIdx), NORM_L2) > 1.0f)
        {
+            ts->printf(CvTS::LOG, "Bad descriptors accuracy.\n");
            ts->set_failed_test_info( CvTS::FAIL_BAD_ACCURACY );
            return;
        }