moved integral to gpuarithm module

12 years ago · c56bdbc1c5
parent 10ac854358
commit c56bdbc1c5
11 changed files with 236 additions and 219 deletions
--- a/modules/gpu/include/opencv2/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu.hpp
@ -168,18 +168,6 @@ CV_EXPORTS void buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat &K
 CV_EXPORTS void rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0,
                       int interpolation = INTER_LINEAR, Stream& stream = Stream::Null());

-//! computes the integral image
-//! sum will have CV_32S type, but will contain unsigned int values
-//! supports only CV_8UC1 source type
-CV_EXPORTS void integral(const GpuMat& src, GpuMat& sum, Stream& stream = Stream::Null());
-//! buffered version
-CV_EXPORTS void integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, Stream& stream = Stream::Null());
-
-//! computes squared integral image
-//! result matrix will have 64F type, but will contain 64U values
-//! supports source images of 8UC1 type only
-CV_EXPORTS void sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& stream = Stream::Null());
-
 //! computes Harris cornerness criteria at each image pixel
 CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, int borderType = BORDER_REFLECT101);
 CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, double k, int borderType = BORDER_REFLECT101);
--- a/modules/gpu/perf/perf_imgproc.cpp
+++ b/modules/gpu/perf/perf_imgproc.cpp
@ -363,63 +363,6 @@ PERF_TEST_P(Sz_Depth_Op, ImgProc_Threshold,
    }
 }

-//////////////////////////////////////////////////////////////////////
-// Integral
-
-PERF_TEST_P(Sz, ImgProc_Integral,
-            GPU_TYPICAL_MAT_SIZES)
-{
-    const cv::Size size = GetParam();
-
-    cv::Mat src(size, CV_8UC1);
-    declare.in(src, WARMUP_RNG);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf;
-
-        TEST_CYCLE() cv::gpu::integralBuffered(d_src, dst, d_buf);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::integral(src, dst);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// IntegralSqr
-
-PERF_TEST_P(Sz, ImgProc_IntegralSqr,
-            GPU_TYPICAL_MAT_SIZES)
-{
-    const cv::Size size = GetParam();
-
-    cv::Mat src(size, CV_8UC1);
-    declare.in(src, WARMUP_RNG);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::sqrIntegral(d_src, dst);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
-
 //////////////////////////////////////////////////////////////////////
 // HistEvenC1

--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@ -55,9 +55,6 @@ void cv::gpu::buildWarpPlaneMaps(Size, Rect, const Mat&, const Mat&, const Mat&,
 void cv::gpu::buildWarpCylindricalMaps(Size, Rect, const Mat&, const Mat&, float, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
 void cv::gpu::buildWarpSphericalMaps(Size, Rect, const Mat&, const Mat&, float, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
 void cv::gpu::rotate(const GpuMat&, GpuMat&, Size, double, double, double, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::integral(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::integralBuffered(const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::sqrIntegral(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
 void cv::gpu::evenLevels(GpuMat&, int, int, int) { throw_no_cuda(); }
 void cv::gpu::histEven(const GpuMat&, GpuMat&, int, int, int, Stream&) { throw_no_cuda(); }
 void cv::gpu::histEven(const GpuMat&, GpuMat&, GpuMat&, int, int, int, Stream&) { throw_no_cuda(); }
@ -412,111 +409,6 @@ void cv::gpu::rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, d
    funcs[src.depth()][src.channels() - 1](src, dst, dsize, angle, xShift, yShift, interpolation, StreamAccessor::getStream(stream));
 }

-////////////////////////////////////////////////////////////////////////
-// integral
-
-void cv::gpu::integral(const GpuMat& src, GpuMat& sum, Stream& s)
-{
-    GpuMat buffer;
-    integralBuffered(src, sum, buffer, s);
-}
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream);
-    }
-}}}
-
-void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, Stream& s)
-{
-    CV_Assert(src.type() == CV_8UC1);
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    cv::Size whole;
-    cv::Point offset;
-
-    src.locateROI(whole, offset);
-
-    if (deviceSupports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048
-        && offset.x % 16 == 0 && ((src.cols + 63) / 64) * 64 <= (static_cast<int>(src.step) - offset.x))
-    {
-        ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 63) / 64) * 64, CV_32SC1, buffer);
-
-        cv::gpu::cudev::imgproc::shfl_integral_gpu(src, buffer, stream);
-
-        sum.create(src.rows + 1, src.cols + 1, CV_32SC1);
-        if (s)
-            s.enqueueMemSet(sum, Scalar::all(0));
-        else
-            sum.setTo(Scalar::all(0));
-
-        GpuMat inner = sum(Rect(1, 1, src.cols, src.rows));
-        GpuMat res = buffer(Rect(0, 0, src.cols, src.rows));
-
-        if (s)
-            s.enqueueCopy(res, inner);
-        else
-            res.copyTo(inner);
-    }
-    else
-    {
-        sum.create(src.rows + 1, src.cols + 1, CV_32SC1);
-
-        NcvSize32u roiSize;
-        roiSize.width = src.cols;
-        roiSize.height = src.rows;
-
-        cudaDeviceProp prop;
-        cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
-
-        Ncv32u bufSize;
-        ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
-        ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer);
-
-
-        NppStStreamHandler h(stream);
-
-        ncvSafeCall( nppiStIntegral_8u32u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>()), static_cast<int>(src.step),
-            sum.ptr<Ncv32u>(), static_cast<int>(sum.step), roiSize, buffer.ptr<Ncv8u>(), bufSize, prop) );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// sqrIntegral
-
-void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& s)
-{
-    CV_Assert(src.type() == CV_8U);
-
-    NcvSize32u roiSize;
-    roiSize.width = src.cols;
-    roiSize.height = src.rows;
-
-    cudaDeviceProp prop;
-    cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
-
-    Ncv32u bufSize;
-    ncvSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize, prop));
-    GpuMat buf(1, bufSize, CV_8U);
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    NppStStreamHandler h(stream);
-
-    sqsum.create(src.rows + 1, src.cols + 1, CV_64F);
-    ncvSafeCall(nppiStSqrIntegral_8u64u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>(0)), static_cast<int>(src.step),
-            sqsum.ptr<Ncv64u>(0), static_cast<int>(sqsum.step), roiSize, buf.ptr<Ncv8u>(0), bufSize, prop));
-
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-}
-

 ////////////////////////////////////////////////////////////////////////
 // Histogram
--- a/modules/gpu/test/test_imgproc.cpp
+++ b/modules/gpu/test/test_imgproc.cpp
@ -46,43 +46,6 @@

 using namespace cvtest;

-///////////////////////////////////////////////////////////////////////////////////////////////////////
-// Integral
-
-PARAM_TEST_CASE(Integral, cv::gpu::DeviceInfo, cv::Size, UseRoi)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    bool useRoi;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        useRoi = GET_PARAM(2);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(Integral, Accuracy)
-{
-    cv::Mat src = randomMat(size, CV_8UC1);
-
-    cv::gpu::GpuMat dst = createMat(cv::Size(src.cols + 1, src.rows + 1), CV_32SC1, useRoi);
-    cv::gpu::integral(loadMat(src, useRoi), dst);
-
-    cv::Mat dst_gold;
-    cv::integral(src, dst_gold, CV_32S);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Integral, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    WHOLE_SUBMAT));
-
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // HistEven

--- a/modules/gpuarithm/CMakeLists.txt
+++ b/modules/gpuarithm/CMakeLists.txt
@ -6,7 +6,7 @@ set(the_description "GPU-accelerated Operations on Matrices")

 ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wmissing-declarations)

-ocv_define_module(gpuarithm opencv_core OPTIONAL opencv_imgproc)
+ocv_define_module(gpuarithm opencv_core OPTIONAL opencv_gpunvidia opencv_imgproc)

 if(HAVE_CUBLAS)
  CUDA_ADD_CUBLAS_TO_TARGET(${the_module})
--- a/modules/gpuarithm/include/opencv2/gpuarithm.hpp
+++ b/modules/gpuarithm/include/opencv2/gpuarithm.hpp
@ -283,6 +283,18 @@ CV_EXPORTS void rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, co
 CV_EXPORTS void copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType,
                               const Scalar& value = Scalar(), Stream& stream = Stream::Null());

+//! computes the integral image
+//! sum will have CV_32S type, but will contain unsigned int values
+//! supports only CV_8UC1 source type
+CV_EXPORTS void integral(const GpuMat& src, GpuMat& sum, Stream& stream = Stream::Null());
+//! buffered version
+CV_EXPORTS void integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, Stream& stream = Stream::Null());
+
+//! computes squared integral image
+//! result matrix will have 64F type, but will contain 64U values
+//! supports source images of 8UC1 type only
+CV_EXPORTS void sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& stream = Stream::Null());
+
 }} // namespace cv { namespace gpu {

 #endif /* __OPENCV_GPUARITHM_HPP__ */
--- a/modules/gpuarithm/perf/perf_core.cpp
+++ b/modules/gpuarithm/perf/perf_core.cpp
@ -2156,11 +2156,11 @@ PERF_TEST_P(Sz_Depth_NormType, Core_Normalize,
    }
 }

+#ifdef HAVE_OPENCV_IMGPROC
+
 //////////////////////////////////////////////////////////////////////
 // CopyMakeBorder

-#ifdef HAVE_OPENCV_IMGPROC
-
 DEF_PARAM_TEST(Sz_Depth_Cn_Border, cv::Size, MatDepth, MatCn, BorderMode);

 PERF_TEST_P(Sz_Depth_Cn_Border, ImgProc_CopyMakeBorder,
@ -2198,4 +2198,61 @@ PERF_TEST_P(Sz_Depth_Cn_Border, ImgProc_CopyMakeBorder,
    }
 }

+//////////////////////////////////////////////////////////////////////
+// Integral
+
+PERF_TEST_P(Sz, ImgProc_Integral,
+            GPU_TYPICAL_MAT_SIZES)
+{
+    const cv::Size size = GetParam();
+
+    cv::Mat src(size, CV_8UC1);
+    declare.in(src, WARMUP_RNG);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;
+        cv::gpu::GpuMat d_buf;
+
+        TEST_CYCLE() cv::gpu::integralBuffered(d_src, dst, d_buf);
+
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::integral(src, dst);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// IntegralSqr
+
+PERF_TEST_P(Sz, ImgProc_IntegralSqr,
+            GPU_TYPICAL_MAT_SIZES)
+{
+    const cv::Size size = GetParam();
+
+    cv::Mat src(size, CV_8UC1);
+    declare.in(src, WARMUP_RNG);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;
+
+        TEST_CYCLE() cv::gpu::sqrIntegral(d_src, dst);
+
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        FAIL_NO_CPU();
+    }
+}
+
 #endif
--- a/modules/gpuarithm/src/arithm.cpp
+++ b/modules/gpuarithm/src/arithm.cpp
@ -61,6 +61,9 @@ void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool,
 void cv::gpu::normalize(const GpuMat&, GpuMat&, double, double, int, int, const GpuMat&) { throw_no_cuda(); }
 void cv::gpu::normalize(const GpuMat&, GpuMat&, double, double, int, int, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
 void cv::gpu::copyMakeBorder(const GpuMat&, GpuMat&, int, int, int, int, int, const Scalar&, Stream&) { throw_no_cuda(); }
+void cv::gpu::integral(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+void cv::gpu::integralBuffered(const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+void cv::gpu::sqrIntegral(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }

 #else /* !defined (HAVE_CUDA) */

@ -718,4 +721,119 @@ void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom
    }
 }

+////////////////////////////////////////////////////////////////////////
+// integral
+
+void cv::gpu::integral(const GpuMat& src, GpuMat& sum, Stream& s)
+{
+    GpuMat buffer;
+    integralBuffered(src, sum, buffer, s);
+}
+
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace imgproc
+    {
+        void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream);
+    }
+}}}
+
+void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, Stream& s)
+{
+    CV_Assert(src.type() == CV_8UC1);
+
+    cudaStream_t stream = StreamAccessor::getStream(s);
+
+    cv::Size whole;
+    cv::Point offset;
+
+    src.locateROI(whole, offset);
+
+    if (deviceSupports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048
+        && offset.x % 16 == 0 && ((src.cols + 63) / 64) * 64 <= (static_cast<int>(src.step) - offset.x))
+    {
+        ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 63) / 64) * 64, CV_32SC1, buffer);
+
+        cv::gpu::cudev::imgproc::shfl_integral_gpu(src, buffer, stream);
+
+        sum.create(src.rows + 1, src.cols + 1, CV_32SC1);
+        if (s)
+            s.enqueueMemSet(sum, Scalar::all(0));
+        else
+            sum.setTo(Scalar::all(0));
+
+        GpuMat inner = sum(Rect(1, 1, src.cols, src.rows));
+        GpuMat res = buffer(Rect(0, 0, src.cols, src.rows));
+
+        if (s)
+            s.enqueueCopy(res, inner);
+        else
+            res.copyTo(inner);
+    }
+    else
+    {
+#ifndef HAVE_OPENCV_GPUNVIDIA
+    throw_no_cuda();
+#else
+        sum.create(src.rows + 1, src.cols + 1, CV_32SC1);
+
+        NcvSize32u roiSize;
+        roiSize.width = src.cols;
+        roiSize.height = src.rows;
+
+        cudaDeviceProp prop;
+        cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
+
+        Ncv32u bufSize;
+        ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
+        ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer);
+
+        NppStStreamHandler h(stream);
+
+        ncvSafeCall( nppiStIntegral_8u32u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>()), static_cast<int>(src.step),
+            sum.ptr<Ncv32u>(), static_cast<int>(sum.step), roiSize, buffer.ptr<Ncv8u>(), bufSize, prop) );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+#endif
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// sqrIntegral
+
+void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& s)
+{
+#ifndef HAVE_OPENCV_GPUNVIDIA
+    (void) src;
+    (void) sqsum;
+    (void) s;
+    throw_no_cuda();
+#else
+    CV_Assert(src.type() == CV_8U);
+
+    NcvSize32u roiSize;
+    roiSize.width = src.cols;
+    roiSize.height = src.rows;
+
+    cudaDeviceProp prop;
+    cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
+
+    Ncv32u bufSize;
+    ncvSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize, prop));
+    GpuMat buf(1, bufSize, CV_8U);
+
+    cudaStream_t stream = StreamAccessor::getStream(s);
+
+    NppStStreamHandler h(stream);
+
+    sqsum.create(src.rows + 1, src.cols + 1, CV_64F);
+    ncvSafeCall(nppiStSqrIntegral_8u64u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>(0)), static_cast<int>(src.step),
+            sqsum.ptr<Ncv64u>(0), static_cast<int>(sqsum.step), roiSize, buf.ptr<Ncv8u>(0), bufSize, prop));
+
+    if (stream == 0)
+        cudaSafeCall( cudaDeviceSynchronize() );
+#endif
+}
+
 #endif /* !defined (HAVE_CUDA) */
--- a/modules/gpuarithm/src/cuda/integral.cu
+++ b/modules/gpuarithm/src/cuda/integral.cu
--- a/modules/gpuarithm/src/precomp.hpp
+++ b/modules/gpuarithm/src/precomp.hpp
@ -51,6 +51,13 @@

 #include "opencv2/core/gpu_private.hpp"

+#include "opencv2/opencv_modules.hpp"
+
+#ifdef HAVE_OPENCV_GPUNVIDIA
+#  include "opencv2/gpunvidia.hpp"
+#  include "opencv2/gpunvidia/private.hpp"
+#endif
+
 #ifdef HAVE_CUBLAS
    #include <cublas.h>
 #endif
--- a/modules/gpuarithm/test/test_core.cpp
+++ b/modules/gpuarithm/test/test_core.cpp
@ -3607,11 +3607,11 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Normalize, testing::Combine(
    testing::Values(NormCode(cv::NORM_L1), NormCode(cv::NORM_L2), NormCode(cv::NORM_INF), NormCode(cv::NORM_MINMAX)),
    WHOLE_SUBMAT));

+#ifdef HAVE_OPENCV_IMGPROC
+
 //////////////////////////////////////////////////////////////////////////////
 // CopyMakeBorder

-#ifdef HAVE_OPENCV_IMGPROC
-
 namespace
 {
    IMPLEMENT_PARAM_CLASS(Border, int)
@ -3669,6 +3669,43 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CopyMakeBorder, testing::Combine(
    ALL_BORDER_TYPES,
    WHOLE_SUBMAT));

+///////////////////////////////////////////////////////////////////////////////////////////////////////
+// Integral
+
+PARAM_TEST_CASE(Integral, cv::gpu::DeviceInfo, cv::Size, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    bool useRoi;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        useRoi = GET_PARAM(2);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(Integral, Accuracy)
+{
+    cv::Mat src = randomMat(size, CV_8UC1);
+
+    cv::gpu::GpuMat dst = createMat(cv::Size(src.cols + 1, src.rows + 1), CV_32SC1, useRoi);
+    cv::gpu::integral(loadMat(src, useRoi), dst);
+
+    cv::Mat dst_gold;
+    cv::integral(src, dst_gold, CV_32S);
+
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Integral, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    WHOLE_SUBMAT));
+
 #endif

 #endif // HAVE_CUDA