fixed several problems with CUDA 5.0

* gpu::LUT: uses device memory instead of host memory
* gpu::multiply: rounding mode for CV_8U depth
13 years ago · 2582464e51
parent 71625ad458
commit 2582464e51
4 changed files with 88 additions and 47 deletions
--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
@@ -320,12 +320,23 @@ void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s)
        const Npp32s* pLevels3[3];
        int nValues3[3];

+#if (CUDA_VERSION > 4020)
+        GpuMat d_pLevels;
+#endif
+
        LevelsInit()
        {
            nValues3[0] = nValues3[1] = nValues3[2] = 256;
            for (int i = 0; i < 256; ++i)
                pLevels[i] = i;
+
+
+#if (CUDA_VERSION <= 4020)
            pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels;
+#else
+            d_pLevels.upload(Mat(1, 256, CV_32S, pLevels));
+            pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>();
+#endif
        }
    };
    static LevelsInit lvls;
@@ -350,22 +361,48 @@ void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s)

    if (src.type() == CV_8UC1)
    {
+#if (CUDA_VERSION <= 4020)
        nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), lvls.pLevels, 256) );
+#else
+        GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data));
+        nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
+            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), lvls.d_pLevels.ptr<Npp32s>(), 256) );
+#endif
    }
    else
    {
-        Mat nppLut3[3];
        const Npp32s* pValues3[3];
+
+        Mat nppLut3[3];
        if (nppLut.channels() == 1)
+        {
+#if (CUDA_VERSION <= 4020)
            pValues3[0] = pValues3[1] = pValues3[2] = nppLut.ptr<Npp32s>();
+#else
+            GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data));
+            pValues3[0] = pValues3[1] = pValues3[2] = d_nppLut.ptr<Npp32s>();
+#endif
+        }
        else
        {
            cv::split(nppLut, nppLut3);
+
+#if (CUDA_VERSION <= 4020)
            pValues3[0] = nppLut3[0].ptr<Npp32s>();
            pValues3[1] = nppLut3[1].ptr<Npp32s>();
            pValues3[2] = nppLut3[2].ptr<Npp32s>();
+#else
+            GpuMat d_nppLut0(Mat(1, 256, CV_32S, nppLut3[0].data));
+            GpuMat d_nppLut1(Mat(1, 256, CV_32S, nppLut3[1].data));
+            GpuMat d_nppLut2(Mat(1, 256, CV_32S, nppLut3[2].data));
+
+            pValues3[0] = d_nppLut0.ptr<Npp32s>();
+            pValues3[1] = d_nppLut1.ptr<Npp32s>();
+            pValues3[2] = d_nppLut2.ptr<Npp32s>();
+#endif
        }
+
        nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, lvls.pLevels3, lvls.nValues3) );
    }
--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
@@ -658,7 +658,11 @@ void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, doub

        dst.create(src1.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src1.channels()));

+#if (CUDA_VERSION <= 4020)
        if (scale == 1 && dst.type() == src1.type() && src1.depth() <= CV_32F)
+#else
+        if (scale == 1 && dst.type() == src1.type() && src1.depth() <= CV_32F && src1.depth() > CV_8U)
+#endif
        {
            npp_funcs[src1.depth()](src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
            return;
--- a/modules/gpu/test/test_core.cpp
+++ b/modules/gpu/test/test_core.cpp
@@ -1189,18 +1189,18 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, AbsDiff, testing::Combine(
 ////////////////////////////////////////////////////////////////////////////////
 // Abs

-PARAM_TEST_CASE(Abs, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi)
+PARAM_TEST_CASE(Abs, cv::gpu::DeviceInfo, cv::Size, MatDepth, UseRoi)
 {
    cv::gpu::DeviceInfo devInfo;
    cv::Size size;
-    int type;
+    int depth;
    bool useRoi;

    virtual void SetUp()
    {
        devInfo = GET_PARAM(0);
        size = GET_PARAM(1);
-        type = GET_PARAM(2);
+        depth = GET_PARAM(2);
        useRoi = GET_PARAM(3);

        cv::gpu::setDevice(devInfo.deviceID());
@@ -1209,9 +1209,9 @@ PARAM_TEST_CASE(Abs, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi)

 TEST_P(Abs, Accuracy)
 {
-    cv::Mat src = randomMat(size, type);
+    cv::Mat src = randomMat(size, depth);

-    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+    cv::gpu::GpuMat dst = createMat(size, depth, useRoi);
    cv::gpu::abs(loadMat(src, useRoi), dst);

    cv::Mat dst_gold = cv::abs(src);
@@ -1222,24 +1222,24 @@ TEST_P(Abs, Accuracy)
 INSTANTIATE_TEST_CASE_P(GPU_Core, Abs, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
-    testing::Values(MatType(CV_16SC1), MatType(CV_32FC1)),
+    testing::Values(MatDepth(CV_16S), MatDepth(CV_32F)),
    WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
 // Sqr

-PARAM_TEST_CASE(Sqr, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi)
+PARAM_TEST_CASE(Sqr, cv::gpu::DeviceInfo, cv::Size, MatDepth, UseRoi)
 {
    cv::gpu::DeviceInfo devInfo;
    cv::Size size;
-    int type;
+    int depth;
    bool useRoi;

    virtual void SetUp()
    {
        devInfo = GET_PARAM(0);
        size = GET_PARAM(1);
-        type = GET_PARAM(2);
+        depth = GET_PARAM(2);
        useRoi = GET_PARAM(3);

        cv::gpu::setDevice(devInfo.deviceID());
@@ -1248,9 +1248,9 @@ PARAM_TEST_CASE(Sqr, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi)

 TEST_P(Sqr, Accuracy)
 {
-    cv::Mat src = randomMat(size, type);
+    cv::Mat src = randomMat(size, depth, 0, depth == CV_8U ? 16 : 255);

-    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+    cv::gpu::GpuMat dst = createMat(size, depth, useRoi);
    cv::gpu::sqr(loadMat(src, useRoi), dst);

    cv::Mat dst_gold;
@@ -1262,10 +1262,10 @@ TEST_P(Sqr, Accuracy)
 INSTANTIATE_TEST_CASE_P(GPU_Core, Sqr, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC1),
-                    MatType(CV_16UC1),
-                    MatType(CV_16SC1),
-                    MatType(CV_32FC1)),
+    testing::Values(MatDepth(CV_8U),
+                    MatDepth(CV_16U),
+                    MatDepth(CV_16S),
+                    MatDepth(CV_32F)),
    WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
@@ -1295,18 +1295,18 @@ void sqrtGold(const cv::Mat& src, cv::Mat& dst)
    funcs[src.depth()](src, dst);
 }

-PARAM_TEST_CASE(Sqrt, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi)
+PARAM_TEST_CASE(Sqrt, cv::gpu::DeviceInfo, cv::Size, MatDepth, UseRoi)
 {
    cv::gpu::DeviceInfo devInfo;
    cv::Size size;
-    int type;
+    int depth;
    bool useRoi;

    virtual void SetUp()
    {
        devInfo = GET_PARAM(0);
        size = GET_PARAM(1);
-        type = GET_PARAM(2);
+        depth = GET_PARAM(2);
        useRoi = GET_PARAM(3);

        cv::gpu::setDevice(devInfo.deviceID());
@@ -1315,24 +1315,24 @@ PARAM_TEST_CASE(Sqrt, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi)

 TEST_P(Sqrt, Accuracy)
 {
-    cv::Mat src = randomMat(size, type);
+    cv::Mat src = randomMat(size, depth);

-    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+    cv::gpu::GpuMat dst = createMat(size, depth, useRoi);
    cv::gpu::sqrt(loadMat(src, useRoi), dst);

    cv::Mat dst_gold;
    sqrtGold(src, dst_gold);

-    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
+    EXPECT_MAT_NEAR(dst_gold, dst, depth < CV_32F ? 1.0 : 1e-5);
 }

 INSTANTIATE_TEST_CASE_P(GPU_Core, Sqrt, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC1),
-                    MatType(CV_16UC1),
-                    MatType(CV_16SC1),
-                    MatType(CV_32FC1)),
+    testing::Values(MatDepth(CV_8U),
+                    MatDepth(CV_16U),
+                    MatDepth(CV_16S),
+                    MatDepth(CV_32F)),
    WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
@@ -1362,18 +1362,18 @@ void logGold(const cv::Mat& src, cv::Mat& dst)
    funcs[src.depth()](src, dst);
 }

-PARAM_TEST_CASE(Log, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi)
+PARAM_TEST_CASE(Log, cv::gpu::DeviceInfo, cv::Size, MatDepth, UseRoi)
 {
    cv::gpu::DeviceInfo devInfo;
    cv::Size size;
-    int type;
+    int depth;
    bool useRoi;

    virtual void SetUp()
    {
        devInfo = GET_PARAM(0);
        size = GET_PARAM(1);
-        type = GET_PARAM(2);
+        depth = GET_PARAM(2);
        useRoi = GET_PARAM(3);

        cv::gpu::setDevice(devInfo.deviceID());
@@ -1382,24 +1382,24 @@ PARAM_TEST_CASE(Log, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi)

 TEST_P(Log, Accuracy)
 {
-    cv::Mat src = randomMat(size, type, 1.0, 255.0);
+    cv::Mat src = randomMat(size, depth, 1.0, 255.0);

-    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+    cv::gpu::GpuMat dst = createMat(size, depth, useRoi);
    cv::gpu::log(loadMat(src, useRoi), dst);

    cv::Mat dst_gold;
    logGold(src, dst_gold);

-    EXPECT_MAT_NEAR(dst_gold, dst, 1e-6);
+    EXPECT_MAT_NEAR(dst_gold, dst, depth < CV_32F ? 1.0 : 1e-6);
 }

 INSTANTIATE_TEST_CASE_P(GPU_Core, Log, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC1),
-                    MatType(CV_16UC1),
-                    MatType(CV_16SC1),
-                    MatType(CV_32FC1)),
+    testing::Values(MatDepth(CV_8U),
+                    MatDepth(CV_16U),
+                    MatDepth(CV_16S),
+                    MatDepth(CV_32F)),
    WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
@@ -1439,18 +1439,18 @@ void expGold(const cv::Mat& src, cv::Mat& dst)
    funcs[src.depth()](src, dst);
 }

-PARAM_TEST_CASE(Exp, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi)
+PARAM_TEST_CASE(Exp, cv::gpu::DeviceInfo, cv::Size, MatDepth, UseRoi)
 {
    cv::gpu::DeviceInfo devInfo;
    cv::Size size;
-    int type;
+    int depth;
    bool useRoi;

    virtual void SetUp()
    {
        devInfo = GET_PARAM(0);
        size = GET_PARAM(1);
-        type = GET_PARAM(2);
+        depth = GET_PARAM(2);
        useRoi = GET_PARAM(3);

        cv::gpu::setDevice(devInfo.deviceID());
@@ -1459,24 +1459,24 @@ PARAM_TEST_CASE(Exp, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi)

 TEST_P(Exp, Accuracy)
 {
-    cv::Mat src = randomMat(size, type, 0.0, 10.0);
+    cv::Mat src = randomMat(size, depth, 0.0, 10.0);

-    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+    cv::gpu::GpuMat dst = createMat(size, depth, useRoi);
    cv::gpu::exp(loadMat(src, useRoi), dst);

    cv::Mat dst_gold;
    expGold(src, dst_gold);

-    EXPECT_MAT_NEAR(dst_gold, dst, 1e-2);
+    EXPECT_MAT_NEAR(dst_gold, dst, depth < CV_32F ? 1.0 : 1e-2);
 }

 INSTANTIATE_TEST_CASE_P(GPU_Core, Exp, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC1),
-                    MatType(CV_16UC1),
-                    MatType(CV_16SC1),
-                    MatType(CV_32FC1)),
+    testing::Values(MatDepth(CV_8U),
+                    MatDepth(CV_16U),
+                    MatDepth(CV_16S),
+                    MatDepth(CV_32F)),
    WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
--- a/modules/gpu/test/test_gpumat.cpp
+++ b/modules/gpu/test/test_gpumat.cpp
@@ -311,7 +311,7 @@ TEST_P(ConvertTo, WithScaling)
        cv::Mat dst_gold;
        src.convertTo(dst_gold, depth2, a, b);

-        EXPECT_MAT_NEAR(dst_gold, dst, depth2 < CV_32F ? 0.0 : 1e-4);
+        EXPECT_MAT_NEAR(dst_gold, dst, depth2 < CV_32F ? 1.0 : 1e-4);
    }
 }