Fix a bug in the OpenCL (ocl) retina implementation on the NVIDIA platform.

12 years ago · e45f92a9e1
parent 009919872f
commit e45f92a9e1
3 changed files with 47 additions and 16 deletions
--- a/modules/bioinspired/src/opencl/retina_kernel.cl
+++ b/modules/bioinspired/src/opencl/retina_kernel.cl
@ -114,19 +114,34 @@ kernel void horizontalAnticausalFilter(
    global float * optr = output +
                          mad24(gid + 1, elements_per_row, - 1 + out_offset / 4);

-    float4 result = (float4)(0), out_v4;
+    float4 result_v4 = (float4)(0), out_v4;
+    float result = 0;
    // we assume elements_per_row is multple of 4
-    for(int i = 0; i < elements_per_row / 4; ++i, optr -= 4)
+    for(int i = 0; i < 4; ++ i, -- optr)
+    {
+        if(i < elements_per_row - cols)
+        {
+            *optr = result;
+        }
+        else
+        {
+            result = *optr + _a * result;
+            *optr = result;
+        }
+    }
+    result_v4.x = result;
+    optr -= 3;
+    for(int i = 1; i < elements_per_row / 4; ++i, optr -= 4)
    {
        // shift left, `offset` is type `size_t` so it cannot be negative
-        out_v4   = vload4(0, optr - 3);
+        out_v4 = vload4(0, optr);

-        result.w = out_v4.w + _a * result.x;
-        result.z = out_v4.z + _a * result.w;
-        result.y = out_v4.y + _a * result.z;
-        result.x = out_v4.x + _a * result.y;
+        result_v4.w = out_v4.w + _a * result_v4.x;
+        result_v4.z = out_v4.z + _a * result_v4.w;
+        result_v4.y = out_v4.y + _a * result_v4.z;
+        result_v4.x = out_v4.x + _a * result_v4.y;

-        vstore4(result, 0, optr - 3);
+        vstore4(result_v4, 0, optr);
    }
 }

@ -207,18 +222,34 @@ kernel void horizontalAnticausalFilter_Irregular(
        buffer + mad24(rows - gid, elements_per_row, -1 + buffer_offset / 4);

    float4 buf_v4, out_v4, res_v4 = (float4)(0);
-
-    for(int i = 0; i < elements_per_row / 4; ++i, optr -= 4, bptr -= 4)
-    {
-        buf_v4 = vload4(0, bptr - 3);
-        out_v4 = vload4(0, optr - 3);
+    float result = 0;
+    // we assume elements_per_row is multple of 4
+    for(int i = 0; i < 4; ++ i, -- optr, -- bptr)
+    {
+        if(i < elements_per_row - cols)
+        {
+            *optr = result;
+        }
+        else
+        {
+            result = *optr + *bptr * result;
+            *optr = result;
+        }
+    }
+    res_v4.x = result;
+    optr -= 3;
+    bptr -= 3;
+    for(int i = 0; i < elements_per_row / 4 - 1; ++i, optr -= 4, bptr -= 4)
+    {
+        buf_v4 = vload4(0, bptr);
+        out_v4 = vload4(0, optr);

        res_v4.w = out_v4.w + buf_v4.w * res_v4.x;
        res_v4.z = out_v4.z + buf_v4.z * res_v4.w;
        res_v4.y = out_v4.y + buf_v4.y * res_v4.z;
        res_v4.x = out_v4.x + buf_v4.x * res_v4.y;

-        vstore4(res_v4, 0, optr - 3);
+        vstore4(res_v4, 0, optr);
    }
 }

--- a/modules/bioinspired/src/retina_ocl.cpp
+++ b/modules/bioinspired/src/retina_ocl.cpp
@ -1149,7 +1149,7 @@ void RetinaColor::_initColorSampling()
    // computing photoreceptors local density
    MAKE_OCLMAT_SLICES(_RGBmosaic, 3);
    MAKE_OCLMAT_SLICES(_colorLocalDensity, 3);
-
+    _colorLocalDensity.setTo(0);
    _spatiotemporalLPfilter(_RGBmosaic_slices[0], _colorLocalDensity_slices[0]);
    _spatiotemporalLPfilter(_RGBmosaic_slices[1], _colorLocalDensity_slices[1]);
    _spatiotemporalLPfilter(_RGBmosaic_slices[2], _colorLocalDensity_slices[2]);
--- a/modules/bioinspired/test/test_retina_ocl.cpp
+++ b/modules/bioinspired/test/test_retina_ocl.cpp
@ -49,7 +49,7 @@
 #include "opencv2/imgproc.hpp"
 #include "opencv2/highgui.hpp"

-#if defined(HAVE_OPENCV_OCL) && defined(HAVE_OPENCL)
+#if defined(HAVE_OPENCV_OCL)

 #include "opencv2/ocl.hpp"
 #define RETINA_ITERATIONS 5