From fd0941356627794efc43ceff5ce6116b871d369a Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Wed, 4 Mar 2020 22:28:04 +0300
Subject: [PATCH] Merge pull request #16731 from alalek:issue_16708

* imgproc(integral): avoid OOB access

* imgproc(test): fix integral perf check

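- FP32 computation is not accurate enough (23-bit mantissa), so the input is scaled and the regression sanity check is skipped for CV_32F output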

* imgproc(integral): tune loop limits
---
 modules/imgproc/perf/perf_integral.cpp | 15 ++++++++++++++-
 modules/imgproc/src/sumpixels.simd.hpp | 18 +++++++++++++++---
 2 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/modules/imgproc/perf/perf_integral.cpp b/modules/imgproc/perf/perf_integral.cpp
index 59a5060937..2b1ab381e7 100644
--- a/modules/imgproc/perf/perf_integral.cpp
+++ b/modules/imgproc/perf/perf_integral.cpp
@@ -39,10 +39,23 @@ PERF_TEST_P(Size_MatType_OutMatDepth, integral,
     Mat sum(sz, sdepth);
 
     declare.in(src, WARMUP_RNG).out(sum);
+    if (sdepth == CV_32F)
+        src *= (1 << 23) / (double)(sz.area() * 256);  // FP32 calculations are not accurate (mantissa is 23-bit)
 
     TEST_CYCLE() integral(src, sum, sdepth);
 
-    SANITY_CHECK(sum, 1e-6);
+    Mat src_roi; src(Rect(src.cols - 4, src.rows - 4, 4, 4)).convertTo(src_roi, sdepth);
+    Mat restored_src_roi =
+           sum(Rect(sum.cols - 4, sum.rows - 4, 4, 4)) + sum(Rect(sum.cols - 5, sum.rows - 5, 4, 4)) -
+           sum(Rect(sum.cols - 4, sum.rows - 5, 4, 4)) - sum(Rect(sum.cols - 5, sum.rows - 4, 4, 4));
+    EXPECT_EQ(0, cvtest::norm(restored_src_roi, src_roi, NORM_INF))
+        << src_roi << endl << restored_src_roi << endl
+        << sum(Rect(sum.cols - 4, sum.rows - 4, 4, 4));
+
+    if (sdepth == CV_32F)
+        SANITY_CHECK_NOTHING();
+    else
+        SANITY_CHECK(sum, 1e-6);
 }
 
 PERF_TEST_P(Size_MatType_OutMatDepth, integral_sqsum,
diff --git a/modules/imgproc/src/sumpixels.simd.hpp b/modules/imgproc/src/sumpixels.simd.hpp
index 2ac02a0c3c..f5f3a92d85 100644
--- a/modules/imgproc/src/sumpixels.simd.hpp
+++ b/modules/imgproc/src/sumpixels.simd.hpp
@@ -237,7 +237,11 @@ struct Integral_SIMD<uchar, int, double>
                 v_int32 prev_1 = vx_setzero_s32(), prev_2 = vx_setzero_s32(),
                         prev_3 = vx_setzero_s32();
                 int j = 0;
-                for ( ; j + v_uint16::nlanes * cn <= width; j += v_uint16::nlanes * cn)
+                const int j_max =
+                        ((_srcstep * i + (width - v_uint16::nlanes * cn + v_uint8::nlanes * cn)) >= _srcstep * height)
+                        ? width - v_uint8::nlanes * cn    // uint8 in v_load_deinterleave()
+                        : width - v_uint16::nlanes * cn;  // v_expand_low
+                for ( ; j <= j_max; j += v_uint16::nlanes * cn)
                 {
                     v_uint8 v_src_row_1, v_src_row_2, v_src_row_3;
                     v_load_deinterleave(src_row + j, v_src_row_1, v_src_row_2, v_src_row_3);
@@ -546,7 +550,11 @@ struct Integral_SIMD<uchar, float, double>
                 v_float32 prev_1 = vx_setzero_f32(), prev_2 = vx_setzero_f32(),
                           prev_3 = vx_setzero_f32();
                 int j = 0;
-                for (; j + v_uint16::nlanes * cn <= width; j += v_uint16::nlanes * cn)
+                const int j_max =
+                        ((_srcstep * i + (width - v_uint16::nlanes * cn + v_uint8::nlanes * cn)) >= _srcstep * height)
+                        ? width - v_uint8::nlanes * cn    // uint8 in v_load_deinterleave()
+                        : width - v_uint16::nlanes * cn;  // v_expand_low
+                for ( ; j <= j_max; j += v_uint16::nlanes * cn)
                 {
                     v_uint8 v_src_row_1, v_src_row_2, v_src_row_3;
                     v_load_deinterleave(src_row + j, v_src_row_1, v_src_row_2, v_src_row_3);
@@ -896,7 +904,11 @@ struct Integral_SIMD<uchar, double, double>
                 v_float64 prev_1 = vx_setzero_f64(), prev_2 = vx_setzero_f64(),
                           prev_3 = vx_setzero_f64();
                 int j = 0;
-                for (; j + v_uint16::nlanes * cn <= width; j += v_uint16::nlanes * cn)
+                const int j_max =
+                        ((_srcstep * i + (width - v_uint16::nlanes * cn + v_uint8::nlanes * cn)) >= _srcstep * height)
+                        ? width - v_uint8::nlanes * cn    // uint8 in v_load_deinterleave()
+                        : width - v_uint16::nlanes * cn;  // v_expand_low
+                for ( ; j <= j_max; j += v_uint16::nlanes * cn)
                 {
                     v_uint8 v_src_row_1, v_src_row_2, v_src_row_3;
                     v_load_deinterleave(src_row + j, v_src_row_1, v_src_row_2, v_src_row_3);