imgproc(resize): improve 8u3 HResize vector exit calc

Actually, we can do this in constant time. xofs always contains the same or increasing offset values, so we can find the most extreme offset used and never attempt to load from it. Because xofs is non-decreasing, for any dx >= 0 the condition xofs[dx] + cn < xofs[dwidth - cn] implies dx < (dwidth - cn). Thus, we can use this condition to control loop termination optimally. This fixes #16137 with little or no performance impact. I have also added a debug assertion as a sanity check.
5 years ago · c1cdb2416a
parent 40ac72a8f1
commit c1cdb2416a
1 changed files with 10 additions and 14 deletions
--- a/modules/imgproc/src/resize.cpp
+++ b/modules/imgproc/src/resize.cpp
@ -1526,7 +1526,7 @@ struct HResizeLinearVec_X4
 struct HResizeLinearVecU8_X4
 {
    int operator()(const uchar** src, int** dst, int count, const int* xofs,
-        const short* alpha/*[xmax]*/, int smax, int /*dmax*/, int cn, int /*xmin*/, int xmax) const
+        const short* alpha/*[xmax]*/, int /*smax*/, int dmax, int cn, int /*xmin*/, int xmax) const
    {
        int dx = 0, k = 0;
@ -1612,17 +1612,11 @@ struct HResizeLinearVecU8_X4
        }
        else if(cn == 3)
        {
-            int len0 = xmax - cn;
+            /* Peek at the last x offset to find the maximal s offset.  We know the loop
-
+               will terminate prior to value which may be 1 or more elements prior to the
-            /* This may need to trim 1 or more extra units depending on the amount of
+               final valid offset. xofs[] is constructed to be an array of increasingly
-               scaling. Test until we find the first value which we know cannot overrun. */
+               large offsets (i.e xofs[x] <= xofs[x+1] for x < xmax). */
-            while (len0 >= cn &&
+            int smax = xofs[dmax-cn];
                xofs[len0 - cn] + cn >= smax - cn  // check access: v_load_expand_q(S+xofs[dx]+cn)
            )
            {
                len0 -= cn;
            }
            CV_DbgAssert(len0 <= 0 || len0 >= cn);
            for( ; k <= (count - 2); k+=2 )
            {
@ -1631,7 +1625,7 @@ struct HResizeLinearVecU8_X4
                const uchar *S1 = src[k+1];
                int *D1 = dst[k+1];
-                for( dx = 0; dx < len0; dx += cn )
+                for( dx = 0; (xofs[dx] + cn) < smax; dx += cn )
                {
                    v_int16x8 a = v_load(alpha+dx*2);
                    v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(v_load_expand_q(S0+xofs[dx]) | (v_load_expand_q(S0+xofs[dx]+cn)<<16)), a));
@ -1642,12 +1636,14 @@ struct HResizeLinearVecU8_X4
            {
                const uchar *S = src[k];
                int *D = dst[k];
-                for( dx = 0; dx < len0; dx += cn )
+                for( dx = 0; (xofs[dx] + cn) < smax; dx += cn )
                {
                    v_int16x8 a = v_load(alpha+dx*2);
                    v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(v_load_expand_q(S+xofs[dx]) | (v_load_expand_q(S+xofs[dx]+cn)<<16)), a));
                }
            }
            /* Debug check to ensure truthiness that we never vector the final value. */
            CV_DbgAssert(dx < dmax);
        }
        else if(cn == 4)
        {