imgproc(resize): improve 8u3 HResize vector exit calc

We can actually do this in constant time. xofs always
contains non-decreasing offset values, so we can instead
find the most extreme value used and never attempt to load it.

Similarly, because xofs is non-decreasing, for all dx >= 0,
xofs[dx] + cn < xofs[dwidth-cn] implies dx < (dwidth - cn).

Thus, we can use this to control our loop termination optimally.

This fixes #16137 with little or no performance impact. I have
also added a debug check as a sanity check.
pull/16146/head
Paul E. Murphy 5 years ago
parent 40ac72a8f1
commit c1cdb2416a
  1. 24
      modules/imgproc/src/resize.cpp

@ -1526,7 +1526,7 @@ struct HResizeLinearVec_X4
struct HResizeLinearVecU8_X4 struct HResizeLinearVecU8_X4
{ {
int operator()(const uchar** src, int** dst, int count, const int* xofs, int operator()(const uchar** src, int** dst, int count, const int* xofs,
const short* alpha/*[xmax]*/, int smax, int /*dmax*/, int cn, int /*xmin*/, int xmax) const const short* alpha/*[xmax]*/, int /*smax*/, int dmax, int cn, int /*xmin*/, int xmax) const
{ {
int dx = 0, k = 0; int dx = 0, k = 0;
@ -1612,17 +1612,11 @@ struct HResizeLinearVecU8_X4
} }
else if(cn == 3) else if(cn == 3)
{ {
int len0 = xmax - cn; /* Peek at the last x offset to find the maximal s offset. We know the loop
will terminate prior to value which may be 1 or more elements prior to the
/* This may need to trim 1 or more extra units depending on the amount of final valid offset. xofs[] is constructed to be an array of increasingly
scaling. Test until we find the first value which we know cannot overrun. */ large offsets (i.e xofs[x] <= xofs[x+1] for x < xmax). */
while (len0 >= cn && int smax = xofs[dmax-cn];
xofs[len0 - cn] + cn >= smax - cn // check access: v_load_expand_q(S+xofs[dx]+cn)
)
{
len0 -= cn;
}
CV_DbgAssert(len0 <= 0 || len0 >= cn);
for( ; k <= (count - 2); k+=2 ) for( ; k <= (count - 2); k+=2 )
{ {
@ -1631,7 +1625,7 @@ struct HResizeLinearVecU8_X4
const uchar *S1 = src[k+1]; const uchar *S1 = src[k+1];
int *D1 = dst[k+1]; int *D1 = dst[k+1];
for( dx = 0; dx < len0; dx += cn ) for( dx = 0; (xofs[dx] + cn) < smax; dx += cn )
{ {
v_int16x8 a = v_load(alpha+dx*2); v_int16x8 a = v_load(alpha+dx*2);
v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(v_load_expand_q(S0+xofs[dx]) | (v_load_expand_q(S0+xofs[dx]+cn)<<16)), a)); v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(v_load_expand_q(S0+xofs[dx]) | (v_load_expand_q(S0+xofs[dx]+cn)<<16)), a));
@ -1642,12 +1636,14 @@ struct HResizeLinearVecU8_X4
{ {
const uchar *S = src[k]; const uchar *S = src[k];
int *D = dst[k]; int *D = dst[k];
for( dx = 0; dx < len0; dx += cn ) for( dx = 0; (xofs[dx] + cn) < smax; dx += cn )
{ {
v_int16x8 a = v_load(alpha+dx*2); v_int16x8 a = v_load(alpha+dx*2);
v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(v_load_expand_q(S+xofs[dx]) | (v_load_expand_q(S+xofs[dx]+cn)<<16)), a)); v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(v_load_expand_q(S+xofs[dx]) | (v_load_expand_q(S+xofs[dx]+cn)<<16)), a));
} }
} }
/* Debug check to ensure that we never vectorize the final value. */
CV_DbgAssert(dx < dmax);
} }
else if(cn == 4) else if(cn == 4)
{ {

Loading…
Cancel
Save