diff --git a/modules/objdetect/src/hog.cpp b/modules/objdetect/src/hog.cpp index 378bab3087..e98b9c2e23 100644 --- a/modules/objdetect/src/hog.cpp +++ b/modules/objdetect/src/hog.cpp @@ -299,6 +299,11 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle, Mat Dy(1, width, CV_32F, dbuf + width); Mat Mag(1, width, CV_32F, dbuf + width*2); Mat Angle(1, width, CV_32F, dbuf + width*3); +#if CV_SIMD128 + int widthP2 = width+2; + AutoBuffer<float> _lutBuf(9*widthP2); + float* const lutBuf = _lutBuf.data(); +#endif if (cn == 3) { @@ -317,6 +322,63 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle, xmap += 1; } +#if CV_SIMD128 + typedef const uchar* const T; + float *lutPrev, *lutCurr, *lutNext; + { + y = 0; + const uchar* imgPtr = img.ptr(ymap[y]); + const uchar* prevPtr = img.data + img.step*ymap[y-1]; + + lutPrev = lutBuf+widthP2*0; + lutCurr = lutBuf+widthP2*3; + + { + int x0 = xmap[-1], x1 = xmap[0]; + T p02 = imgPtr + x0, p12 = imgPtr + x1; + + lutPrev[0+widthP2*0] = lut[prevPtr[x0+0]]; + lutPrev[0+widthP2*1] = lut[prevPtr[x0+1]]; + lutPrev[0+widthP2*2] = lut[prevPtr[x0+2]]; + lutCurr[0+widthP2*0] = lut[p02[0]]; lutCurr[1+widthP2*0] = lut[p12[0]]; + lutCurr[0+widthP2*1] = lut[p02[1]]; lutCurr[1+widthP2*1] = lut[p12[1]]; + lutCurr[0+widthP2*2] = lut[p02[2]]; lutCurr[1+widthP2*2] = lut[p12[2]]; + } + + for( x = 0; x <= width - 4; x += 4 ) + { + int x0 = xmap[x], x1 = xmap[x+1], x2 = xmap[x+2], x3 = xmap[x+3]; + T p02 = imgPtr + xmap[x+1]; + T p12 = imgPtr + xmap[x+2]; + T p22 = imgPtr + xmap[x+3]; + T p32 = imgPtr + xmap[x+4]; + + v_float32x4 _dx00 = v_float32x4(lut[p02[0]], lut[p12[0]], lut[p22[0]], lut[p32[0]]); + v_float32x4 _dx10 = v_float32x4(lut[p02[1]], lut[p12[1]], lut[p22[1]], lut[p32[1]]); + v_float32x4 _dx20 = v_float32x4(lut[p02[2]], lut[p12[2]], lut[p22[2]], lut[p32[2]]); + + v_store(lutCurr+x+widthP2*0+2, _dx00); + v_store(lutCurr+x+widthP2*1+2, _dx10); + v_store(lutCurr+x+widthP2*2+2, _dx20); + + v_float32x4 
_dy00 = v_float32x4(lut[prevPtr[x0+0]], lut[prevPtr[x1+0]], lut[prevPtr[x2+0]], lut[prevPtr[x3+0]]); + v_float32x4 _dy10 = v_float32x4(lut[prevPtr[x0+1]], lut[prevPtr[x1+1]], lut[prevPtr[x2+1]], lut[prevPtr[x3+1]]); + v_float32x4 _dy20 = v_float32x4(lut[prevPtr[x0+2]], lut[prevPtr[x1+2]], lut[prevPtr[x2+2]], lut[prevPtr[x3+2]]); + + v_store(lutPrev+x+widthP2*0+1, _dy00); + v_store(lutPrev+x+widthP2*1+1, _dy10); + v_store(lutPrev+x+widthP2*2+1, _dy20); + } + { + int x0 = xmap[x]; + + lutPrev[x+widthP2*0+1] = lut[prevPtr[x0+0]]; + lutPrev[x+widthP2*1+1] = lut[prevPtr[x0+1]]; + lutPrev[x+widthP2*2+1] = lut[prevPtr[x0+2]]; + } + } +#endif + float angleScale = signedGradient ? (float)(nbins/(2.0*CV_PI)) : (float)(nbins/CV_PI); for( y = 0; y < gradsize.height; y++ ) { @@ -342,28 +404,57 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle, { x = 0; #if CV_SIMD128 + int yMod = y%3; + + // Circular lut history buffer + if (yMod == 0) + { + lutPrev = lutBuf+widthP2*0; + lutCurr = lutBuf+widthP2*3; + lutNext = lutBuf+widthP2*6; + } + else if (yMod == 1) + { + lutPrev = lutBuf+widthP2*3; + lutCurr = lutBuf+widthP2*6; + lutNext = lutBuf+widthP2*0; + } + else + { + lutPrev = lutBuf+widthP2*6; + lutCurr = lutBuf+widthP2*0; + lutNext = lutBuf+widthP2*3; + } + + { + int x0 = xmap[-1]; + + lutNext[0+widthP2*0] = lut[nextPtr[x0+0]]; + lutNext[0+widthP2*1] = lut[nextPtr[x0+1]]; + lutNext[0+widthP2*2] = lut[nextPtr[x0+2]]; + } for( ; x <= width - 4; x += 4 ) { int x0 = xmap[x], x1 = xmap[x+1], x2 = xmap[x+2], x3 = xmap[x+3]; - typedef const uchar* const T; - T p02 = imgPtr + xmap[x+1], p00 = imgPtr + xmap[x-1]; - T p12 = imgPtr + xmap[x+2], p10 = imgPtr + xmap[x]; - T p22 = imgPtr + xmap[x+3], p20 = p02; - T p32 = imgPtr + xmap[x+4], p30 = p12; - - v_float32x4 _dx0 = v_float32x4(lut[p02[0]], lut[p12[0]], lut[p22[0]], lut[p32[0]]) - - v_float32x4(lut[p00[0]], lut[p10[0]], lut[p20[0]], lut[p30[0]]); - v_float32x4 _dx1 = v_float32x4(lut[p02[1]], lut[p12[1]], 
lut[p22[1]], lut[p32[1]]) - - v_float32x4(lut[p00[1]], lut[p10[1]], lut[p20[1]], lut[p30[1]]); - v_float32x4 _dx2 = v_float32x4(lut[p02[2]], lut[p12[2]], lut[p22[2]], lut[p32[2]]) - - v_float32x4(lut[p00[2]], lut[p10[2]], lut[p20[2]], lut[p30[2]]); - - v_float32x4 _dy0 = v_float32x4(lut[nextPtr[x0]], lut[nextPtr[x1]], lut[nextPtr[x2]], lut[nextPtr[x3]]) - - v_float32x4(lut[prevPtr[x0]], lut[prevPtr[x1]], lut[prevPtr[x2]], lut[prevPtr[x3]]); - v_float32x4 _dy1 = v_float32x4(lut[nextPtr[x0+1]], lut[nextPtr[x1+1]], lut[nextPtr[x2+1]], lut[nextPtr[x3+1]]) - - v_float32x4(lut[prevPtr[x0+1]], lut[prevPtr[x1+1]], lut[prevPtr[x2+1]], lut[prevPtr[x3+1]]); - v_float32x4 _dy2 = v_float32x4(lut[nextPtr[x0+2]], lut[nextPtr[x1+2]], lut[nextPtr[x2+2]], lut[nextPtr[x3+2]]) - - v_float32x4(lut[prevPtr[x0+2]], lut[prevPtr[x1+2]], lut[prevPtr[x2+2]], lut[prevPtr[x3+2]]); + + v_float32x4 _dx0 = v_load(lutCurr+x+widthP2*0+2) - v_load(lutCurr+x+widthP2*0); + v_float32x4 _dx1 = v_load(lutCurr+x+widthP2*1+2) - v_load(lutCurr+x+widthP2*1); + v_float32x4 _dx2 = v_load(lutCurr+x+widthP2*2+2) - v_load(lutCurr+x+widthP2*2); + + v_float32x4 _dy00 = v_float32x4(lut[nextPtr[x0+0]], lut[nextPtr[x1+0]], lut[nextPtr[x2+0]], lut[nextPtr[x3+0]]); + v_float32x4 _dy0 = _dy00 - v_load(lutPrev+x+widthP2*0+1); + + v_store(lutNext+x+widthP2*0+1, _dy00); + + v_float32x4 _dy10 = v_float32x4(lut[nextPtr[x0+1]], lut[nextPtr[x1+1]], lut[nextPtr[x2+1]], lut[nextPtr[x3+1]]); + v_float32x4 _dy1 = _dy10 - v_load(lutPrev+x+widthP2*1+1); + + v_store(lutNext+x+widthP2*1+1, _dy10); + + v_float32x4 _dy20 = v_float32x4(lut[nextPtr[x0+2]], lut[nextPtr[x1+2]], lut[nextPtr[x2+2]], lut[nextPtr[x3+2]]); + v_float32x4 _dy2 = _dy20 - v_load(lutPrev+x+widthP2*2+1); + + v_store(lutNext+x+widthP2*2+1, _dy20); v_float32x4 _mag0 = (_dx0 * _dx0) + (_dy0 * _dy0); v_float32x4 _mag1 = (_dx1 * _dx1) + (_dy1 * _dy1); @@ -380,6 +471,13 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle, v_store(dbuf + x, _dx2); 
v_store(dbuf + x + width, _dy2); } + { + int x0 = xmap[x]; + + lutNext[x+widthP2*0+1] = lut[nextPtr[x0+0]]; + lutNext[x+widthP2*1+1] = lut[nextPtr[x0+1]]; + lutNext[x+widthP2*2+1] = lut[nextPtr[x0+2]]; + } #endif for( ; x < width; x++ ) {