Merge pull request #7087 from terfendail/master

Fix for incorrect calcOpticalFlowPyrLK result evaluation with ARM NEON
pull/7135/head
Vadim Pisarevsky 8 years ago committed by GitHub
commit 66e94467dc
  1. 46
      modules/video/src/lkpyramid.cpp

@ -294,7 +294,7 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
#if CV_NEON
int CV_DECL_ALIGNED(16) nA11[] = {0, 0, 0, 0}, nA12[] = {0, 0, 0, 0}, nA22[] = {0, 0, 0, 0};
float CV_DECL_ALIGNED(16) nA11[] = { 0, 0, 0, 0 }, nA12[] = { 0, 0, 0, 0 }, nA22[] = { 0, 0, 0, 0 };
const int shifter1 = -(W_BITS - 5); //negative so it shifts right
const int shifter2 = -(W_BITS);
@ -406,19 +406,19 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
q6 = vaddq_s32(q6, q8);
q7 = vmull_s16(d4d5.val[0], d28);
int32x4_t nq0 = vmull_s16(d4d5.val[1], d28);
int32x4_t q14 = vmull_s16(d4d5.val[1], d28);
q8 = vmull_s16(d6d7.val[0], d29);
int32x4_t q15 = vmull_s16(d6d7.val[1], d29);
q7 = vaddq_s32(q7, q8);
nq0 = vaddq_s32(nq0, q15);
q14 = vaddq_s32(q14, q15);
q4 = vaddq_s32(q4, q7);
q6 = vaddq_s32(q6, nq0);
q6 = vaddq_s32(q6, q14);
int32x4_t nq1 = vld1q_s32(nA12);
int32x4_t nq2 = vld1q_s32(nA22);
nq0 = vld1q_s32(nA11);
float32x4_t nq0 = vld1q_f32(nA11);
float32x4_t nq1 = vld1q_f32(nA12);
float32x4_t nq2 = vld1q_f32(nA22);
q4 = vqrshlq_s32(q4, q12);
q6 = vqrshlq_s32(q6, q12);
@ -427,13 +427,13 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
q8 = vmulq_s32(q4, q6);
q15 = vmulq_s32(q6, q6);
nq0 = vaddq_s32(nq0, q7);
nq1 = vaddq_s32(nq1, q8);
nq2 = vaddq_s32(nq2, q15);
// Convert the int32 lanes to float by value before accumulating; vreinterpretq_f32_s32
// would only re-label the integer bit pattern as float and corrupt the sums.
nq0 = vaddq_f32(nq0, vcvtq_f32_s32(q7));
nq1 = vaddq_f32(nq1, vcvtq_f32_s32(q8));
nq2 = vaddq_f32(nq2, vcvtq_f32_s32(q15));
vst1q_s32(nA11, nq0);
vst1q_s32(nA12, nq1);
vst1q_s32(nA22, nq2);
vst1q_f32(nA11, nq0);
vst1q_f32(nA12, nq1);
vst1q_f32(nA22, nq2);
int16x4_t d8 = vmovn_s32(q4);
int16x4_t d12 = vmovn_s32(q6);
@ -474,9 +474,9 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
#endif
#if CV_NEON
iA11 += (float)(nA11[0] + nA11[1] + nA11[2] + nA11[3]);
iA12 += (float)(nA12[0] + nA12[1] + nA12[2] + nA12[3]);
iA22 += (float)(nA22[0] + nA22[1] + nA22[2] + nA22[3]);
iA11 += nA11[0] + nA11[1] + nA11[2] + nA11[3];
iA12 += nA12[0] + nA12[1] + nA12[2] + nA12[3];
iA22 += nA22[0] + nA22[1] + nA22[2] + nA22[3];
#endif
A11 = iA11*FLT_SCALE;
@ -530,7 +530,7 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
#endif
#if CV_NEON
int CV_DECL_ALIGNED(16) nB1[] = {0,0,0,0}, nB2[] = {0,0,0,0};
float CV_DECL_ALIGNED(16) nB1[] = { 0,0,0,0 }, nB2[] = { 0,0,0,0 };
const int16x4_t d26_2 = vdup_n_s16((int16_t)iw00);
const int16x4_t d27_2 = vdup_n_s16((int16_t)iw01);
@ -621,8 +621,8 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
nq5 = vqrshlq_s32(nq5, q11);
int16x8x2_t q0q1 = vld2q_s16(dIptr);
nq11 = vld1q_s32(nB1);
int32x4_t nq15 = vld1q_s32(nB2);
float32x4_t nB1v = vld1q_f32(nB1);
float32x4_t nB2v = vld1q_f32(nB2);
nq4 = vsubq_s32(nq4, nq6);
nq5 = vsubq_s32(nq5, nq8);
@ -642,11 +642,11 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
nq9 = vaddq_s32(nq9, nq10);
nq4 = vaddq_s32(nq4, nq5);
nq11 = vaddq_s32(nq11, nq9);
nq15 = vaddq_s32(nq15, nq4);
// Convert the int32 lanes to float by value before accumulating; vreinterpretq_f32_s32
// would only re-label the integer bit pattern as float and corrupt the sums.
nB1v = vaddq_f32(nB1v, vcvtq_f32_s32(nq9));
nB2v = vaddq_f32(nB2v, vcvtq_f32_s32(nq4));
vst1q_s32(nB1, nq11);
vst1q_s32(nB2, nq15);
vst1q_f32(nB1, nB1v);
vst1q_f32(nB2, nB2v);
}
#endif

Loading…
Cancel
Save