Merge pull request #7087 from terfendail/master

Fix for incorrect calcOpticalFlowPyrLK result evaluation with ARM NEON
9 years ago · 66e94467dc
parent 2a4252e117 d8dc6caf09
commit 66e94467dc
1 changed file with 23 additions and 23 deletions
--- a/modules/video/src/lkpyramid.cpp
+++ b/modules/video/src/lkpyramid.cpp
@@ -294,7 +294,7 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const

 #if CV_NEON

-        int CV_DECL_ALIGNED(16) nA11[] = {0, 0, 0, 0}, nA12[] = {0, 0, 0, 0}, nA22[] = {0, 0, 0, 0};
+        float CV_DECL_ALIGNED(16) nA11[] = { 0, 0, 0, 0 }, nA12[] = { 0, 0, 0, 0 }, nA22[] = { 0, 0, 0, 0 };
        const int shifter1 = -(W_BITS - 5); //negative so it shifts right
        const int shifter2 = -(W_BITS);

@@ -406,19 +406,19 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
                q6 = vaddq_s32(q6, q8);

                q7 = vmull_s16(d4d5.val[0], d28);
-                int32x4_t nq0 = vmull_s16(d4d5.val[1], d28);
+                int32x4_t q14 = vmull_s16(d4d5.val[1], d28);
                q8 = vmull_s16(d6d7.val[0], d29);
                int32x4_t q15 = vmull_s16(d6d7.val[1], d29);

                q7 = vaddq_s32(q7, q8);
-                nq0 = vaddq_s32(nq0, q15);
+                q14 = vaddq_s32(q14, q15);

                q4 = vaddq_s32(q4, q7);
-                q6 = vaddq_s32(q6, nq0);
+                q6 = vaddq_s32(q6, q14);

-                int32x4_t nq1 = vld1q_s32(nA12);
-                int32x4_t nq2 = vld1q_s32(nA22);
-                nq0 = vld1q_s32(nA11);
+                float32x4_t nq0 = vld1q_f32(nA11);
+                float32x4_t nq1 = vld1q_f32(nA12);
+                float32x4_t nq2 = vld1q_f32(nA22);

                q4 = vqrshlq_s32(q4, q12);
                q6 = vqrshlq_s32(q6, q12);
@@ -427,13 +427,13 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
                q8 = vmulq_s32(q4, q6);
                q15 = vmulq_s32(q6, q6);

-                nq0 = vaddq_s32(nq0, q7);
-                nq1 = vaddq_s32(nq1, q8);
-                nq2 = vaddq_s32(nq2, q15);
+                nq0 = vaddq_f32(nq0, vcvtq_f32_s32(q7));  // convert int32 values to float (a reinterpret would reuse the raw bits)
+                nq1 = vaddq_f32(nq1, vcvtq_f32_s32(q8));
+                nq2 = vaddq_f32(nq2, vcvtq_f32_s32(q15));

-                vst1q_s32(nA11, nq0);
-                vst1q_s32(nA12, nq1);
-                vst1q_s32(nA22, nq2);
+                vst1q_f32(nA11, nq0);
+                vst1q_f32(nA12, nq1);
+                vst1q_f32(nA22, nq2);

                int16x4_t d8 = vmovn_s32(q4);
                int16x4_t d12 = vmovn_s32(q6);
@@ -474,9 +474,9 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
 #endif

 #if CV_NEON
-        iA11 += (float)(nA11[0] + nA11[1] + nA11[2] + nA11[3]);
-        iA12 += (float)(nA12[0] + nA12[1] + nA12[2] + nA12[3]);
-        iA22 += (float)(nA22[0] + nA22[1] + nA22[2] + nA22[3]);
+        iA11 += nA11[0] + nA11[1] + nA11[2] + nA11[3];
+        iA12 += nA12[0] + nA12[1] + nA12[2] + nA12[3];
+        iA22 += nA22[0] + nA22[1] + nA22[2] + nA22[3];
 #endif

        A11 = iA11*FLT_SCALE;
@@ -530,7 +530,7 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
 #endif

 #if CV_NEON
-            int CV_DECL_ALIGNED(16) nB1[] = {0,0,0,0}, nB2[] = {0,0,0,0};
+            float CV_DECL_ALIGNED(16) nB1[] = { 0,0,0,0 }, nB2[] = { 0,0,0,0 };

            const int16x4_t d26_2 = vdup_n_s16((int16_t)iw00);
            const int16x4_t d27_2 = vdup_n_s16((int16_t)iw01);
@@ -621,8 +621,8 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
                    nq5 = vqrshlq_s32(nq5, q11);

                    int16x8x2_t q0q1 = vld2q_s16(dIptr);
-                    nq11 = vld1q_s32(nB1);
-                    int32x4_t nq15 = vld1q_s32(nB2);
+                    float32x4_t nB1v = vld1q_f32(nB1);
+                    float32x4_t nB2v = vld1q_f32(nB2);

                    nq4 = vsubq_s32(nq4, nq6);
                    nq5 = vsubq_s32(nq5, nq8);
@@ -642,11 +642,11 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
                    nq9 = vaddq_s32(nq9, nq10);
                    nq4 = vaddq_s32(nq4, nq5);

-                    nq11 = vaddq_s32(nq11, nq9);
-                    nq15 = vaddq_s32(nq15, nq4);
+                    nB1v = vaddq_f32(nB1v, vcvtq_f32_s32(nq9));  // convert int32 values to float (a reinterpret would reuse the raw bits)
+                    nB2v = vaddq_f32(nB2v, vcvtq_f32_s32(nq4));

-                    vst1q_s32(nB1, nq11);
-                    vst1q_s32(nB2, nq15);
+                    vst1q_f32(nB1, nB1v);
+                    vst1q_f32(nB2, nB2v);
                }
 #endif