diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp index eb6fdc59f7..714237ae98 100644 --- a/modules/core/src/ocl.cpp +++ b/modules/core/src/ocl.cpp @@ -2993,7 +2993,11 @@ int Kernel::set(int i, const KernelArg& arg) if( !p || !p->handle ) return -1; if (i < 0) + { + CV_LOG_ERROR(NULL, cv::format("OpenCL: Kernel(%s)::set(arg_index=%d): negative arg_index", + p->name.c_str(), (int)i)); return i; + } if( i == 0 ) p->cleanupUMats(); cl_int status = 0; @@ -3002,10 +3006,19 @@ int Kernel::set(int i, const KernelArg& arg) AccessFlag accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : static_cast(0)) | ((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : static_cast(0)); bool ptronly = (arg.flags & KernelArg::PTR_ONLY) != 0; + if (ptronly && arg.m->empty()) + { + cl_mem h_null = (cl_mem)NULL; + status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(h_null), &h_null); + CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cl_mem=NULL)", p->name.c_str(), (int)i).c_str()); + return i + 1; + } cl_mem h = (cl_mem)arg.m->handle(accessFlags); if (!h) { + CV_LOG_ERROR(NULL, cv::format("OpenCL: Kernel(%s)::set(arg_index=%d, flags=%d): can't create cl_mem handle for passed UMat buffer (addr=%p)", + p->name.c_str(), (int)i, (int)arg.flags, arg.m)); p->release(); p = 0; return -1; diff --git a/modules/dnn/src/layers/mvn_layer.cpp b/modules/dnn/src/layers/mvn_layer.cpp index 772902ca01..ca7e7112c5 100644 --- a/modules/dnn/src/layers/mvn_layer.cpp +++ b/modules/dnn/src/layers/mvn_layer.cpp @@ -138,9 +138,12 @@ public: UMat& bnorm_weight = umat_scale; UMat& bnorm_bias = umat_shift; + const unsigned LOCAL_SIZE = 128; bool use_half = (inputs[0].depth() == CV_16S); - String opts = format(" -DT=%s -DT4=%s -Dconvert_T=%s", use_half ? "half" : "float", - use_half ? "half4" : "float4", use_half ? "convert_half4" : "convert_float4"); + String opts = format(" -DT=%s -DT4=%s -Dconvert_T=%s -DLOCAL_SIZE=%u", use_half ? "half" : "float", + use_half ? "half4" : "float4", use_half ? "convert_half4" : "convert_float4", + LOCAL_SIZE + ); int splitDim = (acrossChannels) ? 1 : 2; for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++) @@ -155,8 +158,8 @@ public: float alpha = 1.0f / s[1]; String buildopt = "-DNUM=4" + opts; - ocl::Kernel k("mean_fuse4", ocl::dnn::mvn_oclsrc, buildopt); - size_t localsize[] = { 128 }; + ocl::Kernel k("mean_fuse4", ocl::dnn::mvn_oclsrc, buildopt + " -DKERNEL_MEAN_FUSE"); + size_t localsize[] = { LOCAL_SIZE }; size_t globalsize[] = { (size_t)s[0] / 4 * localsize[0] }; int argId = 0; @@ -165,7 +168,6 @@ public: k.set(argId++, alpha); k.set(argId++, ocl::KernelArg::PtrWriteOnly(meanMat)); k.set(argId++, ocl::KernelArg::PtrWriteOnly(tmpMat)); - k.set(argId++, NULL, localsize[0] * sizeof(cl_float4)); bool ret = k.run(1, globalsize, localsize, false); if (!ret) return false; @@ -173,7 +175,7 @@ public: buildopt += format(" %s %s", (fuse_batch_norm) ? "-DFUSE_BATCH_NORM" : "", (fuse_relu) ? "-DFUSE_RELU" : ""); - ocl::Kernel k1("mvn_fuse4", ocl::dnn::mvn_oclsrc, buildopt); + ocl::Kernel k1("mvn_fuse4", ocl::dnn::mvn_oclsrc, buildopt + " -DKERNEL_MVN_FUSE"); argId = 0; k1.set(argId++, ocl::KernelArg::PtrReadOnly(tmpMat)); k1.set(argId++, ocl::KernelArg::PtrReadOnly(inpMat)); @@ -185,7 +187,6 @@ public: k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_weight)); k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_bias)); k1.set(argId++, ocl::KernelArg::PtrWriteOnly(outMat)); - k1.set(argId++, NULL, localsize[0] * sizeof(cl_float4)); ret = k1.run(1, globalsize, localsize, false); if (!ret) return false; @@ -243,7 +244,7 @@ public: if (normVariance) { String kname = format("calc_mean%d", number); - ocl::Kernel kernel(kname.c_str(), ocl::dnn::mvn_oclsrc, buildopt); + ocl::Kernel kernel(kname.c_str(), ocl::dnn::mvn_oclsrc, buildopt + " -DKERNEL_MEAN"); if (kernel.empty()) return false; @@ -263,7 +264,7 @@ public: } String kname = format("mvn%d", number); - buildopt += format("%s%s%s", (normVariance) ? " -DNORM_VARIANCE" : "", + buildopt += format("%s%s%s -DKERNEL_MVN", (normVariance) ? " -DNORM_VARIANCE" : "", (fuse_batch_norm) ? " -DFUSE_BATCH_NORM" : "", (fuse_relu) ? " -DFUSE_RELU" : ""); ocl::Kernel kernel1(kname.c_str(), ocl::dnn::mvn_oclsrc, buildopt); diff --git a/modules/dnn/src/opencl/mvn.cl b/modules/dnn/src/opencl/mvn.cl index ffc81a8704..1bec5a0dde 100644 --- a/modules/dnn/src/opencl/mvn.cl +++ b/modules/dnn/src/opencl/mvn.cl @@ -74,6 +74,8 @@ #define MVN_FUSE mvn_fuse1 #endif +#ifdef KERNEL_MEAN + __kernel void CALC_MEAN(__global const Dtype* src, const int rows, const int cols, @@ -94,6 +96,8 @@ __kernel void CALC_MEAN(__global const Dtype* src, store(dst_vec, dst, index); } +#elif defined KERNEL_MVN + __kernel void MVN(__global const Dtype* src, const int rows, const int cols, @@ -140,12 +144,13 @@ __kernel void MVN(__global const Dtype* src, store(dst_vec, dst, index); } +#elif defined KERNEL_MEAN_FUSE + __kernel void MEAN_FUSE(__global const T * A, unsigned int A_col_size, float alpha, __global T4 * mean, - __global Dtype * tmp, - __local Dtype4 * work) + __global Dtype * tmp) { unsigned int row_gid = get_group_id(0); unsigned int lid = get_local_id(0); @@ -168,15 +173,16 @@ __kernel void MEAN_FUSE(__global const T * A, dot2 += convert_float4(a2); dot3 += convert_float4(a3); - i += get_local_size(0); + i += LOCAL_SIZE; } + __local Dtype4 work[LOCAL_SIZE]; work[lid].s0 = dot(dot0, b0); work[lid].s1 = dot(dot1, b0); work[lid].s2 = dot(dot2, b0); work[lid].s3 = dot(dot3, b0); - for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1) + for(unsigned int stride=LOCAL_SIZE/2 ; stride>0 ; stride>>=1) { barrier(CLK_LOCAL_MEM_FENCE); if(lid < stride) @@ -212,10 +218,12 @@ __kernel void MEAN_FUSE(__global const T * A, vstore4(dot2, i, dst0_read + 2 * A_col_size); vstore4(dot3, i, dst0_read + 3 * A_col_size); - i += get_local_size(0); + i += LOCAL_SIZE; } } +#elif defined KERNEL_MVN_FUSE + __kernel void MVN_FUSE(__global const Dtype * tmp, __global const T * A, __global const T4 * mean, @@ -225,8 +233,7 @@ __kernel void MVN_FUSE(__global const Dtype * tmp, const float relu_slope, __global const Dtype4 * bnorm_weight, __global const Dtype4 * bnorm_bias, - __global T * B, - __local Dtype4 * work) + __global T * B) { unsigned int row_gid = get_group_id(0); unsigned int lid = get_local_id(0); @@ -250,15 +257,16 @@ __kernel void MVN_FUSE(__global const Dtype * tmp, dot2 += a2; dot3 += a3; - i += get_local_size(0); + i += LOCAL_SIZE; } + __local Dtype4 work[LOCAL_SIZE]; work[lid].s0 = dot(dot0, b0); work[lid].s1 = dot(dot1, b0); work[lid].s2 = dot(dot2, b0); work[lid].s3 = dot(dot3, b0); - for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1) + for(unsigned int stride=LOCAL_SIZE/2 ; stride>0 ; stride>>=1) { barrier(CLK_LOCAL_MEM_FENCE); if(lid < stride) @@ -314,6 +322,10 @@ __kernel void MVN_FUSE(__global const Dtype * tmp, vstore4(convert_T(dot2), i, dst0_read + 2 * A_col_size); vstore4(convert_T(dot3), i, dst0_read + 3 * A_col_size); - i += get_local_size(0); + i += LOCAL_SIZE; } } + +#else +#error "Configuration error!" +#endif diff --git a/modules/imgproc/src/color_yuv.cpp b/modules/imgproc/src/color_yuv.cpp index c596e40a53..7d731378e2 100644 --- a/modules/imgproc/src/color_yuv.cpp +++ b/modules/imgproc/src/color_yuv.cpp @@ -963,22 +963,22 @@ struct YCrCb2RGB_i ///////////////////////////////////// YUV420 -> RGB ///////////////////////////////////// -const int ITUR_BT_601_CY = 1220542; -const int ITUR_BT_601_CUB = 2116026; -const int ITUR_BT_601_CUG = -409993; -const int ITUR_BT_601_CVG = -852492; -const int ITUR_BT_601_CVR = 1673527; -const int ITUR_BT_601_SHIFT = 20; +static const int ITUR_BT_601_CY = 1220542; +static const int ITUR_BT_601_CUB = 2116026; +static const int ITUR_BT_601_CUG = -409993; +static const int ITUR_BT_601_CVG = -852492; +static const int ITUR_BT_601_CVR = 1673527; +static const int ITUR_BT_601_SHIFT = 20; // Coefficients for RGB to YUV420p conversion -const int ITUR_BT_601_CRY = 269484; -const int ITUR_BT_601_CGY = 528482; -const int ITUR_BT_601_CBY = 102760; -const int ITUR_BT_601_CRU = -155188; -const int ITUR_BT_601_CGU = -305135; -const int ITUR_BT_601_CBU = 460324; -const int ITUR_BT_601_CGV = -385875; -const int ITUR_BT_601_CBV = -74448; +static const int ITUR_BT_601_CRY = 269484; +static const int ITUR_BT_601_CGY = 528482; +static const int ITUR_BT_601_CBY = 102760; +static const int ITUR_BT_601_CRU = -155188; +static const int ITUR_BT_601_CGU = -305135; +static const int ITUR_BT_601_CBU = 460324; +static const int ITUR_BT_601_CGV = -385875; +static const int ITUR_BT_601_CBV = -74448; //R = 1.164(Y - 16) + 1.596(V - 128) //G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) @@ -988,49 +988,146 @@ const int ITUR_BT_601_CBV = -74448; //G = (1220542(Y - 16) - 852492(V - 128) - 409993(U - 128) + (1 << 19)) >> 20 //B = (1220542(Y - 16) + 2116026(U - 128) + (1 << 19)) >> 20 +static inline void uvToRGBuv(const uchar u, const uchar v, int& ruv, int& guv, int& buv) +{ + int uu, vv; + uu = int(u) - 128; + vv = int(v) - 128; + + ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * vv; + guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * vv + ITUR_BT_601_CUG * uu; + buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * uu; +} + +static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v, + v_int32 (&ruv)[4], + v_int32 (&guv)[4], + v_int32 (&buv)[4]) +{ + v_uint8 v128 = vx_setall_u8(128); + v_int8 su = v_reinterpret_as_s8(v_sub_wrap(u, v128)); + v_int8 sv = v_reinterpret_as_s8(v_sub_wrap(v, v128)); + + v_int16 uu0, uu1, vv0, vv1; + v_expand(su, uu0, uu1); + v_expand(sv, vv0, vv1); + v_int32 uu[4], vv[4]; + v_expand(uu0, uu[0], uu[1]); v_expand(uu1, uu[2], uu[3]); + v_expand(vv0, vv[0], vv[1]); v_expand(vv1, vv[2], vv[3]); + + v_int32 vshift = vx_setall_s32(1 << (ITUR_BT_601_SHIFT - 1)); + v_int32 vr = vx_setall_s32(ITUR_BT_601_CVR); + v_int32 vg = vx_setall_s32(ITUR_BT_601_CVG); + v_int32 ug = vx_setall_s32(ITUR_BT_601_CUG); + v_int32 ub = vx_setall_s32(ITUR_BT_601_CUB); + + for (int k = 0; k < 4; k++) + { + ruv[k] = vshift + vr * vv[k]; + guv[k] = vshift + vg * vv[k] + ug * uu[k]; + buv[k] = vshift + ub * uu[k]; + } +} + +static inline void yRGBuvToRGBA(const uchar vy, const int ruv, const int guv, const int buv, + uchar& r, uchar& g, uchar& b, uchar& a) +{ + int yy = int(vy); + int y = std::max(0, yy - 16) * ITUR_BT_601_CY; + r = saturate_cast((y + ruv) >> ITUR_BT_601_SHIFT); + g = saturate_cast((y + guv) >> ITUR_BT_601_SHIFT); + b = saturate_cast((y + buv) >> ITUR_BT_601_SHIFT); + a = uchar(0xff); +} + +static inline void yRGBuvToRGBA(const v_uint8& vy, + const v_int32 (&ruv)[4], + const v_int32 (&guv)[4], + const v_int32 (&buv)[4], + v_uint8& rr, v_uint8& gg, v_uint8& bb) +{ + v_uint8 v16 = vx_setall_u8(16); + v_uint8 posY = vy - v16; + v_uint16 yy0, yy1; + v_expand(posY, yy0, yy1); + v_int32 yy[4]; + v_int32 yy00, yy01, yy10, yy11; + v_expand(v_reinterpret_as_s16(yy0), yy[0], yy[1]); + v_expand(v_reinterpret_as_s16(yy1), yy[2], yy[3]); + + v_int32 vcy = vx_setall_s32(ITUR_BT_601_CY); + + v_int32 y[4], r[4], g[4], b[4]; + for(int k = 0; k < 4; k++) + { + y[k] = yy[k]*vcy; + r[k] = (y[k] + ruv[k]) >> ITUR_BT_601_SHIFT; + g[k] = (y[k] + guv[k]) >> ITUR_BT_601_SHIFT; + b[k] = (y[k] + buv[k]) >> ITUR_BT_601_SHIFT; + } + + v_int16 r0, r1, g0, g1, b0, b1; + r0 = v_pack(r[0], r[1]); + r1 = v_pack(r[2], r[3]); + g0 = v_pack(g[0], g[1]); + g1 = v_pack(g[2], g[3]); + b0 = v_pack(b[0], b[1]); + b1 = v_pack(b[2], b[3]); + + rr = v_pack_u(r0, r1); + gg = v_pack_u(g0, g1); + bb = v_pack_u(b0, b1); +} + template -static inline void cvtYuv42xxp2RGB8(int u, int v, int vy01, int vy11, int vy02, int vy12, +static inline void cvtYuv42xxp2RGB8(const uchar u, const uchar v, + const uchar vy01, const uchar vy11, const uchar vy02, const uchar vy12, uchar* row1, uchar* row2) { - u = u - 128; - v = v - 128; + int ruv, guv, buv; + uvToRGBuv(u, v, ruv, guv, buv); - int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v; - int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u; - int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u; + uchar r00, g00, b00, a00; + uchar r01, g01, b01, a01; - int y00 = std::max(0, vy01 - 16) * ITUR_BT_601_CY; - row1[2-bIdx] = saturate_cast((y00 + ruv) >> ITUR_BT_601_SHIFT); - row1[1] = saturate_cast((y00 + guv) >> ITUR_BT_601_SHIFT); - row1[bIdx] = saturate_cast((y00 + buv) >> ITUR_BT_601_SHIFT); + yRGBuvToRGBA(vy01, ruv, guv, buv, r00, g00, b00, a00); + yRGBuvToRGBA(vy11, ruv, guv, buv, r01, g01, b01, a01); + + row1[2-bIdx] = r00; + row1[1] = g00; + row1[bIdx] = b00; if(dcn == 4) - row1[3] = uchar(0xff); + row1[3] = a00; - int y01 = std::max(0, vy11 - 16) * ITUR_BT_601_CY; - row1[dcn+2-bIdx] = saturate_cast((y01 + ruv) >> ITUR_BT_601_SHIFT); - row1[dcn+1] = saturate_cast((y01 + guv) >> ITUR_BT_601_SHIFT); - row1[dcn+0+bIdx] = saturate_cast((y01 + buv) >> ITUR_BT_601_SHIFT); + row1[dcn+2-bIdx] = r01; + row1[dcn+1] = g01; + row1[dcn+0+bIdx] = b01; if(dcn == 4) - row1[7] = uchar(0xff); + row1[7] = a01; if(is420) { - int y10 = std::max(0, vy02 - 16) * ITUR_BT_601_CY; - row2[2-bIdx] = saturate_cast((y10 + ruv) >> ITUR_BT_601_SHIFT); - row2[1] = saturate_cast((y10 + guv) >> ITUR_BT_601_SHIFT); - row2[bIdx] = saturate_cast((y10 + buv) >> ITUR_BT_601_SHIFT); + uchar r10, g10, b10, a10; + uchar r11, g11, b11, a11; + + yRGBuvToRGBA(vy02, ruv, guv, buv, r10, g10, b10, a10); + yRGBuvToRGBA(vy12, ruv, guv, buv, r11, g11, b11, a11); + + row2[2-bIdx] = r10; + row2[1] = g10; + row2[bIdx] = b10; if(dcn == 4) - row2[3] = uchar(0xff); + row2[3] = a10; - int y11 = std::max(0, vy12 - 16) * ITUR_BT_601_CY; - row2[dcn+2-bIdx] = saturate_cast((y11 + ruv) >> ITUR_BT_601_SHIFT); - row2[dcn+1] = saturate_cast((y11 + guv) >> ITUR_BT_601_SHIFT); - row2[dcn+0+bIdx] = saturate_cast((y11 + buv) >> ITUR_BT_601_SHIFT); + row2[dcn+2-bIdx] = r11; + row2[dcn+1] = g11; + row2[dcn+0+bIdx] = b11; if(dcn == 4) - row2[7] = uchar(0xff); + row2[7] = a11; } } +// bIdx is 0 or 2, uIdx is 0 or 1, dcn is 3 or 4 template struct YUV420sp2RGB8Invoker : ParallelLoopBody { @@ -1056,15 +1153,80 @@ struct YUV420sp2RGB8Invoker : ParallelLoopBody uchar* row2 = dst_data + dst_step * (j + 1); const uchar* y2 = y1 + stride; - for (int i = 0; i < width; i += 2, row1 += dcn*2, row2 += dcn*2) + int i = 0; +#if CV_SIMD + const int vsize = v_uint8::nlanes; + v_uint8 a = vx_setall_u8(uchar(0xff)); + for( ; i <= width - 2*vsize; + i += 2*vsize, row1 += vsize*dcn*2, row2 += vsize*dcn*2) { - int u = int(uv[i + 0 + uIdx]); - int v = int(uv[i + 1 - uIdx]); + v_uint8 u, v; + v_load_deinterleave(uv + i, u, v); + + if(uIdx) + { + swap(u, v); + } + + v_uint8 vy[4]; + v_load_deinterleave(y1 + i, vy[0], vy[1]); + v_load_deinterleave(y2 + i, vy[2], vy[3]); + + v_int32 ruv[4], guv[4], buv[4]; + uvToRGBuv(u, v, ruv, guv, buv); - int vy01 = int(y1[i]); - int vy11 = int(y1[i + 1]); - int vy02 = int(y2[i]); - int vy12 = int(y2[i + 1]); + v_uint8 r[4], g[4], b[4]; + + for(int k = 0; k < 4; k++) + { + yRGBuvToRGBA(vy[k], ruv, guv, buv, r[k], g[k], b[k]); + } + + if(bIdx) + { + for(int k = 0; k < 4; k++) + swap(r[k], b[k]); + } + + // [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...] + v_uint8 r0_0, r0_1, r1_0, r1_1; + v_zip(r[0], r[1], r0_0, r0_1); + v_zip(r[2], r[3], r1_0, r1_1); + v_uint8 g0_0, g0_1, g1_0, g1_1; + v_zip(g[0], g[1], g0_0, g0_1); + v_zip(g[2], g[3], g1_0, g1_1); + v_uint8 b0_0, b0_1, b1_0, b1_1; + v_zip(b[0], b[1], b0_0, b0_1); + v_zip(b[2], b[3], b1_0, b1_1); + + if(dcn == 4) + { + v_store_interleave(row1 + 0*vsize, b0_0, g0_0, r0_0, a); + v_store_interleave(row1 + 4*vsize, b0_1, g0_1, r0_1, a); + + v_store_interleave(row2 + 0*vsize, b1_0, g1_0, r1_0, a); + v_store_interleave(row2 + 4*vsize, b1_1, g1_1, r1_1, a); + } + else //dcn == 3 + { + v_store_interleave(row1 + 0*vsize, b0_0, g0_0, r0_0); + v_store_interleave(row1 + 3*vsize, b0_1, g0_1, r0_1); + + v_store_interleave(row2 + 0*vsize, b1_0, g1_0, r1_0); + v_store_interleave(row2 + 3*vsize, b1_1, g1_1, r1_1); + } + } + vx_cleanup(); +#endif + for ( ; i < width; i += 2, row1 += dcn*2, row2 += dcn*2) + { + uchar u = uv[i + 0 + uIdx]; + uchar v = uv[i + 1 - uIdx]; + + uchar vy01 = y1[i]; + uchar vy11 = y1[i + 1]; + uchar vy02 = y2[i]; + uchar vy12 = y2[i + 1]; cvtYuv42xxp2RGB8(u, v, vy01, vy11, vy02, vy12, row1, row2); } @@ -1108,16 +1270,77 @@ struct YUV420p2RGB8Invoker : ParallelLoopBody uchar* row1 = dst_data + dst_step * j; uchar* row2 = dst_data + dst_step * (j + 1); const uchar* y2 = y1 + stride; + int i = 0; + +#if CV_SIMD + const int vsize = v_uint8::nlanes; + v_uint8 a = vx_setall_u8(uchar(0xff)); + for( ; i <= width/2 - vsize; + i += vsize, row1 += vsize*dcn*2, row2 += vsize*dcn*2) + { + v_uint8 u, v; + u = vx_load(u1 + i); + v = vx_load(v1 + i); + + v_uint8 vy[4]; + v_load_deinterleave(y1 + 2*i, vy[0], vy[1]); + v_load_deinterleave(y2 + 2*i, vy[2], vy[3]); + + v_int32 ruv[4], guv[4], buv[4]; + uvToRGBuv(u, v, ruv, guv, buv); + + v_uint8 r[4], g[4], b[4]; + + for(int k = 0; k < 4; k++) + { + yRGBuvToRGBA(vy[k], ruv, guv, buv, r[k], g[k], b[k]); + } + + if(bIdx) + { + for(int k = 0; k < 4; k++) + swap(r[k], b[k]); + } + + // [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...] + v_uint8 r0_0, r0_1, r1_0, r1_1; + v_zip(r[0], r[1], r0_0, r0_1); + v_zip(r[2], r[3], r1_0, r1_1); + v_uint8 g0_0, g0_1, g1_0, g1_1; + v_zip(g[0], g[1], g0_0, g0_1); + v_zip(g[2], g[3], g1_0, g1_1); + v_uint8 b0_0, b0_1, b1_0, b1_1; + v_zip(b[0], b[1], b0_0, b0_1); + v_zip(b[2], b[3], b1_0, b1_1); + + if(dcn == 4) + { + v_store_interleave(row1 + 0*vsize, b0_0, g0_0, r0_0, a); + v_store_interleave(row1 + 4*vsize, b0_1, g0_1, r0_1, a); + + v_store_interleave(row2 + 0*vsize, b1_0, g1_0, r1_0, a); + v_store_interleave(row2 + 4*vsize, b1_1, g1_1, r1_1, a); + } + else //dcn == 3 + { + v_store_interleave(row1 + 0*vsize, b0_0, g0_0, r0_0); + v_store_interleave(row1 + 3*vsize, b0_1, g0_1, r0_1); - for (int i = 0; i < width / 2; i += 1, row1 += dcn*2, row2 += dcn*2) + v_store_interleave(row2 + 0*vsize, b1_0, g1_0, r1_0); + v_store_interleave(row2 + 3*vsize, b1_1, g1_1, r1_1); + } + } + vx_cleanup(); +#endif + for (; i < width / 2; i += 1, row1 += dcn*2, row2 += dcn*2) { - int u = int(u1[i]); - int v = int(v1[i]); + uchar u = u1[i]; + uchar v = v1[i]; - int vy01 = int(y1[2 * i]); - int vy11 = int(y1[2 * i + 1]); - int vy02 = int(y2[2 * i]); - int vy12 = int(y2[2 * i + 1]); + uchar vy01 = y1[2 * i]; + uchar vy11 = y1[2 * i + 1]; + uchar vy02 = y2[2 * i]; + uchar vy12 = y2[2 * i + 1]; cvtYuv42xxp2RGB8(u, v, vy01, vy11, vy02, vy12, row1, row2); } @@ -1150,106 +1373,258 @@ inline void cvtYUV420p2RGB(uchar * dst_data, size_t dst_step, int dst_width, int ///////////////////////////////////// RGB -> YUV420p ///////////////////////////////////// +static inline uchar rgbToY42x(uchar r, uchar g, uchar b) +{ + const int shifted16 = (16 << ITUR_BT_601_SHIFT); + const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1)); + int yy = ITUR_BT_601_CRY * r + ITUR_BT_601_CGY * g + ITUR_BT_601_CBY * b + halfShift + shifted16; + + return saturate_cast(yy >> ITUR_BT_601_SHIFT); +} + +static inline v_uint8 rgbToY42x(const v_uint8& r, const v_uint8& g, const v_uint8& b) +{ + const int shifted16 = (16 << ITUR_BT_601_SHIFT); + const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1)); + v_uint16 r0, r1, g0, g1, b0, b1; + v_expand(r, r0, r1); + v_expand(g, g0, g1); + v_expand(b, b0, b1); + + v_uint32 rq[4], gq[4], bq[4]; + v_expand(r0, rq[0], rq[1]); v_expand(r1, rq[2], rq[3]); + v_expand(g0, gq[0], gq[1]); v_expand(g1, gq[2], gq[3]); + v_expand(b0, bq[0], bq[1]); v_expand(b1, bq[2], bq[3]); + + v_uint32 ry = vx_setall_u32(ITUR_BT_601_CRY), gy = vx_setall_u32(ITUR_BT_601_CGY); + v_uint32 by = vx_setall_u32(ITUR_BT_601_CBY), shift = vx_setall_u32(halfShift + shifted16); + + v_uint32 y[4]; + for(int k = 0; k < 4; k++) + { + y[k] = (rq[k]*ry + gq[k]*gy + bq[k]*by + shift) >> ITUR_BT_601_SHIFT; + } + + v_uint16 y0, y1; + y0 = v_pack(y[0], y[1]); + y1 = v_pack(y[2], y[3]); + + return v_pack(y0, y1); +} + +static inline void rgbToUV42x(uchar r, uchar g, uchar b, uchar& u, uchar& v) +{ + const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1)); + const int shifted128 = (128 << ITUR_BT_601_SHIFT); + int uu = ITUR_BT_601_CRU * r + ITUR_BT_601_CGU * g + ITUR_BT_601_CBU * b + halfShift + shifted128; + int vv = ITUR_BT_601_CBU * r + ITUR_BT_601_CGV * g + ITUR_BT_601_CBV * b + halfShift + shifted128; + + u = saturate_cast(uu >> ITUR_BT_601_SHIFT); + v = saturate_cast(vv >> ITUR_BT_601_SHIFT); +} + +static inline void rgbToUV42x(const v_uint8& r0, const v_uint8& r1, const v_uint8& g0, const v_uint8& g1, + const v_uint8& b0, const v_uint8& b1, v_uint8& u, v_uint8& v) +{ + // [r0, r1, r2, r3,..] => [r0, 0, r2, 0,..] + v_int16 vlowByte = vx_setall_s16(0x00ff); + v_int16 rd0, rd1, gd0, gd1, bd0, bd1; + rd0 = v_reinterpret_as_s16(r0) & vlowByte; + rd1 = v_reinterpret_as_s16(r1) & vlowByte; + gd0 = v_reinterpret_as_s16(g0) & vlowByte; + gd1 = v_reinterpret_as_s16(g1) & vlowByte; + bd0 = v_reinterpret_as_s16(b0) & vlowByte; + bd1 = v_reinterpret_as_s16(b1) & vlowByte; + + v_int32 rq[4], gq[4], bq[4]; + v_expand(rd0, rq[0], rq[1]); + v_expand(rd1, rq[2], rq[3]); + v_expand(gd0, gq[0], gq[1]); + v_expand(gd1, gq[2], gq[3]); + v_expand(bd0, bq[0], bq[1]); + v_expand(bd1, bq[2], bq[3]); + + const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1)); + const int shifted128 = (128 << ITUR_BT_601_SHIFT); + v_int32 shift = vx_setall_s32(halfShift + shifted128); + v_int32 ru, gu, bu, gv, bv; + ru = vx_setall_s32(ITUR_BT_601_CRU); + gu = vx_setall_s32(ITUR_BT_601_CGU); + gv = vx_setall_s32(ITUR_BT_601_CGV); + bu = vx_setall_s32(ITUR_BT_601_CBU); + bv = vx_setall_s32(ITUR_BT_601_CBV); + + v_int32 uq[4], vq[4]; + for(int k = 0; k < 4; k++) + { + uq[k] = (ru*rq[k] + gu*gq[k] + bu*bq[k] + shift) >> ITUR_BT_601_SHIFT; + vq[k] = (bu*rq[k] + gv*gq[k] + bv*bq[k] + shift) >> ITUR_BT_601_SHIFT; + } + + v_int16 u0, u1, v0, v1; + u0 = v_pack(uq[0], uq[1]); + u1 = v_pack(uq[2], uq[3]); + v0 = v_pack(vq[0], vq[1]); + v1 = v_pack(vq[2], vq[3]); + + u = v_pack_u(u0, u1); + v = v_pack_u(v0, v1); +} + + struct RGB8toYUV420pInvoker: public ParallelLoopBody { - RGB8toYUV420pInvoker(const uchar * _src_data, size_t _src_step, - uchar * _y_data, uchar * _uv_data, size_t _dst_step, - int _src_width, int _src_height, int _scn, bool swapBlue_, bool swapUV_, bool interleaved_) - : src_data(_src_data), src_step(_src_step), - y_data(_y_data), uv_data(_uv_data), dst_step(_dst_step), - src_width(_src_width), src_height(_src_height), - scn(_scn), swapBlue(swapBlue_), swapUV(swapUV_), interleaved(interleaved_) { } + RGB8toYUV420pInvoker(const uchar * _srcData, size_t _srcStep, + uchar * _yData, uchar * _uvData, size_t _dstStep, + int _srcWidth, int _srcHeight, int _scn, bool _swapBlue, bool _swapUV, bool _interleave) + : srcData(_srcData), srcStep(_srcStep), + yData(_yData), uvData(_uvData), dstStep(_dstStep), + srcWidth(_srcWidth), srcHeight(_srcHeight), + srcCn(_scn), swapBlue(_swapBlue), swapUV(_swapUV), interleave(_interleave) { } void operator()(const Range& rowRange) const CV_OVERRIDE { - const int w = src_width; - const int h = src_height; - const int cn = scn; - for( int i = rowRange.start; i < rowRange.end; i++ ) + const int w = srcWidth; + const int h = srcHeight; + const int scn = srcCn; + const uchar* srcRow = (uchar*)0; + uchar* yRow = (uchar*)0, *uRow = (uchar*)0, *vRow = (uchar*)0, *uvRow = (uchar*)0; + for( int sRow = rowRange.start*2; sRow < rowRange.end*2; sRow++) { - const uchar* brow0 = src_data + src_step * (2 * i); - const uchar* grow0 = brow0 + 1; - const uchar* rrow0 = brow0 + 2; - const uchar* brow1 = src_data + src_step * (2 * i + 1); - const uchar* grow1 = brow1 + 1; - const uchar* rrow1 = brow1 + 2; - if (swapBlue) + srcRow = srcData + srcStep*sRow; + yRow = yData + dstStep * sRow; + bool evenRow = (sRow % 2) == 0; + if(evenRow) { - std::swap(brow0, rrow0); - std::swap(brow1, rrow1); + if (interleave) + { + uvRow = uvData + dstStep*(sRow/2); + } + else + { + uRow = uvData + dstStep * (sRow/4) + ((sRow/2) % 2) * (w/2); + vRow = uvData + dstStep * ((sRow + h)/4) + (((sRow + h)/2) % 2) * (w/2); + } } + int i = 0; +#if CV_SIMD + const int vsize = v_uint8::nlanes; - uchar* y = y_data + dst_step * (2*i); - uchar* u; - uchar* v; - if (interleaved) - { - u = uv_data + dst_step * i; - v = uv_data + dst_step * i + 1; - } - else + for( ; i <= w/2 - vsize; + i += vsize) { - u = uv_data + dst_step * (i/2) + (i % 2) * (w/2); - v = uv_data + dst_step * ((i + h/2)/2) + ((i + h/2) % 2) * (w/2); - } + // processing (2*vsize) pixels at once + v_uint8 b0, b1, g0, g1, r0, r1, a0, a1; + if(scn == 4) + { + v_load_deinterleave(srcRow + 2*4*i + 0*vsize, b0, g0, r0, a0); + v_load_deinterleave(srcRow + 2*4*i + 4*vsize, b1, g1, r1, a1); + } + else // scn == 3 + { + v_load_deinterleave(srcRow + 2*3*i + 0*vsize, b0, g0, r0); + v_load_deinterleave(srcRow + 2*3*i + 3*vsize, b1, g1, r1); + } - if (swapUV) - { - std::swap(u, v); - } + if(swapBlue) + { + swap(b0, r0); swap(b1, r1); + } + + v_uint8 y0, y1; + + y0 = rgbToY42x(r0, g0, b0); + y1 = rgbToY42x(r1, g1, b1); - for( int j = 0, k = 0; j < w * cn; j += 2 * cn, k++ ) + v_store(yRow + 2*i + 0*vsize, y0); + v_store(yRow + 2*i + 1*vsize, y1); + + if(evenRow) + { + v_uint8 u, v; + rgbToUV42x(r0, r1, g0, g1, b0, b1, u, v); + + if(swapUV) + { + swap(u, v); + } + + if(interleave) + { + v_store_interleave(uvRow + 2*i, u, v); + } + else + { + v_store(uRow + i, u); + v_store(vRow + i, v); + } + } + } + vx_cleanup(); +#endif + // processing two pixels at once + for( ; i < w/2; i++) { - int r00 = rrow0[j]; int g00 = grow0[j]; int b00 = brow0[j]; - int r01 = rrow0[cn + j]; int g01 = grow0[cn + j]; int b01 = brow0[cn + j]; - int r10 = rrow1[j]; int g10 = grow1[j]; int b10 = brow1[j]; - int r11 = rrow1[cn + j]; int g11 = grow1[cn + j]; int b11 = brow1[cn + j]; - - const int shifted16 = (16 << ITUR_BT_601_SHIFT); - const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1)); - int y00 = ITUR_BT_601_CRY * r00 + ITUR_BT_601_CGY * g00 + ITUR_BT_601_CBY * b00 + halfShift + shifted16; - int y01 = ITUR_BT_601_CRY * r01 + ITUR_BT_601_CGY * g01 + ITUR_BT_601_CBY * b01 + halfShift + shifted16; - int y10 = ITUR_BT_601_CRY * r10 + ITUR_BT_601_CGY * g10 + ITUR_BT_601_CBY * b10 + halfShift + shifted16; - int y11 = ITUR_BT_601_CRY * r11 + ITUR_BT_601_CGY * g11 + ITUR_BT_601_CBY * b11 + halfShift + shifted16; - - y[2*k + 0] = saturate_cast(y00 >> ITUR_BT_601_SHIFT); - y[2*k + 1] = saturate_cast(y01 >> ITUR_BT_601_SHIFT); - y[2*k + dst_step + 0] = saturate_cast(y10 >> ITUR_BT_601_SHIFT); - y[2*k + dst_step + 1] = saturate_cast(y11 >> ITUR_BT_601_SHIFT); - - const int shifted128 = (128 << ITUR_BT_601_SHIFT); - int u00 = ITUR_BT_601_CRU * r00 + ITUR_BT_601_CGU * g00 + ITUR_BT_601_CBU * b00 + halfShift + shifted128; - int v00 = ITUR_BT_601_CBU * r00 + ITUR_BT_601_CGV * g00 + ITUR_BT_601_CBV * b00 + halfShift + shifted128; - - if (interleaved) + uchar b0, g0, r0; + uchar b1, g1, r1; + b0 = srcRow[(2*i+0)*scn + 0]; + g0 = srcRow[(2*i+0)*scn + 1]; + r0 = srcRow[(2*i+0)*scn + 2]; + b1 = srcRow[(2*i+1)*scn + 0]; + g1 = srcRow[(2*i+1)*scn + 1]; + r1 = srcRow[(2*i+1)*scn + 2]; + + if(swapBlue) { - u[k*2] = saturate_cast(u00 >> ITUR_BT_601_SHIFT); - v[k*2] = saturate_cast(v00 >> ITUR_BT_601_SHIFT); + swap(b0, r0); swap(b1, r1); } - else + + uchar y0 = rgbToY42x(r0, g0, b0); + uchar y1 = rgbToY42x(r1, g1, b1); + + yRow[2*i+0] = y0; + yRow[2*i+1] = y1; + + if(evenRow) { - u[k] = saturate_cast(u00 >> ITUR_BT_601_SHIFT); - v[k] = saturate_cast(v00 >> ITUR_BT_601_SHIFT); + uchar uu, vv; + rgbToUV42x(r0, g0, b0, uu, vv); + if(swapUV) + { + swap(uu, vv); + } + + if(interleave) + { + uvRow[2*i+0] = uu; + uvRow[2*i+1] = vv; + } + else + { + uRow[i] = uu; + vRow[i] = vv; + } } } } } - const uchar * src_data; - size_t src_step; - uchar *y_data, *uv_data; - size_t dst_step; - int src_width; - int src_height; - const int scn; + const uchar * srcData; + size_t srcStep; + uchar *yData, *uvData; + size_t dstStep; + int srcWidth; + int srcHeight; + const int srcCn; bool swapBlue; bool swapUV; - bool interleaved; + bool interleave; }; ///////////////////////////////////// YUV422 -> RGB ///////////////////////////////////// +// bIdx is 0 or 2; [uIdx, yIdx] is [0, 0], [0, 1], [1, 0]; dcn is 3 or 4 template struct YUV422toRGB8Invoker : ParallelLoopBody { @@ -1269,6 +1644,10 @@ struct YUV422toRGB8Invoker : ParallelLoopBody int rangeBegin = range.start; int rangeEnd = range.end; + // [yIdx, uIdx] | [uidx, vidx]: + // 0, 0 | 1, 3 + // 0, 1 | 3, 1 + // 1, 0 | 0, 2 const int uidx = 1 - yIdx + uIdx * 2; const int vidx = (2 + uidx) % 4; const uchar* yuv_src = src_data + rangeBegin * src_step; @@ -1276,14 +1655,69 @@ struct YUV422toRGB8Invoker : ParallelLoopBody for (int j = rangeBegin; j < rangeEnd; j++, yuv_src += src_step) { uchar* row = dst_data + dst_step * j; + int i = 0; +#if CV_SIMD + const int vsize = v_uint8::nlanes; + v_uint8 a = vx_setall_u8(uchar(0xff)); + for(; i <= 2*width - 4*vsize; + i += 4*vsize, row += vsize*dcn*2) + { + v_uint8 u, v, vy[2]; + if(yIdx == 1) // UYVY + { + v_load_deinterleave(yuv_src + i, u, vy[0], v, vy[1]); + } + else // YUYV or YVYU + { + v_load_deinterleave(yuv_src + i, vy[0], u, vy[1], v); + if(uIdx == 1) // YVYU + { + swap(u, v); + } + } + + v_int32 ruv[4], guv[4], buv[4]; + uvToRGBuv(u, v, ruv, guv, buv); - for (int i = 0; i < 2 * width; i += 4, row += dcn*2) + v_uint8 r[2], g[2], b[2]; + + yRGBuvToRGBA(vy[0], ruv, guv, buv, r[0], g[0], b[0]); + yRGBuvToRGBA(vy[1], ruv, guv, buv, r[1], g[1], b[1]); + + if(bIdx) + { + swap(r[0], b[0]); + swap(r[1], b[1]); + } + + // [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...] + v_uint8 r0_0, r0_1; + v_zip(r[0], r[1], r0_0, r0_1); + v_uint8 g0_0, g0_1; + v_zip(g[0], g[1], g0_0, g0_1); + v_uint8 b0_0, b0_1; + v_zip(b[0], b[1], b0_0, b0_1); + + if(dcn == 4) + { + v_store_interleave(row + 0*vsize, b0_0, g0_0, r0_0, a); + v_store_interleave(row + 4*vsize, b0_1, g0_1, r0_1, a); + } + else //dcn == 3 + { + v_store_interleave(row + 0*vsize, b0_0, g0_0, r0_0); + v_store_interleave(row + 3*vsize, b0_1, g0_1, r0_1); + } + } + vx_cleanup(); +#endif + for (; i < 2 * width; i += 4, row += dcn*2) { - int u = int(yuv_src[i + uidx]); - int v = int(yuv_src[i + vidx]); + uchar u = yuv_src[i + uidx]; + uchar v = yuv_src[i + vidx]; - int vy0 = int(yuv_src[i + yIdx]); - int vy1 = int(yuv_src[i + yIdx + 2]); + uchar vy0 = yuv_src[i + yIdx]; + uchar vy1 = yuv_src[i + yIdx + 2]; cvtYuv42xxp2RGB8(u, v, vy0, vy1, 0, 0, row, (uchar*)(0)); } diff --git a/modules/js/src/core_bindings.cpp b/modules/js/src/core_bindings.cpp index 72efd6350a..5022473613 100644 --- a/modules/js/src/core_bindings.cpp +++ b/modules/js/src/core_bindings.cpp @@ -289,13 +289,16 @@ namespace binding_utils float radius; }; +#ifdef HAVE_OPENCV_IMGPROC Circle minEnclosingCircle(const cv::Mat& points) { Circle circle; cv::minEnclosingCircle(points, circle.center, circle.radius); return circle; } +#endif +#ifdef HAVE_OPENCV_VIDEO emscripten::val CamShiftWrapper(const cv::Mat& arg1, Rect& arg2, TermCriteria arg3) { RotatedRect rotatedRect = cv::CamShift(arg1, arg2, arg3); @@ -313,6 +316,7 @@ namespace binding_utils result.call("push", arg2); return result; } +#endif // HAVE_OPENCV_VIDEO std::string getExceptionMsg(const cv::Exception& e) { return e.msg; @@ -551,19 +555,25 @@ EMSCRIPTEN_BINDINGS(binding_utils) function("exceptionFromPtr", &binding_utils::exceptionFromPtr, allow_raw_pointers()); +#ifdef HAVE_OPENCV_IMGPROC function("minEnclosingCircle", select_overload(&binding_utils::minEnclosingCircle)); +#endif function("minMaxLoc", select_overload(&binding_utils::minMaxLoc)); function("minMaxLoc", select_overload(&binding_utils::minMaxLoc_1)); +#ifdef HAVE_OPENCV_IMGPROC function("morphologyDefaultBorderValue", &cv::morphologyDefaultBorderValue); +#endif function("CV_MAT_DEPTH", &binding_utils::cvMatDepth); +#ifdef HAVE_OPENCV_VIDEO function("CamShift", select_overload(&binding_utils::CamShiftWrapper)); function("meanShift", select_overload(&binding_utils::meanShiftWrapper)); +#endif function("getBuildInformation", &binding_utils::getBuildInformation); diff --git a/modules/js/src/embindgen.py b/modules/js/src/embindgen.py index 8e406e0388..03dc0a65ae 100644 --- a/modules/js/src/embindgen.py +++ b/modules/js/src/embindgen.py @@ -140,7 +140,7 @@ features2d = {'Feature2D': ['detect', 'compute', 'detectAndCompute', 'descriptor 'AKAZE': ['create', 'setDescriptorType', 'getDescriptorType', 'setDescriptorSize', 'getDescriptorSize', 'setDescriptorChannels', 'getDescriptorChannels', 'setThreshold', 'getThreshold', 'setNOctaves', 'getNOctaves', 'setNOctaveLayers', 'getNOctaveLayers', 'setDiffusivity', 'getDiffusivity', 'getDefaultName'], 'DescriptorMatcher': ['add', 'clear', 'empty', 'isMaskSupported', 'train', 'match', 'knnMatch', 'radiusMatch', 'clone', 'create'], 'BFMatcher': ['isMaskSupported', 'create'], - '': ['FAST', 'AGAST', 'drawKeypoints', 'drawMatches']} + '': ['drawKeypoints', 'drawMatches']} photo = {'': ['createAlignMTB', 'createCalibrateDebevec', 'createCalibrateRobertson', \ 'createMergeDebevec', 'createMergeMertens', 'createMergeRobertson', \ diff --git a/modules/js/test/test_features2d.js b/modules/js/test/test_features2d.js new file mode 100644 index 0000000000..21982f65f8 --- /dev/null +++ b/modules/js/test/test_features2d.js @@ -0,0 +1,82 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +if (typeof module !== 'undefined' && module.exports) { + // The envrionment is Node.js + var cv = require('./opencv.js'); // eslint-disable-line no-var +} + +function generateTestFrame(width, height) { + let w = width || 200; + let h = height || 200; + let img = new cv.Mat(h, w, cv.CV_8UC1, new cv.Scalar(0, 0, 0, 0)); + let s = new cv.Scalar(255, 255, 255, 255); + let s128 = new cv.Scalar(128, 128, 128, 128); + let rect = new cv.Rect(w / 4, h / 4, w / 2, h / 2); + img.roi(rect).setTo(s); + img.roi(new cv.Rect(w / 2 - w / 8, h / 2 - h / 8, w / 4, h / 4)).setTo(s128); + cv.rectangle(img, new cv.Point(w / 8, h / 8), new cv.Point(w - w / 8, h - h / 8), s, 5); + cv.rectangle(img, new cv.Point(w / 5, h / 5), new cv.Point(w - w / 5, h - h / 5), s128, 3); + cv.line(img, new cv.Point(-w, 0), new cv.Point(w / 2, h / 2), s128, 5); + cv.line(img, new cv.Point(2*w, 0), new cv.Point(w / 2, h / 2), s, 5); + return img; +} + +QUnit.module('Features2D', {}); +QUnit.test('Detectors', function(assert) { + let image = generateTestFrame(); + + let kp = new cv.KeyPointVector(); + + let orb = new cv.ORB(); + orb.detect(image, kp); + assert.equal(kp.size(), 67, 'ORB'); + + let mser = new cv.MSER(); + mser.detect(image, kp); + assert.equal(kp.size(), 7, 'MSER'); + + let brisk = new cv.BRISK(); + brisk.detect(image, kp); + assert.equal(kp.size(), 191, 'BRISK'); + + let ffd = new cv.FastFeatureDetector(); + ffd.detect(image, kp); + assert.equal(kp.size(), 12, 'FastFeatureDetector'); + + let afd = new cv.AgastFeatureDetector(); + afd.detect(image, kp); + assert.equal(kp.size(), 67, 'AgastFeatureDetector'); + + let gftt = new cv.GFTTDetector(); + gftt.detect(image, kp); + assert.equal(kp.size(), 168, 'GFTTDetector'); + + let kaze = new cv.KAZE(); + kaze.detect(image, kp); + assert.equal(kp.size(), 159, 'KAZE'); + + let akaze = new cv.AKAZE(); + akaze.detect(image, kp); + assert.equal(kp.size(), 52, 'AKAZE'); +}); + +QUnit.test('BFMatcher', function(assert) { + // Generate key points. + let image = generateTestFrame(); + + let kp = new cv.KeyPointVector(); + let descriptors = new cv.Mat(); + let orb = new cv.ORB(); + orb.detectAndCompute(image, new cv.Mat(), kp, descriptors); + + assert.equal(kp.size(), 67); + + // Run a matcher. + let dm = new cv.DMatchVector(); + let matcher = new cv.BFMatcher(); + matcher.match(descriptors, descriptors, dm); + + assert.equal(dm.size(), 67); +}); diff --git a/modules/js/test/tests.html b/modules/js/test/tests.html index 8f65ce7f66..305220c1f2 100644 --- a/modules/js/test/tests.html +++ b/modules/js/test/tests.html @@ -29,6 +29,7 @@ + - - - diff --git a/modules/js/test/tests.js b/modules/js/test/tests.js index 2977032b4c..0acfa9eabb 100644 --- a/modules/js/test/tests.js +++ b/modules/js/test/tests.js @@ -45,7 +45,8 @@ testrunner.run( { code: 'opencv.js', tests: ['test_mat.js', 'test_utils.js', 'test_imgproc.js', - 'test_objdetect.js', 'test_video.js', 'test_photo.js'], + 'test_objdetect.js', 'test_video.js', 'test_features2d.js', + 'test_photo.js'], }, function(err, report) { console.log(report.failed + ' failed, ' + report.passed + ' passed'); diff --git a/modules/video/src/optflowgf.cpp b/modules/video/src/optflowgf.cpp index e06dbbf379..83ad47fc0e 100644 --- a/modules/video/src/optflowgf.cpp +++ b/modules/video/src/optflowgf.cpp @@ -42,6 +42,7 @@ #include "precomp.hpp" #include "opencl_kernels_video.hpp" +#include "opencv2/core/hal/intrin.hpp" #if defined __APPLE__ || defined __ANDROID__ #define SMALL_LOCALSIZE @@ -433,13 +434,11 @@ FarnebackUpdateFlow_GaussianBlur( const Mat& _R0, const Mat& _R1, for( i = 0; i <= m; i++ ) kernel[i] = (float)(kernel[i]*s); -#if CV_SSE2 +#if CV_SIMD128 float* simd_kernel = alignPtr(kernel + m+1, 16); - volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE); - if( useSIMD ) { for( i = 0; i <= m; i++ ) - _mm_store_ps(simd_kernel + i*4, _mm_set1_ps(kernel[i])); + v_store(simd_kernel + i*4, v_setall_f32(kernel[i])); } #endif @@ -457,54 +456,53 @@ FarnebackUpdateFlow_GaussianBlur( const Mat& _R0, const Mat& _R1, } x = 0; -#if CV_SSE2 - if( useSIMD ) +#if CV_SIMD128 { for( ; x <= width*5 - 16; x += 16 ) { const float *sptr0 = srow[m], *sptr1; - __m128 g4 = _mm_load_ps(simd_kernel); - __m128 s0, s1, s2, s3; - s0 = _mm_mul_ps(_mm_loadu_ps(sptr0 + x), g4); - s1 = _mm_mul_ps(_mm_loadu_ps(sptr0 + x + 4), g4); - s2 = _mm_mul_ps(_mm_loadu_ps(sptr0 + x + 8), g4); - s3 = _mm_mul_ps(_mm_loadu_ps(sptr0 + x + 12), g4); + v_float32x4 g4 = v_load(simd_kernel); + v_float32x4 s0, s1, s2, s3; + s0 = v_load(sptr0 + x) * g4; + s1 = v_load(sptr0 + x + 4) * g4; + s2 = v_load(sptr0 + x + 8) * g4; + s3 = v_load(sptr0 + x + 12) * g4; for( i = 1; i <= m; i++ ) { - __m128 x0, x1; + v_float32x4 x0, x1; sptr0 = srow[m+i], sptr1 = srow[m-i]; - g4 = _mm_load_ps(simd_kernel + i*4); - x0 = _mm_add_ps(_mm_loadu_ps(sptr0 + x), _mm_loadu_ps(sptr1 + x)); - x1 = _mm_add_ps(_mm_loadu_ps(sptr0 + x + 4), _mm_loadu_ps(sptr1 + x + 4)); - s0 = _mm_add_ps(s0, _mm_mul_ps(x0, g4)); - s1 = _mm_add_ps(s1, _mm_mul_ps(x1, g4)); - x0 = _mm_add_ps(_mm_loadu_ps(sptr0 + x + 8), _mm_loadu_ps(sptr1 + x + 8)); - x1 = _mm_add_ps(_mm_loadu_ps(sptr0 + x + 12), _mm_loadu_ps(sptr1 + x + 12)); - s2 = _mm_add_ps(s2, _mm_mul_ps(x0, g4)); - s3 = _mm_add_ps(s3, _mm_mul_ps(x1, g4)); + g4 = v_load(simd_kernel + i*4); + x0 = v_load(sptr0 + x) + v_load(sptr1 + x); + x1 = v_load(sptr0 + x + 4) + v_load(sptr1 + x + 4); + s0 = v_muladd(x0, g4, s0); + s1 = v_muladd(x1, g4, s1); + x0 = v_load(sptr0 + x + 8) + v_load(sptr1 + x + 8); + x1 = v_load(sptr0 + x + 12) + v_load(sptr1 + x + 12); + s2 = v_muladd(x0, g4, s2); + s3 = v_muladd(x1, g4, s3); } - _mm_store_ps(vsum + x, s0); - _mm_store_ps(vsum + x + 4, s1); - _mm_store_ps(vsum + x + 8, s2); - _mm_store_ps(vsum + x + 12, s3); + v_store(vsum + x, s0); + v_store(vsum + x + 4, s1); + v_store(vsum + x + 8, s2); + v_store(vsum + x + 12, s3); } for( ; x <= width*5 - 4; x += 4 ) { const float *sptr0 = srow[m], *sptr1; - __m128 g4 = _mm_load_ps(simd_kernel); - __m128 s0 = _mm_mul_ps(_mm_loadu_ps(sptr0 + x), g4); + v_float32x4 g4 = v_load(simd_kernel); + v_float32x4 s0 = v_load(sptr0 + x) * g4; for( i = 1; i <= m; i++ ) { sptr0 = srow[m+i], sptr1 = srow[m-i]; - g4 = _mm_load_ps(simd_kernel + i*4); - __m128 x0 = _mm_add_ps(_mm_loadu_ps(sptr0 + x), _mm_loadu_ps(sptr1 + x)); - s0 = _mm_add_ps(s0, _mm_mul_ps(x0, g4)); + g4 = v_load(simd_kernel + i*4); + v_float32x4 x0 = v_load(sptr0 + x) + v_load(sptr1 + x); + s0 = v_muladd(x0, g4, s0); } - _mm_store_ps(vsum + x, s0); + v_store(vsum + x, s0); } } #endif @@ -525,28 +523,25 @@ FarnebackUpdateFlow_GaussianBlur( const Mat& _R0, const Mat& _R1, // horizontal blur x = 0; -#if CV_SSE2 - if( useSIMD ) +#if CV_SIMD128 { for( ; x <= width*5 - 8; x += 8 ) { - __m128 g4 = _mm_load_ps(simd_kernel); - __m128 s0 = _mm_mul_ps(_mm_loadu_ps(vsum + x), g4); - __m128 s1 = _mm_mul_ps(_mm_loadu_ps(vsum + x + 4), g4); + v_float32x4 g4 = v_load(simd_kernel); + v_float32x4 s0 = v_load(vsum + x) * g4; + v_float32x4 s1 = v_load(vsum + x + 4) * g4; for( i = 1; i <= m; i++ ) { - g4 = _mm_load_ps(simd_kernel + i*4); - __m128 x0 = _mm_add_ps(_mm_loadu_ps(vsum + x - i*5), - _mm_loadu_ps(vsum + x + i*5)); - __m128 x1 = _mm_add_ps(_mm_loadu_ps(vsum + x - i*5 + 4), - _mm_loadu_ps(vsum + x + i*5 + 4)); - s0 = _mm_add_ps(s0, _mm_mul_ps(x0, g4)); - s1 = _mm_add_ps(s1, _mm_mul_ps(x1, g4)); + g4 = v_load(simd_kernel + i*4); + v_float32x4 x0 = v_load(vsum + x - i*5) + v_load(vsum + x+ i*5); + v_float32x4 x1 = v_load(vsum + x - i*5 + 4) + v_load(vsum + x+ i*5 + 4); + s0 = v_muladd(x0, g4, s0); + s1 = v_muladd(x1, g4, s1); } - _mm_store_ps(hsum + x, s0); - _mm_store_ps(hsum + x + 4, s1); + v_store(hsum + x, s0); + v_store(hsum + x + 4, s1); } } #endif diff --git a/platforms/js/build_js.py b/platforms/js/build_js.py index fd0cffa53d..fb98b76238 100644 --- a/platforms/js/build_js.py +++ b/platforms/js/build_js.py @@ -113,6 +113,7 @@ class Builder: "-DWITH_GPHOTO2=OFF", "-DWITH_LAPACK=OFF", "-DWITH_ITT=OFF", + "-DWITH_QUIRC=OFF", "-DBUILD_ZLIB=ON", "-DBUILD_opencv_apps=OFF", "-DBUILD_opencv_calib3d=ON", # No bindings provided. This module is used as a dependency for other modules. @@ -130,9 +131,11 @@ class Builder: "-DBUILD_opencv_superres=OFF", "-DBUILD_opencv_stitching=OFF", "-DBUILD_opencv_java=OFF", + "-DBUILD_opencv_java_bindings_generator=OFF", "-DBUILD_opencv_js=ON", "-DBUILD_opencv_python2=OFF", "-DBUILD_opencv_python3=OFF", + "-DBUILD_opencv_python_bindings_generator=OFF", "-DBUILD_EXAMPLES=OFF", "-DBUILD_PACKAGE=OFF", "-DBUILD_TESTS=OFF", diff --git a/samples/dnn/object_detection.cpp b/samples/dnn/object_detection.cpp index 94a6666393..c30e2179af 100644 --- a/samples/dnn/object_detection.cpp +++ b/samples/dnn/object_detection.cpp @@ -153,51 +153,39 @@ void postprocess(Mat& frame, const std::vector& outs, Net& net) std::vector classIds; std::vector confidences; std::vector boxes; - if (net.getLayer(0)->outputNameToIndex("im_info") != -1) // Faster-RCNN or R-FCN + if (outLayerType == "DetectionOutput") { // Network produces output blob with a shape 1x1xNx7 where N is a number of // detections and an every detection is a vector of values // [batchId, classId, confidence, left, top, right, bottom] - CV_Assert(outs.size() == 1); - float* data = (float*)outs[0].data; - for (size_t i = 0; i < outs[0].total(); i += 7) + CV_Assert(outs.size() > 0); + for (size_t k = 0; k < outs.size(); k++) { - float confidence = data[i + 2]; - if (confidence > confThreshold) + float* data = (float*)outs[k].data; + for (size_t i = 0; i < outs[k].total(); i += 7) { - int left = (int)data[i + 3]; - int top = (int)data[i + 4]; - int right = (int)data[i + 5]; - int bottom = (int)data[i + 6]; - int width = right - left + 1; - int height = bottom - top + 1; - classIds.push_back((int)(data[i + 1]) - 1); // Skip 0th background class id. - boxes.push_back(Rect(left, top, width, height)); - confidences.push_back(confidence); - } - } - } - else if (outLayerType == "DetectionOutput") - { - // Network produces output blob with a shape 1x1xNx7 where N is a number of - // detections and an every detection is a vector of values - // [batchId, classId, confidence, left, top, right, bottom] - CV_Assert(outs.size() == 1); - float* data = (float*)outs[0].data; - for (size_t i = 0; i < outs[0].total(); i += 7) - { - float confidence = data[i + 2]; - if (confidence > confThreshold) - { - int left = (int)(data[i + 3] * frame.cols); - int top = (int)(data[i + 4] * frame.rows); - int right = (int)(data[i + 5] * frame.cols); - int bottom = (int)(data[i + 6] * frame.rows); - int width = right - left + 1; - int height = bottom - top + 1; - classIds.push_back((int)(data[i + 1]) - 1); // Skip 0th background class id. - boxes.push_back(Rect(left, top, width, height)); - confidences.push_back(confidence); + float confidence = data[i + 2]; + if (confidence > confThreshold) + { + int left = (int)data[i + 3]; + int top = (int)data[i + 4]; + int right = (int)data[i + 5]; + int bottom = (int)data[i + 6]; + int width = right - left + 1; + int height = bottom - top + 1; + if (width * height <= 1) + { + left = (int)(data[i + 3] * frame.cols); + top = (int)(data[i + 4] * frame.rows); + right = (int)(data[i + 5] * frame.cols); + bottom = (int)(data[i + 6] * frame.rows); + width = right - left + 1; + height = bottom - top + 1; + } + classIds.push_back((int)(data[i + 1]) - 1); // Skip 0th background class id. + boxes.push_back(Rect(left, top, width, height)); + confidences.push_back(confidence); + } } } } diff --git a/samples/dnn/object_detection.py b/samples/dnn/object_detection.py index bf1c2e4236..3f7b0e23d7 100644 --- a/samples/dnn/object_detection.py +++ b/samples/dnn/object_detection.py @@ -102,7 +102,7 @@ def postprocess(frame, outs): classIds = [] confidences = [] boxes = [] - if net.getLayer(0).outputNameToIndex('im_info') != -1: # Faster-RCNN or R-FCN + if lastLayer.type == 'DetectionOutput': # Network produces output blob with a shape 1x1xNx7 where N is a number of # detections and an every detection is a vector of values # [batchId, classId, confidence, left, top, right, bottom] @@ -116,23 +116,13 @@ def postprocess(frame, outs): bottom = int(detection[6]) width = right - left + 1 height = bottom - top + 1 - classIds.append(int(detection[1]) - 1) # Skip background label - confidences.append(float(confidence)) - boxes.append([left, top, width, height]) - elif lastLayer.type == 'DetectionOutput': - # Network produces output blob with a shape 1x1xNx7 where N is a number of - # detections and an every detection is a vector of values - # [batchId, classId, confidence, left, top, right, bottom] - for out in outs: - for detection in out[0, 0]: - confidence = detection[2] - if confidence > confThreshold: - left = int(detection[3] * frameWidth) - top = int(detection[4] * frameHeight) - right = int(detection[5] * frameWidth) - bottom = int(detection[6] * frameHeight) - width = right - left + 1 - height = bottom - top + 1 + if width * height <= 1: + left = int(detection[3] * frameWidth) + top = int(detection[4] * frameHeight) + right = int(detection[5] * frameWidth) + bottom = int(detection[6] * frameHeight) + width = right - left + 1 + height = bottom - top + 1 classIds.append(int(detection[1]) - 1) # Skip background label confidences.append(float(confidence)) boxes.append([left, top, width, height])