diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h
index 65f2f3d7db..30f2ce5649 100644
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@@ -900,7 +900,9 @@ inline hfloat hfloatFromBits(ushort w) {
 #endif
 }
 
+#if !defined(__OPENCV_BUILD) && !(defined __STDCPP_FLOAT16_T__) && !(defined __ARM_NEON)
 typedef hfloat float16_t;
+#endif
 
 }
 #endif
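
Reviewer note (illustration, not part of the patch): the new guard matters because arm_neon.h already defines a global float16_t on ARM targets, and C++23's <stdfloat> provides std::float16_t when __STDCPP_FLOAT16_T__ is defined, so an unconditional typedef in namespace cv can make unqualified uses ambiguous. A minimal sketch of the collision, assuming an AArch64 toolchain (cv_stub is a hypothetical stand-in for the OpenCV header):

    #include <arm_neon.h>   // provides ::float16_t (an alias of __fp16) on ARM

    namespace cv_stub { struct hfloat {}; typedef hfloat float16_t; }
    using namespace cv_stub;

    int main()
    {
        // Uncommenting the next line fails with
        // "reference to 'float16_t' is ambiguous":
        // float16_t x{};
        return 0;
    }
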
diff --git a/modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp b/modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp
index 22d7d5194a..1734dccc63 100644
--- a/modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp
+++ b/modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp
@@ -494,10 +494,9 @@ void convBlockMR1_F32(int np, const float * a, const float * b, float *c, const
 void convBlock_F16(int np, const char * _a, const char * _b, char * _c, int ldc, bool init_c,
                    int width, const int convMR_fp16, const int convNR_fp16)
 {
-    typedef __fp16 float16_t;
-    const float16_t* a = (const float16_t*)_a;
-    const float16_t* b = (const float16_t*)_b;
-    float16_t* c = (float16_t*)_c;
+    const __fp16* a = (const __fp16*)_a;
+    const __fp16* b = (const __fp16*)_b;
+    __fp16* c = (__fp16*)_c;
 
     CV_Assert(convMR_fp16 == 8 && convNR_fp16 == 24);
     float16x8_t c00 = vdupq_n_f16(0), c01 = c00, c02 = c00;
@@ -638,12 +637,11 @@ void convBlock_F16(int np, const char * _a, const char * _b, char * _c, int ldc,
 
 void convBlockMR1_F16(int np, const char* _a, const char* _b, float *c, const float _bias, bool init_c,
                       const float minval, const float maxval, bool ifMinMaxAct, const int width, const int convNR_FP16)
 {
-    typedef __fp16 float16_t;
     CV_Assert(convNR_FP16 == 24); // CONV_NR_FP16 = 24
-    const float16_t* a = (const float16_t*)_a;
-    const float16_t* b = (const float16_t*)_b;
+    const __fp16* a = (const __fp16*)_a;
+    const __fp16* b = (const __fp16*)_b;
 
-    const float16_t bias = (float16_t)_bias;
+    const __fp16 bias = (__fp16)_bias;
 
     float16x8_t c0 = vdupq_n_f16(bias), c1 = c0, c2 = c0;
diff --git a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp
index d19cec64de..46e220e69f 100644
--- a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp
+++ b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp
@@ -85,7 +85,7 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu
         // works at FP 16.
         CONV_WINO_ATOM = CONV_WINO_ATOM_F16;
         CONV_WINO_NATOMS = CONV_WINO_NATOMS_F16;
-        esz = sizeof(float16_t);
+        esz = sizeof(__fp16);
     }
 #endif
 
diff --git a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp
index d1f1610280..e44d0f8004 100644
--- a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp
+++ b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp
@@ -435,10 +435,9 @@ void winofunc_AtXA_8x8_F32(const float* inptr, int inpstep,
 void winofunc_accum_F16(const char* _inwptr, const char* _wptr, char* _outbuf, int Cg, int iblock,
                         const int winoIblock, const int winoKblock, const int winoAtomF16, const int winoNatomF16)
 {
-    typedef __fp16 float16_t;
-    const float16_t* inwptr = (const float16_t*)_inwptr;
-    const float16_t* wptr = (const float16_t*)_wptr;
-    float16_t* outbuf = (float16_t*)_outbuf;
+    const __fp16* inwptr = (const __fp16*)_inwptr;
+    const __fp16* wptr = (const __fp16*)_wptr;
+    __fp16* outbuf = (__fp16*)_outbuf;
 
     CV_Assert(winoIblock == 6 && winoKblock == 4 && winoAtomF16 == 8);
@@ -591,8 +590,7 @@ void winofunc_accum_F16(const char* _inwptr, const char* _wptr, char* _outbuf, i
 void winofunc_BtXB_8x8_F16(const float * inptr, int inpstep,
                            char * _outptr, int Cg, const int winoIblock, const int winoAtomF16)
 {
-    typedef __fp16 float16_t;
-    float16_t* outptr = (float16_t*)_outptr;
+    __fp16* outptr = (__fp16*)_outptr;
     float32x4_t x00 = vld1q_f32(inptr), x01 = vld1q_f32(inptr + 4);
     float32x4_t x10 = vld1q_f32(inptr + inpstep), x11 = vld1q_f32(inptr + inpstep + 4);
     float32x4_t x20 = vld1q_f32(inptr + inpstep*2), x21 = vld1q_f32(inptr + inpstep*2 + 4);
@@ -757,8 +755,7 @@ void winofunc_AtXA_8x8_F16(const char* _inptr, int inpstep,
                            float * bpptr, int bpstep, float* outptr, int outstep,
                            float bias, float minval, float maxval, bool ifMinMaxAct)
 {
-    typedef __fp16 float16_t;
-    const float16_t* inptr = (const float16_t*)_inptr;
+    const __fp16* inptr = (const __fp16*)_inptr;
 
     float32x4_t x00 = vcvt_f32_f16(vld1_f16(inptr)), x01 = vcvt_f32_f16(vld1_f16(inptr + 4));
     float32x4_t x10 = vcvt_f32_f16(vld1_f16(inptr + inpstep)), x11 = vcvt_f32_f16(vld1_f16(inptr + inpstep + 4));
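
Reviewer note (illustration, not part of the patch): the kernels above now use the compiler-provided __fp16 directly instead of a local float16_t alias, which sidesteps the arm_neon.h name clash entirely. For reference, a minimal standalone sketch of the load/convert pattern these kernels rely on, assuming an AArch64 toolchain:

    #include <arm_neon.h>
    #include <cstdio>

    int main()
    {
        float src[4] = {0.5f, 1.5f, 2.5f, 3.5f};
        __fp16 half[4];
        // 4 lanes f32 -> f16, the same shape as _cvt32f16f / winofunc_BtXB_8x8_F16
        vst1_f16(half, vcvt_f16_f32(vld1q_f32(src)));
        // scalar __fp16 promotes to float in arithmetic
        for (int i = 0; i < 4; i++)
            std::printf("%g\n", (double)half[i]);
        return 0;
    }
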
diff --git a/modules/dnn/src/layers/cpu_kernels/convolution.cpp b/modules/dnn/src/layers/cpu_kernels/convolution.cpp
index 7bbcf1e8e8..33fb62a47b 100644
--- a/modules/dnn/src/layers/cpu_kernels/convolution.cpp
+++ b/modules/dnn/src/layers/cpu_kernels/convolution.cpp
@@ -26,7 +26,7 @@ void convBlockMR1_F32(int np, const float* a, const float* b, float *c, const fl
 
 #ifdef CONV_ARM_FP16
 // Fast convert float 32 to float16
-static inline void _cvt32f16f(const float* src, float16_t* dst, int len)
+static inline void _cvt32f16f(const float* src, __fp16* dst, int len)
 {
     int j = 0;
     const int VECSZ = 4;
@@ -60,7 +60,7 @@ static inline void _cvt32f16f(const float* src, __fp16* dst, int len)
         vst1_f16(dst_FP16 + j, hv);
     }
     for( ; j < len; j++ )
-        dst[j] = float16_t(src[j]);
+        dst[j] = __fp16(src[j]);
 }
 #endif
 
@@ -74,12 +74,12 @@ float* FastConv::getWeightsWino()
     return alignPtr(weightsWinoBuf.data(), VEC_ALIGN);
 }
 
-float16_t* FastConv::getWeightsFP16()
+hfloat* FastConv::getWeightsFP16()
 {
     return alignPtr(weightsBuf_FP16.data(), VEC_ALIGN);
 }
 
-float16_t* FastConv::getWeightsWinoFP16()
+hfloat* FastConv::getWeightsWinoFP16()
 {
     return alignPtr(weightsWinoBuf_FP16.data(), VEC_ALIGN);
 }
@@ -209,7 +209,7 @@ Ptr<FastConv> initFastConv(
         if (conv->useFP16)
         {
             conv->weightsBuf_FP16.resize(nweights + VEC_ALIGN);
-            auto weightsPtr_FP16 = conv->getWeightsFP16();
+            auto weightsPtr_FP16 = (__fp16*)conv->getWeightsFP16();
 
             parallel_for_(Range(0, C), [&](const Range& r0){
             for(int c = r0.start; c < r0.end; c++)
@@ -269,11 +269,11 @@ Ptr<FastConv> initFastConv(
 
         float* wptrWino = nullptr;
 #ifdef CONV_ARM_FP16
-        float16_t* wptrWino_FP16 = nullptr;
+        __fp16* wptrWino_FP16 = nullptr;
         if (conv->useFP16)
         {
             conv->weightsWinoBuf_FP16.resize(nweights + VEC_ALIGN);
-            wptrWino_FP16 = conv->getWeightsWinoFP16();
+            wptrWino_FP16 = (__fp16*)conv->getWeightsWinoFP16();
         }
         else
 #endif
@@ -323,7 +323,7 @@
 #ifdef CONV_ARM_FP16
                 if (conv->useFP16)
                 {
-                    float16_t* wptr = wptrWino_FP16 + (g*Kg_nblocks + ki) * Cg *CONV_WINO_KBLOCK*CONV_WINO_AREA +
+                    __fp16* wptr = wptrWino_FP16 + (g*Kg_nblocks + ki) * Cg *CONV_WINO_KBLOCK*CONV_WINO_AREA +
                                       (c*CONV_WINO_KBLOCK + dk)*CONV_WINO_ATOM_F16;
                     for (int i = 0; i < CONV_WINO_NATOMS_F16; i++,
                             wptr += Cg * CONV_WINO_KBLOCK * CONV_WINO_ATOM_F16)
@@ -331,7 +331,7 @@
                         CV_Assert(wptrWino_FP16 <= wptr && wptr + CONV_WINO_ATOM_F16 <= wptrWino_FP16 + nweights);
                         for (int j = 0; j < CONV_WINO_ATOM_F16; j++)
                         {
-                            wptr[j] = (float16_t)kernelTm[i * CONV_WINO_ATOM_F16 + j];
+                            wptr[j] = (__fp16)kernelTm[i * CONV_WINO_ATOM_F16 + j];
                         }
                     }
                 }
@@ -367,12 +367,12 @@
         int numStripsMR_FP16 = (Kg + CONV_MR_FP16 - 1) / CONV_MR_FP16;
         int Kg_aligned_FP16 = numStripsMR_FP16 * CONV_MR_FP16;
         size_t nweights_FP16 = ngroups * Kg_aligned_FP16 * DkHkWkCg;
-        float16_t* weightsPtr_FP16 = nullptr;
+        __fp16* weightsPtr_FP16 = nullptr;
 
         if (conv->useFP16)
         {
             conv->weightsBuf_FP16.resize(nweights_FP16 + VEC_ALIGN);
-            weightsPtr_FP16 = conv->getWeightsFP16();
+            weightsPtr_FP16 = (__fp16*)conv->getWeightsFP16();
         }
         else
 #endif
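
Reviewer note (illustration, not part of the patch): the FP16 weight-packing loops in the next hunks write dk real kernel rows and zero-pad the strip up to CONV_MR_FP16. A generic sketch of that tail-padding pattern, with hypothetical names pack_strip/MR (T would be __fp16 on the FP16 path):

    #include <cstddef>

    // Pack dk (<= MR) kernel values, one per source row of stride wstep,
    // into an MR-long strip, zero-filling the remainder.
    template <typename T, int MR>
    void pack_strip(const float* w, std::size_t wstep, int dk, T* packed)
    {
        int k = 0;
        for (; k < dk; k++, w += wstep)
            packed[k] = (T)w[0];
        for (; k < MR; k++)
            packed[k] = (T)0.f;   // zero padding for an incomplete strip
    }
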
@@ -394,7 +394,7 @@
             int startK = si * CONV_MR_FP16;
             CV_Assert(startK < Kg_aligned_FP16);
 
-            float16_t* packed_wptr = weightsPtr_FP16 + DkHkWkCg * (startK + g * Kg_aligned_FP16);
+            __fp16* packed_wptr = weightsPtr_FP16 + DkHkWkCg * (startK + g * Kg_aligned_FP16);
 
             int dk = Kg - startK < CONV_MR_FP16 ? Kg - startK : CONV_MR_FP16; // check if we need zero padding.
             int k_idx = g*Kg + startK;
@@ -405,9 +405,9 @@
                     const float* wptr = srcWeights + wstep * k_idx + c*Hk*Wk*Dk + hwd;
                     int k = 0;
                     for(; k < dk; k++, wptr += wstep)
-                        packed_wptr[k] = (float16_t)(*wptr);
+                        packed_wptr[k] = (__fp16)(*wptr);
                     for(; k < CONV_MR_FP16; k++)
-                        packed_wptr[k] = (float16_t)0.f;
+                        packed_wptr[k] = (__fp16)0.f;
                 }
             }
         }});
@@ -467,8 +467,8 @@ static inline void packData8(char*& inpbuf, float*& inptrIn, int& in_w, int& x0,
     float* inptrInC = (float* )inptrIn;
 
 #ifdef CONV_ARM_FP16
-    float16_t* inpbufC_FP16 = (float16_t *)inpbufC;
-    if (esz == sizeof(float16_t))
+    __fp16* inpbufC_FP16 = (__fp16 *)inpbufC;
+    if (esz == sizeof(__fp16))
     {
         if (stride_w == 1)
         {
@@ -565,16 +565,16 @@ static inline void packData2(char *& inpbuf, float*& inptrIn, int& in_w, int& x0
     float* inptrInC = inptrIn;
 #ifdef CONV_ARM_FP16
-    float16_t* inpbufC_FP16 = (float16_t *)inpbufC;
-    if (esz == sizeof(float16_t))
+    __fp16* inpbufC_FP16 = (__fp16 *)inpbufC;
+    if (esz == sizeof(__fp16))
     {
         for (int k = 0; k < ksize; k++)
         {
             int k1 = ofstab[k];
             float v0 = inptrInC[k1];
             float v1 = inptrInC[k1 + stride_w];
-            inpbufC_FP16[k*CONV_NR_FP16] = (float16_t)v0;
-            inpbufC_FP16[k*CONV_NR_FP16+1] = (float16_t)v1;
+            inpbufC_FP16[k*CONV_NR_FP16] = (__fp16)v0;
+            inpbufC_FP16[k*CONV_NR_FP16+1] = (__fp16)v1;
         }
     }
     else
 #endif
@@ -630,7 +630,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
             if (useFP16)
             {
                 for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR_esz)
-                    _cvt32f16f(inptr, (float16_t *)inpbuf, CONV_NR);
+                    _cvt32f16f(inptr, (__fp16 *)inpbuf, CONV_NR);
             }
             else
 #endif
@@ -644,7 +644,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
             {
                 for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR_esz)
                 {
-                    _cvt32f16f(inptr, (float16_t *)inpbuf, slice_len);
+                    _cvt32f16f(inptr, (__fp16 *)inpbuf, slice_len);
                 }
             }
             else
@@ -704,11 +704,11 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
 #ifdef CONV_ARM_FP16
                     if (useFP16)
                     {
-                        float16_t* inpbufC = (float16_t *)inpbuf + s0;
+                        __fp16* inpbufC = (__fp16 *)inpbuf + s0;
                         for (int w = w0; w < w1; w++)
                         {
                             int imgofs = w*dilation_w;
-                            inpbufC[w*CONV_NR] = (float16_t)inptrInC[imgofs];
+                            inpbufC[w*CONV_NR] = (__fp16)inptrInC[imgofs];
                         }
                     }
                     else
@@ -765,14 +765,14 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
 #ifdef CONV_ARM_FP16
                     if (useFP16)
                     {
-                        float16_t* inpbufC = (float16_t *)inpbuf + s0;
+                        __fp16* inpbufC = (__fp16 *)inpbuf + s0;
 
                         for (int h = h0; h < h1; h++)
                         {
                             for (int w = w0; w < w1; w++)
                             {
                                 int imgofs = h*(dilation_h*Wi) + w*dilation_w;
-                                inpbufC[(h*Wk + w)*CONV_NR] = (float16_t)inptrInC[imgofs];
+                                inpbufC[(h*Wk + w)*CONV_NR] = (__fp16)inptrInC[imgofs];
                             }
                         }
                     }
@@ -838,7 +838,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
 #ifdef CONV_ARM_FP16
                     if (useFP16)
                     {
-                        float16_t* inpbufC = (float16_t* )inpbuf + s0;
+                        __fp16* inpbufC = (__fp16* )inpbuf + s0;
 
                         for ( int d = d0; d < d1; d++)
                         {
@@ -847,7 +847,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
                             for (int w = w0; w < w1; w++)
                             {
                                 int imgofs = d*dilation_d*HWi + h*(dilation_h*Wi) + w*dilation_w;
-                                inpbufC[((d*Hk + h)*Wk + w)*CONV_NR] = (float16_t)inptrInC[imgofs];
+                                inpbufC[((d*Hk + h)*Wk + w)*CONV_NR] = (__fp16)inptrInC[imgofs];
                             }
                         }
                     }
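
Reviewer note (illustration, not part of the patch): in the packing hunks above, each kernel tap owns a CONV_NR-wide row of the packed buffer, so tap (h, w) for output lane `lane` lands at (h*Wk + w)*CONV_NR + lane. A float-typed sketch of the same gather, with a hypothetical helper name and no padding checks, simplified to 2D:

    // Gathers one lane (one output pixel) of the im2row buffer; mirrors the
    // inpbufC[(h*Wk + w)*CONV_NR] stores above, minus the __fp16 cast.
    static void pack_lane(const float* img, int Wi, int dil_h, int dil_w,
                          int Hk, int Wk, int CONV_NR, int lane, float* inpbuf)
    {
        for (int h = 0; h < Hk; h++)
            for (int w = 0; w < Wk; w++)
                inpbuf[(h * Wk + w) * CONV_NR + lane] = img[h * dil_h * Wi + w * dil_w];
    }
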
@@ -889,7 +889,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
                 {
                     float* inpbuf_ki = (float* )inpbuf + k * CONV_NR * Cg + i;
 #ifdef CONV_ARM_FP16
-                    float16_t * inpbuf_ki_FP16 = (float16_t *)inpbuf + k * CONV_NR * Cg + i;
+                    __fp16 * inpbuf_ki_FP16 = (__fp16 *)inpbuf + k * CONV_NR * Cg + i;
 #endif
 
                     int zi = z0 * stride_d + dz - pad_front;
@@ -1053,7 +1053,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
                             if (useFP16)
                             {
                                 for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize)
-                                    inpbuf_ki_FP16[0] = (float16_t)(*inptr_ki);
+                                    inpbuf_ki_FP16[0] = (__fp16)(*inptr_ki);
                             }
                             else
 #endif
@@ -1069,7 +1069,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
                         if (useFP16)
                         {
                             for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR)
-                                inpbuf_ki_FP16[0] = (float16_t)0.f;
+                                inpbuf_ki_FP16[0] = (__fp16)0.f;
                         }
                         else
 #endif
@@ -1257,7 +1257,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
         // works at FP 16.
         CONV_NR = CONV_NR_FP16;
         CONV_MR = CONV_MR_FP16;
-        esz = sizeof(float16_t);
+        esz = sizeof(__fp16);
     }
 #endif
 
@@ -1511,7 +1511,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
                 char *wptr = weights + (k0_block * DkHkWkCg + c0 * CONV_MR) * esz;
                 float *cptr = cbuf_task + stripe * CONV_NR;
-                float16_t* cptr_f16 = (float16_t*)cbuf_task + stripe*CONV_NR;
+                hfloat* cptr_f16 = (hfloat*)cbuf_task + stripe*CONV_NR;
 
                 for (int k = k0_block; k < k1_block; k += CONV_MR,
                         wptr += DkHkWkCg * CONV_MR * esz, cptr += CONV_MR * ldc, cptr_f16 += CONV_MR * ldc)
                 {
@@ -1547,7 +1547,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
 
                 size_t outofs = ((n * ngroups + g) * Kg + k0_block) * out_planesize + zyx0;
                 const float *cptr = cbuf_task;
-                const float16_t *cptr_fp16 = (const float16_t *)cbuf_task;
+                const hfloat *cptr_fp16 = (const hfloat *)cbuf_task;
 
                 float *outptr = out + outofs;
                 const float *pbptr = fusedAddPtr0 ? fusedAddPtr0 + outofs : 0;
diff --git a/modules/dnn/src/layers/cpu_kernels/convolution.hpp b/modules/dnn/src/layers/cpu_kernels/convolution.hpp
index e9f169bbaf..5c8055337c 100644
--- a/modules/dnn/src/layers/cpu_kernels/convolution.hpp
+++ b/modules/dnn/src/layers/cpu_kernels/convolution.hpp
@@ -62,10 +62,10 @@ struct FastConv
     float* getWeights();
     float* getWeightsWino();
 
-    std::vector<float16_t> weightsBuf_FP16;
-    std::vector<float16_t> weightsWinoBuf_FP16;
-    float16_t* getWeightsFP16();
-    float16_t* getWeightsWinoFP16();
+    std::vector<hfloat> weightsBuf_FP16;
+    std::vector<hfloat> weightsWinoBuf_FP16;
+    hfloat* getWeightsFP16();
+    hfloat* getWeightsWinoFP16();
 
     int conv_type;
     int conv_dim;  // Flag for conv1d, conv2d, or conv3d.
diff --git a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp
index a87910a4c4..4b857ebc16 100644
--- a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp
+++ b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp
@@ -1742,12 +1742,12 @@ Mat getMatFromTensor(const opencv_onnx::TensorProto& tensor_proto)
 #endif
         const ::google::protobuf::RepeatedField<int32_t> field = tensor_proto.int32_data();
-        AutoBuffer<float16_t> aligned_val;
+        AutoBuffer<hfloat> aligned_val;
         size_t sz = tensor_proto.int32_data().size();
         aligned_val.allocate(sz);
-        float16_t* bufPtr = aligned_val.data();
+        hfloat* bufPtr = aligned_val.data();
 
-        float16_t *fp16Ptr = (float16_t *)field.data();
+        hfloat *fp16Ptr = (hfloat *)field.data();
         for (int i = 0; i < sz; i++)
         {
             bufPtr[i] = fp16Ptr[i*2 + offset];
         }
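
Reviewer note (illustration, not part of the patch): the int32_data hunk above exists because ONNX stores FLOAT16 tensor values one per int32 entry, so the 16-bit payload sits in one half of each 32-bit word and `offset` selects which half (the high half on big-endian hosts). A standalone sketch of that unpacking, with demo values rather than OpenCV API:

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        int32_t packed[2] = {0x3C00, 0x4000};   // fp16 bit patterns for 1.0 and 2.0
        const uint16_t* halves = (const uint16_t*)packed;
        int offset = 0;                          // little-endian: low half holds the value
        for (int i = 0; i < 2; i++)
            std::printf("fp16 bits: 0x%04x\n", halves[i * 2 + offset]);
        return 0;
    }
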
@@ -1759,11 +1759,11 @@ Mat getMatFromTensor(const opencv_onnx::TensorProto& tensor_proto)
         char* val = const_cast<char*>(tensor_proto.raw_data().c_str());
 #if CV_STRONG_ALIGNMENT
         // Aligned pointer is required.
-        AutoBuffer<float16_t> aligned_val;
-        if (!isAligned<sizeof(float16_t)>(val))
+        AutoBuffer<hfloat> aligned_val;
+        if (!isAligned<sizeof(hfloat)>(val))
         {
             size_t sz = tensor_proto.raw_data().size();
-            aligned_val.allocate(divUp(sz, sizeof(float16_t)));
+            aligned_val.allocate(divUp(sz, sizeof(hfloat)));
             memcpy(aligned_val.data(), val, sz);
             val = (char*)aligned_val.data();
         }
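
Reviewer note (illustration, not part of the patch): the CV_STRONG_ALIGNMENT branch above exists because dereferencing a 2-byte type through an unaligned pointer is undefined behavior on strict-alignment targets, so the raw bytes are first memcpy'd into a suitably aligned buffer. A minimal version of the same pattern, with a hypothetical helper name and uint16_t standing in for the half-float type:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Read len 16-bit values from possibly unaligned bytes.
    static std::vector<uint16_t> read_halfwords(const char* bytes, std::size_t len)
    {
        std::vector<uint16_t> out(len);
        std::memcpy(out.data(), bytes, len * sizeof(uint16_t));  // alignment-safe copy
        return out;
    }
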