Merge pull request #25387 from fengyuentau:complete-float16_t-renaming

Rename remaining float16_t for future proof #25387 Resolves comment: https://github.com/opencv/opencv/pull/25217#discussion_r1547733187. `std::float16_t` and `std::bfloat16_t` are introduced since c++23: https://en.cppreference.com/w/cpp/types/floating-point. ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [x] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [x] The feature is well documented and sample code can be built with the project CMake
1 year ago · 197626a5bf
parent 2c5b296ab2
commit 197626a5bf
7 changed files with 58 additions and 61 deletions
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@ -900,7 +900,9 @@ inline hfloat hfloatFromBits(ushort w) {
 #endif
 }

+#if !defined(__OPENCV_BUILD) && !(defined __STDCPP_FLOAT16_T__) && !(defined __ARM_NEON)
 typedef hfloat float16_t;
+#endif

 }
 #endif
--- a/modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp
+++ b/modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp
@ -494,10 +494,9 @@ void convBlockMR1_F32(int np, const float * a, const float * b, float *c, const
 void convBlock_F16(int np, const char * _a, const char * _b, char * _c, int ldc, bool init_c, int width,
                    const int convMR_fp16, const int convNR_fp16)
 {
-    typedef __fp16 float16_t;
-    const float16_t* a = (const float16_t*)_a;
-    const float16_t* b = (const float16_t*)_b;
-    float16_t* c = (float16_t*)_c;
+    const __fp16* a = (const __fp16*)_a;
+    const __fp16* b = (const __fp16*)_b;
+    __fp16* c = (__fp16*)_c;
    CV_Assert(convMR_fp16 == 8 && convNR_fp16 == 24);

    float16x8_t c00 = vdupq_n_f16(0), c01 = c00, c02 = c00;
@ -638,12 +637,11 @@ void convBlock_F16(int np, const char * _a, const char * _b, char * _c, int ldc,
 void convBlockMR1_F16(int np, const char* _a, const char* _b, float *c, const float _bias, bool init_c,
                       const float minval, const float maxval, bool ifMinMaxAct, const int width, const int convNR_FP16)
 {
-    typedef __fp16 float16_t;
    CV_Assert(convNR_FP16 == 24); // CONV_NR_FP16 = 24
-    const float16_t* a = (const float16_t*)_a;
-    const float16_t* b = (const float16_t*)_b;
+    const __fp16* a = (const __fp16*)_a;
+    const __fp16* b = (const __fp16*)_b;

-    const float16_t bias = (float16_t)_bias;
+    const __fp16 bias = (__fp16)_bias;

    float16x8_t c0 = vdupq_n_f16(bias), c1 = c0, c2 = c0;

--- a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp
+++ b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp
@ -85,7 +85,7 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu
        // works at FP 16.
        CONV_WINO_ATOM = CONV_WINO_ATOM_F16;
        CONV_WINO_NATOMS = CONV_WINO_NATOMS_F16;
-        esz = sizeof(float16_t);
+        esz = sizeof(__fp16);
    }
 #endif

--- a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp
+++ b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp
@ -435,10 +435,9 @@ void winofunc_AtXA_8x8_F32(const float* inptr, int inpstep,
 void winofunc_accum_F16(const char* _inwptr, const char* _wptr, char* _outbuf, int Cg, int iblock,
                        const int winoIblock, const int winoKblock, const int winoAtomF16, const int winoNatomF16)
 {
-    typedef __fp16 float16_t;
-    const float16_t* inwptr = (const float16_t*)_inwptr;
-    const float16_t* wptr = (const float16_t*)_wptr;
-    float16_t* outbuf = (float16_t*)_outbuf;
+    const __fp16* inwptr = (const __fp16*)_inwptr;
+    const __fp16* wptr = (const __fp16*)_wptr;
+    __fp16* outbuf = (__fp16*)_outbuf;

    CV_Assert(winoIblock == 6 && winoKblock == 4 && winoAtomF16 == 8);

@ -591,8 +590,7 @@ void winofunc_accum_F16(const char* _inwptr, const char* _wptr, char* _outbuf, i
 void winofunc_BtXB_8x8_F16(const float * inptr, int inpstep,
                           char * _outptr, int Cg, const int winoIblock, const int winoAtomF16)
 {
-    typedef __fp16 float16_t;
-    float16_t* outptr = (float16_t*)_outptr;
+    __fp16* outptr = (__fp16*)_outptr;
    float32x4_t x00 = vld1q_f32(inptr), x01 = vld1q_f32(inptr + 4);
    float32x4_t x10 = vld1q_f32(inptr + inpstep), x11 = vld1q_f32(inptr + inpstep + 4);
    float32x4_t x20 = vld1q_f32(inptr + inpstep*2), x21 = vld1q_f32(inptr + inpstep*2 + 4);
@ -757,8 +755,7 @@ void winofunc_AtXA_8x8_F16(const char* _inptr, int inpstep,
                           float * bpptr, int bpstep, float* outptr, int outstep,
                           float bias, float minval, float maxval, bool ifMinMaxAct)
 {
-    typedef __fp16 float16_t;
-    const float16_t* inptr = (const float16_t*)_inptr;
+    const __fp16* inptr = (const __fp16*)_inptr;

    float32x4_t x00 = vcvt_f32_f16(vld1_f16(inptr)), x01 = vcvt_f32_f16(vld1_f16(inptr + 4));
    float32x4_t x10 = vcvt_f32_f16(vld1_f16(inptr + inpstep)), x11 = vcvt_f32_f16(vld1_f16(inptr + inpstep + 4));
--- a/modules/dnn/src/layers/cpu_kernels/convolution.cpp
+++ b/modules/dnn/src/layers/cpu_kernels/convolution.cpp
@ -26,7 +26,7 @@ void convBlockMR1_F32(int np, const float* a, const float* b, float *c, const fl

 #ifdef CONV_ARM_FP16
 // Fast convert float 32 to float16
-static inline void _cvt32f16f(const float* src, float16_t* dst, int len)
+static inline void _cvt32f16f(const float* src, __fp16* dst, int len)
 {
    int j = 0;
    const int VECSZ = 4;
@ -60,7 +60,7 @@ static inline void _cvt32f16f(const float* src, float16_t* dst, int len)
        vst1_f16(dst_FP16 + j, hv);
    }
    for( ; j < len; j++ )
-        dst[j] = float16_t(src[j]);
+        dst[j] = __fp16(src[j]);
 }
 #endif

@ -74,12 +74,12 @@ float* FastConv::getWeightsWino()
    return alignPtr(weightsWinoBuf.data(), VEC_ALIGN);
 }

-float16_t* FastConv::getWeightsFP16()
+hfloat* FastConv::getWeightsFP16()
 {
    return alignPtr(weightsBuf_FP16.data(), VEC_ALIGN);
 }

-float16_t* FastConv::getWeightsWinoFP16()
+hfloat* FastConv::getWeightsWinoFP16()
 {
    return alignPtr(weightsWinoBuf_FP16.data(), VEC_ALIGN);
 }
@ -209,7 +209,7 @@ Ptr<FastConv> initFastConv(
        if (conv->useFP16)
        {
            conv->weightsBuf_FP16.resize(nweights + VEC_ALIGN);
-            auto weightsPtr_FP16 = conv->getWeightsFP16();
+            auto weightsPtr_FP16 = (__fp16*)conv->getWeightsFP16();

            parallel_for_(Range(0, C), [&](const Range& r0){
                for(int c = r0.start; c < r0.end; c++)
@ -269,11 +269,11 @@ Ptr<FastConv> initFastConv(

        float* wptrWino = nullptr;
 #ifdef CONV_ARM_FP16
-        float16_t* wptrWino_FP16 = nullptr;
+        __fp16* wptrWino_FP16 = nullptr;
        if (conv->useFP16)
        {
            conv->weightsWinoBuf_FP16.resize(nweights + VEC_ALIGN);
-            wptrWino_FP16 = conv->getWeightsWinoFP16();
+            wptrWino_FP16 = (__fp16*)conv->getWeightsWinoFP16();
        }
        else
 #endif
@ -323,7 +323,7 @@ Ptr<FastConv> initFastConv(
 #ifdef CONV_ARM_FP16
                if (conv->useFP16)
                {
-                    float16_t* wptr = wptrWino_FP16 + (g*Kg_nblocks + ki) * Cg *CONV_WINO_KBLOCK*CONV_WINO_AREA +
+                    __fp16* wptr = wptrWino_FP16 + (g*Kg_nblocks + ki) * Cg *CONV_WINO_KBLOCK*CONV_WINO_AREA +
                                  (c*CONV_WINO_KBLOCK + dk)*CONV_WINO_ATOM_F16;
                    for (int i = 0; i < CONV_WINO_NATOMS_F16; i++,
                            wptr += Cg * CONV_WINO_KBLOCK * CONV_WINO_ATOM_F16)
@ -331,7 +331,7 @@ Ptr<FastConv> initFastConv(
                        CV_Assert(wptrWino_FP16 <= wptr && wptr + CONV_WINO_ATOM_F16 <= wptrWino_FP16 + nweights);
                        for (int j = 0; j < CONV_WINO_ATOM_F16; j++)
                        {
-                            wptr[j] = (float16_t)kernelTm[i * CONV_WINO_ATOM_F16 + j];
+                            wptr[j] = (__fp16)kernelTm[i * CONV_WINO_ATOM_F16 + j];
                        }
                    }
                }
@ -367,12 +367,12 @@ Ptr<FastConv> initFastConv(
        int numStripsMR_FP16 = (Kg + CONV_MR_FP16 - 1) / CONV_MR_FP16;
        int Kg_aligned_FP16 = numStripsMR_FP16 * CONV_MR_FP16;
        size_t nweights_FP16 = ngroups * Kg_aligned_FP16 * DkHkWkCg;
-        float16_t* weightsPtr_FP16 = nullptr;
+        __fp16* weightsPtr_FP16 = nullptr;

        if (conv->useFP16)
        {
            conv->weightsBuf_FP16.resize(nweights_FP16 + VEC_ALIGN);
-            weightsPtr_FP16 = conv->getWeightsFP16();
+            weightsPtr_FP16 = (__fp16*)conv->getWeightsFP16();
        }
        else
 #endif
@ -394,7 +394,7 @@ Ptr<FastConv> initFastConv(
                int startK = si * CONV_MR_FP16;
                CV_Assert(startK < Kg_aligned_FP16);

-                float16_t* packed_wptr = weightsPtr_FP16 + DkHkWkCg * (startK + g * Kg_aligned_FP16);
+                __fp16* packed_wptr = weightsPtr_FP16 + DkHkWkCg * (startK + g * Kg_aligned_FP16);
                int dk = Kg - startK < CONV_MR_FP16 ? Kg - startK : CONV_MR_FP16; // check if we need zero padding.

                int k_idx = g*Kg + startK;
@ -405,9 +405,9 @@ Ptr<FastConv> initFastConv(
                        const float* wptr = srcWeights + wstep * k_idx + c*Hk*Wk*Dk + hwd;
                        int k = 0;
                        for(; k < dk; k++, wptr += wstep)
-                            packed_wptr[k] = (float16_t)(*wptr);
+                            packed_wptr[k] = (__fp16)(*wptr);
                        for(; k < CONV_MR_FP16; k++)
-                            packed_wptr[k] = (float16_t)0.f;
+                            packed_wptr[k] = (__fp16)0.f;
                    }
                }
            }});
@ -467,8 +467,8 @@ static inline void packData8(char*& inpbuf, float*& inptrIn, int& in_w, int& x0,
    float* inptrInC = (float* )inptrIn;

 #ifdef CONV_ARM_FP16
-    float16_t* inpbufC_FP16 = (float16_t *)inpbufC;
-    if (esz == sizeof(float16_t))
+    __fp16* inpbufC_FP16 = (__fp16 *)inpbufC;
+    if (esz == sizeof(__fp16))
    {
        if (stride_w == 1)
        {
@ -565,16 +565,16 @@ static inline void packData2(char *& inpbuf, float*& inptrIn, int& in_w, int& x0
    float* inptrInC = inptrIn;

 #ifdef CONV_ARM_FP16
-    float16_t* inpbufC_FP16 = (float16_t *)inpbufC;
-    if (esz == sizeof(float16_t))
+    __fp16* inpbufC_FP16 = (__fp16 *)inpbufC;
+    if (esz == sizeof(__fp16))
    {
        for (int k = 0; k < ksize; k++)
        {
            int k1 = ofstab[k];
            float v0 = inptrInC[k1];
            float v1 = inptrInC[k1 + stride_w];
-            inpbufC_FP16[k*CONV_NR_FP16] = (float16_t)v0;
-            inpbufC_FP16[k*CONV_NR_FP16+1] = (float16_t)v1;
+            inpbufC_FP16[k*CONV_NR_FP16] = (__fp16)v0;
+            inpbufC_FP16[k*CONV_NR_FP16+1] = (__fp16)v1;
        }
    } else
 #endif
@ -630,7 +630,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
                if (useFP16)
                {
                    for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR_esz)
-                        _cvt32f16f(inptr, (float16_t *)inpbuf, CONV_NR);
+                        _cvt32f16f(inptr, (__fp16 *)inpbuf, CONV_NR);
                }
                else
 #endif
@ -644,7 +644,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
                {
                    for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR_esz)
                    {
-                        _cvt32f16f(inptr, (float16_t *)inpbuf, slice_len);
+                        _cvt32f16f(inptr, (__fp16 *)inpbuf, slice_len);
                    }
                }
                else
@ -704,11 +704,11 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
 #ifdef CONV_ARM_FP16
                            if (useFP16)
                            {
-                                float16_t* inpbufC = (float16_t *)inpbuf + s0;
+                                __fp16* inpbufC = (__fp16 *)inpbuf + s0;
                                for (int w = w0; w < w1; w++)
                                {
                                    int imgofs = w*dilation_w;
-                                    inpbufC[w*CONV_NR] = (float16_t)inptrInC[imgofs];
+                                    inpbufC[w*CONV_NR] = (__fp16)inptrInC[imgofs];
                                }
                            }
                            else
@ -765,14 +765,14 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
 #ifdef CONV_ARM_FP16
                            if (useFP16)
                            {
-                                float16_t* inpbufC = (float16_t *)inpbuf + s0;
+                                __fp16* inpbufC = (__fp16 *)inpbuf + s0;

                                for (int h = h0; h < h1; h++)
                                {
                                    for (int w = w0; w < w1; w++)
                                    {
                                        int imgofs = h*(dilation_h*Wi) + w*dilation_w;
-                                        inpbufC[(h*Wk + w)*CONV_NR] = (float16_t)inptrInC[imgofs];
+                                        inpbufC[(h*Wk + w)*CONV_NR] = (__fp16)inptrInC[imgofs];
                                    }
                                }
                            }
@ -838,7 +838,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
 #ifdef CONV_ARM_FP16
                            if (useFP16)
                            {
-                                float16_t* inpbufC = (float16_t* )inpbuf + s0;
+                                __fp16* inpbufC = (__fp16* )inpbuf + s0;

                                for ( int d = d0; d < d1; d++)
                                {
@ -847,7 +847,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
                                        for (int w = w0; w < w1; w++)
                                        {
                                            int imgofs = d*dilation_d*HWi + h*(dilation_h*Wi) + w*dilation_w;
-                                            inpbufC[((d*Hk + h)*Wk + w)*CONV_NR] = (float16_t)inptrInC[imgofs];
+                                            inpbufC[((d*Hk + h)*Wk + w)*CONV_NR] = (__fp16)inptrInC[imgofs];
                                        }
                                    }
                                }
@ -889,7 +889,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
                {
                    float* inpbuf_ki = (float* )inpbuf + k * CONV_NR * Cg + i;
 #ifdef CONV_ARM_FP16
-                    float16_t * inpbuf_ki_FP16 = (float16_t *)inpbuf + k * CONV_NR * Cg + i;
+                    __fp16 * inpbuf_ki_FP16 = (__fp16 *)inpbuf + k * CONV_NR * Cg + i;
 #endif

                    int zi = z0 * stride_d + dz - pad_front;
@ -1053,7 +1053,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
                            if (useFP16)
                            {
                                for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize)
-                                    inpbuf_ki_FP16[0] = (float16_t)(*inptr_ki);
+                                    inpbuf_ki_FP16[0] = (__fp16)(*inptr_ki);
                            }
                            else
 #endif
@ -1069,7 +1069,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
                        if (useFP16)
                        {
                            for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR)
-                                inpbuf_ki_FP16[0] = (float16_t)0.f;
+                                inpbuf_ki_FP16[0] = (__fp16)0.f;
                        }
                        else
 #endif
@ -1257,7 +1257,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
        // works at FP 16.
        CONV_NR = CONV_NR_FP16;
        CONV_MR = CONV_MR_FP16;
-        esz = sizeof(float16_t);
+        esz = sizeof(__fp16);
    }
 #endif

@ -1511,7 +1511,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co

                            char *wptr = weights + (k0_block * DkHkWkCg + c0 * CONV_MR) * esz;
                            float *cptr = cbuf_task + stripe * CONV_NR;
-                            float16_t* cptr_f16 = (float16_t*)cbuf_task + stripe*CONV_NR;
+                            hfloat* cptr_f16 = (hfloat*)cbuf_task + stripe*CONV_NR;
                            for (int k = k0_block; k < k1_block; k += CONV_MR,
                                    wptr += DkHkWkCg * CONV_MR * esz, cptr += CONV_MR * ldc, cptr_f16 += CONV_MR * ldc)
                            {
@ -1547,7 +1547,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co

                    size_t outofs = ((n * ngroups + g) * Kg + k0_block) * out_planesize + zyx0;
                    const float *cptr = cbuf_task;
-                    const float16_t *cptr_fp16 = (const float16_t *)cbuf_task;
+                    const hfloat *cptr_fp16 = (const hfloat *)cbuf_task;
                    float *outptr = out + outofs;
                    const float *pbptr = fusedAddPtr0 ? fusedAddPtr0 + outofs : 0;

--- a/modules/dnn/src/layers/cpu_kernels/convolution.hpp
+++ b/modules/dnn/src/layers/cpu_kernels/convolution.hpp
@ -62,10 +62,10 @@ struct FastConv
    float* getWeights();
    float* getWeightsWino();

-    std::vector<float16_t> weightsBuf_FP16;
-    std::vector<float16_t> weightsWinoBuf_FP16;
-    float16_t* getWeightsFP16();
-    float16_t* getWeightsWinoFP16();
+    std::vector<hfloat> weightsBuf_FP16;
+    std::vector<hfloat> weightsWinoBuf_FP16;
+    hfloat* getWeightsFP16();
+    hfloat* getWeightsWinoFP16();

    int conv_type;
    int conv_dim;  // Flag for conv1d, conv2d, or conv3d.
--- a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp
+++ b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp
@ -1742,12 +1742,12 @@ Mat getMatFromTensor(const opencv_onnx::TensorProto& tensor_proto)
 #endif
            const ::google::protobuf::RepeatedField<int32_t> field = tensor_proto.int32_data();

-            AutoBuffer<float16_t, 16> aligned_val;
+            AutoBuffer<hfloat, 16> aligned_val;
            size_t sz = tensor_proto.int32_data().size();
            aligned_val.allocate(sz);
-            float16_t* bufPtr = aligned_val.data();
+            hfloat* bufPtr = aligned_val.data();

-            float16_t *fp16Ptr = (float16_t *)field.data();
+            hfloat *fp16Ptr = (hfloat *)field.data();
            for (int i = 0; i < sz; i++)
            {
                bufPtr[i] = fp16Ptr[i*2 + offset];
@ -1759,11 +1759,11 @@ Mat getMatFromTensor(const opencv_onnx::TensorProto& tensor_proto)
            char* val = const_cast<char*>(tensor_proto.raw_data().c_str());
 #if CV_STRONG_ALIGNMENT
            // Aligned pointer is required.
-            AutoBuffer<float16_t, 16> aligned_val;
-            if (!isAligned<sizeof(float16_t)>(val))
+            AutoBuffer<hfloat, 16> aligned_val;
+            if (!isAligned<sizeof(hfloat)>(val))
            {
                size_t sz = tensor_proto.raw_data().size();
-                aligned_val.allocate(divUp(sz, sizeof(float16_t)));
+                aligned_val.allocate(divUp(sz, sizeof(hfloat)));
                memcpy(aligned_val.data(), val, sz);
                val = (char*)aligned_val.data();
            }