Merge pull request #25387 from fengyuentau:complete-float16_t-renaming

Rename remaining float16_t for future proof #25387

Resolves comment: https://github.com/opencv/opencv/pull/25217#discussion_r1547733187.

`std::float16_t` and `std::bfloat16_t` are introduced since c++23: https://en.cppreference.com/w/cpp/types/floating-point.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
pull/25397/head
Yuantao Feng 10 months ago committed by GitHub
parent 2c5b296ab2
commit 197626a5bf
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 2
      modules/core/include/opencv2/core/cvdef.h
  2. 14
      modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp
  3. 2
      modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp
  4. 13
      modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp
  5. 68
      modules/dnn/src/layers/cpu_kernels/convolution.cpp
  6. 8
      modules/dnn/src/layers/cpu_kernels/convolution.hpp
  7. 12
      modules/dnn/src/onnx/onnx_graph_simplifier.cpp

@ -900,7 +900,9 @@ inline hfloat hfloatFromBits(ushort w) {
#endif
}
#if !defined(__OPENCV_BUILD) && !(defined __STDCPP_FLOAT16_T__) && !(defined __ARM_NEON)
typedef hfloat float16_t;
#endif
}
#endif

@ -494,10 +494,9 @@ void convBlockMR1_F32(int np, const float * a, const float * b, float *c, const
void convBlock_F16(int np, const char * _a, const char * _b, char * _c, int ldc, bool init_c, int width,
const int convMR_fp16, const int convNR_fp16)
{
typedef __fp16 float16_t;
const float16_t* a = (const float16_t*)_a;
const float16_t* b = (const float16_t*)_b;
float16_t* c = (float16_t*)_c;
const __fp16* a = (const __fp16*)_a;
const __fp16* b = (const __fp16*)_b;
__fp16* c = (__fp16*)_c;
CV_Assert(convMR_fp16 == 8 && convNR_fp16 == 24);
float16x8_t c00 = vdupq_n_f16(0), c01 = c00, c02 = c00;
@ -638,12 +637,11 @@ void convBlock_F16(int np, const char * _a, const char * _b, char * _c, int ldc,
void convBlockMR1_F16(int np, const char* _a, const char* _b, float *c, const float _bias, bool init_c,
const float minval, const float maxval, bool ifMinMaxAct, const int width, const int convNR_FP16)
{
typedef __fp16 float16_t;
CV_Assert(convNR_FP16 == 24); // CONV_NR_FP16 = 24
const float16_t* a = (const float16_t*)_a;
const float16_t* b = (const float16_t*)_b;
const __fp16* a = (const __fp16*)_a;
const __fp16* b = (const __fp16*)_b;
const float16_t bias = (float16_t)_bias;
const __fp16 bias = (__fp16)_bias;
float16x8_t c0 = vdupq_n_f16(bias), c1 = c0, c2 = c0;

@ -85,7 +85,7 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu
// works at FP 16.
CONV_WINO_ATOM = CONV_WINO_ATOM_F16;
CONV_WINO_NATOMS = CONV_WINO_NATOMS_F16;
esz = sizeof(float16_t);
esz = sizeof(__fp16);
}
#endif

@ -435,10 +435,9 @@ void winofunc_AtXA_8x8_F32(const float* inptr, int inpstep,
void winofunc_accum_F16(const char* _inwptr, const char* _wptr, char* _outbuf, int Cg, int iblock,
const int winoIblock, const int winoKblock, const int winoAtomF16, const int winoNatomF16)
{
typedef __fp16 float16_t;
const float16_t* inwptr = (const float16_t*)_inwptr;
const float16_t* wptr = (const float16_t*)_wptr;
float16_t* outbuf = (float16_t*)_outbuf;
const __fp16* inwptr = (const __fp16*)_inwptr;
const __fp16* wptr = (const __fp16*)_wptr;
__fp16* outbuf = (__fp16*)_outbuf;
CV_Assert(winoIblock == 6 && winoKblock == 4 && winoAtomF16 == 8);
@ -591,8 +590,7 @@ void winofunc_accum_F16(const char* _inwptr, const char* _wptr, char* _outbuf, i
void winofunc_BtXB_8x8_F16(const float * inptr, int inpstep,
char * _outptr, int Cg, const int winoIblock, const int winoAtomF16)
{
typedef __fp16 float16_t;
float16_t* outptr = (float16_t*)_outptr;
__fp16* outptr = (__fp16*)_outptr;
float32x4_t x00 = vld1q_f32(inptr), x01 = vld1q_f32(inptr + 4);
float32x4_t x10 = vld1q_f32(inptr + inpstep), x11 = vld1q_f32(inptr + inpstep + 4);
float32x4_t x20 = vld1q_f32(inptr + inpstep*2), x21 = vld1q_f32(inptr + inpstep*2 + 4);
@ -757,8 +755,7 @@ void winofunc_AtXA_8x8_F16(const char* _inptr, int inpstep,
float * bpptr, int bpstep, float* outptr, int outstep,
float bias, float minval, float maxval, bool ifMinMaxAct)
{
typedef __fp16 float16_t;
const float16_t* inptr = (const float16_t*)_inptr;
const __fp16* inptr = (const __fp16*)_inptr;
float32x4_t x00 = vcvt_f32_f16(vld1_f16(inptr)), x01 = vcvt_f32_f16(vld1_f16(inptr + 4));
float32x4_t x10 = vcvt_f32_f16(vld1_f16(inptr + inpstep)), x11 = vcvt_f32_f16(vld1_f16(inptr + inpstep + 4));

@ -26,7 +26,7 @@ void convBlockMR1_F32(int np, const float* a, const float* b, float *c, const fl
#ifdef CONV_ARM_FP16
// Fast convert float 32 to float16
static inline void _cvt32f16f(const float* src, float16_t* dst, int len)
static inline void _cvt32f16f(const float* src, __fp16* dst, int len)
{
int j = 0;
const int VECSZ = 4;
@ -60,7 +60,7 @@ static inline void _cvt32f16f(const float* src, float16_t* dst, int len)
vst1_f16(dst_FP16 + j, hv);
}
for( ; j < len; j++ )
dst[j] = float16_t(src[j]);
dst[j] = __fp16(src[j]);
}
#endif
@ -74,12 +74,12 @@ float* FastConv::getWeightsWino()
return alignPtr(weightsWinoBuf.data(), VEC_ALIGN);
}
float16_t* FastConv::getWeightsFP16()
hfloat* FastConv::getWeightsFP16()
{
return alignPtr(weightsBuf_FP16.data(), VEC_ALIGN);
}
float16_t* FastConv::getWeightsWinoFP16()
hfloat* FastConv::getWeightsWinoFP16()
{
return alignPtr(weightsWinoBuf_FP16.data(), VEC_ALIGN);
}
@ -209,7 +209,7 @@ Ptr<FastConv> initFastConv(
if (conv->useFP16)
{
conv->weightsBuf_FP16.resize(nweights + VEC_ALIGN);
auto weightsPtr_FP16 = conv->getWeightsFP16();
auto weightsPtr_FP16 = (__fp16*)conv->getWeightsFP16();
parallel_for_(Range(0, C), [&](const Range& r0){
for(int c = r0.start; c < r0.end; c++)
@ -269,11 +269,11 @@ Ptr<FastConv> initFastConv(
float* wptrWino = nullptr;
#ifdef CONV_ARM_FP16
float16_t* wptrWino_FP16 = nullptr;
__fp16* wptrWino_FP16 = nullptr;
if (conv->useFP16)
{
conv->weightsWinoBuf_FP16.resize(nweights + VEC_ALIGN);
wptrWino_FP16 = conv->getWeightsWinoFP16();
wptrWino_FP16 = (__fp16*)conv->getWeightsWinoFP16();
}
else
#endif
@ -323,7 +323,7 @@ Ptr<FastConv> initFastConv(
#ifdef CONV_ARM_FP16
if (conv->useFP16)
{
float16_t* wptr = wptrWino_FP16 + (g*Kg_nblocks + ki) * Cg *CONV_WINO_KBLOCK*CONV_WINO_AREA +
__fp16* wptr = wptrWino_FP16 + (g*Kg_nblocks + ki) * Cg *CONV_WINO_KBLOCK*CONV_WINO_AREA +
(c*CONV_WINO_KBLOCK + dk)*CONV_WINO_ATOM_F16;
for (int i = 0; i < CONV_WINO_NATOMS_F16; i++,
wptr += Cg * CONV_WINO_KBLOCK * CONV_WINO_ATOM_F16)
@ -331,7 +331,7 @@ Ptr<FastConv> initFastConv(
CV_Assert(wptrWino_FP16 <= wptr && wptr + CONV_WINO_ATOM_F16 <= wptrWino_FP16 + nweights);
for (int j = 0; j < CONV_WINO_ATOM_F16; j++)
{
wptr[j] = (float16_t)kernelTm[i * CONV_WINO_ATOM_F16 + j];
wptr[j] = (__fp16)kernelTm[i * CONV_WINO_ATOM_F16 + j];
}
}
}
@ -367,12 +367,12 @@ Ptr<FastConv> initFastConv(
int numStripsMR_FP16 = (Kg + CONV_MR_FP16 - 1) / CONV_MR_FP16;
int Kg_aligned_FP16 = numStripsMR_FP16 * CONV_MR_FP16;
size_t nweights_FP16 = ngroups * Kg_aligned_FP16 * DkHkWkCg;
float16_t* weightsPtr_FP16 = nullptr;
__fp16* weightsPtr_FP16 = nullptr;
if (conv->useFP16)
{
conv->weightsBuf_FP16.resize(nweights_FP16 + VEC_ALIGN);
weightsPtr_FP16 = conv->getWeightsFP16();
weightsPtr_FP16 = (__fp16*)conv->getWeightsFP16();
}
else
#endif
@ -394,7 +394,7 @@ Ptr<FastConv> initFastConv(
int startK = si * CONV_MR_FP16;
CV_Assert(startK < Kg_aligned_FP16);
float16_t* packed_wptr = weightsPtr_FP16 + DkHkWkCg * (startK + g * Kg_aligned_FP16);
__fp16* packed_wptr = weightsPtr_FP16 + DkHkWkCg * (startK + g * Kg_aligned_FP16);
int dk = Kg - startK < CONV_MR_FP16 ? Kg - startK : CONV_MR_FP16; // check if we need zero padding.
int k_idx = g*Kg + startK;
@ -405,9 +405,9 @@ Ptr<FastConv> initFastConv(
const float* wptr = srcWeights + wstep * k_idx + c*Hk*Wk*Dk + hwd;
int k = 0;
for(; k < dk; k++, wptr += wstep)
packed_wptr[k] = (float16_t)(*wptr);
packed_wptr[k] = (__fp16)(*wptr);
for(; k < CONV_MR_FP16; k++)
packed_wptr[k] = (float16_t)0.f;
packed_wptr[k] = (__fp16)0.f;
}
}
}});
@ -467,8 +467,8 @@ static inline void packData8(char*& inpbuf, float*& inptrIn, int& in_w, int& x0,
float* inptrInC = (float* )inptrIn;
#ifdef CONV_ARM_FP16
float16_t* inpbufC_FP16 = (float16_t *)inpbufC;
if (esz == sizeof(float16_t))
__fp16* inpbufC_FP16 = (__fp16 *)inpbufC;
if (esz == sizeof(__fp16))
{
if (stride_w == 1)
{
@ -565,16 +565,16 @@ static inline void packData2(char *& inpbuf, float*& inptrIn, int& in_w, int& x0
float* inptrInC = inptrIn;
#ifdef CONV_ARM_FP16
float16_t* inpbufC_FP16 = (float16_t *)inpbufC;
if (esz == sizeof(float16_t))
__fp16* inpbufC_FP16 = (__fp16 *)inpbufC;
if (esz == sizeof(__fp16))
{
for (int k = 0; k < ksize; k++)
{
int k1 = ofstab[k];
float v0 = inptrInC[k1];
float v1 = inptrInC[k1 + stride_w];
inpbufC_FP16[k*CONV_NR_FP16] = (float16_t)v0;
inpbufC_FP16[k*CONV_NR_FP16+1] = (float16_t)v1;
inpbufC_FP16[k*CONV_NR_FP16] = (__fp16)v0;
inpbufC_FP16[k*CONV_NR_FP16+1] = (__fp16)v1;
}
} else
#endif
@ -630,7 +630,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
if (useFP16)
{
for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR_esz)
_cvt32f16f(inptr, (float16_t *)inpbuf, CONV_NR);
_cvt32f16f(inptr, (__fp16 *)inpbuf, CONV_NR);
}
else
#endif
@ -644,7 +644,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
{
for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR_esz)
{
_cvt32f16f(inptr, (float16_t *)inpbuf, slice_len);
_cvt32f16f(inptr, (__fp16 *)inpbuf, slice_len);
}
}
else
@ -704,11 +704,11 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
#ifdef CONV_ARM_FP16
if (useFP16)
{
float16_t* inpbufC = (float16_t *)inpbuf + s0;
__fp16* inpbufC = (__fp16 *)inpbuf + s0;
for (int w = w0; w < w1; w++)
{
int imgofs = w*dilation_w;
inpbufC[w*CONV_NR] = (float16_t)inptrInC[imgofs];
inpbufC[w*CONV_NR] = (__fp16)inptrInC[imgofs];
}
}
else
@ -765,14 +765,14 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
#ifdef CONV_ARM_FP16
if (useFP16)
{
float16_t* inpbufC = (float16_t *)inpbuf + s0;
__fp16* inpbufC = (__fp16 *)inpbuf + s0;
for (int h = h0; h < h1; h++)
{
for (int w = w0; w < w1; w++)
{
int imgofs = h*(dilation_h*Wi) + w*dilation_w;
inpbufC[(h*Wk + w)*CONV_NR] = (float16_t)inptrInC[imgofs];
inpbufC[(h*Wk + w)*CONV_NR] = (__fp16)inptrInC[imgofs];
}
}
}
@ -838,7 +838,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
#ifdef CONV_ARM_FP16
if (useFP16)
{
float16_t* inpbufC = (float16_t* )inpbuf + s0;
__fp16* inpbufC = (__fp16* )inpbuf + s0;
for ( int d = d0; d < d1; d++)
{
@ -847,7 +847,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
for (int w = w0; w < w1; w++)
{
int imgofs = d*dilation_d*HWi + h*(dilation_h*Wi) + w*dilation_w;
inpbufC[((d*Hk + h)*Wk + w)*CONV_NR] = (float16_t)inptrInC[imgofs];
inpbufC[((d*Hk + h)*Wk + w)*CONV_NR] = (__fp16)inptrInC[imgofs];
}
}
}
@ -889,7 +889,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
{
float* inpbuf_ki = (float* )inpbuf + k * CONV_NR * Cg + i;
#ifdef CONV_ARM_FP16
float16_t * inpbuf_ki_FP16 = (float16_t *)inpbuf + k * CONV_NR * Cg + i;
__fp16 * inpbuf_ki_FP16 = (__fp16 *)inpbuf + k * CONV_NR * Cg + i;
#endif
int zi = z0 * stride_d + dz - pad_front;
@ -1053,7 +1053,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
if (useFP16)
{
for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize)
inpbuf_ki_FP16[0] = (float16_t)(*inptr_ki);
inpbuf_ki_FP16[0] = (__fp16)(*inptr_ki);
}
else
#endif
@ -1069,7 +1069,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
if (useFP16)
{
for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR)
inpbuf_ki_FP16[0] = (float16_t)0.f;
inpbuf_ki_FP16[0] = (__fp16)0.f;
}
else
#endif
@ -1257,7 +1257,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
// works at FP 16.
CONV_NR = CONV_NR_FP16;
CONV_MR = CONV_MR_FP16;
esz = sizeof(float16_t);
esz = sizeof(__fp16);
}
#endif
@ -1511,7 +1511,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
char *wptr = weights + (k0_block * DkHkWkCg + c0 * CONV_MR) * esz;
float *cptr = cbuf_task + stripe * CONV_NR;
float16_t* cptr_f16 = (float16_t*)cbuf_task + stripe*CONV_NR;
hfloat* cptr_f16 = (hfloat*)cbuf_task + stripe*CONV_NR;
for (int k = k0_block; k < k1_block; k += CONV_MR,
wptr += DkHkWkCg * CONV_MR * esz, cptr += CONV_MR * ldc, cptr_f16 += CONV_MR * ldc)
{
@ -1547,7 +1547,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
size_t outofs = ((n * ngroups + g) * Kg + k0_block) * out_planesize + zyx0;
const float *cptr = cbuf_task;
const float16_t *cptr_fp16 = (const float16_t *)cbuf_task;
const hfloat *cptr_fp16 = (const hfloat *)cbuf_task;
float *outptr = out + outofs;
const float *pbptr = fusedAddPtr0 ? fusedAddPtr0 + outofs : 0;

@ -62,10 +62,10 @@ struct FastConv
float* getWeights();
float* getWeightsWino();
std::vector<float16_t> weightsBuf_FP16;
std::vector<float16_t> weightsWinoBuf_FP16;
float16_t* getWeightsFP16();
float16_t* getWeightsWinoFP16();
std::vector<hfloat> weightsBuf_FP16;
std::vector<hfloat> weightsWinoBuf_FP16;
hfloat* getWeightsFP16();
hfloat* getWeightsWinoFP16();
int conv_type;
int conv_dim; // Flag for conv1d, conv2d, or conv3d.

@ -1742,12 +1742,12 @@ Mat getMatFromTensor(const opencv_onnx::TensorProto& tensor_proto)
#endif
const ::google::protobuf::RepeatedField<int32_t> field = tensor_proto.int32_data();
AutoBuffer<float16_t, 16> aligned_val;
AutoBuffer<hfloat, 16> aligned_val;
size_t sz = tensor_proto.int32_data().size();
aligned_val.allocate(sz);
float16_t* bufPtr = aligned_val.data();
hfloat* bufPtr = aligned_val.data();
float16_t *fp16Ptr = (float16_t *)field.data();
hfloat *fp16Ptr = (hfloat *)field.data();
for (int i = 0; i < sz; i++)
{
bufPtr[i] = fp16Ptr[i*2 + offset];
@ -1759,11 +1759,11 @@ Mat getMatFromTensor(const opencv_onnx::TensorProto& tensor_proto)
char* val = const_cast<char*>(tensor_proto.raw_data().c_str());
#if CV_STRONG_ALIGNMENT
// Aligned pointer is required.
AutoBuffer<float16_t, 16> aligned_val;
if (!isAligned<sizeof(float16_t)>(val))
AutoBuffer<hfloat, 16> aligned_val;
if (!isAligned<sizeof(hfloat)>(val))
{
size_t sz = tensor_proto.raw_data().size();
aligned_val.allocate(divUp(sz, sizeof(float16_t)));
aligned_val.allocate(divUp(sz, sizeof(hfloat)));
memcpy(aligned_val.data(), val, sz);
val = (char*)aligned_val.data();
}

Loading…
Cancel
Save