From b59f3989f58bf69b8c1145d1cb45511f7ebd07fe Mon Sep 17 00:00:00 2001
From: Vadim Pisarevsky
Date: Wed, 21 Sep 2016 18:13:17 +0300
Subject: [PATCH] fixed some compile warnings in dnn & protobuf; improved
 convolution layer performance when blas is not available by parallelizing
 gemmCPU() function in dnn

---
 .../src/google/protobuf/descriptor.cc        |  3 +-
 .../src/google/protobuf/stubs/strutil.cc     |  7 --
 .../src/google/protobuf/wire_format.cc       |  9 ---
 modules/dnn/samples/fcn_semsegm.cpp          |  3 +
 modules/dnn/src/caffe/layer_loaders.cpp      |  8 +-
 modules/dnn/src/layers/convolution_layer.cpp |  2 +-
 modules/dnn/src/layers/op_blas.cpp           | 78 +++++++++++++++++++
 7 files changed, 85 insertions(+), 25 deletions(-)
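Reviewer note: the new gemmCPU() path below splits the columns of
C = alpha*A*B + beta*C across threads with cv::parallel_for_. The
standalone sketch here reproduces that scheme in miniature and checks it
against cv::gemm; the name ColRangeBody, the test sizes, and the main()
harness are mine for illustration only and are not part of the patch
(GEMMInvoker additionally processes two output rows per pass, omitted
here for brevity).

#include <opencv2/core.hpp>
#include <cstdio>

using namespace cv;

// Each worker owns a contiguous band of output columns
// [range.start, range.end) of C and never touches the rest,
// so no synchronization is needed.
class ColRangeBody : public ParallelLoopBody
{
public:
    ColRangeBody(const Mat& A, const Mat& B, Mat& C, double alpha, double beta)
        : A_(A), B_(B), C_(C), alpha_(alpha), beta_(beta) {}

    void operator()(const Range& range) const
    {
        for( int m = 0; m < A_.rows; m++ )
        {
            float* dst = C_.ptr<float>(m);
            const float* arow = A_.ptr<float>(m);
            // scale the owned band of this row of C by beta ...
            for( int n = range.start; n < range.end; n++ )
                dst[n] *= (float)beta_;
            // ... then accumulate alpha*A*B over the same band
            for( int k = 0; k < A_.cols; k++ )
            {
                float a = (float)(alpha_*arow[k]);
                const float* brow = B_.ptr<float>(k);
                for( int n = range.start; n < range.end; n++ )
                    dst[n] += a*brow[n];
            }
        }
    }

    const Mat &A_, &B_;
    Mat& C_;
    double alpha_, beta_;
};

int main()
{
    Mat A(64, 128, CV_32F), B(128, 96, CV_32F), C(64, 96, CV_32F);
    randu(A, Scalar::all(-1), Scalar::all(1));
    randu(B, Scalar::all(-1), Scalar::all(1));
    randu(C, Scalar::all(-1), Scalar::all(1));
    Mat ref;
    gemm(A, B, 0.5, C, 2.0, ref);   // reference result on the original C
    parallel_for_(Range(0, B.cols), ColRangeBody(A, B, C, 0.5, 2.0));
    printf("max abs diff vs cv::gemm: %g\n", norm(ref, C, NORM_INF));
    return 0;
}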
diff --git a/modules/dnn/3rdparty/protobuf/src/google/protobuf/descriptor.cc b/modules/dnn/3rdparty/protobuf/src/google/protobuf/descriptor.cc
index c941aacca..3516f64a8 100644
--- a/modules/dnn/3rdparty/protobuf/src/google/protobuf/descriptor.cc
+++ b/modules/dnn/3rdparty/protobuf/src/google/protobuf/descriptor.cc
@@ -223,8 +223,7 @@ struct PointerStringPairHash {
   }
 
   // Used only by MSVC and platforms where hash_map is not available.
-  static const size_t bucket_size = 4;
-  static const size_t min_buckets = 8;
+  enum { bucket_size = 4, min_buckets = 8 };
   inline bool operator()(const PointerStringPair& a,
                          const PointerStringPair& b) const {
     if (a.first < b.first) return true;
diff --git a/modules/dnn/3rdparty/protobuf/src/google/protobuf/stubs/strutil.cc b/modules/dnn/3rdparty/protobuf/src/google/protobuf/stubs/strutil.cc
index 917b3e9fb..00d1bc633 100644
--- a/modules/dnn/3rdparty/protobuf/src/google/protobuf/stubs/strutil.cc
+++ b/modules/dnn/3rdparty/protobuf/src/google/protobuf/stubs/strutil.cc
@@ -729,13 +729,6 @@ char *FastHex32ToBuffer(uint32 value, char* buffer) {
   return InternalFastHexToBuffer(value, buffer, 8);
 }
 
-static inline char* PlaceNum(char* p, int num, char prev_sep) {
-  *p-- = '0' + num % 10;
-  *p-- = '0' + num / 10;
-  *p-- = prev_sep;
-  return p;
-}
-
 // ----------------------------------------------------------------------
 // FastInt32ToBufferLeft()
 // FastUInt32ToBufferLeft()
diff --git a/modules/dnn/3rdparty/protobuf/src/google/protobuf/wire_format.cc b/modules/dnn/3rdparty/protobuf/src/google/protobuf/wire_format.cc
index ead763b18..7fe8e37ad 100644
--- a/modules/dnn/3rdparty/protobuf/src/google/protobuf/wire_format.cc
+++ b/modules/dnn/3rdparty/protobuf/src/google/protobuf/wire_format.cc
@@ -53,15 +53,6 @@ namespace google {
 namespace protobuf {
 namespace internal {
 
-namespace {
-
-// This function turns out to be convenient when using some macros later.
-inline int GetEnumNumber(const EnumValueDescriptor* descriptor) {
-  return descriptor->number();
-}
-
-}  // anonymous namespace
-
 // ===================================================================
 
 bool UnknownFieldSetFieldSkipper::SkipField(
diff --git a/modules/dnn/samples/fcn_semsegm.cpp b/modules/dnn/samples/fcn_semsegm.cpp
index b519e12ca..bdeb75cd2 100755
--- a/modules/dnn/samples/fcn_semsegm.cpp
+++ b/modules/dnn/samples/fcn_semsegm.cpp
@@ -140,7 +140,10 @@ int main(int argc, char **argv)
     //! [Set input blob]
 
     //! [Make forward pass]
+    double t = (double)cv::getTickCount();
     net.forward();                          //compute output
+    t = (double)cv::getTickCount() - t;
+    printf("processing time: %.1fms\n", t*1000./getTickFrequency());
     //! [Make forward pass]
 
     //! [Gather output]
diff --git a/modules/dnn/src/caffe/layer_loaders.cpp b/modules/dnn/src/caffe/layer_loaders.cpp
index bd3b58592..35ec9082c 100644
--- a/modules/dnn/src/caffe/layer_loaders.cpp
+++ b/modules/dnn/src/caffe/layer_loaders.cpp
@@ -44,7 +44,7 @@ Ptr<Layer> createLayerFromCaffe(LayerParams &params)
 
 template<>
 Ptr<Layer> createLayerFromCaffe<PoolingLayer>(LayerParams &params)
 {
-    int type;
+    int type = PoolingLayer::MAX;
     Size kernel, stride, pad;
     bool globalPooling;
@@ -60,10 +60,6 @@ Ptr<Layer> createLayerFromCaffe<PoolingLayer>(LayerParams &params)
         else
             CV_Error(Error::StsBadArg, "Unknown pooling type \"" + pool + "\"");
     }
-    else
-    {
-        type = PoolingLayer::MAX;
-    }
     getPoolingKernelParams(params, kernel.height, kernel.width, globalPooling,
                            pad.height, pad.width, stride.height, stride.width);
     //getCaffeConvParams(params, kernel, pad, stride);
@@ -107,7 +103,7 @@
 template<> //LRNLayer specialization
 Ptr<Layer> createLayerFromCaffe<LRNLayer>(LayerParams& params)
 {
-    int type;
+    int type = -1;
     String nrmType = params.get<String>("norm_region", "ACROSS_CHANNELS");
     if (nrmType == "ACROSS_CHANNELS")
         type = LRNLayer::CHANNEL_NRM;
diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp
index efb54a727..860c871c8 100644
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@@ -55,7 +55,7 @@ namespace dnn
 
 ConvolutionLayerImpl::ConvolutionLayerImpl()
 {
-    tryUseOpenCL = true;
+    tryUseOpenCL = false; //true;
     numOutput = -1;
     group = -1;
diff --git a/modules/dnn/src/layers/op_blas.cpp b/modules/dnn/src/layers/op_blas.cpp
index 4b16aa11b..375d36563 100644
--- a/modules/dnn/src/layers/op_blas.cpp
+++ b/modules/dnn/src/layers/op_blas.cpp
@@ -28,8 +28,85 @@ inline void SwapRowCols(const Mat &A, int &rows, int &cols, bool isTrans)
     cols = (isTrans) ? A.rows : A.cols;
 }
 
+
+class GEMMInvoker : public ParallelLoopBody
+{
+public:
+    GEMMInvoker(const Mat* _a, const Mat* _b, double _alpha, Mat* _c, double _beta)
+    {
+        a = _a;
+        b = _b;
+        c = _c;
+        alpha = _alpha;
+        beta = _beta;
+    }
+
+    void operator()(const Range& range) const
+    {
+        int mmax = a->rows;
+        int nmax = range.end - range.start;
+        int kmax = a->cols;
+        int m, n, k;
+        AutoBuffer<float> buf(nmax);
+        float* ptr = buf;
+        if( mmax % 2 != 0 )
+            memset(ptr, 0, nmax*sizeof(ptr[0]));
+
+        for( m = 0; m < mmax; m += 2 )
+        {
+            float* dst0 = c->ptr<float>(m) + range.start;
+            float* dst1 = m+1 < mmax ? c->ptr<float>(m+1) + range.start : ptr;
+            const float* aptr0 = a->ptr<float>(m);
+            const float* aptr1 = m+1 < mmax ? a->ptr<float>(m+1) : aptr0;
+
+            if( beta != 1 )
+            {
+                if( beta == 0 )
+                    for( n = 0; n < nmax; n++ )
+                    {
+                        dst0[n] = 0.f;
+                        dst1[n] = 0.f;
+                    }
+                else
+                    for( n = 0; n < nmax; n++ )
+                    {
+                        dst0[n] *= (float)beta;
+                        dst1[n] *= (float)beta;
+                    }
+            }
+
+            for( k = 0; k < kmax; k++ )
+            {
+                float alpha0 = (float)(alpha*aptr0[k]);
+                float alpha1 = (float)(alpha*aptr1[k]);
+                const float* bptr = b->ptr<float>(k) + range.start;
+
+                for( n = 0; n < nmax; n++ )
+                {
+                    float d0 = dst0[n] + alpha0*bptr[n];
+                    float d1 = dst1[n] + alpha1*bptr[n];
+                    dst0[n] = d0;
+                    dst1[n] = d1;
+                }
+            }
+        }
+    }
+
+    const Mat *a, *b;
+    Mat* c;
+    double alpha, beta;
+};
+
 void gemmCPU(const Mat &A, const Mat &B, double alpha, Mat &C, double beta, int flags /*= 0*/)
 {
+    if( C.type() == CV_32F && flags == 0 )
+    {
+        GEMMInvoker invoker(&A, &B, alpha, &C, beta);
+        double granularity = 10000000./((double)A.rows*A.cols);
+        parallel_for_(Range(0, B.cols), invoker, granularity);
+    }
+    else
+    {
 #if HAVE_CBLAS
     bool transA = static_cast<bool>(flags & GEMM_1_T);
     bool transB = static_cast<bool>(flags & GEMM_2_T);
@@ -70,6 +147,7 @@ void gemmCPU(const Mat &A, const Mat &B, double alpha, Mat &C, double beta, int
 #else
     cv::gemm(A, B, alpha, C, beta, C, flags);
 #endif
+    }
 }
 
 int getBlasThreads()
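P.S. On the granularity constant, as I read cv::parallel_for_: the third
argument is the requested number of stripes the Range is split into. So
for A of size 256x1024 the new code asks for about 1e7/(256*1024) ~ 38
stripes over the columns of B, while matrices with A.rows*A.cols near or
above 1e7 collapse to a single stripe and run effectively sequentially.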