diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index f9cbaf4520..4e65c38df7 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -2744,8 +2744,7 @@ struct Net::Impl : public detail::NetImplBase // (and so we eliminate the concatenation layer, because the channels // are concatenated implicitly). Ptr concatLayer = ld.layerInstance.dynamicCast(); - if( !concatLayer.empty() && concatLayer->axis == 1 && !concatLayer->padding && - ld.outputBlobs.size() == 1 ) + if( !concatLayer.empty() && !concatLayer->padding && ld.outputBlobs.size() == 1 ) { Mat& output = ld.outputBlobs[0]; UMat umat_output; @@ -2782,7 +2781,8 @@ struct Net::Impl : public detail::NetImplBase // the concatenation optimization is applied with batch_size > 1. // so, for now, we only apply this optimization in the most popular // case batch_size == 1. - if( output.dims == 4 && output.size[0] == 1 ) + int axis = clamp(concatLayer->axis, output.dims); + if( output.total(0, axis) == 1 ) { size_t i, ninputs = ld.inputBlobsId.size(); std::vector realinputs(ninputs); @@ -2836,18 +2836,20 @@ struct Net::Impl : public detail::NetImplBase OpenCLBackendWrapper::update(ld.outputBlobsWrappers, umats); } #endif + #ifdef HAVE_CUDA if (preferableBackend == DNN_BACKEND_CUDA) ld.outputBlobsWrappers[0] = wrap(output); #endif - Range chrange[] = { Range::all(), Range::all(), Range::all(), Range::all() }; + std::vector chrange(output.dims, Range::all()); + int ofs = 0; for( i = 0; i < ninputs; i++ ) { LayerPin pin = realinputs[i]; LayerData* inp_i_data = &layers[pin.lid]; - int channels_i = ld.inputBlobs[i]->size[1]; - chrange[1] = Range(ofs, ofs + channels_i); + int channels_i = ld.inputBlobs[i]->size[axis]; + chrange[axis] = Range(ofs, ofs + channels_i); printf_(("\toutput %s(%d) to channels (%d, %d)\n", inp_i_data->layerInstance->name.c_str(), pin.oid, ofs, ofs + channels_i)); ofs += channels_i; diff --git a/modules/features2d/src/sift.simd.hpp b/modules/features2d/src/sift.simd.hpp index c0f9b5b1e2..fefed638c5 100644 --- a/modules/features2d/src/sift.simd.hpp +++ b/modules/features2d/src/sift.simd.hpp @@ -167,9 +167,23 @@ float calcOrientationHist( int i, j, k, len = (radius*2+1)*(radius*2+1); float expf_scale = -1.f/(2.f * sigma * sigma); +#if CV_SIMD + AutoBuffer bufX(len + v_float32::nlanes); + AutoBuffer bufY(len + v_float32::nlanes); + AutoBuffer bufO(len + v_float32::nlanes); + AutoBuffer bufW(len + v_float32::nlanes); + AutoBuffer bufT(n+4 + v_float32::nlanes); + float *X = alignPtr(bufX.data(), CV_SIMD_WIDTH); + float *Y = alignPtr(bufY.data(), CV_SIMD_WIDTH); + float *Mag = X; + float *Ori = alignPtr(bufO.data(), CV_SIMD_WIDTH); + float *W = alignPtr(bufW.data(), CV_SIMD_WIDTH); + float *temphist = alignPtr(bufT.data(), CV_SIMD_WIDTH)+2; +#else AutoBuffer buf(len*4 + n+4); float *X = buf.data(), *Y = X + len, *Mag = X, *Ori = Y + len, *W = Ori + len; float* temphist = W + len + 2; +#endif for( i = 0; i < n; i++ ) temphist[i] = 0.f; @@ -201,32 +215,29 @@ float calcOrientationHist( cv::hal::magnitude32f(X, Y, Mag, len); k = 0; -#if CV_AVX2 +#if CV_SIMD + const int vecsize = v_float32::nlanes; + v_float32 nd360 = vx_setall_f32(n/360.f); + v_int32 __n = vx_setall_s32(n); + int CV_DECL_ALIGNED(CV_SIMD_WIDTH) bin_buf[vecsize]; + float CV_DECL_ALIGNED(CV_SIMD_WIDTH) w_mul_mag_buf[vecsize]; + + for( ; k <= len - vecsize; k += vecsize ) { - __m256 __nd360 = _mm256_set1_ps(n/360.f); - __m256i __n = _mm256_set1_epi32(n); - int CV_DECL_ALIGNED(32) bin_buf[8]; - float CV_DECL_ALIGNED(32) w_mul_mag_buf[8]; - for ( ; k <= len - 8; k+=8 ) + v_float32 w = vx_load_aligned( W + k ); + v_float32 mag = vx_load_aligned( Mag + k ); + v_float32 ori = vx_load_aligned( Ori + k ); + v_int32 bin = v_round( nd360 * ori ); + + bin = v_select(bin >= __n, bin - __n, bin); + bin = v_select(bin < vx_setzero_s32(), bin + __n, bin); + + w = w * mag; + v_store_aligned(bin_buf, bin); + v_store_aligned(w_mul_mag_buf, w); + for(int vi = 0; vi < vecsize; vi++) { - __m256i __bin = _mm256_cvtps_epi32(_mm256_mul_ps(__nd360, _mm256_loadu_ps(&Ori[k]))); - - __bin = _mm256_sub_epi32(__bin, _mm256_andnot_si256(_mm256_cmpgt_epi32(__n, __bin), __n)); - __bin = _mm256_add_epi32(__bin, _mm256_and_si256(__n, _mm256_cmpgt_epi32(_mm256_setzero_si256(), __bin))); - - __m256 __w_mul_mag = _mm256_mul_ps(_mm256_loadu_ps(&W[k]), _mm256_loadu_ps(&Mag[k])); - - _mm256_store_si256((__m256i *) bin_buf, __bin); - _mm256_store_ps(w_mul_mag_buf, __w_mul_mag); - - temphist[bin_buf[0]] += w_mul_mag_buf[0]; - temphist[bin_buf[1]] += w_mul_mag_buf[1]; - temphist[bin_buf[2]] += w_mul_mag_buf[2]; - temphist[bin_buf[3]] += w_mul_mag_buf[3]; - temphist[bin_buf[4]] += w_mul_mag_buf[4]; - temphist[bin_buf[5]] += w_mul_mag_buf[5]; - temphist[bin_buf[6]] += w_mul_mag_buf[6]; - temphist[bin_buf[7]] += w_mul_mag_buf[7]; + temphist[bin_buf[vi]] += w_mul_mag_buf[vi]; } } #endif @@ -247,34 +258,20 @@ float calcOrientationHist( temphist[n+1] = temphist[1]; i = 0; -#if CV_AVX2 +#if CV_SIMD + v_float32 d_1_16 = vx_setall_f32(1.f/16.f); + v_float32 d_4_16 = vx_setall_f32(4.f/16.f); + v_float32 d_6_16 = vx_setall_f32(6.f/16.f); + for( ; i <= n - v_float32::nlanes; i += v_float32::nlanes ) { - __m256 __d_1_16 = _mm256_set1_ps(1.f/16.f); - __m256 __d_4_16 = _mm256_set1_ps(4.f/16.f); - __m256 __d_6_16 = _mm256_set1_ps(6.f/16.f); - for( ; i <= n - 8; i+=8 ) - { -#if CV_FMA3 - __m256 __hist = _mm256_fmadd_ps( - _mm256_add_ps(_mm256_loadu_ps(&temphist[i-2]), _mm256_loadu_ps(&temphist[i+2])), - __d_1_16, - _mm256_fmadd_ps( - _mm256_add_ps(_mm256_loadu_ps(&temphist[i-1]), _mm256_loadu_ps(&temphist[i+1])), - __d_4_16, - _mm256_mul_ps(_mm256_loadu_ps(&temphist[i]), __d_6_16))); -#else - __m256 __hist = _mm256_add_ps( - _mm256_mul_ps( - _mm256_add_ps(_mm256_loadu_ps(&temphist[i-2]), _mm256_loadu_ps(&temphist[i+2])), - __d_1_16), - _mm256_add_ps( - _mm256_mul_ps( - _mm256_add_ps(_mm256_loadu_ps(&temphist[i-1]), _mm256_loadu_ps(&temphist[i+1])), - __d_4_16), - _mm256_mul_ps(_mm256_loadu_ps(&temphist[i]), __d_6_16))); -#endif - _mm256_storeu_ps(&hist[i], __hist); - } + v_float32 tn2 = vx_load_aligned(temphist + i-2); + v_float32 tn1 = vx_load(temphist + i-1); + v_float32 t0 = vx_load(temphist + i); + v_float32 t1 = vx_load(temphist + i+1); + v_float32 t2 = vx_load(temphist + i+2); + v_float32 _hist = v_fma(tn2 + t2, d_1_16, + v_fma(tn1 + t1, d_4_16, t0 * d_6_16)); + v_store(hist + i, _hist); } #endif for( ; i < n; i++ ) @@ -623,91 +620,65 @@ void calcSIFTDescriptor( cv::hal::exp32f(W, W, len); k = 0; -#if CV_AVX2 +#if CV_SIMD { - int CV_DECL_ALIGNED(32) idx_buf[8]; - float CV_DECL_ALIGNED(32) rco_buf[64]; - const __m256 __ori = _mm256_set1_ps(ori); - const __m256 __bins_per_rad = _mm256_set1_ps(bins_per_rad); - const __m256i __n = _mm256_set1_epi32(n); - for( ; k <= len - 8; k+=8 ) + const int vecsize = v_float32::nlanes; + int CV_DECL_ALIGNED(CV_SIMD_WIDTH) idx_buf[vecsize]; + float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rco_buf[8*vecsize]; + const v_float32 __ori = vx_setall_f32(ori); + const v_float32 __bins_per_rad = vx_setall_f32(bins_per_rad); + const v_int32 __n = vx_setall_s32(n); + const v_int32 __1 = vx_setall_s32(1); + const v_int32 __d_plus_2 = vx_setall_s32(d+2); + const v_int32 __n_plus_2 = vx_setall_s32(n+2); + for( ; k <= len - vecsize; k += vecsize ) { - __m256 __rbin = _mm256_loadu_ps(&RBin[k]); - __m256 __cbin = _mm256_loadu_ps(&CBin[k]); - __m256 __obin = _mm256_mul_ps(_mm256_sub_ps(_mm256_loadu_ps(&Ori[k]), __ori), __bins_per_rad); - __m256 __mag = _mm256_mul_ps(_mm256_loadu_ps(&Mag[k]), _mm256_loadu_ps(&W[k])); - - __m256 __r0 = _mm256_floor_ps(__rbin); - __rbin = _mm256_sub_ps(__rbin, __r0); - __m256 __c0 = _mm256_floor_ps(__cbin); - __cbin = _mm256_sub_ps(__cbin, __c0); - __m256 __o0 = _mm256_floor_ps(__obin); - __obin = _mm256_sub_ps(__obin, __o0); - - __m256i __o0i = _mm256_cvtps_epi32(__o0); - __o0i = _mm256_add_epi32(__o0i, _mm256_and_si256(__n, _mm256_cmpgt_epi32(_mm256_setzero_si256(), __o0i))); - __o0i = _mm256_sub_epi32(__o0i, _mm256_andnot_si256(_mm256_cmpgt_epi32(__n, __o0i), __n)); - - __m256 __v_r1 = _mm256_mul_ps(__mag, __rbin); - __m256 __v_r0 = _mm256_sub_ps(__mag, __v_r1); - - __m256 __v_rc11 = _mm256_mul_ps(__v_r1, __cbin); - __m256 __v_rc10 = _mm256_sub_ps(__v_r1, __v_rc11); - - __m256 __v_rc01 = _mm256_mul_ps(__v_r0, __cbin); - __m256 __v_rc00 = _mm256_sub_ps(__v_r0, __v_rc01); - - __m256 __v_rco111 = _mm256_mul_ps(__v_rc11, __obin); - __m256 __v_rco110 = _mm256_sub_ps(__v_rc11, __v_rco111); - - __m256 __v_rco101 = _mm256_mul_ps(__v_rc10, __obin); - __m256 __v_rco100 = _mm256_sub_ps(__v_rc10, __v_rco101); - - __m256 __v_rco011 = _mm256_mul_ps(__v_rc01, __obin); - __m256 __v_rco010 = _mm256_sub_ps(__v_rc01, __v_rco011); - - __m256 __v_rco001 = _mm256_mul_ps(__v_rc00, __obin); - __m256 __v_rco000 = _mm256_sub_ps(__v_rc00, __v_rco001); - - __m256i __one = _mm256_set1_epi32(1); - __m256i __idx = _mm256_add_epi32( - _mm256_mullo_epi32( - _mm256_add_epi32( - _mm256_mullo_epi32(_mm256_add_epi32(_mm256_cvtps_epi32(__r0), __one), _mm256_set1_epi32(d + 2)), - _mm256_add_epi32(_mm256_cvtps_epi32(__c0), __one)), - _mm256_set1_epi32(n + 2)), - __o0i); - - _mm256_store_si256((__m256i *)idx_buf, __idx); - - _mm256_store_ps(&(rco_buf[0]), __v_rco000); - _mm256_store_ps(&(rco_buf[8]), __v_rco001); - _mm256_store_ps(&(rco_buf[16]), __v_rco010); - _mm256_store_ps(&(rco_buf[24]), __v_rco011); - _mm256_store_ps(&(rco_buf[32]), __v_rco100); - _mm256_store_ps(&(rco_buf[40]), __v_rco101); - _mm256_store_ps(&(rco_buf[48]), __v_rco110); - _mm256_store_ps(&(rco_buf[56]), __v_rco111); - #define HIST_SUM_HELPER(id) \ - hist[idx_buf[(id)]] += rco_buf[(id)]; \ - hist[idx_buf[(id)]+1] += rco_buf[8 + (id)]; \ - hist[idx_buf[(id)]+(n+2)] += rco_buf[16 + (id)]; \ - hist[idx_buf[(id)]+(n+3)] += rco_buf[24 + (id)]; \ - hist[idx_buf[(id)]+(d+2)*(n+2)] += rco_buf[32 + (id)]; \ - hist[idx_buf[(id)]+(d+2)*(n+2)+1] += rco_buf[40 + (id)]; \ - hist[idx_buf[(id)]+(d+3)*(n+2)] += rco_buf[48 + (id)]; \ - hist[idx_buf[(id)]+(d+3)*(n+2)+1] += rco_buf[56 + (id)]; - - HIST_SUM_HELPER(0); - HIST_SUM_HELPER(1); - HIST_SUM_HELPER(2); - HIST_SUM_HELPER(3); - HIST_SUM_HELPER(4); - HIST_SUM_HELPER(5); - HIST_SUM_HELPER(6); - HIST_SUM_HELPER(7); - - #undef HIST_SUM_HELPER + v_float32 rbin = vx_load(RBin + k); + v_float32 cbin = vx_load(CBin + k); + v_float32 obin = (vx_load(Ori + k) - __ori) * __bins_per_rad; + v_float32 mag = vx_load(Mag + k) * vx_load(W + k); + + v_int32 r0 = v_floor(rbin); + v_int32 c0 = v_floor(cbin); + v_int32 o0 = v_floor(obin); + rbin -= v_cvt_f32(r0); + cbin -= v_cvt_f32(c0); + obin -= v_cvt_f32(o0); + + o0 = v_select(o0 < vx_setzero_s32(), o0 + __n, o0); + o0 = v_select(o0 >= __n, o0 - __n, o0); + + v_float32 v_r1 = mag*rbin, v_r0 = mag - v_r1; + v_float32 v_rc11 = v_r1*cbin, v_rc10 = v_r1 - v_rc11; + v_float32 v_rc01 = v_r0*cbin, v_rc00 = v_r0 - v_rc01; + v_float32 v_rco111 = v_rc11*obin, v_rco110 = v_rc11 - v_rco111; + v_float32 v_rco101 = v_rc10*obin, v_rco100 = v_rc10 - v_rco101; + v_float32 v_rco011 = v_rc01*obin, v_rco010 = v_rc01 - v_rco011; + v_float32 v_rco001 = v_rc00*obin, v_rco000 = v_rc00 - v_rco001; + + v_int32 idx = v_fma(v_fma(r0+__1, __d_plus_2, c0+__1), __n_plus_2, o0); + v_store_aligned(idx_buf, idx); + + v_store_aligned(rco_buf, v_rco000); + v_store_aligned(rco_buf+vecsize, v_rco001); + v_store_aligned(rco_buf+vecsize*2, v_rco010); + v_store_aligned(rco_buf+vecsize*3, v_rco011); + v_store_aligned(rco_buf+vecsize*4, v_rco100); + v_store_aligned(rco_buf+vecsize*5, v_rco101); + v_store_aligned(rco_buf+vecsize*6, v_rco110); + v_store_aligned(rco_buf+vecsize*7, v_rco111); + + for(int id = 0; id < vecsize; id++) + { + hist[idx_buf[id]] += rco_buf[id]; + hist[idx_buf[id]+1] += rco_buf[vecsize + id]; + hist[idx_buf[id]+(n+2)] += rco_buf[2*vecsize + id]; + hist[idx_buf[id]+(n+3)] += rco_buf[3*vecsize + id]; + hist[idx_buf[id]+(d+2)*(n+2)] += rco_buf[4*vecsize + id]; + hist[idx_buf[id]+(d+2)*(n+2)+1] += rco_buf[5*vecsize + id]; + hist[idx_buf[id]+(d+3)*(n+2)] += rco_buf[6*vecsize + id]; + hist[idx_buf[id]+(d+3)*(n+2)+1] += rco_buf[7*vecsize + id]; + } } } #endif @@ -766,23 +737,16 @@ void calcSIFTDescriptor( float nrm2 = 0; len = d*d*n; k = 0; -#if CV_AVX2 +#if CV_SIMD { - float CV_DECL_ALIGNED(32) nrm2_buf[8]; - __m256 __nrm2 = _mm256_setzero_ps(); - __m256 __dst; - for( ; k <= len - 8; k += 8 ) + v_float32 __nrm2 = vx_setzero_f32(); + v_float32 __dst; + for( ; k <= len - v_float32::nlanes; k += v_float32::nlanes ) { - __dst = _mm256_loadu_ps(&dst[k]); -#if CV_FMA3 - __nrm2 = _mm256_fmadd_ps(__dst, __dst, __nrm2); -#else - __nrm2 = _mm256_add_ps(__nrm2, _mm256_mul_ps(__dst, __dst)); -#endif + __dst = vx_load(dst + k); + __nrm2 = v_fma(__dst, __dst, __nrm2); } - _mm256_store_ps(nrm2_buf, __nrm2); - nrm2 = nrm2_buf[0] + nrm2_buf[1] + nrm2_buf[2] + nrm2_buf[3] + - nrm2_buf[4] + nrm2_buf[5] + nrm2_buf[6] + nrm2_buf[7]; + nrm2 = (float)v_reduce_sum(__nrm2); } #endif for( ; k < len; k++ ) @@ -795,7 +759,7 @@ void calcSIFTDescriptor( // This code cannot be enabled because it sums nrm2 in a different order, // thus producing slightly different results { - float CV_DECL_ALIGNED(32) nrm2_buf[8]; + float CV_DECL_ALIGNED(CV_SIMD_WIDTH) nrm2_buf[8]; __m256 __dst; __m256 __nrm2 = _mm256_setzero_ps(); __m256 __thr = _mm256_set1_ps(thr); @@ -825,17 +789,17 @@ void calcSIFTDescriptor( #if 1 k = 0; -#if CV_AVX2 +#if CV_SIMD { - __m256 __dst; - __m256 __min = _mm256_setzero_ps(); - __m256 __max = _mm256_set1_ps(255.0f); // max of uchar - __m256 __nrm2 = _mm256_set1_ps(nrm2); - for( k = 0; k <= len - 8; k+=8 ) + v_float32 __dst; + v_float32 __min = vx_setzero_f32(); + v_float32 __max = vx_setall_f32(255.0f); // max of uchar + v_float32 __nrm2 = vx_setall_f32(nrm2); + for( k = 0; k <= len - v_float32::nlanes; k += v_float32::nlanes ) { - __dst = _mm256_loadu_ps(&dst[k]); - __dst = _mm256_min_ps(_mm256_max_ps(_mm256_round_ps(_mm256_mul_ps(__dst, __nrm2), _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC), __min), __max); - _mm256_storeu_ps(&dst[k], __dst); + __dst = vx_load(dst + k); + __dst = v_min(v_max(v_cvt_f32(v_round(__dst * __nrm2)), __min), __max); + v_store(dst + k, __dst); } } #endif diff --git a/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h b/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h index a52166d3c4..c068e184bb 100644 --- a/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h +++ b/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h @@ -547,7 +547,7 @@ public: void findNeighbors(ResultSet& result, const ElementType* vec, const SearchParams& searchParams) CV_OVERRIDE { - int maxChecks = get_param(searchParams,"checks",32); + const int maxChecks = get_param(searchParams,"checks",32); // Priority queue storing intermediate branches in the best-bin-first search Heap* heap = new Heap((int)size_); @@ -556,6 +556,8 @@ public: int checks = 0; for (int i=0; i= maxChecks) && result.full()) + break; } BranchSt branch; @@ -747,8 +749,8 @@ private: Heap* heap, std::vector& checked) { if (node->childs==NULL) { - if (checks>=maxChecks) { - if (result.full()) return; + if ((checks>=maxChecks) && result.full()) { + return; } for (int i=0; isize; ++i) { int index = node->indices[i]; diff --git a/modules/imgproc/perf/opencl/perf_filters.cpp b/modules/imgproc/perf/opencl/perf_filters.cpp index b4e29ae67f..a179f7d009 100644 --- a/modules/imgproc/perf/opencl/perf_filters.cpp +++ b/modules/imgproc/perf/opencl/perf_filters.cpp @@ -313,6 +313,62 @@ OCL_PERF_TEST_P(Filter2DFixture, Filter2D, SANITY_CHECK(dst, eps); } +///////////// SepFilter2D ///////////// + +typedef FilterFixture OCL_SepFilter2D; + +PERF_TEST_P_(OCL_SepFilter2D, SepFilter2D) +{ + const FilterParams& params = GetParam(); + const Size srcSize = get<0>(params); + const int type = get<1>(params), ksize = get<2>(params); + + checkDeviceMaxMemoryAllocSize(srcSize, type); + + UMat src(srcSize, type), dst(srcSize, type); + declare.in(src, WARMUP_RNG).out(dst); + + Mat kernelX(1, ksize, CV_32FC1); + randu(kernelX, -3.0, 3.0); + Mat kernelY(1, ksize, CV_32FC1); + randu(kernelY, -3.0, 3.0); + + OCL_TEST_CYCLE() cv::sepFilter2D(src, dst, -1, kernelX, kernelY, cv::Point(-1, -1), 1.0f, cv::BORDER_CONSTANT); + + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P_(OCL_SepFilter2D, SepFilter2D_BitExact) +{ + const FilterParams& params = GetParam(); + const Size srcSize = get<0>(params); + const int type = get<1>(params), ksize = get<2>(params); + + checkDeviceMaxMemoryAllocSize(srcSize, type); + + UMat src(srcSize, type), dst(srcSize, type); + declare.in(src, WARMUP_RNG).out(dst); + + Mat kernelX(1, ksize, CV_32SC1); + randu(kernelX, -16.0, 16.0); + kernelX.convertTo(kernelX, CV_32FC1, 1/16.0f, 0); + Mat kernelY(1, ksize, CV_32SC1); + randu(kernelY, -16.0, 16.0); + kernelY.convertTo(kernelY, CV_32FC1, 1/16.0f, 0); + + OCL_TEST_CYCLE() cv::sepFilter2D(src, dst, -1, kernelX, kernelY, cv::Point(-1, -1), 1.0f, cv::BORDER_CONSTANT); + + SANITY_CHECK_NOTHING(); +} + +INSTANTIATE_TEST_CASE_P(/*nothing*/, OCL_SepFilter2D, + ::testing::Combine( + ::testing::Values(sz1080p), + OCL_TEST_TYPES, + OCL_PERF_ENUM(3, 5, 7, 9, 11) + ) +); + ///////////// Bilateral //////////////////////// typedef TestBaseWithParam BilateralFixture; diff --git a/modules/imgproc/src/filter.dispatch.cpp b/modules/imgproc/src/filter.dispatch.cpp index d39c749121..c9d1bb457c 100644 --- a/modules/imgproc/src/filter.dispatch.cpp +++ b/modules/imgproc/src/filter.dispatch.cpp @@ -729,11 +729,12 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth, return k.run(2, globalsize, localsize, false); } -const int shift_bits = 8; - static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX, int anchor, - int borderType, int ddepth, bool fast8uc1, bool int_arithm) + int borderType, int ddepth, bool fast8uc1, + bool int_arithm, int shift_bits) { + CV_Assert(shift_bits == 0 || int_arithm); + int type = src.type(), cn = CV_MAT_CN(type), sdepth = CV_MAT_DEPTH(type); bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; Size bufSize = buf.size(); @@ -801,8 +802,11 @@ static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX return k.run(2, globalsize, localsize, false); } -static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, double delta, int anchor, bool int_arithm) +static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, double delta, int anchor, + bool int_arithm, int shift_bits) { + CV_Assert(shift_bits == 0 || int_arithm); + bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; if (dst.depth() == CV_64F && !doubleSupport) return false; @@ -821,13 +825,16 @@ static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY globalsize[1] = DIVUP(sz.height, localsize[1]) * localsize[1]; globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0]; - char cvt[40]; + char cvt[2][40]; + int floatT = std::max(CV_32F, bdepth); cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d" - " -D srcT=%s -D dstT=%s -D convertToDstT=%s" + " -D srcT=%s -D dstT=%s -D convertToFloatT=%s -D floatT=%s -D convertToDstT=%s" " -D srcT1=%s -D dstT1=%s -D SHIFT_BITS=%d%s%s", anchor, (int)localsize[0], (int)localsize[1], cn, ocl::typeToStr(buf_type), ocl::typeToStr(dtype), - ocl::convertTypeStr(bdepth, ddepth, cn, cvt), + ocl::convertTypeStr(bdepth, floatT, cn, cvt[0]), + ocl::typeToStr(CV_MAKETYPE(floatT, cn)), + ocl::convertTypeStr(shift_bits ? floatT : bdepth, ddepth, cn, cvt[1]), ocl::typeToStr(bdepth), ocl::typeToStr(ddepth), 2*shift_bits, doubleSupport ? " -D DOUBLE_SUPPORT" : "", int_arithm ? " -D INTEGER_ARITHMETIC" : ""); @@ -839,7 +846,7 @@ static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY return false; k.args(ocl::KernelArg::ReadOnly(buf), ocl::KernelArg::WriteOnly(dst), - static_cast(delta)); + static_cast(delta * (1u << (2 * shift_bits)))); return k.run(2, globalsize, localsize, false); } @@ -848,16 +855,21 @@ const int optimizedSepFilterLocalWidth = 16; const int optimizedSepFilterLocalHeight = 8; static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, - Mat row_kernel, Mat col_kernel, - double delta, int borderType, int ddepth, int bdepth, bool int_arithm) + const Mat& kernelX_, const Mat& kernelY_, + double delta, int borderType, int ddepth, int bdepth, + bool int_arithm, int shift_bits) { - Size size = _src.size(), wholeSize; - Point origin; + //CV_Assert(shift_bits == 0 || int_arithm); + + const ocl::Device& d = ocl::Device::getDefault(); + + Size size = _src.size(); int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype), esz = CV_ELEM_SIZE(stype), wdepth = std::max(std::max(sdepth, ddepth), bdepth), dtype = CV_MAKE_TYPE(ddepth, cn); size_t src_step = _src.step(), src_offset = _src.offset(); - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; + + bool doubleSupport = d.doubleFPConfig() > 0; if (esz == 0 || src_step == 0 || (src_offset % src_step) % esz != 0 @@ -869,6 +881,13 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, || borderType == BORDER_REFLECT_101)) return false; + Mat kernelX, kernelY; + kernelX_.convertTo(kernelX, wdepth); + if (kernelX_.data != kernelY_.data) + kernelY_.convertTo(kernelY, wdepth); + else + kernelY = kernelX; + size_t lt2[2] = { optimizedSepFilterLocalWidth, optimizedSepFilterLocalHeight }; size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), lt2[1]}; @@ -879,9 +898,9 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s" " -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s" " -D %s -D srcT1=%s -D dstT1=%s -D WT1=%s -D CN=%d -D SHIFT_BITS=%d%s", - (int)lt2[0], (int)lt2[1], row_kernel.cols / 2, col_kernel.cols / 2, - ocl::kernelToStr(row_kernel, wdepth, "KERNEL_MATRIX_X").c_str(), - ocl::kernelToStr(col_kernel, wdepth, "KERNEL_MATRIX_Y").c_str(), + (int)lt2[0], (int)lt2[1], kernelX.cols / 2, kernelY.cols / 2, + ocl::kernelToStr(kernelX, wdepth, "KERNEL_MATRIX_X").c_str(), + ocl::kernelToStr(kernelY, wdepth, "KERNEL_MATRIX_Y").c_str(), ocl::typeToStr(stype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), ocl::typeToStr(dtype), ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType], @@ -896,21 +915,30 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, _dst.create(size, dtype); UMat dst = _dst.getUMat(); - int src_offset_x = static_cast((src_offset % src_step) / esz); - int src_offset_y = static_cast(src_offset / src_step); + // TODO Future: emit error on inplace processing + //CV_Assert(src.u != dst.u && "Inplace processing is not allowed with UMat"); + if (src.u == dst.u) + { + CV_LOG_ONCE_WARNING(NULL, "sepFilter2D: inplace arguments are not allowed for non-inplace operations. Performance impact warning."); + src = src.clone(); + } + Size wholeSize; + Point origin; src.locateROI(wholeSize, origin); - k.args(ocl::KernelArg::PtrReadOnly(src), (int)src_step, src_offset_x, src_offset_y, + k.args(ocl::KernelArg::PtrReadOnly(src), (int)src_step, origin.x, origin.y, wholeSize.height, wholeSize.width, ocl::KernelArg::WriteOnly(dst), - static_cast(delta)); + static_cast(delta * (1u << (2 * shift_bits)))); return k.run(2, gt2, lt2, false); } -bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, - InputArray _kernelX, InputArray _kernelY, Point anchor, - double delta, int borderType ) +bool ocl_sepFilter2D( + InputArray _src, OutputArray _dst, int ddepth, + InputArray _kernelX, InputArray _kernelY, Point anchor, + double delta, int borderType +) { const ocl::Device & d = ocl::Device::getDefault(); Size imgSize = _src.size(); @@ -934,59 +962,152 @@ bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, if (anchor.y < 0) anchor.y = kernelY.cols >> 1; - int rtype = getKernelType(kernelX, - kernelX.rows == 1 ? Point(anchor.x, 0) : Point(0, anchor.x)); - int ctype = getKernelType(kernelY, - kernelY.rows == 1 ? Point(anchor.y, 0) : Point(0, anchor.y)); - int bdepth = CV_32F; bool int_arithm = false; - if( sdepth == CV_8U && ddepth == CV_8U && - rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL && - ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL) + int shift_bits = 0; + + while (sdepth == CV_8U && ddepth == CV_8U) { - if (ocl::Device::getDefault().isIntel()) + int bits_ = 8; + if (delta * 256.0f != (float)(int)(delta * 256)) { - for (int i=0; i(0, i) = (float) cvRound(kernelX.at(0, i) * (1 << shift_bits)); - if (kernelX.data != kernelY.data) - for (int i=0; i(0, i) = (float) cvRound(kernelY.at(0, i) * (1 << shift_bits)); - } else + CV_LOG_DEBUG(NULL, "ocl_sepFilter2D: bit-exact delta can't be applied: delta=" << delta); + break; + } + Mat kernelX_BitExact, kernelY_BitExact; + bool isValidBitExactRowKernel = createBitExactKernel_32S(kernelX, kernelX_BitExact, bits_); + bool isValidBitExactColumnKernel = createBitExactKernel_32S(kernelY, kernelY_BitExact, bits_); + if (!isValidBitExactRowKernel) + { + CV_LOG_DEBUG(NULL, "ocl_sepFilter2D: bit-exact row-kernel can't be applied: ksize=" << kernelX_BitExact.total()); + } + else if (!isValidBitExactColumnKernel) + { + CV_LOG_DEBUG(NULL, "ocl_sepFilter2D: bit-exact column-kernel can't be applied: ksize=" << kernelY_BitExact.total()); + } + else { bdepth = CV_32S; - kernelX.convertTo( kernelX, bdepth, 1 << shift_bits ); - kernelY.convertTo( kernelY, bdepth, 1 << shift_bits ); + shift_bits = bits_; + int_arithm = true; + + kernelX = kernelX_BitExact; + kernelY = kernelY_BitExact; } - int_arithm = true; + break; } - CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 && - imgSize.width > optimizedSepFilterLocalWidth + anchor.x && - imgSize.height > optimizedSepFilterLocalHeight + anchor.y && - (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) && - anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) && - OCL_PERFORMANCE_CHECK(d.isIntel()), // TODO FIXIT - ocl_sepFilter2D_SinglePass(_src, _dst, kernelX, kernelY, delta, - borderType & ~BORDER_ISOLATED, ddepth, bdepth, int_arithm), true) + CV_OCL_RUN_( + kernelY.cols <= 21 && kernelX.cols <= 21 && + imgSize.width > optimizedSepFilterLocalWidth + anchor.x && + imgSize.height > optimizedSepFilterLocalHeight + anchor.y && + (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) && + anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) && + OCL_PERFORMANCE_CHECK(d.isIntel()), // TODO FIXIT + ocl_sepFilter2D_SinglePass( + _src, _dst, kernelX, kernelY, delta, + borderType & ~BORDER_ISOLATED, ddepth, + CV_32F, // force FP32 mode + false, shift_bits + ), + true + ); UMat src = _src.getUMat(); - Size srcWholeSize; Point srcOffset; - src.locateROI(srcWholeSize, srcOffset); - bool fast8uc1 = type == CV_8UC1 && srcOffset.x % 4 == 0 && - src.cols % 4 == 0 && src.step % 4 == 0; + bool fast8uc1 = false; + if (type == CV_8UC1) + { + Size srcWholeSize; + Point srcOffset; + src.locateROI(srcWholeSize, srcOffset); + fast8uc1 = srcOffset.x % 4 == 0 && + src.cols % 4 == 0 && src.step % 4 == 0; + } + + Size srcSize = src.size(); + Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1); + UMat buf(bufSize, CV_MAKETYPE(bdepth, cn)); + if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1, int_arithm, shift_bits)) + return false; + + _dst.create(srcSize, CV_MAKETYPE(ddepth, cn)); + UMat dst = _dst.getUMat(); + + return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y, int_arithm, shift_bits); +} + +bool ocl_sepFilter2D_BitExact( + InputArray _src, OutputArray _dst, int ddepth, + const Size& ksize, + const uint16_t *fkx, const uint16_t *fky, + Point anchor, + double delta, int borderType, + int shift_bits +) +{ + const ocl::Device & d = ocl::Device::getDefault(); + Size imgSize = _src.size(); + + int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); + if (cn > 4) + return false; + + if (ksize.width % 2 != 1) + return false; + if (ksize.height % 2 != 1) + return false; + + Mat kernelX(1, ksize.width, CV_16SC1, (void*)fkx); + Mat kernelY(1, ksize.height, CV_16SC1, (void*)fky); + + if (ddepth < 0) + ddepth = sdepth; + + if (anchor.x < 0) + anchor.x = kernelX.cols >> 1; + if (anchor.y < 0) + anchor.y = kernelY.cols >> 1; + + int bdepth = sdepth == CV_8U ? CV_32S : CV_32F; + + CV_OCL_RUN_( + kernelY.cols <= 21 && kernelX.cols <= 21 && + imgSize.width > optimizedSepFilterLocalWidth + anchor.x && + imgSize.height > optimizedSepFilterLocalHeight + anchor.y && + (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) && + anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) && + OCL_PERFORMANCE_CHECK(d.isIntel()), // TODO FIXIT + ocl_sepFilter2D_SinglePass( + _src, _dst, kernelX, kernelY, delta, + borderType & ~BORDER_ISOLATED, ddepth, bdepth, + true, shift_bits + ), + true + ); + + UMat src = _src.getUMat(); + + bool fast8uc1 = false; + if (type == CV_8UC1) + { + Size srcWholeSize; + Point srcOffset; + src.locateROI(srcWholeSize, srcOffset); + fast8uc1 = srcOffset.x % 4 == 0 && + src.cols % 4 == 0 && src.step % 4 == 0; + } Size srcSize = src.size(); Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1); UMat buf(bufSize, CV_MAKETYPE(bdepth, cn)); - if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1, int_arithm)) + if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1, true, shift_bits)) return false; _dst.create(srcSize, CV_MAKETYPE(ddepth, cn)); UMat dst = _dst.getUMat(); - return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y, int_arithm); + return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y, true, shift_bits); } #endif @@ -1444,7 +1565,7 @@ void sepFilter2D(InputArray _src, OutputArray _dst, int ddepth, CV_Assert(!_kernelX.empty()); CV_Assert(!_kernelY.empty()); - CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && (size_t)_src.rows() > _kernelY.total() && (size_t)_src.cols() > _kernelX.total(), + CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && (size_t)_src.rows() >= _kernelY.total() && (size_t)_src.cols() >= _kernelX.total(), ocl_sepFilter2D(_src, _dst, ddepth, _kernelX, _kernelY, anchor, delta, borderType)) Mat src = _src.getMat(), kernelX = _kernelX.getMat(), kernelY = _kernelY.getMat(); diff --git a/modules/imgproc/src/filter.hpp b/modules/imgproc/src/filter.hpp index 7b792d1935..570fecec17 100644 --- a/modules/imgproc/src/filter.hpp +++ b/modules/imgproc/src/filter.hpp @@ -46,13 +46,25 @@ namespace cv { #ifdef HAVE_OPENCL - bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, - InputArray _kernelX, InputArray _kernelY, Point anchor, - double delta, int borderType ); +bool ocl_sepFilter2D( + InputArray _src, OutputArray _dst, int ddepth, + InputArray _kernelX, InputArray _kernelY, Point anchor, + double delta, int borderType +); + +bool ocl_sepFilter2D_BitExact( + InputArray _src, OutputArray _dst, int ddepth, + const Size& ksize, + const uint16_t *fkx, const uint16_t *fky, + Point anchor, + double delta, int borderType, + int shift_bits +); #endif - void preprocess2DKernel(const Mat& kernel, std::vector& coords, std::vector& coeffs); -} +void preprocess2DKernel(const Mat& kernel, std::vector& coords, std::vector& coeffs); + +} // namespace #endif diff --git a/modules/imgproc/src/opencl/filterSepCol.cl b/modules/imgproc/src/opencl/filterSepCol.cl index afcdbea89c..f2024db334 100644 --- a/modules/imgproc/src/opencl/filterSepCol.cl +++ b/modules/imgproc/src/opencl/filterSepCol.cl @@ -61,7 +61,11 @@ #endif #define DIG(a) a, +#if defined(INTEGER_ARITHMETIC) +__constant int mat_kernel[] = { COEFF }; +#else __constant srcT1 mat_kernel[] = { COEFF }; +#endif __kernel void col_filter(__global const uchar * src, int src_step, int src_offset, int src_whole_rows, int src_whole_cols, __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, float delta) @@ -92,30 +96,28 @@ __kernel void col_filter(__global const uchar * src, int src_step, int src_offse barrier(CLK_LOCAL_MEM_FENCE); // read pixels from lds and calculate the result - sum = LDS_DAT[l_y + RADIUSY][l_x] * mat_kernel[RADIUSY]; + sum = LDS_DAT[l_y + RADIUSY][l_x] * mat_kernel[RADIUSY] + (srcT)delta; for (int i = 1; i <= RADIUSY; ++i) { temp[0] = LDS_DAT[l_y + RADIUSY - i][l_x]; temp[1] = LDS_DAT[l_y + RADIUSY + i][l_x]; -#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE) +#if defined(INTEGER_ARITHMETIC) sum += mad24(temp[0],mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]); #else sum += mad(temp[0], mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]); #endif } -#ifdef INTEGER_ARITHMETIC -#ifdef INTEL_DEVICE - sum = (sum + (1 << (SHIFT_BITS-1))) / (1 << SHIFT_BITS); -#else - sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS; -#endif -#endif - // write the result to dst if (x < dst_cols && y < dst_rows) { +#if defined(SHIFT_BITS) && SHIFT_BITS > 0 + dstT result = convertToDstT(convertToFloatT(sum) * (floatT)(1.0f / (1 << SHIFT_BITS))); +#else + dstT result = convertToDstT(sum); +#endif + start_addr = mad24(y, dst_step, mad24(DSTSIZE, x, dst_offset)); - storepix(convertToDstT(sum + (srcT)(delta)), dst + start_addr); + storepix(result, dst + start_addr); } } diff --git a/modules/imgproc/src/opencl/filterSepRow.cl b/modules/imgproc/src/opencl/filterSepRow.cl index 8a317ae13d..23f4b6268c 100644 --- a/modules/imgproc/src/opencl/filterSepRow.cl +++ b/modules/imgproc/src/opencl/filterSepRow.cl @@ -139,9 +139,13 @@ #endif #define DIG(a) a, +#if defined(INTEGER_ARITHMETIC) +__constant int mat_kernel[] = { COEFF }; +#else __constant dstT1 mat_kernel[] = { COEFF }; +#endif -#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE) +#if defined(INTEGER_ARITHMETIC) #define dstT4 int4 #define convertDstVec convert_int4 #else @@ -263,7 +267,7 @@ __kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel { temp[0] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset - i); temp[1] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset + i); -#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE) +#if defined(INTEGER_ARITHMETIC) sum += mad24(convertDstVec(temp[0]), mat_kernel[RADIUSX-i], convertDstVec(temp[1]) * mat_kernel[RADIUSX + i]); #else sum += mad(convertDstVec(temp[0]), mat_kernel[RADIUSX-i], convertDstVec(temp[1]) * mat_kernel[RADIUSX + i]); @@ -368,7 +372,7 @@ __kernel void row_filter(__global const uchar * src, int src_step, int src_offse { temp[0] = LDS_DAT[l_y][l_x + RADIUSX - i]; temp[1] = LDS_DAT[l_y][l_x + RADIUSX + i]; -#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE) +#if defined(INTEGER_ARITHMETIC) sum += mad24(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]); #else sum += mad(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]); diff --git a/modules/imgproc/src/opencl/filterSep_singlePass.cl b/modules/imgproc/src/opencl/filterSep_singlePass.cl index 1f96d7d6e1..a91cf7b0e4 100644 --- a/modules/imgproc/src/opencl/filterSep_singlePass.cl +++ b/modules/imgproc/src/opencl/filterSep_singlePass.cl @@ -160,7 +160,7 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int { sum = (WT) 0; for (i=0; i<=2*RADIUSY; i++) -#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE) +#if defined(INTEGER_ARITHMETIC) sum = mad24(lsmem[liy + i][clocX], mat_kernelY[i], sum); #else sum = mad(lsmem[liy + i][clocX], mat_kernelY[i], sum); @@ -177,25 +177,27 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int { // do second horizontal filter pass // and calculate final result - sum = 0.0f; + sum = (WT)(delta); for (i=0; i<=2*RADIUSX; i++) -#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE) +#if defined(INTEGER_ARITHMETIC) sum = mad24(lsmemDy[liy][lix+i], mat_kernelX[i], sum); #else sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum); #endif -#ifdef INTEGER_ARITHMETIC -#ifdef INTEL_DEVICE - sum = (sum + (1 << (SHIFT_BITS-1))) / (1 << SHIFT_BITS); +#if defined(SHIFT_BITS) && SHIFT_BITS > 0 +#if !defined(INTEGER_ARITHMETIC) + sum = sum * (1.0f / (1 << SHIFT_BITS)); #else sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS; #endif #endif + // store result into destination image - storepix(convertToDstT(sum + (WT)(delta)), Dst + mad24(y + liy, dst_step, mad24(x, DSTSIZE, dst_offset))); + storepix(convertToDstT(sum), Dst + mad24(y + liy, dst_step, mad24(x, DSTSIZE, dst_offset))); } + barrier(CLK_LOCAL_MEM_FENCE); for (int i = liy * BLK_X + lix; i < (RADIUSY*2) * (BLK_X+(RADIUSX*2)); i += BLK_X * BLK_Y) { int clocX = i % (BLK_X+(RADIUSX*2)); diff --git a/modules/imgproc/src/smooth.dispatch.cpp b/modules/imgproc/src/smooth.dispatch.cpp index 65122d20e2..65d1fc8ed6 100644 --- a/modules/imgproc/src/smooth.dispatch.cpp +++ b/modules/imgproc/src/smooth.dispatch.cpp @@ -48,6 +48,7 @@ #include #include +#include #include "opencv2/core/hal/intrin.hpp" #include "opencl_kernels_imgproc.hpp" @@ -637,10 +638,9 @@ void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize, return; } - bool useOpenCL = (ocl::isOpenCLActivated() && _dst.isUMat() && _src.dims() <= 2 && - ((ksize.width == 3 && ksize.height == 3) || - (ksize.width == 5 && ksize.height == 5)) && - _src.rows() > ksize.height && _src.cols() > ksize.width); + bool useOpenCL = ocl::isOpenCLActivated() && _dst.isUMat() && _src.dims() <= 2 && + _src.rows() >= ksize.height && _src.cols() >= ksize.width && + ksize.width > 1 && ksize.height > 1; CV_UNUSED(useOpenCL); int sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); @@ -648,27 +648,13 @@ void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize, Mat kx, ky; createGaussianKernels(kx, ky, type, ksize, sigma1, sigma2); - CV_OCL_RUN(useOpenCL, ocl_GaussianBlur_8UC1(_src, _dst, ksize, CV_MAT_DEPTH(type), kx, ky, borderType)); + CV_OCL_RUN(useOpenCL && sdepth == CV_8U && + ((ksize.width == 3 && ksize.height == 3) || + (ksize.width == 5 && ksize.height == 5)), + ocl_GaussianBlur_8UC1(_src, _dst, ksize, CV_MAT_DEPTH(type), kx, ky, borderType) + ); - CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && (size_t)_src.rows() > kx.total() && (size_t)_src.cols() > kx.total(), - ocl_sepFilter2D(_src, _dst, sdepth, kx, ky, Point(-1, -1), 0, borderType)) - - Mat src = _src.getMat(); - Mat dst = _dst.getMat(); - - Point ofs; - Size wsz(src.cols, src.rows); - if(!(borderType & BORDER_ISOLATED)) - src.locateROI( wsz, ofs ); - - CALL_HAL(gaussianBlur, cv_hal_gaussianBlur, src.ptr(), src.step, dst.ptr(), dst.step, src.cols, src.rows, sdepth, cn, - ofs.x, ofs.y, wsz.width - src.cols - ofs.x, wsz.height - src.rows - ofs.y, ksize.width, ksize.height, - sigma1, sigma2, borderType&~BORDER_ISOLATED); - - CV_OVX_RUN(true, - openvx_gaussianBlur(src, dst, ksize, sigma1, sigma2, borderType)) - - if(sdepth == CV_8U && ((borderType & BORDER_ISOLATED) || !_src.getMat().isSubmatrix())) + if(sdepth == CV_8U && ((borderType & BORDER_ISOLATED) || !_src.isSubmatrix())) { std::vector fkx, fky; createGaussianKernels(fkx, fky, type, ksize, sigma1, sigma2); @@ -684,6 +670,17 @@ void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize, } else { + CV_OCL_RUN(useOpenCL, + ocl_sepFilter2D_BitExact(_src, _dst, sdepth, + ksize, + (const uint16_t*)&fkx[0], (const uint16_t*)&fky[0], + Point(-1, -1), 0, borderType, + 8/*shift_bits*/) + ); + + Mat src = _src.getMat(); + Mat dst = _dst.getMat(); + if (src.data == dst.data) src = src.clone(); CV_CPU_DISPATCH(GaussianBlurFixedPoint, (src, dst, (const uint16_t*)&fkx[0], (int)fkx.size(), (const uint16_t*)&fky[0], (int)fky.size(), borderType), @@ -692,6 +689,29 @@ void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize, } } +#ifdef HAVE_OPENCL + if (useOpenCL) + { + sepFilter2D(_src, _dst, sdepth, kx, ky, Point(-1, -1), 0, borderType); + return; + } +#endif + + Mat src = _src.getMat(); + Mat dst = _dst.getMat(); + + Point ofs; + Size wsz(src.cols, src.rows); + if(!(borderType & BORDER_ISOLATED)) + src.locateROI( wsz, ofs ); + + CALL_HAL(gaussianBlur, cv_hal_gaussianBlur, src.ptr(), src.step, dst.ptr(), dst.step, src.cols, src.rows, sdepth, cn, + ofs.x, ofs.y, wsz.width - src.cols - ofs.x, wsz.height - src.rows - ofs.y, ksize.width, ksize.height, + sigma1, sigma2, borderType&~BORDER_ISOLATED); + + CV_OVX_RUN(true, + openvx_gaussianBlur(src, dst, ksize, sigma1, sigma2, borderType)) + #if defined ENABLE_IPP_GAUSSIAN_BLUR // IPP is not bit-exact to OpenCV implementation CV_IPP_RUN_FAST(ipp_GaussianBlur(src, dst, ksize, sigma1, sigma2, borderType)); diff --git a/modules/imgproc/test/ocl/test_sepfilter2d.cpp b/modules/imgproc/test/ocl/test_sepfilter2d.cpp index 9b1f1690ae..12f247ed36 100644 --- a/modules/imgproc/test/ocl/test_sepfilter2d.cpp +++ b/modules/imgproc/test/ocl/test_sepfilter2d.cpp @@ -73,7 +73,7 @@ PARAM_TEST_CASE(SepFilter2D, MatDepth, Channels, BorderType, bool, bool) useRoi = GET_PARAM(4); } - void random_roi() + void random_roi(bool bitExact) { Size ksize = randomSize(kernelMinSize, kernelMaxSize); if (1 != ksize.width % 2) @@ -81,11 +81,19 @@ PARAM_TEST_CASE(SepFilter2D, MatDepth, Channels, BorderType, bool, bool) if (1 != ksize.height % 2) ksize.height++; - Mat temp = randomMat(Size(ksize.width, 1), CV_MAKE_TYPE(CV_32F, 1), -MAX_VALUE, MAX_VALUE); + Mat temp = randomMat(Size(ksize.width, 1), CV_32FC1, -0.5, 1.0); cv::normalize(temp, kernelX, 1.0, 0.0, NORM_L1); - temp = randomMat(Size(1, ksize.height), CV_MAKE_TYPE(CV_32F, 1), -MAX_VALUE, MAX_VALUE); + temp = randomMat(Size(1, ksize.height), CV_32FC1, -0.5, 1.0); cv::normalize(temp, kernelY, 1.0, 0.0, NORM_L1); + if (bitExact) + { + kernelX.convertTo(temp, CV_32S, 256); + temp.convertTo(kernelX, CV_32F, 1.0 / 256); + kernelY.convertTo(temp, CV_32S, 256); + temp.convertTo(kernelY, CV_32F, 1.0 / 256); + } + Size roiSize = randomSize(ksize.width, MAX_VALUE, ksize.height, MAX_VALUE); Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0); randomSubMat(src, src_roi, roiSize, srcBorder, type, -MAX_VALUE, MAX_VALUE); @@ -96,6 +104,11 @@ PARAM_TEST_CASE(SepFilter2D, MatDepth, Channels, BorderType, bool, bool) anchor.x = anchor.y = -1; delta = randomDouble(-100, 100); + if (bitExact) + { + delta = (int)(delta * 256) / 256.0; + } + UMAT_UPLOAD_INPUT_PARAMETER(src); UMAT_UPLOAD_OUTPUT_PARAMETER(dst); } @@ -110,7 +123,7 @@ OCL_TEST_P(SepFilter2D, Mat) { for (int j = 0; j < test_loop_times + 3; j++) { - random_roi(); + random_roi(false); OCL_OFF(cv::sepFilter2D(src_roi, dst_roi, -1, kernelX, kernelY, anchor, delta, borderType)); OCL_ON(cv::sepFilter2D(usrc_roi, udst_roi, -1, kernelX, kernelY, anchor, delta, borderType)); @@ -119,6 +132,22 @@ OCL_TEST_P(SepFilter2D, Mat) } } +OCL_TEST_P(SepFilter2D, Mat_BitExact) +{ + for (int j = 0; j < test_loop_times + 3; j++) + { + random_roi(true); + + OCL_OFF(cv::sepFilter2D(src_roi, dst_roi, -1, kernelX, kernelY, anchor, delta, borderType)); + OCL_ON(cv::sepFilter2D(usrc_roi, udst_roi, -1, kernelX, kernelY, anchor, delta, borderType)); + + if (src_roi.depth() < CV_32F) + Near(0.0); + else + Near(1e-3); + } +} + OCL_INSTANTIATE_TEST_CASE_P(ImageProc, SepFilter2D, Combine( Values(CV_8U, CV_32F), diff --git a/modules/stitching/src/exposure_compensate.cpp b/modules/stitching/src/exposure_compensate.cpp index c204af220e..7213349ccc 100644 --- a/modules/stitching/src/exposure_compensate.cpp +++ b/modules/stitching/src/exposure_compensate.cpp @@ -416,7 +416,11 @@ void BlocksCompensator::feed(const std::vector &corners, const std::vecto bl_idx += bl_per_img.width*bl_per_img.height; for (int i=0; i + $ + $ +diff --git a/inference-engine/src/legacy_api/CMakeLists.txt b/inference-engine/src/legacy_api/CMakeLists.txt +index 85524310..ed27e058 100644 +--- a/inference-engine/src/legacy_api/CMakeLists.txt ++++ b/inference-engine/src/legacy_api/CMakeLists.txt +@@ -21,7 +21,7 @@ source_group("include" FILES ${PUBLIC_HEADERS}) + + # Create object library + +-add_library(${TARGET_NAME}_obj OBJECT ++add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL + ${LIBRARY_SRC} + ${PUBLIC_HEADERS}) + +diff --git a/inference-engine/src/mkldnn_plugin/CMakeLists.txt b/inference-engine/src/mkldnn_plugin/CMakeLists.txt +index 297783da..06da35c3 100644 +--- a/inference-engine/src/mkldnn_plugin/CMakeLists.txt ++++ b/inference-engine/src/mkldnn_plugin/CMakeLists.txt +@@ -192,7 +192,7 @@ cross_compiled_file(${TARGET_NAME} + + # add test object library + +-add_library(${TARGET_NAME}_obj OBJECT ${SOURCES} ${HEADERS}) ++add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL ${SOURCES} ${HEADERS}) + + target_include_directories(${TARGET_NAME}_obj PRIVATE $ + $ +diff --git a/inference-engine/src/preprocessing/CMakeLists.txt b/inference-engine/src/preprocessing/CMakeLists.txt +index adc52f06..6b7d0ffe 100644 +--- a/inference-engine/src/preprocessing/CMakeLists.txt ++++ b/inference-engine/src/preprocessing/CMakeLists.txt +@@ -124,7 +124,7 @@ endif() + + # Create object library + +-add_library(${TARGET_NAME}_obj OBJECT ++add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL + ${LIBRARY_SRC} + ${LIBRARY_HEADERS}) + +@@ -183,7 +183,7 @@ add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME} + + # Static library used for unit tests which are always built + +-add_library(${TARGET_NAME}_s STATIC ++add_library(${TARGET_NAME}_s STATIC EXCLUDE_FROM_ALL + $) + + set_ie_threading_interface_for(${TARGET_NAME}_s) +diff --git a/inference-engine/src/vpu/common/CMakeLists.txt b/inference-engine/src/vpu/common/CMakeLists.txt +index 43e9308f..2e40dd31 100644 +--- a/inference-engine/src/vpu/common/CMakeLists.txt ++++ b/inference-engine/src/vpu/common/CMakeLists.txt +@@ -55,7 +55,7 @@ add_common_target("vpu_common_lib" FALSE) + + # Unit tests support for graph transformer + if(WIN32) +- add_common_target("vpu_common_lib_test_static" TRUE) ++ #add_common_target("vpu_common_lib_test_static" TRUE) + else() + add_library("vpu_common_lib_test_static" ALIAS "vpu_common_lib") + endif() +diff --git a/inference-engine/src/vpu/graph_transformer/CMakeLists.txt b/inference-engine/src/vpu/graph_transformer/CMakeLists.txt +index 982d3c7f..15fcf3e8 100644 +--- a/inference-engine/src/vpu/graph_transformer/CMakeLists.txt ++++ b/inference-engine/src/vpu/graph_transformer/CMakeLists.txt +@@ -64,7 +64,7 @@ add_graph_transformer_target("vpu_graph_transformer" FALSE) + + # Unit tests support for graph transformer + if(WIN32) +- add_graph_transformer_target("vpu_graph_transformer_test_static" TRUE) ++ #add_graph_transformer_target("vpu_graph_transformer_test_static" TRUE) + else() + add_library("vpu_graph_transformer_test_static" ALIAS "vpu_graph_transformer") + endif() +diff --git a/inference-engine/thirdparty/CMakeLists.txt b/inference-engine/thirdparty/CMakeLists.txt +index f94453e0..c80e75c5 100644 +--- a/inference-engine/thirdparty/CMakeLists.txt ++++ b/inference-engine/thirdparty/CMakeLists.txt +@@ -43,13 +43,13 @@ function(build_with_lto) + endfunction() + + ie_build_pugixml() +- add_subdirectory(stb_lib) ++ #add_subdirectory(stb_lib) + add_subdirectory(ade) + add_subdirectory(fluid/modules/gapi) + + target_include_directories(pugixml INTERFACE "$") + +- set_target_properties(pugixml ade fluid stb_image ++ set_target_properties(pugixml ade fluid + PROPERTIES FOLDER thirdparty) + + # developer package +diff --git a/inference-engine/thirdparty/pugixml/CMakeLists.txt b/inference-engine/thirdparty/pugixml/CMakeLists.txt +index 8bcb2801..380fb468 100644 +--- a/inference-engine/thirdparty/pugixml/CMakeLists.txt ++++ b/inference-engine/thirdparty/pugixml/CMakeLists.txt +@@ -41,7 +41,7 @@ if(BUILD_SHARED_LIBS) + else() + add_library(pugixml STATIC ${SOURCES}) + if (MSVC) +- add_library(pugixml_mt STATIC ${SOURCES}) ++ #add_library(pugixml_mt STATIC ${SOURCES}) + #if (WIN32) + # set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT") + # set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd") diff --git a/platforms/winpack_dldt/2020.4/patch.config.py b/platforms/winpack_dldt/2020.4/patch.config.py new file mode 100644 index 0000000000..496f383800 --- /dev/null +++ b/platforms/winpack_dldt/2020.4/patch.config.py @@ -0,0 +1,3 @@ +applyPatch('20200701-dldt-disable-unused-targets.patch') +applyPatch('20200413-dldt-pdb.patch') +applyPatch('20200604-dldt-disable-multidevice.patch') diff --git a/platforms/winpack_dldt/2020.4/sysroot.config.py b/platforms/winpack_dldt/2020.4/sysroot.config.py new file mode 100644 index 0000000000..fc8dffd32a --- /dev/null +++ b/platforms/winpack_dldt/2020.4/sysroot.config.py @@ -0,0 +1,56 @@ +sysroot_bin_dir = prepare_dir(self.sysrootdir / 'bin') +copytree(self.build_dir / 'install', self.sysrootdir / 'ngraph') +#rm_one(self.sysrootdir / 'ngraph' / 'lib' / 'ngraph.dll') + +build_config = 'Release' if not self.config.build_debug else 'Debug' +build_bin_dir = self.build_dir / 'bin' / 'intel64' / build_config + +def copy_bin(name): + global build_bin_dir, sysroot_bin_dir + copytree(build_bin_dir / name, sysroot_bin_dir / name) + +dll_suffix = 'd' if self.config.build_debug else '' +def copy_dll(name): + global copy_bin, dll_suffix + copy_bin(name + dll_suffix + '.dll') + copy_bin(name + dll_suffix + '.pdb') + +copy_bin('cache.json') +copy_dll('clDNNPlugin') +copy_dll('HeteroPlugin') +copy_dll('inference_engine') +copy_dll('inference_engine_ir_reader') +copy_dll('inference_engine_legacy') +copy_dll('inference_engine_transformations') # runtime +copy_dll('inference_engine_lp_transformations') # runtime +copy_dll('MKLDNNPlugin') # runtime +copy_dll('myriadPlugin') # runtime +#copy_dll('MultiDevicePlugin') # runtime, not used +copy_dll('ngraph') +copy_bin('plugins.xml') +copytree(self.build_dir / 'bin' / 'intel64' / 'pcie-ma248x.elf', sysroot_bin_dir / 'pcie-ma248x.elf') +copytree(self.build_dir / 'bin' / 'intel64' / 'usb-ma2x8x.mvcmd', sysroot_bin_dir / 'usb-ma2x8x.mvcmd') +copytree(self.build_dir / 'bin' / 'intel64' / 'usb-ma2450.mvcmd', sysroot_bin_dir / 'usb-ma2450.mvcmd') + +copytree(self.srcdir / 'inference-engine' / 'temp' / 'tbb' / 'bin', sysroot_bin_dir) +copytree(self.srcdir / 'inference-engine' / 'temp' / 'tbb', self.sysrootdir / 'tbb') + +sysroot_ie_dir = prepare_dir(self.sysrootdir / 'deployment_tools' / 'inference_engine') +sysroot_ie_lib_dir = prepare_dir(sysroot_ie_dir / 'lib' / 'intel64') + +copytree(self.srcdir / 'inference-engine' / 'include', sysroot_ie_dir / 'include') +if not self.config.build_debug: + copytree(self.build_dir / 'install' / 'lib' / 'ngraph.lib', sysroot_ie_lib_dir / 'ngraph.lib') + copytree(build_bin_dir / 'inference_engine.lib', sysroot_ie_lib_dir / 'inference_engine.lib') + copytree(build_bin_dir / 'inference_engine_ir_reader.lib', sysroot_ie_lib_dir / 'inference_engine_ir_reader.lib') + copytree(build_bin_dir / 'inference_engine_legacy.lib', sysroot_ie_lib_dir / 'inference_engine_legacy.lib') +else: + copytree(self.build_dir / 'install' / 'lib' / 'ngraphd.lib', sysroot_ie_lib_dir / 'ngraphd.lib') + copytree(build_bin_dir / 'inference_engined.lib', sysroot_ie_lib_dir / 'inference_engined.lib') + copytree(build_bin_dir / 'inference_engine_ir_readerd.lib', sysroot_ie_lib_dir / 'inference_engine_ir_readerd.lib') + copytree(build_bin_dir / 'inference_engine_legacyd.lib', sysroot_ie_lib_dir / 'inference_engine_legacyd.lib') + +sysroot_license_dir = prepare_dir(self.sysrootdir / 'etc' / 'licenses') +copytree(self.srcdir / 'LICENSE', sysroot_license_dir / 'dldt-LICENSE') +copytree(self.srcdir / 'ngraph/LICENSE', sysroot_license_dir / 'ngraph-LICENSE') +copytree(self.sysrootdir / 'tbb/LICENSE', sysroot_license_dir / 'tbb-LICENSE') diff --git a/platforms/winpack_dldt/build_package.py b/platforms/winpack_dldt/build_package.py index 07ad96b5ba..8f3724135d 100644 --- a/platforms/winpack_dldt/build_package.py +++ b/platforms/winpack_dldt/build_package.py @@ -133,9 +133,10 @@ def git_checkout(dst, url, branch, revision, clone_extra_args, noFetch=False): (['-b', branch] if branch else []) + clone_extra_args + [url, '.'], cwd=dst) else: - execute(cmd=['git', 'fetch', 'origin'] + ([branch] if branch else []), cwd=dst) + execute(cmd=['git', 'fetch', 'origin'] + ([branch + ':' + branch] if branch else []), cwd=dst) execute(cmd=['git', 'reset', '--hard'], cwd=dst) - execute(cmd=['git', 'checkout', '-B', 'winpack_dldt', revision], cwd=dst) + execute(cmd=['git', 'clean', '-f', '-d'], cwd=dst) + execute(cmd=['git', 'checkout', '--force', '-B', 'winpack_dldt', revision], cwd=dst) execute(cmd=['git', 'clean', '-f', '-d'], cwd=dst) execute(cmd=['git', 'submodule', 'init'], cwd=dst) execute(cmd=['git', 'submodule', 'update', '--force', '--depth=1000'], cwd=dst) @@ -149,6 +150,7 @@ def git_apply_patch(src_dir, patch_file): patch_file = str(patch_file) # Python 3.5 may not handle Path assert os.path.exists(patch_file), patch_file execute(cmd=['git', 'apply', '--3way', '-v', '--ignore-space-change', str(patch_file)], cwd=src_dir) + execute(cmd=['git', 'diff', 'HEAD'], cwd=src_dir) #=================================================================================================== @@ -186,6 +188,17 @@ class BuilderDLDT: self.build_dir = prepare_dir(self.outdir / 'build', clean=self.config.clean_dldt) self.sysrootdir = prepare_dir(self.outdir / 'sysroot', clean=self.config.clean_dldt) + if self.config.build_subst_drive: + if os.path.exists(self.config.build_subst_drive + ':\\'): + execute(['subst', self.config.build_subst_drive + ':', '/D']) + execute(['subst', self.config.build_subst_drive + ':', str(self.outdir)]) + def fix_path(p): + return str(p).replace(str(self.outdir), self.config.build_subst_drive + ':') + self.srcdir = Path(fix_path(self.srcdir)) + self.build_dir = Path(fix_path(self.build_dir)) + self.sysrootdir = Path(fix_path(self.sysrootdir)) + + def init_patchset(self): cpath = self.cpath self.patch_file = str(cpath / 'patch.config.py') # Python 3.5 may not handle Path @@ -255,12 +268,14 @@ class BuilderDLDT: BUILD_TESTS='OFF', ENABLE_OPENCV='OFF', ENABLE_GNA='OFF', + ENABLE_SPEECH_DEMO='OFF', # 2020.4+ NGRAPH_DOC_BUILD_ENABLE='OFF', NGRAPH_UNIT_TEST_ENABLE='OFF', NGRAPH_UNIT_TEST_OPENVINO_ENABLE='OFF', NGRAPH_TEST_UTIL_ENABLE='OFF', NGRAPH_ONNX_IMPORT_ENABLE='OFF', CMAKE_INSTALL_PREFIX=str(self.build_dir / 'install'), + OUTPUT_ROOT=str(self.build_dir), # 2020.4+ ) cmd += [ '-D%s=%s' % (k, v) for (k, v) in cmake_vars.items() if v is not None] @@ -270,14 +285,6 @@ class BuilderDLDT: cmd.append(str(self.srcdir)) build_dir = self.build_dir - if self.config.build_subst_drive: - if os.path.exists(self.config.build_subst_drive + ':\\'): - execute(['subst', self.config.build_subst_drive + ':', '/D']) - def fix_path(p): - return str(p).replace(str(self.outdir), self.config.build_subst_drive + ':') - execute(['subst', self.config.build_subst_drive + ':', str(self.outdir)]) - cmd = [fix_path(c) for c in cmd] - build_dir = Path(fix_path(build_dir)) try: execute(cmd, cwd=build_dir) @@ -291,8 +298,6 @@ class BuilderDLDT: cmd = [self.cmake_path, '-DBUILD_TYPE=' + build_config, '-P', 'cmake_install.cmake'] execute(cmd, cwd=build_dir / 'ngraph') except: - if self.config.build_subst_drive: - execute(['subst', self.config.build_subst_drive + ':', '/D']) raise log.info('DLDT build completed') @@ -307,6 +312,11 @@ class BuilderDLDT: log.info('DLDT sysroot preparation completed') + def cleanup(self): + if self.config.build_subst_drive: + execute(['subst', self.config.build_subst_drive + ':', '/D']) + + #=================================================================================================== class Builder: @@ -466,7 +476,7 @@ def main(): parser.add_argument('--dldt_reference_dir', help='DLDT reference git repository (optional)') parser.add_argument('--dldt_src_dir', help='DLDT custom source repository (skip git checkout and patching, use for TESTING only)') - parser.add_argument('--dldt_config', help='Specify DLDT build configuration (defaults to DLDT commit)') + parser.add_argument('--dldt_config', help='Specify DLDT build configuration (defaults to evaluate from DLDT commit/branch)') args = parser.parse_args() @@ -492,7 +502,10 @@ def main(): args.opencv_dir = os.path.abspath(args.opencv_dir) if not args.dldt_config: - args.dldt_config = args.dldt_src_commit + if args.dldt_src_commit == 'releases/2020/4' or args.dldt_src_branch == 'releases/2020/4': + args.dldt_config = '2020.4' + else: + args.dldt_config = args.dldt_src_commit _opencv_dir = check_dir(args.opencv_dir) _outdir = prepare_dir(args.output_dir) @@ -504,14 +517,18 @@ def main(): builder_dldt = BuilderDLDT(args) - builder_dldt.prepare_sources() - builder_dldt.build() - builder_dldt.make_sysroot() - - builder_opencv = Builder(args) - builder_opencv.build(builder_dldt) - builder_opencv.copy_sysroot(builder_dldt) - builder_opencv.package_sources() + try: + builder_dldt.prepare_sources() + builder_dldt.build() + builder_dldt.make_sysroot() + + builder_opencv = Builder(args) + builder_opencv.build(builder_dldt) + builder_opencv.copy_sysroot(builder_dldt) + builder_opencv.package_sources() + except: + builder_dldt.cleanup() + raise log.info("=====") log.info("===== Build finished")