Merge remote-tracking branch 'upstream/3.4' into merge-3.4

pull/11495/head
Alexander Alekhin 7 years ago
commit 51e543050c
  20 changed files (changed-line count in parentheses):

  1. doc/tutorials/core/interoperability_with_OpenCV_1/interoperability_with_OpenCV_1.markdown (2)
  2. doc/tutorials/dnn/dnn_yolo/dnn_yolo.markdown (4)
  3. modules/core/include/opencv2/core/hal/intrin_neon.hpp (6)
  4. modules/core/include/opencv2/core/hal/intrin_vsx.hpp (37)
  5. modules/core/include/opencv2/core/operations.hpp (4)
  6. modules/core/test/test_intrin_utils.hpp (13)
  7. modules/cudafilters/src/cuda/column_filter.hpp (19)
  8. modules/cudafilters/src/cuda/row_filter.hpp (19)
  9. modules/cudaimgproc/src/canny.cpp (17)
  10. modules/cudaimgproc/src/cuda/canny.cu (230)
  11. modules/cudaimgproc/test/test_canny.cpp (59)
  12. modules/imgproc/src/color_hsv.cpp (303)
  13. modules/photo/test/test_hdr.cpp (2)
  14. modules/stitching/include/opencv2/stitching/detail/blenders.hpp (1)
  15. modules/videoio/src/cap_ffmpeg_impl.hpp (7)
  16. modules/videoio/src/cap_qt.cpp (6)
  17. samples/cpp/tutorial_code/dnn/custom_layers.cpp (20)
  18. samples/dnn/object_detection.cpp (9)
  19. samples/dnn/openpose.cpp (8)
  20. samples/dnn/segmentation.cpp (9)

@ -69,7 +69,7 @@ CvMat* mI = &I.operator CvMat();
@endcode
One of the biggest complaints of the C interface is that it leaves all the memory management to you.
You need to figure out when it is safe to release your unused objects and make sure you do so before
the program finishes or you could have troublesome memory leeks. To work around this issue in OpenCV
the program finishes or you could have troublesome memory leaks. To work around this issue in OpenCV
there is introduced a sort of smart pointer. This will automatically release the object when it's no
longer in use. To use this declare the pointers as a specialization of the *Ptr* :
@code{.cpp}
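// The tutorial's own example is cut off by the hunk boundary; this is a sketch of the
// *Ptr* pattern the paragraph above describes (hypothetical continuation, mirroring the
// CvMat conversion shown earlier in this hunk): the wrapped header is released
// automatically when piI goes out of scope.
Ptr<IplImage> piI = &I.operator IplImage();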

@ -29,7 +29,7 @@ Execute in webcam:
@code{.bash}
$ example_dnn_object_detection --config=[PATH-TO-DARKNET]/cfg/yolo.cfg --model=[PATH-TO-DARKNET]/yolo.weights --classes=object_detection_classes_pascal_voc.txt --width=416 --height=416 --scale=0.00392
$ example_dnn_object_detection --config=[PATH-TO-DARKNET]/cfg/yolo.cfg --model=[PATH-TO-DARKNET]/yolo.weights --classes=object_detection_classes_pascal_voc.txt --width=416 --height=416 --scale=0.00392 --rgb
@endcode
@ -37,7 +37,7 @@ Execute with image or video file:
@code{.bash}
$ example_dnn_object_detection --config=[PATH-TO-DARKNET]/cfg/yolo.cfg --model=[PATH-TO-DARKNET]/yolo.weights --classes=object_detection_classes_pascal_voc.txt --width=416 --height=416 --scale=0.00392 --input=[PATH-TO-IMAGE-OR-VIDEO-FILE]
$ example_dnn_object_detection --config=[PATH-TO-DARKNET]/cfg/yolo.cfg --model=[PATH-TO-DARKNET]/yolo.weights --classes=object_detection_classes_pascal_voc.txt --width=416 --height=416 --scale=0.00392 --input=[PATH-TO-IMAGE-OR-VIDEO-FILE] --rgb
@endcode

@ -786,10 +786,14 @@ template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, _Tpvec::nlanes - n)); } \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
{ return a; } \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); }
{ return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); } \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
{ CV_UNUSED(b); return a; }
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint8x16, u8)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int8x16, s8)
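A small usage sketch of why the new <0> specializations are needed (assumed semantics: rotating by zero lanes must return the first operand unchanged, while the generic macro expansion would pass the out-of-range immediate nlanes to vextq):

    v_uint8x16 a = v_setall_u8(1), b = v_setall_u8(2);
    v_uint8x16 r = v_rotate_left<0>(a, b);   // r == a; without the specialization the generic
                                             // expansion would emit vextq_u8(b, a, 16), whose
                                             // lane immediate must be in 0..15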

@ -589,7 +589,7 @@ inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b)
return _Tpvec(vec_sld(a.val, b.val, CV_SHIFT));
}
#define OPENCV_IMPL_VSX_ROTATE_64(_Tpvec, suffix, rg1, rg2) \
#define OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, suffix, rg1, rg2) \
template<int imm> \
inline _Tpvec v_rotate_##suffix(const _Tpvec& a, const _Tpvec& b) \
{ \
@ -598,11 +598,13 @@ inline _Tpvec v_rotate_##suffix(const _Tpvec& a, const _Tpvec& b) \
return imm ? b : a; \
}
OPENCV_IMPL_VSX_ROTATE_64(v_int64x2, right, a, b)
OPENCV_IMPL_VSX_ROTATE_64(v_uint64x2, right, a, b)
#define OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(_Tpvec) \
OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, left, b, a) \
OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, right, a, b)
OPENCV_IMPL_VSX_ROTATE_64(v_int64x2, left, b, a)
OPENCV_IMPL_VSX_ROTATE_64(v_uint64x2, left, b, a)
OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_float64x2)
OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_uint64x2)
OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_int64x2)
/* Extract */
template<int s, typename _Tpvec>
@ -716,26 +718,33 @@ inline int v_signmask(const v_uint64x2& a)
inline int v_signmask(const v_float64x2& a)
{ return v_signmask(v_reinterpret_as_s64(a)); }
template<typename _Tpvec>
inline bool v_check_all(const _Tpvec& a)
{ return vec_all_lt(a.val, _Tpvec().val);}
inline bool v_check_all(const v_uint8x16 &a)
{ return vec_all_lt(a.val, _Tpvec().val); }
inline bool v_check_all(const v_uint8x16& a)
{ return v_check_all(v_reinterpret_as_s8(a)); }
inline bool v_check_all(const v_uint16x8 &a)
inline bool v_check_all(const v_uint16x8& a)
{ return v_check_all(v_reinterpret_as_s16(a)); }
inline bool v_check_all(const v_uint32x4 &a)
inline bool v_check_all(const v_uint32x4& a)
{ return v_check_all(v_reinterpret_as_s32(a)); }
inline bool v_check_all(const v_float32x4& a)
{ return v_check_all(v_reinterpret_as_s32(a)); }
inline bool v_check_all(const v_float64x2& a)
{ return v_check_all(v_reinterpret_as_s64(a)); }
template<typename _Tpvec>
inline bool v_check_any(const _Tpvec& a)
{ return vec_any_lt(a.val, _Tpvec().val);}
inline bool v_check_any(const v_uint8x16 &a)
{ return vec_any_lt(a.val, _Tpvec().val); }
inline bool v_check_any(const v_uint8x16& a)
{ return v_check_any(v_reinterpret_as_s8(a)); }
inline bool v_check_any(const v_uint16x8 &a)
inline bool v_check_any(const v_uint16x8& a)
{ return v_check_any(v_reinterpret_as_s16(a)); }
inline bool v_check_any(const v_uint32x4 &a)
inline bool v_check_any(const v_uint32x4& a)
{ return v_check_any(v_reinterpret_as_s32(a)); }
inline bool v_check_any(const v_float32x4& a)
{ return v_check_any(v_reinterpret_as_s32(a)); }
inline bool v_check_any(const v_float64x2& a)
{ return v_check_any(v_reinterpret_as_s64(a)); }
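A minimal usage sketch of the new floating-point overloads (assuming the input is a comparison mask, where every lane is either all ones or all zeros, so reinterpreting as signed integers and testing the sign bit is enough):

    v_float32x4 a = v_setall_f32(1.0f), b = v_setall_f32(2.0f);
    v_float32x4 mask = a < b;                 // every lane set
    bool all = v_check_all(mask);             // true: each reinterpreted lane is negative
    bool any = v_check_any(a > b);            // false: no lane set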
////////// Other math /////////

@ -194,8 +194,8 @@ Matx<_Tp, n, m> Matx<_Tp, m, n>::inv(int method, bool *p_is_ok /*= NULL*/) const
{
Matx<_Tp, n, m> b;
bool ok;
if( method == DECOMP_LU || method == DECOMP_CHOLESKY )
ok = cv::internal::Matx_FastInvOp<_Tp, m>()(*this, b, method);
if( m == n && (method == DECOMP_LU || method == DECOMP_CHOLESKY) )
ok = cv::internal::Matx_FastInvOp<_Tp, m>()(*reinterpret_cast<const Matx<_Tp, m, m>*>(this), reinterpret_cast<Matx<_Tp, m, m>&>(b), method);
else
{
Mat A(*this, false), B(b, false);
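A short sketch of the distinction the new m == n guard draws (hypothetical values; the rectangular case is assumed to take the Mat-based branch with a decomposition that supports non-square input):

    cv::Matx33f S = cv::Matx33f::eye();
    cv::Matx33f Sinv = S.inv(cv::DECOMP_LU);     // square: Matx_FastInvOp fast path
    cv::Matx32f R(1, 2,
                  3, 4,
                  5, 6);
    cv::Matx23f Rinv = R.inv(cv::DECOMP_SVD);    // rectangular: falls through to the Mat-based branch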

@ -837,17 +837,28 @@ template<typename R> struct TheTest
Data<R> resC = v_rotate_right<s>(a);
Data<R> resD = v_rotate_right<s>(a, b);
Data<R> resE = v_rotate_left<s>(a);
Data<R> resF = v_rotate_left<s>(a, b);
for (int i = 0; i < R::nlanes; ++i)
{
if (i + s >= R::nlanes)
{
EXPECT_EQ((LaneType)0, resC[i]);
EXPECT_EQ(dataB[i - R::nlanes + s], resD[i]);
EXPECT_EQ((LaneType)0, resE[i - R::nlanes + s]);
EXPECT_EQ(dataB[i], resF[i - R::nlanes + s]);
}
else
{
EXPECT_EQ(dataA[i + s], resC[i]);
}
EXPECT_EQ(dataA[i + s], resD[i]);
EXPECT_EQ(dataA[i], resE[i + s]);
EXPECT_EQ(dataA[i], resF[i + s]);
}
}
return *this;
}
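A concrete two-lane instance of the expectations above (a sketch; v_uint64x2 and the lane values are chosen for illustration, with dataA = {1, 2}, dataB = {3, 4}, s = 1, nlanes = 2):

    v_uint64x2 a(1, 2), b(3, 4);
    v_uint64x2 right = v_rotate_right<1>(a, b);   // lanes {2, 3}: a shifted right by one lane, filled from b
    v_uint64x2 left  = v_rotate_left<1>(a, b);    // lanes {4, 1}: a shifted left by one lane, filled from b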

@ -52,10 +52,8 @@ namespace column_filter
{
#define MAX_KERNEL_SIZE 32
__constant__ float c_kernel[MAX_KERNEL_SIZE];
template <int KSIZE, typename T, typename D, typename B>
__global__ void linearColumnFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
__global__ void linearColumnFilter(const PtrStepSz<T> src, PtrStep<D> dst, const float* kernel, const int anchor, const B brd)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
const int BLOCK_DIM_X = 16;
@ -135,7 +133,7 @@ namespace column_filter
#pragma unroll
for (int k = 0; k < KSIZE; ++k)
sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k];
sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * kernel[k];
dst(y, x) = saturate_cast<D>(sum);
}
@ -143,7 +141,7 @@ namespace column_filter
}
template <int KSIZE, typename T, typename D, template<typename> class B>
void caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream)
void caller(PtrStepSz<T> src, PtrStepSz<D> dst, const float* kernel, int anchor, int cc, cudaStream_t stream)
{
int BLOCK_DIM_X;
int BLOCK_DIM_Y;
@ -167,7 +165,7 @@ namespace column_filter
B<T> brd(src.rows);
linearColumnFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
linearColumnFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, kernel, anchor, brd);
cudaSafeCall( cudaGetLastError() );
@ -181,7 +179,7 @@ namespace filter
template <typename T, typename D>
void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
{
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, const float* kernel, int anchor, int cc, cudaStream_t stream);
static const caller_t callers[5][33] =
{
@ -362,11 +360,6 @@ namespace filter
}
};
if (stream == 0)
cudaSafeCall( cudaMemcpyToSymbol(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
else
cudaSafeCall( cudaMemcpyToSymbolAsync(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, kernel, anchor, cc, stream);
}
}
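A hedged host-side sketch of the calling convention after this change: each filter call supplies its own device buffer of coefficients through the new `const float* kernel` parameter of caller()/linearColumnFilter(), instead of every call writing the single shared __constant__ c_kernel symbol. The helper name below is hypothetical:

    #include <cuda_runtime.h>

    // upload one coefficient buffer per filter instance; the returned device pointer is
    // what the new 'kernel' argument expects, so launches on different streams no longer
    // overwrite each other's coefficients
    static float* uploadKernel(const float* host, int ksize)
    {
        float* dev = 0;
        cudaMalloc(&dev, ksize * sizeof(float));
        cudaMemcpy(dev, host, ksize * sizeof(float), cudaMemcpyHostToDevice);
        return dev;
    }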

@ -52,10 +52,8 @@ namespace row_filter
{
#define MAX_KERNEL_SIZE 32
__constant__ float c_kernel[MAX_KERNEL_SIZE];
template <int KSIZE, typename T, typename D, typename B>
__global__ void linearRowFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
__global__ void linearRowFilter(const PtrStepSz<T> src, PtrStep<D> dst, const float* kernel, const int anchor, const B brd)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
const int BLOCK_DIM_X = 32;
@ -135,7 +133,7 @@ namespace row_filter
#pragma unroll
for (int k = 0; k < KSIZE; ++k)
sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * c_kernel[k];
sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * kernel[k];
dst(y, x) = saturate_cast<D>(sum);
}
@ -143,7 +141,7 @@ namespace row_filter
}
template <int KSIZE, typename T, typename D, template<typename> class B>
void caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream)
void caller(PtrStepSz<T> src, PtrStepSz<D> dst, const float* kernel, int anchor, int cc, cudaStream_t stream)
{
int BLOCK_DIM_X;
int BLOCK_DIM_Y;
@ -167,7 +165,7 @@ namespace row_filter
B<T> brd(src.cols);
linearRowFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
linearRowFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, kernel, anchor, brd);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
@ -180,7 +178,7 @@ namespace filter
template <typename T, typename D>
void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
{
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, const float* kernel, int anchor, int cc, cudaStream_t stream);
static const caller_t callers[5][33] =
{
@ -361,11 +359,6 @@ namespace filter
}
};
if (stream == 0)
cudaSafeCall( cudaMemcpyToSymbol(row_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
else
cudaSafeCall( cudaMemcpyToSymbolAsync(row_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, kernel, anchor, cc, stream);
}
}

@ -58,9 +58,9 @@ namespace canny
void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh, cudaStream_t stream);
void edgesHysteresisLocal(PtrStepSzi map, short2* st1, cudaStream_t stream);
void edgesHysteresisLocal(PtrStepSzi map, short2* st1, int* d_counter, cudaStream_t stream);
void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, cudaStream_t stream);
void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, int* d_counter, cudaStream_t stream);
void getEdges(PtrStepSzi map, PtrStepSzb dst, cudaStream_t stream);
}
@ -127,6 +127,8 @@ namespace
Ptr<Filter> filterDX_, filterDY_;
#endif
int old_apperture_size_;
int *d_counter;
};
void CannyImpl::detect(InputArray _image, OutputArray _edges, Stream& stream)
@ -218,12 +220,17 @@ namespace
void CannyImpl::CannyCaller(GpuMat& edges, Stream& stream)
{
map_.setTo(Scalar::all(0));
map_.setTo(Scalar::all(0), stream);
canny::calcMap(dx_, dy_, mag_, map_, static_cast<float>(low_thresh_), static_cast<float>(high_thresh_), StreamAccessor::getStream(stream));
canny::edgesHysteresisLocal(map_, st1_.ptr<short2>(), StreamAccessor::getStream(stream));
cudaSafeCall( cudaMalloc(&d_counter, sizeof(int)) );
canny::edgesHysteresisLocal(map_, st1_.ptr<short2>(), d_counter, StreamAccessor::getStream(stream));
canny::edgesHysteresisGlobal(map_, st1_.ptr<short2>(), st2_.ptr<short2>(), d_counter, StreamAccessor::getStream(stream));
canny::edgesHysteresisGlobal(map_, st1_.ptr<short2>(), st2_.ptr<short2>(), StreamAccessor::getStream(stream));
cudaSafeCall( cudaFree(d_counter) );
canny::getEdges(map_, edges, StreamAccessor::getStream(stream));
}

@ -47,6 +47,7 @@
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/utility.hpp"
#include "opencv2/core/cuda.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
@ -102,6 +103,20 @@ namespace canny
}
};
struct SrcTexObject
{
int xoff;
int yoff;
cudaTextureObject_t tex_src_object;
__host__ SrcTexObject(int _xoff, int _yoff, cudaTextureObject_t _tex_src_object) : xoff(_xoff), yoff(_yoff), tex_src_object(_tex_src_object) { }
__device__ __forceinline__ int operator ()(int y, int x) const
{
return tex2D<uchar>(tex_src_object, x + xoff, y + yoff);
}
};
template <class Norm> __global__
void calcMagnitudeKernel(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
{
@ -120,29 +135,94 @@ namespace canny
mag(y, x) = norm(dxVal, dyVal);
}
template <class Norm> __global__
void calcMagnitudeKernel(const SrcTexObject src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y >= mag.rows || x >= mag.cols)
return;
int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));
dx(y, x) = dxVal;
dy(y, x) = dyVal;
mag(y, x) = norm(dxVal, dyVal);
}
void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream)
{
const dim3 block(16, 16);
const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));
bindTexture(&tex_src, srcWhole);
SrcTex src(xoff, yoff);
bool cc30 = deviceSupports(FEATURE_SET_COMPUTE_30);
if (L2Grad)
if (cc30)
{
L2 norm;
calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = srcWhole.ptr();
resDesc.res.pitch2D.height = srcWhole.rows;
resDesc.res.pitch2D.width = srcWhole.cols;
resDesc.res.pitch2D.pitchInBytes = srcWhole.step;
resDesc.res.pitch2D.desc = cudaCreateChannelDesc<uchar>();
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeClamp;
texDesc.addressMode[1] = cudaAddressModeClamp;
texDesc.addressMode[2] = cudaAddressModeClamp;
cudaTextureObject_t tex = 0;
cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
SrcTexObject src(xoff, yoff, tex);
if (L2Grad)
{
L2 norm;
calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
}
else
{
L1 norm;
calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
}
cudaSafeCall( cudaGetLastError() );
if (stream == NULL)
cudaSafeCall( cudaDeviceSynchronize() );
else
cudaSafeCall( cudaStreamSynchronize(stream) );
cudaSafeCall( cudaDestroyTextureObject(tex) );
}
else
{
L1 norm;
calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
}
bindTexture(&tex_src, srcWhole);
SrcTex src(xoff, yoff);
cudaSafeCall( cudaGetLastError() );
if (L2Grad)
{
L2 norm;
calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
}
else
{
L1 norm;
calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
}
if (stream == NULL)
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaGetLastError() );
if (stream == NULL)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream)
@ -165,7 +245,6 @@ namespace canny
namespace canny
{
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp);
__global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
{
const int CANNY_SHIFT = 15;
@ -218,18 +297,103 @@ namespace canny
map(y, x) = edge_type;
}
__global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh, cudaTextureObject_t tex_mag)
{
const int CANNY_SHIFT = 15;
const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x == 0 || x >= dx.cols - 1 || y == 0 || y >= dx.rows - 1)
return;
int dxVal = dx(y, x);
int dyVal = dy(y, x);
const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
const float m = tex2D<float>(tex_mag, x, y);
dxVal = ::abs(dxVal);
dyVal = ::abs(dyVal);
// 0 - the pixel can not belong to an edge
// 1 - the pixel might belong to an edge
// 2 - the pixel does belong to an edge
int edge_type = 0;
if (m > low_thresh)
{
const int tg22x = dxVal * TG22;
const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT);
dyVal <<= CANNY_SHIFT;
if (dyVal < tg22x)
{
if (m > tex2D<float>(tex_mag, x - 1, y) && m >= tex2D<float>(tex_mag, x + 1, y))
edge_type = 1 + (int)(m > high_thresh);
}
else if(dyVal > tg67x)
{
if (m > tex2D<float>(tex_mag, x, y - 1) && m >= tex2D<float>(tex_mag, x, y + 1))
edge_type = 1 + (int)(m > high_thresh);
}
else
{
if (m > tex2D<float>(tex_mag, x - s, y - 1) && m >= tex2D<float>(tex_mag, x + s, y + 1))
edge_type = 1 + (int)(m > high_thresh);
}
}
map(y, x) = edge_type;
}
void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh, cudaStream_t stream)
{
const dim3 block(16, 16);
const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));
bindTexture(&tex_mag, mag);
if (deviceSupports(FEATURE_SET_COMPUTE_30))
{
// Use the texture object
cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = mag.ptr();
resDesc.res.pitch2D.height = mag.rows;
resDesc.res.pitch2D.width = mag.cols;
resDesc.res.pitch2D.pitchInBytes = mag.step;
resDesc.res.pitch2D.desc = cudaCreateChannelDesc<float>();
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeClamp;
texDesc.addressMode[1] = cudaAddressModeClamp;
texDesc.addressMode[2] = cudaAddressModeClamp;
cudaTextureObject_t tex=0;
cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
calcMapKernel<<<grid, block, 0, stream>>>(dx, dy, map, low_thresh, high_thresh, tex);
cudaSafeCall( cudaGetLastError() );
calcMapKernel<<<grid, block, 0, stream>>>(dx, dy, map, low_thresh, high_thresh);
cudaSafeCall( cudaGetLastError() );
if (stream == NULL)
cudaSafeCall( cudaDeviceSynchronize() );
else
cudaSafeCall( cudaStreamSynchronize(stream) );
if (stream == NULL)
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaDestroyTextureObject(tex) );
}
else
{
// Use the texture reference
bindTexture(&tex_mag, mag);
calcMapKernel<<<grid, block, 0, stream>>>(dx, dy, map, low_thresh, high_thresh);
cudaSafeCall( cudaGetLastError() );
if (stream == NULL)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
}
@ -237,14 +401,12 @@ namespace canny
namespace canny
{
__device__ int counter = 0;
__device__ __forceinline__ bool checkIdx(int y, int x, int rows, int cols)
{
return (y >= 0) && (y < rows) && (x >= 0) && (x < cols);
}
__global__ void edgesHysteresisLocalKernel(PtrStepSzi map, short2* st)
__global__ void edgesHysteresisLocalKernel(PtrStepSzi map, short2* st, int* d_counter)
{
__shared__ volatile int smem[18][18];
@ -325,22 +487,19 @@ namespace canny
if (n > 0)
{
const int ind = ::atomicAdd(&counter, 1);
const int ind = ::atomicAdd(d_counter, 1);
st[ind] = make_short2(x, y);
}
}
void edgesHysteresisLocal(PtrStepSzi map, short2* st1, cudaStream_t stream)
void edgesHysteresisLocal(PtrStepSzi map, short2* st1, int* d_counter, cudaStream_t stream)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) );
cudaSafeCall( cudaMemsetAsync(d_counter, 0, sizeof(int), stream) );
const dim3 block(16, 16);
const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y));
edgesHysteresisLocalKernel<<<grid, block, 0, stream>>>(map, st1);
edgesHysteresisLocalKernel<<<grid, block, 0, stream>>>(map, st1, d_counter);
cudaSafeCall( cudaGetLastError() );
if (stream == NULL)
@ -355,7 +514,7 @@ namespace canny
__constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
__constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
__global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, short2* st1, short2* st2, const int count)
__global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, short2* st1, short2* st2, int* d_counter, const int count)
{
const int stack_size = 512;
@ -429,7 +588,7 @@ namespace canny
{
if (threadIdx.x == 0)
{
s_ind = ::atomicAdd(&counter, s_counter);
s_ind = ::atomicAdd(d_counter, s_counter);
if (s_ind + s_counter > map.cols * map.rows)
s_counter = 0;
@ -444,29 +603,26 @@ namespace canny
}
}
void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, cudaStream_t stream)
void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, int* d_counter, cudaStream_t stream)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) );
int count;
cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
cudaSafeCall( cudaMemcpyAsync(&count, d_counter, sizeof(int), cudaMemcpyDeviceToHost, stream) );
cudaSafeCall( cudaStreamSynchronize(stream) );
while (count > 0)
{
cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) );
cudaSafeCall( cudaMemsetAsync(d_counter, 0, sizeof(int), stream) );
const dim3 block(128);
const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1);
edgesHysteresisGlobalKernel<<<grid, block, 0, stream>>>(map, st1, st2, count);
edgesHysteresisGlobalKernel<<<grid, block, 0, stream>>>(map, st1, st2, d_counter, count);
cudaSafeCall( cudaGetLastError() );
if (stream == NULL)
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
cudaSafeCall( cudaMemcpyAsync(&count, d_counter, sizeof(int), cudaMemcpyDeviceToHost, stream) );
cudaSafeCall( cudaStreamSynchronize(stream) );
count = min(count, map.cols * map.rows);

@ -92,9 +92,66 @@ CUDA_TEST_P(Canny, Accuracy)
EXPECT_MAT_SIMILAR(edges_gold, edges, 2e-2);
}
class CannyAsyncParallelLoopBody : public cv::ParallelLoopBody
{
public:
CannyAsyncParallelLoopBody(const cv::cuda::GpuMat& d_img_, cv::cuda::GpuMat* edges_, double low_thresh_, double high_thresh_, int apperture_size_, bool useL2gradient_)
: d_img(d_img_), edges(edges_), low_thresh(low_thresh_), high_thresh(high_thresh_), apperture_size(apperture_size_), useL2gradient(useL2gradient_) {}
~CannyAsyncParallelLoopBody() {};
void operator()(const cv::Range& r) const
{
for (int i = r.start; i < r.end; i++) {
cv::cuda::Stream stream;
cv::Ptr<cv::cuda::CannyEdgeDetector> canny = cv::cuda::createCannyEdgeDetector(low_thresh, high_thresh, apperture_size, useL2gradient);
canny->detect(d_img, edges[i], stream);
stream.waitForCompletion();
}
}
protected:
const cv::cuda::GpuMat& d_img;
cv::cuda::GpuMat* edges;
double low_thresh;
double high_thresh;
int apperture_size;
bool useL2gradient;
};
#define NUM_STREAMS 64
CUDA_TEST_P(Canny, Async)
{
if (!supportFeature(devInfo, cv::cuda::FEATURE_SET_COMPUTE_30))
{
throw SkipTestException("CUDA device doesn't support texture objects");
}
else
{
const cv::Mat img = readImage("stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
const cv::cuda::GpuMat d_img_roi = loadMat(img, useRoi);
double low_thresh = 50.0;
double high_thresh = 100.0;
// Synchronous call
cv::Ptr<cv::cuda::CannyEdgeDetector> canny = cv::cuda::createCannyEdgeDetector(low_thresh, high_thresh, apperture_size, useL2gradient);
cv::cuda::GpuMat edges_gold;
canny->detect(d_img_roi, edges_gold);
// Asynchronous call
cv::cuda::GpuMat edges[NUM_STREAMS];
cv::parallel_for_(cv::Range(0, NUM_STREAMS), CannyAsyncParallelLoopBody(d_img_roi, edges, low_thresh, high_thresh, apperture_size, useL2gradient));
// Compare the results of synchronous call and asynchronous call
for (int i = 0; i < NUM_STREAMS; i++)
EXPECT_MAT_NEAR(edges_gold, edges[i], 0.0);
}
}
INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, Canny, testing::Combine(
ALL_DEVICES,
testing::Values(AppertureSize(3), AppertureSize(5)),
testing::Values(AppertureSize(3), AppertureSize(5), AppertureSize(7)),
testing::Values(L2gradient(false), L2gradient(true)),
WHOLE_SUBMAT));

@ -84,15 +84,97 @@ struct RGB2HSV_f
typedef float channel_type;
RGB2HSV_f(int _srccn, int _blueIdx, float _hrange)
: srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange) {}
: srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange) {
#if CV_SIMD128
hasSIMD = hasSIMD128();
#endif
}
#if CV_SIMD128
inline void process(v_float32x4& v_r, v_float32x4& v_g,
v_float32x4& v_b, float hscale) const
{
v_float32x4 v_min_rgb = v_min(v_min(v_r, v_g), v_b);
v_float32x4 v_max_rgb = v_max(v_max(v_r, v_g), v_b);
v_float32x4 v_eps = v_setall_f32(FLT_EPSILON);
v_float32x4 v_diff = v_max_rgb - v_min_rgb;
v_float32x4 v_s = v_diff / (v_abs(v_max_rgb) + v_eps);
v_float32x4 v_r_eq_max = v_r == v_max_rgb;
v_float32x4 v_g_eq_max = v_g == v_max_rgb;
v_float32x4 v_h = v_select(v_r_eq_max, v_g - v_b,
v_select(v_g_eq_max, v_b - v_r, v_r - v_g));
v_float32x4 v_res = v_select(v_r_eq_max, (v_g < v_b) & v_setall_f32(360.0f),
v_select(v_g_eq_max, v_setall_f32(120.0f), v_setall_f32(240.0f)));
v_float32x4 v_rev_diff = v_setall_f32(60.0f) / (v_diff + v_eps);
v_r = v_muladd(v_h, v_rev_diff, v_res) * v_setall_f32(hscale);
v_g = v_s;
v_b = v_max_rgb;
}
#endif
void operator()(const float* src, float* dst, int n) const
{
int i, bidx = blueIdx, scn = srccn;
int i = 0, bidx = blueIdx, scn = srccn;
float hscale = hrange*(1.f/360.f);
n *= 3;
for( i = 0; i < n; i += 3, src += scn )
#if CV_SIMD128
if (hasSIMD)
{
if (scn == 3) {
if (bidx) {
for ( ; i <= n - 12; i += 12, src += scn * 4)
{
v_float32x4 v_r;
v_float32x4 v_g;
v_float32x4 v_b;
v_load_deinterleave(src, v_r, v_g, v_b);
process(v_r, v_g, v_b, hscale);
v_store_interleave(dst + i, v_r, v_g, v_b);
}
} else {
for ( ; i <= n - 12; i += 12, src += scn * 4)
{
v_float32x4 v_r;
v_float32x4 v_g;
v_float32x4 v_b;
v_load_deinterleave(src, v_r, v_g, v_b);
process(v_b, v_g, v_r, hscale);
v_store_interleave(dst + i, v_b, v_g, v_r);
}
}
} else { // scn == 4
if (bidx) {
for ( ; i <= n - 12; i += 12, src += scn * 4)
{
v_float32x4 v_r;
v_float32x4 v_g;
v_float32x4 v_b;
v_float32x4 v_a;
v_load_deinterleave(src, v_r, v_g, v_b, v_a);
process(v_r, v_g, v_b, hscale);
v_store_interleave(dst + i, v_r, v_g, v_b);
}
} else {
for ( ; i <= n - 12; i += 12, src += scn * 4)
{
v_float32x4 v_r;
v_float32x4 v_g;
v_float32x4 v_b;
v_float32x4 v_a;
v_load_deinterleave(src, v_r, v_g, v_b, v_a);
process(v_b, v_g, v_r, hscale);
v_store_interleave(dst + i, v_b, v_g, v_r);
}
}
}
}
#endif
for( ; i < n; i += 3, src += scn )
{
float b = src[bidx], g = src[1], r = src[bidx^2];
float h, s, v;
@ -125,6 +207,9 @@ struct RGB2HSV_f
int srccn, blueIdx;
float hrange;
#if CV_SIMD128
bool hasSIMD;
#endif
};
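For reference, a scalar sketch of the per-pixel math the new process() vectorizes (it mirrors the select() chain above; hscale is hrange/360, and the function name is illustrative only):

    #include <algorithm>
    #include <cfloat>
    #include <cmath>

    static void rgb2hsv_scalar(float r, float g, float b, float hscale,
                               float& h, float& s, float& v)
    {
        float vmin = std::min(std::min(r, g), b);
        v = std::max(std::max(r, g), b);                 // value = max channel
        float diff = v - vmin;
        s = diff / (std::fabs(v) + FLT_EPSILON);         // saturation
        float rev = 60.f / (diff + FLT_EPSILON);
        if (v == r)      h = (g - b) * rev + (g < b ? 360.f : 0.f);
        else if (v == g) h = (b - r) * rev + 120.f;
        else             h = (r - g) * rev + 240.f;      // v == b
        h *= hscale;
    }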
@ -855,167 +940,111 @@ struct HLS2RGB_f
HLS2RGB_f(int _dstcn, int _blueIdx, float _hrange)
: dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) {
#if CV_SSE2
haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
#if CV_SIMD128
hasSIMD = hasSIMD128();
#endif
}
#if CV_SSE2
void process(__m128& v_h0, __m128& v_h1, __m128& v_l0,
__m128& v_l1, __m128& v_s0, __m128& v_s1) const
#if CV_SIMD128
inline void process(v_float32x4& v_h, v_float32x4& v_l, v_float32x4& v_s) const
{
__m128 v_lel0 = _mm_cmple_ps(v_l0, _mm_set1_ps(0.5f));
__m128 v_lel1 = _mm_cmple_ps(v_l1, _mm_set1_ps(0.5f));
__m128 v_p20 = _mm_andnot_ps(v_lel0, _mm_sub_ps(_mm_add_ps(v_l0, v_s0), _mm_mul_ps(v_l0, v_s0)));
__m128 v_p21 = _mm_andnot_ps(v_lel1, _mm_sub_ps(_mm_add_ps(v_l1, v_s1), _mm_mul_ps(v_l1, v_s1)));
v_p20 = _mm_or_ps(v_p20, _mm_and_ps(v_lel0, _mm_mul_ps(v_l0, _mm_add_ps(_mm_set1_ps(1.0f), v_s0))));
v_p21 = _mm_or_ps(v_p21, _mm_and_ps(v_lel1, _mm_mul_ps(v_l1, _mm_add_ps(_mm_set1_ps(1.0f), v_s1))));
__m128 v_p10 = _mm_sub_ps(_mm_mul_ps(_mm_set1_ps(2.0f), v_l0), v_p20);
__m128 v_p11 = _mm_sub_ps(_mm_mul_ps(_mm_set1_ps(2.0f), v_l1), v_p21);
v_h0 = _mm_mul_ps(v_h0, _mm_set1_ps(hscale));
v_h1 = _mm_mul_ps(v_h1, _mm_set1_ps(hscale));
__m128 v_pre_sector0 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_h0));
__m128 v_pre_sector1 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_h1));
v_h0 = _mm_sub_ps(v_h0, v_pre_sector0);
v_h1 = _mm_sub_ps(v_h1, v_pre_sector1);
__m128 v_p2_p10 = _mm_sub_ps(v_p20, v_p10);
__m128 v_p2_p11 = _mm_sub_ps(v_p21, v_p11);
__m128 v_tab20 = _mm_add_ps(v_p10, _mm_mul_ps(v_p2_p10, _mm_sub_ps(_mm_set1_ps(1.0f), v_h0)));
__m128 v_tab21 = _mm_add_ps(v_p11, _mm_mul_ps(v_p2_p11, _mm_sub_ps(_mm_set1_ps(1.0f), v_h1)));
__m128 v_tab30 = _mm_add_ps(v_p10, _mm_mul_ps(v_p2_p10, v_h0));
__m128 v_tab31 = _mm_add_ps(v_p11, _mm_mul_ps(v_p2_p11, v_h1));
__m128 v_sector0 = _mm_div_ps(v_pre_sector0, _mm_set1_ps(6.0f));
__m128 v_sector1 = _mm_div_ps(v_pre_sector1, _mm_set1_ps(6.0f));
v_sector0 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_sector0));
v_sector1 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_sector1));
v_sector0 = _mm_mul_ps(v_sector0, _mm_set1_ps(6.0f));
v_sector1 = _mm_mul_ps(v_sector1, _mm_set1_ps(6.0f));
v_sector0 = _mm_sub_ps(v_pre_sector0, v_sector0);
v_sector1 = _mm_sub_ps(v_pre_sector1, v_sector1);
v_h0 = _mm_and_ps(v_p10, _mm_cmplt_ps(v_sector0, _mm_set1_ps(2.0f)));
v_h1 = _mm_and_ps(v_p11, _mm_cmplt_ps(v_sector1, _mm_set1_ps(2.0f)));
v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_tab30, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(2.0f))));
v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_tab31, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(2.0f))));
v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_p20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(3.0f))));
v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_p21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(3.0f))));
v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_p20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(4.0f))));
v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_p21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(4.0f))));
v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_tab20, _mm_cmpgt_ps(v_sector0, _mm_set1_ps(4.0f))));
v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_tab21, _mm_cmpgt_ps(v_sector1, _mm_set1_ps(4.0f))));
v_l0 = _mm_and_ps(v_tab30, _mm_cmplt_ps(v_sector0, _mm_set1_ps(1.0f)));
v_l1 = _mm_and_ps(v_tab31, _mm_cmplt_ps(v_sector1, _mm_set1_ps(1.0f)));
v_l0 = _mm_or_ps(v_l0, _mm_and_ps(v_p20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(1.0f))));
v_l1 = _mm_or_ps(v_l1, _mm_and_ps(v_p21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(1.0f))));
v_l0 = _mm_or_ps(v_l0, _mm_and_ps(v_p20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(2.0f))));
v_l1 = _mm_or_ps(v_l1, _mm_and_ps(v_p21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(2.0f))));
v_l0 = _mm_or_ps(v_l0, _mm_and_ps(v_tab20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(3.0f))));
v_l1 = _mm_or_ps(v_l1, _mm_and_ps(v_tab21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(3.0f))));
v_l0 = _mm_or_ps(v_l0, _mm_and_ps(v_p10, _mm_cmpgt_ps(v_sector0, _mm_set1_ps(3.0f))));
v_l1 = _mm_or_ps(v_l1, _mm_and_ps(v_p11, _mm_cmpgt_ps(v_sector1, _mm_set1_ps(3.0f))));
v_s0 = _mm_and_ps(v_p20, _mm_cmplt_ps(v_sector0, _mm_set1_ps(1.0f)));
v_s1 = _mm_and_ps(v_p21, _mm_cmplt_ps(v_sector1, _mm_set1_ps(1.0f)));
v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_tab20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(1.0f))));
v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_tab21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(1.0f))));
v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_p10, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(2.0f))));
v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_p11, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(2.0f))));
v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_p10, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(3.0f))));
v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_p11, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(3.0f))));
v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_tab30, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(4.0f))));
v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_tab31, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(4.0f))));
v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_p20, _mm_cmpgt_ps(v_sector0, _mm_set1_ps(4.0f))));
v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_p21, _mm_cmpgt_ps(v_sector1, _mm_set1_ps(4.0f))));
v_float32x4 v_one = v_setall_f32(1.0f);
v_float32x4 v_l_le_half = v_l <= v_setall_f32(0.5f);
v_float32x4 v_ls = v_l * v_s;
v_float32x4 v_elem0 = v_select(v_l_le_half, v_ls, v_s - v_ls);
v_float32x4 v_hs_raw = v_h * v_setall_f32(hscale);
v_float32x4 v_pre_hs = v_cvt_f32(v_trunc(v_hs_raw));
v_float32x4 v_hs = v_hs_raw - v_pre_hs;
v_float32x4 v_sector = v_pre_hs - v_setall_f32(6.0f) * v_cvt_f32(v_trunc(v_hs_raw * v_setall_f32(1.0f / 6.0f)));
v_float32x4 v_elem1 = v_hs + v_hs;
v_float32x4 v_tab0 = v_l + v_elem0;
v_float32x4 v_tab1 = v_l - v_elem0;
v_float32x4 v_tab2 = v_l + v_elem0 - v_elem0 * v_elem1;
v_float32x4 v_tab3 = v_l - v_elem0 + v_elem0 * v_elem1;
v_float32x4 v_two = v_setall_f32(2.0f);
v_float32x4 v_four = v_setall_f32(4.0f);
v_h = v_select(v_sector < v_two , v_tab1,
v_select(v_sector <= v_two , v_tab3,
v_select(v_sector <= v_four, v_tab0, v_tab2)));
v_l = v_select(v_sector < v_one , v_tab3,
v_select(v_sector <= v_two , v_tab0,
v_select(v_sector < v_four, v_tab2, v_tab1)));
v_s = v_select(v_sector < v_one , v_tab0,
v_select(v_sector < v_two , v_tab2,
v_select(v_sector < v_four, v_tab1,
v_select(v_sector <= v_four, v_tab3, v_tab0))));
}
#endif
void operator()(const float* src, float* dst, int n) const
{
int i = 0, bidx = blueIdx, dcn = dstcn;
float _hscale = hscale;
float alpha = ColorChannel<float>::max();
n *= 3;
#if CV_SSE2
if (haveSIMD)
#if CV_SIMD128
if (hasSIMD)
{
for( ; i <= n - 24; i += 24, dst += dcn * 8 )
if (dcn == 3)
{
__m128 v_h0 = _mm_loadu_ps(src + i + 0);
__m128 v_h1 = _mm_loadu_ps(src + i + 4);
__m128 v_l0 = _mm_loadu_ps(src + i + 8);
__m128 v_l1 = _mm_loadu_ps(src + i + 12);
__m128 v_s0 = _mm_loadu_ps(src + i + 16);
__m128 v_s1 = _mm_loadu_ps(src + i + 20);
_mm_deinterleave_ps(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1);
process(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1);
if (dcn == 3)
if (bidx)
{
if (bidx)
for (; i <= n - 12; i += 12, dst += dcn * 4)
{
_mm_interleave_ps(v_s0, v_s1, v_l0, v_l1, v_h0, v_h1);
_mm_storeu_ps(dst + 0, v_s0);
_mm_storeu_ps(dst + 4, v_s1);
_mm_storeu_ps(dst + 8, v_l0);
_mm_storeu_ps(dst + 12, v_l1);
_mm_storeu_ps(dst + 16, v_h0);
_mm_storeu_ps(dst + 20, v_h1);
v_float32x4 v_h;
v_float32x4 v_l;
v_float32x4 v_s;
v_load_deinterleave(src + i, v_h, v_l, v_s);
process(v_h, v_l, v_s);
v_store_interleave(dst, v_s, v_l, v_h);
}
else
} else {
for (; i <= n - 12; i += 12, dst += dcn * 4)
{
_mm_interleave_ps(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1);
_mm_storeu_ps(dst + 0, v_h0);
_mm_storeu_ps(dst + 4, v_h1);
_mm_storeu_ps(dst + 8, v_l0);
_mm_storeu_ps(dst + 12, v_l1);
_mm_storeu_ps(dst + 16, v_s0);
_mm_storeu_ps(dst + 20, v_s1);
v_float32x4 v_h;
v_float32x4 v_l;
v_float32x4 v_s;
v_load_deinterleave(src + i, v_h, v_l, v_s);
process(v_h, v_l, v_s);
v_store_interleave(dst, v_h, v_l, v_s);
}
}
else
} else { // dcn == 4
if (bidx)
{
__m128 v_a0 = _mm_set1_ps(alpha);
__m128 v_a1 = _mm_set1_ps(alpha);
if (bidx)
for (; i <= n - 12; i += 12, dst += dcn * 4)
{
_mm_interleave_ps(v_s0, v_s1, v_l0, v_l1, v_h0, v_h1, v_a0, v_a1);
_mm_storeu_ps(dst + 0, v_s0);
_mm_storeu_ps(dst + 4, v_s1);
_mm_storeu_ps(dst + 8, v_l0);
_mm_storeu_ps(dst + 12, v_l1);
_mm_storeu_ps(dst + 16, v_h0);
_mm_storeu_ps(dst + 20, v_h1);
_mm_storeu_ps(dst + 24, v_a0);
_mm_storeu_ps(dst + 28, v_a1);
v_float32x4 v_h;
v_float32x4 v_l;
v_float32x4 v_s;
v_load_deinterleave(src + i, v_h, v_l, v_s);
process(v_h, v_l, v_s);
v_float32x4 v_a = v_setall_f32(alpha);
v_store_interleave(dst, v_s, v_l, v_h, v_a);
}
else
} else {
for (; i <= n - 12; i += 12, dst += dcn * 4)
{
_mm_interleave_ps(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1, v_a0, v_a1);
_mm_storeu_ps(dst + 0, v_h0);
_mm_storeu_ps(dst + 4, v_h1);
_mm_storeu_ps(dst + 8, v_l0);
_mm_storeu_ps(dst + 12, v_l1);
_mm_storeu_ps(dst + 16, v_s0);
_mm_storeu_ps(dst + 20, v_s1);
_mm_storeu_ps(dst + 24, v_a0);
_mm_storeu_ps(dst + 28, v_a1);
v_float32x4 v_h;
v_float32x4 v_l;
v_float32x4 v_s;
v_load_deinterleave(src + i, v_h, v_l, v_s);
process(v_h, v_l, v_s);
v_float32x4 v_a = v_setall_f32(alpha);
v_store_interleave(dst, v_h, v_l, v_s, v_a);
}
}
}
}
#endif
for( ; i < n; i += 3, dst += dcn )
{
float h = src[i], l = src[i+1], s = src[i+2];
@ -1033,7 +1062,7 @@ struct HLS2RGB_f
float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
float p1 = 2*l - p2;
h *= _hscale;
h *= hscale;
if( h < 0 )
do h += 6; while( h < 0 );
else if( h >= 6 )
@ -1063,8 +1092,8 @@ struct HLS2RGB_f
int dstcn, blueIdx;
float hscale;
#if CV_SSE2
bool haveSIMD;
#if CV_SIMD128
bool hasSIMD;
#endif
};

@ -213,7 +213,7 @@ TEST(Photo_MergeRobertson, regression)
loadImage(test_path + "merge/robertson.hdr", expected);
merge->process(images, result, times);
#ifdef __aarch64__
#if defined(__aarch64__) || defined(__PPC64__)
const float eps = 6.f;
#else
const float eps = 5.f;

@ -48,6 +48,7 @@
#endif
#include "opencv2/core.hpp"
#include "opencv2/core/cuda.hpp"
namespace cv {
namespace detail {

@ -870,7 +870,12 @@ bool CvCapture_FFMPEG::open( const char* _filename )
int enc_width = enc->width;
int enc_height = enc->height;
AVCodec *codec = avcodec_find_decoder(enc->codec_id);
AVCodec *codec;
if(av_dict_get(dict, "video_codec", NULL, 0) == NULL) {
codec = avcodec_find_decoder(enc->codec_id);
} else {
codec = avcodec_find_decoder_by_name(av_dict_get(dict, "video_codec", NULL, 0)->value);
}
if (!codec ||
#if LIBAVCODEC_VERSION_INT >= ((53<<16)+(8<<8)+0)
avcodec_open2(enc, codec, NULL)
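A hedged standalone sketch of the selection logic this hunk introduces; only the branch itself mirrors the diff, and how the options dictionary gets populated is an assumption:

    extern "C" {
    #include <libavcodec/avcodec.h>
    #include <libavutil/dict.h>
    }

    // choose the decoder by name when the options dictionary carries a "video_codec"
    // entry, otherwise fall back to the codec id reported by the stream (as before)
    static AVCodec* pickDecoder(AVDictionary* dict, enum AVCodecID stream_codec_id)
    {
        AVDictionaryEntry* e = av_dict_get(dict, "video_codec", NULL, 0);
        return e ? avcodec_find_decoder_by_name(e->value)
                 : avcodec_find_decoder(stream_codec_id);
    }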

@ -634,11 +634,11 @@ static int icvOpenCamera_QT (CvCapture_QT_Cam * capture, const int index)
OPENCV_ASSERT (capture, "icvOpenCamera_QT", "'capture' is a NULL-pointer");
OPENCV_ASSERT (index >=0, "icvOpenCamera_QT", "camera index is negative");
ComponentDescription component_description;
Component component = 0;
ComponentDescription component_description;
Component component = 0;
int number_of_inputs = 0;
Rect myRect;
ComponentResult result = noErr;
ComponentResult result = noErr;
// travers all components and count video digitizer channels

@ -16,18 +16,18 @@ public:
virtual bool getMemoryShapes(const std::vector<std::vector<int> > &inputs,
const int requiredOutputs,
std::vector<std::vector<int> > &outputs,
std::vector<std::vector<int> > &internals) const;
std::vector<std::vector<int> > &internals) const CV_OVERRIDE;
//! [MyLayer::getMemoryShapes]
//! [MyLayer::forward]
virtual void forward(std::vector<cv::Mat*> &inputs, std::vector<cv::Mat> &outputs, std::vector<cv::Mat> &internals);
virtual void forward(std::vector<cv::Mat*> &inputs, std::vector<cv::Mat> &outputs, std::vector<cv::Mat> &internals) CV_OVERRIDE;
//! [MyLayer::forward]
//! [MyLayer::finalize]
virtual void finalize(const std::vector<cv::Mat*> &inputs, std::vector<cv::Mat> &outputs);
virtual void finalize(const std::vector<cv::Mat*> &inputs, std::vector<cv::Mat> &outputs) CV_OVERRIDE;
//! [MyLayer::finalize]
virtual void forward(cv::InputArrayOfArrays inputs, cv::OutputArrayOfArrays outputs, cv::OutputArrayOfArrays internals);
virtual void forward(cv::InputArrayOfArrays inputs, cv::OutputArrayOfArrays outputs, cv::OutputArrayOfArrays internals) CV_OVERRIDE;
};
//! [A custom layer interface]
@ -49,7 +49,7 @@ public:
virtual bool getMemoryShapes(const std::vector<std::vector<int> > &inputs,
const int requiredOutputs,
std::vector<std::vector<int> > &outputs,
std::vector<std::vector<int> > &internals) const
std::vector<std::vector<int> > &internals) const CV_OVERRIDE
{
CV_UNUSED(requiredOutputs); CV_UNUSED(internals);
std::vector<int> outShape(4);
@ -62,7 +62,7 @@ public:
}
// Implementation of this custom layer is based on https://github.com/cdmh/deeplab-public/blob/master/src/caffe/layers/interp_layer.cpp
virtual void forward(std::vector<cv::Mat*> &inputs, std::vector<cv::Mat> &outputs, std::vector<cv::Mat> &internals)
virtual void forward(std::vector<cv::Mat*> &inputs, std::vector<cv::Mat> &outputs, std::vector<cv::Mat> &internals) CV_OVERRIDE
{
CV_UNUSED(internals);
cv::Mat& inp = *inputs[0];
@ -105,7 +105,7 @@ public:
}
}
virtual void forward(cv::InputArrayOfArrays, cv::OutputArrayOfArrays, cv::OutputArrayOfArrays) {}
virtual void forward(cv::InputArrayOfArrays, cv::OutputArrayOfArrays, cv::OutputArrayOfArrays) CV_OVERRIDE {}
private:
int outWidth, outHeight;
@ -132,7 +132,7 @@ public:
virtual bool getMemoryShapes(const std::vector<std::vector<int> > &inputs,
const int requiredOutputs,
std::vector<std::vector<int> > &outputs,
std::vector<std::vector<int> > &internals) const
std::vector<std::vector<int> > &internals) const CV_OVERRIDE
{
CV_UNUSED(requiredOutputs); CV_UNUSED(internals);
std::vector<int> outShape(4);
@ -146,7 +146,7 @@ public:
// This implementation is based on a reference implementation from
// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
virtual void forward(std::vector<cv::Mat*> &inputs, std::vector<cv::Mat> &outputs, std::vector<cv::Mat> &internals)
virtual void forward(std::vector<cv::Mat*> &inputs, std::vector<cv::Mat> &outputs, std::vector<cv::Mat> &internals) CV_OVERRIDE
{
CV_UNUSED(internals);
cv::Mat& inp = *inputs[0];
@ -187,7 +187,7 @@ public:
}
}
virtual void forward(cv::InputArrayOfArrays, cv::OutputArrayOfArrays, cv::OutputArrayOfArrays) {}
virtual void forward(cv::InputArrayOfArrays, cv::OutputArrayOfArrays, cv::OutputArrayOfArrays) CV_OVERRIDE {}
private:
static inline int offset(const cv::MatSize& size, int c, int x, int y, int b)

@ -7,12 +7,13 @@
const char* keys =
"{ help h | | Print help message. }"
"{ input i | | Path to input image or video file. Skip this argument to capture frames from a camera.}"
"{ device | 0 | camera device number. }"
"{ input i | | Path to input image or video file. Skip this argument to capture frames from a camera. }"
"{ model m | | Path to a binary file of model contains trained weights. "
"It could be a file with extensions .caffemodel (Caffe), "
".pb (TensorFlow), .t7 or .net (Torch), .weights (Darknet) }"
".pb (TensorFlow), .t7 or .net (Torch), .weights (Darknet).}"
"{ config c | | Path to a text file of model contains network configuration. "
"It could be a file with extensions .prototxt (Caffe), .pbtxt (TensorFlow), .cfg (Darknet) }"
"It could be a file with extensions .prototxt (Caffe), .pbtxt (TensorFlow), .cfg (Darknet).}"
"{ framework f | | Optional name of an origin framework of the model. Detect it automatically if it does not set. }"
"{ classes | | Optional path to a text file with names of classes to label detected objects. }"
"{ mean | | Preprocess input image by subtracting mean values. Mean values should be in BGR order and delimited by spaces. }"
@ -91,7 +92,7 @@ int main(int argc, char** argv)
if (parser.has("input"))
cap.open(parser.get<String>("input"));
else
cap.open(0);
cap.open(parser.get<int>("device"));
// Process frames.
Mat frame, blob;

@ -61,12 +61,16 @@ int main(int argc, char **argv)
"{ p proto | | (required) model configuration, e.g. hand/pose.prototxt }"
"{ m model | | (required) model weights, e.g. hand/pose_iter_102000.caffemodel }"
"{ i image | | (required) path to image file (containing a single person, or hand) }"
"{ width | 368 | Preprocess input image by resizing to a specific width. }"
"{ height | 368 | Preprocess input image by resizing to a specific height. }"
"{ t threshold | 0.1 | threshold or confidence value for the heatmap }"
);
String modelTxt = parser.get<string>("proto");
String modelBin = parser.get<string>("model");
String imageFile = parser.get<String>("image");
int W_in = parser.get<int>("width");
int H_in = parser.get<int>("height");
float thresh = parser.get<float>("threshold");
if (parser.get<bool>("help") || modelTxt.empty() || modelBin.empty() || imageFile.empty())
{
@ -75,10 +79,6 @@ int main(int argc, char **argv)
return 0;
}
// fixed input size for the pretrained network
int W_in = 368;
int H_in = 368;
// read the network model
Net net = readNetFromCaffe(modelTxt, modelBin);

@ -7,12 +7,13 @@
const char* keys =
"{ help h | | Print help message. }"
"{ input i | | Path to input image or video file. Skip this argument to capture frames from a camera.}"
"{ device | 0 | camera device number. }"
"{ input i | | Path to input image or video file. Skip this argument to capture frames from a camera. }"
"{ model m | | Path to a binary file of model contains trained weights. "
"It could be a file with extensions .caffemodel (Caffe), "
".pb (TensorFlow), .t7 or .net (Torch), .weights (Darknet) }"
".pb (TensorFlow), .t7 or .net (Torch), .weights (Darknet). }"
"{ config c | | Path to a text file of model contains network configuration. "
"It could be a file with extensions .prototxt (Caffe), .pbtxt (TensorFlow), .cfg (Darknet) }"
"It could be a file with extensions .prototxt (Caffe), .pbtxt (TensorFlow), .cfg (Darknet). }"
"{ framework f | | Optional name of an origin framework of the model. Detect it automatically if it does not set. }"
"{ classes | | Optional path to a text file with names of classes. }"
"{ colors | | Optional path to a text file with colors for an every class. "
@ -111,7 +112,7 @@ int main(int argc, char** argv)
if (parser.has("input"))
cap.open(parser.get<String>("input"));
else
cap.open(0);
cap.open(parser.get<int>("device"));
//! [Open a video file or an image file or a camera stream]
// Process frames.
