mirror of https://github.com/opencv/opencv.git
pull/13383/head
commit be8e31f14d (parent d99f4a2beb)
24 changed files with 2862 additions and 2595 deletions
@ -0,0 +1,464 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "internal_shared.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp"

using namespace cv::gpu;
using namespace cv::gpu::device;

namespace cv { namespace gpu { namespace bfmatcher
{
    template <typename VecDiff, typename Dist, typename T, typename Mask>
    __device__ void distanceCalcLoop(const PtrStep_<T>& query, const DevMem2D_<T>& train, const Mask& m, int queryIdx,
        typename Dist::result_type& distMin1, typename Dist::result_type& distMin2, int& bestTrainIdx1, int& bestTrainIdx2,
        typename Dist::result_type* smem)
    {
        const VecDiff vecDiff(query.ptr(queryIdx), train.cols, (typename Dist::value_type*)smem, threadIdx.y * blockDim.x + threadIdx.x, threadIdx.x);

        typename Dist::result_type* sdiffRow = smem + blockDim.x * threadIdx.y;

        distMin1 = numeric_limits<typename Dist::result_type>::max();
        distMin2 = numeric_limits<typename Dist::result_type>::max();

        bestTrainIdx1 = -1;
        bestTrainIdx2 = -1;

        for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += blockDim.y)
        {
            if (m(queryIdx, trainIdx))
            {
                Dist dist;

                const T* trainRow = train.ptr(trainIdx);

                vecDiff.calc(trainRow, train.cols, dist, sdiffRow, threadIdx.x);

                const typename Dist::result_type val = dist;

                if (val < distMin1)
                {
                    // The previous best must be demoted to second place;
                    // otherwise the reported second neighbour can be wrong.
                    distMin2 = distMin1;
                    bestTrainIdx2 = bestTrainIdx1;

                    distMin1 = val;
                    bestTrainIdx1 = trainIdx;
                }
                else if (val < distMin2)
                {
                    distMin2 = val;
                    bestTrainIdx2 = trainIdx;
                }
            }
        }
    }
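
    // Each thread row (fixed threadIdx.y) scans a strided subset of the train
    // descriptors and keeps its two best candidates in registers
    // (distMin1/bestTrainIdx1 and distMin2/bestTrainIdx2); the per-row results
    // are merged across the block by knnMatch2 below.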

    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename VecDiff, typename Dist, typename T, typename Mask>
    __global__ void knnMatch2(const PtrStep_<T> query, const DevMem2D_<T> train, const Mask m, int2* trainIdx, float2* distance)
    {
        typedef typename Dist::result_type result_type;
        typedef typename Dist::value_type value_type;

        __shared__ result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y];

        const int queryIdx = blockIdx.x;

        result_type distMin1;
        result_type distMin2;

        int bestTrainIdx1;
        int bestTrainIdx2;

        distanceCalcLoop<VecDiff, Dist>(query, train, m, queryIdx, distMin1, distMin2, bestTrainIdx1, bestTrainIdx2, smem);
        __syncthreads();

        volatile result_type* sdistMinRow = smem;
        volatile int* sbestTrainIdxRow = (int*)(sdistMinRow + 2 * BLOCK_DIM_Y);

        if (threadIdx.x == 0)
        {
            sdistMinRow[threadIdx.y] = distMin1;
            sdistMinRow[threadIdx.y + BLOCK_DIM_Y] = distMin2;

            sbestTrainIdxRow[threadIdx.y] = bestTrainIdx1;
            sbestTrainIdxRow[threadIdx.y + BLOCK_DIM_Y] = bestTrainIdx2;
        }
        __syncthreads();

        if (threadIdx.x == 0 && threadIdx.y == 0)
        {
            distMin1 = numeric_limits<result_type>::max();
            distMin2 = numeric_limits<result_type>::max();

            bestTrainIdx1 = -1;
            bestTrainIdx2 = -1;

            #pragma unroll
            for (int i = 0; i < BLOCK_DIM_Y; ++i)
            {
                result_type val = sdistMinRow[i];

                if (val < distMin1)
                {
                    // Demote the previous best before replacing it, so the
                    // second-best slot stays correct.
                    distMin2 = distMin1;
                    bestTrainIdx2 = bestTrainIdx1;

                    distMin1 = val;
                    bestTrainIdx1 = sbestTrainIdxRow[i];
                }
                else if (val < distMin2)
                {
                    distMin2 = val;
                    bestTrainIdx2 = sbestTrainIdxRow[i];
                }
            }

            #pragma unroll
            for (int i = BLOCK_DIM_Y; i < 2 * BLOCK_DIM_Y; ++i)
            {
                result_type val = sdistMinRow[i];

                if (val < distMin2)
                {
                    distMin2 = val;
                    bestTrainIdx2 = sbestTrainIdxRow[i];
                }
            }

            trainIdx[queryIdx] = make_int2(bestTrainIdx1, bestTrainIdx2);
            distance[queryIdx] = make_float2(distMin1, distMin2);
        }
    }
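
    // Merge stage: each row stages its two candidates in shared memory
    // (2 * BLOCK_DIM_Y distances plus the matching train indices) and thread
    // (0, 0) scans them sequentially: first the per-row winners, which can
    // claim either slot, then the per-row runners-up, which can only ever
    // improve the second-best slot.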

    ///////////////////////////////////////////////////////////////////////////////
    // Knn 2 Match kernel caller

    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
    void knnMatch2Simple_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
        const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance,
        cudaStream_t stream)
    {
        const dim3 grid(query.rows, 1, 1);
        const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);

        // Pass raw device pointers: the kernel expects int2* / float2*.
        knnMatch2<BLOCK_DIM_X, BLOCK_DIM_Y, VecDiffGlobal<BLOCK_DIM_X, T>, Dist, T>
            <<<grid, threads, 0, stream>>>(query, train, mask, trainIdx.data, distance.data);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T, typename Mask>
    void knnMatch2Cached_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
        const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance,
        cudaStream_t stream)
    {
        StaticAssert<BLOCK_DIM_X * BLOCK_DIM_Y >= MAX_LEN>::check(); // the block size must be at least the descriptor length
        StaticAssert<MAX_LEN % BLOCK_DIM_X == 0>::check();           // the maximum descriptor length must be a multiple of BLOCK_DIM_X

        const dim3 grid(query.rows, 1, 1);
        const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);

        knnMatch2<BLOCK_DIM_X, BLOCK_DIM_Y, VecDiffCachedRegister<BLOCK_DIM_X, MAX_LEN, LEN_EQ_MAX_LEN, typename Dist::value_type>, Dist, T>
            <<<grid, threads, 0, stream>>>(query, train, mask, trainIdx.data, distance.data);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
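
    // Caching strategy (inferred from the template parameters, so treat this as
    // an assumption): VecDiffCachedRegister appears to preload the query
    // descriptor into registers, which is why the static asserts above require
    // the block to contain at least MAX_LEN threads; LEN_EQ_MAX_LEN marks the
    // exact-length case where the loader can presumably skip its tail bounds
    // checks.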

    ///////////////////////////////////////////////////////////////////////////////
    // Knn 2 Match Dispatcher

    template <typename Dist, typename T, typename Mask>
    void knnMatch2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
        const DevMem2D& trainIdx, const DevMem2D& distance,
        int cc, cudaStream_t stream)
    {
        if (query.cols < 64)
        {
            knnMatch2Cached_caller<16, 16, 64, false, Dist>(
                query, train, mask,
                static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
                stream);
        }
        else if (query.cols == 64)
        {
            knnMatch2Cached_caller<16, 16, 64, true, Dist>(
                query, train, mask,
                static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
                stream);
        }
        else if (query.cols < 128)
        {
            knnMatch2Cached_caller<16, 16, 128, false, Dist>(
                query, train, mask,
                static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
                stream);
        }
        else if (query.cols == 128 && cc >= 12)
        {
            knnMatch2Cached_caller<16, 16, 128, true, Dist>(
                query, train, mask,
                static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
                stream);
        }
        else if (query.cols < 256 && cc >= 12)
        {
            knnMatch2Cached_caller<16, 16, 256, false, Dist>(
                query, train, mask,
                static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
                stream);
        }
        else if (query.cols == 256 && cc >= 12)
        {
            knnMatch2Cached_caller<16, 16, 256, true, Dist>(
                query, train, mask,
                static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
                stream);
        }
        else
        {
            knnMatch2Simple_caller<16, 16, Dist>(
                query, train, mask,
                static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
                stream);
        }
    }
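
    // Dispatch note: the branches above select a register-cached kernel sized
    // for the next supported descriptor length (64, 128 or 256 elements).
    // Judging by the comparison against 12, cc is the device compute capability
    // packed as major * 10 + minor, so the 128- and 256-element caches are
    // reserved for devices of compute capability 1.2 or newer; longer
    // descriptors, or older hardware, fall back to the uncached
    // knnMatch2Simple_caller.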

    ///////////////////////////////////////////////////////////////////////////////
    // Calc distance kernel

    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
    __global__ void calcDistance(const PtrStep_<T> query, const DevMem2D_<T> train, const Mask mask, PtrStepf distance)
    {
        __shared__ typename Dist::result_type sdiff[BLOCK_DIM_X * BLOCK_DIM_Y];

        typename Dist::result_type* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y;

        const int queryIdx = blockIdx.x;
        const T* queryDescs = query.ptr(queryIdx);

        const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;

        if (trainIdx < train.rows)
        {
            const T* trainDescs = train.ptr(trainIdx);

            typename Dist::result_type myDist = numeric_limits<typename Dist::result_type>::max();

            if (mask(queryIdx, trainIdx))
            {
                Dist dist;

                calcVecDiffGlobal<BLOCK_DIM_X>(queryDescs, trainDescs, train.cols, dist, sdiff_row, threadIdx.x);

                myDist = dist;
            }

            if (threadIdx.x == 0)
                distance.ptr(queryIdx)[trainIdx] = myDist;
        }
    }
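
    // The allDist matrix filled above has one row per query descriptor and one
    // column per train descriptor. Masked-out pairs keep the numeric_limits
    // maximum as a sentinel, which the k-NN selection below treats as "no
    // match".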

    ///////////////////////////////////////////////////////////////////////////////
    // Calc distance kernel caller

    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
    void calcDistance_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2Df& distance, cudaStream_t stream)
    {
        const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
        const dim3 grid(query.rows, divUp(train.rows, BLOCK_DIM_Y), 1);

        calcDistance<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads, 0, stream>>>(query, train, mask, distance);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <typename Dist, typename T, typename Mask>
    void calcDistanceDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2D& allDist, cudaStream_t stream)
    {
        calcDistance_caller<16, 16, Dist>(query, train, mask, static_cast<DevMem2Df>(allDist), stream);
    }

    ///////////////////////////////////////////////////////////////////////////////
    // find knn match kernel

    template <int BLOCK_SIZE> __global__ void findBestMatch(DevMem2Df allDist_, int i, PtrStepi trainIdx_, PtrStepf distance_)
    {
        const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;
        __shared__ float sdist[SMEM_SIZE];
        __shared__ int strainIdx[SMEM_SIZE];

        const int queryIdx = blockIdx.x;

        float* allDist = allDist_.ptr(queryIdx);
        int* trainIdx = trainIdx_.ptr(queryIdx);
        float* distance = distance_.ptr(queryIdx);

        float dist = numeric_limits<float>::max();
        int bestIdx = -1;

        // The parameter i is the rank of the match being extracted (0 .. k-1);
        // use a separate loop counter so it is not shadowed.
        for (int c = threadIdx.x; c < allDist_.cols; c += BLOCK_SIZE)
        {
            float reg = allDist[c];
            if (reg < dist)
            {
                dist = reg;
                bestIdx = c;
            }
        }

        sdist[threadIdx.x] = dist;
        strainIdx[threadIdx.x] = bestIdx;
        __syncthreads();

        reducePredVal<BLOCK_SIZE>(sdist, dist, strainIdx, bestIdx, threadIdx.x, less<volatile float>());

        if (threadIdx.x == 0)
        {
            if (dist < numeric_limits<float>::max())
            {
                allDist[bestIdx] = numeric_limits<float>::max();
                trainIdx[i] = bestIdx;
                distance[i] = dist;
            }
        }
    }
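
    // k-NN extraction: findBestMatch is launched k times per query (see the
    // caller below). Each pass selects the current minimum of the row, records
    // it at rank i, and overwrites the winning slot with the maximum float so
    // the next pass yields the next-best match.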

    ///////////////////////////////////////////////////////////////////////////////
    // find knn match kernel caller

    template <int BLOCK_SIZE> void findKnnMatch_caller(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
    {
        const dim3 threads(BLOCK_SIZE, 1, 1);
        const dim3 grid(trainIdx.rows, 1, 1);

        for (int i = 0; i < k; ++i)
        {
            findBestMatch<BLOCK_SIZE><<<grid, threads, 0, stream>>>(allDist, i, trainIdx, distance);
            cudaSafeCall( cudaGetLastError() );
        }

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    void findKnnMatchDispatcher(int k, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, cudaStream_t stream)
    {
        findKnnMatch_caller<256>(k, static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), static_cast<DevMem2Df>(allDist), stream);
    }

    ///////////////////////////////////////////////////////////////////////////////
    // knn match Dispatcher

    template <typename Dist, typename T>
    void knnMatchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, int k, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
        int cc, cudaStream_t stream)
    {
        if (mask.data)
        {
            if (k == 2)
            {
                knnMatch2Dispatcher<Dist>(query, train, SingleMask(mask), trainIdx, distance, cc, stream);
                return;
            }

            calcDistanceDispatcher<Dist>(query, train, SingleMask(mask), allDist, stream);
        }
        else
        {
            if (k == 2)
            {
                knnMatch2Dispatcher<Dist>(query, train, WithOutMask(), trainIdx, distance, cc, stream);
                return;
            }

            calcDistanceDispatcher<Dist>(query, train, WithOutMask(), allDist, stream);
        }

        findKnnMatchDispatcher(k, trainIdx, distance, allDist, stream);
    }

    ///////////////////////////////////////////////////////////////////////////////
    // knn match caller

    template <typename T> void knnMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
        int cc, cudaStream_t stream)
    {
        knnMatchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, mask, trainIdx, distance, allDist, cc, stream);
    }

    template void knnMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    //template void knnMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    template void knnMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    template void knnMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    template void knnMatchL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    template void knnMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);

    template <typename T> void knnMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
        int cc, cudaStream_t stream)
    {
        knnMatchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, mask, trainIdx, distance, allDist, cc, stream);
    }

    //template void knnMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    //template void knnMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    //template void knnMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    //template void knnMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    //template void knnMatchL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    template void knnMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);

    template <typename T> void knnMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
        int cc, cudaStream_t stream)
    {
        knnMatchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, mask, trainIdx, distance, allDist, cc, stream);
    }

    template void knnMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    //template void knnMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    template void knnMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    //template void knnMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
    template void knnMatchHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
}}}
@ -0,0 +1,403 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "internal_shared.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp"

using namespace cv::gpu;
using namespace cv::gpu::device;

namespace cv { namespace gpu { namespace bfmatcher
{
    template <int BLOCK_DIM_Y, typename T>
    __device__ void findBestMatch(T& myDist, int2& myIdx, T* smin, int2* sIdx)
    {
        if (threadIdx.x == 0)
        {
            smin[threadIdx.y] = myDist;
            sIdx[threadIdx.y] = myIdx;
        }
        __syncthreads();

        reducePredVal<BLOCK_DIM_Y>(smin, myDist, sIdx, myIdx, threadIdx.y * blockDim.x + threadIdx.x, less<volatile T>());
    }

    template <typename Dist, typename VecDiff, typename T, typename Mask>
    __device__ void matchDescs(int queryIdx, int imgIdx, const DevMem2D_<T>& train, const Mask& m, const VecDiff& vecDiff,
        typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* sdiff_row)
    {
        for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += blockDim.y)
        {
            if (m(queryIdx, trainIdx))
            {
                const T* trainDescs = train.ptr(trainIdx);

                Dist dist;

                vecDiff.calc(trainDescs, train.cols, dist, sdiff_row, threadIdx.x);

                const typename Dist::result_type res = dist;

                if (res < myDist)
                {
                    myDist = res;
                    myIdx.x = trainIdx;
                    myIdx.y = imgIdx;
                }
            }
        }
    }

    template <typename T> struct SingleTrain
    {
        explicit SingleTrain(const DevMem2D_<T>& train_) : train(train_)
        {
        }

        template <typename Dist, typename VecDiff, typename Mask>
        __device__ __forceinline__ void loop(int queryIdx, Mask& m, const VecDiff& vecDiff,
            typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* sdiff_row) const
        {
            matchDescs<Dist>(queryIdx, 0, train, m, vecDiff, myDist, myIdx, sdiff_row);
        }

        __device__ __forceinline__ int desc_len() const
        {
            return train.cols;
        }

        static __device__ __forceinline__ void storeResult(float* distance, int* trainIdx, int* imgIdx,
            float myDist, const int2& myIdx, int queryIdx)
        {
            // imgIdx is unused: there is only one train image.
            trainIdx[queryIdx] = myIdx.x;
            distance[queryIdx] = myDist;
        }

        const DevMem2D_<T> train;
    };

    template <typename T> struct TrainCollection
    {
        TrainCollection(const DevMem2D_<T>* trainCollection_, int nImg_, int desclen_) :
            trainCollection(trainCollection_), nImg(nImg_), desclen(desclen_)
        {
        }

        template <typename Dist, typename VecDiff, typename Mask>
        __device__ void loop(int queryIdx, Mask& m, const VecDiff& vecDiff,
            typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* sdiff_row) const
        {
            for (int imgIdx = 0; imgIdx < nImg; ++imgIdx)
            {
                const DevMem2D_<T> train = trainCollection[imgIdx];
                m.next();
                matchDescs<Dist>(queryIdx, imgIdx, train, m, vecDiff, myDist, myIdx, sdiff_row);
            }
        }

        __device__ __forceinline__ int desc_len() const
        {
            return desclen;
        }

        static __device__ __forceinline__ void storeResult(float* distance, int* trainIdx, int* imgIdx,
            float myDist, const int2& myIdx, int queryIdx)
        {
            trainIdx[queryIdx] = myIdx.x;
            imgIdx[queryIdx] = myIdx.y;
            distance[queryIdx] = myDist;
        }

        const DevMem2D_<T>* trainCollection;
        const int nImg;
        const int desclen;
    };
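
    // SingleTrain and TrainCollection expose the same interface (loop, desc_len,
    // storeResult), so the match kernel below is written once and works for both
    // the one-image and the many-image cases. TrainCollection::loop calls
    // m.next() before each image so that a mask collection can step to the mask
    // belonging to that image, and its storeResult additionally records which
    // image the winner came from.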

    template <typename VecDiff, typename Dist, typename T, typename Train, typename Mask>
    __device__ void distanceCalcLoop(const PtrStep_<T>& query, const Train& train, const Mask& mask, int queryIdx,
        typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* smem)
    {
        const VecDiff vecDiff(query.ptr(queryIdx), train.desc_len(), (typename Dist::value_type*)smem, threadIdx.y * blockDim.x + threadIdx.x, threadIdx.x);

        typename Dist::result_type* sdiff_row = smem + blockDim.x * threadIdx.y;

        Mask m = mask;

        myIdx.x = -1;
        myIdx.y = -1;
        myDist = numeric_limits<typename Dist::result_type>::max();

        train.template loop<Dist>(queryIdx, m, vecDiff, myDist, myIdx, sdiff_row);
    }

    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename VecDiff, typename Dist, typename T, typename Train, typename Mask>
    __global__ void match(const PtrStep_<T> query, const Train train, const Mask mask, int* trainIdx, int* imgIdx, float* distance)
    {
        __shared__ typename Dist::result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y];

        const int queryIdx = blockIdx.x;

        int2 myIdx;
        typename Dist::result_type myDist;

        distanceCalcLoop<VecDiff, Dist>(query, train, mask, queryIdx, myDist, myIdx, smem);
        __syncthreads();

        typename Dist::result_type* smin = smem;
        int2* sIdx = (int2*)(smin + BLOCK_DIM_Y);

        findBestMatch<BLOCK_DIM_Y>(myDist, myIdx, smin, sIdx);

        if (threadIdx.x == 0 && threadIdx.y == 0)
            Train::storeResult(distance, trainIdx, imgIdx, myDist, myIdx, queryIdx);
    }
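
    // The shared-memory buffer does double duty: distanceCalcLoop uses it for
    // per-row partial differences, and after the __syncthreads() the same
    // storage is reinterpreted as the smin/sIdx scratch that findBestMatch
    // reduces over.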

    ///////////////////////////////////////////////////////////////////////////////
    // Match kernel caller

    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Train, typename Mask>
    void matchSimple_caller(const DevMem2D_<T>& query, const Train& train, const Mask& mask,
        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
        cudaStream_t stream)
    {
        StaticAssert<BLOCK_DIM_Y <= 64>::check(); // the BLOCK_DIM_Y values must be reducible within a single warp

        const dim3 grid(query.rows, 1, 1);
        const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);

        match<BLOCK_DIM_X, BLOCK_DIM_Y, VecDiffGlobal<BLOCK_DIM_X, T>, Dist, T>
            <<<grid, threads, 0, stream>>>(query, train, mask, trainIdx.data, imgIdx.data, distance.data);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T, typename Train, typename Mask>
    void matchCached_caller(const DevMem2D_<T>& query, const Train& train, const Mask& mask,
        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
        cudaStream_t stream)
    {
        StaticAssert<BLOCK_DIM_Y <= 64>::check();                    // the BLOCK_DIM_Y values must be reducible within a single warp
        StaticAssert<BLOCK_DIM_X * BLOCK_DIM_Y >= MAX_LEN>::check(); // the block size must be at least the descriptor length
        StaticAssert<MAX_LEN % BLOCK_DIM_X == 0>::check();           // the maximum descriptor length must be a multiple of BLOCK_DIM_X

        const dim3 grid(query.rows, 1, 1);
        const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);

        match<BLOCK_DIM_X, BLOCK_DIM_Y, VecDiffCachedRegister<BLOCK_DIM_X, MAX_LEN, LEN_EQ_MAX_LEN, typename Dist::value_type>, Dist, T>
            <<<grid, threads, 0, stream>>>(query, train, mask, trainIdx.data, imgIdx.data, distance.data);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    ///////////////////////////////////////////////////////////////////////////////
    // Match Dispatcher

    template <typename Dist, typename T, typename Train, typename Mask>
    void matchDispatcher(const DevMem2D_<T>& query, const Train& train, const Mask& mask,
        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
        int cc, cudaStream_t stream)
    {
        if (query.cols < 64)
        {
            matchCached_caller<16, 16, 64, false, Dist>(
                query, train, mask,
                static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance),
                stream);
        }
        else if (query.cols == 64)
        {
            matchCached_caller<16, 16, 64, true, Dist>(
                query, train, mask,
                static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance),
                stream);
        }
        else if (query.cols < 128)
        {
            matchCached_caller<16, 16, 128, false, Dist>(
                query, train, mask,
                static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance),
                stream);
        }
        else if (query.cols == 128 && cc >= 12)
        {
            matchCached_caller<16, 16, 128, true, Dist>(
                query, train, mask,
                static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance),
                stream);
        }
        else if (query.cols < 256 && cc >= 12)
        {
            matchCached_caller<16, 16, 256, false, Dist>(
                query, train, mask,
                static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance),
                stream);
        }
        else if (query.cols == 256 && cc >= 12)
        {
            matchCached_caller<16, 16, 256, true, Dist>(
                query, train, mask,
                static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance),
                stream);
        }
        else
        {
            matchSimple_caller<16, 16, Dist>(
                query, train, mask,
                static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance),
                stream);
        }
    }

    ///////////////////////////////////////////////////////////////////////////////
    // Match caller

    template <typename T> void matchSingleL1_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& distance,
        int cc, cudaStream_t stream)
    {
        SingleTrain<T> train(static_cast< DevMem2D_<T> >(train_));
        if (mask.data)
            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), train, SingleMask(mask), trainIdx, DevMem2D(), distance, cc, stream);
        else
            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream);
    }
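
    // Usage sketch (illustrative only, not compiled into the library), assuming
    // uchar descriptors already uploaded to the GPU and an empty DevMem2D() to
    // request the unmasked path:
    //
    //   matchSingleL1_gpu<uchar>(queryDescs, trainDescs, DevMem2D(),
    //                            trainIdx, distance, cc, stream);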

    template void matchSingleL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    //template void matchSingleL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchSingleL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchSingleL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchSingleL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchSingleL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);

    template <typename T> void matchSingleL2_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& distance,
        int cc, cudaStream_t stream)
    {
        SingleTrain<T> train(static_cast< DevMem2D_<T> >(train_));
        if (mask.data)
            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), train, SingleMask(mask), trainIdx, DevMem2D(), distance, cc, stream);
        else
            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream);
    }

    //template void matchSingleL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    //template void matchSingleL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    //template void matchSingleL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    //template void matchSingleL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    //template void matchSingleL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchSingleL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);

    template <typename T> void matchSingleHamming_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& distance,
        int cc, cudaStream_t stream)
    {
        SingleTrain<T> train(static_cast< DevMem2D_<T> >(train_));
        if (mask.data)
            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), train, SingleMask(mask), trainIdx, DevMem2D(), distance, cc, stream);
        else
            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream);
    }

    template void matchSingleHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    //template void matchSingleHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchSingleHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    //template void matchSingleHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchSingleHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);

    template <typename T> void matchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection,
        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
        int cc, cudaStream_t stream)
    {
        TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, query.cols);
        if (maskCollection.data)
            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), train, MaskCollection(maskCollection.data), trainIdx, imgIdx, distance, cc, stream);
        else
            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
    }

    template void matchCollectionL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    //template void matchCollectionL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchCollectionL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchCollectionL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchCollectionL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchCollectionL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);

    template <typename T> void matchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection,
        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
        int cc, cudaStream_t stream)
    {
        TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, query.cols);
        if (maskCollection.data)
            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), train, MaskCollection(maskCollection.data), trainIdx, imgIdx, distance, cc, stream);
        else
            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
    }

    //template void matchCollectionL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    //template void matchCollectionL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    //template void matchCollectionL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    //template void matchCollectionL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    //template void matchCollectionL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchCollectionL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);

    template <typename T> void matchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection,
        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
        int cc, cudaStream_t stream)
    {
        TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, query.cols);
        if (maskCollection.data)
            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), train, MaskCollection(maskCollection.data), trainIdx, imgIdx, distance, cc, stream);
        else
            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
    }

    template void matchCollectionHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    //template void matchCollectionHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchCollectionHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    //template void matchCollectionHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
    template void matchCollectionHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
}}}
@ -0,0 +1,202 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "internal_shared.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp"

using namespace cv::gpu;
using namespace cv::gpu::device;

namespace cv { namespace gpu { namespace bfmatcher
{
    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
    __global__ void radiusMatch(const PtrStep_<T> query, const DevMem2D_<T> train, float maxDistance, const Mask mask,
        DevMem2Di trainIdx_, unsigned int* nMatches, PtrStepf distance)
    {
        #if __CUDA_ARCH__ >= 110 // atomicInc requires compute capability 1.1; older devices compile an empty kernel

        __shared__ typename Dist::result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y];

        typename Dist::result_type* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y;

        const int queryIdx = blockIdx.x;
        const T* queryDescs = query.ptr(queryIdx);

        const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;

        if (trainIdx < train.rows)
        {
            const T* trainDescs = train.ptr(trainIdx);

            if (mask(queryIdx, trainIdx))
            {
                Dist dist;

                calcVecDiffGlobal<BLOCK_DIM_X>(queryDescs, trainDescs, train.cols, dist, sdiff_row, threadIdx.x);

                if (threadIdx.x == 0)
                {
                    if (dist < maxDistance)
                    {
                        unsigned int i = atomicInc(nMatches + queryIdx, (unsigned int) -1);
                        if (i < trainIdx_.cols)
                        {
                            distance.ptr(queryIdx)[i] = dist;
                            trainIdx_.ptr(queryIdx)[i] = trainIdx;
                        }
                    }
                }
            }
        }

        #endif
    }
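
    // Result layout: nMatches[queryIdx] is bumped with atomicInc for every
    // train descriptor within maxDistance, and the (distance, trainIdx) pair is
    // written at the slot the increment returned. Matches beyond the width of
    // trainIdx_ are still counted but dropped, so the host side has to clamp
    // nMatches to the buffer width before reading results back.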

    ///////////////////////////////////////////////////////////////////////////////
    // Radius Match kernel caller

    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
    void radiusMatch_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
        const DevMem2Di& trainIdx, const DevMem2D_<unsigned int>& nMatches, const DevMem2Df& distance,
        cudaStream_t stream)
    {
        const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
        const dim3 grid(query.rows, divUp(train.rows, BLOCK_DIM_Y), 1);

        radiusMatch<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads, 0, stream>>>(query, train, maxDistance, mask, trainIdx, nMatches.data, distance);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    ///////////////////////////////////////////////////////////////////////////////
    // Radius Match Dispatcher

    template <typename Dist, typename T, typename Mask>
    void radiusMatchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
        const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance,
        cudaStream_t stream)
    {
        radiusMatch_caller<16, 16, Dist>(query, train, maxDistance, mask,
            static_cast<DevMem2Di>(trainIdx), static_cast< const DevMem2D_<unsigned int> >(nMatches), static_cast<DevMem2Df>(distance),
            stream);
    }

    ///////////////////////////////////////////////////////////////////////////////
    // Radius Match caller

    template <typename T> void radiusMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance,
        cudaStream_t stream)
    {
        if (mask.data)
        {
            radiusMatchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
                trainIdx, nMatches, distance,
                stream);
        }
        else
        {
            radiusMatchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
                trainIdx, nMatches, distance,
                stream);
        }
    }

    template void radiusMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    //template void radiusMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);

    template <typename T> void radiusMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance,
        cudaStream_t stream)
    {
        if (mask.data)
        {
            radiusMatchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
                trainIdx, nMatches, distance,
                stream);
        }
        else
        {
            radiusMatchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
                trainIdx, nMatches, distance,
                stream);
        }
    }

    //template void radiusMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    //template void radiusMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    //template void radiusMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    //template void radiusMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    //template void radiusMatchL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);

    template <typename T> void radiusMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance,
        cudaStream_t stream)
    {
        if (mask.data)
        {
            radiusMatchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
                trainIdx, nMatches, distance,
                stream);
        }
        else
        {
            radiusMatchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
                trainIdx, nMatches, distance,
                stream);
        }
    }

    template void radiusMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    //template void radiusMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    //template void radiusMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
}}}
@ -0,0 +1,233 @@ |
/*M/////////////////////////////////////////////////////////////////////////////////////// |
// |
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
// |
// By downloading, copying, installing or using the software you agree to this license. |
// If you do not agree to this license, do not download, install, |
// copy or use the software. |
// |
// |
// License Agreement |
// For Open Source Computer Vision Library |
// |
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
// Third party copyrights are property of their respective owners. |
// |
// Redistribution and use in source and binary forms, with or without modification, |
// are permitted provided that the following conditions are met: |
// |
// * Redistribution's of source code must retain the above copyright notice, |
// this list of conditions and the following disclaimer. |
// |
// * Redistribution's in binary form must reproduce the above copyright notice, |
// this list of conditions and the following disclaimer in the documentation |
// and/or other materials provided with the distribution. |
// |
// * The name of the copyright holders may not be used to endorse or promote products |
// derived from this software without specific prior written permission. |
// |
// This software is provided by the copyright holders and contributors "as is" and |
// any express or implied warranties, including, but not limited to, the implied |
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
// In no event shall the Intel Corporation or contributors be liable for any direct, |
// indirect, incidental, special, exemplary, or consequential damages |
// (including, but not limited to, procurement of substitute goods or services; |
// loss of use, data, or profits; or business interruption) however caused |
// and on any theory of liability, whether in contract, strict liability, |
// or tort (including negligence or otherwise) arising in any way out of |
// the use of this software, even if advised of the possibility of such damage. |
// |
//M*/ |
|
#include "internal_shared.hpp" |
#include "opencv2/gpu/device/limits.hpp" |
|
using namespace cv::gpu; |
using namespace cv::gpu::device; |
|
namespace bf_krnls |
{ |
__constant__ float* ctable_color; |
__constant__ float* ctable_space; |
__constant__ size_t ctable_space_step; |
|
__constant__ int cndisp; |
__constant__ int cradius; |
|
__constant__ short cedge_disc; |
__constant__ short cmax_disc; |
} |
|
namespace cv { namespace gpu { namespace bf |
{ |
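// Copies the filter parameters and the device pointers to the precomputed color/space weight tables (plus the space table's row step, in floats) |
// into constant memory. Only the pointers live in constant memory; the tables themselves stay in global memory and must remain allocated while |
// the kernels run. |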
void load_constants(float* table_color, const DevMem2Df& table_space, int ndisp, int radius, short edge_disc, short max_disc) |
{ |
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_color, &table_color, sizeof(table_color)) ); |
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space, &table_space.data, sizeof(table_space.data)) ); |
size_t table_space_step = table_space.step / sizeof(float); |
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space_step, &table_space_step, sizeof(size_t)) ); |
|
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cndisp, &ndisp, sizeof(int)) ); |
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cradius, &radius, sizeof(int)) ); |
|
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cedge_disc, &edge_disc, sizeof(short)) ); |
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cmax_disc, &max_disc, sizeof(short)) ); |
} |
}}} |
|
namespace bf_krnls |
{ |
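// DistRgbMax: color distance between two pixels as the maximum absolute per-channel difference (L-inf norm); the single-channel specialization |
// degenerates to a plain absolute difference. |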
template <int channels> |
struct DistRgbMax |
{ |
static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b) |
{ |
uchar x = abs(a[0] - b[0]); |
uchar y = abs(a[1] - b[1]); |
uchar z = abs(a[2] - b[2]); |
return (max(max(x, y), z)); |
} |
}; |
|
template <> |
struct DistRgbMax<1> |
{ |
static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b) |
{ |
return abs(a[0] - b[0]); |
} |
}; |
|
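// bilateral_filter refines the disparity map in a checkerboard pattern: pass t selects the cell parity via ((y + t) & 1), so two launches cover |
// every pixel. A pixel is touched only when its disparity differs from a 4-neighbor by at least cedge_disc. The five candidates dp[0..4] |
// (center plus 4-neighbors) are then scored over a (2*cradius+1)^2 window with weight = color_table[color_dist] * space_table[|dy|][|dx|], |
// disparity differences truncated at cmax_disc, and the pixel takes the lowest-cost candidate. |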
template <int channels, typename T> |
__global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w) |
{ |
const int y = blockIdx.y * blockDim.y + threadIdx.y; |
const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1); |
|
T dp[5]; |
|
if (y > 0 && y < h - 1 && x > 0 && x < w - 1) |
{ |
dp[0] = *(disp + (y ) * disp_step + x + 0); |
dp[1] = *(disp + (y-1) * disp_step + x + 0); |
dp[2] = *(disp + (y ) * disp_step + x - 1); |
dp[3] = *(disp + (y+1) * disp_step + x + 0); |
dp[4] = *(disp + (y ) * disp_step + x + 1); |
|
if(abs(dp[1] - dp[0]) >= cedge_disc || abs(dp[2] - dp[0]) >= cedge_disc || abs(dp[3] - dp[0]) >= cedge_disc || abs(dp[4] - dp[0]) >= cedge_disc) |
{ |
const int ymin = max(0, y - cradius); |
const int xmin = max(0, x - cradius); |
const int ymax = min(h - 1, y + cradius); |
const int xmax = min(w - 1, x + cradius); |
|
float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; |
|
const uchar* ic = img + y * img_step + channels * x; |
|
for(int yi = ymin; yi <= ymax; yi++) |
{ |
const T* disp_y = disp + yi * disp_step; |
|
for(int xi = xmin; xi <= xmax; xi++) |
{ |
const uchar* in = img + yi * img_step + channels * xi; |
|
uchar dist_rgb = DistRgbMax<channels>::calc(in, ic); |
|
const float weight = ctable_color[dist_rgb] * (ctable_space + abs(y-yi)* ctable_space_step)[abs(x-xi)]; |
|
const T disp_reg = disp_y[xi]; |
|
cost[0] += min(cmax_disc, abs(disp_reg - dp[0])) * weight; |
cost[1] += min(cmax_disc, abs(disp_reg - dp[1])) * weight; |
cost[2] += min(cmax_disc, abs(disp_reg - dp[2])) * weight; |
cost[3] += min(cmax_disc, abs(disp_reg - dp[3])) * weight; |
cost[4] += min(cmax_disc, abs(disp_reg - dp[4])) * weight; |
} |
} |
|
float minimum = numeric_limits<float>::max(); |
int id = 0; |
|
if (cost[0] < minimum) |
{ |
minimum = cost[0]; |
id = 0; |
} |
if (cost[1] < minimum) |
{ |
minimum = cost[1]; |
id = 1; |
} |
if (cost[2] < minimum) |
{ |
minimum = cost[2]; |
id = 2; |
} |
if (cost[3] < minimum) |
{ |
minimum = cost[3]; |
id = 3; |
} |
if (cost[4] < minimum) |
{ |
minimum = cost[4]; |
id = 4; |
} |
|
*(disp + y * disp_step + x) = dp[id]; |
} |
} |
} |
} |
|
namespace cv { namespace gpu { namespace bf |
{ |
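// Each iteration launches the kernel twice (t = 0, then t = 1) so both checkerboard phases get updated; grid.x is computed over half the columns |
// (threads.x << 1) because each thread processes the single cell of its parity in a pair of columns. |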
template <typename T> |
void bilateral_filter_caller(const DevMem2D_<T>& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream) |
{ |
dim3 threads(32, 8, 1); |
dim3 grid(1, 1, 1); |
grid.x = divUp(disp.cols, threads.x << 1); |
grid.y = divUp(disp.rows, threads.y); |
|
switch (channels) |
{ |
case 1: |
for (int i = 0; i < iters; ++i) |
{ |
bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); |
cudaSafeCall( cudaGetLastError() ); |
bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); |
cudaSafeCall( cudaGetLastError() ); |
} |
break; |
case 3: |
for (int i = 0; i < iters; ++i) |
{ |
bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); |
cudaSafeCall( cudaGetLastError() ); |
bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); |
cudaSafeCall( cudaGetLastError() ); |
} |
break; |
default: |
cv::gpu::error("Unsupported channels count", __FILE__, __LINE__); |
} |
|
if (stream == 0) |
cudaSafeCall( cudaDeviceSynchronize() ); |
} |
|
void bilateral_filter_gpu(const DevMem2D& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream) |
{ |
bilateral_filter_caller(disp, img, channels, iters, stream); |
} |
|
void bilateral_filter_gpu(const DevMem2D_<short>& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream) |
{ |
bilateral_filter_caller(disp, img, channels, iters, stream); |
} |
}}} |
@ -1,980 +0,0 @@ |
/*M/////////////////////////////////////////////////////////////////////////////////////// |
// |
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
// |
// By downloading, copying, installing or using the software you agree to this license. |
// If you do not agree to this license, do not download, install, |
// copy or use the software. |
// |
// |
// License Agreement |
// For Open Source Computer Vision Library |
// |
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
// Third party copyrights are property of their respective owners. |
// |
// Redistribution and use in source and binary forms, with or without modification, |
// are permitted provided that the following conditions are met: |
// |
// * Redistribution's of source code must retain the above copyright notice, |
// this list of conditions and the following disclaimer. |
// |
// * Redistribution's in binary form must reproduce the above copyright notice, |
// this list of conditions and the following disclaimer in the documentation |
// and/or other materials provided with the distribution. |
// |
// * The name of the copyright holders may not be used to endorse or promote products |
// derived from this software without specific prior written permission. |
// |
// This software is provided by the copyright holders and contributors "as is" and |
// any express or implied warranties, including, but not limited to, the implied |
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
// In no event shall the Intel Corporation or contributors be liable for any direct, |
// indirect, incidental, special, exemplary, or consequential damages |
// (including, but not limited to, procurement of substitute goods or services; |
// loss of use, data, or profits; or business interruption) however caused |
// and on any theory of liability, whether in contract, strict liability, |
// or tort (including negligence or otherwise) arising in any way out of |
// the use of this software, even if advised of the possibility of such damage. |
// |
//M*/ |
|
#include "internal_shared.hpp" |
#include "opencv2/gpu/device/limits.hpp" |
#include "opencv2/gpu/device/vec_distance.hpp" |
|
using namespace cv::gpu; |
using namespace cv::gpu::device; |
|
namespace cv { namespace gpu { namespace bfmatcher |
{ |
|
/////////////////////////////////////////////////////////////////////////////////// |
////////////////////////////////////// Match ////////////////////////////////////// |
/////////////////////////////////////////////////////////////////////////////////// |
|
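// Block-wide reduction of the per-row best matches: lane 0 of every row publishes its (distance, index) pair to shared memory, then |
// reducePredVal selects the minimum across the BLOCK_DIM_Y candidates. |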
template <int BLOCK_DIM_Y, typename T> |
__device__ void findBestMatch(T& myDist, int2& myIdx, T* smin, int2* sIdx) |
{ |
if (threadIdx.x == 0) |
{ |
smin[threadIdx.y] = myDist; |
sIdx[threadIdx.y] = myIdx; |
} |
__syncthreads(); |
|
reducePredVal<BLOCK_DIM_Y>(smin, myDist, sIdx, myIdx, threadIdx.y * blockDim.x + threadIdx.x, less<volatile T>()); |
} |
|
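// Each row of the block strides over the train descriptors (step blockDim.y); the threads of a row cooperatively accumulate the distance |
// through vecDiff.calc, and the row keeps a running (best distance, best trainIdx/imgIdx) pair. |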
template <typename Dist, typename VecDiff, typename T, typename Mask> |
__device__ void matchDescs(int queryIdx, int imgIdx, const DevMem2D_<T>& train, const Mask& m, const VecDiff& vecDiff, |
typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* sdiff_row) |
{ |
for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += blockDim.y) |
{ |
if (m(queryIdx, trainIdx)) |
{ |
const T* trainDescs = train.ptr(trainIdx); |
|
Dist dist; |
|
vecDiff.calc(trainDescs, train.cols, dist, sdiff_row, threadIdx.x); |
|
const typename Dist::result_type res = dist; |
|
if (res < myDist) |
{ |
myDist = res; |
myIdx.x = trainIdx; |
myIdx.y = imgIdx; |
} |
} |
} |
} |
|
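// Two train-set policies sharing one interface: SingleTrain matches against a single descriptor matrix, while TrainCollection loops over |
// several (advancing the mask with m.next() per image); only the collection variant records imgIdx in storeResult. |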
template <typename T> struct SingleTrain |
{ |
explicit SingleTrain(const DevMem2D_<T>& train_) : train(train_) |
{ |
} |
|
template <typename Dist, typename VecDiff, typename Mask> |
__device__ __forceinline__ void loop(int queryIdx, Mask& m, const VecDiff& vecDiff, |
typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* sdiff_row) const |
{ |
matchDescs<Dist>(queryIdx, 0, train, m, vecDiff, myDist, myIdx, sdiff_row); |
} |
|
__device__ __forceinline__ int desc_len() const |
{ |
return train.cols; |
} |
|
static __device__ __forceinline__ void storeResult(float* distance, int* trainIdx, int* imgIdx, |
float myDist, const int2& myIdx, int queryIdx) |
{ |
trainIdx[queryIdx] = myIdx.x; |
distance[queryIdx] = myDist; |
} |
|
const DevMem2D_<T> train; |
}; |
|
template <typename T> struct TrainCollection |
{ |
TrainCollection(const DevMem2D_<T>* trainCollection_, int nImg_, int desclen_) : |
trainCollection(trainCollection_), nImg(nImg_), desclen(desclen_) |
{ |
} |
|
template <typename Dist, typename VecDiff, typename Mask> |
__device__ void loop(int queryIdx, Mask& m, const VecDiff& vecDiff, |
typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* sdiff_row) const |
{ |
for (int imgIdx = 0; imgIdx < nImg; ++imgIdx) |
{ |
const DevMem2D_<T> train = trainCollection[imgIdx]; |
m.next(); |
matchDescs<Dist>(queryIdx, imgIdx, train, m, vecDiff, myDist, myIdx, sdiff_row); |
} |
} |
|
__device__ __forceinline__ int desc_len() const |
{ |
return desclen; |
} |
|
static __device__ __forceinline__ void storeResult(float* distance, int* trainIdx, int* imgIdx, |
float myDist, const int2& myIdx, int queryIdx) |
{ |
trainIdx[queryIdx] = myIdx.x; |
imgIdx[queryIdx] = myIdx.y; |
distance[queryIdx] = myDist; |
} |
|
const DevMem2D_<T>* trainCollection; |
const int nImg; |
const int desclen; |
}; |
|
template <typename VecDiff, typename Dist, typename T, typename Train, typename Mask> |
__device__ void distanceCalcLoop(const PtrStep_<T>& query, const Train& train, const Mask& mask, int queryIdx, |
typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* smem) |
{ |
const VecDiff vecDiff(query.ptr(queryIdx), train.desc_len(), (typename Dist::value_type*)smem, threadIdx.y * blockDim.x + threadIdx.x, threadIdx.x); |
|
typename Dist::result_type* sdiff_row = smem + blockDim.x * threadIdx.y; |
|
Mask m = mask; |
|
myIdx.x = -1; |
myIdx.y = -1; |
myDist = numeric_limits<typename Dist::result_type>::max(); |
|
train.template loop<Dist>(queryIdx, m, vecDiff, myDist, myIdx, sdiff_row); |
} |
|
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename VecDiff, typename Dist, typename T, typename Train, typename Mask> |
__global__ void match(const PtrStep_<T> query, const Train train, const Mask mask, int* trainIdx, int* imgIdx, float* distance) |
{ |
__shared__ typename Dist::result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y]; |
|
const int queryIdx = blockIdx.x; |
|
int2 myIdx; |
typename Dist::result_type myDist; |
|
distanceCalcLoop<VecDiff, Dist>(query, train, mask, queryIdx, myDist, myIdx, smem); |
__syncthreads(); |
|
typename Dist::result_type* smin = smem; |
int2* sIdx = (int2*)(smin + BLOCK_DIM_Y); |
|
findBestMatch<BLOCK_DIM_Y>(myDist, myIdx, smin, sIdx); |
|
if (threadIdx.x == 0 && threadIdx.y == 0) |
Train::storeResult(distance, trainIdx, imgIdx, myDist, myIdx, queryIdx); |
} |
|
/////////////////////////////////////////////////////////////////////////////// |
// Match kernel caller |
|
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Train, typename Mask> |
void matchSimple_caller(const DevMem2D_<T>& query, const Train& train, const Mask& mask, |
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, |
cudaStream_t stream) |
{ |
StaticAssert<BLOCK_DIM_Y <= 64>::check(); // BLOCK_DIM_Y values must be reducible by a single warp |
|
const dim3 grid(query.rows, 1, 1); |
const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); |
|
match<BLOCK_DIM_X, BLOCK_DIM_Y, VecDiffGlobal<BLOCK_DIM_X, T>, Dist, T> |
<<<grid, threads, 0, stream>>>(query, train, mask, trainIdx.data, imgIdx.data, distance.data); |
cudaSafeCall( cudaGetLastError() ); |
|
if (stream == 0) |
cudaSafeCall( cudaDeviceSynchronize() ); |
} |
|
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T, typename Train, typename Mask> |
void matchCached_caller(const DevMem2D_<T>& query, const Train& train, const Mask& mask, |
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, |
cudaStream_t stream) |
{ |
StaticAssert<BLOCK_DIM_Y <= 64>::check(); // BLOCK_DIM_Y values must be reducible by a single warp |
StaticAssert<BLOCK_DIM_X * BLOCK_DIM_Y >= MAX_LEN>::check(); // block size must be greater than or equal to the descriptor length |
StaticAssert<MAX_LEN % BLOCK_DIM_X == 0>::check(); // max descriptor length must be divisible by BLOCK_DIM_X |
|
const dim3 grid(query.rows, 1, 1); |
const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); |
|
match<BLOCK_DIM_X, BLOCK_DIM_Y, VecDiffCachedRegister<BLOCK_DIM_X, MAX_LEN, LEN_EQ_MAX_LEN, typename Dist::value_type>, Dist, T> |
<<<grid, threads, 0, stream>>>(query, train, mask, trainIdx.data, imgIdx.data, distance.data); |
cudaSafeCall( cudaGetLastError() ); |
|
if (stream == 0) |
cudaSafeCall( cudaDeviceSynchronize() ); |
} |
|
/////////////////////////////////////////////////////////////////////////////// |
// Match Dispatcher |
|
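// Selects a kernel by descriptor length: up to 256 elements the query descriptor is cached in registers (matchCached_caller), with the 128- and |
// 256-element variants reserved for cc >= 12 (compute capability 1.2, which has the larger register file); longer descriptors fall back to |
// matchSimple_caller, which re-reads the query from global memory. LEN_EQ_MAX_LEN marks an exact-length match. |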
template <typename Dist, typename T, typename Train, typename Mask> |
void matchDispatcher(const DevMem2D_<T>& query, const Train& train, const Mask& mask, |
const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, |
int cc, cudaStream_t stream) |
{ |
if (query.cols < 64) |
{ |
matchCached_caller<16, 16, 64, false, Dist>( |
query, train, mask, |
static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance), |
stream); |
} |
else if (query.cols == 64) |
{ |
matchCached_caller<16, 16, 64, true, Dist>( |
query, train, mask, |
static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance), |
stream); |
} |
else if (query.cols < 128) |
{ |
matchCached_caller<16, 16, 128, false, Dist>( |
query, train, mask, |
static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance), |
stream); |
} |
else if (query.cols == 128 && cc >= 12) |
{ |
matchCached_caller<16, 16, 128, true, Dist>( |
query, train, mask, |
static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance), |
stream); |
} |
else if (query.cols < 256 && cc >= 12) |
{ |
matchCached_caller<16, 16, 256, false, Dist>( |
query, train, mask, |
static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance), |
stream); |
} |
else if (query.cols == 256 && cc >= 12) |
{ |
matchCached_caller<16, 16, 256, true, Dist>( |
query, train, mask, |
static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance), |
stream); |
} |
else |
{ |
matchSimple_caller<16, 16, Dist>( |
query, train, mask, |
static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Di>(imgIdx), static_cast<DevMem2Df>(distance), |
stream); |
} |
} |
|
/////////////////////////////////////////////////////////////////////////////// |
// Match caller |
|
template <typename T> void matchSingleL1_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask, |
const DevMem2D& trainIdx, const DevMem2D& distance, |
int cc, cudaStream_t stream) |
{ |
SingleTrain<T> train(static_cast< DevMem2D_<T> >(train_)); |
if (mask.data) |
matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), train, SingleMask(mask), trainIdx, DevMem2D(), distance, cc, stream); |
else |
matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream); |
} |
|
template void matchSingleL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
template void matchSingleL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
template void matchSingleL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
template void matchSingleL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
template void matchSingleL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
template void matchSingleL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
|
template <typename T> void matchSingleL2_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask, |
const DevMem2D& trainIdx, const DevMem2D& distance, |
int cc, cudaStream_t stream) |
{ |
SingleTrain<T> train(static_cast< DevMem2D_<T> >(train_)); |
if (mask.data) |
matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), train, SingleMask(mask), trainIdx, DevMem2D(), distance, cc, stream); |
else |
matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream); |
} |
|
template void matchSingleL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
template void matchSingleL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
template void matchSingleL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
template void matchSingleL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
template void matchSingleL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
template void matchSingleL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
|
template <typename T> void matchSingleHamming_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask, |
const DevMem2D& trainIdx, const DevMem2D& distance, |
int cc, cudaStream_t stream) |
{ |
SingleTrain<T> train(static_cast< DevMem2D_<T> >(train_)); |
if (mask.data) |
matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), train, SingleMask(mask), trainIdx, DevMem2D(), distance, cc, stream); |
else |
matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream); |
} |
|
template void matchSingleHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
template void matchSingleHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
template void matchSingleHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
template void matchSingleHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
template void matchSingleHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
|
template <typename T> void matchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, |
const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, |
int cc, cudaStream_t stream) |
{ |
TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, query.cols); |
if (maskCollection.data) |
matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), train, MaskCollection(maskCollection.data), trainIdx, imgIdx, distance, cc, stream); |
else |
matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); |
} |
|
template void matchCollectionL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
template void matchCollectionL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
template void matchCollectionL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
template void matchCollectionL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
template void matchCollectionL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
template void matchCollectionL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
|
template <typename T> void matchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, |
const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, |
int cc, cudaStream_t stream) |
{ |
TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, query.cols); |
if (maskCollection.data) |
matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), train, MaskCollection(maskCollection.data), trainIdx, imgIdx, distance, cc, stream); |
else |
matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); |
} |
|
template void matchCollectionL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
template void matchCollectionL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
template void matchCollectionL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
template void matchCollectionL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
template void matchCollectionL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
template void matchCollectionL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
|
template <typename T> void matchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, |
const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, |
int cc, cudaStream_t stream) |
{ |
TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, query.cols); |
if (maskCollection.data) |
matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), train, MaskCollection(maskCollection.data), trainIdx, imgIdx, distance, cc, stream); |
else |
matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); |
} |
|
template void matchCollectionHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
template void matchCollectionHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
template void matchCollectionHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
template void matchCollectionHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
template void matchCollectionHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); |
|
/////////////////////////////////////////////////////////////////////////////////// |
//////////////////////////////////// Knn Match //////////////////////////////////// |
/////////////////////////////////////////////////////////////////////////////////// |
|
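// knnMatch2 finds the two nearest neighbors in a single pass: each row keeps its two best candidates (distMin1/distMin2 with their indices), |
// the per-row results are staged in shared memory, and thread (0, 0) scans the 2 * BLOCK_DIM_Y staged candidates to emit the final |
// int2/float2 pair. |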
template <typename VecDiff, typename Dist, typename T, typename Mask> |
__device__ void distanceCalcLoop(const PtrStep_<T>& query, const DevMem2D_<T>& train, const Mask& m, int queryIdx, |
typename Dist::result_type& distMin1, typename Dist::result_type& distMin2, int& bestTrainIdx1, int& bestTrainIdx2, |
typename Dist::result_type* smem) |
{ |
const VecDiff vecDiff(query.ptr(queryIdx), train.cols, (typename Dist::value_type*)smem, threadIdx.y * blockDim.x + threadIdx.x, threadIdx.x); |
|
typename Dist::result_type* sdiffRow = smem + blockDim.x * threadIdx.y; |
|
distMin1 = numeric_limits<typename Dist::result_type>::max(); |
distMin2 = numeric_limits<typename Dist::result_type>::max(); |
|
bestTrainIdx1 = -1; |
bestTrainIdx2 = -1; |
|
for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += blockDim.y) |
{ |
if (m(queryIdx, trainIdx)) |
{ |
Dist dist; |
|
const T* trainRow = train.ptr(trainIdx); |
|
vecDiff.calc(trainRow, train.cols, dist, sdiffRow, threadIdx.x); |
|
const typename Dist::result_type val = dist; |
|
if (val < distMin1) |
{ |
distMin1 = val; |
bestTrainIdx1 = trainIdx; |
} |
else if (val < distMin2) |
{ |
distMin2 = val; |
bestTrainIdx2 = trainIdx; |
} |
} |
} |
} |
|
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename VecDiff, typename Dist, typename T, typename Mask> |
__global__ void knnMatch2(const PtrStep_<T> query, const DevMem2D_<T> train, const Mask m, int2* trainIdx, float2* distance) |
{ |
typedef typename Dist::result_type result_type; |
typedef typename Dist::value_type value_type; |
|
__shared__ result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y]; |
|
const int queryIdx = blockIdx.x; |
|
result_type distMin1; |
result_type distMin2; |
|
int bestTrainIdx1; |
int bestTrainIdx2; |
|
distanceCalcLoop<VecDiff, Dist>(query, train, m, queryIdx, distMin1, distMin2, bestTrainIdx1, bestTrainIdx2, smem); |
__syncthreads(); |
|
volatile result_type* sdistMinRow = smem; |
volatile int* sbestTrainIdxRow = (int*)(sdistMinRow + 2 * BLOCK_DIM_Y); |
|
if (threadIdx.x == 0) |
{ |
sdistMinRow[threadIdx.y] = distMin1; |
sdistMinRow[threadIdx.y + BLOCK_DIM_Y] = distMin2; |
|
sbestTrainIdxRow[threadIdx.y] = bestTrainIdx1; |
sbestTrainIdxRow[threadIdx.y + BLOCK_DIM_Y] = bestTrainIdx2; |
} |
__syncthreads(); |
|
if (threadIdx.x == 0 && threadIdx.y == 0) |
{ |
distMin1 = numeric_limits<result_type>::max(); |
distMin2 = numeric_limits<result_type>::max(); |
|
bestTrainIdx1 = -1; |
bestTrainIdx2 = -1; |
|
#pragma unroll |
for (int i = 0; i < BLOCK_DIM_Y; ++i) |
{ |
result_type val = sdistMinRow[i]; |
|
if (val < distMin1) |
{ |
distMin1 = val; |
bestTrainIdx1 = sbestTrainIdxRow[i]; |
} |
else if (val < distMin2) |
{ |
distMin2 = val; |
bestTrainIdx2 = sbestTrainIdxRow[i]; |
} |
} |
|
#pragma unroll |
for (int i = BLOCK_DIM_Y; i < 2 * BLOCK_DIM_Y; ++i) |
{ |
result_type val = sdistMinRow[i]; |
|
if (val < distMin2) |
{ |
distMin2 = val; |
bestTrainIdx2 = sbestTrainIdxRow[i]; |
} |
} |
|
trainIdx[queryIdx] = make_int2(bestTrainIdx1, bestTrainIdx2); |
distance[queryIdx] = make_float2(distMin1, distMin2); |
} |
} |
|
/////////////////////////////////////////////////////////////////////////////// |
// Knn 2 Match kernel caller |
|
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask> |
void knnMatch2Simple_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, |
const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance, |
cudaStream_t stream) |
{ |
const dim3 grid(query.rows, 1, 1); |
const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); |
|
knnMatch2<BLOCK_DIM_X, BLOCK_DIM_Y, VecDiffGlobal<BLOCK_DIM_X, T>, Dist, T> |
<<<grid, threads, 0, stream>>>(query, train, mask, trainIdx, distance); |
cudaSafeCall( cudaGetLastError() ); |
|
if (stream == 0) |
cudaSafeCall( cudaDeviceSynchronize() ); |
} |
|
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T, typename Mask> |
void knnMatch2Cached_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, |
const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance, |
cudaStream_t stream) |
{ |
StaticAssert<BLOCK_DIM_X * BLOCK_DIM_Y >= MAX_LEN>::check(); // block size must be greater than or equal to the descriptor length |
StaticAssert<MAX_LEN % BLOCK_DIM_X == 0>::check(); // max descriptor length must be divisible by BLOCK_DIM_X |
|
const dim3 grid(query.rows, 1, 1); |
const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); |
|
knnMatch2<BLOCK_DIM_X, BLOCK_DIM_Y, VecDiffCachedRegister<BLOCK_DIM_X, MAX_LEN, LEN_EQ_MAX_LEN, typename Dist::value_type>, Dist, T> |
<<<grid, threads, 0, stream>>>(query, train, mask, trainIdx.data, distance.data); |
cudaSafeCall( cudaGetLastError() ); |
|
if (stream == 0) |
cudaSafeCall( cudaDeviceSynchronize() ); |
} |
|
/////////////////////////////////////////////////////////////////////////////// |
// Knn 2 Match Dispatcher |
|
template <typename Dist, typename T, typename Mask> |
void knnMatch2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, |
const DevMem2D& trainIdx, const DevMem2D& distance, |
int cc, cudaStream_t stream) |
{ |
if (query.cols < 64) |
{ |
knnMatch2Cached_caller<16, 16, 64, false, Dist>( |
query, train, mask, |
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance), |
stream); |
} |
else if (query.cols == 64) |
{ |
knnMatch2Cached_caller<16, 16, 64, true, Dist>( |
query, train, mask, |
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance), |
stream); |
} |
else if (query.cols < 128) |
{ |
knnMatch2Cached_caller<16, 16, 128, false, Dist>( |
query, train, mask, |
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance), |
stream); |
} |
else if (query.cols == 128 && cc >= 12) |
{ |
knnMatch2Cached_caller<16, 16, 128, true, Dist>( |
query, train, mask, |
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance), |
stream); |
} |
else if (query.cols < 256 && cc >= 12) |
{ |
knnMatch2Cached_caller<16, 16, 256, false, Dist>( |
query, train, mask, |
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance), |
stream); |
} |
else if (query.cols == 256 && cc >= 12) |
{ |
knnMatch2Cached_caller<16, 16, 256, true, Dist>( |
query, train, mask, |
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance), |
stream); |
} |
else |
{ |
knnMatch2Simple_caller<16, 16, Dist>( |
query, train, mask, |
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance), |
stream); |
} |
} |
|
/////////////////////////////////////////////////////////////////////////////// |
// Calc distance kernel |
|
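// For k != 2 the k-NN search runs in two stages: calcDistance first materializes the full query x train distance matrix into allDist |
// (masked-out pairs keep the max sentinel), then the findBestMatch kernel below extracts the k smallest entries per row. |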
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask> |
__global__ void calcDistance(const PtrStep_<T> query, const DevMem2D_<T> train, const Mask mask, PtrStepf distance) |
{ |
__shared__ typename Dist::result_type sdiff[BLOCK_DIM_X * BLOCK_DIM_Y]; |
|
typename Dist::result_type* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y; |
|
const int queryIdx = blockIdx.x; |
const T* queryDescs = query.ptr(queryIdx); |
|
const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y; |
|
if (trainIdx < train.rows) |
{ |
const T* trainDescs = train.ptr(trainIdx); |
|
typename Dist::result_type myDist = numeric_limits<typename Dist::result_type>::max(); |
|
if (mask(queryIdx, trainIdx)) |
{ |
Dist dist; |
|
calcVecDiffGlobal<BLOCK_DIM_X>(queryDescs, trainDescs, train.cols, dist, sdiff_row, threadIdx.x); |
|
myDist = dist; |
} |
|
if (threadIdx.x == 0) |
distance.ptr(queryIdx)[trainIdx] = myDist; |
} |
} |
|
/////////////////////////////////////////////////////////////////////////////// |
// Calc distance kernel caller |
|
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask> |
void calcDistance_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2Df& distance, cudaStream_t stream) |
{ |
const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); |
const dim3 grid(query.rows, divUp(train.rows, BLOCK_DIM_Y), 1); |
|
calcDistance<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads, 0, stream>>>(query, train, mask, distance); |
cudaSafeCall( cudaGetLastError() ); |
|
if (stream == 0) |
cudaSafeCall( cudaDeviceSynchronize() ); |
} |
|
template <typename Dist, typename T, typename Mask> |
void calcDistanceDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2D& allDist, cudaStream_t stream) |
{ |
calcDistance_caller<16, 16, Dist>(query, train, mask, static_cast<DevMem2Df>(allDist), stream); |
} |
|
/////////////////////////////////////////////////////////////////////////////// |
// find knn match kernel |
|
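// Pass i of k: each block scans one row of allDist for its minimum (finishing with a reducePredVal reduction), records the winner in column i |
// of trainIdx/distance, and overwrites the winning cell with the max sentinel so the next pass selects the runner-up. |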
template <int BLOCK_SIZE> __global__ void findBestMatch(DevMem2Df allDist_, int i, PtrStepi trainIdx_, PtrStepf distance_) |
{ |
const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64; |
__shared__ float sdist[SMEM_SIZE]; |
__shared__ int strainIdx[SMEM_SIZE]; |
|
const int queryIdx = blockIdx.x; |
|
float* allDist = allDist_.ptr(queryIdx); |
int* trainIdx = trainIdx_.ptr(queryIdx); |
float* distance = distance_.ptr(queryIdx); |
|
float dist = numeric_limits<float>::max(); |
int bestIdx = -1; |
|
for (int c = threadIdx.x; c < allDist_.cols; c += BLOCK_SIZE) |
{ |
float reg = allDist[c]; |
if (reg < dist) |
{ |
dist = reg; |
bestIdx = c; |
} |
} |
|
sdist[threadIdx.x] = dist; |
strainIdx[threadIdx.x] = bestIdx; |
__syncthreads(); |
|
reducePredVal<BLOCK_SIZE>(sdist, dist, strainIdx, bestIdx, threadIdx.x, less<volatile float>()); |
|
if (threadIdx.x == 0) |
{ |
if (dist < numeric_limits<float>::max()) |
{ |
allDist[bestIdx] = numeric_limits<float>::max(); |
trainIdx[i] = bestIdx; |
distance[i] = dist; |
} |
} |
} |
|
/////////////////////////////////////////////////////////////////////////////// |
// find knn match kernel caller |
|
template <int BLOCK_SIZE> void findKnnMatch_caller(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream) |
{ |
const dim3 threads(BLOCK_SIZE, 1, 1); |
const dim3 grid(trainIdx.rows, 1, 1); |
|
for (int i = 0; i < k; ++i) |
{ |
findBestMatch<BLOCK_SIZE><<<grid, threads, 0, stream>>>(allDist, i, trainIdx, distance); |
cudaSafeCall( cudaGetLastError() ); |
} |
|
if (stream == 0) |
cudaSafeCall( cudaDeviceSynchronize() ); |
} |
|
void findKnnMatchDispatcher(int k, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, cudaStream_t stream) |
{ |
findKnnMatch_caller<256>(k, static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), static_cast<DevMem2Df>(allDist), stream); |
} |
|
/////////////////////////////////////////////////////////////////////////////// |
// knn match Dispatcher |
|
template <typename Dist, typename T> |
void knnMatchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, int k, const DevMem2D& mask, |
const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, |
int cc, cudaStream_t stream) |
{ |
if (mask.data) |
{ |
if (k == 2) |
{ |
knnMatch2Dispatcher<Dist>(query, train, SingleMask(mask), trainIdx, distance, cc, stream); |
return; |
} |
|
calcDistanceDispatcher<Dist>(query, train, SingleMask(mask), allDist, stream); |
} |
else |
{ |
if (k == 2) |
{ |
knnMatch2Dispatcher<Dist>(query, train, WithOutMask(), trainIdx, distance, cc, stream); |
return; |
} |
|
calcDistanceDispatcher<Dist>(query, train, WithOutMask(), allDist, stream); |
} |
|
findKnnMatchDispatcher(k, trainIdx, distance, allDist, stream); |
} |
|
/////////////////////////////////////////////////////////////////////////////// |
// knn match caller |
|
template <typename T> void knnMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, |
const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, |
int cc, cudaStream_t stream) |
{ |
knnMatchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, mask, trainIdx, distance, allDist, cc, stream); |
} |
|
template void knnMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); |
template void knnMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); |
template void knnMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); |
template void knnMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); |
template void knnMatchL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); |
template void knnMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); |
|
template <typename T> void knnMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, |
const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, |
int cc, cudaStream_t stream) |
{ |
knnMatchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, mask, trainIdx, distance, allDist, cc, stream); |
} |
|
template void knnMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); |
template void knnMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); |
template void knnMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); |
template void knnMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); |
template void knnMatchL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); |
template void knnMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); |
|
template <typename T> void knnMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, |
const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, |
int cc, cudaStream_t stream) |
{ |
knnMatchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, mask, trainIdx, distance, allDist, cc, stream); |
} |
|
template void knnMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); |
template void knnMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); |
template void knnMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); |
template void knnMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); |
template void knnMatchHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); |
|
/////////////////////////////////////////////////////////////////////////////////// |
/////////////////////////////////// Radius Match ////////////////////////////////// |
/////////////////////////////////////////////////////////////////////////////////// |
|
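// Every (query, train) pair is tested by one row of a block; when the distance is below maxDistance, atomicInc on the per-query counter reserves |
// an output slot (hence the __CUDA_ARCH__ >= 110 guard: global atomics need sm_11 or newer), and matches beyond the trainIdx_.cols capacity are |
// still counted in nMatches but not stored. |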
||||
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask> |
||||
__global__ void radiusMatch(const PtrStep_<T> query, const DevMem2D_<T> train, float maxDistance, const Mask mask, |
||||
DevMem2Di trainIdx_, unsigned int* nMatches, PtrStepf distance) |
||||
{ |
||||
#if __CUDA_ARCH__ >= 110 |
||||
|
||||
__shared__ typename Dist::result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y]; |
||||
|
||||
typename Dist::result_type* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y; |
||||
|
||||
const int queryIdx = blockIdx.x; |
||||
const T* queryDescs = query.ptr(queryIdx); |
||||
|
||||
const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y; |
||||
|
||||
if (trainIdx < train.rows) |
||||
{ |
||||
const T* trainDescs = train.ptr(trainIdx); |
||||
|
||||
if (mask(queryIdx, trainIdx)) |
||||
{ |
||||
Dist dist; |
||||
|
||||
calcVecDiffGlobal<BLOCK_DIM_X>(queryDescs, trainDescs, train.cols, dist, sdiff_row, threadIdx.x); |
||||
|
||||
if (threadIdx.x == 0) |
||||
{ |
||||
if (dist < maxDistance) |
||||
{ |
||||
unsigned int i = atomicInc(nMatches + queryIdx, (unsigned int) -1); |
||||
if (i < trainIdx_.cols) |
||||
{ |
||||
distance.ptr(queryIdx)[i] = dist; |
||||
trainIdx_.ptr(queryIdx)[i] = trainIdx; |
||||
} |
||||
} |
||||
} |
||||
} |
||||
} |
||||
|
||||
#endif |
||||
} |
||||
|
||||
/////////////////////////////////////////////////////////////////////////////// |
||||
// Radius Match kernel caller |
||||
|
||||
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask> |
||||
void radiusMatch_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask, |
||||
const DevMem2Di& trainIdx, const DevMem2D_<unsigned int>& nMatches, const DevMem2Df& distance, |
||||
cudaStream_t stream) |
||||
{ |
||||
const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); |
||||
const dim3 grid(query.rows, divUp(train.rows, BLOCK_DIM_Y), 1); |
||||
|
||||
radiusMatch<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads, 0, stream>>>(query, train, maxDistance, mask, trainIdx, nMatches.data, distance); |
||||
cudaSafeCall( cudaGetLastError() ); |
||||
|
||||
if (stream == 0) |
||||
cudaSafeCall( cudaDeviceSynchronize() ); |
||||
} |
||||
|
||||
/////////////////////////////////////////////////////////////////////////////// |
||||
// Radius Match Dispatcher |
||||
|
||||
    template <typename Dist, typename T, typename Mask>
    void radiusMatchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
        const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance,
        cudaStream_t stream)
    {
        radiusMatch_caller<16, 16, Dist>(query, train, maxDistance, mask,
            static_cast<DevMem2Di>(trainIdx), static_cast< const DevMem2D_<unsigned int> >(nMatches), static_cast<DevMem2Df>(distance),
            stream);
    }

    ///////////////////////////////////////////////////////////////////////////////
    // Radius Match caller
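    // Typed entry points: each picks a distance functor (L1, L2 or Hamming)
    // and, depending on whether a mask was supplied, instantiates the masked
    // or unmasked kernel at compile time.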
    template <typename T> void radiusMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance,
        cudaStream_t stream)
    {
        if (mask.data)
        {
            radiusMatchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
                trainIdx, nMatches, distance,
                stream);
        }
        else
        {
            radiusMatchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
                trainIdx, nMatches, distance,
                stream);
        }
    }

    template void radiusMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);

    template <typename T> void radiusMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance,
        cudaStream_t stream)
    {
        if (mask.data)
        {
            radiusMatchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
                trainIdx, nMatches, distance,
                stream);
        }
        else
        {
            radiusMatchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
                trainIdx, nMatches, distance,
                stream);
        }
    }

    template void radiusMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);

    template <typename T> void radiusMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance,
        cudaStream_t stream)
    {
        if (mask.data)
        {
            radiusMatchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
                trainIdx, nMatches, distance,
                stream);
        }
        else
        {
            radiusMatchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
                trainIdx, nMatches, distance,
                stream);
        }
    }

    template void radiusMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
    template void radiusMatchHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
}}}
@ -0,0 +1,240 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "internal_shared.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"

using namespace cv::gpu;
using namespace cv::gpu::device;

#define MAX_KERNEL_SIZE 16
#define BLOCK_DIM_X 16
#define BLOCK_DIM_Y 16

namespace filter_krnls_column
{
    __constant__ float cLinearKernel[MAX_KERNEL_SIZE];

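    // The filter taps live in constant memory; loadLinearKernel must upload
    // them before each launch because every column filter shares this symbol.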
    void loadLinearKernel(const float kernel[], int ksize)
    {
        cudaSafeCall( cudaMemcpyToSymbol(cLinearKernel, kernel, ksize * sizeof(float)) );
    }

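    // linearColumnFilter stages a 3 * BLOCK_DIM_Y tall strip of each source
    // column in shared memory (the middle tile plus one halo tile above and
    // below, fetched through the border functor b), then convolves vertically
    // around the anchor, accumulating in floating point and saturating to D.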
    template <int ksize, typename T, typename D, typename B>
    __global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep_<D> dst, int anchor, const B b)
    {
        __shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3];

        const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x;
        const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y;

        T* sDataColumn = smem + threadIdx.x;

        if (x < src.cols)
        {
            const T* srcCol = src.ptr() + x;

            sDataColumn[ threadIdx.y                    * BLOCK_DIM_X] = b.at_low(y - BLOCK_DIM_Y, srcCol, src.step);
            sDataColumn[(threadIdx.y + BLOCK_DIM_Y)     * BLOCK_DIM_X] = b.at_high(y, srcCol, src.step);
            sDataColumn[(threadIdx.y + BLOCK_DIM_Y * 2) * BLOCK_DIM_X] = b.at_high(y + BLOCK_DIM_Y, srcCol, src.step);

            __syncthreads();

            if (y < src.rows)
            {
                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
                sum_t sum = VecTraits<sum_t>::all(0);

                sDataColumn += (threadIdx.y + BLOCK_DIM_Y - anchor) * BLOCK_DIM_X;

                #pragma unroll
                for(int i = 0; i < ksize; ++i)
                    sum = sum + sDataColumn[i * BLOCK_DIM_X] * cLinearKernel[i];

                dst.ptr(y)[x] = saturate_cast<D>(sum);
            }
        }
    }
}

namespace cv { namespace gpu { namespace filters
{
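    // Rejects border modes whose extrapolation window would fall outside the
    // range the functor can handle for this image height, then launches the
    // kernel and synchronizes when running on the default stream.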
    template <int ksize, typename T, typename D, template<typename> class B>
    void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
    {
        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
        dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));

        B<T> b(src.rows);

        if (!b.is_range_safe(-BLOCK_DIM_Y, (grid.y + 1) * BLOCK_DIM_Y - 1))
        {
            cv::gpu::error("linearColumnFilter: can't use specified border extrapolation, image is too small, "
                           "try bigger image or another border extrapolation mode", __FILE__, __LINE__);
        }

        filter_krnls_column::linearColumnFilter<ksize, T, D><<<grid, threads, 0, stream>>>(src, dst, anchor, b);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

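    // Dispatch table: rows select the border mode (reflect101, replicate,
    // constant, reflect, wrap - in that order) and columns the compile-time
    // kernel size 1..16; entry 0 of each row is a null placeholder because
    // ksize starts at 1.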
    template <typename T, typename D>
    void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
    {
        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
        static const caller_t callers[5][17] =
        {
            {
                0,
                linearColumnFilter_caller<1 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<2 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<3 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<4 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<5 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<6 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<7 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<8 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<9 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<10, T, D, BrdColReflect101>,
                linearColumnFilter_caller<11, T, D, BrdColReflect101>,
                linearColumnFilter_caller<12, T, D, BrdColReflect101>,
                linearColumnFilter_caller<13, T, D, BrdColReflect101>,
                linearColumnFilter_caller<14, T, D, BrdColReflect101>,
                linearColumnFilter_caller<15, T, D, BrdColReflect101>,
                linearColumnFilter_caller<16, T, D, BrdColReflect101>
            },
            {
                0,
                linearColumnFilter_caller<1 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<2 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<3 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<4 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<5 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<6 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<7 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<8 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<9 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<10, T, D, BrdColReplicate>,
                linearColumnFilter_caller<11, T, D, BrdColReplicate>,
                linearColumnFilter_caller<12, T, D, BrdColReplicate>,
                linearColumnFilter_caller<13, T, D, BrdColReplicate>,
                linearColumnFilter_caller<14, T, D, BrdColReplicate>,
                linearColumnFilter_caller<15, T, D, BrdColReplicate>,
                linearColumnFilter_caller<16, T, D, BrdColReplicate>
            },
            {
                0,
                linearColumnFilter_caller<1 , T, D, BrdColConstant>,
                linearColumnFilter_caller<2 , T, D, BrdColConstant>,
                linearColumnFilter_caller<3 , T, D, BrdColConstant>,
                linearColumnFilter_caller<4 , T, D, BrdColConstant>,
                linearColumnFilter_caller<5 , T, D, BrdColConstant>,
                linearColumnFilter_caller<6 , T, D, BrdColConstant>,
                linearColumnFilter_caller<7 , T, D, BrdColConstant>,
                linearColumnFilter_caller<8 , T, D, BrdColConstant>,
                linearColumnFilter_caller<9 , T, D, BrdColConstant>,
                linearColumnFilter_caller<10, T, D, BrdColConstant>,
                linearColumnFilter_caller<11, T, D, BrdColConstant>,
                linearColumnFilter_caller<12, T, D, BrdColConstant>,
                linearColumnFilter_caller<13, T, D, BrdColConstant>,
                linearColumnFilter_caller<14, T, D, BrdColConstant>,
                linearColumnFilter_caller<15, T, D, BrdColConstant>,
                linearColumnFilter_caller<16, T, D, BrdColConstant>
            },
            {
                0,
                linearColumnFilter_caller<1 , T, D, BrdColReflect>,
                linearColumnFilter_caller<2 , T, D, BrdColReflect>,
                linearColumnFilter_caller<3 , T, D, BrdColReflect>,
                linearColumnFilter_caller<4 , T, D, BrdColReflect>,
                linearColumnFilter_caller<5 , T, D, BrdColReflect>,
                linearColumnFilter_caller<6 , T, D, BrdColReflect>,
                linearColumnFilter_caller<7 , T, D, BrdColReflect>,
                linearColumnFilter_caller<8 , T, D, BrdColReflect>,
                linearColumnFilter_caller<9 , T, D, BrdColReflect>,
                linearColumnFilter_caller<10, T, D, BrdColReflect>,
                linearColumnFilter_caller<11, T, D, BrdColReflect>,
                linearColumnFilter_caller<12, T, D, BrdColReflect>,
                linearColumnFilter_caller<13, T, D, BrdColReflect>,
                linearColumnFilter_caller<14, T, D, BrdColReflect>,
                linearColumnFilter_caller<15, T, D, BrdColReflect>,
                linearColumnFilter_caller<16, T, D, BrdColReflect>
            },
            {
                0,
                linearColumnFilter_caller<1 , T, D, BrdColWrap>,
                linearColumnFilter_caller<2 , T, D, BrdColWrap>,
                linearColumnFilter_caller<3 , T, D, BrdColWrap>,
                linearColumnFilter_caller<4 , T, D, BrdColWrap>,
                linearColumnFilter_caller<5 , T, D, BrdColWrap>,
                linearColumnFilter_caller<6 , T, D, BrdColWrap>,
                linearColumnFilter_caller<7 , T, D, BrdColWrap>,
                linearColumnFilter_caller<8 , T, D, BrdColWrap>,
                linearColumnFilter_caller<9 , T, D, BrdColWrap>,
                linearColumnFilter_caller<10, T, D, BrdColWrap>,
                linearColumnFilter_caller<11, T, D, BrdColWrap>,
                linearColumnFilter_caller<12, T, D, BrdColWrap>,
                linearColumnFilter_caller<13, T, D, BrdColWrap>,
                linearColumnFilter_caller<14, T, D, BrdColWrap>,
                linearColumnFilter_caller<15, T, D, BrdColWrap>,
                linearColumnFilter_caller<16, T, D, BrdColWrap>
            }
        };

        filter_krnls_column::loadLinearKernel(kernel, ksize);

        callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
    }

    template void linearColumnFilter_gpu<float , uchar >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    //template void linearColumnFilter_gpu<float , short >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    //template void linearColumnFilter_gpu<float2, short2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearColumnFilter_gpu<float3, short3>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearColumnFilter_gpu<float , int   >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearColumnFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
}}}
@ -1,633 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "internal_shared.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"

using namespace cv::gpu;
using namespace cv::gpu::device;

/////////////////////////////////////////////////////////////////////////////////////////////////
// Linear filters

#define MAX_KERNEL_SIZE 16
#define BLOCK_DIM_X 16
#define BLOCK_DIM_Y 16

namespace filter_krnls
{
    __constant__ float cLinearKernel[MAX_KERNEL_SIZE];
}

namespace cv { namespace gpu { namespace filters
{
    void loadLinearKernel(const float kernel[], int ksize)
    {
        cudaSafeCall( cudaMemcpyToSymbol(filter_krnls::cLinearKernel, kernel, ksize * sizeof(float)) );
    }
}}}

namespace filter_krnls
{
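    // SmemType picks the element type of the row filter's shared-memory tile:
    // 4-byte source types are staged as-is, everything else is widened to the
    // matching float vector.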
    template <typename T, size_t size> struct SmemType_
    {
        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type smem_t;
    };
    template <typename T> struct SmemType_<T, 4>
    {
        typedef T smem_t;
    };
    template <typename T> struct SmemType
    {
        typedef typename SmemType_<T, sizeof(T)>::smem_t smem_t;
    };

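    // linearRowFilter mirrors the column filter but tiles horizontally: each
    // thread row caches 3 * BLOCK_DIM_X consecutive pixels (middle tile plus a
    // halo tile on either side) and convolves around the anchor along x.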
    template <int ksize, typename T, typename D, typename B>
    __global__ void linearRowFilter(const DevMem2D_<T> src, PtrStep_<D> dst, int anchor, const B b)
    {
        typedef typename SmemType<T>::smem_t smem_t;

        __shared__ smem_t smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3];

        const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x;
        const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y;

        smem_t* sDataRow = smem + threadIdx.y * BLOCK_DIM_X * 3;

        if (y < src.rows)
        {
            const T* rowSrc = src.ptr(y);

            sDataRow[threadIdx.x                  ] = b.at_low(x - BLOCK_DIM_X, rowSrc);
            sDataRow[threadIdx.x + BLOCK_DIM_X    ] = b.at_high(x, rowSrc);
            sDataRow[threadIdx.x + BLOCK_DIM_X * 2] = b.at_high(x + BLOCK_DIM_X, rowSrc);

            __syncthreads();

            if (x < src.cols)
            {
                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
                sum_t sum = VecTraits<sum_t>::all(0);

                sDataRow += threadIdx.x + BLOCK_DIM_X - anchor;

                #pragma unroll
                for(int i = 0; i < ksize; ++i)
                    sum = sum + sDataRow[i] * cLinearKernel[i];

                dst.ptr(y)[x] = saturate_cast<D>(sum);
            }
        }
    }
}

namespace cv { namespace gpu { namespace filters
{
    template <int ksize, typename T, typename D, template<typename> class B>
    void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
    {
        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
        dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));

        typedef typename filter_krnls::SmemType<T>::smem_t smem_t;
        B<smem_t> b(src.cols);

        if (!b.is_range_safe(-BLOCK_DIM_X, (grid.x + 1) * BLOCK_DIM_X - 1))
        {
            cv::gpu::error("linearRowFilter: can't use specified border extrapolation, image is too small, "
                           "try bigger image or another border extrapolation mode", __FILE__, __LINE__);
        }

        filter_krnls::linearRowFilter<ksize, T, D><<<grid, threads, 0, stream>>>(src, dst, anchor, b);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <typename T, typename D>
    void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
    {
        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
        static const caller_t callers[5][17] =
        {
            {
                0,
                linearRowFilter_caller<1 , T, D, BrdRowReflect101>,
                linearRowFilter_caller<2 , T, D, BrdRowReflect101>,
                linearRowFilter_caller<3 , T, D, BrdRowReflect101>,
                linearRowFilter_caller<4 , T, D, BrdRowReflect101>,
                linearRowFilter_caller<5 , T, D, BrdRowReflect101>,
                linearRowFilter_caller<6 , T, D, BrdRowReflect101>,
                linearRowFilter_caller<7 , T, D, BrdRowReflect101>,
                linearRowFilter_caller<8 , T, D, BrdRowReflect101>,
                linearRowFilter_caller<9 , T, D, BrdRowReflect101>,
                linearRowFilter_caller<10, T, D, BrdRowReflect101>,
                linearRowFilter_caller<11, T, D, BrdRowReflect101>,
                linearRowFilter_caller<12, T, D, BrdRowReflect101>,
                linearRowFilter_caller<13, T, D, BrdRowReflect101>,
                linearRowFilter_caller<14, T, D, BrdRowReflect101>,
                linearRowFilter_caller<15, T, D, BrdRowReflect101>,
                linearRowFilter_caller<16, T, D, BrdRowReflect101>
            },
            {
                0,
                linearRowFilter_caller<1 , T, D, BrdRowReplicate>,
                linearRowFilter_caller<2 , T, D, BrdRowReplicate>,
                linearRowFilter_caller<3 , T, D, BrdRowReplicate>,
                linearRowFilter_caller<4 , T, D, BrdRowReplicate>,
                linearRowFilter_caller<5 , T, D, BrdRowReplicate>,
                linearRowFilter_caller<6 , T, D, BrdRowReplicate>,
                linearRowFilter_caller<7 , T, D, BrdRowReplicate>,
                linearRowFilter_caller<8 , T, D, BrdRowReplicate>,
                linearRowFilter_caller<9 , T, D, BrdRowReplicate>,
                linearRowFilter_caller<10, T, D, BrdRowReplicate>,
                linearRowFilter_caller<11, T, D, BrdRowReplicate>,
                linearRowFilter_caller<12, T, D, BrdRowReplicate>,
                linearRowFilter_caller<13, T, D, BrdRowReplicate>,
                linearRowFilter_caller<14, T, D, BrdRowReplicate>,
                linearRowFilter_caller<15, T, D, BrdRowReplicate>,
                linearRowFilter_caller<16, T, D, BrdRowReplicate>
            },
            {
                0,
                linearRowFilter_caller<1 , T, D, BrdRowConstant>,
                linearRowFilter_caller<2 , T, D, BrdRowConstant>,
                linearRowFilter_caller<3 , T, D, BrdRowConstant>,
                linearRowFilter_caller<4 , T, D, BrdRowConstant>,
                linearRowFilter_caller<5 , T, D, BrdRowConstant>,
                linearRowFilter_caller<6 , T, D, BrdRowConstant>,
                linearRowFilter_caller<7 , T, D, BrdRowConstant>,
                linearRowFilter_caller<8 , T, D, BrdRowConstant>,
                linearRowFilter_caller<9 , T, D, BrdRowConstant>,
                linearRowFilter_caller<10, T, D, BrdRowConstant>,
                linearRowFilter_caller<11, T, D, BrdRowConstant>,
                linearRowFilter_caller<12, T, D, BrdRowConstant>,
                linearRowFilter_caller<13, T, D, BrdRowConstant>,
                linearRowFilter_caller<14, T, D, BrdRowConstant>,
                linearRowFilter_caller<15, T, D, BrdRowConstant>,
                linearRowFilter_caller<16, T, D, BrdRowConstant>
            },
            {
                0,
                linearRowFilter_caller<1 , T, D, BrdRowReflect>,
                linearRowFilter_caller<2 , T, D, BrdRowReflect>,
                linearRowFilter_caller<3 , T, D, BrdRowReflect>,
                linearRowFilter_caller<4 , T, D, BrdRowReflect>,
                linearRowFilter_caller<5 , T, D, BrdRowReflect>,
                linearRowFilter_caller<6 , T, D, BrdRowReflect>,
                linearRowFilter_caller<7 , T, D, BrdRowReflect>,
                linearRowFilter_caller<8 , T, D, BrdRowReflect>,
                linearRowFilter_caller<9 , T, D, BrdRowReflect>,
                linearRowFilter_caller<10, T, D, BrdRowReflect>,
                linearRowFilter_caller<11, T, D, BrdRowReflect>,
                linearRowFilter_caller<12, T, D, BrdRowReflect>,
                linearRowFilter_caller<13, T, D, BrdRowReflect>,
                linearRowFilter_caller<14, T, D, BrdRowReflect>,
                linearRowFilter_caller<15, T, D, BrdRowReflect>,
                linearRowFilter_caller<16, T, D, BrdRowReflect>
            },
            {
                0,
                linearRowFilter_caller<1 , T, D, BrdRowWrap>,
                linearRowFilter_caller<2 , T, D, BrdRowWrap>,
                linearRowFilter_caller<3 , T, D, BrdRowWrap>,
                linearRowFilter_caller<4 , T, D, BrdRowWrap>,
                linearRowFilter_caller<5 , T, D, BrdRowWrap>,
                linearRowFilter_caller<6 , T, D, BrdRowWrap>,
                linearRowFilter_caller<7 , T, D, BrdRowWrap>,
                linearRowFilter_caller<8 , T, D, BrdRowWrap>,
                linearRowFilter_caller<9 , T, D, BrdRowWrap>,
                linearRowFilter_caller<10, T, D, BrdRowWrap>,
                linearRowFilter_caller<11, T, D, BrdRowWrap>,
                linearRowFilter_caller<12, T, D, BrdRowWrap>,
                linearRowFilter_caller<13, T, D, BrdRowWrap>,
                linearRowFilter_caller<14, T, D, BrdRowWrap>,
                linearRowFilter_caller<15, T, D, BrdRowWrap>,
                linearRowFilter_caller<16, T, D, BrdRowWrap>
            }
        };

        loadLinearKernel(kernel, ksize);

        callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
    }
|
||||
    template void linearRowFilter_gpu<uchar , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearRowFilter_gpu<uchar4, float4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearRowFilter_gpu<short , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearRowFilter_gpu<short2, float2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearRowFilter_gpu<short3, float3>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearRowFilter_gpu<int   , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearRowFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
}}}

namespace filter_krnls
{
    template <int ksize, typename T, typename D, typename B>
    __global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep_<D> dst, int anchor, const B b)
    {
        __shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3];

        const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x;
        const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y;

        T* sDataColumn = smem + threadIdx.x;

        if (x < src.cols)
        {
            const T* srcCol = src.ptr() + x;

            sDataColumn[ threadIdx.y                    * BLOCK_DIM_X] = b.at_low(y - BLOCK_DIM_Y, srcCol, src.step);
            sDataColumn[(threadIdx.y + BLOCK_DIM_Y)     * BLOCK_DIM_X] = b.at_high(y, srcCol, src.step);
            sDataColumn[(threadIdx.y + BLOCK_DIM_Y * 2) * BLOCK_DIM_X] = b.at_high(y + BLOCK_DIM_Y, srcCol, src.step);

            __syncthreads();

            if (y < src.rows)
            {
                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
                sum_t sum = VecTraits<sum_t>::all(0);

                sDataColumn += (threadIdx.y + BLOCK_DIM_Y - anchor) * BLOCK_DIM_X;

                #pragma unroll
                for(int i = 0; i < ksize; ++i)
                    sum = sum + sDataColumn[i * BLOCK_DIM_X] * cLinearKernel[i];

                dst.ptr(y)[x] = saturate_cast<D>(sum);
            }
        }
    }
}

namespace cv { namespace gpu { namespace filters
{
    template <int ksize, typename T, typename D, template<typename> class B>
    void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
    {
        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
        dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));

        B<T> b(src.rows);

        if (!b.is_range_safe(-BLOCK_DIM_Y, (grid.y + 1) * BLOCK_DIM_Y - 1))
        {
            cv::gpu::error("linearColumnFilter: can't use specified border extrapolation, image is too small, "
                           "try bigger image or another border extrapolation mode", __FILE__, __LINE__);
        }

        filter_krnls::linearColumnFilter<ksize, T, D><<<grid, threads, 0, stream>>>(src, dst, anchor, b);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <typename T, typename D>
    void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
    {
        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
        static const caller_t callers[5][17] =
        {
            {
                0,
                linearColumnFilter_caller<1 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<2 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<3 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<4 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<5 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<6 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<7 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<8 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<9 , T, D, BrdColReflect101>,
                linearColumnFilter_caller<10, T, D, BrdColReflect101>,
                linearColumnFilter_caller<11, T, D, BrdColReflect101>,
                linearColumnFilter_caller<12, T, D, BrdColReflect101>,
                linearColumnFilter_caller<13, T, D, BrdColReflect101>,
                linearColumnFilter_caller<14, T, D, BrdColReflect101>,
                linearColumnFilter_caller<15, T, D, BrdColReflect101>,
                linearColumnFilter_caller<16, T, D, BrdColReflect101>
            },
            {
                0,
                linearColumnFilter_caller<1 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<2 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<3 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<4 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<5 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<6 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<7 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<8 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<9 , T, D, BrdColReplicate>,
                linearColumnFilter_caller<10, T, D, BrdColReplicate>,
                linearColumnFilter_caller<11, T, D, BrdColReplicate>,
                linearColumnFilter_caller<12, T, D, BrdColReplicate>,
                linearColumnFilter_caller<13, T, D, BrdColReplicate>,
                linearColumnFilter_caller<14, T, D, BrdColReplicate>,
                linearColumnFilter_caller<15, T, D, BrdColReplicate>,
                linearColumnFilter_caller<16, T, D, BrdColReplicate>
            },
            {
                0,
                linearColumnFilter_caller<1 , T, D, BrdColConstant>,
                linearColumnFilter_caller<2 , T, D, BrdColConstant>,
                linearColumnFilter_caller<3 , T, D, BrdColConstant>,
                linearColumnFilter_caller<4 , T, D, BrdColConstant>,
                linearColumnFilter_caller<5 , T, D, BrdColConstant>,
                linearColumnFilter_caller<6 , T, D, BrdColConstant>,
                linearColumnFilter_caller<7 , T, D, BrdColConstant>,
                linearColumnFilter_caller<8 , T, D, BrdColConstant>,
                linearColumnFilter_caller<9 , T, D, BrdColConstant>,
                linearColumnFilter_caller<10, T, D, BrdColConstant>,
                linearColumnFilter_caller<11, T, D, BrdColConstant>,
                linearColumnFilter_caller<12, T, D, BrdColConstant>,
                linearColumnFilter_caller<13, T, D, BrdColConstant>,
                linearColumnFilter_caller<14, T, D, BrdColConstant>,
                linearColumnFilter_caller<15, T, D, BrdColConstant>,
                linearColumnFilter_caller<16, T, D, BrdColConstant>
            },
            {
                0,
                linearColumnFilter_caller<1 , T, D, BrdColReflect>,
                linearColumnFilter_caller<2 , T, D, BrdColReflect>,
                linearColumnFilter_caller<3 , T, D, BrdColReflect>,
                linearColumnFilter_caller<4 , T, D, BrdColReflect>,
                linearColumnFilter_caller<5 , T, D, BrdColReflect>,
                linearColumnFilter_caller<6 , T, D, BrdColReflect>,
                linearColumnFilter_caller<7 , T, D, BrdColReflect>,
                linearColumnFilter_caller<8 , T, D, BrdColReflect>,
                linearColumnFilter_caller<9 , T, D, BrdColReflect>,
                linearColumnFilter_caller<10, T, D, BrdColReflect>,
                linearColumnFilter_caller<11, T, D, BrdColReflect>,
                linearColumnFilter_caller<12, T, D, BrdColReflect>,
                linearColumnFilter_caller<13, T, D, BrdColReflect>,
                linearColumnFilter_caller<14, T, D, BrdColReflect>,
                linearColumnFilter_caller<15, T, D, BrdColReflect>,
                linearColumnFilter_caller<16, T, D, BrdColReflect>
            },
            {
                0,
                linearColumnFilter_caller<1 , T, D, BrdColWrap>,
                linearColumnFilter_caller<2 , T, D, BrdColWrap>,
                linearColumnFilter_caller<3 , T, D, BrdColWrap>,
                linearColumnFilter_caller<4 , T, D, BrdColWrap>,
                linearColumnFilter_caller<5 , T, D, BrdColWrap>,
                linearColumnFilter_caller<6 , T, D, BrdColWrap>,
                linearColumnFilter_caller<7 , T, D, BrdColWrap>,
                linearColumnFilter_caller<8 , T, D, BrdColWrap>,
                linearColumnFilter_caller<9 , T, D, BrdColWrap>,
                linearColumnFilter_caller<10, T, D, BrdColWrap>,
                linearColumnFilter_caller<11, T, D, BrdColWrap>,
                linearColumnFilter_caller<12, T, D, BrdColWrap>,
                linearColumnFilter_caller<13, T, D, BrdColWrap>,
                linearColumnFilter_caller<14, T, D, BrdColWrap>,
                linearColumnFilter_caller<15, T, D, BrdColWrap>,
                linearColumnFilter_caller<16, T, D, BrdColWrap>
            }
        };

        loadLinearKernel(kernel, ksize);

        callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
    }

    template void linearColumnFilter_gpu<float , uchar >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearColumnFilter_gpu<float , short >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearColumnFilter_gpu<float2, short2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearColumnFilter_gpu<float3, short3>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearColumnFilter_gpu<float , int   >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearColumnFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
}}}

/////////////////////////////////////////////////////////////////////////////////////////////////
// Bilateral filters

namespace bf_krnls
{
    __constant__ float* ctable_color;
    __constant__ float* ctable_space;
    __constant__ size_t ctable_space_step;

    __constant__ int cndisp;
    __constant__ int cradius;

    __constant__ short cedge_disc;
    __constant__ short cmax_disc;
}

namespace cv { namespace gpu { namespace bf
{
    void load_constants(float* table_color, const DevMem2Df& table_space, int ndisp, int radius, short edge_disc, short max_disc)
    {
        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_color, &table_color, sizeof(table_color)) );
        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space, &table_space.data, sizeof(table_space.data)) );
        size_t table_space_step = table_space.step / sizeof(float);
        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space_step, &table_space_step, sizeof(size_t)) );

        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cndisp, &ndisp, sizeof(int)) );
        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cradius, &radius, sizeof(int)) );

        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cedge_disc, &edge_disc, sizeof(short)) );
        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cmax_disc, &max_disc, sizeof(short)) );
    }
}}}

namespace bf_krnls
{
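    // Color distance between two pixels: the largest absolute per-channel
    // difference in the general case, or the plain absolute difference in the
    // single-channel specialization.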
    template <int channels>
    struct DistRgbMax
    {
        static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
        {
            uchar x = abs(a[0] - b[0]);
            uchar y = abs(a[1] - b[1]);
            uchar z = abs(a[2] - b[2]);
            return (max(max(x, y), z));
        }
    };

    template <>
    struct DistRgbMax<1>
    {
        static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
        {
            return abs(a[0] - b[0]);
        }
    };

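    // One red-black (checkerboard) relaxation pass over the disparity map: t
    // selects the parity and each thread re-examines one pixel whose 4-neighbor
    // disparities disagree with it by at least cedge_disc. The five candidate
    // disparities (center plus the four neighbors) are scored with bilateral
    // weights from the color and space tables, and the cheapest one is kept.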
    template <int channels, typename T>
    __global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w)
    {
        const int y = blockIdx.y * blockDim.y + threadIdx.y;
        const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);

        T dp[5];

        if (y > 0 && y < h - 1 && x > 0 && x < w - 1)
        {
            dp[0] = *(disp + (y  ) * disp_step + x + 0);
            dp[1] = *(disp + (y-1) * disp_step + x + 0);
            dp[2] = *(disp + (y  ) * disp_step + x - 1);
            dp[3] = *(disp + (y+1) * disp_step + x + 0);
            dp[4] = *(disp + (y  ) * disp_step + x + 1);

            if(abs(dp[1] - dp[0]) >= cedge_disc || abs(dp[2] - dp[0]) >= cedge_disc || abs(dp[3] - dp[0]) >= cedge_disc || abs(dp[4] - dp[0]) >= cedge_disc)
            {
                const int ymin = max(0, y - cradius);
                const int xmin = max(0, x - cradius);
                const int ymax = min(h - 1, y + cradius);
                const int xmax = min(w - 1, x + cradius);

                float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f};

                const uchar* ic = img + y * img_step + channels * x;

                for(int yi = ymin; yi <= ymax; yi++)
                {
                    const T* disp_y = disp + yi * disp_step;

                    for(int xi = xmin; xi <= xmax; xi++)
                    {
                        const uchar* in = img + yi * img_step + channels * xi;

                        uchar dist_rgb = DistRgbMax<channels>::calc(in, ic);

                        const float weight = ctable_color[dist_rgb] * (ctable_space + abs(y-yi) * ctable_space_step)[abs(x-xi)];

                        const T disp_reg = disp_y[xi];

                        cost[0] += min(cmax_disc, abs(disp_reg - dp[0])) * weight;
                        cost[1] += min(cmax_disc, abs(disp_reg - dp[1])) * weight;
                        cost[2] += min(cmax_disc, abs(disp_reg - dp[2])) * weight;
                        cost[3] += min(cmax_disc, abs(disp_reg - dp[3])) * weight;
                        cost[4] += min(cmax_disc, abs(disp_reg - dp[4])) * weight;
                    }
                }

                float minimum = numeric_limits<float>::max();
                int id = 0;

                if (cost[0] < minimum)
                {
                    minimum = cost[0];
                    id = 0;
                }
                if (cost[1] < minimum)
                {
                    minimum = cost[1];
                    id = 1;
                }
                if (cost[2] < minimum)
                {
                    minimum = cost[2];
                    id = 2;
                }
                if (cost[3] < minimum)
                {
                    minimum = cost[3];
                    id = 3;
                }
                if (cost[4] < minimum)
                {
                    minimum = cost[4];
                    id = 4;
                }

                *(disp + y * disp_step + x) = dp[id];
            }
        }
    }
}

namespace cv { namespace gpu { namespace bf
{
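    // Each iteration launches the kernel twice, once per checkerboard parity,
    // so a pixel only reads neighbors written in the previous half-pass; the x
    // grid dimension is halved because a thread owns one pixel of its parity.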
    template <typename T>
    void bilateral_filter_caller(const DevMem2D_<T>& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream)
    {
        dim3 threads(32, 8, 1);
        dim3 grid(1, 1, 1);
        grid.x = divUp(disp.cols, threads.x << 1);
        grid.y = divUp(disp.rows, threads.y);

        switch (channels)
        {
        case 1:
            for (int i = 0; i < iters; ++i)
            {
                bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
                cudaSafeCall( cudaGetLastError() );
                bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
                cudaSafeCall( cudaGetLastError() );
            }
            break;
        case 3:
            for (int i = 0; i < iters; ++i)
            {
                bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
                cudaSafeCall( cudaGetLastError() );
                bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
                cudaSafeCall( cudaGetLastError() );
            }
            break;
        default:
            cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
        }

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    void bilateral_filter_gpu(const DevMem2D& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream)
    {
        bilateral_filter_caller(disp, img, channels, iters, stream);
    }

    void bilateral_filter_gpu(const DevMem2D_<short>& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream)
    {
        bilateral_filter_caller(disp, img, channels, iters, stream);
    }
}}}
@ -0,0 +1,185 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "internal_shared.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"

using namespace cv::gpu;
using namespace cv::gpu::device;

namespace cv { namespace gpu { namespace imgproc
{
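    // pyrDown produces one destination row per blockIdx.y: all 256 threads
    // apply the vertical 5-tap Gaussian (1 4 6 4 1)/16 to their source column
    // and stage the result in shared memory with a 2-pixel halo on each side;
    // the first 128 threads then apply the same taps horizontally at even
    // offsets and write the downsampled pixels.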
    template <typename T, typename B> __global__ void pyrDown(const PtrStep_<T> src, PtrStep_<T> dst, const B b, int dst_cols)
    {
        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;

        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y;

        __shared__ value_type smem[256 + 4];

        value_type sum;

        const int src_y = 2 * y;

        sum = VecTraits<value_type>::all(0);

        sum = sum + 0.0625f * b.at(src_y - 2, x, src.data, src.step);
        sum = sum + 0.25f   * b.at(src_y - 1, x, src.data, src.step);
        sum = sum + 0.375f  * b.at(src_y    , x, src.data, src.step);
        sum = sum + 0.25f   * b.at(src_y + 1, x, src.data, src.step);
        sum = sum + 0.0625f * b.at(src_y + 2, x, src.data, src.step);

        smem[2 + threadIdx.x] = sum;

        if (threadIdx.x < 2)
        {
            const int left_x = x - 2;

            sum = VecTraits<value_type>::all(0);

            sum = sum + 0.0625f * b.at(src_y - 2, left_x, src.data, src.step);
            sum = sum + 0.25f   * b.at(src_y - 1, left_x, src.data, src.step);
            sum = sum + 0.375f  * b.at(src_y    , left_x, src.data, src.step);
            sum = sum + 0.25f   * b.at(src_y + 1, left_x, src.data, src.step);
            sum = sum + 0.0625f * b.at(src_y + 2, left_x, src.data, src.step);

            smem[threadIdx.x] = sum;
        }

        if (threadIdx.x > 253)
        {
            const int right_x = x + 2;

            sum = VecTraits<value_type>::all(0);

            sum = sum + 0.0625f * b.at(src_y - 2, right_x, src.data, src.step);
            sum = sum + 0.25f   * b.at(src_y - 1, right_x, src.data, src.step);
            sum = sum + 0.375f  * b.at(src_y    , right_x, src.data, src.step);
            sum = sum + 0.25f   * b.at(src_y + 1, right_x, src.data, src.step);
            sum = sum + 0.0625f * b.at(src_y + 2, right_x, src.data, src.step);

            smem[4 + threadIdx.x] = sum;
        }

        __syncthreads();

        if (threadIdx.x < 128)
        {
            const int tid2 = threadIdx.x * 2;

            sum = VecTraits<value_type>::all(0);

            sum = sum + 0.0625f * smem[2 + tid2 - 2];
            sum = sum + 0.25f   * smem[2 + tid2 - 1];
            sum = sum + 0.375f  * smem[2 + tid2    ];
            sum = sum + 0.25f   * smem[2 + tid2 + 1];
            sum = sum + 0.0625f * smem[2 + tid2 + 2];

            const int dst_x = (blockIdx.x * blockDim.x + tid2) / 2;

            if (dst_x < dst_cols)
                dst.ptr(y)[dst_x] = saturate_cast<T>(sum);
        }
    }

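    // One 256-thread block covers 256 source columns of one destination row;
    // the border functor is built from the source dimensions so the halo reads
    // above stay defined near the image edges.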
    template <typename T, template <typename> class B> void pyrDown_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
    {
        const dim3 block(256);
        const dim3 grid(divUp(src.cols, block.x), dst.rows);

        B<T> b(src.rows, src.cols);

        pyrDown<T><<<grid, block, 0, stream>>>(src, dst, b, dst.cols);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

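    // borderType indexes the caller table in the order reflect101, replicate,
    // constant, reflect, wrap; the untyped DevMem2D arguments are cast to the
    // channel-typed views the kernel expects.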
template <typename T, int cn> void pyrDown_gpu(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream) |
||||
{ |
||||
typedef typename TypeVec<T, cn>::vec_type type; |
||||
|
||||
typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream); |
||||
|
||||
static const caller_t callers[] = |
||||
{ |
||||
pyrDown_caller<type, BrdReflect101>, pyrDown_caller<type, BrdReplicate>, pyrDown_caller<type, BrdConstant>, pyrDown_caller<type, BrdReflect>, pyrDown_caller<type, BrdWrap> |
||||
}; |
||||
|
||||
callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream); |
||||
} |
||||
|
||||
template void pyrDown_gpu<uchar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream); |
||||
template void pyrDown_gpu<uchar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream); |
||||
template void pyrDown_gpu<uchar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream); |
||||
template void pyrDown_gpu<uchar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream); |
||||
|
||||
template void pyrDown_gpu<schar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream); |
||||
template void pyrDown_gpu<schar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream); |
||||
template void pyrDown_gpu<schar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream); |
||||
template void pyrDown_gpu<schar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream); |
||||
|
||||
template void pyrDown_gpu<ushort, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream); |
||||
template void pyrDown_gpu<ushort, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream); |
||||
template void pyrDown_gpu<ushort, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream); |
||||
template void pyrDown_gpu<ushort, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream); |
||||
|
||||
template void pyrDown_gpu<short, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream); |
||||
template void pyrDown_gpu<short, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream); |
||||
template void pyrDown_gpu<short, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream); |
||||
template void pyrDown_gpu<short, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream); |
||||
|
||||
template void pyrDown_gpu<int, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream); |
||||
template void pyrDown_gpu<int, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream); |
||||
template void pyrDown_gpu<int, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream); |
||||
template void pyrDown_gpu<int, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream); |
||||
|
||||
template void pyrDown_gpu<float, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream); |
||||
template void pyrDown_gpu<float, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream); |
||||
template void pyrDown_gpu<float, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream); |
||||
template void pyrDown_gpu<float, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream); |
||||
}}} |
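
The pyrDown_gpu wrapper above resolves the runtime borderType by indexing a static table of per-border template instantiations. A minimal, self-contained sketch of that dispatch pattern, with hypothetical tag and function names (plain C++, not the GPU module's types):

// Sketch of the border-type dispatch pattern used above (illustrative only;
// the Brd tags and process() are hypothetical stand-ins, not OpenCV API).
#include <cstdio>

struct BrdReflect101 { static const char* name() { return "reflect101"; } };
struct BrdReplicate  { static const char* name() { return "replicate";  } };
struct BrdConstant   { static const char* name() { return "constant";   } };

template <typename Brd> void process(int value)
{
    std::printf("%s(%d)\n", Brd::name(), value);
}

typedef void (*caller_t)(int);

void dispatch(int borderType, int value)
{
    // One entry per border mode, in the same order the caller uses as index.
    static const caller_t callers[] =
    {
        process<BrdReflect101>, process<BrdReplicate>, process<BrdConstant>
    };
    callers[borderType](value);
}

int main()
{
    dispatch(1, 42); // prints "replicate(42)"
}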
@ -0,0 +1,180 @@
#include "internal_shared.hpp" |
||||
#include "opencv2/gpu/device/border_interpolate.hpp" |
||||
#include "opencv2/gpu/device/vec_traits.hpp" |
||||
#include "opencv2/gpu/device/vec_math.hpp" |
||||
#include "opencv2/gpu/device/saturate_cast.hpp" |
||||
|
||||
using namespace cv::gpu; |
||||
using namespace cv::gpu::device; |
||||
|
||||
namespace cv { namespace gpu { namespace imgproc |
||||
{ |
||||
template <typename T, typename B> __global__ void pyrUp(const PtrStep_<T> src, DevMem2D_<T> dst, const B b) |
||||
{ |
||||
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type; |
||||
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x; |
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y; |
||||
|
||||
__shared__ T smem1[10][10]; |
||||
__shared__ value_type smem2[20][16]; |
||||
|
||||
value_type sum; |
||||
|
||||
if (threadIdx.x < 10 && threadIdx.y < 10) |
||||
smem1[threadIdx.y][threadIdx.x] = b.at(blockIdx.y * blockDim.y / 2 + threadIdx.y - 1, blockIdx.x * blockDim.x / 2 + threadIdx.x - 1, src.data, src.step); |
||||
|
||||
__syncthreads(); |
||||
|
||||
const int tidx = threadIdx.x; |
||||
|
||||
sum = VecTraits<value_type>::all(0); |
||||
|
||||
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx - 2) >> 1)]; |
||||
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[1 + threadIdx.y / 2][1 + ((tidx - 1) >> 1)]; |
||||
sum = sum + (tidx % 2 == 0) * 0.375f * smem1[1 + threadIdx.y / 2][1 + ((tidx ) >> 1)]; |
||||
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[1 + threadIdx.y / 2][1 + ((tidx + 1) >> 1)]; |
||||
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx + 2) >> 1)]; |
||||
|
||||
smem2[2 + threadIdx.y][tidx] = sum; |
||||
|
||||
if (threadIdx.y < 2) |
||||
{ |
||||
sum = VecTraits<value_type>::all(0); |
||||
|
||||
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx - 2) >> 1)]; |
||||
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[0][1 + ((tidx - 1) >> 1)]; |
||||
sum = sum + (tidx % 2 == 0) * 0.375f * smem1[0][1 + ((tidx ) >> 1)]; |
||||
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[0][1 + ((tidx + 1) >> 1)]; |
||||
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx + 2) >> 1)]; |
||||
|
||||
smem2[threadIdx.y][tidx] = sum; |
||||
} |
||||
|
||||
if (threadIdx.y > 13) |
||||
{ |
||||
sum = VecTraits<value_type>::all(0); |
||||
|
||||
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx - 2) >> 1)]; |
||||
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[9][1 + ((tidx - 1) >> 1)]; |
||||
sum = sum + (tidx % 2 == 0) * 0.375f * smem1[9][1 + ((tidx ) >> 1)]; |
||||
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[9][1 + ((tidx + 1) >> 1)]; |
||||
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx + 2) >> 1)]; |
||||
|
||||
smem2[4 + threadIdx.y][tidx] = sum; |
||||
} |
||||
|
||||
__syncthreads(); |
||||
|
||||
sum = VecTraits<value_type>::all(0); |
||||
|
||||
sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y - 2][tidx]; |
||||
sum = sum + (tidx % 2 != 0) * 0.25f * smem2[2 + threadIdx.y - 1][tidx]; |
||||
sum = sum + (tidx % 2 == 0) * 0.375f * smem2[2 + threadIdx.y ][tidx]; |
||||
sum = sum + (tidx % 2 != 0) * 0.25f * smem2[2 + threadIdx.y + 1][tidx]; |
||||
sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y + 2][tidx]; |
||||
|
||||
if (x < dst.cols && y < dst.rows) |
||||
dst.ptr(y)[x] = saturate_cast<T>(4.0f * sum); |
||||
} |
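
The 0.0625/0.25/0.375 taps above are the 5-tap binomial kernel 1-4-6-4-1 divided by 16. After zero-insertion upsampling, the even taps and the odd taps each see half the samples, and each subset sums to 1/2, which is why the result is scaled by 4 (2x per separable pass). A small host-side check of that arithmetic (plain C++, illustrative only):

// Verifies that the even and odd tap subsets of the 1-4-6-4-1 kernel each
// sum to 1/2, so one upsampling pass needs a factor of 2 (4 for both passes).
#include <cstdio>

int main()
{
    const float k[5] = { 0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f }; // 1,4,6,4,1 over 16
    float even = k[0] + k[2] + k[4]; // taps landing on existing samples
    float odd  = k[1] + k[3];        // taps landing between them
    std::printf("even = %g, odd = %g\n", even, odd);   // 0.5 and 0.5
    std::printf("scale per pass = %g\n", 1.0f / even); // 2; two passes -> 4
}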

    template <typename T, template <typename> class B> void pyrUp_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
    {
        const dim3 block(16, 16);
        const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

        B<T> b(src.rows, src.cols);

        pyrUp<T><<<grid, block, 0, stream>>>(src, dst, b);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
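
divUp in the grid computation above is the usual ceiling division that sizes the grid so every destination pixel gets a thread. A one-liner sketch of what it computes, assuming the common definition (the real one lives in internal_shared.hpp):

// Ceiling division as commonly defined for grid sizing (assumed definition).
#include <cstdio>

static int divUp(int total, int grain) { return (total + grain - 1) / grain; }

int main()
{
    std::printf("%d\n", divUp(640, 16)); // 40 blocks
    std::printf("%d\n", divUp(641, 16)); // 41 blocks: the last one is partial
}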

    template <typename T, int cn> void pyrUp_gpu(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream)
    {
        typedef typename TypeVec<T, cn>::vec_type type;

        typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);

        static const caller_t callers[] =
        {
            pyrUp_caller<type, BrdReflect101>, pyrUp_caller<type, BrdReplicate>, pyrUp_caller<type, BrdConstant>, pyrUp_caller<type, BrdReflect>, pyrUp_caller<type, BrdWrap>
        };

        callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
    }

    template void pyrUp_gpu<uchar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<uchar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<uchar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<uchar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);

    template void pyrUp_gpu<schar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<schar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<schar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<schar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);

    template void pyrUp_gpu<ushort, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<ushort, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<ushort, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<ushort, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);

    template void pyrUp_gpu<short, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<short, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<short, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<short, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);

    template void pyrUp_gpu<int, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<int, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<int, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<int, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);

    template void pyrUp_gpu<float, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<float, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<float, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<float, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
}}}
@ -0,0 +1,249 @@
#include "internal_shared.hpp" |
||||
#include "opencv2/gpu/device/border_interpolate.hpp" |
||||
#include "opencv2/gpu/device/vec_traits.hpp" |
||||
#include "opencv2/gpu/device/vec_math.hpp" |
||||
#include "opencv2/gpu/device/saturate_cast.hpp" |
||||
#include "opencv2/gpu/device/filters.hpp" |
||||
|
||||
using namespace cv::gpu; |
||||
using namespace cv::gpu::device; |
||||
|
||||
namespace cv { namespace gpu { namespace imgproc |
||||
{ |
||||
|
||||
template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, DevMem2D_<T> dst) |
||||
{ |
||||
const int x = blockDim.x * blockIdx.x + threadIdx.x; |
||||
const int y = blockDim.y * blockIdx.y + threadIdx.y; |
||||
|
||||
if (x < dst.cols && y < dst.rows) |
||||
{ |
||||
const float xcoo = mapx.ptr(y)[x]; |
||||
const float ycoo = mapy.ptr(y)[x]; |
||||
|
||||
dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo)); |
||||
} |
||||
} |

    template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherStream
    {
        static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, const float* borderValue, cudaStream_t stream)
        {
            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;

            dim3 block(32, 8);
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

            B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
            BorderReader< PtrStep_<T>, B<work_type> > brdSrc(src, brd);
            Filter< BorderReader< PtrStep_<T>, B<work_type> > > filter_src(brdSrc);

            remap<<<grid, block, 0, stream>>>(filter_src, mapx, mapy, dst);
            cudaSafeCall( cudaGetLastError() );
        }
    };

    template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStream
    {
        static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, const float* borderValue)
        {
            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;

            dim3 block(32, 8);
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

            B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
            BorderReader< PtrStep_<T>, B<work_type> > brdSrc(src, brd);
            Filter< BorderReader< PtrStep_<T>, B<work_type> > > filter_src(brdSrc);

            remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
    };

#define OPENCV_GPU_IMPLEMENT_REMAP_TEX(type) \
    texture< type , cudaTextureType2D> tex_remap_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
    struct tex_remap_ ## type ## _reader \
    { \
        typedef type elem_type; \
        typedef int index_type; \
        __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
        { \
            return tex2D(tex_remap_ ## type , x, y); \
        } \
    }; \
    template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, type> \
    { \
        static void call(const DevMem2D_< type >& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_< type >& dst, const float* borderValue) \
        { \
            typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
            dim3 block(32, 8); \
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
            TextureBinder texHandler(&tex_remap_ ## type , src); \
            tex_remap_ ## type ##_reader texSrc; \
            B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
            BorderReader< tex_remap_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
            Filter< BorderReader< tex_remap_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
            remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
            cudaSafeCall( cudaGetLastError() ); \
            cudaSafeCall( cudaDeviceSynchronize() ); \
        } \
    }; \
    template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, type> \
    { \
        static void call(const DevMem2D_< type >& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_< type >& dst, const float*) \
        { \
            dim3 block(32, 8); \
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
            TextureBinder texHandler(&tex_remap_ ## type , src); \
            tex_remap_ ## type ##_reader texSrc; \
            Filter< tex_remap_ ## type ##_reader > filter_src(texSrc); \
            remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
            cudaSafeCall( cudaGetLastError() ); \
            cudaSafeCall( cudaDeviceSynchronize() ); \
        } \
    };

    OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar)
    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar2)
    OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar4)

    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(schar)
    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(char2)
    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(char4)

    OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort)
    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort2)
    OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort4)

    OPENCV_GPU_IMPLEMENT_REMAP_TEX(short)
    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(short2)
    OPENCV_GPU_IMPLEMENT_REMAP_TEX(short4)

    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(int)
    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(int2)
    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(int4)

    OPENCV_GPU_IMPLEMENT_REMAP_TEX(float)
    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(float2)
    OPENCV_GPU_IMPLEMENT_REMAP_TEX(float4)

#undef OPENCV_GPU_IMPLEMENT_REMAP_TEX
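
The macro above stamps out one texture object, one reader struct, and two dispatcher specializations per element type through ## token pasting. A compact, CUDA-free sketch of the same generation technique (all names here are illustrative, not OpenCV's):

// Sketch of per-type code generation with token pasting, as the remap/resize
// texture macros do above (hypothetical names).
#include <cstdio>

#define IMPLEMENT_READER(type)                                   \
    struct reader_ ## type                                       \
    {                                                            \
        type operator()(type v) const { return v; }              \
        static const char* name() { return "reader_" #type; }    \
    };

IMPLEMENT_READER(int)
IMPLEMENT_READER(float)

#undef IMPLEMENT_READER

int main()
{
    std::printf("%s %s\n", reader_int::name(), reader_float::name());
}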

    template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
    {
        static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, const float* borderValue, cudaStream_t stream)
        {
            if (stream == 0)
                RemapDispatcherNonStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue);
            else
                RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream);
        }
    };

    template <typename T> void remap_gpu(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream)
    {
        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D_<T>& dst, const float* borderValue, cudaStream_t stream);

        static const caller_t callers[3][5] =
        {
            {
                RemapDispatcher<PointFilter, BrdReflect101, T>::call,
                RemapDispatcher<PointFilter, BrdReplicate, T>::call,
                RemapDispatcher<PointFilter, BrdConstant, T>::call,
                RemapDispatcher<PointFilter, BrdReflect, T>::call,
                RemapDispatcher<PointFilter, BrdWrap, T>::call
            },
            {
                RemapDispatcher<LinearFilter, BrdReflect101, T>::call,
                RemapDispatcher<LinearFilter, BrdReplicate, T>::call,
                RemapDispatcher<LinearFilter, BrdConstant, T>::call,
                RemapDispatcher<LinearFilter, BrdReflect, T>::call,
                RemapDispatcher<LinearFilter, BrdWrap, T>::call
            },
            {
                RemapDispatcher<CubicFilter, BrdReflect101, T>::call,
                RemapDispatcher<CubicFilter, BrdReplicate, T>::call,
                RemapDispatcher<CubicFilter, BrdConstant, T>::call,
                RemapDispatcher<CubicFilter, BrdReflect, T>::call,
                RemapDispatcher<CubicFilter, BrdWrap, T>::call
            }
        };

        callers[interpolation][borderMode](static_cast< DevMem2D_<T> >(src), xmap, ymap, static_cast< DevMem2D_<T> >(dst), borderValue, stream);
    }

    template void remap_gpu<uchar >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    //template void remap_gpu<uchar2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<uchar3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<uchar4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);

    //template void remap_gpu<schar>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    //template void remap_gpu<char2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    //template void remap_gpu<char3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    //template void remap_gpu<char4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);

    template void remap_gpu<ushort >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    //template void remap_gpu<ushort2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<ushort3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<ushort4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);

    template void remap_gpu<short >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    //template void remap_gpu<short2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<short3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<short4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);

    //template void remap_gpu<int >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    //template void remap_gpu<int2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    //template void remap_gpu<int3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    //template void remap_gpu<int4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);

    template void remap_gpu<float >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    //template void remap_gpu<float2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<float3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
    template void remap_gpu<float4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
}}}
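
For reference, the contract these kernels implement is dst(y, x) = src(mapy(y, x), mapx(y, x)), with interpolation and border handling folded into the src reader. A nearest-neighbor CPU sketch of that contract under replicate borders (single-channel float, illustrative only; __float2int_rd corresponds to floor):

// CPU reference for the remap contract above: nearest-neighbor sampling with
// replicate border handling (a sketch, not the GPU module's implementation).
#include <algorithm>
#include <cmath>
#include <vector>

static float remapNN(const std::vector<float>& src, int rows, int cols,
                     float ycoo, float xcoo)
{
    int yi = std::min(std::max((int)std::floor(ycoo), 0), rows - 1); // BrdReplicate
    int xi = std::min(std::max((int)std::floor(xcoo), 0), cols - 1);
    return src[yi * cols + xi];
}

int main()
{
    std::vector<float> src = { 1, 2, 3, 4 };   // 2x2 image
    float v = remapNN(src, 2, 2, -0.7f, 1.2f); // clamps to row 0, col 1
    return v == 2.0f ? 0 : 1;
}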
@ -0,0 +1,264 @@
#include "internal_shared.hpp" |
||||
#include "opencv2/gpu/device/border_interpolate.hpp" |
||||
#include "opencv2/gpu/device/vec_traits.hpp" |
||||
#include "opencv2/gpu/device/vec_math.hpp" |
||||
#include "opencv2/gpu/device/saturate_cast.hpp" |
||||
#include "opencv2/gpu/device/filters.hpp" |
||||
|
||||
using namespace cv::gpu; |
||||
using namespace cv::gpu::device; |
||||
|
||||
namespace cv { namespace gpu { namespace imgproc |
||||
{ |
||||
|
||||
template <typename Ptr2D, typename T> __global__ void resize(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst) |
||||
{ |
||||
const int x = blockDim.x * blockIdx.x + threadIdx.x; |
||||
const int y = blockDim.y * blockIdx.y + threadIdx.y; |
||||
|
||||
if (x < dst.cols && y < dst.rows) |
||||
{ |
||||
const float xcoo = x / fx; |
||||
const float ycoo = y / fy; |
||||
|
||||
dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo)); |
||||
} |
||||
} |
||||
template <typename Ptr2D, typename T> __global__ void resizeNN(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst) |
||||
{ |
||||
const int x = blockDim.x * blockIdx.x + threadIdx.x; |
||||
const int y = blockDim.y * blockIdx.y + threadIdx.y; |
||||
|
||||
if (x < dst.cols && y < dst.rows) |
||||
{ |
||||
const float xcoo = x / fx; |
||||
const float ycoo = y / fy; |
||||
|
||||
dst.ptr(y)[x] = src(__float2int_rd(ycoo), __float2int_rd(xcoo)); |
||||
} |
||||
} |
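
The same inverse-mapping index arithmetic on the CPU (illustrative sketch only; no border reader here, so it assumes an upscale from the origin where the computed indices stay in range):

// CPU mirror of the resizeNN index arithmetic: dst(x, y) pulls from
// src(floor(y / fy), floor(x / fx)).
#include <cmath>
#include <cstdio>

int main()
{
    const float fx = 2.0f; // 2x horizontal upscale
    for (int x = 0; x < 4; ++x)
        std::printf("dst x=%d <- src x=%d\n", x, (int)std::floor(x / fx));
    // prints 0,0,1,1: each source column is replicated twice
}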

    template <template <typename> class Filter, typename T> struct ResizeDispatcherStream
    {
        static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

            BrdReplicate<T> brd(src.rows, src.cols);
            BorderReader< PtrStep_<T>, BrdReplicate<T> > brdSrc(src, brd);
            Filter< BorderReader< PtrStep_<T>, BrdReplicate<T> > > filter_src(brdSrc);

            resize<<<grid, block, 0, stream>>>(filter_src, fx, fy, dst);
            cudaSafeCall( cudaGetLastError() );
        }
    };
    template <typename T> struct ResizeDispatcherStream<PointFilter, T>
    {
        static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

            BrdReplicate<T> brd(src.rows, src.cols);
            BorderReader< PtrStep_<T>, BrdReplicate<T> > brdSrc(src, brd);

            resizeNN<<<grid, block, 0, stream>>>(brdSrc, fx, fy, dst);
            cudaSafeCall( cudaGetLastError() );
        }
    };

    template <template <typename> class Filter, typename T> struct ResizeDispatcherNonStream
    {
        static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

            BrdReplicate<T> brd(src.rows, src.cols);
            BorderReader< PtrStep_<T>, BrdReplicate<T> > brdSrc(src, brd);
            Filter< BorderReader< PtrStep_<T>, BrdReplicate<T> > > filter_src(brdSrc);

            resize<<<grid, block>>>(filter_src, fx, fy, dst);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
    };
    template <typename T> struct ResizeDispatcherNonStream<PointFilter, T>
    {
        static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

            BrdReplicate<T> brd(src.rows, src.cols);
            BorderReader< PtrStep_<T>, BrdReplicate<T> > brdSrc(src, brd);

            resizeNN<<<grid, block>>>(brdSrc, fx, fy, dst);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
    };

#define OPENCV_GPU_IMPLEMENT_RESIZE_TEX(type) \
    texture< type , cudaTextureType2D> tex_resize_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
    struct tex_resize_ ## type ## _reader \
    { \
        typedef type elem_type; \
        typedef int index_type; \
        __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
        { \
            return tex2D(tex_resize_ ## type , x, y); \
        } \
    }; \
    template <template <typename> class Filter> struct ResizeDispatcherNonStream<Filter, type> \
    { \
        static void call(const DevMem2D_< type >& src, float fx, float fy, const DevMem2D_< type >& dst) \
        { \
            dim3 block(32, 8); \
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
            TextureBinder texHandler(&tex_resize_ ## type , src); \
            tex_resize_ ## type ##_reader texSrc; \
            Filter< tex_resize_ ## type ##_reader > filter_src(texSrc); \
            resize<<<grid, block>>>(filter_src, fx, fy, dst); \
            cudaSafeCall( cudaGetLastError() ); \
            cudaSafeCall( cudaDeviceSynchronize() ); \
        } \
    }; \
    template <> struct ResizeDispatcherNonStream<PointFilter, type> \
    { \
        static void call(const DevMem2D_< type >& src, float fx, float fy, const DevMem2D_< type >& dst) \
        { \
            dim3 block(32, 8); \
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
            TextureBinder texHandler(&tex_resize_ ## type , src); \
            tex_resize_ ## type ##_reader texSrc; \
            resizeNN<<<grid, block>>>(texSrc, fx, fy, dst); \
            cudaSafeCall( cudaGetLastError() ); \
            cudaSafeCall( cudaDeviceSynchronize() ); \
        } \
    };

    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar)
    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar2)
    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar4)

    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(schar)
    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char2)
    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char4)

    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort)
    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort2)
    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort4)

    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short)
    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short2)
    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short4)

    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int)
    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int2)
    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int4)

    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float)
    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float2)
    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float4)

#undef OPENCV_GPU_IMPLEMENT_RESIZE_TEX

    template <template <typename> class Filter, typename T> struct ResizeDispatcher
    {
        static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
        {
            if (stream == 0)
                ResizeDispatcherNonStream<Filter, T>::call(src, fx, fy, dst);
            else
                ResizeDispatcherStream<Filter, T>::call(src, fx, fy, dst, stream);
        }
    };

    template <typename T> void resize_gpu(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream)
    {
        typedef void (*caller_t)(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream);

        static const caller_t callers[3] =
        {
            ResizeDispatcher<PointFilter, T>::call, ResizeDispatcher<LinearFilter, T>::call, ResizeDispatcher<CubicFilter, T>::call
        };

        callers[interpolation](static_cast< DevMem2D_<T> >(src), fx, fy, static_cast< DevMem2D_<T> >(dst), stream);
    }

    template void resize_gpu<uchar >(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    //template void resize_gpu<uchar2>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<uchar3>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<uchar4>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);

    //template void resize_gpu<schar>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    //template void resize_gpu<char2>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    //template void resize_gpu<char3>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    //template void resize_gpu<char4>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);

    template void resize_gpu<ushort >(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    //template void resize_gpu<ushort2>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<ushort3>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<ushort4>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);

    template void resize_gpu<short >(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    //template void resize_gpu<short2>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<short3>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<short4>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);

    //template void resize_gpu<int >(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    //template void resize_gpu<int2>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    //template void resize_gpu<int3>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    //template void resize_gpu<int4>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);

    template void resize_gpu<float >(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    //template void resize_gpu<float2>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<float3>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<float4>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
}}}
@ -0,0 +1,256 @@
#include "internal_shared.hpp" |
||||
#include "opencv2/gpu/device/saturate_cast.hpp" |
||||
#include "opencv2/gpu/device/vec_math.hpp" |
||||
#include "opencv2/gpu/device/limits.hpp" |
||||
#include "opencv2/gpu/device/border_interpolate.hpp" |
||||
|
||||
using namespace cv::gpu; |
||||
using namespace cv::gpu::device; |
||||
|
||||
#define MAX_KERNEL_SIZE 16 |
||||
#define BLOCK_DIM_X 16 |
||||
#define BLOCK_DIM_Y 16 |
||||
|
||||
namespace filter_krnls_row |
||||
{ |
||||
__constant__ float cLinearKernel[MAX_KERNEL_SIZE]; |
||||
|
||||
void loadLinearKernel(const float kernel[], int ksize) |
||||
{ |
||||
cudaSafeCall( cudaMemcpyToSymbol(cLinearKernel, kernel, ksize * sizeof(float)) ); |
||||
} |
||||
|
||||
template <typename T, size_t size> struct SmemType_ |
||||
{ |
||||
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type smem_t; |
||||
}; |
||||
template <typename T> struct SmemType_<T, 4> |
||||
{ |
||||
typedef T smem_t; |
||||
}; |
||||
template <typename T> struct SmemType |
||||
{ |
||||
typedef typename SmemType_<T, sizeof(T)>::smem_t smem_t; |
||||
}; |
||||
|
||||
template <int ksize, typename T, typename D, typename B> |
||||
__global__ void linearRowFilter(const DevMem2D_<T> src, PtrStep_<D> dst, int anchor, const B b) |
||||
{ |
||||
typedef typename SmemType<T>::smem_t smem_t; |
||||
|
||||
__shared__ smem_t smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3]; |
||||
|
||||
const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x; |
||||
const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y; |
||||
|
||||
smem_t* sDataRow = smem + threadIdx.y * BLOCK_DIM_X * 3; |
||||
|
||||
if (y < src.rows) |
||||
{ |
||||
const T* rowSrc = src.ptr(y); |
||||
|
||||
sDataRow[threadIdx.x ] = b.at_low(x - BLOCK_DIM_X, rowSrc); |
||||
sDataRow[threadIdx.x + BLOCK_DIM_X ] = b.at_high(x, rowSrc); |
||||
sDataRow[threadIdx.x + BLOCK_DIM_X * 2] = b.at_high(x + BLOCK_DIM_X, rowSrc); |
||||
|
||||
__syncthreads(); |
||||
|
||||
if (x < src.cols) |
||||
{ |
||||
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t; |
||||
sum_t sum = VecTraits<sum_t>::all(0); |
||||
|
||||
sDataRow += threadIdx.x + BLOCK_DIM_X - anchor; |
||||
|
||||
#pragma unroll |
||||
for(int i = 0; i < ksize; ++i) |
||||
sum = sum + sDataRow[i] * cLinearKernel[i]; |
||||
|
||||
dst.ptr(y)[x] = saturate_cast<D>(sum); |
||||
} |
||||
} |
||||
} |
||||
} |
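
Each thread row stages three BLOCK_DIM_X-wide segments in shared memory, a left apron, the block's own pixels, and a right apron, so the ksize-tap convolution reads all its neighbors without further global loads. A CPU sketch of the same apron idea with hypothetical sizes (plain C++; the index offset mirrors the "+ BLOCK_DIM_X - anchor" adjustment above):

// Sketch of the row-filter apron staging: copy [x - W, x + 2W) with border
// clamping into a 3W scratch row, then convolve from the scratch row only.
#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
    const int W = 4;                       // stands in for BLOCK_DIM_X
    const float src[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
    const int cols = 8, blockStart = 4;    // second "block" of the row

    std::vector<float> s(3 * W);           // left apron | body | right apron
    for (int i = 0; i < 3 * W; ++i)
    {
        int x = blockStart + i - W;        // global column for scratch slot i
        s[i] = src[std::min(std::max(x, 0), cols - 1)]; // replicate border
    }

    const float k[3] = { 0.25f, 0.5f, 0.25f }; // ksize = 3, anchor = 1
    for (int t = 0; t < W; ++t)            // t plays the thread index role
    {
        float sum = 0.f;
        for (int i = 0; i < 3; ++i)
            sum += s[t + W - 1 + i] * k[i]; // t + W - anchor + i
        std::printf("out[%d] = %g\n", blockStart + t, sum);
    }
}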

namespace cv { namespace gpu { namespace filters
{
    template <int ksize, typename T, typename D, template<typename> class B>
    void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
    {
        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
        dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));

        typedef typename filter_krnls_row::SmemType<T>::smem_t smem_t;
        B<smem_t> b(src.cols);

        if (!b.is_range_safe(-BLOCK_DIM_X, (grid.x + 1) * BLOCK_DIM_X - 1))
        {
            cv::gpu::error("linearRowFilter: can't use specified border extrapolation, image is too small, "
                           "try bigger image or another border extrapolation mode", __FILE__, __LINE__);
        }

        filter_krnls_row::linearRowFilter<ksize, T, D><<<grid, threads, 0, stream>>>(src, dst, anchor, b);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <typename T, typename D>
    void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
    {
        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
        static const caller_t callers[5][17] =
        {
            {
                0,
                linearRowFilter_caller< 1, T, D, BrdRowReflect101>, linearRowFilter_caller< 2, T, D, BrdRowReflect101>,
                linearRowFilter_caller< 3, T, D, BrdRowReflect101>, linearRowFilter_caller< 4, T, D, BrdRowReflect101>,
                linearRowFilter_caller< 5, T, D, BrdRowReflect101>, linearRowFilter_caller< 6, T, D, BrdRowReflect101>,
                linearRowFilter_caller< 7, T, D, BrdRowReflect101>, linearRowFilter_caller< 8, T, D, BrdRowReflect101>,
                linearRowFilter_caller< 9, T, D, BrdRowReflect101>, linearRowFilter_caller<10, T, D, BrdRowReflect101>,
                linearRowFilter_caller<11, T, D, BrdRowReflect101>, linearRowFilter_caller<12, T, D, BrdRowReflect101>,
                linearRowFilter_caller<13, T, D, BrdRowReflect101>, linearRowFilter_caller<14, T, D, BrdRowReflect101>,
                linearRowFilter_caller<15, T, D, BrdRowReflect101>, linearRowFilter_caller<16, T, D, BrdRowReflect101>
            },
            {
                0,
                linearRowFilter_caller< 1, T, D, BrdRowReplicate>, linearRowFilter_caller< 2, T, D, BrdRowReplicate>,
                linearRowFilter_caller< 3, T, D, BrdRowReplicate>, linearRowFilter_caller< 4, T, D, BrdRowReplicate>,
                linearRowFilter_caller< 5, T, D, BrdRowReplicate>, linearRowFilter_caller< 6, T, D, BrdRowReplicate>,
                linearRowFilter_caller< 7, T, D, BrdRowReplicate>, linearRowFilter_caller< 8, T, D, BrdRowReplicate>,
                linearRowFilter_caller< 9, T, D, BrdRowReplicate>, linearRowFilter_caller<10, T, D, BrdRowReplicate>,
                linearRowFilter_caller<11, T, D, BrdRowReplicate>, linearRowFilter_caller<12, T, D, BrdRowReplicate>,
                linearRowFilter_caller<13, T, D, BrdRowReplicate>, linearRowFilter_caller<14, T, D, BrdRowReplicate>,
                linearRowFilter_caller<15, T, D, BrdRowReplicate>, linearRowFilter_caller<16, T, D, BrdRowReplicate>
            },
            {
                0,
                linearRowFilter_caller< 1, T, D, BrdRowConstant>, linearRowFilter_caller< 2, T, D, BrdRowConstant>,
                linearRowFilter_caller< 3, T, D, BrdRowConstant>, linearRowFilter_caller< 4, T, D, BrdRowConstant>,
                linearRowFilter_caller< 5, T, D, BrdRowConstant>, linearRowFilter_caller< 6, T, D, BrdRowConstant>,
                linearRowFilter_caller< 7, T, D, BrdRowConstant>, linearRowFilter_caller< 8, T, D, BrdRowConstant>,
                linearRowFilter_caller< 9, T, D, BrdRowConstant>, linearRowFilter_caller<10, T, D, BrdRowConstant>,
                linearRowFilter_caller<11, T, D, BrdRowConstant>, linearRowFilter_caller<12, T, D, BrdRowConstant>,
                linearRowFilter_caller<13, T, D, BrdRowConstant>, linearRowFilter_caller<14, T, D, BrdRowConstant>,
                linearRowFilter_caller<15, T, D, BrdRowConstant>, linearRowFilter_caller<16, T, D, BrdRowConstant>
            },
            {
                0,
                linearRowFilter_caller< 1, T, D, BrdRowReflect>, linearRowFilter_caller< 2, T, D, BrdRowReflect>,
                linearRowFilter_caller< 3, T, D, BrdRowReflect>, linearRowFilter_caller< 4, T, D, BrdRowReflect>,
                linearRowFilter_caller< 5, T, D, BrdRowReflect>, linearRowFilter_caller< 6, T, D, BrdRowReflect>,
                linearRowFilter_caller< 7, T, D, BrdRowReflect>, linearRowFilter_caller< 8, T, D, BrdRowReflect>,
                linearRowFilter_caller< 9, T, D, BrdRowReflect>, linearRowFilter_caller<10, T, D, BrdRowReflect>,
                linearRowFilter_caller<11, T, D, BrdRowReflect>, linearRowFilter_caller<12, T, D, BrdRowReflect>,
                linearRowFilter_caller<13, T, D, BrdRowReflect>, linearRowFilter_caller<14, T, D, BrdRowReflect>,
                linearRowFilter_caller<15, T, D, BrdRowReflect>, linearRowFilter_caller<16, T, D, BrdRowReflect>
            },
            {
                0,
                linearRowFilter_caller< 1, T, D, BrdRowWrap>, linearRowFilter_caller< 2, T, D, BrdRowWrap>,
                linearRowFilter_caller< 3, T, D, BrdRowWrap>, linearRowFilter_caller< 4, T, D, BrdRowWrap>,
                linearRowFilter_caller< 5, T, D, BrdRowWrap>, linearRowFilter_caller< 6, T, D, BrdRowWrap>,
                linearRowFilter_caller< 7, T, D, BrdRowWrap>, linearRowFilter_caller< 8, T, D, BrdRowWrap>,
                linearRowFilter_caller< 9, T, D, BrdRowWrap>, linearRowFilter_caller<10, T, D, BrdRowWrap>,
                linearRowFilter_caller<11, T, D, BrdRowWrap>, linearRowFilter_caller<12, T, D, BrdRowWrap>,
                linearRowFilter_caller<13, T, D, BrdRowWrap>, linearRowFilter_caller<14, T, D, BrdRowWrap>,
                linearRowFilter_caller<15, T, D, BrdRowWrap>, linearRowFilter_caller<16, T, D, BrdRowWrap>
            }
        };

        filter_krnls_row::loadLinearKernel(kernel, ksize);

        callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
    }

    template void linearRowFilter_gpu<uchar , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearRowFilter_gpu<uchar4, float4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    //template void linearRowFilter_gpu<short , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    //template void linearRowFilter_gpu<short2, float2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearRowFilter_gpu<short3, float3>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearRowFilter_gpu<int , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
    template void linearRowFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
}}}
@ -0,0 +1,117 @@
#ifndef __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__
#define __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__

#include "../datamov_utils.hpp"

namespace cv { namespace gpu { namespace device
{
    namespace detail
    {
        // Compile-time unrolled strided loop over a vector: each step consumes
        // one cached element and one global element THREAD_DIM apart; the N = 0
        // specialization below terminates the recursion.
        template <int THREAD_DIM, int N> struct UnrollVecDiffCached
        {
            template <typename Dist, typename T1, typename T2>
            static __device__ void calcCheck(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int ind)
            {
                if (ind < len)
                {
                    T1 val1 = *vecCached++;

                    T2 val2;
                    ForceGlob<T2>::Load(vecGlob, ind, val2);

                    dist.reduceIter(val1, val2);

                    UnrollVecDiffCached<THREAD_DIM, N - 1>::calcCheck(vecCached, vecGlob, len, dist, ind + THREAD_DIM);
                }
            }

            template <typename Dist, typename T1, typename T2>
            static __device__ void calcWithoutCheck(const T1* vecCached, const T2* vecGlob, Dist& dist)
            {
                T1 val1 = *vecCached++;

                T2 val2;
                ForceGlob<T2>::Load(vecGlob, 0, val2);
                vecGlob += THREAD_DIM;

                dist.reduceIter(val1, val2);

                UnrollVecDiffCached<THREAD_DIM, N - 1>::calcWithoutCheck(vecCached, vecGlob, dist);
            }
        };
        template <int THREAD_DIM> struct UnrollVecDiffCached<THREAD_DIM, 0>
        {
            template <typename Dist, typename T1, typename T2>
            static __device__ __forceinline__ void calcCheck(const T1*, const T2*, int, Dist&, int)
            {
            }

            template <typename Dist, typename T1, typename T2>
            static __device__ __forceinline__ void calcWithoutCheck(const T1*, const T2*, Dist&)
            {
            }
        };

        // LEN_EQ_MAX_LEN == false: the vector length is only known at run time,
        // so every unrolled step is bounds-checked against len.
        template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN> struct VecDiffCachedCalculator;
        template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, false>
        {
            template <typename Dist, typename T1, typename T2>
            static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
            {
                UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcCheck(vecCached, vecGlob, len, dist, tid);
            }
        };
        // LEN_EQ_MAX_LEN == true: len is exactly MAX_LEN, so the checks can be dropped.
        template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, true>
        {
            template <typename Dist, typename T1, typename T2>
            static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
            {
                UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcWithoutCheck(vecCached, vecGlob + tid, dist);
            }
        };
    }
}}}

#endif // __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__
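
UnrollVecDiffCached turns the strided reduction loop into straight-line code at compile time via recursive template instantiation, with the N = 0 specialization as the base case. The same technique in a standalone, host-only form (a sketch with illustrative names, not the GPU module's types):

// Compile-time loop unrolling by recursive template instantiation, mirroring
// UnrollVecDiffCached above.
#include <cstdio>

template <int STRIDE, int N> struct UnrollSum
{
    static float run(const float* v, int len, int ind)
    {
        if (ind >= len) return 0.f;                      // the "calcCheck" guard
        return v[ind] + UnrollSum<STRIDE, N - 1>::run(v, len, ind + STRIDE);
    }
};
template <int STRIDE> struct UnrollSum<STRIDE, 0>        // base case ends recursion
{
    static float run(const float*, int, int) { return 0.f; }
};

int main()
{
    const float v[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    // "Thread" 0 of a 4-lane layout sums v[0] + v[4]; 8 / 4 = 2 steps unrolled.
    std::printf("%g\n", UnrollSum<4, 8 / 4>::run(v, 8, 0)); // 6
}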