commit
15e7712a26
201 changed files with 25012 additions and 14626 deletions
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,10 @@ |
||||
cmake_minimum_required(VERSION 2.8.3) |
||||
|
||||
project(nv_perf_test) |
||||
|
||||
find_package(OpenCV REQUIRED) |
||||
include_directories(${OpenCV_INCLUDE_DIR}) |
||||
|
||||
add_executable(${PROJECT_NAME} main.cpp) |
||||
|
||||
target_link_libraries(${PROJECT_NAME} ${OpenCV_LIBS}) |
After Width: | Height: | Size: 140 KiB |
After Width: | Height: | Size: 140 KiB |
@ -0,0 +1,489 @@ |
||||
#include <cstdio> |
||||
#define HAVE_CUDA 1 |
||||
#include <opencv2/core/core.hpp> |
||||
#include <opencv2/gpu/gpu.hpp> |
||||
#include <opencv2/highgui/highgui.hpp> |
||||
#include <opencv2/video/video.hpp> |
||||
#include <opencv2/legacy/legacy.hpp> |
||||
#include <opencv2/ts/ts.hpp> |
||||
#include <opencv2/ts/ts_perf.hpp> |
||||
|
||||
static void printOsInfo() |
||||
{ |
||||
#if defined _WIN32 |
||||
# if defined _WIN64 |
||||
printf("[----------]\n[ GPU INFO ] \tRun on OS Windows x64.\n[----------]\n"); fflush(stdout); |
||||
# else |
||||
printf("[----------]\n[ GPU INFO ] \tRun on OS Windows x32.\n[----------]\n"); fflush(stdout); |
||||
# endif |
||||
#elif defined linux |
||||
# if defined _LP64 |
||||
printf("[----------]\n[ GPU INFO ] \tRun on OS Linux x64.\n[----------]\n"); fflush(stdout); |
||||
# else |
||||
printf("[----------]\n[ GPU INFO ] \tRun on OS Linux x32.\n[----------]\n"); fflush(stdout); |
||||
# endif |
||||
#elif defined __APPLE__ |
||||
# if defined _LP64 |
||||
printf("[----------]\n[ GPU INFO ] \tRun on OS Apple x64.\n[----------]\n"); fflush(stdout); |
||||
# else |
||||
printf("[----------]\n[ GPU INFO ] \tRun on OS Apple x32.\n[----------]\n"); fflush(stdout); |
||||
# endif |
||||
#endif |
||||
} |
||||
|
||||
static void printCudaInfo() |
||||
{ |
||||
const int deviceCount = cv::gpu::getCudaEnabledDeviceCount(); |
||||
|
||||
printf("[----------]\n"); fflush(stdout); |
||||
printf("[ GPU INFO ] \tCUDA device count:: %d.\n", deviceCount); fflush(stdout); |
||||
printf("[----------]\n"); fflush(stdout); |
||||
|
||||
for (int i = 0; i < deviceCount; ++i) |
||||
{ |
||||
cv::gpu::DeviceInfo info(i); |
||||
|
||||
printf("[----------]\n"); fflush(stdout); |
||||
printf("[ DEVICE ] \t# %d %s.\n", i, info.name().c_str()); fflush(stdout); |
||||
printf("[ ] \tCompute capability: %d.%d\n", info.majorVersion(), info.minorVersion()); fflush(stdout); |
||||
printf("[ ] \tMulti Processor Count: %d\n", info.multiProcessorCount()); fflush(stdout); |
||||
printf("[ ] \tTotal memory: %d Mb\n", static_cast<int>(static_cast<int>(info.totalMemory() / 1024.0) / 1024.0)); fflush(stdout); |
||||
printf("[ ] \tFree memory: %d Mb\n", static_cast<int>(static_cast<int>(info.freeMemory() / 1024.0) / 1024.0)); fflush(stdout); |
||||
if (!info.isCompatible()) |
||||
printf("[ GPU INFO ] \tThis device is NOT compatible with current GPU module build\n"); |
||||
printf("[----------]\n"); fflush(stdout); |
||||
} |
||||
} |
||||
|
||||
int main(int argc, char* argv[]) |
||||
{ |
||||
printOsInfo(); |
||||
printCudaInfo(); |
||||
|
||||
perf::Regression::Init("nv_perf_test"); |
||||
perf::TestBase::Init(argc, argv); |
||||
testing::InitGoogleTest(&argc, argv); |
||||
|
||||
return RUN_ALL_TESTS(); |
||||
} |
||||
|
||||
#define DEF_PARAM_TEST(name, ...) typedef ::perf::TestBaseWithParam< std::tr1::tuple< __VA_ARGS__ > > name |
||||
#define DEF_PARAM_TEST_1(name, param_type) typedef ::perf::TestBaseWithParam< param_type > name |
||||
|
||||
//////////////////////////////////////////////////////////
|
||||
// HoughLinesP
|
||||
|
||||
DEF_PARAM_TEST_1(Image, std::string); |
||||
|
||||
PERF_TEST_P(Image, HoughLinesP, |
||||
testing::Values(std::string("im1_1280x800.jpg"))) |
||||
{ |
||||
declare.time(30.0); |
||||
|
||||
std::string fileName = GetParam(); |
||||
|
||||
const double rho = 1.0; |
||||
const double theta = 1.0; |
||||
const int threshold = 40; |
||||
const int minLineLenght = 20; |
||||
const int maxLineGap = 5; |
||||
|
||||
cv::Mat image = cv::imread(fileName, cv::IMREAD_GRAYSCALE); |
||||
|
||||
if (PERF_RUN_GPU()) |
||||
{ |
||||
cv::gpu::GpuMat d_image(image); |
||||
cv::gpu::GpuMat d_lines; |
||||
cv::gpu::HoughLinesBuf d_buf; |
||||
|
||||
cv::gpu::HoughLinesP(d_image, d_lines, d_buf, rho, theta, minLineLenght, maxLineGap); |
||||
|
||||
TEST_CYCLE() |
||||
{ |
||||
cv::gpu::HoughLinesP(d_image, d_lines, d_buf, rho, theta, minLineLenght, maxLineGap); |
||||
} |
||||
} |
||||
else |
||||
{ |
||||
cv::Mat mask; |
||||
cv::Canny(image, mask, 50, 100); |
||||
|
||||
std::vector<cv::Vec4i> lines; |
||||
cv::HoughLinesP(mask, lines, rho, theta, threshold, minLineLenght, maxLineGap); |
||||
|
||||
TEST_CYCLE() |
||||
{ |
||||
cv::HoughLinesP(mask, lines, rho, theta, threshold, minLineLenght, maxLineGap); |
||||
} |
||||
} |
||||
|
||||
SANITY_CHECK(0); |
||||
} |
||||
|
||||
//////////////////////////////////////////////////////////
|
||||
// GoodFeaturesToTrack
|
||||
|
||||
DEF_PARAM_TEST(Image_Depth, std::string, perf::MatDepth); |
||||
|
||||
PERF_TEST_P(Image_Depth, GoodFeaturesToTrack, |
||||
testing::Combine( |
||||
testing::Values(std::string("im1_1280x800.jpg")), |
||||
testing::Values(CV_8U, CV_16U) |
||||
)) |
||||
{ |
||||
declare.time(60); |
||||
|
||||
const std::string fileName = std::tr1::get<0>(GetParam()); |
||||
const int depth = std::tr1::get<1>(GetParam()); |
||||
|
||||
const int maxCorners = 5000; |
||||
const double qualityLevel = 0.05; |
||||
const int minDistance = 5; |
||||
const int blockSize = 3; |
||||
const bool useHarrisDetector = true; |
||||
const double k = 0.05; |
||||
|
||||
cv::Mat src = cv::imread(fileName, cv::IMREAD_GRAYSCALE); |
||||
if (src.empty()) |
||||
FAIL() << "Unable to load source image [" << fileName << "]"; |
||||
|
||||
if (depth != CV_8U) |
||||
src.convertTo(src, depth); |
||||
|
||||
cv::Mat mask(src.size(), CV_8UC1, cv::Scalar::all(1)); |
||||
mask(cv::Rect(0, 0, 100, 100)).setTo(cv::Scalar::all(0)); |
||||
|
||||
if (PERF_RUN_GPU()) |
||||
{ |
||||
cv::gpu::GoodFeaturesToTrackDetector_GPU d_detector(maxCorners, qualityLevel, minDistance, blockSize, useHarrisDetector, k); |
||||
|
||||
cv::gpu::GpuMat d_src(src); |
||||
cv::gpu::GpuMat d_mask(mask); |
||||
cv::gpu::GpuMat d_pts; |
||||
|
||||
d_detector(d_src, d_pts, d_mask); |
||||
|
||||
TEST_CYCLE() |
||||
{ |
||||
d_detector(d_src, d_pts, d_mask); |
||||
} |
||||
} |
||||
else |
||||
{ |
||||
if (depth != CV_8U) |
||||
FAIL() << "Unsupported depth"; |
||||
|
||||
cv::Mat pts; |
||||
|
||||
cv::goodFeaturesToTrack(src, pts, maxCorners, qualityLevel, minDistance, mask, blockSize, useHarrisDetector, k); |
||||
|
||||
TEST_CYCLE() |
||||
{ |
||||
cv::goodFeaturesToTrack(src, pts, maxCorners, qualityLevel, minDistance, mask, blockSize, useHarrisDetector, k); |
||||
} |
||||
} |
||||
|
||||
SANITY_CHECK(0); |
||||
} |
||||
|
||||
//////////////////////////////////////////////////////////
|
||||
// OpticalFlowPyrLKSparse
|
||||
|
||||
typedef std::pair<std::string, std::string> string_pair; |
||||
|
||||
DEF_PARAM_TEST(ImagePair_Depth_GraySource, string_pair, perf::MatDepth, bool); |
||||
|
||||
PERF_TEST_P(ImagePair_Depth_GraySource, OpticalFlowPyrLKSparse, |
||||
testing::Combine( |
||||
testing::Values(string_pair("im1_1280x800.jpg", "im2_1280x800.jpg")), |
||||
testing::Values(CV_8U, CV_16U), |
||||
testing::Bool() |
||||
)) |
||||
{ |
||||
declare.time(60); |
||||
|
||||
const string_pair fileNames = std::tr1::get<0>(GetParam()); |
||||
const int depth = std::tr1::get<1>(GetParam()); |
||||
const bool graySource = std::tr1::get<2>(GetParam()); |
||||
|
||||
// PyrLK params
|
||||
const cv::Size winSize(15, 15); |
||||
const int maxLevel = 5; |
||||
const cv::TermCriteria criteria(cv::TermCriteria::COUNT + cv::TermCriteria::EPS, 30, 0.01); |
||||
|
||||
// GoodFeaturesToTrack params
|
||||
const int maxCorners = 5000; |
||||
const double qualityLevel = 0.05; |
||||
const int minDistance = 5; |
||||
const int blockSize = 3; |
||||
const bool useHarrisDetector = true; |
||||
const double k = 0.05; |
||||
|
||||
cv::Mat src1 = cv::imread(fileNames.first, graySource ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR); |
||||
if (src1.empty()) |
||||
FAIL() << "Unable to load source image [" << fileNames.first << "]"; |
||||
|
||||
cv::Mat src2 = cv::imread(fileNames.second, graySource ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR); |
||||
if (src2.empty()) |
||||
FAIL() << "Unable to load source image [" << fileNames.second << "]"; |
||||
|
||||
cv::Mat gray_src; |
||||
if (graySource) |
||||
gray_src = src1; |
||||
else |
||||
cv::cvtColor(src1, gray_src, cv::COLOR_BGR2GRAY); |
||||
|
||||
cv::Mat pts; |
||||
cv::goodFeaturesToTrack(gray_src, pts, maxCorners, qualityLevel, minDistance, cv::noArray(), blockSize, useHarrisDetector, k); |
||||
|
||||
if (depth != CV_8U) |
||||
{ |
||||
src1.convertTo(src1, depth); |
||||
src2.convertTo(src2, depth); |
||||
} |
||||
|
||||
if (PERF_RUN_GPU()) |
||||
{ |
||||
cv::gpu::GpuMat d_src1(src1); |
||||
cv::gpu::GpuMat d_src2(src2); |
||||
cv::gpu::GpuMat d_pts(pts.reshape(2, 1)); |
||||
cv::gpu::GpuMat d_nextPts; |
||||
cv::gpu::GpuMat d_status; |
||||
|
||||
cv::gpu::PyrLKOpticalFlow d_pyrLK; |
||||
d_pyrLK.winSize = winSize; |
||||
d_pyrLK.maxLevel = maxLevel; |
||||
d_pyrLK.iters = criteria.maxCount; |
||||
d_pyrLK.useInitialFlow = false; |
||||
|
||||
d_pyrLK.sparse(d_src1, d_src2, d_pts, d_nextPts, d_status); |
||||
|
||||
TEST_CYCLE() |
||||
{ |
||||
d_pyrLK.sparse(d_src1, d_src2, d_pts, d_nextPts, d_status); |
||||
} |
||||
} |
||||
else |
||||
{ |
||||
if (depth != CV_8U) |
||||
FAIL() << "Unsupported depth"; |
||||
|
||||
cv::Mat nextPts; |
||||
cv::Mat status; |
||||
|
||||
cv::calcOpticalFlowPyrLK(src1, src2, pts, nextPts, status, cv::noArray(), winSize, maxLevel, criteria); |
||||
|
||||
TEST_CYCLE() |
||||
{ |
||||
cv::calcOpticalFlowPyrLK(src1, src2, pts, nextPts, status, cv::noArray(), winSize, maxLevel, criteria); |
||||
} |
||||
} |
||||
|
||||
SANITY_CHECK(0); |
||||
} |
||||
|
||||
//////////////////////////////////////////////////////////
|
||||
// OpticalFlowFarneback
|
||||
|
||||
DEF_PARAM_TEST(ImagePair_Depth, string_pair, perf::MatDepth); |
||||
|
||||
PERF_TEST_P(ImagePair_Depth, OpticalFlowFarneback, |
||||
testing::Combine( |
||||
testing::Values(string_pair("im1_1280x800.jpg", "im2_1280x800.jpg")), |
||||
testing::Values(CV_8U, CV_16U) |
||||
)) |
||||
{ |
||||
declare.time(500); |
||||
|
||||
const string_pair fileNames = std::tr1::get<0>(GetParam()); |
||||
const int depth = std::tr1::get<1>(GetParam()); |
||||
|
||||
const double pyrScale = 0.5; |
||||
const int numLevels = 6; |
||||
const int winSize = 7; |
||||
const int numIters = 15; |
||||
const int polyN = 7; |
||||
const double polySigma = 1.5; |
||||
const int flags = cv::OPTFLOW_USE_INITIAL_FLOW; |
||||
|
||||
cv::Mat src1 = cv::imread(fileNames.first, cv::IMREAD_GRAYSCALE); |
||||
if (src1.empty()) |
||||
FAIL() << "Unable to load source image [" << fileNames.first << "]"; |
||||
|
||||
cv::Mat src2 = cv::imread(fileNames.second, cv::IMREAD_GRAYSCALE); |
||||
if (src2.empty()) |
||||
FAIL() << "Unable to load source image [" << fileNames.second << "]"; |
||||
|
||||
if (depth != CV_8U) |
||||
{ |
||||
src1.convertTo(src1, depth); |
||||
src2.convertTo(src2, depth); |
||||
} |
||||
|
||||
if (PERF_RUN_GPU()) |
||||
{ |
||||
cv::gpu::GpuMat d_src1(src1); |
||||
cv::gpu::GpuMat d_src2(src2); |
||||
cv::gpu::GpuMat d_u(src1.size(), CV_32FC1, cv::Scalar::all(0)); |
||||
cv::gpu::GpuMat d_v(src1.size(), CV_32FC1, cv::Scalar::all(0)); |
||||
|
||||
cv::gpu::FarnebackOpticalFlow d_farneback; |
||||
d_farneback.pyrScale = pyrScale; |
||||
d_farneback.numLevels = numLevels; |
||||
d_farneback.winSize = winSize; |
||||
d_farneback.numIters = numIters; |
||||
d_farneback.polyN = polyN; |
||||
d_farneback.polySigma = polySigma; |
||||
d_farneback.flags = flags; |
||||
|
||||
d_farneback(d_src1, d_src2, d_u, d_v); |
||||
|
||||
TEST_CYCLE_N(10) |
||||
{ |
||||
d_farneback(d_src1, d_src2, d_u, d_v); |
||||
} |
||||
} |
||||
else |
||||
{ |
||||
if (depth != CV_8U) |
||||
FAIL() << "Unsupported depth"; |
||||
|
||||
cv::Mat flow(src1.size(), CV_32FC2, cv::Scalar::all(0)); |
||||
|
||||
cv::calcOpticalFlowFarneback(src1, src2, flow, pyrScale, numLevels, winSize, numIters, polyN, polySigma, flags); |
||||
|
||||
TEST_CYCLE_N(10) |
||||
{ |
||||
cv::calcOpticalFlowFarneback(src1, src2, flow, pyrScale, numLevels, winSize, numIters, polyN, polySigma, flags); |
||||
} |
||||
} |
||||
|
||||
SANITY_CHECK(0); |
||||
} |
||||
|
||||
//////////////////////////////////////////////////////////
|
||||
// OpticalFlowBM
|
||||
|
||||
void calcOpticalFlowBM(const cv::Mat& prev, const cv::Mat& curr, |
||||
cv::Size bSize, cv::Size shiftSize, cv::Size maxRange, int usePrevious, |
||||
cv::Mat& velx, cv::Mat& vely) |
||||
{ |
||||
cv::Size sz((curr.cols - bSize.width + shiftSize.width)/shiftSize.width, (curr.rows - bSize.height + shiftSize.height)/shiftSize.height); |
||||
|
||||
velx.create(sz, CV_32FC1); |
||||
vely.create(sz, CV_32FC1); |
||||
|
||||
CvMat cvprev = prev; |
||||
CvMat cvcurr = curr; |
||||
|
||||
CvMat cvvelx = velx; |
||||
CvMat cvvely = vely; |
||||
|
||||
cvCalcOpticalFlowBM(&cvprev, &cvcurr, bSize, shiftSize, maxRange, usePrevious, &cvvelx, &cvvely); |
||||
} |
||||
|
||||
DEF_PARAM_TEST(ImagePair_BlockSize_ShiftSize_MaxRange, string_pair, cv::Size, cv::Size, cv::Size); |
||||
|
||||
PERF_TEST_P(ImagePair_BlockSize_ShiftSize_MaxRange, OpticalFlowBM, |
||||
testing::Combine( |
||||
testing::Values(string_pair("im1_1280x800.jpg", "im2_1280x800.jpg")), |
||||
testing::Values(cv::Size(16, 16)), |
||||
testing::Values(cv::Size(2, 2)), |
||||
testing::Values(cv::Size(16, 16)) |
||||
)) |
||||
{ |
||||
declare.time(1000); |
||||
|
||||
const string_pair fileNames = std::tr1::get<0>(GetParam()); |
||||
const cv::Size block_size = std::tr1::get<1>(GetParam()); |
||||
const cv::Size shift_size = std::tr1::get<2>(GetParam()); |
||||
const cv::Size max_range = std::tr1::get<3>(GetParam()); |
||||
|
||||
cv::Mat src1 = cv::imread(fileNames.first, cv::IMREAD_GRAYSCALE); |
||||
if (src1.empty()) |
||||
FAIL() << "Unable to load source image [" << fileNames.first << "]"; |
||||
|
||||
cv::Mat src2 = cv::imread(fileNames.second, cv::IMREAD_GRAYSCALE); |
||||
if (src2.empty()) |
||||
FAIL() << "Unable to load source image [" << fileNames.second << "]"; |
||||
|
||||
if (PERF_RUN_GPU()) |
||||
{ |
||||
cv::gpu::GpuMat d_src1(src1); |
||||
cv::gpu::GpuMat d_src2(src2); |
||||
cv::gpu::GpuMat d_velx, d_vely, buf; |
||||
|
||||
cv::gpu::calcOpticalFlowBM(d_src1, d_src2, block_size, shift_size, max_range, false, d_velx, d_vely, buf); |
||||
|
||||
TEST_CYCLE_N(10) |
||||
{ |
||||
cv::gpu::calcOpticalFlowBM(d_src1, d_src2, block_size, shift_size, max_range, false, d_velx, d_vely, buf); |
||||
} |
||||
} |
||||
else |
||||
{ |
||||
cv::Mat velx, vely; |
||||
|
||||
calcOpticalFlowBM(src1, src2, block_size, shift_size, max_range, false, velx, vely); |
||||
|
||||
TEST_CYCLE_N(10) |
||||
{ |
||||
calcOpticalFlowBM(src1, src2, block_size, shift_size, max_range, false, velx, vely); |
||||
} |
||||
} |
||||
|
||||
SANITY_CHECK(0); |
||||
} |
||||
|
||||
PERF_TEST_P(ImagePair_BlockSize_ShiftSize_MaxRange, FastOpticalFlowBM, |
||||
testing::Combine( |
||||
testing::Values(string_pair("im1_1280x800.jpg", "im2_1280x800.jpg")), |
||||
testing::Values(cv::Size(16, 16)), |
||||
testing::Values(cv::Size(1, 1)), |
||||
testing::Values(cv::Size(16, 16)) |
||||
)) |
||||
{ |
||||
declare.time(1000); |
||||
|
||||
const string_pair fileNames = std::tr1::get<0>(GetParam()); |
||||
const cv::Size block_size = std::tr1::get<1>(GetParam()); |
||||
const cv::Size shift_size = std::tr1::get<2>(GetParam()); |
||||
const cv::Size max_range = std::tr1::get<3>(GetParam()); |
||||
|
||||
cv::Mat src1 = cv::imread(fileNames.first, cv::IMREAD_GRAYSCALE); |
||||
if (src1.empty()) |
||||
FAIL() << "Unable to load source image [" << fileNames.first << "]"; |
||||
|
||||
cv::Mat src2 = cv::imread(fileNames.second, cv::IMREAD_GRAYSCALE); |
||||
if (src2.empty()) |
||||
FAIL() << "Unable to load source image [" << fileNames.second << "]"; |
||||
|
||||
if (PERF_RUN_GPU()) |
||||
{ |
||||
cv::gpu::GpuMat d_src1(src1); |
||||
cv::gpu::GpuMat d_src2(src2); |
||||
cv::gpu::GpuMat d_velx, d_vely; |
||||
|
||||
cv::gpu::FastOpticalFlowBM fastBM; |
||||
|
||||
fastBM(d_src1, d_src2, d_velx, d_vely, max_range.width, block_size.width); |
||||
|
||||
TEST_CYCLE_N(10) |
||||
{ |
||||
fastBM(d_src1, d_src2, d_velx, d_vely, max_range.width, block_size.width); |
||||
} |
||||
} |
||||
else |
||||
{ |
||||
cv::Mat velx, vely; |
||||
|
||||
calcOpticalFlowBM(src1, src2, block_size, shift_size, max_range, false, velx, vely); |
||||
|
||||
TEST_CYCLE_N(10) |
||||
{ |
||||
calcOpticalFlowBM(src1, src2, block_size, shift_size, max_range, false, velx, vely); |
||||
} |
||||
} |
||||
|
||||
SANITY_CHECK(0); |
||||
} |
@ -0,0 +1,361 @@ |
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_GPU_REDUCE_DETAIL_HPP__ |
||||
#define __OPENCV_GPU_REDUCE_DETAIL_HPP__ |
||||
|
||||
#include <thrust/tuple.h> |
||||
#include "../warp.hpp" |
||||
#include "../warp_shuffle.hpp" |
||||
|
||||
namespace cv { namespace gpu { namespace device |
||||
{ |
||||
namespace reduce_detail |
||||
{ |
||||
template <typename T> struct GetType; |
||||
template <typename T> struct GetType<T*> |
||||
{ |
||||
typedef T type; |
||||
}; |
||||
template <typename T> struct GetType<volatile T*> |
||||
{ |
||||
typedef T type; |
||||
}; |
||||
template <typename T> struct GetType<T&> |
||||
{ |
||||
typedef T type; |
||||
}; |
||||
|
||||
template <unsigned int I, unsigned int N> |
||||
struct For |
||||
{ |
||||
template <class PointerTuple, class ValTuple> |
||||
static __device__ void loadToSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid) |
||||
{ |
||||
thrust::get<I>(smem)[tid] = thrust::get<I>(val); |
||||
|
||||
For<I + 1, N>::loadToSmem(smem, val, tid); |
||||
} |
||||
template <class PointerTuple, class ValTuple> |
||||
static __device__ void loadFromSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid) |
||||
{ |
||||
thrust::get<I>(val) = thrust::get<I>(smem)[tid]; |
||||
|
||||
For<I + 1, N>::loadFromSmem(smem, val, tid); |
||||
} |
||||
|
||||
template <class PointerTuple, class ValTuple, class OpTuple> |
||||
static __device__ void merge(const PointerTuple& smem, const ValTuple& val, unsigned int tid, unsigned int delta, const OpTuple& op) |
||||
{ |
||||
typename GetType<typename thrust::tuple_element<I, PointerTuple>::type>::type reg = thrust::get<I>(smem)[tid + delta]; |
||||
thrust::get<I>(smem)[tid] = thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg); |
||||
|
||||
For<I + 1, N>::merge(smem, val, tid, delta, op); |
||||
} |
||||
template <class ValTuple, class OpTuple> |
||||
static __device__ void mergeShfl(const ValTuple& val, unsigned int delta, unsigned int width, const OpTuple& op) |
||||
{ |
||||
typename GetType<typename thrust::tuple_element<I, ValTuple>::type>::type reg = shfl_down(thrust::get<I>(val), delta, width); |
||||
thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg); |
||||
|
||||
For<I + 1, N>::mergeShfl(val, delta, width, op); |
||||
} |
||||
}; |
||||
template <unsigned int N> |
||||
struct For<N, N> |
||||
{ |
||||
template <class PointerTuple, class ValTuple> |
||||
static __device__ void loadToSmem(const PointerTuple&, const ValTuple&, unsigned int) |
||||
{ |
||||
} |
||||
template <class PointerTuple, class ValTuple> |
||||
static __device__ void loadFromSmem(const PointerTuple&, const ValTuple&, unsigned int) |
||||
{ |
||||
} |
||||
|
||||
template <class PointerTuple, class ValTuple, class OpTuple> |
||||
static __device__ void merge(const PointerTuple&, const ValTuple&, unsigned int, unsigned int, const OpTuple&) |
||||
{ |
||||
} |
||||
template <class ValTuple, class OpTuple> |
||||
static __device__ void mergeShfl(const ValTuple&, unsigned int, unsigned int, const OpTuple&) |
||||
{ |
||||
} |
||||
}; |
||||
|
||||
template <typename T> |
||||
__device__ __forceinline__ void loadToSmem(volatile T* smem, T& val, unsigned int tid) |
||||
{ |
||||
smem[tid] = val; |
||||
} |
||||
template <typename T> |
||||
__device__ __forceinline__ void loadFromSmem(volatile T* smem, T& val, unsigned int tid) |
||||
{ |
||||
val = smem[tid]; |
||||
} |
||||
template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9, |
||||
typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9> |
||||
__device__ __forceinline__ void loadToSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem, |
||||
const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val, |
||||
unsigned int tid) |
||||
{ |
||||
For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadToSmem(smem, val, tid); |
||||
} |
||||
template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9, |
||||
typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9> |
||||
__device__ __forceinline__ void loadFromSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem, |
||||
const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val, |
||||
unsigned int tid) |
||||
{ |
||||
For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadFromSmem(smem, val, tid); |
||||
} |
||||
|
||||
template <typename T, class Op> |
||||
__device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op) |
||||
{ |
||||
T reg = smem[tid + delta]; |
||||
smem[tid] = val = op(val, reg); |
||||
} |
||||
template <typename T, class Op> |
||||
__device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op) |
||||
{ |
||||
T reg = shfl_down(val, delta, width); |
||||
val = op(val, reg); |
||||
} |
||||
template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9, |
||||
typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9, |
||||
class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9> |
||||
__device__ __forceinline__ void merge(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem, |
||||
const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val, |
||||
unsigned int tid, |
||||
unsigned int delta, |
||||
const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op) |
||||
{ |
||||
For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::merge(smem, val, tid, delta, op); |
||||
} |
||||
template <typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9, |
||||
class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9> |
||||
__device__ __forceinline__ void mergeShfl(const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val, |
||||
unsigned int delta, |
||||
unsigned int width, |
||||
const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op) |
||||
{ |
||||
For<0, thrust::tuple_size<thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9> >::value>::mergeShfl(val, delta, width, op); |
||||
} |
||||
|
||||
template <unsigned int N> struct Generic |
||||
{ |
||||
template <typename Pointer, typename Reference, class Op> |
||||
static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op) |
||||
{ |
||||
loadToSmem(smem, val, tid); |
||||
if (N >= 32) |
||||
__syncthreads(); |
||||
|
||||
if (N >= 2048) |
||||
{ |
||||
if (tid < 1024) |
||||
merge(smem, val, tid, 1024, op); |
||||
|
||||
__syncthreads(); |
||||
} |
||||
if (N >= 1024) |
||||
{ |
||||
if (tid < 512) |
||||
merge(smem, val, tid, 512, op); |
||||
|
||||
__syncthreads(); |
||||
} |
||||
if (N >= 512) |
||||
{ |
||||
if (tid < 256) |
||||
merge(smem, val, tid, 256, op); |
||||
|
||||
__syncthreads(); |
||||
} |
||||
if (N >= 256) |
||||
{ |
||||
if (tid < 128) |
||||
merge(smem, val, tid, 128, op); |
||||
|
||||
__syncthreads(); |
||||
} |
||||
if (N >= 128) |
||||
{ |
||||
if (tid < 64) |
||||
merge(smem, val, tid, 64, op); |
||||
|
||||
__syncthreads(); |
||||
} |
||||
if (N >= 64) |
||||
{ |
||||
if (tid < 32) |
||||
merge(smem, val, tid, 32, op); |
||||
} |
||||
|
||||
if (tid < 16) |
||||
{ |
||||
merge(smem, val, tid, 16, op); |
||||
merge(smem, val, tid, 8, op); |
||||
merge(smem, val, tid, 4, op); |
||||
merge(smem, val, tid, 2, op); |
||||
merge(smem, val, tid, 1, op); |
||||
} |
||||
} |
||||
}; |
||||
|
||||
template <unsigned int I, typename Pointer, typename Reference, class Op> |
||||
struct Unroll |
||||
{ |
||||
static __device__ void loopShfl(Reference val, Op op, unsigned int N) |
||||
{ |
||||
mergeShfl(val, I, N, op); |
||||
Unroll<I / 2, Pointer, Reference, Op>::loopShfl(val, op, N); |
||||
} |
||||
static __device__ void loop(Pointer smem, Reference val, unsigned int tid, Op op) |
||||
{ |
||||
merge(smem, val, tid, I, op); |
||||
Unroll<I / 2, Pointer, Reference, Op>::loop(smem, val, tid, op); |
||||
} |
||||
}; |
||||
template <typename Pointer, typename Reference, class Op> |
||||
struct Unroll<0, Pointer, Reference, Op> |
||||
{ |
||||
static __device__ void loopShfl(Reference, Op, unsigned int) |
||||
{ |
||||
} |
||||
static __device__ void loop(Pointer, Reference, unsigned int, Op) |
||||
{ |
||||
} |
||||
}; |
||||
|
||||
template <unsigned int N> struct WarpOptimized |
||||
{ |
||||
template <typename Pointer, typename Reference, class Op> |
||||
static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op) |
||||
{ |
||||
#if __CUDA_ARCH__ >= 300 |
||||
(void) smem; |
||||
(void) tid; |
||||
|
||||
Unroll<N / 2, Pointer, Reference, Op>::loopShfl(val, op, N); |
||||
#else |
||||
loadToSmem(smem, val, tid); |
||||
|
||||
if (tid < N / 2) |
||||
Unroll<N / 2, Pointer, Reference, Op>::loop(smem, val, tid, op); |
||||
#endif |
||||
} |
||||
}; |
||||
|
||||
template <unsigned int N> struct GenericOptimized32 |
||||
{ |
||||
enum { M = N / 32 }; |
||||
|
||||
template <typename Pointer, typename Reference, class Op> |
||||
static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op) |
||||
{ |
||||
const unsigned int laneId = Warp::laneId(); |
||||
|
||||
#if __CUDA_ARCH__ >= 300 |
||||
Unroll<16, Pointer, Reference, Op>::loopShfl(val, op, warpSize); |
||||
|
||||
if (laneId == 0) |
||||
loadToSmem(smem, val, tid / 32); |
||||
#else |
||||
loadToSmem(smem, val, tid); |
||||
|
||||
if (laneId < 16) |
||||
Unroll<16, Pointer, Reference, Op>::loop(smem, val, tid, op); |
||||
|
||||
__syncthreads(); |
||||
|
||||
if (laneId == 0) |
||||
loadToSmem(smem, val, tid / 32); |
||||
#endif |
||||
|
||||
__syncthreads(); |
||||
|
||||
loadFromSmem(smem, val, tid); |
||||
|
||||
if (tid < 32) |
||||
{ |
||||
#if __CUDA_ARCH__ >= 300 |
||||
Unroll<M / 2, Pointer, Reference, Op>::loopShfl(val, op, M); |
||||
#else |
||||
Unroll<M / 2, Pointer, Reference, Op>::loop(smem, val, tid, op); |
||||
#endif |
||||
} |
||||
} |
||||
}; |
||||
|
||||
template <bool val, class T1, class T2> struct StaticIf; |
||||
template <class T1, class T2> struct StaticIf<true, T1, T2> |
||||
{ |
||||
typedef T1 type; |
||||
}; |
||||
template <class T1, class T2> struct StaticIf<false, T1, T2> |
||||
{ |
||||
typedef T2 type; |
||||
}; |
||||
|
||||
template <unsigned int N> struct IsPowerOf2 |
||||
{ |
||||
enum { value = ((N != 0) && !(N & (N - 1))) }; |
||||
}; |
||||
|
||||
template <unsigned int N> struct Dispatcher |
||||
{ |
||||
typedef typename StaticIf< |
||||
(N <= 32) && IsPowerOf2<N>::value, |
||||
WarpOptimized<N>, |
||||
typename StaticIf< |
||||
(N <= 1024) && IsPowerOf2<N>::value, |
||||
GenericOptimized32<N>, |
||||
Generic<N> |
||||
>::type |
||||
>::type reductor; |
||||
}; |
||||
} |
||||
}}} |
||||
|
||||
#endif // __OPENCV_GPU_REDUCE_DETAIL_HPP__
|
@ -0,0 +1,498 @@ |
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__ |
||||
#define __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__ |
||||
|
||||
#include <thrust/tuple.h> |
||||
#include "../warp.hpp" |
||||
#include "../warp_shuffle.hpp" |
||||
|
||||
namespace cv { namespace gpu { namespace device |
||||
{ |
||||
namespace reduce_key_val_detail |
||||
{ |
||||
template <typename T> struct GetType; |
||||
template <typename T> struct GetType<T*> |
||||
{ |
||||
typedef T type; |
||||
}; |
||||
template <typename T> struct GetType<volatile T*> |
||||
{ |
||||
typedef T type; |
||||
}; |
||||
template <typename T> struct GetType<T&> |
||||
{ |
||||
typedef T type; |
||||
}; |
||||
|
||||
template <unsigned int I, unsigned int N> |
||||
struct For |
||||
{ |
||||
template <class PointerTuple, class ReferenceTuple> |
||||
static __device__ void loadToSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid) |
||||
{ |
||||
thrust::get<I>(smem)[tid] = thrust::get<I>(data); |
||||
|
||||
For<I + 1, N>::loadToSmem(smem, data, tid); |
||||
} |
||||
template <class PointerTuple, class ReferenceTuple> |
||||
static __device__ void loadFromSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid) |
||||
{ |
||||
thrust::get<I>(data) = thrust::get<I>(smem)[tid]; |
||||
|
||||
For<I + 1, N>::loadFromSmem(smem, data, tid); |
||||
} |
||||
|
||||
template <class ReferenceTuple> |
||||
static __device__ void copyShfl(const ReferenceTuple& val, unsigned int delta, int width) |
||||
{ |
||||
thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width); |
||||
|
||||
For<I + 1, N>::copyShfl(val, delta, width); |
||||
} |
||||
template <class PointerTuple, class ReferenceTuple> |
||||
static __device__ void copy(const PointerTuple& svals, const ReferenceTuple& val, unsigned int tid, unsigned int delta) |
||||
{ |
||||
thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta]; |
||||
|
||||
For<I + 1, N>::copy(svals, val, tid, delta); |
||||
} |
||||
|
||||
template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple> |
||||
static __device__ void mergeShfl(const KeyReferenceTuple& key, const ValReferenceTuple& val, const CmpTuple& cmp, unsigned int delta, int width) |
||||
{ |
||||
typename GetType<typename thrust::tuple_element<I, KeyReferenceTuple>::type>::type reg = shfl_down(thrust::get<I>(key), delta, width); |
||||
|
||||
if (thrust::get<I>(cmp)(reg, thrust::get<I>(key))) |
||||
{ |
||||
thrust::get<I>(key) = reg; |
||||
thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width); |
||||
} |
||||
|
||||
For<I + 1, N>::mergeShfl(key, val, cmp, delta, width); |
||||
} |
||||
template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple> |
||||
static __device__ void merge(const KeyPointerTuple& skeys, const KeyReferenceTuple& key, |
||||
const ValPointerTuple& svals, const ValReferenceTuple& val, |
||||
const CmpTuple& cmp, |
||||
unsigned int tid, unsigned int delta) |
||||
{ |
||||
typename GetType<typename thrust::tuple_element<I, KeyPointerTuple>::type>::type reg = thrust::get<I>(skeys)[tid + delta]; |
||||
|
||||
if (thrust::get<I>(cmp)(reg, thrust::get<I>(key))) |
||||
{ |
||||
thrust::get<I>(skeys)[tid] = thrust::get<I>(key) = reg; |
||||
thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta]; |
||||
} |
||||
|
||||
For<I + 1, N>::merge(skeys, key, svals, val, cmp, tid, delta); |
||||
} |
||||
}; |
||||
template <unsigned int N> |
||||
struct For<N, N> |
||||
{ |
||||
template <class PointerTuple, class ReferenceTuple> |
||||
static __device__ void loadToSmem(const PointerTuple&, const ReferenceTuple&, unsigned int) |
||||
{ |
||||
} |
||||
template <class PointerTuple, class ReferenceTuple> |
||||
static __device__ void loadFromSmem(const PointerTuple&, const ReferenceTuple&, unsigned int) |
||||
{ |
||||
} |
||||
|
||||
template <class ReferenceTuple> |
||||
static __device__ void copyShfl(const ReferenceTuple&, unsigned int, int) |
||||
{ |
||||
} |
||||
template <class PointerTuple, class ReferenceTuple> |
||||
static __device__ void copy(const PointerTuple&, const ReferenceTuple&, unsigned int, unsigned int) |
||||
{ |
||||
} |
||||
|
||||
template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple> |
||||
static __device__ void mergeShfl(const KeyReferenceTuple&, const ValReferenceTuple&, const CmpTuple&, unsigned int, int) |
||||
{ |
||||
} |
||||
template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple> |
||||
static __device__ void merge(const KeyPointerTuple&, const KeyReferenceTuple&, |
||||
const ValPointerTuple&, const ValReferenceTuple&, |
||||
const CmpTuple&, |
||||
unsigned int, unsigned int) |
||||
{ |
||||
} |
||||
}; |
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// loadToSmem
|
||||
|
||||
template <typename T> |
||||
__device__ __forceinline__ void loadToSmem(volatile T* smem, T& data, unsigned int tid) |
||||
{ |
||||
smem[tid] = data; |
||||
} |
||||
template <typename T> |
||||
__device__ __forceinline__ void loadFromSmem(volatile T* smem, T& data, unsigned int tid) |
||||
{ |
||||
data = smem[tid]; |
||||
} |
||||
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9, |
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9> |
||||
__device__ __forceinline__ void loadToSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem, |
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data, |
||||
unsigned int tid) |
||||
{ |
||||
For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadToSmem(smem, data, tid); |
||||
} |
||||
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9, |
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9> |
||||
__device__ __forceinline__ void loadFromSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem, |
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data, |
||||
unsigned int tid) |
||||
{ |
||||
For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadFromSmem(smem, data, tid); |
||||
} |
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// copyVals
|
||||
|
||||
template <typename V> |
||||
__device__ __forceinline__ void copyValsShfl(V& val, unsigned int delta, int width) |
||||
{ |
||||
val = shfl_down(val, delta, width); |
||||
} |
||||
template <typename V> |
||||
__device__ __forceinline__ void copyVals(volatile V* svals, V& val, unsigned int tid, unsigned int delta) |
||||
{ |
||||
svals[tid] = val = svals[tid + delta]; |
||||
} |
||||
template <typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9> |
||||
__device__ __forceinline__ void copyValsShfl(const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val, |
||||
unsigned int delta, |
||||
int width) |
||||
{ |
||||
For<0, thrust::tuple_size<thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9> >::value>::copyShfl(val, delta, width); |
||||
} |
||||
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9, |
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9> |
||||
__device__ __forceinline__ void copyVals(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals, |
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val, |
||||
unsigned int tid, unsigned int delta) |
||||
{ |
||||
For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::copy(svals, val, tid, delta); |
||||
} |
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// merge
|
||||
|
||||
template <typename K, typename V, class Cmp> |
||||
__device__ __forceinline__ void mergeShfl(K& key, V& val, const Cmp& cmp, unsigned int delta, int width) |
||||
{ |
||||
K reg = shfl_down(key, delta, width); |
||||
|
||||
if (cmp(reg, key)) |
||||
{ |
||||
key = reg; |
||||
copyValsShfl(val, delta, width); |
||||
} |
||||
} |
||||
template <typename K, typename V, class Cmp> |
||||
__device__ __forceinline__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val, const Cmp& cmp, unsigned int tid, unsigned int delta) |
||||
{ |
||||
K reg = skeys[tid + delta]; |
||||
|
||||
if (cmp(reg, key)) |
||||
{ |
||||
skeys[tid] = key = reg; |
||||
copyVals(svals, val, tid, delta); |
||||
} |
||||
} |
||||
template <typename K, |
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9, |
||||
class Cmp> |
||||
__device__ __forceinline__ void mergeShfl(K& key, |
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val, |
||||
const Cmp& cmp, |
||||
unsigned int delta, int width) |
||||
{ |
||||
K reg = shfl_down(key, delta, width); |
||||
|
||||
if (cmp(reg, key)) |
||||
{ |
||||
key = reg; |
||||
copyValsShfl(val, delta, width); |
||||
} |
||||
} |
||||
template <typename K, |
||||
typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9, |
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9, |
||||
class Cmp> |
||||
__device__ __forceinline__ void merge(volatile K* skeys, K& key, |
||||
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals, |
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val, |
||||
const Cmp& cmp, unsigned int tid, unsigned int delta) |
||||
{ |
||||
K reg = skeys[tid + delta]; |
||||
|
||||
if (cmp(reg, key)) |
||||
{ |
||||
skeys[tid] = key = reg; |
||||
copyVals(svals, val, tid, delta); |
||||
} |
||||
} |
||||
template <typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9, |
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9, |
||||
class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9> |
||||
__device__ __forceinline__ void mergeShfl(const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key, |
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val, |
||||
const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp, |
||||
unsigned int delta, int width) |
||||
{ |
||||
For<0, thrust::tuple_size<thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9> >::value>::mergeShfl(key, val, cmp, delta, width); |
||||
} |
||||
template <typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9, |
||||
typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9, |
||||
typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9, |
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9, |
||||
class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9> |
||||
__device__ __forceinline__ void merge(const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys, |
||||
const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key, |
||||
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals, |
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val, |
||||
const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp, |
||||
unsigned int tid, unsigned int delta) |
||||
{ |
||||
For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::merge(skeys, key, svals, val, cmp, tid, delta); |
||||
} |
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// Generic
|
||||
|
||||
template <unsigned int N> struct Generic |
||||
{ |
||||
template <class KP, class KR, class VP, class VR, class Cmp> |
||||
static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp) |
||||
{ |
||||
loadToSmem(skeys, key, tid); |
||||
loadValsToSmem(svals, val, tid); |
||||
if (N >= 32) |
||||
__syncthreads(); |
||||
|
||||
if (N >= 2048) |
||||
{ |
||||
if (tid < 1024) |
||||
merge(skeys, key, svals, val, cmp, tid, 1024); |
||||
|
||||
__syncthreads(); |
||||
} |
||||
if (N >= 1024) |
||||
{ |
||||
if (tid < 512) |
||||
merge(skeys, key, svals, val, cmp, tid, 512); |
||||
|
||||
__syncthreads(); |
||||
} |
||||
if (N >= 512) |
||||
{ |
||||
if (tid < 256) |
||||
merge(skeys, key, svals, val, cmp, tid, 256); |
||||
|
||||
__syncthreads(); |
||||
} |
||||
if (N >= 256) |
||||
{ |
||||
if (tid < 128) |
||||
merge(skeys, key, svals, val, cmp, tid, 128); |
||||
|
||||
__syncthreads(); |
||||
} |
||||
if (N >= 128) |
||||
{ |
||||
if (tid < 64) |
||||
merge(skeys, key, svals, val, cmp, tid, 64); |
||||
|
||||
__syncthreads(); |
||||
} |
||||
if (N >= 64) |
||||
{ |
||||
if (tid < 32) |
||||
merge(skeys, key, svals, val, cmp, tid, 32); |
||||
} |
||||
|
||||
if (tid < 16) |
||||
{ |
||||
merge(skeys, key, svals, val, cmp, tid, 16); |
||||
merge(skeys, key, svals, val, cmp, tid, 8); |
||||
merge(skeys, key, svals, val, cmp, tid, 4); |
||||
merge(skeys, key, svals, val, cmp, tid, 2); |
||||
merge(skeys, key, svals, val, cmp, tid, 1); |
||||
} |
||||
} |
||||
}; |
||||
|
||||
template <unsigned int I, class KP, class KR, class VP, class VR, class Cmp> |
||||
struct Unroll |
||||
{ |
||||
static __device__ void loopShfl(KR key, VR val, Cmp cmp, unsigned int N) |
||||
{ |
||||
mergeShfl(key, val, cmp, I, N); |
||||
Unroll<I / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N); |
||||
} |
||||
static __device__ void loop(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp) |
||||
{ |
||||
merge(skeys, key, svals, val, cmp, tid, I); |
||||
Unroll<I / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp); |
||||
} |
||||
}; |
||||
template <class KP, class KR, class VP, class VR, class Cmp> |
||||
struct Unroll<0, KP, KR, VP, VR, Cmp> |
||||
{ |
||||
static __device__ void loopShfl(KR, VR, Cmp, unsigned int) |
||||
{ |
||||
} |
||||
static __device__ void loop(KP, KR, VP, VR, unsigned int, Cmp) |
||||
{ |
||||
} |
||||
}; |
||||
|
||||
template <unsigned int N> struct WarpOptimized |
||||
{ |
||||
template <class KP, class KR, class VP, class VR, class Cmp> |
||||
static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp) |
||||
{ |
||||
#if 0 // __CUDA_ARCH__ >= 300
|
||||
(void) skeys; |
||||
(void) svals; |
||||
(void) tid; |
||||
|
||||
Unroll<N / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N); |
||||
#else |
||||
loadToSmem(skeys, key, tid); |
||||
loadToSmem(svals, val, tid); |
||||
|
||||
if (tid < N / 2) |
||||
Unroll<N / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp); |
||||
#endif |
||||
} |
||||
}; |
||||
|
||||
template <unsigned int N> struct GenericOptimized32 |
||||
{ |
||||
enum { M = N / 32 }; |
||||
|
||||
template <class KP, class KR, class VP, class VR, class Cmp> |
||||
static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp) |
||||
{ |
||||
const unsigned int laneId = Warp::laneId(); |
||||
|
||||
#if 0 // __CUDA_ARCH__ >= 300
|
||||
Unroll<16, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, warpSize); |
||||
|
||||
if (laneId == 0) |
||||
{ |
||||
loadToSmem(skeys, key, tid / 32); |
||||
loadToSmem(svals, val, tid / 32); |
||||
} |
||||
#else |
||||
loadToSmem(skeys, key, tid); |
||||
loadToSmem(svals, val, tid); |
||||
|
||||
if (laneId < 16) |
||||
Unroll<16, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp); |
||||
|
||||
__syncthreads(); |
||||
|
||||
if (laneId == 0) |
||||
{ |
||||
loadToSmem(skeys, key, tid / 32); |
||||
loadToSmem(svals, val, tid / 32); |
||||
} |
||||
#endif |
||||
|
||||
__syncthreads(); |
||||
|
||||
loadFromSmem(skeys, key, tid); |
||||
|
||||
if (tid < 32) |
||||
{ |
||||
#if 0 // __CUDA_ARCH__ >= 300
|
||||
loadFromSmem(svals, val, tid); |
||||
|
||||
Unroll<M / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, M); |
||||
#else |
||||
Unroll<M / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp); |
||||
#endif |
||||
} |
||||
} |
||||
}; |
||||
|
||||
template <bool val, class T1, class T2> struct StaticIf; |
||||
template <class T1, class T2> struct StaticIf<true, T1, T2> |
||||
{ |
||||
typedef T1 type; |
||||
}; |
||||
template <class T1, class T2> struct StaticIf<false, T1, T2> |
||||
{ |
||||
typedef T2 type; |
||||
}; |
||||
|
||||
template <unsigned int N> struct IsPowerOf2 |
||||
{ |
||||
enum { value = ((N != 0) && !(N & (N - 1))) }; |
||||
}; |
||||
|
||||
template <unsigned int N> struct Dispatcher |
||||
{ |
||||
typedef typename StaticIf< |
||||
(N <= 32) && IsPowerOf2<N>::value, |
||||
WarpOptimized<N>, |
||||
typename StaticIf< |
||||
(N <= 1024) && IsPowerOf2<N>::value, |
||||
GenericOptimized32<N>, |
||||
Generic<N> |
||||
>::type |
||||
>::type reductor; |
||||
}; |
||||
} |
||||
}}} |
||||
|
||||
#endif // __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__
|
@ -1,841 +0,0 @@ |
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_GPU_REDUCTION_DETAIL_HPP__ |
||||
#define __OPENCV_GPU_REDUCTION_DETAIL_HPP__ |
||||
|
||||
namespace cv { namespace gpu { namespace device |
||||
{ |
||||
namespace utility_detail |
||||
{ |
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Reductor
|
||||
|
||||
template <int n> struct WarpReductor |
||||
{ |
||||
template <typename T, typename Op> static __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) |
||||
{ |
||||
if (tid < n) |
||||
data[tid] = partial_reduction; |
||||
if (n > 32) __syncthreads(); |
||||
|
||||
if (n > 32) |
||||
{ |
||||
if (tid < n - 32) |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]); |
||||
if (tid < 16) |
||||
{ |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]); |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); |
||||
} |
||||
} |
||||
else if (n > 16) |
||||
{ |
||||
if (tid < n - 16) |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); |
||||
if (tid < 8) |
||||
{ |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]); |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); |
||||
} |
||||
} |
||||
else if (n > 8) |
||||
{ |
||||
if (tid < n - 8) |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]); |
||||
if (tid < 4) |
||||
{ |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); |
||||
} |
||||
} |
||||
else if (n > 4) |
||||
{ |
||||
if (tid < n - 4) |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); |
||||
if (tid < 2) |
||||
{ |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); |
||||
} |
||||
} |
||||
else if (n > 2) |
||||
{ |
||||
if (tid < n - 2) |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); |
||||
if (tid < 2) |
||||
{ |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); |
||||
} |
||||
} |
||||
} |
||||
}; |
||||
template <> struct WarpReductor<64> |
||||
{ |
||||
template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) |
||||
{ |
||||
data[tid] = partial_reduction; |
||||
__syncthreads(); |
||||
|
||||
if (tid < 32) |
||||
{ |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]); |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]); |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); |
||||
} |
||||
} |
||||
}; |
||||
template <> struct WarpReductor<32> |
||||
{ |
||||
template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) |
||||
{ |
||||
data[tid] = partial_reduction; |
||||
|
||||
if (tid < 16) |
||||
{ |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]); |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); |
||||
} |
||||
} |
||||
}; |
||||
template <> struct WarpReductor<16> |
||||
{ |
||||
template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) |
||||
{ |
||||
data[tid] = partial_reduction; |
||||
|
||||
if (tid < 8) |
||||
{ |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]); |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); |
||||
} |
||||
} |
||||
}; |
||||
template <> struct WarpReductor<8> |
||||
{ |
||||
template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) |
||||
{ |
||||
data[tid] = partial_reduction; |
||||
|
||||
if (tid < 4) |
||||
{ |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); |
||||
} |
||||
} |
||||
}; |
||||
|
||||
template <bool warp> struct ReductionDispatcher; |
||||
template <> struct ReductionDispatcher<true> |
||||
{ |
||||
template <int n, typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) |
||||
{ |
||||
WarpReductor<n>::reduce(data, partial_reduction, tid, op); |
||||
} |
||||
}; |
||||
template <> struct ReductionDispatcher<false> |
||||
{ |
||||
template <int n, typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) |
||||
{ |
||||
if (tid < n) |
||||
data[tid] = partial_reduction; |
||||
__syncthreads(); |
||||
|
||||
|
||||
if (n == 512) { if (tid < 256) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 256]); } __syncthreads(); } |
||||
if (n >= 256) { if (tid < 128) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 128]); } __syncthreads(); } |
||||
if (n >= 128) { if (tid < 64) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 64]); } __syncthreads(); } |
||||
|
||||
if (tid < 32) |
||||
{ |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]); |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]); |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); |
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); |
||||
} |
||||
} |
||||
}; |
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// PredValWarpReductor
|
||||
|
||||
template <int n> struct PredValWarpReductor; |
||||
template <> struct PredValWarpReductor<64> |
||||
{ |
||||
template <typename T, typename V, typename Pred> |
||||
static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) |
||||
{ |
||||
if (tid < 32) |
||||
{ |
||||
myData = sdata[tid]; |
||||
myVal = sval[tid]; |
||||
|
||||
T reg = sdata[tid + 32]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval[tid] = myVal = sval[tid + 32]; |
||||
} |
||||
|
||||
reg = sdata[tid + 16]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval[tid] = myVal = sval[tid + 16]; |
||||
} |
||||
|
||||
reg = sdata[tid + 8]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval[tid] = myVal = sval[tid + 8]; |
||||
} |
||||
|
||||
reg = sdata[tid + 4]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval[tid] = myVal = sval[tid + 4]; |
||||
} |
||||
|
||||
reg = sdata[tid + 2]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval[tid] = myVal = sval[tid + 2]; |
||||
} |
||||
|
||||
reg = sdata[tid + 1]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval[tid] = myVal = sval[tid + 1]; |
||||
} |
||||
} |
||||
} |
||||
}; |
||||
template <> struct PredValWarpReductor<32> |
||||
{ |
||||
template <typename T, typename V, typename Pred> |
||||
static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) |
||||
{ |
||||
if (tid < 16) |
||||
{ |
||||
myData = sdata[tid]; |
||||
myVal = sval[tid]; |
||||
|
||||
T reg = sdata[tid + 16]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval[tid] = myVal = sval[tid + 16]; |
||||
} |
||||
|
||||
reg = sdata[tid + 8]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval[tid] = myVal = sval[tid + 8]; |
||||
} |
||||
|
||||
reg = sdata[tid + 4]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval[tid] = myVal = sval[tid + 4]; |
||||
} |
||||
|
||||
reg = sdata[tid + 2]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval[tid] = myVal = sval[tid + 2]; |
||||
} |
||||
|
||||
reg = sdata[tid + 1]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval[tid] = myVal = sval[tid + 1]; |
||||
} |
||||
} |
||||
} |
||||
}; |
||||
|
||||
template <> struct PredValWarpReductor<16> |
||||
{ |
||||
template <typename T, typename V, typename Pred> |
||||
static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) |
||||
{ |
||||
if (tid < 8) |
||||
{ |
||||
myData = sdata[tid]; |
||||
myVal = sval[tid]; |
||||
|
||||
T reg = reg = sdata[tid + 8]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval[tid] = myVal = sval[tid + 8]; |
||||
} |
||||
|
||||
reg = sdata[tid + 4]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval[tid] = myVal = sval[tid + 4]; |
||||
} |
||||
|
||||
reg = sdata[tid + 2]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval[tid] = myVal = sval[tid + 2]; |
||||
} |
||||
|
||||
reg = sdata[tid + 1]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval[tid] = myVal = sval[tid + 1]; |
||||
} |
||||
} |
||||
} |
||||
}; |
||||
template <> struct PredValWarpReductor<8> |
||||
{ |
||||
template <typename T, typename V, typename Pred> |
||||
static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) |
||||
{ |
||||
if (tid < 4) |
||||
{ |
||||
myData = sdata[tid]; |
||||
myVal = sval[tid]; |
||||
|
||||
T reg = reg = sdata[tid + 4]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval[tid] = myVal = sval[tid + 4]; |
||||
} |
||||
|
||||
reg = sdata[tid + 2]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval[tid] = myVal = sval[tid + 2]; |
||||
} |
||||
|
||||
reg = sdata[tid + 1]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval[tid] = myVal = sval[tid + 1]; |
||||
} |
||||
} |
||||
} |
||||
}; |
||||
|
||||
template <bool warp> struct PredValReductionDispatcher; |
||||
template <> struct PredValReductionDispatcher<true> |
||||
{ |
||||
template <int n, typename T, typename V, typename Pred> static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) |
||||
{ |
||||
PredValWarpReductor<n>::reduce(myData, myVal, sdata, sval, tid, pred); |
||||
} |
||||
}; |
||||
template <> struct PredValReductionDispatcher<false> |
||||
{ |
||||
template <int n, typename T, typename V, typename Pred> static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) |
||||
{ |
||||
myData = sdata[tid]; |
||||
myVal = sval[tid]; |
||||
|
||||
if (n >= 512 && tid < 256) |
||||
{ |
||||
T reg = sdata[tid + 256]; |
||||
|
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval[tid] = myVal = sval[tid + 256]; |
||||
} |
||||
__syncthreads(); |
||||
} |
||||
if (n >= 256 && tid < 128) |
||||
{ |
||||
T reg = sdata[tid + 128]; |
||||
|
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval[tid] = myVal = sval[tid + 128]; |
||||
} |
||||
__syncthreads(); |
||||
} |
||||
if (n >= 128 && tid < 64) |
||||
{ |
||||
T reg = sdata[tid + 64]; |
||||
|
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval[tid] = myVal = sval[tid + 64]; |
||||
} |
||||
__syncthreads(); |
||||
} |
||||
|
||||
if (tid < 32) |
||||
{ |
||||
if (n >= 64) |
||||
{ |
||||
T reg = sdata[tid + 32]; |
||||
|
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval[tid] = myVal = sval[tid + 32]; |
||||
} |
||||
} |
||||
if (n >= 32) |
||||
{ |
||||
T reg = sdata[tid + 16]; |
||||
|
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval[tid] = myVal = sval[tid + 16]; |
||||
} |
||||
} |
||||
if (n >= 16) |
||||
{ |
||||
T reg = sdata[tid + 8]; |
||||
|
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval[tid] = myVal = sval[tid + 8]; |
||||
} |
||||
} |
||||
if (n >= 8) |
||||
{ |
||||
T reg = sdata[tid + 4]; |
||||
|
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval[tid] = myVal = sval[tid + 4]; |
||||
} |
||||
} |
||||
if (n >= 4) |
||||
{ |
||||
T reg = sdata[tid + 2]; |
||||
|
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval[tid] = myVal = sval[tid + 2]; |
||||
} |
||||
} |
||||
if (n >= 2) |
||||
{ |
||||
T reg = sdata[tid + 1]; |
||||
|
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval[tid] = myVal = sval[tid + 1]; |
||||
} |
||||
} |
||||
} |
||||
} |
||||
}; |
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// PredVal2WarpReductor
|
||||
|
||||
template <int n> struct PredVal2WarpReductor; |
||||
template <> struct PredVal2WarpReductor<64> |
||||
{ |
||||
template <typename T, typename V1, typename V2, typename Pred> |
||||
static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) |
||||
{ |
||||
if (tid < 32) |
||||
{ |
||||
myData = sdata[tid]; |
||||
myVal1 = sval1[tid]; |
||||
myVal2 = sval2[tid]; |
||||
|
||||
T reg = sdata[tid + 32]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval1[tid] = myVal1 = sval1[tid + 32]; |
||||
sval2[tid] = myVal2 = sval2[tid + 32]; |
||||
} |
||||
|
||||
reg = sdata[tid + 16]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval1[tid] = myVal1 = sval1[tid + 16]; |
||||
sval2[tid] = myVal2 = sval2[tid + 16]; |
||||
} |
||||
|
||||
reg = sdata[tid + 8]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval1[tid] = myVal1 = sval1[tid + 8]; |
||||
sval2[tid] = myVal2 = sval2[tid + 8]; |
||||
} |
||||
|
||||
reg = sdata[tid + 4]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval1[tid] = myVal1 = sval1[tid + 4]; |
||||
sval2[tid] = myVal2 = sval2[tid + 4]; |
||||
} |
||||
|
||||
reg = sdata[tid + 2]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval1[tid] = myVal1 = sval1[tid + 2]; |
||||
sval2[tid] = myVal2 = sval2[tid + 2]; |
||||
} |
||||
|
||||
reg = sdata[tid + 1]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval1[tid] = myVal1 = sval1[tid + 1]; |
||||
sval2[tid] = myVal2 = sval2[tid + 1]; |
||||
} |
||||
} |
||||
} |
||||
}; |
||||
template <> struct PredVal2WarpReductor<32> |
||||
{ |
||||
template <typename T, typename V1, typename V2, typename Pred> |
||||
static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) |
||||
{ |
||||
if (tid < 16) |
||||
{ |
||||
myData = sdata[tid]; |
||||
myVal1 = sval1[tid]; |
||||
myVal2 = sval2[tid]; |
||||
|
||||
T reg = sdata[tid + 16]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval1[tid] = myVal1 = sval1[tid + 16]; |
||||
sval2[tid] = myVal2 = sval2[tid + 16]; |
||||
} |
||||
|
||||
reg = sdata[tid + 8]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval1[tid] = myVal1 = sval1[tid + 8]; |
||||
sval2[tid] = myVal2 = sval2[tid + 8]; |
||||
} |
||||
|
||||
reg = sdata[tid + 4]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval1[tid] = myVal1 = sval1[tid + 4]; |
||||
sval2[tid] = myVal2 = sval2[tid + 4]; |
||||
} |
||||
|
||||
reg = sdata[tid + 2]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval1[tid] = myVal1 = sval1[tid + 2]; |
||||
sval2[tid] = myVal2 = sval2[tid + 2]; |
||||
} |
||||
|
||||
reg = sdata[tid + 1]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval1[tid] = myVal1 = sval1[tid + 1]; |
||||
sval2[tid] = myVal2 = sval2[tid + 1]; |
||||
} |
||||
} |
||||
} |
||||
}; |
||||
|
||||
template <> struct PredVal2WarpReductor<16> |
||||
{ |
||||
template <typename T, typename V1, typename V2, typename Pred> |
||||
static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) |
||||
{ |
||||
if (tid < 8) |
||||
{ |
||||
myData = sdata[tid]; |
||||
myVal1 = sval1[tid]; |
||||
myVal2 = sval2[tid]; |
||||
|
||||
T reg = reg = sdata[tid + 8]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval1[tid] = myVal1 = sval1[tid + 8]; |
||||
sval2[tid] = myVal2 = sval2[tid + 8]; |
||||
} |
||||
|
||||
reg = sdata[tid + 4]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval1[tid] = myVal1 = sval1[tid + 4]; |
||||
sval2[tid] = myVal2 = sval2[tid + 4]; |
||||
} |
||||
|
||||
reg = sdata[tid + 2]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval1[tid] = myVal1 = sval1[tid + 2]; |
||||
sval2[tid] = myVal2 = sval2[tid + 2]; |
||||
} |
||||
|
||||
reg = sdata[tid + 1]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval1[tid] = myVal1 = sval1[tid + 1]; |
||||
sval2[tid] = myVal2 = sval2[tid + 1]; |
||||
} |
||||
} |
||||
} |
||||
}; |
||||
template <> struct PredVal2WarpReductor<8> |
||||
{ |
||||
template <typename T, typename V1, typename V2, typename Pred> |
||||
static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) |
||||
{ |
||||
if (tid < 4) |
||||
{ |
||||
myData = sdata[tid]; |
||||
myVal1 = sval1[tid]; |
||||
myVal2 = sval2[tid]; |
||||
|
||||
T reg = reg = sdata[tid + 4]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval1[tid] = myVal1 = sval1[tid + 4]; |
||||
sval2[tid] = myVal2 = sval2[tid + 4]; |
||||
} |
||||
|
||||
reg = sdata[tid + 2]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval1[tid] = myVal1 = sval1[tid + 2]; |
||||
sval2[tid] = myVal2 = sval2[tid + 2]; |
||||
} |
||||
|
||||
reg = sdata[tid + 1]; |
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval1[tid] = myVal1 = sval1[tid + 1]; |
||||
sval2[tid] = myVal2 = sval2[tid + 1]; |
||||
} |
||||
} |
||||
} |
||||
}; |
||||
|
||||
template <bool warp> struct PredVal2ReductionDispatcher; |
||||
template <> struct PredVal2ReductionDispatcher<true> |
||||
{ |
||||
template <int n, typename T, typename V1, typename V2, typename Pred> |
||||
static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) |
||||
{ |
||||
PredVal2WarpReductor<n>::reduce(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred); |
||||
} |
||||
}; |
||||
template <> struct PredVal2ReductionDispatcher<false> |
||||
{ |
||||
template <int n, typename T, typename V1, typename V2, typename Pred> |
||||
static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) |
||||
{ |
||||
myData = sdata[tid]; |
||||
myVal1 = sval1[tid]; |
||||
myVal2 = sval2[tid]; |
||||
|
||||
if (n >= 512 && tid < 256) |
||||
{ |
||||
T reg = sdata[tid + 256]; |
||||
|
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval1[tid] = myVal1 = sval1[tid + 256]; |
||||
sval2[tid] = myVal2 = sval2[tid + 256]; |
||||
} |
||||
__syncthreads(); |
||||
} |
||||
if (n >= 256 && tid < 128) |
||||
{ |
||||
T reg = sdata[tid + 128]; |
||||
|
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval1[tid] = myVal1 = sval1[tid + 128]; |
||||
sval2[tid] = myVal2 = sval2[tid + 128]; |
||||
} |
||||
__syncthreads(); |
||||
} |
||||
if (n >= 128 && tid < 64) |
||||
{ |
||||
T reg = sdata[tid + 64]; |
||||
|
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval1[tid] = myVal1 = sval1[tid + 64]; |
||||
sval2[tid] = myVal2 = sval2[tid + 64]; |
||||
} |
||||
__syncthreads(); |
||||
} |
||||
|
||||
if (tid < 32) |
||||
{ |
||||
if (n >= 64) |
||||
{ |
||||
T reg = sdata[tid + 32]; |
||||
|
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval1[tid] = myVal1 = sval1[tid + 32]; |
||||
sval2[tid] = myVal2 = sval2[tid + 32]; |
||||
} |
||||
} |
||||
if (n >= 32) |
||||
{ |
||||
T reg = sdata[tid + 16]; |
||||
|
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval1[tid] = myVal1 = sval1[tid + 16]; |
||||
sval2[tid] = myVal2 = sval2[tid + 16]; |
||||
} |
||||
} |
||||
if (n >= 16) |
||||
{ |
||||
T reg = sdata[tid + 8]; |
||||
|
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval1[tid] = myVal1 = sval1[tid + 8]; |
||||
sval2[tid] = myVal2 = sval2[tid + 8]; |
||||
} |
||||
} |
||||
if (n >= 8) |
||||
{ |
||||
T reg = sdata[tid + 4]; |
||||
|
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval1[tid] = myVal1 = sval1[tid + 4]; |
||||
sval2[tid] = myVal2 = sval2[tid + 4]; |
||||
} |
||||
} |
||||
if (n >= 4) |
||||
{ |
||||
T reg = sdata[tid + 2]; |
||||
|
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval1[tid] = myVal1 = sval1[tid + 2]; |
||||
sval2[tid] = myVal2 = sval2[tid + 2]; |
||||
} |
||||
} |
||||
if (n >= 2) |
||||
{ |
||||
T reg = sdata[tid + 1]; |
||||
|
||||
if (pred(reg, myData)) |
||||
{ |
||||
sdata[tid] = myData = reg; |
||||
sval1[tid] = myVal1 = sval1[tid + 1]; |
||||
sval2[tid] = myVal2 = sval2[tid + 1]; |
||||
} |
||||
} |
||||
} |
||||
} |
||||
}; |
||||
} // namespace utility_detail
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
||||
#endif // __OPENCV_GPU_REDUCTION_DETAIL_HPP__
|
@ -0,0 +1,197 @@ |
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_GPU_REDUCE_HPP__ |
||||
#define __OPENCV_GPU_REDUCE_HPP__ |
||||
|
||||
#include <thrust/tuple.h> |
||||
#include "detail/reduce.hpp" |
||||
#include "detail/reduce_key_val.hpp" |
||||
|
||||
namespace cv { namespace gpu { namespace device |
||||
{ |
||||
template <int N, typename T, class Op> |
||||
__device__ __forceinline__ void reduce(volatile T* smem, T& val, unsigned int tid, const Op& op) |
||||
{ |
||||
reduce_detail::Dispatcher<N>::reductor::template reduce<volatile T*, T&, const Op&>(smem, val, tid, op); |
||||
} |
||||
template <int N, |
||||
typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9, |
||||
typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9, |
||||
class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9> |
||||
__device__ __forceinline__ void reduce(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem, |
||||
const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val, |
||||
unsigned int tid, |
||||
const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op) |
||||
{ |
||||
reduce_detail::Dispatcher<N>::reductor::template reduce< |
||||
const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>&, |
||||
const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>&, |
||||
const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>&>(smem, val, tid, op); |
||||
} |
||||
|
||||
template <unsigned int N, typename K, typename V, class Cmp> |
||||
__device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key, volatile V* svals, V& val, unsigned int tid, const Cmp& cmp) |
||||
{ |
||||
reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<volatile K*, K&, volatile V*, V&, const Cmp&>(skeys, key, svals, val, tid, cmp); |
||||
} |
||||
template <unsigned int N, |
||||
typename K, |
||||
typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9, |
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9, |
||||
class Cmp> |
||||
__device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key, |
||||
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals, |
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val, |
||||
unsigned int tid, const Cmp& cmp) |
||||
{ |
||||
reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<volatile K*, K&, |
||||
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>&, |
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>&, |
||||
const Cmp&>(skeys, key, svals, val, tid, cmp); |
||||
} |
||||
template <unsigned int N, |
||||
typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9, |
||||
typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9, |
||||
typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9, |
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9, |
||||
class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9> |
||||
__device__ __forceinline__ void reduceKeyVal(const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys, |
||||
const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key, |
||||
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals, |
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val, |
||||
unsigned int tid, |
||||
const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp) |
||||
{ |
||||
reduce_key_val_detail::Dispatcher<N>::reductor::template reduce< |
||||
const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>&, |
||||
const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>&, |
||||
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>&, |
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>&, |
||||
const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& |
||||
>(skeys, key, svals, val, tid, cmp); |
||||
} |
||||
|
||||
// smem_tuple
|
||||
|
||||
template <typename T0> |
||||
__device__ __forceinline__ |
||||
thrust::tuple<volatile T0*> |
||||
smem_tuple(T0* t0) |
||||
{ |
||||
return thrust::make_tuple((volatile T0*) t0); |
||||
} |
||||
|
||||
template <typename T0, typename T1> |
||||
__device__ __forceinline__ |
||||
thrust::tuple<volatile T0*, volatile T1*> |
||||
smem_tuple(T0* t0, T1* t1) |
||||
{ |
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1); |
||||
} |
||||
|
||||
template <typename T0, typename T1, typename T2> |
||||
__device__ __forceinline__ |
||||
thrust::tuple<volatile T0*, volatile T1*, volatile T2*> |
||||
smem_tuple(T0* t0, T1* t1, T2* t2) |
||||
{ |
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2); |
||||
} |
||||
|
||||
template <typename T0, typename T1, typename T2, typename T3> |
||||
__device__ __forceinline__ |
||||
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*> |
||||
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3) |
||||
{ |
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3); |
||||
} |
||||
|
||||
template <typename T0, typename T1, typename T2, typename T3, typename T4> |
||||
__device__ __forceinline__ |
||||
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*> |
||||
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4) |
||||
{ |
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4); |
||||
} |
||||
|
||||
template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5> |
||||
__device__ __forceinline__ |
||||
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*> |
||||
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5) |
||||
{ |
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5); |
||||
} |
||||
|
||||
template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6> |
||||
__device__ __forceinline__ |
||||
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*> |
||||
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6) |
||||
{ |
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6); |
||||
} |
||||
|
||||
template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7> |
||||
__device__ __forceinline__ |
||||
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*> |
||||
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7) |
||||
{ |
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7); |
||||
} |
||||
|
||||
template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8> |
||||
__device__ __forceinline__ |
||||
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*, volatile T8*> |
||||
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8) |
||||
{ |
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8); |
||||
} |
||||
|
||||
template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9> |
||||
__device__ __forceinline__ |
||||
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*, volatile T8*, volatile T9*> |
||||
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8, T9* t9) |
||||
{ |
||||
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8, (volatile T9*) t9); |
||||
} |
||||
}}} |
||||
|
||||
#endif // __OPENCV_GPU_UTILITY_HPP__
|
@ -0,0 +1,145 @@ |
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_GPU_WARP_SHUFFLE_HPP__ |
||||
#define __OPENCV_GPU_WARP_SHUFFLE_HPP__ |
||||
|
||||
namespace cv { namespace gpu { namespace device |
||||
{ |
||||
template <typename T> |
||||
__device__ __forceinline__ T shfl(T val, int srcLane, int width = warpSize) |
||||
{ |
||||
#if __CUDA_ARCH__ >= 300 |
||||
return __shfl(val, srcLane, width); |
||||
#else |
||||
return T(); |
||||
#endif |
||||
} |
||||
__device__ __forceinline__ unsigned int shfl(unsigned int val, int srcLane, int width = warpSize) |
||||
{ |
||||
#if __CUDA_ARCH__ >= 300 |
||||
return (unsigned int) __shfl((int) val, srcLane, width); |
||||
#else |
||||
return 0; |
||||
#endif |
||||
} |
||||
__device__ __forceinline__ double shfl(double val, int srcLane, int width = warpSize) |
||||
{ |
||||
#if __CUDA_ARCH__ >= 300 |
||||
int lo = __double2loint(val); |
||||
int hi = __double2hiint(val); |
||||
|
||||
lo = __shfl(lo, srcLane, width); |
||||
hi = __shfl(hi, srcLane, width); |
||||
|
||||
return __hiloint2double(hi, lo); |
||||
#else |
||||
return 0.0; |
||||
#endif |
||||
} |
||||
|
||||
template <typename T> |
||||
__device__ __forceinline__ T shfl_down(T val, unsigned int delta, int width = warpSize) |
||||
{ |
||||
#if __CUDA_ARCH__ >= 300 |
||||
return __shfl_down(val, delta, width); |
||||
#else |
||||
return T(); |
||||
#endif |
||||
} |
||||
__device__ __forceinline__ unsigned int shfl_down(unsigned int val, unsigned int delta, int width = warpSize) |
||||
{ |
||||
#if __CUDA_ARCH__ >= 300 |
||||
return (unsigned int) __shfl_down((int) val, delta, width); |
||||
#else |
||||
return 0; |
||||
#endif |
||||
} |
||||
__device__ __forceinline__ double shfl_down(double val, unsigned int delta, int width = warpSize) |
||||
{ |
||||
#if __CUDA_ARCH__ >= 300 |
||||
int lo = __double2loint(val); |
||||
int hi = __double2hiint(val); |
||||
|
||||
lo = __shfl_down(lo, delta, width); |
||||
hi = __shfl_down(hi, delta, width); |
||||
|
||||
return __hiloint2double(hi, lo); |
||||
#else |
||||
return 0.0; |
||||
#endif |
||||
} |
||||
|
||||
template <typename T> |
||||
__device__ __forceinline__ T shfl_up(T val, unsigned int delta, int width = warpSize) |
||||
{ |
||||
#if __CUDA_ARCH__ >= 300 |
||||
return __shfl_up(val, delta, width); |
||||
#else |
||||
return T(); |
||||
#endif |
||||
} |
||||
__device__ __forceinline__ unsigned int shfl_up(unsigned int val, unsigned int delta, int width = warpSize) |
||||
{ |
||||
#if __CUDA_ARCH__ >= 300 |
||||
return (unsigned int) __shfl_up((int) val, delta, width); |
||||
#else |
||||
return 0; |
||||
#endif |
||||
} |
||||
__device__ __forceinline__ double shfl_up(double val, unsigned int delta, int width = warpSize) |
||||
{ |
||||
#if __CUDA_ARCH__ >= 300 |
||||
int lo = __double2loint(val); |
||||
int hi = __double2hiint(val); |
||||
|
||||
lo = __shfl_up(lo, delta, width); |
||||
hi = __shfl_up(hi, delta, width); |
||||
|
||||
return __hiloint2double(hi, lo); |
||||
#else |
||||
return 0.0; |
||||
#endif |
||||
} |
||||
}}} |
||||
|
||||
#endif // __OPENCV_GPU_WARP_SHUFFLE_HPP__
|
@ -0,0 +1,26 @@ |
||||
set(CMAKE_SYSTEM_NAME Linux) |
||||
set(CMAKE_SYSTEM_VERSION 1) |
||||
set(CMAKE_SYSTEM_PROCESSOR arm) |
||||
|
||||
set(CMAKE_C_COMPILER arm-linux-gnueabi-gcc-4.5) |
||||
set(CMAKE_CXX_COMPILER arm-linux-gnueabi-g++-4.5) |
||||
|
||||
#suppress compiller varning |
||||
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-psabi" ) |
||||
set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-psabi" ) |
||||
|
||||
# can be any other plases |
||||
set(__arm_linux_eabi_root /usr/arm-linux-gnueabi) |
||||
|
||||
set(CMAKE_FIND_ROOT_PATH ${CMAKE_FIND_ROOT_PATH} ${__arm_linux_eabi_root}) |
||||
|
||||
if(EXISTS ${CUDA_TOOLKIT_ROOT_DIR}) |
||||
set(CMAKE_FIND_ROOT_PATH ${CMAKE_FIND_ROOT_PATH} ${CUDA_TOOLKIT_ROOT_DIR}) |
||||
endif() |
||||
|
||||
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) |
||||
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) |
||||
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY) |
||||
|
||||
set(CARMA 1) |
||||
add_definitions(-DCARMA) |
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,279 @@ |
||||
#include "perf_precomp.hpp" |
||||
|
||||
#define GPU_PERF_TEST_P(fixture, name, params) \ |
||||
class fixture##_##name : public fixture {\
|
||||
public:\
|
||||
fixture##_##name() {}\
|
||||
protected:\
|
||||
virtual void __cpu();\
|
||||
virtual void __gpu();\
|
||||
virtual void PerfTestBody();\
|
||||
};\
|
||||
TEST_P(fixture##_##name, name /*perf*/){ RunPerfTestBody(); }\
|
||||
INSTANTIATE_TEST_CASE_P(/*none*/, fixture##_##name, params);\
|
||||
void fixture##_##name::PerfTestBody() { if (PERF_RUN_GPU()) __gpu(); else __cpu(); } |
||||
|
||||
#define RUN_CPU(fixture, name)\ |
||||
void fixture##_##name::__cpu() |
||||
|
||||
#define RUN_GPU(fixture, name)\ |
||||
void fixture##_##name::__gpu() |
||||
|
||||
#define NO_CPU(fixture, name)\ |
||||
void fixture##_##name::__cpu() { FAIL() << "No such CPU implementation analogy";} |
||||
|
||||
namespace { |
||||
struct DetectionLess |
||||
{ |
||||
bool operator()(const cv::gpu::SCascade::Detection& a, |
||||
const cv::gpu::SCascade::Detection& b) const |
||||
{ |
||||
if (a.x != b.x) return a.x < b.x; |
||||
else if (a.y != b.y) return a.y < b.y; |
||||
else if (a.w != b.w) return a.w < b.w; |
||||
else return a.h < b.h; |
||||
} |
||||
}; |
||||
|
||||
cv::Mat sortDetections(cv::gpu::GpuMat& objects) |
||||
{ |
||||
cv::Mat detections(objects); |
||||
|
||||
typedef cv::gpu::SCascade::Detection Detection; |
||||
Detection* begin = (Detection*)(detections.ptr<char>(0)); |
||||
Detection* end = (Detection*)(detections.ptr<char>(0) + detections.cols); |
||||
std::sort(begin, end, DetectionLess()); |
||||
|
||||
return detections; |
||||
} |
||||
} |
||||
|
||||
|
||||
typedef std::tr1::tuple<std::string, std::string> fixture_t; |
||||
typedef perf::TestBaseWithParam<fixture_t> SCascadeTest; |
||||
|
||||
GPU_PERF_TEST_P(SCascadeTest, detect, |
||||
testing::Combine( |
||||
testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), |
||||
testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")))) |
||||
|
||||
RUN_GPU(SCascadeTest, detect) |
||||
{ |
||||
cv::Mat cpu = readImage (GET_PARAM(1)); |
||||
ASSERT_FALSE(cpu.empty()); |
||||
cv::gpu::GpuMat colored(cpu); |
||||
|
||||
cv::gpu::SCascade cascade; |
||||
|
||||
cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ); |
||||
ASSERT_TRUE(fs.isOpened()); |
||||
|
||||
ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); |
||||
|
||||
cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1); |
||||
rois.setTo(1); |
||||
|
||||
cascade.detect(colored, rois, objectBoxes); |
||||
|
||||
TEST_CYCLE() |
||||
{ |
||||
cascade.detect(colored, rois, objectBoxes); |
||||
} |
||||
|
||||
SANITY_CHECK(sortDetections(objectBoxes)); |
||||
} |
||||
|
||||
NO_CPU(SCascadeTest, detect) |
||||
|
||||
static cv::Rect getFromTable(int idx) |
||||
{ |
||||
static const cv::Rect rois[] = |
||||
{ |
||||
cv::Rect( 65 * 4, 20 * 4, 35 * 4, 80 * 4), |
||||
cv::Rect( 95 * 4, 35 * 4, 45 * 4, 40 * 4), |
||||
cv::Rect( 45 * 4, 35 * 4, 45 * 4, 40 * 4), |
||||
cv::Rect( 25 * 4, 27 * 4, 50 * 4, 45 * 4), |
||||
cv::Rect(100 * 4, 50 * 4, 45 * 4, 40 * 4), |
||||
|
||||
cv::Rect( 60 * 4, 30 * 4, 45 * 4, 40 * 4), |
||||
cv::Rect( 40 * 4, 55 * 4, 50 * 4, 40 * 4), |
||||
cv::Rect( 48 * 4, 37 * 4, 72 * 4, 80 * 4), |
||||
cv::Rect( 48 * 4, 32 * 4, 85 * 4, 58 * 4), |
||||
cv::Rect( 48 * 4, 0 * 4, 32 * 4, 27 * 4) |
||||
}; |
||||
|
||||
return rois[idx]; |
||||
} |
||||
|
||||
typedef std::tr1::tuple<std::string, std::string, int> roi_fixture_t; |
||||
typedef perf::TestBaseWithParam<roi_fixture_t> SCascadeTestRoi; |
||||
|
||||
GPU_PERF_TEST_P(SCascadeTestRoi, detectInRoi, |
||||
testing::Combine( |
||||
testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), |
||||
testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")), |
||||
testing::Range(0, 5))) |
||||
|
||||
RUN_GPU(SCascadeTestRoi, detectInRoi) |
||||
{ |
||||
cv::Mat cpu = readImage (GET_PARAM(1)); |
||||
ASSERT_FALSE(cpu.empty()); |
||||
cv::gpu::GpuMat colored(cpu); |
||||
|
||||
cv::gpu::SCascade cascade; |
||||
|
||||
cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ); |
||||
ASSERT_TRUE(fs.isOpened()); |
||||
|
||||
ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); |
||||
|
||||
cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(colored.size(), CV_8UC1); |
||||
rois.setTo(0); |
||||
|
||||
int nroi = GET_PARAM(2); |
||||
cv::RNG rng; |
||||
for (int i = 0; i < nroi; ++i) |
||||
{ |
||||
cv::Rect r = getFromTable(rng(10)); |
||||
cv::gpu::GpuMat sub(rois, r); |
||||
sub.setTo(1); |
||||
} |
||||
|
||||
cascade.detect(colored, rois, objectBoxes); |
||||
|
||||
TEST_CYCLE() |
||||
{ |
||||
cascade.detect(colored, rois, objectBoxes); |
||||
} |
||||
|
||||
SANITY_CHECK(sortDetections(objectBoxes)); |
||||
} |
||||
|
||||
NO_CPU(SCascadeTestRoi, detectInRoi) |
||||
|
||||
|
||||
GPU_PERF_TEST_P(SCascadeTestRoi, detectEachRoi, |
||||
testing::Combine( |
||||
testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), |
||||
testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")), |
||||
testing::Range(0, 10))) |
||||
|
||||
RUN_GPU(SCascadeTestRoi, detectEachRoi) |
||||
{ |
||||
cv::Mat cpu = readImage (GET_PARAM(1)); |
||||
ASSERT_FALSE(cpu.empty()); |
||||
cv::gpu::GpuMat colored(cpu); |
||||
|
||||
cv::gpu::SCascade cascade; |
||||
|
||||
cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ); |
||||
ASSERT_TRUE(fs.isOpened()); |
||||
|
||||
ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); |
||||
|
||||
cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(colored.size(), CV_8UC1); |
||||
rois.setTo(0); |
||||
|
||||
int idx = GET_PARAM(2); |
||||
cv::Rect r = getFromTable(idx); |
||||
cv::gpu::GpuMat sub(rois, r); |
||||
sub.setTo(1); |
||||
|
||||
cascade.detect(colored, rois, objectBoxes); |
||||
|
||||
TEST_CYCLE() |
||||
{ |
||||
cascade.detect(colored, rois, objectBoxes); |
||||
} |
||||
|
||||
SANITY_CHECK(sortDetections(objectBoxes)); |
||||
} |
||||
|
||||
NO_CPU(SCascadeTestRoi, detectEachRoi) |
||||
|
||||
GPU_PERF_TEST_P(SCascadeTest, detectOnIntegral, |
||||
testing::Combine( |
||||
testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), |
||||
testing::Values(std::string("cv/cascadeandhog/integrals.xml")))) |
||||
|
||||
static std::string itoa(long i) |
||||
{ |
||||
static char s[65]; |
||||
sprintf(s, "%ld", i); |
||||
return std::string(s); |
||||
} |
||||
|
||||
RUN_GPU(SCascadeTest, detectOnIntegral) |
||||
{ |
||||
cv::FileStorage fsi(perf::TestBase::getDataPath(GET_PARAM(1)), cv::FileStorage::READ); |
||||
ASSERT_TRUE(fsi.isOpened()); |
||||
|
||||
cv::gpu::GpuMat hogluv(121 * 10, 161, CV_32SC1); |
||||
for (int i = 0; i < 10; ++i) |
||||
{ |
||||
cv::Mat channel; |
||||
fsi[std::string("channel") + itoa(i)] >> channel; |
||||
cv::gpu::GpuMat gchannel(hogluv, cv::Rect(0, 121 * i, 161, 121)); |
||||
gchannel.upload(channel); |
||||
} |
||||
|
||||
cv::gpu::SCascade cascade; |
||||
|
||||
cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ); |
||||
ASSERT_TRUE(fs.isOpened()); |
||||
|
||||
ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); |
||||
|
||||
cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1), rois(cv::Size(640, 480), CV_8UC1); |
||||
rois.setTo(1); |
||||
|
||||
cascade.detect(hogluv, rois, objectBoxes); |
||||
|
||||
TEST_CYCLE() |
||||
{ |
||||
cascade.detect(hogluv, rois, objectBoxes); |
||||
} |
||||
|
||||
SANITY_CHECK(sortDetections(objectBoxes)); |
||||
} |
||||
|
||||
NO_CPU(SCascadeTest, detectOnIntegral) |
||||
|
||||
GPU_PERF_TEST_P(SCascadeTest, detectStream, |
||||
testing::Combine( |
||||
testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), |
||||
testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")))) |
||||
|
||||
RUN_GPU(SCascadeTest, detectStream) |
||||
{ |
||||
cv::Mat cpu = readImage (GET_PARAM(1)); |
||||
ASSERT_FALSE(cpu.empty()); |
||||
cv::gpu::GpuMat colored(cpu); |
||||
|
||||
cv::gpu::SCascade cascade; |
||||
|
||||
cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ); |
||||
ASSERT_TRUE(fs.isOpened()); |
||||
|
||||
ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); |
||||
|
||||
cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1); |
||||
rois.setTo(1); |
||||
|
||||
cv::gpu::Stream s; |
||||
|
||||
cascade.detect(colored, rois, objectBoxes, s); |
||||
|
||||
TEST_CYCLE() |
||||
{ |
||||
cascade.detect(colored, rois, objectBoxes, s); |
||||
} |
||||
|
||||
#ifdef HAVE_CUDA |
||||
cudaDeviceSynchronize(); |
||||
#endif |
||||
|
||||
SANITY_CHECK(sortDetections(objectBoxes)); |
||||
} |
||||
|
||||
NO_CPU(SCascadeTest, detectStream) |
@ -0,0 +1,53 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors "as is" and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if !defined CUDA_DISABLER |
||||
|
||||
#include "column_filter.h" |
||||
|
||||
namespace filter |
||||
{ |
||||
template void linearColumn<float, uchar>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
} |
||||
|
||||
#endif /* CUDA_DISABLER */ |
@ -0,0 +1,53 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors "as is" and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if !defined CUDA_DISABLER |
||||
|
||||
#include "column_filter.h" |
||||
|
||||
namespace filter |
||||
{ |
||||
template void linearColumn<float3, uchar3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
} |
||||
|
||||
#endif /* CUDA_DISABLER */ |
@ -0,0 +1,53 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors "as is" and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if !defined CUDA_DISABLER |
||||
|
||||
#include "column_filter.h" |
||||
|
||||
namespace filter |
||||
{ |
||||
template void linearColumn<float, unsigned short>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
} |
||||
|
||||
#endif /* CUDA_DISABLER */ |
@ -0,0 +1,53 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors "as is" and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if !defined CUDA_DISABLER |
||||
|
||||
#include "column_filter.h" |
||||
|
||||
namespace filter |
||||
{ |
||||
template void linearColumn<float3, ushort3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
} |
||||
|
||||
#endif /* CUDA_DISABLER */ |
@ -0,0 +1,53 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors "as is" and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if !defined CUDA_DISABLER |
||||
|
||||
#include "column_filter.h" |
||||
|
||||
namespace filter |
||||
{ |
||||
template void linearColumn<float4, ushort4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
} |
||||
|
||||
#endif /* CUDA_DISABLER */ |
@ -0,0 +1,53 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors "as is" and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if !defined CUDA_DISABLER |
||||
|
||||
#include "column_filter.h" |
||||
|
||||
namespace filter |
||||
{ |
||||
template void linearColumn<float3, int3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
} |
||||
|
||||
#endif /* CUDA_DISABLER */ |
@ -0,0 +1,53 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors "as is" and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if !defined CUDA_DISABLER |
||||
|
||||
#include "column_filter.h" |
||||
|
||||
namespace filter |
||||
{ |
||||
template void linearColumn<float4, int4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
} |
||||
|
||||
#endif /* CUDA_DISABLER */ |
@ -0,0 +1,53 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors "as is" and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if !defined CUDA_DISABLER |
||||
|
||||
#include "column_filter.h" |
||||
|
||||
namespace filter |
||||
{ |
||||
template void linearColumn<float4, uchar4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
} |
||||
|
||||
#endif /* CUDA_DISABLER */ |
@ -0,0 +1,53 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors "as is" and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if !defined CUDA_DISABLER |
||||
|
||||
#include "column_filter.h" |
||||
|
||||
namespace filter |
||||
{ |
||||
template void linearColumn<float3, short3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
} |
||||
|
||||
#endif /* CUDA_DISABLER */ |
@ -0,0 +1,53 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors "as is" and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if !defined CUDA_DISABLER |
||||
|
||||
#include "column_filter.h" |
||||
|
||||
namespace filter |
||||
{ |
||||
template void linearColumn<float, int>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
} |
||||
|
||||
#endif /* CUDA_DISABLER */ |
@ -0,0 +1,53 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors "as is" and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if !defined CUDA_DISABLER |
||||
|
||||
#include "column_filter.h" |
||||
|
||||
namespace filter |
||||
{ |
||||
template void linearColumn<float, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
} |
||||
|
||||
#endif /* CUDA_DISABLER */ |
@ -0,0 +1,53 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors "as is" and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if !defined CUDA_DISABLER |
||||
|
||||
#include "column_filter.h" |
||||
|
||||
namespace filter |
||||
{ |
||||
template void linearColumn<float3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
} |
||||
|
||||
#endif /* CUDA_DISABLER */ |
@ -0,0 +1,53 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors "as is" and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if !defined CUDA_DISABLER |
||||
|
||||
#include "column_filter.h" |
||||
|
||||
namespace filter |
||||
{ |
||||
template void linearColumn<float4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
} |
||||
|
||||
#endif /* CUDA_DISABLER */ |
@ -0,0 +1,53 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors "as is" and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if !defined CUDA_DISABLER |
||||
|
||||
#include "column_filter.h" |
||||
|
||||
namespace filter |
||||
{ |
||||
template void linearColumn<float, short>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
} |
||||
|
||||
#endif /* CUDA_DISABLER */ |
@ -0,0 +1,53 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors "as is" and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if !defined CUDA_DISABLER |
||||
|
||||
#include "column_filter.h" |
||||
|
||||
namespace filter |
||||
{ |
||||
template void linearColumn<float4, short4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
} |
||||
|
||||
#endif /* CUDA_DISABLER */ |
@ -1,391 +0,0 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors "as is" and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if !defined CUDA_DISABLER |
||||
|
||||
#include "internal_shared.hpp" |
||||
#include "opencv2/gpu/device/saturate_cast.hpp" |
||||
#include "opencv2/gpu/device/vec_math.hpp" |
||||
#include "opencv2/gpu/device/limits.hpp" |
||||
#include "opencv2/gpu/device/border_interpolate.hpp" |
||||
#include "opencv2/gpu/device/static_check.hpp" |
||||
|
||||
namespace cv { namespace gpu { namespace device |
||||
{ |
||||
namespace column_filter |
||||
{ |
||||
#define MAX_KERNEL_SIZE 32 |
||||
|
||||
__constant__ float c_kernel[MAX_KERNEL_SIZE]; |
||||
|
||||
void loadKernel(const float* kernel, int ksize, cudaStream_t stream) |
||||
{ |
||||
if (stream == 0) |
||||
cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) ); |
||||
else |
||||
cudaSafeCall( cudaMemcpyToSymbolAsync(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) ); |
||||
} |
||||
|
||||
template <int KSIZE, typename T, typename D, typename B> |
||||
__global__ void linearColumnFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd) |
||||
{ |
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) |
||||
const int BLOCK_DIM_X = 16; |
||||
const int BLOCK_DIM_Y = 16; |
||||
const int PATCH_PER_BLOCK = 4; |
||||
const int HALO_SIZE = KSIZE <= 16 ? 1 : 2; |
||||
#else |
||||
const int BLOCK_DIM_X = 16; |
||||
const int BLOCK_DIM_Y = 8; |
||||
const int PATCH_PER_BLOCK = 2; |
||||
const int HALO_SIZE = 2; |
||||
#endif |
||||
|
||||
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t; |
||||
|
||||
__shared__ sum_t smem[(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Y][BLOCK_DIM_X]; |
||||
|
||||
const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x; |
||||
|
||||
if (x >= src.cols) |
||||
return; |
||||
|
||||
const T* src_col = src.ptr() + x; |
||||
|
||||
const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y; |
||||
|
||||
if (blockIdx.y > 0) |
||||
{ |
||||
//Upper halo |
||||
#pragma unroll |
||||
for (int j = 0; j < HALO_SIZE; ++j) |
||||
smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, x)); |
||||
} |
||||
else |
||||
{ |
||||
//Upper halo |
||||
#pragma unroll |
||||
for (int j = 0; j < HALO_SIZE; ++j) |
||||
smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step)); |
||||
} |
||||
|
||||
if (blockIdx.y + 2 < gridDim.y) |
||||
{ |
||||
//Main data |
||||
#pragma unroll |
||||
for (int j = 0; j < PATCH_PER_BLOCK; ++j) |
||||
smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + j * BLOCK_DIM_Y, x)); |
||||
|
||||
//Lower halo |
||||
#pragma unroll |
||||
for (int j = 0; j < HALO_SIZE; ++j) |
||||
smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, x)); |
||||
} |
||||
else |
||||
{ |
||||
//Main data |
||||
#pragma unroll |
||||
for (int j = 0; j < PATCH_PER_BLOCK; ++j) |
||||
smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step)); |
||||
|
||||
//Lower halo |
||||
#pragma unroll |
||||
for (int j = 0; j < HALO_SIZE; ++j) |
||||
smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step)); |
||||
} |
||||
|
||||
__syncthreads(); |
||||
|
||||
#pragma unroll |
||||
for (int j = 0; j < PATCH_PER_BLOCK; ++j) |
||||
{ |
||||
const int y = yStart + j * BLOCK_DIM_Y; |
||||
|
||||
if (y < src.rows) |
||||
{ |
||||
sum_t sum = VecTraits<sum_t>::all(0); |
||||
|
||||
#pragma unroll |
||||
for (int k = 0; k < KSIZE; ++k) |
||||
sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k]; |
||||
|
||||
dst(y, x) = saturate_cast<D>(sum); |
||||
} |
||||
} |
||||
} |
||||
|
||||
template <int KSIZE, typename T, typename D, template<typename> class B> |
||||
void linearColumnFilter_caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream) |
||||
{ |
||||
int BLOCK_DIM_X; |
||||
int BLOCK_DIM_Y; |
||||
int PATCH_PER_BLOCK; |
||||
|
||||
if (cc >= 20) |
||||
{ |
||||
BLOCK_DIM_X = 16; |
||||
BLOCK_DIM_Y = 16; |
||||
PATCH_PER_BLOCK = 4; |
||||
} |
||||
else |
||||
{ |
||||
BLOCK_DIM_X = 16; |
||||
BLOCK_DIM_Y = 8; |
||||
PATCH_PER_BLOCK = 2; |
||||
} |
||||
|
||||
const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); |
||||
const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK)); |
||||
|
||||
B<T> brd(src.rows); |
||||
|
||||
linearColumnFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd); |
||||
|
||||
cudaSafeCall( cudaGetLastError() ); |
||||
|
||||
if (stream == 0) |
||||
cudaSafeCall( cudaDeviceSynchronize() ); |
||||
} |
||||
|
||||
template <typename T, typename D> |
||||
void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream) |
||||
{ |
||||
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream); |
||||
|
||||
static const caller_t callers[5][33] = |
||||
{ |
||||
{ |
||||
0, |
||||
linearColumnFilter_caller< 1, T, D, BrdColReflect101>, |
||||
linearColumnFilter_caller< 2, T, D, BrdColReflect101>, |
||||
linearColumnFilter_caller< 3, T, D, BrdColReflect101>, |
||||
linearColumnFilter_caller< 4, T, D, BrdColReflect101>, |
||||
linearColumnFilter_caller< 5, T, D, BrdColReflect101>, |
||||
linearColumnFilter_caller< 6, T, D, BrdColReflect101>, |
||||
linearColumnFilter_caller< 7, T, D, BrdColReflect101>, |
||||
linearColumnFilter_caller< 8, T, D, BrdColReflect101>, |
||||
linearColumnFilter_caller< 9, T, D, BrdColReflect101>, |
||||
linearColumnFilter_caller<10, T, D, BrdColReflect101>, |
||||
linearColumnFilter_caller<11, T, D, BrdColReflect101>, |
||||
linearColumnFilter_caller<12, T, D, BrdColReflect101>, |
||||
linearColumnFilter_caller<13, T, D, BrdColReflect101>, |
||||
linearColumnFilter_caller<14, T, D, BrdColReflect101>, |
||||
linearColumnFilter_caller<15, T, D, BrdColReflect101>, |
||||
linearColumnFilter_caller<16, T, D, BrdColReflect101>, |
||||
linearColumnFilter_caller<17, T, D, BrdColReflect101>, |
||||
linearColumnFilter_caller<18, T, D, BrdColReflect101>, |
||||
linearColumnFilter_caller<19, T, D, BrdColReflect101>, |
||||
linearColumnFilter_caller<20, T, D, BrdColReflect101>, |
||||
linearColumnFilter_caller<21, T, D, BrdColReflect101>, |
||||
linearColumnFilter_caller<22, T, D, BrdColReflect101>, |
||||
linearColumnFilter_caller<23, T, D, BrdColReflect101>, |
||||
linearColumnFilter_caller<24, T, D, BrdColReflect101>, |
||||
linearColumnFilter_caller<25, T, D, BrdColReflect101>, |
||||
linearColumnFilter_caller<26, T, D, BrdColReflect101>, |
||||
linearColumnFilter_caller<27, T, D, BrdColReflect101>, |
||||
linearColumnFilter_caller<28, T, D, BrdColReflect101>, |
||||
linearColumnFilter_caller<29, T, D, BrdColReflect101>, |
||||
linearColumnFilter_caller<30, T, D, BrdColReflect101>, |
||||
linearColumnFilter_caller<31, T, D, BrdColReflect101>, |
||||
linearColumnFilter_caller<32, T, D, BrdColReflect101> |
||||
}, |
||||
{ |
||||
0, |
||||
linearColumnFilter_caller< 1, T, D, BrdColReplicate>, |
||||
linearColumnFilter_caller< 2, T, D, BrdColReplicate>, |
||||
linearColumnFilter_caller< 3, T, D, BrdColReplicate>, |
||||
linearColumnFilter_caller< 4, T, D, BrdColReplicate>, |
||||
linearColumnFilter_caller< 5, T, D, BrdColReplicate>, |
||||
linearColumnFilter_caller< 6, T, D, BrdColReplicate>, |
||||
linearColumnFilter_caller< 7, T, D, BrdColReplicate>, |
||||
linearColumnFilter_caller< 8, T, D, BrdColReplicate>, |
||||
linearColumnFilter_caller< 9, T, D, BrdColReplicate>, |
||||
linearColumnFilter_caller<10, T, D, BrdColReplicate>, |
||||
linearColumnFilter_caller<11, T, D, BrdColReplicate>, |
||||
linearColumnFilter_caller<12, T, D, BrdColReplicate>, |
||||
linearColumnFilter_caller<13, T, D, BrdColReplicate>, |
||||
linearColumnFilter_caller<14, T, D, BrdColReplicate>, |
||||
linearColumnFilter_caller<15, T, D, BrdColReplicate>, |
||||
linearColumnFilter_caller<16, T, D, BrdColReplicate>, |
||||
linearColumnFilter_caller<17, T, D, BrdColReplicate>, |
||||
linearColumnFilter_caller<18, T, D, BrdColReplicate>, |
||||
linearColumnFilter_caller<19, T, D, BrdColReplicate>, |
||||
linearColumnFilter_caller<20, T, D, BrdColReplicate>, |
||||
linearColumnFilter_caller<21, T, D, BrdColReplicate>, |
||||
linearColumnFilter_caller<22, T, D, BrdColReplicate>, |
||||
linearColumnFilter_caller<23, T, D, BrdColReplicate>, |
||||
linearColumnFilter_caller<24, T, D, BrdColReplicate>, |
||||
linearColumnFilter_caller<25, T, D, BrdColReplicate>, |
||||
linearColumnFilter_caller<26, T, D, BrdColReplicate>, |
||||
linearColumnFilter_caller<27, T, D, BrdColReplicate>, |
||||
linearColumnFilter_caller<28, T, D, BrdColReplicate>, |
||||
linearColumnFilter_caller<29, T, D, BrdColReplicate>, |
||||
linearColumnFilter_caller<30, T, D, BrdColReplicate>, |
||||
linearColumnFilter_caller<31, T, D, BrdColReplicate>, |
||||
linearColumnFilter_caller<32, T, D, BrdColReplicate> |
||||
}, |
||||
{ |
||||
0, |
||||
linearColumnFilter_caller< 1, T, D, BrdColConstant>, |
||||
linearColumnFilter_caller< 2, T, D, BrdColConstant>, |
||||
linearColumnFilter_caller< 3, T, D, BrdColConstant>, |
||||
linearColumnFilter_caller< 4, T, D, BrdColConstant>, |
||||
linearColumnFilter_caller< 5, T, D, BrdColConstant>, |
||||
linearColumnFilter_caller< 6, T, D, BrdColConstant>, |
||||
linearColumnFilter_caller< 7, T, D, BrdColConstant>, |
||||
linearColumnFilter_caller< 8, T, D, BrdColConstant>, |
||||
linearColumnFilter_caller< 9, T, D, BrdColConstant>, |
||||
linearColumnFilter_caller<10, T, D, BrdColConstant>, |
||||
linearColumnFilter_caller<11, T, D, BrdColConstant>, |
||||
linearColumnFilter_caller<12, T, D, BrdColConstant>, |
||||
linearColumnFilter_caller<13, T, D, BrdColConstant>, |
||||
linearColumnFilter_caller<14, T, D, BrdColConstant>, |
||||
linearColumnFilter_caller<15, T, D, BrdColConstant>, |
||||
linearColumnFilter_caller<16, T, D, BrdColConstant>, |
||||
linearColumnFilter_caller<17, T, D, BrdColConstant>, |
||||
linearColumnFilter_caller<18, T, D, BrdColConstant>, |
||||
linearColumnFilter_caller<19, T, D, BrdColConstant>, |
||||
linearColumnFilter_caller<20, T, D, BrdColConstant>, |
||||
linearColumnFilter_caller<21, T, D, BrdColConstant>, |
||||
linearColumnFilter_caller<22, T, D, BrdColConstant>, |
||||
linearColumnFilter_caller<23, T, D, BrdColConstant>, |
||||
linearColumnFilter_caller<24, T, D, BrdColConstant>, |
||||
linearColumnFilter_caller<25, T, D, BrdColConstant>, |
||||
linearColumnFilter_caller<26, T, D, BrdColConstant>, |
||||
linearColumnFilter_caller<27, T, D, BrdColConstant>, |
||||
linearColumnFilter_caller<28, T, D, BrdColConstant>, |
||||
linearColumnFilter_caller<29, T, D, BrdColConstant>, |
||||
linearColumnFilter_caller<30, T, D, BrdColConstant>, |
||||
linearColumnFilter_caller<31, T, D, BrdColConstant>, |
||||
linearColumnFilter_caller<32, T, D, BrdColConstant> |
||||
}, |
||||
{ |
||||
0, |
||||
linearColumnFilter_caller< 1, T, D, BrdColReflect>, |
||||
linearColumnFilter_caller< 2, T, D, BrdColReflect>, |
||||
linearColumnFilter_caller< 3, T, D, BrdColReflect>, |
||||
linearColumnFilter_caller< 4, T, D, BrdColReflect>, |
||||
linearColumnFilter_caller< 5, T, D, BrdColReflect>, |
||||
linearColumnFilter_caller< 6, T, D, BrdColReflect>, |
||||
linearColumnFilter_caller< 7, T, D, BrdColReflect>, |
||||
linearColumnFilter_caller< 8, T, D, BrdColReflect>, |
||||
linearColumnFilter_caller< 9, T, D, BrdColReflect>, |
||||
linearColumnFilter_caller<10, T, D, BrdColReflect>, |
||||
linearColumnFilter_caller<11, T, D, BrdColReflect>, |
||||
linearColumnFilter_caller<12, T, D, BrdColReflect>, |
||||
linearColumnFilter_caller<13, T, D, BrdColReflect>, |
||||
linearColumnFilter_caller<14, T, D, BrdColReflect>, |
||||
linearColumnFilter_caller<15, T, D, BrdColReflect>, |
||||
linearColumnFilter_caller<16, T, D, BrdColReflect>, |
||||
linearColumnFilter_caller<17, T, D, BrdColReflect>, |
||||
linearColumnFilter_caller<18, T, D, BrdColReflect>, |
||||
linearColumnFilter_caller<19, T, D, BrdColReflect>, |
||||
linearColumnFilter_caller<20, T, D, BrdColReflect>, |
||||
linearColumnFilter_caller<21, T, D, BrdColReflect>, |
||||
linearColumnFilter_caller<22, T, D, BrdColReflect>, |
||||
linearColumnFilter_caller<23, T, D, BrdColReflect>, |
||||
linearColumnFilter_caller<24, T, D, BrdColReflect>, |
||||
linearColumnFilter_caller<25, T, D, BrdColReflect>, |
||||
linearColumnFilter_caller<26, T, D, BrdColReflect>, |
||||
linearColumnFilter_caller<27, T, D, BrdColReflect>, |
||||
linearColumnFilter_caller<28, T, D, BrdColReflect>, |
||||
linearColumnFilter_caller<29, T, D, BrdColReflect>, |
||||
linearColumnFilter_caller<30, T, D, BrdColReflect>, |
||||
linearColumnFilter_caller<31, T, D, BrdColReflect>, |
||||
linearColumnFilter_caller<32, T, D, BrdColReflect> |
||||
}, |
||||
{ |
||||
0, |
||||
linearColumnFilter_caller< 1, T, D, BrdColWrap>, |
||||
linearColumnFilter_caller< 2, T, D, BrdColWrap>, |
||||
linearColumnFilter_caller< 3, T, D, BrdColWrap>, |
||||
linearColumnFilter_caller< 4, T, D, BrdColWrap>, |
||||
linearColumnFilter_caller< 5, T, D, BrdColWrap>, |
||||
linearColumnFilter_caller< 6, T, D, BrdColWrap>, |
||||
linearColumnFilter_caller< 7, T, D, BrdColWrap>, |
||||
linearColumnFilter_caller< 8, T, D, BrdColWrap>, |
||||
linearColumnFilter_caller< 9, T, D, BrdColWrap>, |
||||
linearColumnFilter_caller<10, T, D, BrdColWrap>, |
||||
linearColumnFilter_caller<11, T, D, BrdColWrap>, |
||||
linearColumnFilter_caller<12, T, D, BrdColWrap>, |
||||
linearColumnFilter_caller<13, T, D, BrdColWrap>, |
||||
linearColumnFilter_caller<14, T, D, BrdColWrap>, |
||||
linearColumnFilter_caller<15, T, D, BrdColWrap>, |
||||
linearColumnFilter_caller<16, T, D, BrdColWrap>, |
||||
linearColumnFilter_caller<17, T, D, BrdColWrap>, |
||||
linearColumnFilter_caller<18, T, D, BrdColWrap>, |
||||
linearColumnFilter_caller<19, T, D, BrdColWrap>, |
||||
linearColumnFilter_caller<20, T, D, BrdColWrap>, |
||||
linearColumnFilter_caller<21, T, D, BrdColWrap>, |
||||
linearColumnFilter_caller<22, T, D, BrdColWrap>, |
||||
linearColumnFilter_caller<23, T, D, BrdColWrap>, |
||||
linearColumnFilter_caller<24, T, D, BrdColWrap>, |
||||
linearColumnFilter_caller<25, T, D, BrdColWrap>, |
||||
linearColumnFilter_caller<26, T, D, BrdColWrap>, |
||||
linearColumnFilter_caller<27, T, D, BrdColWrap>, |
||||
linearColumnFilter_caller<28, T, D, BrdColWrap>, |
||||
linearColumnFilter_caller<29, T, D, BrdColWrap>, |
||||
linearColumnFilter_caller<30, T, D, BrdColWrap>, |
||||
linearColumnFilter_caller<31, T, D, BrdColWrap>, |
||||
linearColumnFilter_caller<32, T, D, BrdColWrap> |
||||
} |
||||
}; |
||||
|
||||
loadKernel(kernel, ksize, stream); |
||||
|
||||
callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream); |
||||
} |
||||
|
||||
template void linearColumnFilter_gpu<float , uchar >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
template void linearColumnFilter_gpu<float3, uchar3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
template void linearColumnFilter_gpu<float4, uchar4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
template void linearColumnFilter_gpu<float3, short3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
template void linearColumnFilter_gpu<float , int >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
template void linearColumnFilter_gpu<float , float >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
template void linearColumnFilter_gpu<float3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
template void linearColumnFilter_gpu<float4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
} // namespace column_filter |
||||
}}} // namespace cv { namespace gpu { namespace device |
||||
|
||||
|
||||
#endif /* CUDA_DISABLER */ |
@ -0,0 +1,373 @@ |
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#include "opencv2/gpu/device/common.hpp" |
||||
#include "opencv2/gpu/device/saturate_cast.hpp" |
||||
#include "opencv2/gpu/device/vec_math.hpp" |
||||
#include "opencv2/gpu/device/border_interpolate.hpp" |
||||
|
||||
using namespace cv::gpu; |
||||
using namespace cv::gpu::device; |
||||
|
||||
namespace column_filter |
||||
{ |
||||
#define MAX_KERNEL_SIZE 32 |
||||
|
||||
__constant__ float c_kernel[MAX_KERNEL_SIZE]; |
||||
|
||||
template <int KSIZE, typename T, typename D, typename B> |
||||
__global__ void linearColumnFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd) |
||||
{ |
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) |
||||
const int BLOCK_DIM_X = 16; |
||||
const int BLOCK_DIM_Y = 16; |
||||
const int PATCH_PER_BLOCK = 4; |
||||
const int HALO_SIZE = KSIZE <= 16 ? 1 : 2; |
||||
#else |
||||
const int BLOCK_DIM_X = 16; |
||||
const int BLOCK_DIM_Y = 8; |
||||
const int PATCH_PER_BLOCK = 2; |
||||
const int HALO_SIZE = 2; |
||||
#endif |
||||
|
||||
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t; |
||||
|
||||
__shared__ sum_t smem[(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Y][BLOCK_DIM_X]; |
||||
|
||||
const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x; |
||||
|
||||
if (x >= src.cols) |
||||
return; |
||||
|
||||
const T* src_col = src.ptr() + x; |
||||
|
||||
const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y; |
||||
|
||||
if (blockIdx.y > 0) |
||||
{ |
||||
//Upper halo
|
||||
#pragma unroll |
||||
for (int j = 0; j < HALO_SIZE; ++j) |
||||
smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, x)); |
||||
} |
||||
else |
||||
{ |
||||
//Upper halo
|
||||
#pragma unroll |
||||
for (int j = 0; j < HALO_SIZE; ++j) |
||||
smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step)); |
||||
} |
||||
|
||||
if (blockIdx.y + 2 < gridDim.y) |
||||
{ |
||||
//Main data
|
||||
#pragma unroll |
||||
for (int j = 0; j < PATCH_PER_BLOCK; ++j) |
||||
smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + j * BLOCK_DIM_Y, x)); |
||||
|
||||
//Lower halo
|
||||
#pragma unroll |
||||
for (int j = 0; j < HALO_SIZE; ++j) |
||||
smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, x)); |
||||
} |
||||
else |
||||
{ |
||||
//Main data
|
||||
#pragma unroll |
||||
for (int j = 0; j < PATCH_PER_BLOCK; ++j) |
||||
smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step)); |
||||
|
||||
//Lower halo
|
||||
#pragma unroll |
||||
for (int j = 0; j < HALO_SIZE; ++j) |
||||
smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step)); |
||||
} |
||||
|
||||
__syncthreads(); |
||||
|
||||
#pragma unroll |
||||
for (int j = 0; j < PATCH_PER_BLOCK; ++j) |
||||
{ |
||||
const int y = yStart + j * BLOCK_DIM_Y; |
||||
|
||||
if (y < src.rows) |
||||
{ |
||||
sum_t sum = VecTraits<sum_t>::all(0); |
||||
|
||||
#pragma unroll |
||||
for (int k = 0; k < KSIZE; ++k) |
||||
sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k]; |
||||
|
||||
dst(y, x) = saturate_cast<D>(sum); |
||||
} |
||||
} |
||||
} |
||||
|
||||
template <int KSIZE, typename T, typename D, template<typename> class B> |
||||
void caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream) |
||||
{ |
||||
int BLOCK_DIM_X; |
||||
int BLOCK_DIM_Y; |
||||
int PATCH_PER_BLOCK; |
||||
|
||||
if (cc >= 20) |
||||
{ |
||||
BLOCK_DIM_X = 16; |
||||
BLOCK_DIM_Y = 16; |
||||
PATCH_PER_BLOCK = 4; |
||||
} |
||||
else |
||||
{ |
||||
BLOCK_DIM_X = 16; |
||||
BLOCK_DIM_Y = 8; |
||||
PATCH_PER_BLOCK = 2; |
||||
} |
||||
|
||||
const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); |
||||
const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK)); |
||||
|
||||
B<T> brd(src.rows); |
||||
|
||||
linearColumnFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd); |
||||
|
||||
cudaSafeCall( cudaGetLastError() ); |
||||
|
||||
if (stream == 0) |
||||
cudaSafeCall( cudaDeviceSynchronize() ); |
||||
} |
||||
} |
||||
|
||||
namespace filter |
||||
{ |
||||
template <typename T, typename D> |
||||
void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream) |
||||
{ |
||||
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream); |
||||
|
||||
static const caller_t callers[5][33] = |
||||
{ |
||||
{ |
||||
0, |
||||
column_filter::caller< 1, T, D, BrdColReflect101>, |
||||
column_filter::caller< 2, T, D, BrdColReflect101>, |
||||
column_filter::caller< 3, T, D, BrdColReflect101>, |
||||
column_filter::caller< 4, T, D, BrdColReflect101>, |
||||
column_filter::caller< 5, T, D, BrdColReflect101>, |
||||
column_filter::caller< 6, T, D, BrdColReflect101>, |
||||
column_filter::caller< 7, T, D, BrdColReflect101>, |
||||
column_filter::caller< 8, T, D, BrdColReflect101>, |
||||
column_filter::caller< 9, T, D, BrdColReflect101>, |
||||
column_filter::caller<10, T, D, BrdColReflect101>, |
||||
column_filter::caller<11, T, D, BrdColReflect101>, |
||||
column_filter::caller<12, T, D, BrdColReflect101>, |
||||
column_filter::caller<13, T, D, BrdColReflect101>, |
||||
column_filter::caller<14, T, D, BrdColReflect101>, |
||||
column_filter::caller<15, T, D, BrdColReflect101>, |
||||
column_filter::caller<16, T, D, BrdColReflect101>, |
||||
column_filter::caller<17, T, D, BrdColReflect101>, |
||||
column_filter::caller<18, T, D, BrdColReflect101>, |
||||
column_filter::caller<19, T, D, BrdColReflect101>, |
||||
column_filter::caller<20, T, D, BrdColReflect101>, |
||||
column_filter::caller<21, T, D, BrdColReflect101>, |
||||
column_filter::caller<22, T, D, BrdColReflect101>, |
||||
column_filter::caller<23, T, D, BrdColReflect101>, |
||||
column_filter::caller<24, T, D, BrdColReflect101>, |
||||
column_filter::caller<25, T, D, BrdColReflect101>, |
||||
column_filter::caller<26, T, D, BrdColReflect101>, |
||||
column_filter::caller<27, T, D, BrdColReflect101>, |
||||
column_filter::caller<28, T, D, BrdColReflect101>, |
||||
column_filter::caller<29, T, D, BrdColReflect101>, |
||||
column_filter::caller<30, T, D, BrdColReflect101>, |
||||
column_filter::caller<31, T, D, BrdColReflect101>, |
||||
column_filter::caller<32, T, D, BrdColReflect101> |
||||
}, |
||||
{ |
||||
0, |
||||
column_filter::caller< 1, T, D, BrdColReplicate>, |
||||
column_filter::caller< 2, T, D, BrdColReplicate>, |
||||
column_filter::caller< 3, T, D, BrdColReplicate>, |
||||
column_filter::caller< 4, T, D, BrdColReplicate>, |
||||
column_filter::caller< 5, T, D, BrdColReplicate>, |
||||
column_filter::caller< 6, T, D, BrdColReplicate>, |
||||
column_filter::caller< 7, T, D, BrdColReplicate>, |
||||
column_filter::caller< 8, T, D, BrdColReplicate>, |
||||
column_filter::caller< 9, T, D, BrdColReplicate>, |
||||
column_filter::caller<10, T, D, BrdColReplicate>, |
||||
column_filter::caller<11, T, D, BrdColReplicate>, |
||||
column_filter::caller<12, T, D, BrdColReplicate>, |
||||
column_filter::caller<13, T, D, BrdColReplicate>, |
||||
column_filter::caller<14, T, D, BrdColReplicate>, |
||||
column_filter::caller<15, T, D, BrdColReplicate>, |
||||
column_filter::caller<16, T, D, BrdColReplicate>, |
||||
column_filter::caller<17, T, D, BrdColReplicate>, |
||||
column_filter::caller<18, T, D, BrdColReplicate>, |
||||
column_filter::caller<19, T, D, BrdColReplicate>, |
||||
column_filter::caller<20, T, D, BrdColReplicate>, |
||||
column_filter::caller<21, T, D, BrdColReplicate>, |
||||
column_filter::caller<22, T, D, BrdColReplicate>, |
||||
column_filter::caller<23, T, D, BrdColReplicate>, |
||||
column_filter::caller<24, T, D, BrdColReplicate>, |
||||
column_filter::caller<25, T, D, BrdColReplicate>, |
||||
column_filter::caller<26, T, D, BrdColReplicate>, |
||||
column_filter::caller<27, T, D, BrdColReplicate>, |
||||
column_filter::caller<28, T, D, BrdColReplicate>, |
||||
column_filter::caller<29, T, D, BrdColReplicate>, |
||||
column_filter::caller<30, T, D, BrdColReplicate>, |
||||
column_filter::caller<31, T, D, BrdColReplicate>, |
||||
column_filter::caller<32, T, D, BrdColReplicate> |
||||
}, |
||||
{ |
||||
0, |
||||
column_filter::caller< 1, T, D, BrdColConstant>, |
||||
column_filter::caller< 2, T, D, BrdColConstant>, |
||||
column_filter::caller< 3, T, D, BrdColConstant>, |
||||
column_filter::caller< 4, T, D, BrdColConstant>, |
||||
column_filter::caller< 5, T, D, BrdColConstant>, |
||||
column_filter::caller< 6, T, D, BrdColConstant>, |
||||
column_filter::caller< 7, T, D, BrdColConstant>, |
||||
column_filter::caller< 8, T, D, BrdColConstant>, |
||||
column_filter::caller< 9, T, D, BrdColConstant>, |
||||
column_filter::caller<10, T, D, BrdColConstant>, |
||||
column_filter::caller<11, T, D, BrdColConstant>, |
||||
column_filter::caller<12, T, D, BrdColConstant>, |
||||
column_filter::caller<13, T, D, BrdColConstant>, |
||||
column_filter::caller<14, T, D, BrdColConstant>, |
||||
column_filter::caller<15, T, D, BrdColConstant>, |
||||
column_filter::caller<16, T, D, BrdColConstant>, |
||||
column_filter::caller<17, T, D, BrdColConstant>, |
||||
column_filter::caller<18, T, D, BrdColConstant>, |
||||
column_filter::caller<19, T, D, BrdColConstant>, |
||||
column_filter::caller<20, T, D, BrdColConstant>, |
||||
column_filter::caller<21, T, D, BrdColConstant>, |
||||
column_filter::caller<22, T, D, BrdColConstant>, |
||||
column_filter::caller<23, T, D, BrdColConstant>, |
||||
column_filter::caller<24, T, D, BrdColConstant>, |
||||
column_filter::caller<25, T, D, BrdColConstant>, |
||||
column_filter::caller<26, T, D, BrdColConstant>, |
||||
column_filter::caller<27, T, D, BrdColConstant>, |
||||
column_filter::caller<28, T, D, BrdColConstant>, |
||||
column_filter::caller<29, T, D, BrdColConstant>, |
||||
column_filter::caller<30, T, D, BrdColConstant>, |
||||
column_filter::caller<31, T, D, BrdColConstant>, |
||||
column_filter::caller<32, T, D, BrdColConstant> |
||||
}, |
||||
{ |
||||
0, |
||||
column_filter::caller< 1, T, D, BrdColReflect>, |
||||
column_filter::caller< 2, T, D, BrdColReflect>, |
||||
column_filter::caller< 3, T, D, BrdColReflect>, |
||||
column_filter::caller< 4, T, D, BrdColReflect>, |
||||
column_filter::caller< 5, T, D, BrdColReflect>, |
||||
column_filter::caller< 6, T, D, BrdColReflect>, |
||||
column_filter::caller< 7, T, D, BrdColReflect>, |
||||
column_filter::caller< 8, T, D, BrdColReflect>, |
||||
column_filter::caller< 9, T, D, BrdColReflect>, |
||||
column_filter::caller<10, T, D, BrdColReflect>, |
||||
column_filter::caller<11, T, D, BrdColReflect>, |
||||
column_filter::caller<12, T, D, BrdColReflect>, |
||||
column_filter::caller<13, T, D, BrdColReflect>, |
||||
column_filter::caller<14, T, D, BrdColReflect>, |
||||
column_filter::caller<15, T, D, BrdColReflect>, |
||||
column_filter::caller<16, T, D, BrdColReflect>, |
||||
column_filter::caller<17, T, D, BrdColReflect>, |
||||
column_filter::caller<18, T, D, BrdColReflect>, |
||||
column_filter::caller<19, T, D, BrdColReflect>, |
||||
column_filter::caller<20, T, D, BrdColReflect>, |
||||
column_filter::caller<21, T, D, BrdColReflect>, |
||||
column_filter::caller<22, T, D, BrdColReflect>, |
||||
column_filter::caller<23, T, D, BrdColReflect>, |
||||
column_filter::caller<24, T, D, BrdColReflect>, |
||||
column_filter::caller<25, T, D, BrdColReflect>, |
||||
column_filter::caller<26, T, D, BrdColReflect>, |
||||
column_filter::caller<27, T, D, BrdColReflect>, |
||||
column_filter::caller<28, T, D, BrdColReflect>, |
||||
column_filter::caller<29, T, D, BrdColReflect>, |
||||
column_filter::caller<30, T, D, BrdColReflect>, |
||||
column_filter::caller<31, T, D, BrdColReflect>, |
||||
column_filter::caller<32, T, D, BrdColReflect> |
||||
}, |
||||
{ |
||||
0, |
||||
column_filter::caller< 1, T, D, BrdColWrap>, |
||||
column_filter::caller< 2, T, D, BrdColWrap>, |
||||
column_filter::caller< 3, T, D, BrdColWrap>, |
||||
column_filter::caller< 4, T, D, BrdColWrap>, |
||||
column_filter::caller< 5, T, D, BrdColWrap>, |
||||
column_filter::caller< 6, T, D, BrdColWrap>, |
||||
column_filter::caller< 7, T, D, BrdColWrap>, |
||||
column_filter::caller< 8, T, D, BrdColWrap>, |
||||
column_filter::caller< 9, T, D, BrdColWrap>, |
||||
column_filter::caller<10, T, D, BrdColWrap>, |
||||
column_filter::caller<11, T, D, BrdColWrap>, |
||||
column_filter::caller<12, T, D, BrdColWrap>, |
||||
column_filter::caller<13, T, D, BrdColWrap>, |
||||
column_filter::caller<14, T, D, BrdColWrap>, |
||||
column_filter::caller<15, T, D, BrdColWrap>, |
||||
column_filter::caller<16, T, D, BrdColWrap>, |
||||
column_filter::caller<17, T, D, BrdColWrap>, |
||||
column_filter::caller<18, T, D, BrdColWrap>, |
||||
column_filter::caller<19, T, D, BrdColWrap>, |
||||
column_filter::caller<20, T, D, BrdColWrap>, |
||||
column_filter::caller<21, T, D, BrdColWrap>, |
||||
column_filter::caller<22, T, D, BrdColWrap>, |
||||
column_filter::caller<23, T, D, BrdColWrap>, |
||||
column_filter::caller<24, T, D, BrdColWrap>, |
||||
column_filter::caller<25, T, D, BrdColWrap>, |
||||
column_filter::caller<26, T, D, BrdColWrap>, |
||||
column_filter::caller<27, T, D, BrdColWrap>, |
||||
column_filter::caller<28, T, D, BrdColWrap>, |
||||
column_filter::caller<29, T, D, BrdColWrap>, |
||||
column_filter::caller<30, T, D, BrdColWrap>, |
||||
column_filter::caller<31, T, D, BrdColWrap>, |
||||
column_filter::caller<32, T, D, BrdColWrap> |
||||
} |
||||
}; |
||||
|
||||
if (stream == 0) |
||||
cudaSafeCall( cudaMemcpyToSymbol(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) ); |
||||
else |
||||
cudaSafeCall( cudaMemcpyToSymbolAsync(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) ); |
||||
|
||||
callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream); |
||||
} |
||||
} |
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,563 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
||||
// Copyright (C) 2008-2012, Willow Garage Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors "as is" and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#include <opencv2/gpu/device/common.hpp> |
||||
#include <opencv2/gpu/device/saturate_cast.hpp> |
||||
|
||||
#include <icf.hpp> |
||||
#include <float.h> |
||||
#include <stdio.h> |
||||
|
||||
namespace cv { namespace gpu { namespace device { |
||||
namespace icf { |
||||
|
||||
template <int FACTOR> |
||||
__device__ __forceinline__ uchar shrink(const uchar* ptr, const int pitch, const int y, const int x) |
||||
{ |
||||
int out = 0; |
||||
#pragma unroll |
||||
for(int dy = 0; dy < FACTOR; ++dy) |
||||
#pragma unroll |
||||
for(int dx = 0; dx < FACTOR; ++dx) |
||||
{ |
||||
out += ptr[dy * pitch + dx]; |
||||
} |
||||
|
||||
return static_cast<uchar>(out / (FACTOR * FACTOR)); |
||||
} |
||||
|
||||
template<int FACTOR> |
||||
__global__ void shrink(const uchar* __restrict__ hogluv, const int inPitch, |
||||
uchar* __restrict__ shrank, const int outPitch ) |
||||
{ |
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y; |
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x; |
||||
|
||||
const uchar* ptr = hogluv + (FACTOR * y) * inPitch + (FACTOR * x); |
||||
|
||||
shrank[ y * outPitch + x] = shrink<FACTOR>(ptr, inPitch, y, x); |
||||
} |
||||
|
||||
void shrink(const cv::gpu::PtrStepSzb& channels, cv::gpu::PtrStepSzb shrunk) |
||||
{ |
||||
dim3 block(32, 8); |
||||
dim3 grid(shrunk.cols / 32, shrunk.rows / 8); |
||||
shrink<4><<<grid, block>>>((uchar*)channels.ptr(), channels.step, (uchar*)shrunk.ptr(), shrunk.step); |
||||
cudaSafeCall(cudaDeviceSynchronize()); |
||||
} |
||||
|
||||
__device__ __forceinline__ void luv(const float& b, const float& g, const float& r, uchar& __l, uchar& __u, uchar& __v) |
||||
{ |
||||
// rgb -> XYZ |
||||
float x = 0.412453f * r + 0.357580f * g + 0.180423f * b; |
||||
float y = 0.212671f * r + 0.715160f * g + 0.072169f * b; |
||||
float z = 0.019334f * r + 0.119193f * g + 0.950227f * b; |
||||
|
||||
// computed for D65 |
||||
const float _ur = 0.19783303699678276f; |
||||
const float _vr = 0.46833047435252234f; |
||||
|
||||
const float divisor = fmax((x + 15.f * y + 3.f * z), FLT_EPSILON); |
||||
const float _u = __fdividef(4.f * x, divisor); |
||||
const float _v = __fdividef(9.f * y, divisor); |
||||
|
||||
float hack = static_cast<float>(__float2int_rn(y * 2047)) / 2047; |
||||
const float L = fmax(0.f, ((116.f * cbrtf(hack)) - 16.f)); |
||||
const float U = 13.f * L * (_u - _ur); |
||||
const float V = 13.f * L * (_v - _vr); |
||||
|
||||
// L in [0, 100], u in [-134, 220], v in [-140, 122] |
||||
__l = static_cast<uchar>( L * (255.f / 100.f)); |
||||
__u = static_cast<uchar>((U + 134.f) * (255.f / (220.f + 134.f ))); |
||||
__v = static_cast<uchar>((V + 140.f) * (255.f / (122.f + 140.f ))); |
||||
} |
||||
|
||||
__global__ void bgr2Luv_d(const uchar* rgb, const int rgbPitch, uchar* luvg, const int luvgPitch) |
||||
{ |
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y; |
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x; |
||||
|
||||
uchar3 color = ((uchar3*)(rgb + rgbPitch * y))[x]; |
||||
uchar l, u, v; |
||||
luv(color.x / 255.f, color.y / 255.f, color.z / 255.f, l, u, v); |
||||
|
||||
luvg[luvgPitch * y + x] = l; |
||||
luvg[luvgPitch * (y + 480) + x] = u; |
||||
luvg[luvgPitch * (y + 2 * 480) + x] = v; |
||||
} |
||||
|
||||
void bgr2Luv(const PtrStepSzb& bgr, PtrStepSzb luv) |
||||
{ |
||||
dim3 block(32, 8); |
||||
dim3 grid(bgr.cols / 32, bgr.rows / 8); |
||||
|
||||
bgr2Luv_d<<<grid, block>>>((const uchar*)bgr.ptr(0), bgr.step, (uchar*)luv.ptr(0), luv.step); |
||||
|
||||
cudaSafeCall(cudaDeviceSynchronize()); |
||||
} |
||||
|
||||
template<bool isDefaultNum> |
||||
__device__ __forceinline__ int fast_angle_bin(const float& dx, const float& dy) |
||||
{ |
||||
const float angle_quantum = CV_PI / 6.f; |
||||
float angle = atan2(dx, dy) + (angle_quantum / 2.f); |
||||
|
||||
if (angle < 0) angle += CV_PI; |
||||
|
||||
const float angle_scaling = 1.f / angle_quantum; |
||||
return static_cast<int>(angle * angle_scaling) % 6; |
||||
} |
||||
|
||||
template<> |
||||
__device__ __forceinline__ int fast_angle_bin<true>(const float& dy, const float& dx) |
||||
{ |
||||
int index = 0; |
||||
|
||||
float max_dot = fabs(dx); |
||||
|
||||
{ |
||||
const float dot_product = fabs(dx * 0.8660254037844386f + dy * 0.5f); |
||||
|
||||
if(dot_product > max_dot) |
||||
{ |
||||
max_dot = dot_product; |
||||
index = 1; |
||||
} |
||||
} |
||||
{ |
||||
const float dot_product = fabs(dy * 0.8660254037844386f + dx * 0.5f); |
||||
|
||||
if(dot_product > max_dot) |
||||
{ |
||||
max_dot = dot_product; |
||||
index = 2; |
||||
} |
||||
} |
||||
{ |
||||
int i = 3; |
||||
float2 bin_vector_i; |
||||
bin_vector_i.x = ::cos(i * (CV_PI / 6.f)); |
||||
bin_vector_i.y = ::sin(i * (CV_PI / 6.f)); |
||||
|
||||
const float dot_product = fabs(dx * bin_vector_i.x + dy * bin_vector_i.y); |
||||
if(dot_product > max_dot) |
||||
{ |
||||
max_dot = dot_product; |
||||
index = i; |
||||
} |
||||
} |
||||
{ |
||||
const float dot_product = fabs(dx * (-0.4999999999999998f) + dy * 0.8660254037844387f); |
||||
if(dot_product > max_dot) |
||||
{ |
||||
max_dot = dot_product; |
||||
index = 4; |
||||
} |
||||
} |
||||
{ |
||||
const float dot_product = fabs(dx * (-0.8660254037844387f) + dy * 0.49999999999999994f); |
||||
if(dot_product > max_dot) |
||||
{ |
||||
max_dot = dot_product; |
||||
index = 5; |
||||
} |
||||
} |
||||
return index; |
||||
} |
||||
|
||||
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tgray; |
||||
|
||||
template<bool isDefaultNum> |
||||
__global__ void gray2hog(PtrStepSzb mag) |
||||
{ |
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x; |
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y; |
||||
|
||||
const float dx = tex2D(tgray, x + 1, y + 0) - tex2D(tgray, x - 1, y - 0); |
||||
const float dy = tex2D(tgray, x + 0, y + 1) - tex2D(tgray, x - 0, y - 1); |
||||
|
||||
const float magnitude = sqrtf((dx * dx) + (dy * dy)) * (1.0f / sqrtf(2)); |
||||
const uchar cmag = static_cast<uchar>(magnitude); |
||||
|
||||
mag( 480 * 6 + y, x) = cmag; |
||||
mag( 480 * fast_angle_bin<isDefaultNum>(dy, dx) + y, x) = cmag; |
||||
} |
||||
|
||||
void gray2hog(const PtrStepSzb& gray, PtrStepSzb mag, const int bins) |
||||
{ |
||||
dim3 block(32, 8); |
||||
dim3 grid(gray.cols / 32, gray.rows / 8); |
||||
|
||||
cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar>(); |
||||
cudaSafeCall( cudaBindTexture2D(0, tgray, gray.data, desc, gray.cols, gray.rows, gray.step) ); |
||||
|
||||
if (bins == 6) |
||||
gray2hog<true><<<grid, block>>>(mag); |
||||
else |
||||
gray2hog<false><<<grid, block>>>(mag); |
||||
|
||||
cudaSafeCall(cudaDeviceSynchronize()); |
||||
} |
||||
|
||||
// ToDo: use textures or uncached load instruction. |
||||
__global__ void magToHist(const uchar* __restrict__ mag, |
||||
const float* __restrict__ angle, const int angPitch, |
||||
uchar* __restrict__ hog, const int hogPitch, const int fh) |
||||
{ |
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y; |
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x; |
||||
|
||||
const int bin = (int)(angle[y * angPitch + x]); |
||||
const uchar val = mag[y * hogPitch + x]; |
||||
hog[((fh * bin) + y) * hogPitch + x] = val; |
||||
} |
||||
|
||||
void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle, |
||||
const int fw, const int fh, const int bins, cudaStream_t stream ) |
||||
{ |
||||
const uchar* mag = (const uchar*)hogluv.ptr(fh * bins); |
||||
uchar* hog = (uchar*)hogluv.ptr(); |
||||
const float* angle = (const float*)nangle.ptr(); |
||||
|
||||
dim3 block(32, 8); |
||||
dim3 grid(fw / 32, fh / 8); |
||||
|
||||
magToHist<<<grid, block, 0, stream>>>(mag, angle, nangle.step / sizeof(float), hog, hogluv.step, fh); |
||||
if (!stream) |
||||
{ |
||||
cudaSafeCall( cudaGetLastError() ); |
||||
cudaSafeCall( cudaDeviceSynchronize() ); |
||||
} |
||||
} |
||||
|
||||
__device__ __forceinline__ float overlapArea(const Detection &a, const Detection &b) |
||||
{ |
||||
int w = ::min(a.x + a.w, b.x + b.w) - ::max(a.x, b.x); |
||||
int h = ::min(a.y + a.h, b.y + b.h) - ::max(a.y, b.y); |
||||
|
||||
return (w < 0 || h < 0)? 0.f : (float)(w * h); |
||||
} |
||||
|
||||
texture<uint4, cudaTextureType2D, cudaReadModeElementType> tdetections; |
||||
|
||||
__global__ void overlap(const uint* n, uchar* overlaps) |
||||
{ |
||||
const int idx = threadIdx.x; |
||||
const int total = *n; |
||||
|
||||
for (int i = idx + 1; i < total; i += 192) |
||||
{ |
||||
const uint4 _a = tex2D(tdetections, i, 0); |
||||
const Detection& a = *((Detection*)(&_a)); |
||||
bool excluded = false; |
||||
|
||||
for (int j = i + 1; j < total; ++j) |
||||
{ |
||||
const uint4 _b = tex2D(tdetections, j, 0); |
||||
const Detection& b = *((Detection*)(&_b)); |
||||
float ovl = overlapArea(a, b) / ::min(a.w * a.h, b.w * b.h); |
||||
|
||||
if (ovl > 0.65f) |
||||
{ |
||||
int suppessed = (a.confidence > b.confidence)? j : i; |
||||
overlaps[suppessed] = 1; |
||||
excluded = excluded || (suppessed == i); |
||||
} |
||||
|
||||
#if __CUDA_ARCH__ >= 120 |
||||
if (__all(excluded)) break; |
||||
#endif |
||||
} |
||||
} |
||||
} |
||||
|
||||
__global__ void collect(const uint* n, uchar* overlaps, uint* ctr, uint4* suppressed) |
||||
{ |
||||
const int idx = threadIdx.x; |
||||
const int total = *n; |
||||
|
||||
for (int i = idx; i < total; i += 192) |
||||
{ |
||||
if (!overlaps[i]) |
||||
{ |
||||
int oidx = atomicInc(ctr, 50); |
||||
suppressed[oidx] = tex2D(tdetections, i + 1, 0); |
||||
} |
||||
} |
||||
} |
||||
|
||||
void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections, |
||||
PtrStepSzb suppressed, cudaStream_t stream) |
||||
{ |
||||
int block = 192; |
||||
int grid = 1; |
||||
|
||||
cudaChannelFormatDesc desc = cudaCreateChannelDesc<uint4>(); |
||||
size_t offset; |
||||
cudaSafeCall( cudaBindTexture2D(&offset, tdetections, objects.data, desc, objects.cols / sizeof(uint4), objects.rows, objects.step)); |
||||
|
||||
overlap<<<grid, block>>>((uint*)ndetections.ptr(0), (uchar*)overlaps.ptr(0)); |
||||
collect<<<grid, block>>>((uint*)ndetections.ptr(0), (uchar*)overlaps.ptr(0), (uint*)suppressed.ptr(0), ((uint4*)suppressed.ptr(0)) + 1); |
||||
|
||||
if (!stream) |
||||
{ |
||||
cudaSafeCall( cudaGetLastError()); |
||||
cudaSafeCall( cudaDeviceSynchronize()); |
||||
} |
||||
} |
||||
|
||||
template<typename Policy> |
||||
struct PrefixSum |
||||
{ |
||||
__device static void apply(float& impact) |
||||
{ |
||||
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300 |
||||
#pragma unroll |
||||
// scan on shuffl functions |
||||
for (int i = 1; i < Policy::WARP; i *= 2) |
||||
{ |
||||
const float n = __shfl_up(impact, i, Policy::WARP); |
||||
|
||||
if (threadIdx.x >= i) |
||||
impact += n; |
||||
} |
||||
#else |
||||
__shared__ volatile float ptr[Policy::STA_X * Policy::STA_Y]; |
||||
|
||||
const int idx = threadIdx.y * Policy::STA_X + threadIdx.x; |
||||
|
||||
ptr[idx] = impact; |
||||
|
||||
if ( threadIdx.x >= 1) ptr [idx ] = (ptr [idx - 1] + ptr [idx]); |
||||
if ( threadIdx.x >= 2) ptr [idx ] = (ptr [idx - 2] + ptr [idx]); |
||||
if ( threadIdx.x >= 4) ptr [idx ] = (ptr [idx - 4] + ptr [idx]); |
||||
if ( threadIdx.x >= 8) ptr [idx ] = (ptr [idx - 8] + ptr [idx]); |
||||
if ( threadIdx.x >= 16) ptr [idx ] = (ptr [idx - 16] + ptr [idx]); |
||||
|
||||
impact = ptr[idx]; |
||||
#endif |
||||
} |
||||
}; |
||||
|
||||
texture<int, cudaTextureType2D, cudaReadModeElementType> thogluv; |
||||
|
||||
template<bool isUp> |
||||
__device__ __forceinline__ float rescale(const Level& level, Node& node) |
||||
{ |
||||
uchar4& scaledRect = node.rect; |
||||
float relScale = level.relScale; |
||||
float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y); |
||||
|
||||
// rescale |
||||
scaledRect.x = __float2int_rn(relScale * scaledRect.x); |
||||
scaledRect.y = __float2int_rn(relScale * scaledRect.y); |
||||
scaledRect.z = __float2int_rn(relScale * scaledRect.z); |
||||
scaledRect.w = __float2int_rn(relScale * scaledRect.w); |
||||
|
||||
float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y); |
||||
|
||||
const float expected_new_area = farea * relScale * relScale; |
||||
float approx = (sarea == 0)? 1: __fdividef(sarea, expected_new_area); |
||||
|
||||
float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx * level.scaling[(node.threshold >> 28) > 6]; |
||||
|
||||
return rootThreshold; |
||||
} |
||||
|
||||
template<> |
||||
__device__ __forceinline__ float rescale<true>(const Level& level, Node& node) |
||||
{ |
||||
uchar4& scaledRect = node.rect; |
||||
float relScale = level.relScale; |
||||
float farea = scaledRect.z * scaledRect.w; |
||||
|
||||
// rescale |
||||
scaledRect.x = __float2int_rn(relScale * scaledRect.x); |
||||
scaledRect.y = __float2int_rn(relScale * scaledRect.y); |
||||
scaledRect.z = __float2int_rn(relScale * scaledRect.z); |
||||
scaledRect.w = __float2int_rn(relScale * scaledRect.w); |
||||
|
||||
float sarea = scaledRect.z * scaledRect.w; |
||||
|
||||
const float expected_new_area = farea * relScale * relScale; |
||||
float approx = __fdividef(sarea, expected_new_area); |
||||
|
||||
float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx * level.scaling[(node.threshold >> 28) > 6]; |
||||
|
||||
return rootThreshold; |
||||
} |
||||
|
||||
template<bool isUp> |
||||
__device__ __forceinline__ int get(int x, int y, uchar4 area) |
||||
{ |
||||
int a = tex2D(thogluv, x + area.x, y + area.y); |
||||
int b = tex2D(thogluv, x + area.z, y + area.y); |
||||
int c = tex2D(thogluv, x + area.z, y + area.w); |
||||
int d = tex2D(thogluv, x + area.x, y + area.w); |
||||
|
||||
return (a - b + c - d); |
||||
} |
||||
|
||||
template<> |
||||
__device__ __forceinline__ int get<true>(int x, int y, uchar4 area) |
||||
{ |
||||
x += area.x; |
||||
y += area.y; |
||||
int a = tex2D(thogluv, x, y); |
||||
int b = tex2D(thogluv, x + area.z, y); |
||||
int c = tex2D(thogluv, x + area.z, y + area.w); |
||||
int d = tex2D(thogluv, x, y + area.w); |
||||
|
||||
return (a - b + c - d); |
||||
} |
||||
|
||||
texture<float2, cudaTextureType2D, cudaReadModeElementType> troi; |
||||
|
||||
template<typename Policy> |
||||
template<bool isUp> |
||||
__device void CascadeInvoker<Policy>::detect(Detection* objects, const uint ndetections, uint* ctr, const int downscales) const |
||||
{ |
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y; |
||||
const int x = blockIdx.x; |
||||
|
||||
// load Lavel |
||||
__shared__ Level level; |
||||
|
||||
// check POI |
||||
__shared__ volatile char roiCache[Policy::STA_Y]; |
||||
|
||||
if (!threadIdx.y && !threadIdx.x) |
||||
((float2*)roiCache)[threadIdx.x] = tex2D(troi, blockIdx.y, x); |
||||
|
||||
__syncthreads(); |
||||
|
||||
if (!roiCache[threadIdx.y]) return; |
||||
|
||||
if (!threadIdx.x) |
||||
level = levels[downscales + blockIdx.z]; |
||||
|
||||
if(x >= level.workRect.x || y >= level.workRect.y) return; |
||||
|
||||
int st = level.octave * level.step; |
||||
const int stEnd = st + level.step; |
||||
|
||||
const int hogluvStep = gridDim.y * Policy::STA_Y; |
||||
float confidence = 0.f; |
||||
for(; st < stEnd; st += Policy::WARP) |
||||
{ |
||||
const int nId = (st + threadIdx.x) * 3; |
||||
|
||||
Node node = nodes[nId]; |
||||
|
||||
float threshold = rescale<isUp>(level, node); |
||||
int sum = get<isUp>(x, y + (node.threshold >> 28) * hogluvStep, node.rect); |
||||
|
||||
int next = 1 + (int)(sum >= threshold); |
||||
|
||||
node = nodes[nId + next]; |
||||
threshold = rescale<isUp>(level, node); |
||||
sum = get<isUp>(x, y + (node.threshold >> 28) * hogluvStep, node.rect); |
||||
|
||||
const int lShift = (next - 1) * 2 + (int)(sum >= threshold); |
||||
float impact = leaves[(st + threadIdx.x) * 4 + lShift]; |
||||
|
||||
PrefixSum<Policy>::apply(impact); |
||||
confidence += impact; |
||||
|
||||
#if __CUDA_ARCH__ >= 120 |
||||
if(__any((confidence <= stages[(st + threadIdx.x)]))) st += 2048; |
||||
#endif |
||||
} |
||||
|
||||
if(!threadIdx.x && st == stEnd && ((confidence - FLT_EPSILON) >= 0)) |
||||
{ |
||||
int idx = atomicInc(ctr, ndetections); |
||||
objects[idx] = Detection(__float2int_rn(x * Policy::SHRINKAGE), |
||||
__float2int_rn(y * Policy::SHRINKAGE), level.objSize.x, level.objSize.y, confidence); |
||||
} |
||||
} |
||||
|
||||
template<typename Policy, bool isUp> |
||||
__global__ void soft_cascade(const CascadeInvoker<Policy> invoker, Detection* objects, const uint n, uint* ctr, const int downs) |
||||
{ |
||||
invoker.template detect<isUp>(objects, n, ctr, downs); |
||||
} |
||||
|
||||
template<typename Policy> |
||||
void CascadeInvoker<Policy>::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, |
||||
PtrStepSz<uchar4> objects, const int downscales, const cudaStream_t& stream) const |
||||
{ |
||||
int fw = roi.rows; |
||||
int fh = roi.cols; |
||||
|
||||
dim3 grid(fw, fh / Policy::STA_Y, downscales); |
||||
|
||||
uint* ctr = (uint*)(objects.ptr(0)); |
||||
Detection* det = ((Detection*)objects.ptr(0)) + 1; |
||||
uint max_det = objects.cols / sizeof(Detection); |
||||
|
||||
cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>(); |
||||
cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step)); |
||||
|
||||
cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<typename Policy::roi_type>(); |
||||
cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / Policy::STA_Y, roi.rows, roi.step)); |
||||
|
||||
const CascadeInvoker<Policy> inv = *this; |
||||
|
||||
soft_cascade<Policy, false><<<grid, Policy::block(), 0, stream>>>(inv, det, max_det, ctr, 0); |
||||
cudaSafeCall( cudaGetLastError()); |
||||
|
||||
grid = dim3(fw, fh / Policy::STA_Y, scales - downscales); |
||||
soft_cascade<Policy, true><<<grid, Policy::block(), 0, stream>>>(inv, det, max_det, ctr, downscales); |
||||
|
||||
if (!stream) |
||||
{ |
||||
cudaSafeCall( cudaGetLastError()); |
||||
cudaSafeCall( cudaDeviceSynchronize()); |
||||
} |
||||
} |
||||
|
||||
template void CascadeInvoker<GK107PolicyX4>::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, |
||||
PtrStepSz<uchar4> objects, const int downscales, const cudaStream_t& stream) const; |
||||
|
||||
} |
||||
}}} |
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,414 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors "as is" and |
||||
// any express or bpied warranties, including, but not limited to, the bpied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if !defined CUDA_DISABLER |
||||
|
||||
#include "opencv2/gpu/device/common.hpp" |
||||
#include "opencv2/gpu/device/limits.hpp" |
||||
#include "opencv2/gpu/device/functional.hpp" |
||||
#include "opencv2/gpu/device/reduce.hpp" |
||||
|
||||
using namespace cv::gpu; |
||||
using namespace cv::gpu::device; |
||||
|
||||
namespace optflowbm |
||||
{ |
||||
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_prev(false, cudaFilterModePoint, cudaAddressModeClamp); |
||||
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_curr(false, cudaFilterModePoint, cudaAddressModeClamp); |
||||
|
||||
__device__ int cmpBlocks(int X1, int Y1, int X2, int Y2, int2 blockSize) |
||||
{ |
||||
int s = 0; |
||||
|
||||
for (int y = 0; y < blockSize.y; ++y) |
||||
{ |
||||
for (int x = 0; x < blockSize.x; ++x) |
||||
s += ::abs(tex2D(tex_prev, X1 + x, Y1 + y) - tex2D(tex_curr, X2 + x, Y2 + y)); |
||||
} |
||||
|
||||
return s; |
||||
} |
||||
|
||||
__global__ void calcOptFlowBM(PtrStepSzf velx, PtrStepf vely, const int2 blockSize, const int2 shiftSize, const bool usePrevious, |
||||
const int maxX, const int maxY, const int acceptLevel, const int escapeLevel, |
||||
const short2* ss, const int ssCount) |
||||
{ |
||||
const int j = blockIdx.x * blockDim.x + threadIdx.x; |
||||
const int i = blockIdx.y * blockDim.y + threadIdx.y; |
||||
|
||||
if (i >= velx.rows || j >= velx.cols) |
||||
return; |
||||
|
||||
const int X1 = j * shiftSize.x; |
||||
const int Y1 = i * shiftSize.y; |
||||
|
||||
const int offX = usePrevious ? __float2int_rn(velx(i, j)) : 0; |
||||
const int offY = usePrevious ? __float2int_rn(vely(i, j)) : 0; |
||||
|
||||
int X2 = X1 + offX; |
||||
int Y2 = Y1 + offY; |
||||
|
||||
int dist = numeric_limits<int>::max(); |
||||
|
||||
if (0 <= X2 && X2 <= maxX && 0 <= Y2 && Y2 <= maxY) |
||||
dist = cmpBlocks(X1, Y1, X2, Y2, blockSize); |
||||
|
||||
int countMin = 1; |
||||
int sumx = offX; |
||||
int sumy = offY; |
||||
|
||||
if (dist > acceptLevel) |
||||
{ |
||||
// do brute-force search |
||||
for (int k = 0; k < ssCount; ++k) |
||||
{ |
||||
const short2 ssVal = ss[k]; |
||||
|
||||
const int dx = offX + ssVal.x; |
||||
const int dy = offY + ssVal.y; |
||||
|
||||
X2 = X1 + dx; |
||||
Y2 = Y1 + dy; |
||||
|
||||
if (0 <= X2 && X2 <= maxX && 0 <= Y2 && Y2 <= maxY) |
||||
{ |
||||
const int tmpDist = cmpBlocks(X1, Y1, X2, Y2, blockSize); |
||||
if (tmpDist < acceptLevel) |
||||
{ |
||||
sumx = dx; |
||||
sumy = dy; |
||||
countMin = 1; |
||||
break; |
||||
} |
||||
|
||||
if (tmpDist < dist) |
||||
{ |
||||
dist = tmpDist; |
||||
sumx = dx; |
||||
sumy = dy; |
||||
countMin = 1; |
||||
} |
||||
else if (tmpDist == dist) |
||||
{ |
||||
sumx += dx; |
||||
sumy += dy; |
||||
countMin++; |
||||
} |
||||
} |
||||
} |
||||
|
||||
if (dist > escapeLevel) |
||||
{ |
||||
sumx = offX; |
||||
sumy = offY; |
||||
countMin = 1; |
||||
} |
||||
} |
||||
|
||||
velx(i, j) = static_cast<float>(sumx) / countMin; |
||||
vely(i, j) = static_cast<float>(sumy) / countMin; |
||||
} |
||||
|
||||
void calc(PtrStepSzb prev, PtrStepSzb curr, PtrStepSzf velx, PtrStepSzf vely, int2 blockSize, int2 shiftSize, bool usePrevious, |
||||
int maxX, int maxY, int acceptLevel, int escapeLevel, const short2* ss, int ssCount, cudaStream_t stream) |
||||
{ |
||||
bindTexture(&tex_prev, prev); |
||||
bindTexture(&tex_curr, curr); |
||||
|
||||
const dim3 block(32, 8); |
||||
const dim3 grid(divUp(velx.cols, block.x), divUp(vely.rows, block.y)); |
||||
|
||||
calcOptFlowBM<<<grid, block, 0, stream>>>(velx, vely, blockSize, shiftSize, usePrevious, |
||||
maxX, maxY, acceptLevel, escapeLevel, ss, ssCount); |
||||
cudaSafeCall( cudaGetLastError() ); |
||||
|
||||
if (stream == 0) |
||||
cudaSafeCall( cudaDeviceSynchronize() ); |
||||
} |
||||
} |
||||
|
||||
///////////////////////////////////////////////////////// |
||||
// Fast approximate version |
||||
|
||||
namespace optflowbm_fast |
||||
{ |
||||
enum |
||||
{ |
||||
CTA_SIZE = 128, |
||||
|
||||
TILE_COLS = 128, |
||||
TILE_ROWS = 32, |
||||
|
||||
STRIDE = CTA_SIZE |
||||
}; |
||||
|
||||
template <typename T> __device__ __forceinline__ int calcDist(T a, T b) |
||||
{ |
||||
return ::abs(a - b); |
||||
} |
||||
|
||||
template <class T> struct FastOptFlowBM |
||||
{ |
||||
|
||||
int search_radius; |
||||
int block_radius; |
||||
|
||||
int search_window; |
||||
int block_window; |
||||
|
||||
PtrStepSz<T> I0; |
||||
PtrStep<T> I1; |
||||
|
||||
mutable PtrStepi buffer; |
||||
|
||||
FastOptFlowBM(int search_window_, int block_window_, |
||||
PtrStepSz<T> I0_, PtrStepSz<T> I1_, |
||||
PtrStepi buffer_) : |
||||
search_radius(search_window_ / 2), block_radius(block_window_ / 2), |
||||
search_window(search_window_), block_window(block_window_), |
||||
I0(I0_), I1(I1_), |
||||
buffer(buffer_) |
||||
{ |
||||
} |
||||
|
||||
__device__ void initSums_BruteForce(int i, int j, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const |
||||
{ |
||||
for (int index = threadIdx.x; index < search_window * search_window; index += STRIDE) |
||||
{ |
||||
dist_sums[index] = 0; |
||||
|
||||
for (int tx = 0; tx < block_window; ++tx) |
||||
col_sums(tx, index) = 0; |
||||
|
||||
int y = index / search_window; |
||||
int x = index - y * search_window; |
||||
|
||||
int ay = i; |
||||
int ax = j; |
||||
|
||||
int by = i + y - search_radius; |
||||
int bx = j + x - search_radius; |
||||
|
||||
for (int tx = -block_radius; tx <= block_radius; ++tx) |
||||
{ |
||||
int col_sum = 0; |
||||
for (int ty = -block_radius; ty <= block_radius; ++ty) |
||||
{ |
||||
int dist = calcDist(I0(ay + ty, ax + tx), I1(by + ty, bx + tx)); |
||||
|
||||
dist_sums[index] += dist; |
||||
col_sum += dist; |
||||
} |
||||
|
||||
col_sums(tx + block_radius, index) = col_sum; |
||||
} |
||||
|
||||
up_col_sums(j, index) = col_sums(block_window - 1, index); |
||||
} |
||||
} |
||||
|
||||
__device__ void shiftRight_FirstRow(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const |
||||
{ |
||||
for (int index = threadIdx.x; index < search_window * search_window; index += STRIDE) |
||||
{ |
||||
int y = index / search_window; |
||||
int x = index - y * search_window; |
||||
|
||||
int ay = i; |
||||
int ax = j + block_radius; |
||||
|
||||
int by = i + y - search_radius; |
||||
int bx = j + x - search_radius + block_radius; |
||||
|
||||
int col_sum = 0; |
||||
|
||||
for (int ty = -block_radius; ty <= block_radius; ++ty) |
||||
col_sum += calcDist(I0(ay + ty, ax), I1(by + ty, bx)); |
||||
|
||||
dist_sums[index] += col_sum - col_sums(first, index); |
||||
|
||||
col_sums(first, index) = col_sum; |
||||
up_col_sums(j, index) = col_sum; |
||||
} |
||||
} |
||||
|
||||
__device__ void shiftRight_UpSums(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const |
||||
{ |
||||
int ay = i; |
||||
int ax = j + block_radius; |
||||
|
||||
T a_up = I0(ay - block_radius - 1, ax); |
||||
T a_down = I0(ay + block_radius, ax); |
||||
|
||||
for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE) |
||||
{ |
||||
int y = index / search_window; |
||||
int x = index - y * search_window; |
||||
|
||||
int by = i + y - search_radius; |
||||
int bx = j + x - search_radius + block_radius; |
||||
|
||||
T b_up = I1(by - block_radius - 1, bx); |
||||
T b_down = I1(by + block_radius, bx); |
||||
|
||||
int col_sum = up_col_sums(j, index) + calcDist(a_down, b_down) - calcDist(a_up, b_up); |
||||
|
||||
dist_sums[index] += col_sum - col_sums(first, index); |
||||
col_sums(first, index) = col_sum; |
||||
up_col_sums(j, index) = col_sum; |
||||
} |
||||
} |
||||
|
||||
__device__ void convolve_window(int i, int j, const int* dist_sums, float& velx, float& vely) const |
||||
{ |
||||
int bestDist = numeric_limits<int>::max(); |
||||
int bestInd = -1; |
||||
|
||||
for (int index = threadIdx.x; index < search_window * search_window; index += STRIDE) |
||||
{ |
||||
int curDist = dist_sums[index]; |
||||
if (curDist < bestDist) |
||||
{ |
||||
bestDist = curDist; |
||||
bestInd = index; |
||||
} |
||||
} |
||||
|
||||
__shared__ int cta_dist_buffer[CTA_SIZE]; |
||||
__shared__ int cta_ind_buffer[CTA_SIZE]; |
||||
|
||||
reduceKeyVal<CTA_SIZE>(cta_dist_buffer, bestDist, cta_ind_buffer, bestInd, threadIdx.x, less<int>()); |
||||
|
||||
if (threadIdx.x == 0) |
||||
{ |
||||
int y = bestInd / search_window; |
||||
int x = bestInd - y * search_window; |
||||
|
||||
velx = x - search_radius; |
||||
vely = y - search_radius; |
||||
} |
||||
} |
||||
|
||||
__device__ void operator()(PtrStepf velx, PtrStepf vely) const |
||||
{ |
||||
int tbx = blockIdx.x * TILE_COLS; |
||||
int tby = blockIdx.y * TILE_ROWS; |
||||
|
||||
int tex = ::min(tbx + TILE_COLS, I0.cols); |
||||
int tey = ::min(tby + TILE_ROWS, I0.rows); |
||||
|
||||
PtrStepi col_sums; |
||||
col_sums.data = buffer.ptr(I0.cols + blockIdx.x * block_window) + blockIdx.y * search_window * search_window; |
||||
col_sums.step = buffer.step; |
||||
|
||||
PtrStepi up_col_sums; |
||||
up_col_sums.data = buffer.data + blockIdx.y * search_window * search_window; |
||||
up_col_sums.step = buffer.step; |
||||
|
||||
extern __shared__ int dist_sums[]; //search_window * search_window |
||||
|
||||
int first = 0; |
||||
|
||||
for (int i = tby; i < tey; ++i) |
||||
{ |
||||
for (int j = tbx; j < tex; ++j) |
||||
{ |
||||
__syncthreads(); |
||||
|
||||
if (j == tbx) |
||||
{ |
||||
initSums_BruteForce(i, j, dist_sums, col_sums, up_col_sums); |
||||
first = 0; |
||||
} |
||||
else |
||||
{ |
||||
if (i == tby) |
||||
shiftRight_FirstRow(i, j, first, dist_sums, col_sums, up_col_sums); |
||||
else |
||||
shiftRight_UpSums(i, j, first, dist_sums, col_sums, up_col_sums); |
||||
|
||||
first = (first + 1) % block_window; |
||||
} |
||||
|
||||
__syncthreads(); |
||||
|
||||
convolve_window(i, j, dist_sums, velx(i, j), vely(i, j)); |
||||
} |
||||
} |
||||
} |
||||
|
||||
}; |
||||
|
||||
template<typename T> __global__ void optflowbm_fast_kernel(const FastOptFlowBM<T> fbm, PtrStepf velx, PtrStepf vely) |
||||
{ |
||||
fbm(velx, vely); |
||||
} |
||||
|
||||
void get_buffer_size(int src_cols, int src_rows, int search_window, int block_window, int& buffer_cols, int& buffer_rows) |
||||
{ |
||||
dim3 grid(divUp(src_cols, TILE_COLS), divUp(src_rows, TILE_ROWS)); |
||||
|
||||
buffer_cols = search_window * search_window * grid.y; |
||||
buffer_rows = src_cols + block_window * grid.x; |
||||
} |
||||
|
||||
template <typename T> |
||||
void calc(PtrStepSzb I0, PtrStepSzb I1, PtrStepSzf velx, PtrStepSzf vely, PtrStepi buffer, int search_window, int block_window, cudaStream_t stream) |
||||
{ |
||||
FastOptFlowBM<T> fbm(search_window, block_window, I0, I1, buffer); |
||||
|
||||
dim3 block(CTA_SIZE, 1); |
||||
dim3 grid(divUp(I0.cols, TILE_COLS), divUp(I0.rows, TILE_ROWS)); |
||||
|
||||
size_t smem = search_window * search_window * sizeof(int); |
||||
|
||||
optflowbm_fast_kernel<<<grid, block, smem, stream>>>(fbm, velx, vely); |
||||
cudaSafeCall ( cudaGetLastError () ); |
||||
|
||||
if (stream == 0) |
||||
cudaSafeCall( cudaDeviceSynchronize() ); |
||||
} |
||||
|
||||
template void calc<uchar>(PtrStepSzb I0, PtrStepSzb I1, PtrStepSzf velx, PtrStepSzf vely, PtrStepi buffer, int search_window, int block_window, cudaStream_t stream); |
||||
} |
||||
|
||||
#endif // !defined CUDA_DISABLER |
@ -0,0 +1,53 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors "as is" and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if !defined CUDA_DISABLER |
||||
|
||||
#include "row_filter.h" |
||||
|
||||
namespace filter |
||||
{ |
||||
template void linearRow<uchar, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
} |
||||
|
||||
#endif /* CUDA_DISABLER */ |
@ -0,0 +1,53 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors "as is" and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if !defined CUDA_DISABLER |
||||
|
||||
#include "row_filter.h" |
||||
|
||||
namespace filter |
||||
{ |
||||
template void linearRow<uchar3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
} |
||||
|
||||
#endif /* CUDA_DISABLER */ |
@ -0,0 +1,53 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors "as is" and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if !defined CUDA_DISABLER |
||||
|
||||
#include "row_filter.h" |
||||
|
||||
namespace filter |
||||
{ |
||||
template void linearRow<unsigned short, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
} |
||||
|
||||
#endif /* CUDA_DISABLER */ |
@ -0,0 +1,53 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors "as is" and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if !defined CUDA_DISABLER |
||||
|
||||
#include "row_filter.h" |
||||
|
||||
namespace filter |
||||
{ |
||||
template void linearRow<ushort3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
} |
||||
|
||||
#endif /* CUDA_DISABLER */ |
@ -0,0 +1,53 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors "as is" and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if !defined CUDA_DISABLER |
||||
|
||||
#include "row_filter.h" |
||||
|
||||
namespace filter |
||||
{ |
||||
template void linearRow<ushort4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
} |
||||
|
||||
#endif /* CUDA_DISABLER */ |
@ -0,0 +1,53 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors "as is" and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if !defined CUDA_DISABLER |
||||
|
||||
#include "row_filter.h" |
||||
|
||||
namespace filter |
||||
{ |
||||
template void linearRow<int3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
} |
||||
|
||||
#endif /* CUDA_DISABLER */ |
@ -0,0 +1,53 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors "as is" and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if !defined CUDA_DISABLER |
||||
|
||||
#include "row_filter.h" |
||||
|
||||
namespace filter |
||||
{ |
||||
template void linearRow<int4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
} |
||||
|
||||
#endif /* CUDA_DISABLER */ |
@ -0,0 +1,53 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors "as is" and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if !defined CUDA_DISABLER |
||||
|
||||
#include "row_filter.h" |
||||
|
||||
namespace filter |
||||
{ |
||||
template void linearRow<uchar4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
} |
||||
|
||||
#endif /* CUDA_DISABLER */ |
@ -0,0 +1,53 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors "as is" and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if !defined CUDA_DISABLER |
||||
|
||||
#include "row_filter.h" |
||||
|
||||
namespace filter |
||||
{ |
||||
template void linearRow<short3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
} |
||||
|
||||
#endif /* CUDA_DISABLER */ |
@ -0,0 +1,53 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors "as is" and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if !defined CUDA_DISABLER |
||||
|
||||
#include "row_filter.h" |
||||
|
||||
namespace filter |
||||
{ |
||||
template void linearRow<int, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
} |
||||
|
||||
#endif /* CUDA_DISABLER */ |
@ -0,0 +1,53 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors "as is" and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if !defined CUDA_DISABLER |
||||
|
||||
#include "row_filter.h" |
||||
|
||||
namespace filter |
||||
{ |
||||
template void linearRow<float, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
} |
||||
|
||||
#endif /* CUDA_DISABLER */ |
@ -0,0 +1,53 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors "as is" and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if !defined CUDA_DISABLER |
||||
|
||||
#include "row_filter.h" |
||||
|
||||
namespace filter |
||||
{ |
||||
template void linearRow<float3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
} |
||||
|
||||
#endif /* CUDA_DISABLER */ |
@ -0,0 +1,53 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors "as is" and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if !defined CUDA_DISABLER |
||||
|
||||
#include "row_filter.h" |
||||
|
||||
namespace filter |
||||
{ |
||||
template void linearRow<float4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); |
||||
} |
||||
|
||||
#endif /* CUDA_DISABLER */ |
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue