From a80e0cf8bd85cf5aaf4b1746d9d85f0f1f50c88f Mon Sep 17 00:00:00 2001
From: Dan <dtmoodie>
Date: Wed, 9 Sep 2015 11:54:42 -0400
Subject: [PATCH 01/10] Added tutorials for using thrust.

---
 .../gpu/gpu-thrust-interop/CMakeLists.txt     | 10 +++
 .../gpu/gpu-thrust-interop/Thrust_interop.hpp | 74 ++++++++++++++++
 .../gpu/gpu-thrust-interop/main.cu            | 88 +++++++++++++++++++
 3 files changed, 172 insertions(+)
 create mode 100644 samples/cpp/tutorial_code/gpu/gpu-thrust-interop/CMakeLists.txt
 create mode 100644 samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp
 create mode 100644 samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu
diff --git a/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/CMakeLists.txt b/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/CMakeLists.txt
new file mode 100644
index 0000000000..037d508569
--- /dev/null
+++ b/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/CMakeLists.txt
@@ -0,0 +1,10 @@
+CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
+
+FIND_PACKAGE(CUDA REQUIRED)
+INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
+
+FIND_PACKAGE(OpenCV REQUIRED COMPONENTS core)
+INCLUDE_DIRECTORIES(${OpenCV_INCLUDE_DIRS})
+
+CUDA_ADD_EXECUTABLE(opencv_thrust main.cu)
+TARGET_LINK_LIBRARIES(opencv_thrust ${OpenCV_LIBS})
\ No newline at end of file
diff --git a/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp b/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp
new file mode 100644
index 0000000000..b4d0d7c4d2
--- /dev/null
+++ b/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp
@@ -0,0 +1,74 @@
+#pragma once
+#include <opencv2/core/cuda.hpp>
+
+#include <thrust/iterator/permutation_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/device_ptr.h>
+
+template<typename T> struct
+CV_TYPE
+{
+	static const int DEPTH;
+};
+
+template<> static const int CV_TYPE<float>::DEPTH = CV_32F;
+template<> static const int CV_TYPE<double>::DEPTH = CV_64F;
+template<> static const int CV_TYPE<int>::DEPTH = CV_32S;
+template<> static const int CV_TYPE<uchar>::DEPTH = CV_8U;
+template<> static const int CV_TYPE<char>::DEPTH = CV_8S;
+template<> static const int CV_TYPE<ushort>::DEPTH = CV_16U;
+template<> static const int CV_TYPE<short>::DEPTH = CV_16S;
+
+template<typename T> struct step_functor : public thrust::unary_function<int, int>
+{
+	int columns;
+	int step;
+	int channels;
+	__host__ __device__ step_functor(int columns_, int step_, int channels_ = 1) : columns(columns_), step(step_), channels(channels_)	{	};
+	__host__ step_functor(cv::cuda::GpuMat& mat)
+	{
+		CV_Assert(mat.depth() == CV_TYPE<T>::DEPTH);
+		columns = mat.cols;
+		step = mat.step / sizeof(T);
+		channels = mat.channels();		
+	}
+	__host__ __device__
+		int operator()(int x) const
+	{
+		int row = x / columns;
+		int idx = (row * step) + (x % columns)*channels;
+		return idx;
+	}
+};
+
+/*
+	@Brief GpuMatBeginItr returns a thrust compatible iterator to the beginning of a GPU mat's memory.  
+	@Param mat is the input matrix
+	@Param channel is the channel of the matrix that the iterator is accessing.  If set to -1, the iterator will access every element in sequential order
+*/
+template<typename T>
+thrust::permutation_iterator<thrust::device_ptr<T>, thrust::transform_iterator<step_functor<T>, thrust::counting_iterator<int>>>  GpuMatBeginItr(cv::cuda::GpuMat mat, int channel = 0)
+{
+	if (channel == -1)
+		mat = mat.reshape(1);
+	CV_Assert(mat.depth() == CV_TYPE<T>::DEPTH);
+	CV_Assert(channel < mat.channels());
+	return thrust::make_permutation_iterator(thrust::device_pointer_cast(mat.ptr<T>(0) + channel),
+		thrust::make_transform_iterator(thrust::make_counting_iterator(0), step_functor<T>(mat.cols, mat.step / sizeof(T), mat.channels())));
+}
+/*
+@Brief GpuMatEndItr returns a thrust compatible iterator to the end of a GPU mat's memory.
+@Param mat is the input matrix
+@Param channel is the channel of the matrix that the iterator is accessing.  If set to -1, the iterator will access every element in sequential order
+*/
+template<typename T>
+thrust::permutation_iterator<thrust::device_ptr<T>, thrust::transform_iterator<step_functor<T>, thrust::counting_iterator<int>>>  GpuMatEndItr(cv::cuda::GpuMat mat, int channel = 0)
+{
+	if (channel == -1)
+		mat = mat.reshape(1);
+	CV_Assert(mat.depth() == CV_TYPE<T>::DEPTH);
+	CV_Assert(channel < mat.channels());
+	return thrust::make_permutation_iterator(thrust::device_pointer_cast(mat.ptr<T>(0) + channel),
+		thrust::make_transform_iterator(thrust::make_counting_iterator(mat.rows*mat.cols), step_functor<T>(mat.cols, mat.step / sizeof(T), mat.channels())));
+}
\ No newline at end of file
diff --git a/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu b/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu
new file mode 100644
index 0000000000..c6784fef0b
--- /dev/null
+++ b/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu
@@ -0,0 +1,88 @@
+#include "Thrust_interop.hpp"
+
+#include <thrust/transform.h>
+#include <thrust/random.h>
+#include <thrust/sort.h>
+struct prg
+{
+	float a, b;
+	
+	__host__ __device__
+		prg(float _a = 0.f, float _b = 1.f) : a(_a), b(_b) {};
+
+	__host__ __device__
+		float operator()(const unsigned int n) const
+	{
+		thrust::default_random_engine rng;
+		thrust::uniform_real_distribution<float> dist(a, b);
+		rng.discard(n);
+
+		return dist(rng);
+	}
+};
+
+template<typename T> struct pred_eq
+{
+	T value;
+	int channel;
+	__host__ __device__
+		pred_eq(T value_, int channel_ = 0) :value(value_), channel(channel_){}
+	__host__ __device__
+		bool operator()(const T val) const
+	{
+		return val == value;
+	}
+	template<int N>
+	__host__ __device__ bool operator()(const cv::Vec<T, N>& val)
+	{
+		if (channel < N)
+			return val.val[channel] == value;
+		return false;
+	}
+};
+
+
+int main(void)
+{
+	// Generate a 2 channel row matrix with 100 elements.  Set the first channel to be the element index, and the second to be a randomly
+	// generated value.  Sort by the randomly generated value while maintaining index association.
+	{
+		cv::cuda::GpuMat d_idx(1, 100, CV_32SC2);
+
+		auto keyBegin = GpuMatBeginItr<int>(d_idx, 1);
+		auto keyEnd = GpuMatEndItr<int>(d_idx, 1);
+
+		auto idxBegin = GpuMatBeginItr<int>(d_idx, 0);
+		auto idxEnd = GpuMatEndItr<int>(d_idx, 0);
+
+		thrust::sequence(idxBegin, idxEnd);
+		thrust::transform(idxBegin, idxEnd, keyBegin, prg(0, 10));
+		thrust::sort_by_key(keyBegin, keyEnd, idxBegin);
+
+		cv::Mat h_idx(d_idx);
+	}
+
+	// Randomly fill a row matrix with 100 elements between -1 and 1
+	{
+		cv::cuda::GpuMat d_value(1, 100, CV_32F);
+		auto valueBegin = GpuMatBeginItr<float>(d_value);
+		auto valueEnd = GpuMatEndItr<float>(d_value);
+		thrust::transform(thrust::make_counting_iterator(0), thrust::make_counting_iterator(d_value.cols), valueBegin, prg(-1, 1));
+
+		cv::Mat h_value(d_value);
+	}
+
+	// OpenCV has count non zero, but what if you want to count a specific value?
+	{
+		cv::cuda::GpuMat d_value(1, 100, CV_32S);
+		d_value.setTo(cv::Scalar(0));
+		d_value.colRange(10, 50).setTo(cv::Scalar(15));
+		auto count = thrust::count(GpuMatBeginItr<int>(d_value), GpuMatEndItr<int>(d_value), 15);
+		std::cout << count << std::endl;
+	}
+
+	
+
+
+	return 0;
+}

From 09d392f09de384d7df6e04cdb46d1a469549e3a4 Mon Sep 17 00:00:00 2001
From: Dan <dtmoodie>
Date: Tue, 15 Sep 2015 12:17:30 -0400
Subject: [PATCH 02/10] Added thrust tutorial.

---
 .../gpu/gpu-thrust-interop/main.cu            | 38 +++++++++++++++++--
 1 file changed, 34 insertions(+), 4 deletions(-)

diff --git a/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu b/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu
index c6784fef0b..e138fcc6fa 100644
--- a/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu
+++ b/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu
@@ -1,12 +1,14 @@
 #include "Thrust_interop.hpp"
+#include <opencv2/core/cuda_stream_accessor.hpp>
 
 #include <thrust/transform.h>
 #include <thrust/random.h>
 #include <thrust/sort.h>
+#include <thrust/system/cuda/execution_policy.h>
 struct prg
 {
 	float a, b;
-	
+
 	__host__ __device__
 		prg(float _a = 0.f, float _b = 1.f) : a(_a), b(_b) {};
 
@@ -39,6 +41,25 @@ template<typename T> struct pred_eq
 			return val.val[channel] == value;
 		return false;
 	}
+
+	__host__ __device__ bool operator()( const thrust::tuple<T, T, T>& val)
+	{
+		if (channel == 0)
+			return thrust::get<0>(val) == value;
+		if (channel == 1)
+			return thrust::get<1>(val) == value;
+		if (channel == 2)
+			return thrust::get<2>(val) == value;
+	}
+};
+template<typename T> struct pred_greater
+{
+	T value;
+	__host__ __device__ pred_greater(T value_) : value(value_){}
+	__host__ __device__ bool operator()(const T& val) const
+	{
+		return val > value;
+	}
 };
 
 
@@ -80,9 +101,18 @@ int main(void)
 		auto count = thrust::count(GpuMatBeginItr<int>(d_value), GpuMatEndItr<int>(d_value), 15);
 		std::cout << count << std::endl;
 	}
-
+	// Randomly fill an array then copy only values greater than 0.  Perform these tasks on a stream.
+	{
+		cv::cuda::GpuMat d_value(1, 100, CV_32F);
+		auto valueBegin = GpuMatBeginItr<float>(d_value);
+		auto valueEnd = GpuMatEndItr<float>(d_value);
+		cv::cuda::Stream stream;
+		thrust::transform(thrust::system::cuda::par.on(cv::cuda::StreamAccessor::getStream(stream)), thrust::make_counting_iterator(0), thrust::make_counting_iterator(d_value.cols), valueBegin, prg(-1, 1));
+		int count = thrust::count_if(thrust::system::cuda::par.on(cv::cuda::StreamAccessor::getStream(stream)), valueBegin, valueEnd, pred_greater<float>(0.0));
+		cv::cuda::GpuMat d_valueGreater(1, count, CV_32F);
+		thrust::copy_if(thrust::system::cuda::par.on(cv::cuda::StreamAccessor::getStream(stream)), valueBegin, valueEnd, GpuMatBeginItr<float>(d_valueGreater), pred_greater<float>(0.0));
+		cv::Mat h_greater(d_valueGreater);
+	}
 	
-
-
 	return 0;
 }

From 23fc5930b78e2d2536fb110082acf0fc5a1e1c71 Mon Sep 17 00:00:00 2001
From: Dan <dtmoodie>
Date: Wed, 16 Sep 2015 12:03:35 -0400
Subject: [PATCH 03/10] Improved thrust interop tutorial.

---
 .gitignore                                    |  3 +-
 .../gpu_thrust_interop.markdown               | 73 +++++++++++++++++++
 .../gpu/gpu-thrust-interop/Thrust_interop.hpp | 32 ++++----
 .../gpu/gpu-thrust-interop/main.cu            | 43 ++++-------
 4 files changed, 104 insertions(+), 47 deletions(-)
 create mode 100644 doc/tutorials/gpu/gpu-thrust-interop/gpu_thrust_interop.markdown

diff --git a/.gitignore b/.gitignore
index 039d2400a6..adef6e08d1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,4 +21,5 @@ bin/
 CMakeCache.txt
 *.suo
 *.log
-*.tlog
\ No newline at end of file
+*.tlog
+build
diff --git a/doc/tutorials/gpu/gpu-thrust-interop/gpu_thrust_interop.markdown b/doc/tutorials/gpu/gpu-thrust-interop/gpu_thrust_interop.markdown
new file mode 100644
index 0000000000..361848535f
--- /dev/null
+++ b/doc/tutorials/gpu/gpu-thrust-interop/gpu_thrust_interop.markdown
@@ -0,0 +1,73 @@
+Using a cv::cuda::GpuMat with thrust
+===========================================
+
+Goal
+----
+
+Thrust is an extremely powerful library for various cuda accelerated algorithms.  However thrust is designed
+to work with vectors and not pitched matricies.  The following tutorial will discuss wrapping cv::cuda::GpuMat's
+into thrust iterators that can be used with thrust algorithms.
+
+This tutorial should show you how to:
+- Wrap a GpuMat into a thrust iterator
+- Fill a GpuMat with random numbers
+- Sort a column of a GpuMat in place
+- Copy values greater than 0 to a new gpu matrix
+- Use streams with thrust
+
+Wrapping a GpuMat into a thrust iterator
+----
+
+The following code will produce an iterator for a GpuMat
+
+@snippet samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp begin_itr
+@snippet samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp end_itr
+
+Our goal is to have an iterator that will start at the beginning of the matrix, and increment correctly to access continuous matrix elements.  This is trivial for a continuous row, but how about for a column
+of a pitched matrix?  To do this we need the iterator to be aware of the matrix dimensions and step.  This information is embedded in the step_functor.  
+@snippet samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp step_functor
+The step functor takes in an index value and returns the appropriate
+offset from the beginning of the matrix.  The counting iterator simply increments over the range of pixel elements.  Combined into the transform_iterator we have an iterator that counts from 0 to M*N and correctly
+increments to account for the pitched memory of a GpuMat.  Unfortunately this does not include any memory location information, for that we need a thrust::device_ptr.  By combining a device pointer with the 
+transform_iterator we can point thrust to the first element of our matrix and have it step accordingly.
+
+Fill a GpuMat with random numbers
+----
+Now that we have some nice functions for making iterators for thrust, lets use them to do some things OpenCV can't do.  Unfortunately at the time of this writing, OpenCV doesn't have any Gpu random number generation.
+Thankfully thrust does and it's now trivial to interop between the two.
+Example taken from http://stackoverflow.com/questions/12614164/generating-a-random-number-vector-between-0-and-1-0-using-thrust
+
+First we need to write a functor that will produce our random values.
+@snippet samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu prg
+
+This will take in an integer value and output a value between a and b.
+Now we will populate our matrix with values between 0 and 10 with a thrust transform.
+@snippet samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu random
+
+Sort a column of a GpuMat in place
+----
+
+Lets fill matrix elements with random values and an index.  Afterwards we will sort the random numbers and the indecies. 
+@snippet samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu sort
+
+Copy values greater than 0 to a new gpu matrix while using streams
+----
+In this example we're going to see how cv::cuda::Streams can be used with thrust.  Unfortunately this specific example uses functions that must return 
+results to the CPU so it isn't the optimal use of streams.
+
+@snippet samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu copy_greater
+
+
+First we will populate a GPU mat with randomly generated data between -1 and 1 on a stream.
+
+@snippet samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu random_gen_stream
+
+Notice the use of thrust::system::cuda::par.on(...), this creates an execution policy for executing thrust code on a stream.
+There is a bug in the version of thrust distributed with the cuda toolkit, as of version 7.5 this has not been fixed.  This bug causes code to not execute on streams.
+The bug can however be fixed by using the newest version of thrust from the git repository. (http://github.com/thrust/thrust.git)
+Next we will determine how many values are greater than 0 by using thrust::count_if with the following predicate:
+
+@snippet samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu pred_greater
+
+We will use those results to create an output buffer for storing the copied values, we will then use copy_if with the same predicate to populate the output buffer.
+Lastly we will download the values into a CPU mat for viewing.
\ No newline at end of file
diff --git a/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp b/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp
index b4d0d7c4d2..5cf0234b6d 100644
--- a/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp
+++ b/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp
@@ -6,20 +6,10 @@
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/device_ptr.h>
 
-template<typename T> struct
-CV_TYPE
-{
-	static const int DEPTH;
-};
-
-template<> static const int CV_TYPE<float>::DEPTH = CV_32F;
-template<> static const int CV_TYPE<double>::DEPTH = CV_64F;
-template<> static const int CV_TYPE<int>::DEPTH = CV_32S;
-template<> static const int CV_TYPE<uchar>::DEPTH = CV_8U;
-template<> static const int CV_TYPE<char>::DEPTH = CV_8S;
-template<> static const int CV_TYPE<ushort>::DEPTH = CV_16U;
-template<> static const int CV_TYPE<short>::DEPTH = CV_16S;
-
+/*
+	@Brief step_functor is an object to correctly step a thrust iterator according to the stride of a matrix
+*/
+//! [step_functor]
 template<typename T> struct step_functor : public thrust::unary_function<int, int>
 {
 	int columns;
@@ -41,7 +31,8 @@ template<typename T> struct step_functor : public thrust::unary_function<int, in
 		return idx;
 	}
 };
-
+//! [step_functor]
+//! [begin_itr]
 /*
 	@Brief GpuMatBeginItr returns a thrust compatible iterator to the beginning of a GPU mat's memory.  
 	@Param mat is the input matrix
@@ -52,11 +43,13 @@ thrust::permutation_iterator<thrust::device_ptr<T>, thrust::transform_iterator<s
 {
 	if (channel == -1)
 		mat = mat.reshape(1);
-	CV_Assert(mat.depth() == CV_TYPE<T>::DEPTH);
+	CV_Assert(mat.depth() == cv::DataType<T>::depth);
 	CV_Assert(channel < mat.channels());
 	return thrust::make_permutation_iterator(thrust::device_pointer_cast(mat.ptr<T>(0) + channel),
 		thrust::make_transform_iterator(thrust::make_counting_iterator(0), step_functor<T>(mat.cols, mat.step / sizeof(T), mat.channels())));
 }
+//! [begin_itr]
+//! [end_itr]
 /*
 @Brief GpuMatEndItr returns a thrust compatible iterator to the end of a GPU mat's memory.
 @Param mat is the input matrix
@@ -67,8 +60,11 @@ thrust::permutation_iterator<thrust::device_ptr<T>, thrust::transform_iterator<s
 {
 	if (channel == -1)
 		mat = mat.reshape(1);
-	CV_Assert(mat.depth() == CV_TYPE<T>::DEPTH);
+	CV_Assert(mat.depth() == cv::DataType<T>::depth);
 	CV_Assert(channel < mat.channels());
 	return thrust::make_permutation_iterator(thrust::device_pointer_cast(mat.ptr<T>(0) + channel),
 		thrust::make_transform_iterator(thrust::make_counting_iterator(mat.rows*mat.cols), step_functor<T>(mat.cols, mat.step / sizeof(T), mat.channels())));
-}
\ No newline at end of file
+}
+//! [end_itr]
+
+
diff --git a/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu b/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu
index e138fcc6fa..7f571cdca6 100644
--- a/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu
+++ b/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu
@@ -5,6 +5,7 @@
 #include <thrust/random.h>
 #include <thrust/sort.h>
 #include <thrust/system/cuda/execution_policy.h>
+//! [prg]
 struct prg
 {
 	float a, b;
@@ -22,36 +23,10 @@ struct prg
 		return dist(rng);
 	}
 };
+//! [prg]
 
-template<typename T> struct pred_eq
-{
-	T value;
-	int channel;
-	__host__ __device__
-		pred_eq(T value_, int channel_ = 0) :value(value_), channel(channel_){}
-	__host__ __device__
-		bool operator()(const T val) const
-	{
-		return val == value;
-	}
-	template<int N>
-	__host__ __device__ bool operator()(const cv::Vec<T, N>& val)
-	{
-		if (channel < N)
-			return val.val[channel] == value;
-		return false;
-	}
 
-	__host__ __device__ bool operator()( const thrust::tuple<T, T, T>& val)
-	{
-		if (channel == 0)
-			return thrust::get<0>(val) == value;
-		if (channel == 1)
-			return thrust::get<1>(val) == value;
-		if (channel == 2)
-			return thrust::get<2>(val) == value;
-	}
-};
+//! [pred_greater]
 template<typename T> struct pred_greater
 {
 	T value;
@@ -61,12 +36,14 @@ template<typename T> struct pred_greater
 		return val > value;
 	}
 };
+//! [pred_greater]
 
 
 int main(void)
 {
 	// Generate a 2 channel row matrix with 100 elements.  Set the first channel to be the element index, and the second to be a randomly
 	// generated value.  Sort by the randomly generated value while maintaining index association.
+	//! [sort]
 	{
 		cv::cuda::GpuMat d_idx(1, 100, CV_32SC2);
 
@@ -82,8 +59,10 @@ int main(void)
 
 		cv::Mat h_idx(d_idx);
 	}
+	//! [sort]
 
 	// Randomly fill a row matrix with 100 elements between -1 and 1
+	//! [random]
 	{
 		cv::cuda::GpuMat d_value(1, 100, CV_32F);
 		auto valueBegin = GpuMatBeginItr<float>(d_value);
@@ -92,8 +71,10 @@ int main(void)
 
 		cv::Mat h_value(d_value);
 	}
+	//! [random]
 
 	// OpenCV has count non zero, but what if you want to count a specific value?
+	//! [count_value]
 	{
 		cv::cuda::GpuMat d_value(1, 100, CV_32S);
 		d_value.setTo(cv::Scalar(0));
@@ -101,18 +82,24 @@ int main(void)
 		auto count = thrust::count(GpuMatBeginItr<int>(d_value), GpuMatEndItr<int>(d_value), 15);
 		std::cout << count << std::endl;
 	}
+	//! [count_value]
+
 	// Randomly fill an array then copy only values greater than 0.  Perform these tasks on a stream.
+	//! [copy_greater]
 	{
 		cv::cuda::GpuMat d_value(1, 100, CV_32F);
 		auto valueBegin = GpuMatBeginItr<float>(d_value);
 		auto valueEnd = GpuMatEndItr<float>(d_value);
 		cv::cuda::Stream stream;
+		//! [random_gen_stream]
 		thrust::transform(thrust::system::cuda::par.on(cv::cuda::StreamAccessor::getStream(stream)), thrust::make_counting_iterator(0), thrust::make_counting_iterator(d_value.cols), valueBegin, prg(-1, 1));
+		//! [random_gen_stream]
 		int count = thrust::count_if(thrust::system::cuda::par.on(cv::cuda::StreamAccessor::getStream(stream)), valueBegin, valueEnd, pred_greater<float>(0.0));
 		cv::cuda::GpuMat d_valueGreater(1, count, CV_32F);
 		thrust::copy_if(thrust::system::cuda::par.on(cv::cuda::StreamAccessor::getStream(stream)), valueBegin, valueEnd, GpuMatBeginItr<float>(d_valueGreater), pred_greater<float>(0.0));
 		cv::Mat h_greater(d_valueGreater);
 	}
+	//! [copy_greater]
 	
 	return 0;
 }

From 00c2930709a28762e3aa171b769990d4772602f2 Mon Sep 17 00:00:00 2001
From: Dan <dtmoodie>
Date: Thu, 24 Sep 2015 09:19:37 -0400
Subject: [PATCH 04/10] improved comments.

---
 .../gpu/gpu-thrust-interop/main.cu            | 26 ++++++++++++-------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu b/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu
index 7f571cdca6..ffe543cbc2 100644
--- a/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu
+++ b/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu
@@ -45,19 +45,21 @@ int main(void)
 	// generated value.  Sort by the randomly generated value while maintaining index association.
 	//! [sort]
 	{
-		cv::cuda::GpuMat d_idx(1, 100, CV_32SC2);
-
-		auto keyBegin = GpuMatBeginItr<int>(d_idx, 1);
-		auto keyEnd = GpuMatEndItr<int>(d_idx, 1);
-
-		auto idxBegin = GpuMatBeginItr<int>(d_idx, 0);
-		auto idxEnd = GpuMatEndItr<int>(d_idx, 0);
-
+		cv::cuda::GpuMat d_data(1, 100, CV_32SC2);
+		// Thrust compatible begin and end iterators to channel 1 of this matrix
+		auto keyBegin = GpuMatBeginItr<int>(d_data, 1);
+		auto keyEnd = GpuMatEndItr<int>(d_data, 1);
+		// Thrust compatible begin and end iterators to channel 0 of this matrix
+		auto idxBegin = GpuMatBeginItr<int>(d_data, 0);
+		auto idxEnd = GpuMatEndItr<int>(d_data, 0);
+		// Fill the index channel with a sequence of numbers from 0 to 100
 		thrust::sequence(idxBegin, idxEnd);
-		thrust::transform(idxBegin, idxEnd, keyBegin, prg(0, 10));
+		// Fill the key channel with random numbers between 0 and 10.  A counting iterator is used here to give an integer value for each location as an input to prg::operator()
+		thrust::transform(thrust::make_counting_iterator(0), thrust::make_counting_iterator(d_data.cols), keyBegin, prg(0, 10));
+		// Sort the key channel and index channel such that the keys and indecies stay together
 		thrust::sort_by_key(keyBegin, keyEnd, idxBegin);
 
-		cv::Mat h_idx(d_idx);
+		cv::Mat h_idx(d_data);
 	}
 	//! [sort]
 
@@ -92,10 +94,14 @@ int main(void)
 		auto valueEnd = GpuMatEndItr<float>(d_value);
 		cv::cuda::Stream stream;
 		//! [random_gen_stream]
+		// Same as the random generation code from before except now the transformation is being performed on a stream
 		thrust::transform(thrust::system::cuda::par.on(cv::cuda::StreamAccessor::getStream(stream)), thrust::make_counting_iterator(0), thrust::make_counting_iterator(d_value.cols), valueBegin, prg(-1, 1));
 		//! [random_gen_stream]
+		// Count the number of values we are going to copy
 		int count = thrust::count_if(thrust::system::cuda::par.on(cv::cuda::StreamAccessor::getStream(stream)), valueBegin, valueEnd, pred_greater<float>(0.0));
+		// Allocate a destination for copied values
 		cv::cuda::GpuMat d_valueGreater(1, count, CV_32F);
+		// Copy values that satisfy the predicate.
 		thrust::copy_if(thrust::system::cuda::par.on(cv::cuda::StreamAccessor::getStream(stream)), valueBegin, valueEnd, GpuMatBeginItr<float>(d_valueGreater), pred_greater<float>(0.0));
 		cv::Mat h_greater(d_valueGreater);
 	}

From 12dcb1555e18be0ca95944b3289671072533f88b Mon Sep 17 00:00:00 2001
From: Dan <dtmoodie>
Date: Thu, 24 Sep 2015 10:01:26 -0400
Subject: [PATCH 05/10] Missed one conversion of CV_TYPE to cv::DataType

---
 .../cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp b/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp
index 5cf0234b6d..46f1bc5485 100644
--- a/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp
+++ b/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp
@@ -18,7 +18,7 @@ template<typename T> struct step_functor : public thrust::unary_function<int, in
 	__host__ __device__ step_functor(int columns_, int step_, int channels_ = 1) : columns(columns_), step(step_), channels(channels_)	{	};
 	__host__ step_functor(cv::cuda::GpuMat& mat)
 	{
-		CV_Assert(mat.depth() == CV_TYPE<T>::DEPTH);
+		CV_Assert(mat.depth() == cv::DataType<T>::depth);
 		columns = mat.cols;
 		step = mat.step / sizeof(T);
 		channels = mat.channels();		

From 7376c5311bd9132e53d22479d39920797a98d836 Mon Sep 17 00:00:00 2001
From: Dan Moodie <dtmoodie@gmail.com>
Date: Thu, 24 Sep 2015 14:29:17 -0400
Subject: [PATCH 06/10] Fixed tabs in whitespace.

---
 .../gpu_thrust_interop.markdown               |  11 +-
 .../gpu/gpu-thrust-interop/Thrust_interop.hpp |  74 ++++----
 .../gpu/gpu-thrust-interop/main.cu            | 159 +++++++++---------
 3 files changed, 120 insertions(+), 124 deletions(-)

diff --git a/doc/tutorials/gpu/gpu-thrust-interop/gpu_thrust_interop.markdown b/doc/tutorials/gpu/gpu-thrust-interop/gpu_thrust_interop.markdown
index 361848535f..64f763bd59 100644
--- a/doc/tutorials/gpu/gpu-thrust-interop/gpu_thrust_interop.markdown
+++ b/doc/tutorials/gpu/gpu-thrust-interop/gpu_thrust_interop.markdown
@@ -23,13 +23,11 @@ The following code will produce an iterator for a GpuMat
 @snippet samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp begin_itr
 @snippet samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp end_itr
 
-Our goal is to have an iterator that will start at the beginning of the matrix, and increment correctly to access continuous matrix elements.  This is trivial for a continuous row, but how about for a column
-of a pitched matrix?  To do this we need the iterator to be aware of the matrix dimensions and step.  This information is embedded in the step_functor.  
+Our goal is to have an iterator that will start at the beginning of the matrix, and increment correctly to access continuous matrix elements.  This is trivial for a continuous row, but how about for a column of a pitched matrix?  To do this we need the iterator to be aware of the matrix dimensions and step.  This information is embedded in the step_functor.
 @snippet samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp step_functor
 The step functor takes in an index value and returns the appropriate
 offset from the beginning of the matrix.  The counting iterator simply increments over the range of pixel elements.  Combined into the transform_iterator we have an iterator that counts from 0 to M*N and correctly
-increments to account for the pitched memory of a GpuMat.  Unfortunately this does not include any memory location information, for that we need a thrust::device_ptr.  By combining a device pointer with the 
-transform_iterator we can point thrust to the first element of our matrix and have it step accordingly.
+increments to account for the pitched memory of a GpuMat.  Unfortunately this does not include any memory location information, for that we need a thrust::device_ptr.  By combining a device pointer with the transform_iterator we can point thrust to the first element of our matrix and have it step accordingly.
 
 Fill a GpuMat with random numbers
 ----
@@ -47,13 +45,12 @@ Now we will populate our matrix with values between 0 and 10 with a thrust trans
 Sort a column of a GpuMat in place
 ----
 
-Lets fill matrix elements with random values and an index.  Afterwards we will sort the random numbers and the indecies. 
+Lets fill matrix elements with random values and an index.  Afterwards we will sort the random numbers and the indecies.
 @snippet samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu sort
 
 Copy values greater than 0 to a new gpu matrix while using streams
 ----
-In this example we're going to see how cv::cuda::Streams can be used with thrust.  Unfortunately this specific example uses functions that must return 
-results to the CPU so it isn't the optimal use of streams.
+In this example we're going to see how cv::cuda::Streams can be used with thrust.  Unfortunately this specific example uses functions that must return results to the CPU so it isn't the optimal use of streams.
 
 @snippet samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu copy_greater
 
diff --git a/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp b/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp
index 46f1bc5485..263983b6be 100644
--- a/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp
+++ b/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp
@@ -7,63 +7,63 @@
 #include <thrust/device_ptr.h>
 
 /*
-	@Brief step_functor is an object to correctly step a thrust iterator according to the stride of a matrix
+    @Brief step_functor is an object to correctly step a thrust iterator according to the stride of a matrix
 */
 //! [step_functor]
 template<typename T> struct step_functor : public thrust::unary_function<int, int>
 {
-	int columns;
-	int step;
-	int channels;
-	__host__ __device__ step_functor(int columns_, int step_, int channels_ = 1) : columns(columns_), step(step_), channels(channels_)	{	};
-	__host__ step_functor(cv::cuda::GpuMat& mat)
-	{
-		CV_Assert(mat.depth() == cv::DataType<T>::depth);
-		columns = mat.cols;
-		step = mat.step / sizeof(T);
-		channels = mat.channels();		
-	}
-	__host__ __device__
-		int operator()(int x) const
-	{
-		int row = x / columns;
-		int idx = (row * step) + (x % columns)*channels;
-		return idx;
-	}
+    int columns;
+    int step;
+    int channels;
+    __host__ __device__ step_functor(int columns_, int step_, int channels_ = 1) : columns(columns_), step(step_), channels(channels_)	{	};
+    __host__ step_functor(cv::cuda::GpuMat& mat)
+    {
+        CV_Assert(mat.depth() == cv::DataType<T>::depth);
+        columns = mat.cols;
+        step = mat.step / sizeof(T);
+        channels = mat.channels();		
+    }
+    __host__ __device__
+      int operator()(int x) const
+    {
+        int row = x / columns;
+        int idx = (row * step) + (x % columns)*channels;
+        return idx;
+    }
 };
 //! [step_functor]
 //! [begin_itr]
 /*
-	@Brief GpuMatBeginItr returns a thrust compatible iterator to the beginning of a GPU mat's memory.  
-	@Param mat is the input matrix
-	@Param channel is the channel of the matrix that the iterator is accessing.  If set to -1, the iterator will access every element in sequential order
+    @Brief GpuMatBeginItr returns a thrust compatible iterator to the beginning of a GPU mat's memory.  
+    @Param mat is the input matrix
+    @Param channel is the channel of the matrix that the iterator is accessing.  If set to -1, the iterator will access every element in sequential order
 */
 template<typename T>
 thrust::permutation_iterator<thrust::device_ptr<T>, thrust::transform_iterator<step_functor<T>, thrust::counting_iterator<int>>>  GpuMatBeginItr(cv::cuda::GpuMat mat, int channel = 0)
 {
-	if (channel == -1)
-		mat = mat.reshape(1);
-	CV_Assert(mat.depth() == cv::DataType<T>::depth);
-	CV_Assert(channel < mat.channels());
-	return thrust::make_permutation_iterator(thrust::device_pointer_cast(mat.ptr<T>(0) + channel),
-		thrust::make_transform_iterator(thrust::make_counting_iterator(0), step_functor<T>(mat.cols, mat.step / sizeof(T), mat.channels())));
+    if (channel == -1)
+        mat = mat.reshape(1);
+    CV_Assert(mat.depth() == cv::DataType<T>::depth);
+    CV_Assert(channel < mat.channels());
+    return thrust::make_permutation_iterator(thrust::device_pointer_cast(mat.ptr<T>(0) + channel),
+        thrust::make_transform_iterator(thrust::make_counting_iterator(0), step_functor<T>(mat.cols, mat.step / sizeof(T), mat.channels())));
 }
 //! [begin_itr]
 //! [end_itr]
 /*
-@Brief GpuMatEndItr returns a thrust compatible iterator to the end of a GPU mat's memory.
-@Param mat is the input matrix
-@Param channel is the channel of the matrix that the iterator is accessing.  If set to -1, the iterator will access every element in sequential order
+    @Brief GpuMatEndItr returns a thrust compatible iterator to the end of a GPU mat's memory.
+    @Param mat is the input matrix
+    @Param channel is the channel of the matrix that the iterator is accessing.  If set to -1, the iterator will access every element in sequential order
 */
 template<typename T>
 thrust::permutation_iterator<thrust::device_ptr<T>, thrust::transform_iterator<step_functor<T>, thrust::counting_iterator<int>>>  GpuMatEndItr(cv::cuda::GpuMat mat, int channel = 0)
 {
-	if (channel == -1)
-		mat = mat.reshape(1);
-	CV_Assert(mat.depth() == cv::DataType<T>::depth);
-	CV_Assert(channel < mat.channels());
-	return thrust::make_permutation_iterator(thrust::device_pointer_cast(mat.ptr<T>(0) + channel),
-		thrust::make_transform_iterator(thrust::make_counting_iterator(mat.rows*mat.cols), step_functor<T>(mat.cols, mat.step / sizeof(T), mat.channels())));
+    if (channel == -1)
+        mat = mat.reshape(1);
+    CV_Assert(mat.depth() == cv::DataType<T>::depth);
+    CV_Assert(channel < mat.channels());
+    return thrust::make_permutation_iterator(thrust::device_pointer_cast(mat.ptr<T>(0) + channel),
+        thrust::make_transform_iterator(thrust::make_counting_iterator(mat.rows*mat.cols), step_functor<T>(mat.cols, mat.step / sizeof(T), mat.channels())));
 }
 //! [end_itr]
 
diff --git a/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu b/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu
index ffe543cbc2..51f246a37d 100644
--- a/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu
+++ b/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu
@@ -8,20 +8,19 @@
 //! [prg]
 struct prg
 {
-	float a, b;
+  float a, b;
 
-	__host__ __device__
-		prg(float _a = 0.f, float _b = 1.f) : a(_a), b(_b) {};
+  __host__ __device__
+    prg(float _a = 0.f, float _b = 1.f) : a(_a), b(_b) {};
 
-	__host__ __device__
-		float operator()(const unsigned int n) const
-	{
-		thrust::default_random_engine rng;
-		thrust::uniform_real_distribution<float> dist(a, b);
-		rng.discard(n);
-
-		return dist(rng);
-	}
+  __host__ __device__
+    float operator()(const unsigned int n) const
+  {
+    thrust::default_random_engine rng;
+    thrust::uniform_real_distribution<float> dist(a, b);
+    rng.discard(n);
+    return dist(rng);
+  }
 };
 //! [prg]
 
@@ -29,83 +28,83 @@ struct prg
 //! [pred_greater]
 template<typename T> struct pred_greater
 {
-	T value;
-	__host__ __device__ pred_greater(T value_) : value(value_){}
-	__host__ __device__ bool operator()(const T& val) const
-	{
-		return val > value;
-	}
+  T value;
+  __host__ __device__ pred_greater(T value_) : value(value_){}
+  __host__ __device__ bool operator()(const T& val) const
+  {
+    return val > value;
+  }
 };
 //! [pred_greater]
 
 
 int main(void)
 {
-	// Generate a 2 channel row matrix with 100 elements.  Set the first channel to be the element index, and the second to be a randomly
-	// generated value.  Sort by the randomly generated value while maintaining index association.
-	//! [sort]
-	{
-		cv::cuda::GpuMat d_data(1, 100, CV_32SC2);
-		// Thrust compatible begin and end iterators to channel 1 of this matrix
-		auto keyBegin = GpuMatBeginItr<int>(d_data, 1);
-		auto keyEnd = GpuMatEndItr<int>(d_data, 1);
-		// Thrust compatible begin and end iterators to channel 0 of this matrix
-		auto idxBegin = GpuMatBeginItr<int>(d_data, 0);
-		auto idxEnd = GpuMatEndItr<int>(d_data, 0);
-		// Fill the index channel with a sequence of numbers from 0 to 100
-		thrust::sequence(idxBegin, idxEnd);
-		// Fill the key channel with random numbers between 0 and 10.  A counting iterator is used here to give an integer value for each location as an input to prg::operator()
-		thrust::transform(thrust::make_counting_iterator(0), thrust::make_counting_iterator(d_data.cols), keyBegin, prg(0, 10));
-		// Sort the key channel and index channel such that the keys and indecies stay together
-		thrust::sort_by_key(keyBegin, keyEnd, idxBegin);
+  // Generate a 2 channel row matrix with 100 elements.  Set the first channel to be the element index, and the second to be a randomly
+  // generated value.  Sort by the randomly generated value while maintaining index association.
+  //! [sort]
+  {
+    cv::cuda::GpuMat d_data(1, 100, CV_32SC2);
+    // Thrust compatible begin and end iterators to channel 1 of this matrix
+    auto keyBegin = GpuMatBeginItr<int>(d_data, 1);
+    auto keyEnd = GpuMatEndItr<int>(d_data, 1);
+    // Thrust compatible begin and end iterators to channel 0 of this matrix
+    auto idxBegin = GpuMatBeginItr<int>(d_data, 0);
+    auto idxEnd = GpuMatEndItr<int>(d_data, 0);
+    // Fill the index channel with a sequence of numbers from 0 to 100
+    thrust::sequence(idxBegin, idxEnd);
+    // Fill the key channel with random numbers between 0 and 10.  A counting iterator is used here to give an integer value for each location as an input to prg::operator()
+    thrust::transform(thrust::make_counting_iterator(0), thrust::make_counting_iterator(d_data.cols), keyBegin, prg(0, 10));
+    // Sort the key channel and index channel such that the keys and indecies stay together
+    thrust::sort_by_key(keyBegin, keyEnd, idxBegin);
+
+    cv::Mat h_idx(d_data);
+  }
+  //! [sort]
 
-		cv::Mat h_idx(d_data);
-	}
-	//! [sort]
+  // Randomly fill a row matrix with 100 elements between -1 and 1
+  //! [random]
+  {
+    cv::cuda::GpuMat d_value(1, 100, CV_32F);
+    auto valueBegin = GpuMatBeginItr<float>(d_value);
+    auto valueEnd = GpuMatEndItr<float>(d_value);
+    thrust::transform(thrust::make_counting_iterator(0), thrust::make_counting_iterator(d_value.cols), valueBegin, prg(-1, 1));
 
-	// Randomly fill a row matrix with 100 elements between -1 and 1
-	//! [random]
-	{
-		cv::cuda::GpuMat d_value(1, 100, CV_32F);
-		auto valueBegin = GpuMatBeginItr<float>(d_value);
-		auto valueEnd = GpuMatEndItr<float>(d_value);
-		thrust::transform(thrust::make_counting_iterator(0), thrust::make_counting_iterator(d_value.cols), valueBegin, prg(-1, 1));
+    cv::Mat h_value(d_value);
+  }
+  //! [random]
 
-		cv::Mat h_value(d_value);
-	}
-	//! [random]
+  // OpenCV has count non zero, but what if you want to count a specific value?
+  //! [count_value]
+  {
+    cv::cuda::GpuMat d_value(1, 100, CV_32S);
+    d_value.setTo(cv::Scalar(0));
+    d_value.colRange(10, 50).setTo(cv::Scalar(15));
+    auto count = thrust::count(GpuMatBeginItr<int>(d_value), GpuMatEndItr<int>(d_value), 15);
+    std::cout << count << std::endl;
+  }
+  //! [count_value]
 
-	// OpenCV has count non zero, but what if you want to count a specific value?
-	//! [count_value]
-	{
-		cv::cuda::GpuMat d_value(1, 100, CV_32S);
-		d_value.setTo(cv::Scalar(0));
-		d_value.colRange(10, 50).setTo(cv::Scalar(15));
-		auto count = thrust::count(GpuMatBeginItr<int>(d_value), GpuMatEndItr<int>(d_value), 15);
-		std::cout << count << std::endl;
-	}
-	//! [count_value]
+  // Randomly fill an array then copy only values greater than 0.  Perform these tasks on a stream.
+  //! [copy_greater]
+  {
+    cv::cuda::GpuMat d_value(1, 100, CV_32F);
+    auto valueBegin = GpuMatBeginItr<float>(d_value);
+    auto valueEnd = GpuMatEndItr<float>(d_value);
+    cv::cuda::Stream stream;
+    //! [random_gen_stream]
+    // Same as the random generation code from before except now the transformation is being performed on a stream
+    thrust::transform(thrust::system::cuda::par.on(cv::cuda::StreamAccessor::getStream(stream)), thrust::make_counting_iterator(0), thrust::make_counting_iterator(d_value.cols), valueBegin, prg(-1, 1));
+    //! [random_gen_stream]
+    // Count the number of values we are going to copy
+    int count = thrust::count_if(thrust::system::cuda::par.on(cv::cuda::StreamAccessor::getStream(stream)), valueBegin, valueEnd, pred_greater<float>(0.0));
+    // Allocate a destination for copied values
+    cv::cuda::GpuMat d_valueGreater(1, count, CV_32F);
+    // Copy values that satisfy the predicate.
+    thrust::copy_if(thrust::system::cuda::par.on(cv::cuda::StreamAccessor::getStream(stream)), valueBegin, valueEnd, GpuMatBeginItr<float>(d_valueGreater), pred_greater<float>(0.0));
+    cv::Mat h_greater(d_valueGreater);
+  }
+  //! [copy_greater]
 
-	// Randomly fill an array then copy only values greater than 0.  Perform these tasks on a stream.
-	//! [copy_greater]
-	{
-		cv::cuda::GpuMat d_value(1, 100, CV_32F);
-		auto valueBegin = GpuMatBeginItr<float>(d_value);
-		auto valueEnd = GpuMatEndItr<float>(d_value);
-		cv::cuda::Stream stream;
-		//! [random_gen_stream]
-		// Same as the random generation code from before except now the transformation is being performed on a stream
-		thrust::transform(thrust::system::cuda::par.on(cv::cuda::StreamAccessor::getStream(stream)), thrust::make_counting_iterator(0), thrust::make_counting_iterator(d_value.cols), valueBegin, prg(-1, 1));
-		//! [random_gen_stream]
-		// Count the number of values we are going to copy
-		int count = thrust::count_if(thrust::system::cuda::par.on(cv::cuda::StreamAccessor::getStream(stream)), valueBegin, valueEnd, pred_greater<float>(0.0));
-		// Allocate a destination for copied values
-		cv::cuda::GpuMat d_valueGreater(1, count, CV_32F);
-		// Copy values that satisfy the predicate.
-		thrust::copy_if(thrust::system::cuda::par.on(cv::cuda::StreamAccessor::getStream(stream)), valueBegin, valueEnd, GpuMatBeginItr<float>(d_valueGreater), pred_greater<float>(0.0));
-		cv::Mat h_greater(d_valueGreater);
-	}
-	//! [copy_greater]
-	
-	return 0;
+  return 0;
 }

From ab84de967ef4639130485077a2613b6897033283 Mon Sep 17 00:00:00 2001
From: Dan <dtmoodie>
Date: Fri, 25 Sep 2015 08:56:37 -0400
Subject: [PATCH 07/10] More whitespace fixes.

---
 .../gpu/gpu-thrust-interop/Thrust_interop.hpp             | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp b/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp
index 46f1bc5485..92cdca0463 100644
--- a/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp
+++ b/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp
@@ -21,7 +21,7 @@ template<typename T> struct step_functor : public thrust::unary_function<int, in
 		CV_Assert(mat.depth() == cv::DataType<T>::depth);
 		columns = mat.cols;
 		step = mat.step / sizeof(T);
-		channels = mat.channels();		
+		channels = mat.channels();
 	}
 	__host__ __device__
 		int operator()(int x) const
@@ -34,7 +34,7 @@ template<typename T> struct step_functor : public thrust::unary_function<int, in
 //! [step_functor]
 //! [begin_itr]
 /*
-	@Brief GpuMatBeginItr returns a thrust compatible iterator to the beginning of a GPU mat's memory.  
+	@Brief GpuMatBeginItr returns a thrust compatible iterator to the beginning of a GPU mat's memory.
 	@Param mat is the input matrix
 	@Param channel is the channel of the matrix that the iterator is accessing.  If set to -1, the iterator will access every element in sequential order
 */
@@ -65,6 +65,4 @@ thrust::permutation_iterator<thrust::device_ptr<T>, thrust::transform_iterator<s
 	return thrust::make_permutation_iterator(thrust::device_pointer_cast(mat.ptr<T>(0) + channel),
 		thrust::make_transform_iterator(thrust::make_counting_iterator(mat.rows*mat.cols), step_functor<T>(mat.cols, mat.step / sizeof(T), mat.channels())));
 }
-//! [end_itr]
-
-
+//! [end_itr]
\ No newline at end of file

From e4184195a28cf25a54ba1634806f9677b9509e3c Mon Sep 17 00:00:00 2001
From: Dan <dtmoodie>
Date: Mon, 28 Sep 2015 09:37:10 -0400
Subject: [PATCH 08/10] Looks like something automatically added tabs back in
 after I already fixed the white space. :/

---
 .../gpu/gpu-thrust-interop/Thrust_interop.hpp | 68 +++++++++----------
 1 file changed, 34 insertions(+), 34 deletions(-)

diff --git a/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp b/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp
index 92cdca0463..4f44d51ce8 100644
--- a/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp
+++ b/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp
@@ -7,46 +7,46 @@
 #include <thrust/device_ptr.h>
 
 /*
-	@Brief step_functor is an object to correctly step a thrust iterator according to the stride of a matrix
+    @Brief step_functor is an object to correctly step a thrust iterator according to the stride of a matrix
 */
 //! [step_functor]
 template<typename T> struct step_functor : public thrust::unary_function<int, int>
 {
-	int columns;
-	int step;
-	int channels;
-	__host__ __device__ step_functor(int columns_, int step_, int channels_ = 1) : columns(columns_), step(step_), channels(channels_)	{	};
-	__host__ step_functor(cv::cuda::GpuMat& mat)
-	{
-		CV_Assert(mat.depth() == cv::DataType<T>::depth);
-		columns = mat.cols;
-		step = mat.step / sizeof(T);
-		channels = mat.channels();
-	}
-	__host__ __device__
-		int operator()(int x) const
-	{
-		int row = x / columns;
-		int idx = (row * step) + (x % columns)*channels;
-		return idx;
-	}
+    int columns;
+    int step;
+    int channels;
+    __host__ __device__ step_functor(int columns_, int step_, int channels_ = 1) : columns(columns_), step(step_), channels(channels_)	{	};
+    __host__ step_functor(cv::cuda::GpuMat& mat)
+    {
+        CV_Assert(mat.depth() == cv::DataType<T>::depth);
+        columns = mat.cols;
+        step = mat.step / sizeof(T);
+        channels = mat.channels();
+    }
+    __host__ __device__
+        int operator()(int x) const
+    {
+        int row = x / columns;
+        int idx = (row * step) + (x % columns)*channels; 
+        return idx;
+    }
 };
 //! [step_functor]
 //! [begin_itr]
 /*
-	@Brief GpuMatBeginItr returns a thrust compatible iterator to the beginning of a GPU mat's memory.
-	@Param mat is the input matrix
-	@Param channel is the channel of the matrix that the iterator is accessing.  If set to -1, the iterator will access every element in sequential order
+    @Brief GpuMatBeginItr returns a thrust compatible iterator to the beginning of a GPU mat's memory.
+    @Param mat is the input matrix
+    @Param channel is the channel of the matrix that the iterator is accessing.  If set to -1, the iterator will access every element in sequential order
 */
 template<typename T>
 thrust::permutation_iterator<thrust::device_ptr<T>, thrust::transform_iterator<step_functor<T>, thrust::counting_iterator<int>>>  GpuMatBeginItr(cv::cuda::GpuMat mat, int channel = 0)
 {
-	if (channel == -1)
-		mat = mat.reshape(1);
-	CV_Assert(mat.depth() == cv::DataType<T>::depth);
-	CV_Assert(channel < mat.channels());
-	return thrust::make_permutation_iterator(thrust::device_pointer_cast(mat.ptr<T>(0) + channel),
-		thrust::make_transform_iterator(thrust::make_counting_iterator(0), step_functor<T>(mat.cols, mat.step / sizeof(T), mat.channels())));
+    if (channel == -1)
+        mat = mat.reshape(1);
+    CV_Assert(mat.depth() == cv::DataType<T>::depth);
+    CV_Assert(channel < mat.channels());
+    return thrust::make_permutation_iterator(thrust::device_pointer_cast(mat.ptr<T>(0) + channel),
+        thrust::make_transform_iterator(thrust::make_counting_iterator(0), step_functor<T>(mat.cols, mat.step / sizeof(T), mat.channels())));
 }
 //! [begin_itr]
 //! [end_itr]
@@ -58,11 +58,11 @@ thrust::permutation_iterator<thrust::device_ptr<T>, thrust::transform_iterator<s
 template<typename T>
 thrust::permutation_iterator<thrust::device_ptr<T>, thrust::transform_iterator<step_functor<T>, thrust::counting_iterator<int>>>  GpuMatEndItr(cv::cuda::GpuMat mat, int channel = 0)
 {
-	if (channel == -1)
-		mat = mat.reshape(1);
-	CV_Assert(mat.depth() == cv::DataType<T>::depth);
-	CV_Assert(channel < mat.channels());
-	return thrust::make_permutation_iterator(thrust::device_pointer_cast(mat.ptr<T>(0) + channel),
-		thrust::make_transform_iterator(thrust::make_counting_iterator(mat.rows*mat.cols), step_functor<T>(mat.cols, mat.step / sizeof(T), mat.channels())));
+    if (channel == -1)
+        mat = mat.reshape(1);
+    CV_Assert(mat.depth() == cv::DataType<T>::depth);
+    CV_Assert(channel < mat.channels());
+    return thrust::make_permutation_iterator(thrust::device_pointer_cast(mat.ptr<T>(0) + channel),
+        thrust::make_transform_iterator(thrust::make_counting_iterator(mat.rows*mat.cols), step_functor<T>(mat.cols, mat.step / sizeof(T), mat.channels())));
 }
 //! [end_itr]
\ No newline at end of file

From f332f98757174a94e493dd54dc1ac17fc22238b2 Mon Sep 17 00:00:00 2001
From: Dan Moodie <dtmoodie@gmail.com>
Date: Mon, 5 Oct 2015 20:21:46 -0400
Subject: [PATCH 09/10] Removed last trailing whitespace. Can't believe I
 missed it.

---
 .../cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp b/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp
index 4f44d51ce8..043c854297 100644
--- a/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp
+++ b/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp
@@ -27,7 +27,7 @@ template<typename T> struct step_functor : public thrust::unary_function<int, in
         int operator()(int x) const
     {
         int row = x / columns;
-        int idx = (row * step) + (x % columns)*channels; 
+        int idx = (row * step) + (x % columns)*channels;
         return idx;
     }
 };

From 01f40cfafc8c036fdd108a9dfddc04c1b037f78c Mon Sep 17 00:00:00 2001
From: Dan <dtmoodie>
Date: Tue, 10 Nov 2015 12:29:29 -0500
Subject: [PATCH 10/10] Corrected the case where channel == -1

---
 .../tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp b/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp
index 4f44d51ce8..7e880b88eb 100644
--- a/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp
+++ b/samples/cpp/tutorial_code/gpu/gpu-thrust-interop/Thrust_interop.hpp
@@ -42,7 +42,10 @@ template<typename T>
 thrust::permutation_iterator<thrust::device_ptr<T>, thrust::transform_iterator<step_functor<T>, thrust::counting_iterator<int>>>  GpuMatBeginItr(cv::cuda::GpuMat mat, int channel = 0)
 {
     if (channel == -1)
+    {
         mat = mat.reshape(1);
+        channel = 0;
+    }
     CV_Assert(mat.depth() == cv::DataType<T>::depth);
     CV_Assert(channel < mat.channels());
     return thrust::make_permutation_iterator(thrust::device_pointer_cast(mat.ptr<T>(0) + channel),
@@ -59,7 +62,10 @@ template<typename T>
 thrust::permutation_iterator<thrust::device_ptr<T>, thrust::transform_iterator<step_functor<T>, thrust::counting_iterator<int>>>  GpuMatEndItr(cv::cuda::GpuMat mat, int channel = 0)
 {
     if (channel == -1)
+    {
         mat = mat.reshape(1);
+        channel = 0;
+    }
     CV_Assert(mat.depth() == cv::DataType<T>::depth);
     CV_Assert(channel < mat.channels());
     return thrust::make_permutation_iterator(thrust::device_pointer_cast(mat.ptr<T>(0) + channel),