refactored and fixed bugs in gpu warp functions (remap, resize, warpAffine, warpPerspective)

wrote more complicated tests for them; implemented own version of warpAffine and warpPerspective for different border interpolation types; refactored some gpu tests
13 years ago · ade7394e77
parent 6e2507c197
commit ade7394e77
33 changed files with 6242 additions and 4544 deletions
--- a/modules/core/src/opengl_interop.cpp
+++ b/modules/core/src/opengl_interop.cpp
@ -1251,7 +1251,7 @@ cv::GlFont::GlFont(const string& family, int height, Weight weight, Style style)
    base_ = glGenLists(256);
    CV_CheckGlError();

-    glFuncTab()->generateBitmapFont(family, height, weight, style & STYLE_ITALIC, style & STYLE_UNDERLINE, 0, 256, base_);
+    glFuncTab()->generateBitmapFont(family, height, weight, (style & STYLE_ITALIC) != 0, (style & STYLE_UNDERLINE) != 0, 0, 256, base_);
 #endif
 }

--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@ -64,7 +64,7 @@ CV_EXPORTS int getCudaEnabledDeviceCount();
 CV_EXPORTS void setDevice(int device);
 CV_EXPORTS int getDevice();

-//! Explicitly destroys and cleans up all resources associated with the current device in the current process. 
+//! Explicitly destroys and cleans up all resources associated with the current device in the current process.
 //! Any subsequent API call to this device will reinitialize the device.
 CV_EXPORTS void resetDevice();

@ -81,7 +81,7 @@ enum FeatureSet
    NATIVE_DOUBLE = FEATURE_SET_COMPUTE_13
 };

-// Gives information about what GPU archs this OpenCV GPU module was 
+// Gives information about what GPU archs this OpenCV GPU module was
 // compiled for
 class CV_EXPORTS TargetArchs
 {
@ -266,10 +266,10 @@ private:
    Impl *impl;

    friend struct StreamAccessor;
-    
+
    explicit Stream(Impl* impl);
 };
-        
+

 //////////////////////////////// Filter Engine ////////////////////////////////

@ -432,26 +432,26 @@ CV_EXPORTS Ptr<BaseFilter_GPU> getMinFilter_GPU(int srcType, int dstType, const
 CV_EXPORTS void boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null());

 //! a synonym for normalized box filter
-static inline void blur(const GpuMat& src, GpuMat& dst, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null()) 
-{ 
-    boxFilter(src, dst, -1, ksize, anchor, stream); 
+static inline void blur(const GpuMat& src, GpuMat& dst, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null())
+{
+    boxFilter(src, dst, -1, ksize, anchor, stream);
 }

 //! erodes the image (applies the local minimum operator)
 CV_EXPORTS void erode(const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);
-CV_EXPORTS void erode(const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, 
-                      Point anchor = Point(-1, -1), int iterations = 1, 
+CV_EXPORTS void erode(const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf,
+                      Point anchor = Point(-1, -1), int iterations = 1,
                      Stream& stream = Stream::Null());

 //! dilates the image (applies the local maximum operator)
 CV_EXPORTS void dilate(const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);
-CV_EXPORTS void dilate(const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, 
-                       Point anchor = Point(-1, -1), int iterations = 1, 
+CV_EXPORTS void dilate(const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf,
+                       Point anchor = Point(-1, -1), int iterations = 1,
                       Stream& stream = Stream::Null());

 //! applies an advanced morphological operation to the image
 CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);
-CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, GpuMat& buf1, GpuMat& buf2, 
+CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, GpuMat& buf1, GpuMat& buf2,
                             Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());

 //! applies non-separable 2D linear filter to the image
@ -461,7 +461,7 @@ CV_EXPORTS void filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat&
 CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY,
                            Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
 CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, GpuMat& buf,
-                            Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, 
+                            Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1,
                            Stream& stream = Stream::Null());

 //! applies generalized Sobel operator to the image
@ -490,7 +490,7 @@ CV_EXPORTS void Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize
 ////////////////////////////// Arithmetics ///////////////////////////////////

 //! implements generalized matrix product algorithm GEMM from BLAS
-CV_EXPORTS void gemm(const GpuMat& src1, const GpuMat& src2, double alpha, 
+CV_EXPORTS void gemm(const GpuMat& src1, const GpuMat& src2, double alpha,
    const GpuMat& src3, double beta, GpuMat& dst, int flags = 0, Stream& stream = Stream::Null());

 //! transposes the matrix
@ -572,7 +572,7 @@ CV_EXPORTS void divide(const GpuMat& a, const Scalar& sc, GpuMat& c, double scal
 CV_EXPORTS void divide(double scale, const GpuMat& src2, GpuMat& dst, int dtype = -1, Stream& stream = Stream::Null());

 //! computes the weighted sum of two arrays (dst = alpha*src1 + beta*src2 + gamma)
-CV_EXPORTS void addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, 
+CV_EXPORTS void addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst,
                            int dtype = -1, Stream& stream = Stream::Null());

 //! adds scaled array to another one (dst = alpha*src1 + src2)
@ -669,17 +669,17 @@ CV_EXPORTS void alphaComp(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, i
 //! DST[x,y] = SRC[xmap[x,y],ymap[x,y]]
 //! supports only CV_32FC1 map type
 CV_EXPORTS void remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap,
-                      int interpolation, int borderMode = BORDER_CONSTANT, const Scalar& borderValue = Scalar(), 
+                      int interpolation, int borderMode = BORDER_CONSTANT, Scalar borderValue = Scalar(),
                      Stream& stream = Stream::Null());

 //! Does mean shift filtering on GPU.
 CV_EXPORTS void meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr,
-                                   TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1), 
+                                   TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1),
                                   Stream& stream = Stream::Null());

 //! Does mean shift procedure on GPU.
 CV_EXPORTS void meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr,
-                              TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1), 
+                              TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1),
                              Stream& stream = Stream::Null());

 //! Does mean shift segmentation with elimination of small regions.
@ -717,11 +717,17 @@ CV_EXPORTS void resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx=0,

 //! warps the image using affine transformation
 //! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
-CV_EXPORTS void warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags = INTER_LINEAR, Stream& stream = Stream::Null());
+CV_EXPORTS void warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags = INTER_LINEAR,
+    int borderMode = BORDER_CONSTANT, Scalar borderValue = Scalar(), Stream& stream = Stream::Null());
+
+CV_EXPORTS void buildWarpAffineMaps(const Mat& M, bool inverse, Size dsize, GpuMat& xmap, GpuMat& ymap, Stream& stream = Stream::Null());

 //! warps the image using perspective transformation
 //! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
-CV_EXPORTS void warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags = INTER_LINEAR, Stream& stream = Stream::Null());
+CV_EXPORTS void warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags = INTER_LINEAR,
+    int borderMode = BORDER_CONSTANT, Scalar borderValue = Scalar(), Stream& stream = Stream::Null());
+
+CV_EXPORTS void buildWarpPerspectiveMaps(const Mat& M, bool inverse, Size dsize, GpuMat& xmap, GpuMat& ymap, Stream& stream = Stream::Null());

 //! builds plane warping maps
 CV_EXPORTS void buildWarpPlaneMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, const Mat &T, float scale,
@ -738,11 +744,11 @@ CV_EXPORTS void buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat &K
 //! rotates an image around the origin (0,0) and then shifts it
 //! supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
 //! supports 1, 3 or 4 channels images with CV_8U, CV_16U or CV_32F depth
-CV_EXPORTS void rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0, 
+CV_EXPORTS void rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0,
                       int interpolation = INTER_LINEAR, Stream& stream = Stream::Null());

 //! copies 2D array to a larger destination array and pads borders with user-specifiable constant
-CV_EXPORTS void copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType, 
+CV_EXPORTS void copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType,
                               const Scalar& value = Scalar(), Stream& stream = Stream::Null());

 //! computes the integral image
@ -768,13 +774,13 @@ CV_EXPORTS void rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, co
 //! computes Harris cornerness criteria at each image pixel
 CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, int borderType = BORDER_REFLECT101);
 CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, double k, int borderType = BORDER_REFLECT101);
-CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, double k, 
+CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, double k,
                             int borderType = BORDER_REFLECT101, Stream& stream = Stream::Null());

 //! computes minimum eigen value of 2x2 derivative covariation matrix at each pixel - the cornerness criteria
 CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType=BORDER_REFLECT101);
 CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, int borderType=BORDER_REFLECT101);
-CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, 
+CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize,
    int borderType=BORDER_REFLECT101, Stream& stream = Stream::Null());

 //! performs per-element multiplication of two full (not packed) Fourier spectrums
@ -787,7 +793,7 @@ CV_EXPORTS void mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c

 //! Performs a forward or inverse discrete Fourier transform (1D or 2D) of floating point matrix.
 //! Param dft_size is the size of DFT transform.
-//! 
+//!
 //! If the source matrix is not continous, then additional copy will be done,
 //! so to avoid copying ensure the source matrix is continous one. If you want to use
 //! preallocated output ensure it is continuous too, otherwise it will be reallocated.
@ -808,7 +814,7 @@ CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& resul
 struct CV_EXPORTS ConvolveBuf
 {
    ConvolveBuf() {}
-    ConvolveBuf(Size image_size, Size templ_size) 
+    ConvolveBuf(Size image_size, Size templ_size)
        { create(image_size, templ_size); }
    void create(Size image_size, Size templ_size);
    void create(Size image_size, Size templ_size, Size block_size);
@ -837,10 +843,10 @@ CV_EXPORTS void pyrUp(const GpuMat& src, GpuMat& dst, int borderType = BORDER_DE

 //! performs linear blending of two images
 //! to avoid accuracy errors sum of weigths shouldn't be very close to zero
-CV_EXPORTS void blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2, 
+CV_EXPORTS void blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2,
                            GpuMat& result, Stream& stream = Stream::Null());

-        
+
 struct CV_EXPORTS CannyBuf;

 CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
@ -855,7 +861,7 @@ struct CV_EXPORTS CannyBuf
    CannyBuf(const GpuMat& dx_, const GpuMat& dy_);

    void create(const Size& image_size, int apperture_size = 3);
-    
+
    void release();

    GpuMat dx, dy;
@ -968,24 +974,24 @@ CV_EXPORTS void transformPoints(const GpuMat& src, const Mat& rvec, const Mat& t
                                GpuMat& dst, Stream& stream = Stream::Null());

 CV_EXPORTS void projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec,
-                              const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, 
+                              const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst,
                              Stream& stream = Stream::Null());

 CV_EXPORTS void solvePnPRansac(const Mat& object, const Mat& image, const Mat& camera_mat,
                               const Mat& dist_coef, Mat& rvec, Mat& tvec, bool use_extrinsic_guess=false,
-                               int num_iters=100, float max_dist=8.0, int min_inlier_count=100, 
+                               int num_iters=100, float max_dist=8.0, int min_inlier_count=100,
                               std::vector<int>* inliers=NULL);

 //////////////////////////////// Image Labeling ////////////////////////////////

-//!performs labeling via graph cuts of a 2D regular 4-connected graph. 
-CV_EXPORTS void graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, 
+//!performs labeling via graph cuts of a 2D regular 4-connected graph.
+CV_EXPORTS void graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels,
                         GpuMat& buf, Stream& stream = Stream::Null());

-//!performs labeling via graph cuts of a 2D regular 8-connected graph. 
-CV_EXPORTS void graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& topLeft, GpuMat& topRight, 
+//!performs labeling via graph cuts of a 2D regular 8-connected graph.
+CV_EXPORTS void graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& topLeft, GpuMat& topRight,
                         GpuMat& bottom, GpuMat& bottomLeft, GpuMat& bottomRight,
-                         GpuMat& labels, 
+                         GpuMat& labels,
                         GpuMat& buf, Stream& stream = Stream::Null());

 ////////////////////////////////// Histograms //////////////////////////////////
@ -1243,16 +1249,16 @@ struct CV_EXPORTS HOGDescriptor
    static vector<float> getPeopleDetector48x96();
    static vector<float> getPeopleDetector64x128();

-    void detect(const GpuMat& img, vector<Point>& found_locations, 
-                double hit_threshold=0, Size win_stride=Size(), 
+    void detect(const GpuMat& img, vector<Point>& found_locations,
+                double hit_threshold=0, Size win_stride=Size(),
                Size padding=Size());

    void detectMultiScale(const GpuMat& img, vector<Rect>& found_locations,
-                          double hit_threshold=0, Size win_stride=Size(), 
-                          Size padding=Size(), double scale0=1.05, 
+                          double hit_threshold=0, Size win_stride=Size(),
+                          Size padding=Size(), double scale0=1.05,
                          int group_threshold=2);

-    void getDescriptors(const GpuMat& img, Size win_stride, 
+    void getDescriptors(const GpuMat& img, Size win_stride,
                        GpuMat& descriptors,
                        int descr_format=DESCR_FORMAT_COL_BY_COL);

@ -1290,11 +1296,11 @@ protected:
    // Gradients conputation results
    GpuMat grad, qangle, grad_buf, qangle_buf;

-	// returns subbuffer with required size, reallocates buffer if nessesary.
-	static GpuMat getBuffer(const Size& sz, int type, GpuMat& buf);
-	static GpuMat getBuffer(int rows, int cols, int type, GpuMat& buf);
+    // returns subbuffer with required size, reallocates buffer if nessesary.
+    static GpuMat getBuffer(const Size& sz, int type, GpuMat& buf);
+    static GpuMat getBuffer(int rows, int cols, int type, GpuMat& buf);

-	std::vector<GpuMat> image_scales;
+    std::vector<GpuMat> image_scales;
 };


@ -1323,8 +1329,8 @@ public:
    bool isMaskSupported() const;

    // Find one best match for each query descriptor
-    void matchSingle(const GpuMat& query, const GpuMat& train, 
-        GpuMat& trainIdx, GpuMat& distance, 
+    void matchSingle(const GpuMat& query, const GpuMat& train,
+        GpuMat& trainIdx, GpuMat& distance,
        const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());

    // Download trainIdx and distance and convert it to CPU vector with DMatch
@ -1339,7 +1345,7 @@ public:
    void makeGpuCollection(GpuMat& trainCollection, GpuMat& maskCollection, const std::vector<GpuMat>& masks = std::vector<GpuMat>());

    // Find one best match from train collection for each query descriptor
-    void matchCollection(const GpuMat& query, const GpuMat& trainCollection, 
+    void matchCollection(const GpuMat& query, const GpuMat& trainCollection,
        GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
        const GpuMat& masks = GpuMat(), Stream& stream = Stream::Null());

@ -1508,7 +1514,7 @@ private:
 class CV_EXPORTS SURF_GPU : public CvSURFParams
 {
 public:
-    enum KeypointLayout 
+    enum KeypointLayout
    {
        SF_X = 0,
        SF_Y,
@ -1535,7 +1541,7 @@ public:

    //! download descriptors from device to host memory
    void downloadDescriptors(const GpuMat& descriptorsGPU, vector<float>& descriptors);
-    
+
    //! finds the keypoints using fast hessian detector used in SURF
    //! supports CV_8UC1 images
    //! keypoints will have nFeature cols and 6 rows
@ -1546,16 +1552,16 @@ public:
    //! keypoints.ptr<float>(SF_DIR)[i] will contain orientation of i'th feature
    //! keypoints.ptr<float>(SF_HESSIAN)[i] will contain response of i'th feature
    void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints);
-    //! finds the keypoints and computes their descriptors. 
+    //! finds the keypoints and computes their descriptors.
    //! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction
-    void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors, 
+    void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors,
        bool useProvidedKeypoints = false);

    void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints);
-    void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, GpuMat& descriptors, 
+    void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, GpuMat& descriptors,
        bool useProvidedKeypoints = false);

-    void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, std::vector<float>& descriptors, 
+    void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, std::vector<float>& descriptors,
        bool useProvidedKeypoints = false);

    void releaseMemory();
@ -1589,7 +1595,7 @@ public:

    //! finds the keypoints using FAST detector
    //! supports only CV_8UC1 images
-    void operator ()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints);    
+    void operator ()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints);
    void operator ()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints);

    //! download keypoints from device to host memory
@ -1709,7 +1715,7 @@ private:
    GpuMat pattern_;

    std::vector<GpuMat> imagePyr_;
-	std::vector<GpuMat> maskPyr_;
+    std::vector<GpuMat> maskPyr_;

    GpuMat buf_;

@ -1729,7 +1735,7 @@ class CV_EXPORTS BroxOpticalFlow
 {
 public:
    BroxOpticalFlow(float alpha_, float gamma_, float scale_factor_, int inner_iterations_, int outer_iterations_, int solver_iterations_) :
-        alpha(alpha_), gamma(gamma_), scale_factor(scale_factor_), 
+        alpha(alpha_), gamma(gamma_), scale_factor(scale_factor_),
        inner_iterations(inner_iterations_), outer_iterations(outer_iterations_), solver_iterations(solver_iterations_)
    {
    }
@ -1857,7 +1863,7 @@ private:
    GpuMat dy_calcBuf_;

    vector<GpuMat> prevPyr_;
-    vector<GpuMat> nextPyr_; 
+    vector<GpuMat> nextPyr_;

    GpuMat dx_buf_;
    GpuMat dy_buf_;
@ -1943,10 +1949,10 @@ private:
 //!            occlusion masks            0, occlusion masks            1,
 //!            interpolated forward flow  0, interpolated forward flow  1,
 //!            interpolated backward flow 0, interpolated backward flow 1
-//!            
-CV_EXPORTS void interpolateFrames(const GpuMat& frame0, const GpuMat& frame1, 
+//!
+CV_EXPORTS void interpolateFrames(const GpuMat& frame0, const GpuMat& frame1,
                                  const GpuMat& fu, const GpuMat& fv,
-                                  const GpuMat& bu, const GpuMat& bv, 
+                                  const GpuMat& bu, const GpuMat& bv,
                                  float pos, GpuMat& newFrame, GpuMat& buf,
                                  Stream& stream = Stream::Null());

--- a/modules/gpu/perf/perf_imgproc.cpp
+++ b/modules/gpu/perf/perf_imgproc.cpp
@ -35,9 +35,9 @@ GPU_PERF_TEST(Remap, cv::gpu::DeviceInfo, cv::Size, perf::MatType, Interpolation
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, Remap, testing::Combine(
-                        ALL_DEVICES, 
-                        GPU_TYPICAL_MAT_SIZES, 
-                        testing::Values(CV_8UC1, CV_8UC4, CV_16UC1, CV_32FC1), 
+                        ALL_DEVICES,
+                        GPU_TYPICAL_MAT_SIZES,
+                        testing::Values(CV_8UC1, CV_8UC4, CV_16UC1, CV_32FC1),
                        testing::Values((int) cv::INTER_NEAREST, (int) cv::INTER_LINEAR, (int) cv::INTER_CUBIC),
                        testing::Values((int) cv::BORDER_REFLECT101, (int) cv::BORDER_REPLICATE, (int) cv::BORDER_CONSTANT)));

@ -52,7 +52,7 @@ GPU_PERF_TEST_1(MeanShiftFiltering, cv::gpu::DeviceInfo)

    cv::Mat img = readImage("gpu/meanshift/cones.png");
    ASSERT_FALSE(img.empty());
-    
+
    cv::Mat rgba;
    cv::cvtColor(img, rgba, cv::COLOR_BGR2BGRA);

@ -80,7 +80,7 @@ GPU_PERF_TEST_1(MeanShiftProc, cv::gpu::DeviceInfo)

    cv::Mat img = readImage("gpu/meanshift/cones.png");
    ASSERT_FALSE(img.empty());
-    
+
    cv::Mat rgba;
    cv::cvtColor(img, rgba, cv::COLOR_BGR2BGRA);

@ -109,7 +109,7 @@ GPU_PERF_TEST_1(MeanShiftSegmentation, cv::gpu::DeviceInfo)

    cv::Mat img = readImage("gpu/meanshift/cones.png");
    ASSERT_FALSE(img.empty());
-    
+
    cv::Mat rgba;
    cv::cvtColor(img, rgba, cv::COLOR_BGR2BGRA);

@ -151,8 +151,8 @@ GPU_PERF_TEST(DrawColorDisp, cv::gpu::DeviceInfo, cv::Size, perf::MatType)
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, DrawColorDisp, testing::Combine(
-                        ALL_DEVICES, 
-                        GPU_TYPICAL_MAT_SIZES, 
+                        ALL_DEVICES,
+                        GPU_TYPICAL_MAT_SIZES,
                        testing::Values(CV_8UC1, CV_16SC1)));

 //////////////////////////////////////////////////////////////////////
@ -180,8 +180,8 @@ GPU_PERF_TEST(ReprojectImageTo3D, cv::gpu::DeviceInfo, cv::Size, perf::MatType)
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, ReprojectImageTo3D, testing::Combine(
-                        ALL_DEVICES, 
-                        GPU_TYPICAL_MAT_SIZES, 
+                        ALL_DEVICES,
+                        GPU_TYPICAL_MAT_SIZES,
                        testing::Values(CV_8UC1, CV_16SC1)));

 //////////////////////////////////////////////////////////////////////
@ -210,12 +210,12 @@ GPU_PERF_TEST(CvtColor, cv::gpu::DeviceInfo, cv::Size, perf::MatType, CvtColorIn
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, CvtColor, testing::Combine(
-                        ALL_DEVICES, 
-                        GPU_TYPICAL_MAT_SIZES, 
+                        ALL_DEVICES,
+                        GPU_TYPICAL_MAT_SIZES,
                        testing::Values(CV_8UC1, CV_16UC1, CV_32FC1),
                        testing::Values(
-                            CvtColorInfo(4, 4, cv::COLOR_RGBA2BGRA), CvtColorInfo(4, 1, cv::COLOR_BGRA2GRAY), CvtColorInfo(1, 4, cv::COLOR_GRAY2BGRA), 
-                            CvtColorInfo(4, 4, cv::COLOR_BGR2XYZ), CvtColorInfo(4, 4, cv::COLOR_BGR2YCrCb), CvtColorInfo(4, 4, cv::COLOR_YCrCb2BGR), 
+                            CvtColorInfo(4, 4, cv::COLOR_RGBA2BGRA), CvtColorInfo(4, 1, cv::COLOR_BGRA2GRAY), CvtColorInfo(1, 4, cv::COLOR_GRAY2BGRA),
+                            CvtColorInfo(4, 4, cv::COLOR_BGR2XYZ), CvtColorInfo(4, 4, cv::COLOR_BGR2YCrCb), CvtColorInfo(4, 4, cv::COLOR_YCrCb2BGR),
                            CvtColorInfo(4, 4, cv::COLOR_BGR2HSV), CvtColorInfo(4, 4, cv::COLOR_HSV2BGR))));

 //////////////////////////////////////////////////////////////////////
@ -269,8 +269,8 @@ GPU_PERF_TEST(Threshold, cv::gpu::DeviceInfo, cv::Size, perf::MatType)
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, Threshold, testing::Combine(
-                        ALL_DEVICES, 
-                        GPU_TYPICAL_MAT_SIZES, 
+                        ALL_DEVICES,
+                        GPU_TYPICAL_MAT_SIZES,
                        testing::Values(CV_8UC1, CV_16UC1, CV_32FC1)));

 //////////////////////////////////////////////////////////////////////
@ -302,8 +302,8 @@ GPU_PERF_TEST(Resize, cv::gpu::DeviceInfo, cv::Size, perf::MatType, Interpolatio
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, Resize, testing::Combine(
-                        ALL_DEVICES, 
-                        testing::Values(perf::szSXGA, perf::sz1080p), 
+                        ALL_DEVICES,
+                        testing::Values(perf::szSXGA, perf::sz1080p),
                        testing::Values(CV_8UC1, CV_8UC4, CV_16UC1, CV_32FC1),
                        testing::Values((int) cv::INTER_NEAREST, (int) cv::INTER_LINEAR, (int) cv::INTER_CUBIC),
                        testing::Values(0.5, 2.0)));
@ -327,22 +327,21 @@ GPU_PERF_TEST(WarpAffine, cv::gpu::DeviceInfo, cv::Size, perf::MatType, Interpol
    cv::gpu::GpuMat src(src_host);
    cv::gpu::GpuMat dst;

-    double reflect[2][3] = { {-1,  0, 0},
-                             { 0, -1, 0}};
-    reflect[0][2] = size.width;
-    reflect[1][2] = size.height;
-    cv::Mat M(2, 3, CV_64F, (void*) reflect); 
+    const double aplha = CV_PI / 4;
+    double mat[2][3] = { {std::cos(aplha), -std::sin(aplha), src.cols / 2},
+                         {std::sin(aplha),  std::cos(aplha), 0}};
+    cv::Mat M(2, 3, CV_64F, (void*) mat);

    TEST_CYCLE()
    {
-        cv::gpu::warpAffine(src, dst, M, size, interpolation);
+        cv::gpu::warpAffine(src, dst, M, size, interpolation, cv::BORDER_CONSTANT, cv::Scalar());
    }
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, WarpAffine, testing::Combine(
-                        ALL_DEVICES, 
-                        GPU_TYPICAL_MAT_SIZES, 
-                        testing::Values(CV_8UC1, CV_8UC4, CV_32FC1),
+                        ALL_DEVICES,
+                        GPU_TYPICAL_MAT_SIZES,
+                        testing::Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_32FC1, CV_32FC3, CV_32FC4),
                        testing::Values((int) cv::INTER_NEAREST, (int) cv::INTER_LINEAR, (int) cv::INTER_CUBIC)));

 //////////////////////////////////////////////////////////////////////
@ -364,23 +363,22 @@ GPU_PERF_TEST(WarpPerspective, cv::gpu::DeviceInfo, cv::Size, perf::MatType, Int
    cv::gpu::GpuMat src(src_host);
    cv::gpu::GpuMat dst;

-    double reflect[3][3] = { {-1,  0, 0},
-                             { 0, -1, 0},
-                             { 0, 0, 1}};
-    reflect[0][2] = size.width;
-    reflect[1][2] = size.height;
-    cv::Mat M(3, 3, CV_64F, (void*)reflect); 
+    const double aplha = CV_PI / 4;
+    double mat[3][3] = { {std::cos(aplha), -std::sin(aplha), src.cols / 2},
+                         {std::sin(aplha),  std::cos(aplha), 0},
+                         {0.0,              0.0,             1.0}};
+    cv::Mat M(3, 3, CV_64F, (void*) mat);

    TEST_CYCLE()
    {
-        cv::gpu::warpPerspective(src, dst, M, size, interpolation);
+        cv::gpu::warpPerspective(src, dst, M, size, interpolation, cv::BORDER_CONSTANT, cv::Scalar());
    }
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, WarpPerspective, testing::Combine(
-                        ALL_DEVICES, 
-                        GPU_TYPICAL_MAT_SIZES, 
-                        testing::Values(CV_8UC1, CV_8UC4, CV_32FC1),
+                        ALL_DEVICES,
+                        GPU_TYPICAL_MAT_SIZES,
+                        testing::Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_32FC1, CV_32FC3, CV_32FC4),
                        testing::Values((int) cv::INTER_NEAREST, (int) cv::INTER_LINEAR, (int) cv::INTER_CUBIC)));

 //////////////////////////////////////////////////////////////////////
@ -398,13 +396,13 @@ GPU_PERF_TEST(BuildWarpPlaneMaps, cv::gpu::DeviceInfo, cv::Size)

    TEST_CYCLE()
    {
-        cv::gpu::buildWarpPlaneMaps(size, cv::Rect(0, 0, size.width, size.height), cv::Mat::eye(3, 3, CV_32FC1), 
+        cv::gpu::buildWarpPlaneMaps(size, cv::Rect(0, 0, size.width, size.height), cv::Mat::eye(3, 3, CV_32FC1),
                                    cv::Mat::ones(3, 3, CV_32FC1), cv::Mat::zeros(1, 3, CV_32F), 1.0, map_x, map_y);
    }
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, BuildWarpPlaneMaps, testing::Combine(
-                        ALL_DEVICES, 
+                        ALL_DEVICES,
                        GPU_TYPICAL_MAT_SIZES));

 //////////////////////////////////////////////////////////////////////
@ -428,7 +426,7 @@ GPU_PERF_TEST(BuildWarpCylindricalMaps, cv::gpu::DeviceInfo, cv::Size)
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, BuildWarpCylindricalMaps, testing::Combine(
-                        ALL_DEVICES, 
+                        ALL_DEVICES,
                        GPU_TYPICAL_MAT_SIZES));

 //////////////////////////////////////////////////////////////////////
@ -452,7 +450,7 @@ GPU_PERF_TEST(BuildWarpSphericalMaps, cv::gpu::DeviceInfo, cv::Size)
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, BuildWarpSphericalMaps, testing::Combine(
-                        ALL_DEVICES, 
+                        ALL_DEVICES,
                        GPU_TYPICAL_MAT_SIZES));

 //////////////////////////////////////////////////////////////////////
@ -481,8 +479,8 @@ GPU_PERF_TEST(Rotate, cv::gpu::DeviceInfo, cv::Size, perf::MatType, Interpolatio
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, Rotate, testing::Combine(
-                        ALL_DEVICES, 
-                        GPU_TYPICAL_MAT_SIZES, 
+                        ALL_DEVICES,
+                        GPU_TYPICAL_MAT_SIZES,
                        testing::Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_32FC1, CV_32FC3, CV_32FC4),
                        testing::Values((int) cv::INTER_NEAREST, (int) cv::INTER_LINEAR, (int) cv::INTER_CUBIC)));

@ -512,8 +510,8 @@ GPU_PERF_TEST(CopyMakeBorder, cv::gpu::DeviceInfo, cv::Size, perf::MatType, Bord
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, CopyMakeBorder, testing::Combine(
-                        ALL_DEVICES, 
-                        GPU_TYPICAL_MAT_SIZES, 
+                        ALL_DEVICES,
+                        GPU_TYPICAL_MAT_SIZES,
                        testing::Values(CV_8UC1, CV_8UC4, CV_32FC1),
                        testing::Values((int) cv::BORDER_REPLICATE, (int) cv::BORDER_REFLECT, (int) cv::BORDER_WRAP, (int) cv::BORDER_CONSTANT)));

@ -542,7 +540,7 @@ GPU_PERF_TEST(Integral, cv::gpu::DeviceInfo, cv::Size)
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, Integral, testing::Combine(
-                        ALL_DEVICES, 
+                        ALL_DEVICES,
                        GPU_TYPICAL_MAT_SIZES));

 //////////////////////////////////////////////////////////////////////
@ -569,7 +567,7 @@ GPU_PERF_TEST(IntegralSqr, cv::gpu::DeviceInfo, cv::Size)
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, IntegralSqr, testing::Combine(
-                        ALL_DEVICES, 
+                        ALL_DEVICES,
                        GPU_TYPICAL_MAT_SIZES));

 //////////////////////////////////////////////////////////////////////
@ -596,7 +594,7 @@ GPU_PERF_TEST(ColumnSum, cv::gpu::DeviceInfo, cv::Size)
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, ColumnSum, testing::Combine(
-                        ALL_DEVICES, 
+                        ALL_DEVICES,
                        GPU_TYPICAL_MAT_SIZES));

 //////////////////////////////////////////////////////////////////////
@ -608,7 +606,7 @@ GPU_PERF_TEST(CornerHarris, cv::gpu::DeviceInfo, perf::MatType)
    int type = GET_PARAM(1);

    cv::gpu::setDevice(devInfo.deviceID());
-    
+
    cv::Mat img = readImage("gpu/stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(img.empty());

@ -620,7 +618,7 @@ GPU_PERF_TEST(CornerHarris, cv::gpu::DeviceInfo, perf::MatType)
    cv::gpu::GpuMat Dy;

    int blockSize = 3;
-    int ksize = 7;        
+    int ksize = 7;
    double k = 0.5;

    TEST_CYCLE()
@ -630,7 +628,7 @@ GPU_PERF_TEST(CornerHarris, cv::gpu::DeviceInfo, perf::MatType)
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, CornerHarris, testing::Combine(
-                        ALL_DEVICES, 
+                        ALL_DEVICES,
                        testing::Values(CV_8UC1, CV_32FC1)));

 //////////////////////////////////////////////////////////////////////
@ -642,7 +640,7 @@ GPU_PERF_TEST(CornerMinEigenVal, cv::gpu::DeviceInfo, perf::MatType)
    int type = GET_PARAM(1);

    cv::gpu::setDevice(devInfo.deviceID());
-    
+
    cv::Mat img = readImage("gpu/stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(img.empty());

@ -654,7 +652,7 @@ GPU_PERF_TEST(CornerMinEigenVal, cv::gpu::DeviceInfo, perf::MatType)
    cv::gpu::GpuMat Dy;

    int blockSize = 3;
-    int ksize = 7; 
+    int ksize = 7;

    TEST_CYCLE()
    {
@ -663,7 +661,7 @@ GPU_PERF_TEST(CornerMinEigenVal, cv::gpu::DeviceInfo, perf::MatType)
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, CornerMinEigenVal, testing::Combine(
-                        ALL_DEVICES, 
+                        ALL_DEVICES,
                        testing::Values(CV_8UC1, CV_32FC1)));

 //////////////////////////////////////////////////////////////////////
@ -692,7 +690,7 @@ GPU_PERF_TEST(MulSpectrums, cv::gpu::DeviceInfo, cv::Size)
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, MulSpectrums, testing::Combine(
-                        ALL_DEVICES, 
+                        ALL_DEVICES,
                        GPU_TYPICAL_MAT_SIZES));

 //////////////////////////////////////////////////////////////////////
@ -721,7 +719,7 @@ GPU_PERF_TEST(Dft, cv::gpu::DeviceInfo, cv::Size)
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, Dft, testing::Combine(
-                        ALL_DEVICES, 
+                        ALL_DEVICES,
                        GPU_TYPICAL_MAT_SIZES));

 //////////////////////////////////////////////////////////////////////
@ -754,7 +752,7 @@ GPU_PERF_TEST(Convolve, cv::gpu::DeviceInfo, cv::Size, int, bool)
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, Convolve, testing::Combine(
-                        ALL_DEVICES, 
+                        ALL_DEVICES,
                        GPU_TYPICAL_MAT_SIZES,
                        testing::Values(3, 9, 27, 32, 64),
                        testing::Bool()));
@ -784,7 +782,7 @@ GPU_PERF_TEST(PyrDown, cv::gpu::DeviceInfo, cv::Size, perf::MatType)
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, PyrDown, testing::Combine(
-                        ALL_DEVICES, 
+                        ALL_DEVICES,
                        GPU_TYPICAL_MAT_SIZES,
                        testing::Values(CV_8UC1, CV_8UC4, CV_16SC3, CV_32FC1)));

@ -813,7 +811,7 @@ GPU_PERF_TEST(PyrUp, cv::gpu::DeviceInfo, cv::Size, perf::MatType)
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, PyrUp, testing::Combine(
-                        ALL_DEVICES, 
+                        ALL_DEVICES,
                        GPU_TYPICAL_MAT_SIZES,
                        testing::Values(CV_8UC1, CV_8UC4, CV_16SC3, CV_32FC1)));

@ -846,7 +844,7 @@ GPU_PERF_TEST(BlendLinear, cv::gpu::DeviceInfo, cv::Size, perf::MatType)
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, BlendLinear, testing::Combine(
-                        ALL_DEVICES, 
+                        ALL_DEVICES,
                        GPU_TYPICAL_MAT_SIZES,
                        testing::Values(CV_8UC1, CV_32FC1)));

@ -878,7 +876,7 @@ GPU_PERF_TEST(AlphaComp, cv::gpu::DeviceInfo, cv::Size, perf::MatType, AlphaOp)
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, AlphaComp, testing::Combine(
-                        ALL_DEVICES, 
+                        ALL_DEVICES,
                        GPU_TYPICAL_MAT_SIZES,
                        testing::Values(CV_8UC4, CV_16UC4, CV_32SC4, CV_32FC4),
                        testing::Values((int)cv::gpu::ALPHA_OVER, (int)cv::gpu::ALPHA_IN, (int)cv::gpu::ALPHA_OUT, (int)cv::gpu::ALPHA_ATOP, (int)cv::gpu::ALPHA_XOR, (int)cv::gpu::ALPHA_PLUS, (int)cv::gpu::ALPHA_OVER_PREMUL, (int)cv::gpu::ALPHA_IN_PREMUL, (int)cv::gpu::ALPHA_OUT_PREMUL, (int)cv::gpu::ALPHA_ATOP_PREMUL, (int)cv::gpu::ALPHA_XOR_PREMUL, (int)cv::gpu::ALPHA_PLUS_PREMUL, (int)cv::gpu::ALPHA_PREMUL)));
@ -932,7 +930,7 @@ GPU_PERF_TEST(CalcHist, cv::gpu::DeviceInfo, cv::Size)
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, CalcHist, testing::Combine(
-                        ALL_DEVICES, 
+                        ALL_DEVICES,
                        GPU_TYPICAL_MAT_SIZES));

 //////////////////////////////////////////////////////////////////////
@ -961,7 +959,7 @@ GPU_PERF_TEST(EqualizeHist, cv::gpu::DeviceInfo, cv::Size)
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, EqualizeHist, testing::Combine(
-                        ALL_DEVICES, 
+                        ALL_DEVICES,
                        GPU_TYPICAL_MAT_SIZES));

 //////////////////////////////////////////////////////////////////////
@ -982,7 +980,7 @@ GPU_PERF_TEST(ImagePyramid_build, cv::gpu::DeviceInfo, cv::Size, perf::MatType)
    cv::gpu::GpuMat src(src_host);

    cv::gpu::ImagePyramid pyr;
-    
+
    TEST_CYCLE()
    {
        pyr.build(src, 5);
@ -990,7 +988,7 @@ GPU_PERF_TEST(ImagePyramid_build, cv::gpu::DeviceInfo, cv::Size, perf::MatType)
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, ImagePyramid_build, testing::Combine(
-                        ALL_DEVICES, 
+                        ALL_DEVICES,
                        GPU_TYPICAL_MAT_SIZES,
                        testing::Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_32FC1, CV_32FC3, CV_32FC4)));

@ -1010,7 +1008,7 @@ GPU_PERF_TEST(ImagePyramid_getLayer, cv::gpu::DeviceInfo, cv::Size, perf::MatTyp
    cv::gpu::GpuMat dst;

    cv::gpu::ImagePyramid pyr(src, 3);
-    
+
    TEST_CYCLE()
    {
        pyr.getLayer(dst, cv::Size(size.width / 2 + 10, size.height / 2 + 10));
@ -1018,7 +1016,7 @@ GPU_PERF_TEST(ImagePyramid_getLayer, cv::gpu::DeviceInfo, cv::Size, perf::MatTyp
 }

 INSTANTIATE_TEST_CASE_P(ImgProc, ImagePyramid_getLayer, testing::Combine(
-                        ALL_DEVICES, 
+                        ALL_DEVICES,
                        GPU_TYPICAL_MAT_SIZES,
                        testing::Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_32FC1, CV_32FC3, CV_32FC4)));

--- a/modules/gpu/src/color.cpp
+++ b/modules/gpu/src/color.cpp
--- a/modules/gpu/src/cuda/remap.cu
+++ b/modules/gpu/src/cuda/remap.cu
@ -47,10 +47,10 @@
 #include "opencv2/gpu/device/saturate_cast.hpp"
 #include "opencv2/gpu/device/filters.hpp"

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
-    namespace imgproc 
-    {    
+    namespace imgproc
+    {
        template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, DevMem2D_<T> dst)
        {
            const int x = blockDim.x * blockIdx.x + threadIdx.x;
@ -67,11 +67,10 @@ namespace cv { namespace gpu { namespace device

        template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherStream
        {
-            static void call(DevMem2D_<T> src, DevMem2Df mapx, DevMem2Df mapy, DevMem2D_<T> dst, 
-                const float* borderValue, cudaStream_t stream, int)
+            static void call(DevMem2D_<T> src, DevMem2Df mapx, DevMem2Df mapy, DevMem2D_<T> dst, const float* borderValue, cudaStream_t stream, int)
            {
                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type; 
-                
+
                dim3 block(32, 8);
                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

@ -86,11 +85,10 @@ namespace cv { namespace gpu { namespace device

        template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStream
        {
-            static void call(DevMem2D_<T> src, DevMem2D_<T> srcWhole, int xoff, int yoff, DevMem2Df mapx, DevMem2Df mapy, 
-                DevMem2D_<T> dst, const float* borderValue, int)
+            static void call(DevMem2D_<T> src, DevMem2D_<T> srcWhole, int xoff, int yoff, DevMem2Df mapx, DevMem2Df mapy, DevMem2D_<T> dst, const float* borderValue, int)
            {
-                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type; 
-                
+                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
+
                dim3 block(32, 8);
                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

@ -189,7 +187,7 @@ namespace cv { namespace gpu { namespace device
        #undef OPENCV_GPU_IMPLEMENT_REMAP_TEX

        template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
-        { 
+        {
            static void call(DevMem2D_<T> src, DevMem2D_<T> srcWhole, int xoff, int yoff, DevMem2Df mapx, DevMem2Df mapy, 
                DevMem2D_<T> dst, const float* borderValue, cudaStream_t stream, int cc)
            {
@ -208,26 +206,26 @@ namespace cv { namespace gpu { namespace device

            static const caller_t callers[3][5] = 
            {
-                { 
-                    RemapDispatcher<PointFilter, BrdReflect101, T>::call, 
-                    RemapDispatcher<PointFilter, BrdReplicate, T>::call, 
-                    RemapDispatcher<PointFilter, BrdConstant, T>::call, 
-                    RemapDispatcher<PointFilter, BrdReflect, T>::call, 
-                    RemapDispatcher<PointFilter, BrdWrap, T>::call 
+                {
+                    RemapDispatcher<PointFilter, BrdReflect101, T>::call,
+                    RemapDispatcher<PointFilter, BrdReplicate, T>::call,
+                    RemapDispatcher<PointFilter, BrdConstant, T>::call,
+                    RemapDispatcher<PointFilter, BrdReflect, T>::call,
+                    RemapDispatcher<PointFilter, BrdWrap, T>::call
                },
-                { 
-                    RemapDispatcher<LinearFilter, BrdReflect101, T>::call, 
-                    RemapDispatcher<LinearFilter, BrdReplicate, T>::call, 
-                    RemapDispatcher<LinearFilter, BrdConstant, T>::call, 
-                    RemapDispatcher<LinearFilter, BrdReflect, T>::call, 
-                    RemapDispatcher<LinearFilter, BrdWrap, T>::call 
+                {
+                    RemapDispatcher<LinearFilter, BrdReflect101, T>::call,
+                    RemapDispatcher<LinearFilter, BrdReplicate, T>::call,
+                    RemapDispatcher<LinearFilter, BrdConstant, T>::call,
+                    RemapDispatcher<LinearFilter, BrdReflect, T>::call,
+                    RemapDispatcher<LinearFilter, BrdWrap, T>::call
                },
-                { 
-                    RemapDispatcher<CubicFilter, BrdReflect101, T>::call, 
-                    RemapDispatcher<CubicFilter, BrdReplicate, T>::call, 
-                    RemapDispatcher<CubicFilter, BrdConstant, T>::call, 
-                    RemapDispatcher<CubicFilter, BrdReflect, T>::call, 
-                    RemapDispatcher<CubicFilter, BrdWrap, T>::call 
+                {
+                    RemapDispatcher<CubicFilter, BrdReflect101, T>::call,
+                    RemapDispatcher<CubicFilter, BrdReplicate, T>::call,
+                    RemapDispatcher<CubicFilter, BrdConstant, T>::call,
+                    RemapDispatcher<CubicFilter, BrdReflect, T>::call,
+                    RemapDispatcher<CubicFilter, BrdWrap, T>::call
                }
            };

--- a/modules/gpu/src/cuda/resize.cu
+++ b/modules/gpu/src/cuda/resize.cu
@ -47,10 +47,10 @@
 #include "opencv2/gpu/device/saturate_cast.hpp"
 #include "opencv2/gpu/device/filters.hpp"

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
-    namespace imgproc 
-    {    
+    namespace imgproc
+    {
        template <typename Ptr2D, typename T> __global__ void resize(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)
        {
            const int x = blockDim.x * blockIdx.x + threadIdx.x;
@ -58,52 +58,25 @@ namespace cv { namespace gpu { namespace device

            if (x < dst.cols && y < dst.rows)
            {
-                const float xcoo = x / fx;
-                const float ycoo = y / fy;
+                const float xcoo = x * fx;
+                const float ycoo = y * fy;

-                dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
-            }
-        }
-        template <typename Ptr2D, typename T> __global__ void resizeNN(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)
-        {
-            const int x = blockDim.x * blockIdx.x + threadIdx.x;
-            const int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-            if (x < dst.cols && y < dst.rows)
-            {
-                const float xcoo = x / fx;
-                const float ycoo = y / fy;
-
-                dst.ptr(y)[x] = src(__float2int_rd(ycoo), __float2int_rd(xcoo));
+                dst(y, x) = saturate_cast<T>(src(ycoo, xcoo));
            }
        }

        template <template <typename> class Filter, typename T> struct ResizeDispatcherStream
        {
            static void call(DevMem2D_<T> src, float fx, float fy, DevMem2D_<T> dst, cudaStream_t stream)
-            {            
-                dim3 block(32, 8);
-                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-
-                BrdReplicate<T> brd(src.rows, src.cols);
-                BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
-                Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filter_src(brdSrc);
-
-                resize<<<grid, block, 0, stream>>>(filter_src, fx, fy, dst);
-                cudaSafeCall( cudaGetLastError() );
-            }
-        };
-        template <typename T> struct ResizeDispatcherStream<PointFilter, T>
-        {
-            static void call(DevMem2D_<T> src, float fx, float fy, DevMem2D_<T> dst, cudaStream_t stream)
-            {            
+            {
                dim3 block(32, 8);
                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

                BrdReplicate<T> brd(src.rows, src.cols);
                BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
+                Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filteredSrc(brdSrc);

-                resizeNN<<<grid, block, 0, stream>>>(brdSrc, fx, fy, dst);
+                resize<<<grid, block, 0, stream>>>(filteredSrc, fx, fy, dst);
                cudaSafeCall( cudaGetLastError() );
            }
        };
@ -111,31 +84,15 @@ namespace cv { namespace gpu { namespace device
        template <template <typename> class Filter, typename T> struct ResizeDispatcherNonStream
        {
            static void call(DevMem2D_<T> src, DevMem2D_<T> srcWhole, int xoff, int yoff, float fx, float fy, DevMem2D_<T> dst)
-            {            
-                dim3 block(32, 8);
-                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-
-                BrdReplicate<T> brd(src.rows, src.cols);
-                BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
-                Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filter_src(brdSrc);
-
-                resize<<<grid, block>>>(filter_src, fx, fy, dst);
-                cudaSafeCall( cudaGetLastError() );
-
-                cudaSafeCall( cudaDeviceSynchronize() );
-            }
-        };
-        template <typename T> struct ResizeDispatcherNonStream<PointFilter, T>
-        {
-            static void call(DevMem2D_<T> src, DevMem2D_<T> srcWhole, int xoff, int yoff, float fx, float fy, DevMem2D_<T> dst)
-            {            
+            {
                dim3 block(32, 8);
                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

                BrdReplicate<T> brd(src.rows, src.cols);
                BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
+                Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filteredSrc(brdSrc);

-                resizeNN<<<grid, block>>>(brdSrc, fx, fy, dst);
+                resize<<<grid, block>>>(filteredSrc, fx, fy, dst);
                cudaSafeCall( cudaGetLastError() );

                cudaSafeCall( cudaDeviceSynchronize() );
@ -148,73 +105,61 @@ namespace cv { namespace gpu { namespace device
            { \
                typedef type elem_type; \
                typedef int index_type; \
-                int xoff, yoff; \
-                tex_resize_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
+                const int xoff; \
+                const int yoff; \
+                __host__ tex_resize_ ## type ## _reader(int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
                __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
                { \
-                    return tex2D(tex_resize_ ## type , x + xoff, y + yoff); \
+                    return tex2D(tex_resize_ ## type, x + xoff, y + yoff); \
                } \
            }; \
-            template <template <typename> class Filter> struct ResizeDispatcherNonStream<Filter, type> \
+            template <template <typename> class Filter> struct ResizeDispatcherNonStream<Filter, type > \
            { \
                static void call(DevMem2D_< type > src, DevMem2D_< type > srcWhole, int xoff, int yoff, float fx, float fy, DevMem2D_< type > dst) \
                { \
                    dim3 block(32, 8); \
                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
-                    bindTexture(&tex_resize_ ## type , srcWhole); \
-                    tex_resize_ ## type ##_reader texSrc(xoff, yoff); \
-                    BrdReplicate< type > brd(src.rows, src.cols); \
-                    BorderReader< tex_resize_ ## type ##_reader , BrdReplicate< type > > brdSrc(texSrc, brd); \
-                    Filter< BorderReader< tex_resize_ ## type ##_reader , BrdReplicate< type > > > filter_src(brdSrc); \
-                    resize<<<grid, block>>>(filter_src, fx, fy, dst); \
-                    cudaSafeCall( cudaGetLastError() ); \
-                    cudaSafeCall( cudaDeviceSynchronize() ); \
-                } \
-            }; \
-            template <> struct ResizeDispatcherNonStream<PointFilter, type> \
-            { \
-                static void call(DevMem2D_< type > src, DevMem2D_< type > srcWhole, int xoff, int yoff, float fx, float fy, DevMem2D_< type > dst) \
-                { \
-                    dim3 block(32, 8); \
-                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
-                    bindTexture(&tex_resize_ ## type , srcWhole); \
-                    tex_resize_ ## type ##_reader texSrc(xoff, yoff); \
-                    BrdReplicate< type > brd(src.rows, src.cols); \
-                    BorderReader< tex_resize_ ## type ##_reader , BrdReplicate< type > > brdSrc(texSrc, brd); \
-                    resizeNN<<<grid, block>>>(brdSrc, fx, fy, dst); \
+                    bindTexture(&tex_resize_ ## type, srcWhole); \
+                    tex_resize_ ## type ## _reader texSrc(xoff, yoff); \
+                    if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
+                    { \
+                        Filter<tex_resize_ ## type ## _reader> filteredSrc(texSrc); \
+                        resize<<<grid, block>>>(filteredSrc, fx, fy, dst); \
+                    } \
+                    else \
+                    { \
+                        BrdReplicate< type > brd(src.rows, src.cols); \
+                        BorderReader<tex_resize_ ## type ## _reader, BrdReplicate< type > > brdSrc(texSrc, brd); \
+                        Filter< BorderReader<tex_resize_ ## type ## _reader, BrdReplicate< type > > > filteredSrc(brdSrc); \
+                        resize<<<grid, block>>>(filteredSrc, fx, fy, dst); \
+                    } \
                    cudaSafeCall( cudaGetLastError() ); \
                    cudaSafeCall( cudaDeviceSynchronize() ); \
                } \
            };
-            
+
        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar)
-        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar2)
        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar4)

        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(schar)
-        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char2)
        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char4)

        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort)
-        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort2)
        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort4)

        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short)
-        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short2)
        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short4)

        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int)
-        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int2)
        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int4)

        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float)
-        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float2)
        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float4)

        #undef OPENCV_GPU_IMPLEMENT_RESIZE_TEX

        template <template <typename> class Filter, typename T> struct ResizeDispatcher
-        { 
+        {
            static void call(DevMem2D_<T> src, DevMem2D_<T> srcWhole, int xoff, int yoff, float fx, float fy, DevMem2D_<T> dst, cudaStream_t stream)
            {
                if (stream == 0)
--- a/modules/gpu/src/cuda/warp.cu
+++ b/modules/gpu/src/cuda/warp.cu
@ -0,0 +1,380 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "internal_shared.hpp"
+#include "opencv2/gpu/device/border_interpolate.hpp"
+#include "opencv2/gpu/device/vec_traits.hpp"
+#include "opencv2/gpu/device/vec_math.hpp"
+#include "opencv2/gpu/device/saturate_cast.hpp"
+#include "opencv2/gpu/device/filters.hpp"
+
+namespace cv { namespace gpu { namespace device
+{
+    namespace imgproc
+    {
+        // Warp coefficients in device constant memory, laid out as a row-major
+        // matrix. buildWarpAffineMaps_gpu uploads 6 floats (2x3) and
+        // buildWarpPerspectiveMaps_gpu uploads 9 floats (3x3); AffineTransform and
+        // PerspectiveTransform read them back inside the kernels.
+        __constant__ float c_warpMat[3 * 3];
+
+        // Coordinate mapper for affine warps. Treats the first six entries of
+        // c_warpMat as a row-major 2x3 matrix M and maps (x, y) to
+        //   (M[0]*x + M[1]*y + M[2],  M[3]*x + M[4]*y + M[5]).
+        // Entries 6..8 of c_warpMat are not read here.
+        struct AffineTransform
+        {
+            // Returns the transformed position as float2(xcoo, ycoo).
+            static __device__ __forceinline__ float2 calcCoord(int x, int y)
+            {
+                const float xcoo = c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2];
+                const float ycoo = c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5];
+
+                return make_float2(xcoo, ycoo);
+            }
+        };
+
+        // Coordinate mapper for perspective warps. Treats all nine entries of
+        // c_warpMat as a row-major 3x3 matrix M, computes the homogeneous
+        // denominator w = M[6]*x + M[7]*y + M[8] and divides the two affine-style
+        // sums (rows 0 and 1) by it.
+        struct PerspectiveTransform
+        {
+            // Returns the projected position as float2(xcoo, ycoo).
+            static __device__ __forceinline__ float2 calcCoord(int x, int y)
+            {
+                // Reciprocal of the homogeneous denominator. It is not checked
+                // against zero, so a zero denominator propagates through the
+                // float arithmetic below instead of being special-cased.
+                const float coeff = 1.0f / (c_warpMat[6] * x + c_warpMat[7] * y + c_warpMat[8]);
+
+                const float xcoo = coeff * (c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2]);
+                const float ycoo = coeff * (c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5]);
+
+                return make_float2(xcoo, ycoo);
+            }
+        };
+
+        ///////////////////////////////////////////////////////////////////
+        // Build Maps
+
+        // Kernel: fills a pair of coordinate maps. Each thread handles one map
+        // element (x, y), evaluates Transform::calcCoord(x, y) and stores the
+        // resulting x coordinate in xmap and the y coordinate in ymap.
+        //
+        // Only xmap carries a size (DevMem2Df); ymap is a bare PtrStepf, so the
+        // bounds check uses xmap's dimensions for both.
+        // NOTE(review): assumes ymap has at least xmap.rows x xmap.cols elements - confirm at the call site.
+        template <class Transform> __global__ void buildWarpMaps(DevMem2Df xmap, PtrStepf ymap)
+        {
+            const int x = blockDim.x * blockIdx.x + threadIdx.x;
+            const int y = blockDim.y * blockIdx.y + threadIdx.y;
+
+            // Threads of the rounded-up grid that fall outside the map do nothing.
+            if (x < xmap.cols && y < xmap.rows)
+            {
+                const float2 coord = Transform::calcCoord(x, y);
+
+                xmap(y, x) = coord.x;
+                ymap(y, x) = coord.y;
+            }
+        }
+
+        // Host-side launcher for buildWarpMaps<Transform>. Uses 32x8 thread
+        // blocks and a grid rounded up (divUp) to cover xmap.cols x xmap.rows,
+        // launches on the given stream and checks the launch with
+        // cudaGetLastError. When stream == 0 it also waits for the device to
+        // finish; for any other stream the call returns without synchronizing.
+        template <class Transform> void buildWarpMaps_caller(DevMem2Df xmap, DevMem2Df ymap, cudaStream_t stream)
+        {
+            dim3 block(32, 8);
+            dim3 grid(divUp(xmap.cols, block.x), divUp(xmap.rows, block.y));
+            
+            buildWarpMaps<Transform><<<grid, block, 0, stream>>>(xmap, ymap);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        // Builds x/y coordinate maps for an affine transform.
+        //   coeffs - 6 floats (2x3 matrix, row-major as read by AffineTransform),
+        //            read from host memory by cudaMemcpyToSymbol
+        //   xmap   - output map receiving the transformed x coordinates
+        //   ymap   - output map receiving the transformed y coordinates
+        //   stream - stream used for the kernel launch
+        void buildWarpAffineMaps_gpu(float coeffs[2 * 3], DevMem2Df xmap, DevMem2Df ymap, cudaStream_t stream)
+        {
+            // Upload the matrix into the shared constant buffer c_warpMat.
+            // NOTE(review): this copy is not given `stream`; confirm that work
+            // already queued on other streams cannot still be reading c_warpMat.
+            cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 2 * 3 * sizeof(float)) );
+
+            buildWarpMaps_caller<AffineTransform>(xmap, ymap, stream);
+        }
+
+        // Builds x/y coordinate maps for a perspective transform.
+        //   coeffs - 9 floats (3x3 matrix, row-major as read by
+        //            PerspectiveTransform), read from host memory by
+        //            cudaMemcpyToSymbol
+        //   xmap   - output map receiving the projected x coordinates
+        //   ymap   - output map receiving the projected y coordinates
+        //   stream - stream used for the kernel launch
+        void buildWarpPerspectiveMaps_gpu(float coeffs[3 * 3], DevMem2Df xmap, DevMem2Df ymap, cudaStream_t stream)
+        {
+            // Upload the matrix into the shared constant buffer c_warpMat.
+            // NOTE(review): this copy is not given `stream`; confirm that work
+            // already queued on other streams cannot still be reading c_warpMat.
+            cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 3 * 3 * sizeof(float)) );
+
+            buildWarpMaps_caller<PerspectiveTransform>(xmap, ymap, stream);
+        }
+
+        ///////////////////////////////////////////////////////////////////
+        // Warp
+
+        // Kernel: warps src into dst. Each thread handles one destination pixel
+        // (x, y): it asks Transform::calcCoord for the matching source position,
+        // samples src there and stores the value converted to T with
+        // saturate_cast.
+        //   Transform - AffineTransform or PerspectiveTransform (reads c_warpMat)
+        //   src       - callable sampled as src(y, x) with float coordinates; the
+        //               dispatchers pass a Filter wrapped around a BorderReader
+        //   dst       - destination image; its size defines the valid thread range
+        template <class Transform, class Ptr2D, typename T> __global__ void warp(const Ptr2D src, DevMem2D_<T> dst)
+        {
+            const int x = blockDim.x * blockIdx.x + threadIdx.x;
+            const int y = blockDim.y * blockIdx.y + threadIdx.y;
+
+            // Threads of the rounded-up grid that fall outside dst do nothing.
+            if (x < dst.cols && y < dst.rows)
+            {
+                const float2 coord = Transform::calcCoord(x, y);
+
+                // The source reader takes (row, column), hence (coord.y, coord.x).
+                dst.ptr(y)[x] = saturate_cast<T>(src(coord.y, coord.x));
+            }
+        }
+
+        // Launches the warp kernel on a caller-supplied stream, reading the
+        // source directly through PtrStep<T>.
+        //   Transform - coordinate mapper (AffineTransform / PerspectiveTransform)
+        //   Filter    - interpolation wrapper applied on top of the border reader
+        //   B         - border-handling policy, instantiated on work_type
+        //   T         - pixel type of src and dst
+        // The launch is only checked with cudaGetLastError; no synchronization is
+        // performed here. The trailing unnamed int parameter is ignored.
+        template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherStream
+        {
+            static void call(DevMem2D_<T> src, DevMem2D_<T> dst, const float* borderValue, cudaStream_t stream, int)
+            {
+                // Float vector type with the same channel count as T.
+                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
+
+                dim3 block(32, 8);
+                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+
+                // Reader stack: raw source -> border handling -> interpolation filter.
+                B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
+                BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
+                Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
+
+                warp<Transform><<<grid, block, 0, stream>>>(filter_src, dst);
+                cudaSafeCall( cudaGetLastError() );
+            }
+        };
+
+        // Generic (non-texture) variant used when no stream is supplied: launches
+        // the warp kernel with the default launch configuration (no stream
+        // argument) and then waits for the device to finish.
+        // srcWhole, xoff and yoff are not used by this generic version; they are
+        // part of the signature shared with the per-type specializations
+        // generated by OPENCV_GPU_IMPLEMENT_WARP_TEX. The trailing unnamed int
+        // parameter is ignored here as well.
+        template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherNonStream
+        {
+            static void call(DevMem2D_<T> src, DevMem2D_<T> srcWhole, int xoff, int yoff, DevMem2D_<T> dst, const float* borderValue, int)
+            {
+                // Float vector type with the same channel count as T.
+                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type; 
+
+                dim3 block(32, 8);
+                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+
+                // Reader stack: raw source -> border handling -> interpolation filter.
+                B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
+                BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
+                Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
+
+                warp<Transform><<<grid, block>>>(filter_src, dst);
+                cudaSafeCall( cudaGetLastError() );
+
+                // Block until the kernel has completed.
+                cudaSafeCall( cudaDeviceSynchronize() );
+            }
+        };
+        /*
+         * OPENCV_GPU_IMPLEMENT_WARP_TEX(type) generates, for one element type:
+         *
+         *  1. A 2D texture reference tex_warp_<type>, constructed with
+         *     cudaFilterModePoint and cudaAddressModeClamp.
+         *  2. A reader functor tex_warp_<type>_reader whose operator()(y, x)
+         *     returns tex2D(tex_warp_<type>, x + xoff, y + yoff).
+         *  3. A WarpDispatcherNonStream specialization for <type> with any border
+         *     policy B: binds the texture to srcWhole, reads through the reader
+         *     wrapped in BorderReader + Filter, and launches warp<Transform> with
+         *     32x8 blocks when cc >= 20 and 32x4 blocks otherwise.
+         *  4. A WarpDispatcherNonStream specialization for <type> with BrdReplicate:
+         *     when srcWhole and src have the same size the Filter reads the texture
+         *     reader directly with no BorderReader (presumably relying on the
+         *     texture's clamp addressing for replication - TODO confirm); otherwise
+         *     the reader is wrapped in a BorderReader using BrdReplicate<type>.
+         *
+         * Both specializations check cudaGetLastError() and then call
+         * cudaDeviceSynchronize().
+         */
+        #define OPENCV_GPU_IMPLEMENT_WARP_TEX(type) \
+            texture< type , cudaTextureType2D > tex_warp_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
+            struct tex_warp_ ## type ## _reader \
+            { \
+                typedef type elem_type; \
+                typedef int index_type; \
+                int xoff, yoff; \
+                tex_warp_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
+                __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
+                { \
+                    return tex2D(tex_warp_ ## type , x + xoff, y + yoff); \
+                } \
+            }; \
+            template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, type> \
+            { \
+                static void call(DevMem2D_< type > src, DevMem2D_< type > srcWhole, int xoff, int yoff, DevMem2D_< type > dst, const float* borderValue, int cc) \
+                { \
+                    typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
+                    dim3 block(32, cc >= 20 ? 8 : 4); \
+                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
+                    bindTexture(&tex_warp_ ## type , srcWhole); \
+                    tex_warp_ ## type ##_reader texSrc(xoff, yoff); \
+                    B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
+                    BorderReader< tex_warp_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
+                    Filter< BorderReader< tex_warp_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
+                    warp<Transform><<<grid, block>>>(filter_src, dst); \
+                    cudaSafeCall( cudaGetLastError() ); \
+                    cudaSafeCall( cudaDeviceSynchronize() ); \
+                } \
+            }; \
+            template <class Transform, template <typename> class Filter> struct WarpDispatcherNonStream<Transform, Filter, BrdReplicate, type> \
+            { \
+                static void call(DevMem2D_< type > src, DevMem2D_< type > srcWhole, int xoff, int yoff, DevMem2D_< type > dst, const float*, int) \
+                { \
+                    dim3 block(32, 8); \
+                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
+                    bindTexture(&tex_warp_ ## type , srcWhole); \
+                    tex_warp_ ## type ##_reader texSrc(xoff, yoff); \
+                    if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
+                    { \
+                        Filter< tex_warp_ ## type ##_reader > filter_src(texSrc); \
+                        warp<Transform><<<grid, block>>>(filter_src, dst); \
+                    } \
+                    else \
+                    { \
+                        BrdReplicate<type> brd(src.rows, src.cols); \
+                        BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > brdSrc(texSrc, brd); \
+                        Filter< BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > > filter_src(brdSrc); \
+                        warp<Transform><<<grid, block>>>(filter_src, dst); \
+                    } \
+                    cudaSafeCall( cudaGetLastError() ); \
+                    cudaSafeCall( cudaDeviceSynchronize() ); \
+                } \
+            };
+        /*
+         * Element types that get the texture-backed WarpDispatcherNonStream
+         * specializations. Commented-out entries are not instantiated, so those
+         * types use the primary WarpDispatcherNonStream template instead.
+         */
+        OPENCV_GPU_IMPLEMENT_WARP_TEX(uchar)
+        //OPENCV_GPU_IMPLEMENT_WARP_TEX(uchar2)
+        OPENCV_GPU_IMPLEMENT_WARP_TEX(uchar4)
+
+        //OPENCV_GPU_IMPLEMENT_WARP_TEX(schar)
+        //OPENCV_GPU_IMPLEMENT_WARP_TEX(char2)
+        //OPENCV_GPU_IMPLEMENT_WARP_TEX(char4)
+
+        OPENCV_GPU_IMPLEMENT_WARP_TEX(ushort)
+        //OPENCV_GPU_IMPLEMENT_WARP_TEX(ushort2)
+        OPENCV_GPU_IMPLEMENT_WARP_TEX(ushort4)
+
+        OPENCV_GPU_IMPLEMENT_WARP_TEX(short)
+        //OPENCV_GPU_IMPLEMENT_WARP_TEX(short2)
+        OPENCV_GPU_IMPLEMENT_WARP_TEX(short4)
+
+        //OPENCV_GPU_IMPLEMENT_WARP_TEX(int)
+        //OPENCV_GPU_IMPLEMENT_WARP_TEX(int2)
+        //OPENCV_GPU_IMPLEMENT_WARP_TEX(int4)
+
+        OPENCV_GPU_IMPLEMENT_WARP_TEX(float)
+        //OPENCV_GPU_IMPLEMENT_WARP_TEX(float2)
+        OPENCV_GPU_IMPLEMENT_WARP_TEX(float4)
+
+        #undef OPENCV_GPU_IMPLEMENT_WARP_TEX
+        /*
+         * Front-end dispatcher: selects the launch path from the stream handle.
+         *
+         * stream == 0  -> WarpDispatcherNonStream (receives srcWhole/xoff/yoff and cc)
+         * otherwise    -> WarpDispatcherStream (receives only src, dst, borderValue,
+         *                 stream and cc; srcWhole/xoff/yoff are not forwarded)
+         */
+        template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcher
+        { 
+            static void call(DevMem2D_<T> src, DevMem2D_<T> srcWhole, int xoff, int yoff, DevMem2D_<T> dst, const float* borderValue, cudaStream_t stream, int cc)
+            {
+                if (stream == 0)
+                    WarpDispatcherNonStream<Transform, Filter, B, T>::call(src, srcWhole, xoff, yoff, dst, borderValue, cc);
+                else
+                    WarpDispatcherStream<Transform, Filter, B, T>::call(src, dst, borderValue, stream, cc);
+            }
+        };
+        /*
+         * Selects the WarpDispatcher instantiation for the requested interpolation
+         * and border mode and invokes it for element type T.
+         *
+         * The table is indexed as funcs[interpolation][borderMode]:
+         *   interpolation: 0 = PointFilter, 1 = LinearFilter, 2 = CubicFilter
+         *   borderMode:    0 = BrdReflect101, 1 = BrdReplicate, 2 = BrdConstant,
+         *                  3 = BrdReflect, 4 = BrdWrap
+         *
+         * src, srcWhole and dst arrive as DevMem2Db and are cast to DevMem2D_<T>
+         * before the call; xoff, yoff, borderValue, stream and cc are forwarded
+         * unchanged.
+         *
+         * NOTE(review): neither index is range-checked here, so callers must pass
+         * interpolation in [0, 2] and borderMode in [0, 4].
+         */
+        template <class Transform, typename T> 
+        void warp_caller(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Db dst, int interpolation,
+                         int borderMode, const float* borderValue, cudaStream_t stream, int cc)
+        {
+            typedef void (*func_t)(DevMem2D_<T> src, DevMem2D_<T> srcWhole, int xoff, int yoff, DevMem2D_<T> dst, const float* borderValue, cudaStream_t stream, int cc);
+
+            static const func_t funcs[3][5] =
+            {
+                {
+                    WarpDispatcher<Transform, PointFilter, BrdReflect101, T>::call,
+                    WarpDispatcher<Transform, PointFilter, BrdReplicate, T>::call,
+                    WarpDispatcher<Transform, PointFilter, BrdConstant, T>::call,
+                    WarpDispatcher<Transform, PointFilter, BrdReflect, T>::call,
+                    WarpDispatcher<Transform, PointFilter, BrdWrap, T>::call
+                },
+                {
+                    WarpDispatcher<Transform, LinearFilter, BrdReflect101, T>::call,
+                    WarpDispatcher<Transform, LinearFilter, BrdReplicate, T>::call,
+                    WarpDispatcher<Transform, LinearFilter, BrdConstant, T>::call,
+                    WarpDispatcher<Transform, LinearFilter, BrdReflect, T>::call,
+                    WarpDispatcher<Transform, LinearFilter, BrdWrap, T>::call
+                },
+                {
+                    WarpDispatcher<Transform, CubicFilter, BrdReflect101, T>::call,
+                    WarpDispatcher<Transform, CubicFilter, BrdReplicate, T>::call,
+                    WarpDispatcher<Transform, CubicFilter, BrdConstant, T>::call,
+                    WarpDispatcher<Transform, CubicFilter, BrdReflect, T>::call,
+                    WarpDispatcher<Transform, CubicFilter, BrdWrap, T>::call
+                }
+            };
+
+            funcs[interpolation][borderMode](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<T> >(srcWhole), xoff, yoff,
+                static_cast< DevMem2D_<T> >(dst), borderValue, stream, cc);
+        }
+        /*
+         * Host entry point for the affine warp of element type T.
+         *
+         * Copies the 2x3 float coefficient array `coeffs` into the c_warpMat symbol
+         * (declared outside this view) with cudaMemcpyToSymbol, then forwards all
+         * remaining arguments to warp_caller<AffineTransform, T>.
+         *
+         * NOTE(review): the coefficient upload is not issued on `stream`; confirm
+         * this is safe when earlier work on a non-default stream may still be using
+         * c_warpMat.
+         */
+        template <typename T> void warpAffine_gpu(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation,
+                                                  int borderMode, const float* borderValue, cudaStream_t stream, int cc)
+        {
+            cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 2 * 3 * sizeof(float)) );
+
+            warp_caller<AffineTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc);
+        }
+
+        template void warpAffine_gpu<uchar >(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpAffine_gpu<uchar2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpAffine_gpu<uchar3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpAffine_gpu<uchar4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+
+        //template void warpAffine_gpu<schar>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpAffine_gpu<char2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpAffine_gpu<char3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpAffine_gpu<char4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+
+        template void warpAffine_gpu<ushort >(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpAffine_gpu<ushort2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpAffine_gpu<ushort3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpAffine_gpu<ushort4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+
+        template void warpAffine_gpu<short >(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpAffine_gpu<short2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpAffine_gpu<short3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpAffine_gpu<short4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+
+        //template void warpAffine_gpu<int >(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpAffine_gpu<int2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpAffine_gpu<int3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpAffine_gpu<int4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+
+        template void warpAffine_gpu<float >(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpAffine_gpu<float2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpAffine_gpu<float3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpAffine_gpu<float4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        /*
+         * Host entry point for the perspective warp of element type T.
+         *
+         * Copies the 3x3 float coefficient array `coeffs` into the c_warpMat symbol
+         * (declared outside this view) with cudaMemcpyToSymbol, then forwards all
+         * remaining arguments to warp_caller<PerspectiveTransform, T>.
+         *
+         * NOTE(review): the coefficient upload is not issued on `stream`; confirm
+         * this is safe when earlier work on a non-default stream may still be using
+         * c_warpMat.
+         */
+        template <typename T> void warpPerspective_gpu(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation,
+                                                  int borderMode, const float* borderValue, cudaStream_t stream, int cc)
+        {
+            cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 3 * 3 * sizeof(float)) );
+
+            warp_caller<PerspectiveTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc);
+        }
+
+        template void warpPerspective_gpu<uchar >(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpPerspective_gpu<uchar2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpPerspective_gpu<uchar3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpPerspective_gpu<uchar4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+
+        //template void warpPerspective_gpu<schar>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpPerspective_gpu<char2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpPerspective_gpu<char3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpPerspective_gpu<char4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+
+        template void warpPerspective_gpu<ushort >(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpPerspective_gpu<ushort2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpPerspective_gpu<ushort3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpPerspective_gpu<ushort4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+
+        template void warpPerspective_gpu<short >(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpPerspective_gpu<short2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpPerspective_gpu<short3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpPerspective_gpu<short4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+
+        //template void warpPerspective_gpu<int >(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpPerspective_gpu<int2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpPerspective_gpu<int3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpPerspective_gpu<int4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+
+        template void warpPerspective_gpu<float >(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void warpPerspective_gpu<float2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpPerspective_gpu<float3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void warpPerspective_gpu<float4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+    } // namespace imgproc
+}}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@ -47,15 +47,11 @@ using namespace cv::gpu;

 #if !defined (HAVE_CUDA)

-void cv::gpu::remap(const GpuMat&, GpuMat&, const GpuMat&, const GpuMat&, int, int, const Scalar&, Stream&){ throw_nogpu(); }
 void cv::gpu::meanShiftFiltering(const GpuMat&, GpuMat&, int, int, TermCriteria, Stream&) { throw_nogpu(); }
 void cv::gpu::meanShiftProc(const GpuMat&, GpuMat&, GpuMat&, int, int, TermCriteria, Stream&) { throw_nogpu(); }
 void cv::gpu::drawColorDisp(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
 void cv::gpu::reprojectImageTo3D(const GpuMat&, GpuMat&, const Mat&, Stream&) { throw_nogpu(); }
-void cv::gpu::resize(const GpuMat&, GpuMat&, Size, double, double, int, Stream&) { throw_nogpu(); }
 void cv::gpu::copyMakeBorder(const GpuMat&, GpuMat&, int, int, int, int, int, const Scalar&, Stream&) { throw_nogpu(); }
-void cv::gpu::warpAffine(const GpuMat&, GpuMat&, const Mat&, Size, int, Stream&) { throw_nogpu(); }
-void cv::gpu::warpPerspective(const GpuMat&, GpuMat&, const Mat&, Size, int, Stream&) { throw_nogpu(); }
 void cv::gpu::buildWarpPlaneMaps(Size, Rect, const Mat&, const Mat&, const Mat&, float, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::buildWarpCylindricalMaps(Size, Rect, const Mat&, const Mat&, float, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::buildWarpSphericalMaps(Size, Rect, const Mat&, const Mat&, float, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
@ -105,64 +101,6 @@ void cv::gpu::ImagePyramid::getLayer(GpuMat&, Size, Stream&) const { throw_nogpu

 #else /* !defined (HAVE_CUDA) */

-////////////////////////////////////////////////////////////////////////
-// remap
-
-namespace cv { namespace gpu { namespace device 
-{
-    namespace imgproc 
-    {
-        template <typename T> 
-        void remap_gpu(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap, DevMem2Db dst, 
-                       int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-    }
-}}}
-
-void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap, int interpolation, int borderMode, const Scalar& borderValue, Stream& stream)
-{
-    using namespace ::cv::gpu::device::imgproc;
-
-    typedef void (*caller_t)(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap, DevMem2Db dst, int interpolation, 
-        int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-
-    static const caller_t callers[6][4] = 
-    {
-        {remap_gpu<uchar>, 0/*remap_gpu<uchar2>*/, remap_gpu<uchar3>, remap_gpu<uchar4>},
-        {0/*remap_gpu<schar>*/, 0/*remap_gpu<char2>*/, 0/*remap_gpu<char3>*/, 0/*remap_gpu<char4>*/},
-        {remap_gpu<ushort>, 0/*remap_gpu<ushort2>*/, remap_gpu<ushort3>, remap_gpu<ushort4>},
-        {remap_gpu<short>, 0/*remap_gpu<short2>*/, remap_gpu<short3>, remap_gpu<short4>},
-        {0/*remap_gpu<int>*/, 0/*remap_gpu<int2>*/, 0/*remap_gpu<int3>*/, 0/*remap_gpu<int4>*/},
-        {remap_gpu<float>, 0/*remap_gpu<float2>*/, remap_gpu<float3>, remap_gpu<float4>}
-    };
-
-    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
-    CV_Assert(xmap.type() == CV_32F && ymap.type() == CV_32F && xmap.size() == ymap.size());
-
-    caller_t func = callers[src.depth()][src.channels() - 1];
-    CV_Assert(func != 0);
-
-    CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
-
-    CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);
-    int gpuBorderType;
-    CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));
-
-    dst.create(xmap.size(), src.type());
-    
-    Scalar_<float> borderValueFloat;
-    borderValueFloat = borderValue;
-
-    DeviceInfo info;
-    int cc = info.majorVersion() * 10 + info.minorVersion();
-    
-    Size wholeSize;
-    Point ofs;
-    src.locateROI(wholeSize, ofs);
-
-    func(src, DevMem2Db(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, xmap, ymap, 
-        dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(stream), cc);
-}
-
 ////////////////////////////////////////////////////////////////////////
 // meanShiftFiltering_GPU

@ -308,106 +246,6 @@ void cv::gpu::reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q,
    reprojectImageTo3D_callers[disp.type()](disp, xyzw, Q, StreamAccessor::getStream(stream));
 }

-////////////////////////////////////////////////////////////////////////
-// resize
-
-namespace cv { namespace gpu { namespace device 
-{
-    namespace imgproc 
-    {
-        template <typename T> void resize_gpu(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, 
-            DevMem2Db dst, int interpolation, cudaStream_t stream);
-    }
-}}}
-
-void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)
-{
-    CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 );
-    CV_Assert( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC );
-    CV_Assert( !(dsize == Size()) || (fx > 0 && fy > 0) );
-
-    if( dsize == Size() )
-    {
-        dsize = Size(saturate_cast<int>(src.cols * fx), saturate_cast<int>(src.rows * fy));
-    }
-    else
-    {
-        fx = (double)dsize.width / src.cols;
-        fy = (double)dsize.height / src.rows;
-    }
-
-    dst.create(dsize, src.type());
-
-    if (dsize == src.size())
-    {
-        if (s)
-            s.enqueueCopy(src, dst);
-        else
-            src.copyTo(dst);
-        return;
-    }
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-    
-    Size wholeSize;
-    Point ofs;
-    src.locateROI(wholeSize, ofs);
-
-    if ((src.type() == CV_8UC1 || src.type() == CV_8UC4) && (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR))
-    {
-        static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC, 0, NPPI_INTER_LANCZOS};
-
-        NppiSize srcsz;
-        srcsz.width  = wholeSize.width;
-        srcsz.height = wholeSize.height;
-
-        NppiRect srcrect;
-        srcrect.x = ofs.x;
-        srcrect.y = ofs.y;
-        srcrect.width  = src.cols;
-        srcrect.height = src.rows;
-
-        NppiSize dstsz;
-        dstsz.width  = dst.cols;
-        dstsz.height = dst.rows;
-
-        NppStreamHandler h(stream);
-
-        if (src.type() == CV_8UC1)
-        {
-            nppSafeCall( nppiResize_8u_C1R(src.datastart, srcsz, static_cast<int>(src.step), srcrect,
-                dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, fx, fy, npp_inter[interpolation]) );
-        }
-        else
-        {
-            nppSafeCall( nppiResize_8u_C4R(src.datastart, srcsz, static_cast<int>(src.step), srcrect,
-                dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, fx, fy, npp_inter[interpolation]) );
-        }
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-    else
-    {
-        using namespace ::cv::gpu::device::imgproc;
-
-        typedef void (*caller_t)(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream);
-
-        static const caller_t callers[6][4] = 
-        {
-            {resize_gpu<uchar>, 0/*resize_gpu<uchar2>*/, resize_gpu<uchar3>, resize_gpu<uchar4>},
-            {0/*resize_gpu<schar>*/, 0/*resize_gpu<char2>*/, 0/*resize_gpu<char3>*/, 0/*resize_gpu<char4>*/},
-            {resize_gpu<ushort>, 0/*resize_gpu<ushort2>*/, resize_gpu<ushort3>, resize_gpu<ushort4>},
-            {resize_gpu<short>, 0/*resize_gpu<short2>*/, resize_gpu<short3>, resize_gpu<short4>},
-            {0/*resize_gpu<int>*/, 0/*resize_gpu<int2>*/, 0/*resize_gpu<int3>*/, 0/*resize_gpu<int4>*/},
-            {resize_gpu<float>, 0/*resize_gpu<float2>*/, resize_gpu<float3>, resize_gpu<float4>}
-        };
-
-        callers[src.depth()][src.channels() - 1](src, DevMem2Db(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, 
-            static_cast<float>(fx), static_cast<float>(fy), dst, interpolation, stream);
-    }
-}
-
 ////////////////////////////////////////////////////////////////////////
 // copyMakeBorder

@ -511,175 +349,6 @@ void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom
    }
 }

-////////////////////////////////////////////////////////////////////////
-// warp
-
-namespace
-{
-    typedef NppStatus (*npp_warp_8u_t)(const Npp8u* pSrc, NppiSize srcSize, int srcStep, NppiRect srcRoi, Npp8u* pDst,
-                                       int dstStep, NppiRect dstRoi, const double coeffs[][3],
-                                       int interpolation);
-    typedef NppStatus (*npp_warp_16u_t)(const Npp16u* pSrc, NppiSize srcSize, int srcStep, NppiRect srcRoi, Npp16u* pDst,
-                                       int dstStep, NppiRect dstRoi, const double coeffs[][3],
-                                       int interpolation);
-    typedef NppStatus (*npp_warp_32s_t)(const Npp32s* pSrc, NppiSize srcSize, int srcStep, NppiRect srcRoi, Npp32s* pDst,
-                                       int dstStep, NppiRect dstRoi, const double coeffs[][3],
-                                       int interpolation);
-    typedef NppStatus (*npp_warp_32f_t)(const Npp32f* pSrc, NppiSize srcSize, int srcStep, NppiRect srcRoi, Npp32f* pDst,
-                                       int dstStep, NppiRect dstRoi, const double coeffs[][3],
-                                       int interpolation);
-
-    void nppWarpCaller(const GpuMat& src, GpuMat& dst, double coeffs[][3], const Size& dsize, int flags,
-                       npp_warp_8u_t npp_warp_8u[][2], npp_warp_16u_t npp_warp_16u[][2],
-                       npp_warp_32s_t npp_warp_32s[][2], npp_warp_32f_t npp_warp_32f[][2], cudaStream_t stream)
-    {
-        static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC};
-
-        int interpolation = flags & INTER_MAX;
-
-        CV_Assert((src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S || src.depth() == CV_32F) && src.channels() != 2);
-        CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
-
-        dst.create(dsize, src.type());
-    
-        Size wholeSize;
-        Point ofs;
-        src.locateROI(wholeSize, ofs);
-
-        NppiSize srcsz;
-        srcsz.height = wholeSize.height;
-        srcsz.width = wholeSize.width;
-
-        NppiRect srcroi;
-        srcroi.x = ofs.x;
-        srcroi.y = ofs.y;
-        srcroi.height = src.rows;
-        srcroi.width = src.cols;
-
-        NppiRect dstroi;
-        dstroi.x = dstroi.y = 0;
-        dstroi.height = dst.rows;
-        dstroi.width = dst.cols;
-
-        int warpInd = (flags & WARP_INVERSE_MAP) >> 4;
-
-        NppStreamHandler h(stream);
-
-        switch (src.depth())
-        {
-        case CV_8U:
-            nppSafeCall( npp_warp_8u[src.channels()][warpInd]((Npp8u*)src.datastart, srcsz, static_cast<int>(src.step), srcroi,
-                dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstroi, coeffs, npp_inter[interpolation]) );
-            break;
-        case CV_16U:
-            nppSafeCall( npp_warp_16u[src.channels()][warpInd]((Npp16u*)src.datastart, srcsz, static_cast<int>(src.step), srcroi,
-                dst.ptr<Npp16u>(), static_cast<int>(dst.step), dstroi, coeffs, npp_inter[interpolation]) );
-            break;
-        case CV_32S:
-            nppSafeCall( npp_warp_32s[src.channels()][warpInd]((Npp32s*)src.datastart, srcsz, static_cast<int>(src.step), srcroi,
-                dst.ptr<Npp32s>(), static_cast<int>(dst.step), dstroi, coeffs, npp_inter[interpolation]) );
-            break;
-        case CV_32F:
-            nppSafeCall( npp_warp_32f[src.channels()][warpInd]((Npp32f*)src.datastart, srcsz, static_cast<int>(src.step), srcroi,
-                dst.ptr<Npp32f>(), static_cast<int>(dst.step), dstroi, coeffs, npp_inter[interpolation]) );
-            break;
-        default:
-            CV_Assert(!"Unsupported source type");
-        }
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-}
-
-void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags, Stream& s)
-{
-    static npp_warp_8u_t npp_warpAffine_8u[][2] =
-        {
-            {0, 0},
-            {nppiWarpAffine_8u_C1R, nppiWarpAffineBack_8u_C1R},
-            {0, 0},
-            {nppiWarpAffine_8u_C3R, nppiWarpAffineBack_8u_C3R},
-            {nppiWarpAffine_8u_C4R, nppiWarpAffineBack_8u_C4R}
-        };
-    static npp_warp_16u_t npp_warpAffine_16u[][2] =
-        {
-            {0, 0},
-            {nppiWarpAffine_16u_C1R, nppiWarpAffineBack_16u_C1R},
-            {0, 0},
-            {nppiWarpAffine_16u_C3R, nppiWarpAffineBack_16u_C3R},
-            {nppiWarpAffine_16u_C4R, nppiWarpAffineBack_16u_C4R}
-        };
-    static npp_warp_32s_t npp_warpAffine_32s[][2] =
-        {
-            {0, 0},
-            {nppiWarpAffine_32s_C1R, nppiWarpAffineBack_32s_C1R},
-            {0, 0},
-            {nppiWarpAffine_32s_C3R, nppiWarpAffineBack_32s_C3R},
-            {nppiWarpAffine_32s_C4R, nppiWarpAffineBack_32s_C4R}
-        };
-    static npp_warp_32f_t npp_warpAffine_32f[][2] =
-        {
-            {0, 0},
-            {nppiWarpAffine_32f_C1R, nppiWarpAffineBack_32f_C1R},
-            {0, 0},
-            {nppiWarpAffine_32f_C3R, nppiWarpAffineBack_32f_C3R},
-            {nppiWarpAffine_32f_C4R, nppiWarpAffineBack_32f_C4R}
-        };
-
-    CV_Assert(M.rows == 2 && M.cols == 3);
-
-    double coeffs[2][3];
-    Mat coeffsMat(2, 3, CV_64F, (void*)coeffs);
-    M.convertTo(coeffsMat, coeffsMat.type());
-
-    nppWarpCaller(src, dst, coeffs, dsize, flags, npp_warpAffine_8u, npp_warpAffine_16u, npp_warpAffine_32s, npp_warpAffine_32f, StreamAccessor::getStream(s));
-}
-
-void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags, Stream& s)
-{
-    static npp_warp_8u_t npp_warpPerspective_8u[][2] =
-        {
-            {0, 0},
-            {nppiWarpPerspective_8u_C1R, nppiWarpPerspectiveBack_8u_C1R},
-            {0, 0},
-            {nppiWarpPerspective_8u_C3R, nppiWarpPerspectiveBack_8u_C3R},
-            {nppiWarpPerspective_8u_C4R, nppiWarpPerspectiveBack_8u_C4R}
-        };
-    static npp_warp_16u_t npp_warpPerspective_16u[][2] =
-        {
-            {0, 0},
-            {nppiWarpPerspective_16u_C1R, nppiWarpPerspectiveBack_16u_C1R},
-            {0, 0},
-            {nppiWarpPerspective_16u_C3R, nppiWarpPerspectiveBack_16u_C3R},
-            {nppiWarpPerspective_16u_C4R, nppiWarpPerspectiveBack_16u_C4R}
-        };
-    static npp_warp_32s_t npp_warpPerspective_32s[][2] =
-        {
-            {0, 0},
-            {nppiWarpPerspective_32s_C1R, nppiWarpPerspectiveBack_32s_C1R},
-            {0, 0},
-            {nppiWarpPerspective_32s_C3R, nppiWarpPerspectiveBack_32s_C3R},
-            {nppiWarpPerspective_32s_C4R, nppiWarpPerspectiveBack_32s_C4R}
-        };
-    static npp_warp_32f_t npp_warpPerspective_32f[][2] =
-        {
-            {0, 0},
-            {nppiWarpPerspective_32f_C1R, nppiWarpPerspectiveBack_32f_C1R},
-            {0, 0},
-            {nppiWarpPerspective_32f_C3R, nppiWarpPerspectiveBack_32f_C3R},
-            {nppiWarpPerspective_32f_C4R, nppiWarpPerspectiveBack_32f_C4R}
-        };
-
-    CV_Assert(M.rows == 3 && M.cols == 3);
-
-    double coeffs[3][3];
-    Mat coeffsMat(3, 3, CV_64F, (void*)coeffs);
-    M.convertTo(coeffsMat, coeffsMat.type());
-
-    nppWarpCaller(src, dst, coeffs, dsize, flags, npp_warpPerspective_8u, npp_warpPerspective_16u, npp_warpPerspective_32s, npp_warpPerspective_32f, StreamAccessor::getStream(s));
-}
-
 //////////////////////////////////////////////////////////////////////////////
 // buildWarpPlaneMaps

--- a/modules/gpu/src/opencv2/gpu/device/filters.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/filters.hpp
@ -46,8 +46,9 @@
 #include "saturate_cast.hpp"
 #include "vec_traits.hpp"
 #include "vec_math.hpp"
+#include "type_traits.hpp"

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
    template <typename Ptr2D> struct PointFilter
    {
@ -55,10 +56,10 @@ namespace cv { namespace gpu { namespace device
        typedef float index_type;

        explicit __host__ __device__ __forceinline__ PointFilter(const Ptr2D& src_) : src(src_) {}
-         
+
        __device__ __forceinline__ elem_type operator ()(float y, float x) const
        {
-            return src(__float2int_rn(y), __float2int_rn(x));
+            return src(__float2int_rd(y), __float2int_rd(x));
        }

        const Ptr2D src;
@ -77,6 +78,9 @@ namespace cv { namespace gpu { namespace device

            work_type out = VecTraits<work_type>::all(0);

+            x -= 0.5f;
+            y -= 0.5f;
+
            const int x1 = __float2int_rd(x);
            const int y1 = __float2int_rd(y);
            const int x2 = x1 + 1;
@ -107,8 +111,8 @@ namespace cv { namespace gpu { namespace device
        typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type;

        explicit __host__ __device__ __forceinline__ CubicFilter(const Ptr2D& src_) : src(src_) {}
-        
-        static __device__ __forceinline__ work_type cubicInterpolate(const work_type& p0, const work_type& p1, const work_type& p2, const work_type& p3, float x) 
+
+        static __device__ __forceinline__ work_type cubicInterpolate(typename TypeTraits<work_type>::ParameterType p0, typename TypeTraits<work_type>::ParameterType p1, typename TypeTraits<work_type>::ParameterType p2, typename TypeTraits<work_type>::ParameterType p3, float x)
        {
            return p1 + 0.5f * x * (p2 - p0 + x * (2.0f * p0 - 5.0f * p1 + 4.0f * p2 - p3 + x * (3.0f * (p1 - p2) + p3 - p0)));
        }
@ -117,15 +121,15 @@ namespace cv { namespace gpu { namespace device
        {
            const int xi = __float2int_rn(x);
            const int yi = __float2int_rn(y);
-            
+
            work_type arr[4];
-            
-            arr[0] = cubicInterpolate(saturate_cast<work_type>(src(yi - 1, xi - 1)), saturate_cast<work_type>(src(yi - 1, xi)), saturate_cast<work_type>(src(yi - 1, xi + 1)), saturate_cast<work_type>(src(yi - 1, xi + 2)), x - xi);
-            arr[1] = cubicInterpolate(saturate_cast<work_type>(src(yi    , xi - 1)), saturate_cast<work_type>(src(yi    , xi)), saturate_cast<work_type>(src(yi    , xi + 1)), saturate_cast<work_type>(src(yi    , xi + 2)), x - xi);
-            arr[2] = cubicInterpolate(saturate_cast<work_type>(src(yi + 1, xi - 1)), saturate_cast<work_type>(src(yi + 1, xi)), saturate_cast<work_type>(src(yi + 1, xi + 1)), saturate_cast<work_type>(src(yi + 1, xi + 2)), x - xi);
-            arr[3] = cubicInterpolate(saturate_cast<work_type>(src(yi + 2, xi - 1)), saturate_cast<work_type>(src(yi + 2, xi)), saturate_cast<work_type>(src(yi + 2, xi + 1)), saturate_cast<work_type>(src(yi + 2, xi + 2)), x - xi);
-            
-            return saturate_cast<elem_type>(cubicInterpolate(arr[0], arr[1], arr[2], arr[3], y - yi));
+
+            arr[0] = cubicInterpolate(saturate_cast<work_type>(src(yi - 2, xi - 2)), saturate_cast<work_type>(src(yi - 2, xi - 1)), saturate_cast<work_type>(src(yi - 2, xi)), saturate_cast<work_type>(src(yi - 2, xi + 1)), (x - xi + 2.0f) / 4.0f);
+            arr[1] = cubicInterpolate(saturate_cast<work_type>(src(yi - 1, xi - 2)), saturate_cast<work_type>(src(yi - 1, xi - 1)), saturate_cast<work_type>(src(yi - 1, xi)), saturate_cast<work_type>(src(yi - 1, xi + 1)), (x - xi + 2.0f) / 4.0f);
+            arr[2] = cubicInterpolate(saturate_cast<work_type>(src(yi    , xi - 2)), saturate_cast<work_type>(src(yi    , xi - 1)), saturate_cast<work_type>(src(yi    , xi)), saturate_cast<work_type>(src(yi    , xi + 1)), (x - xi + 2.0f) / 4.0f);
+            arr[3] = cubicInterpolate(saturate_cast<work_type>(src(yi + 1, xi - 2)), saturate_cast<work_type>(src(yi + 1, xi - 1)), saturate_cast<work_type>(src(yi + 1, xi)), saturate_cast<work_type>(src(yi + 1, xi + 1)), (x - xi + 2.0f) / 4.0f);
+
+            return saturate_cast<elem_type>(cubicInterpolate(arr[0], arr[1], arr[2], arr[3], (y - yi + 2.0f) / 4.0f));
        }

        const Ptr2D src;
--- a/modules/gpu/src/remap.cpp
+++ b/modules/gpu/src/remap.cpp
@ -0,0 +1,105 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+#ifndef HAVE_CUDA
+
+// Stub compiled when HAVE_CUDA is not defined: the body only calls throw_nogpu().
+void cv::gpu::remap(const GpuMat&, GpuMat&, const GpuMat&, const GpuMat&, int, int, Scalar, Stream&){ throw_nogpu(); }
+
+#else // HAVE_CUDA
+
+// Declaration only: remap_gpu<T> is defined outside this file. cv::gpu::remap in this file
+// picks the instantiation from the source depth and channel count.
+namespace cv { namespace gpu { namespace device 
+{
+    namespace imgproc 
+    {
+        // src is passed together with srcWhole and the (xoff, yoff) offset of src inside it;
+        // borderValue points to float components; cc is the compute capability encoded by the caller.
+        template <typename T> 
+        void remap_gpu(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap, DevMem2Db dst, 
+                       int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+    }
+}}}
+
+// Remaps src into dst through per-pixel lookup maps.
+//   src           - source image; depth up to CV_32F, at most 4 channels, and the depth/channel
+//                   combination must have a non-null entry in the dispatch table below
+//   dst           - output, (re)created with the size of xmap and the type of src
+//   xmap, ymap    - CV_32F maps of identical size
+//   interpolation - INTER_NEAREST, INTER_LINEAR or INTER_CUBIC
+//   borderMode    - BORDER_REFLECT101, BORDER_REPLICATE, BORDER_CONSTANT, BORDER_REFLECT or BORDER_WRAP
+//   borderValue   - converted to float components and forwarded to the device routine
+//   stream        - its underlying cudaStream_t is forwarded to the device routine
+// Violated preconditions are reported through CV_Assert.
+void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap, int interpolation, int borderMode, Scalar borderValue, Stream& stream)
+{
+    using namespace cv::gpu::device::imgproc;
+
+    typedef void (*remap_caller_t)(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, DevMem2Df xmap, DevMem2Df ymap, DevMem2Db dst,
+        int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+
+    // Dispatch table indexed as [depth][channels - 1]; a null entry marks an unsupported combination.
+    static const remap_caller_t callers[6][4] =
+    {
+        {remap_gpu<uchar>      , 0 /*remap_gpu<uchar2>*/ , remap_gpu<uchar3>     , remap_gpu<uchar4>     },
+        {0 /*remap_gpu<schar>*/, 0 /*remap_gpu<char2>*/  , 0 /*remap_gpu<char3>*/, 0 /*remap_gpu<char4>*/},
+        {remap_gpu<ushort>     , 0 /*remap_gpu<ushort2>*/, remap_gpu<ushort3>    , remap_gpu<ushort4>    },
+        {remap_gpu<short>      , 0 /*remap_gpu<short2>*/ , remap_gpu<short3>     , remap_gpu<short4>     },
+        {0 /*remap_gpu<int>*/  , 0 /*remap_gpu<int2>*/   , 0 /*remap_gpu<int3>*/ , 0 /*remap_gpu<int4>*/ },
+        {remap_gpu<float>      , 0 /*remap_gpu<float2>*/ , remap_gpu<float3>     , remap_gpu<float4>     }
+    };
+
+    // Argument validation; the depth/channel bound is checked before the table is indexed.
+    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
+    CV_Assert(xmap.type() == CV_32F && ymap.type() == CV_32F && xmap.size() == ymap.size());
+    CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
+    CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);
+
+    const remap_caller_t func = callers[src.depth()][src.channels() - 1];
+    CV_Assert(func != 0);
+
+    // Translate the OpenCV border constant into the value expected by the device code.
+    int gpuBorderType;
+    CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));
+
+    dst.create(xmap.size(), src.type());
+
+    Scalar_<float> borderValueFloat;
+    borderValueFloat = borderValue;
+
+    // Compute capability of the current device, encoded as major * 10 + minor.
+    DeviceInfo info;
+    const int cc = info.majorVersion() * 10 + info.minorVersion();
+
+    // Describe the whole allocation behind src so the device routine also gets the parent
+    // buffer and the offset of the ROI inside it.
+    Size wholeSize;
+    Point ofs;
+    src.locateROI(wholeSize, ofs);
+    DevMem2Db srcWhole(wholeSize.height, wholeSize.width, src.datastart, src.step);
+
+    cudaStream_t cudaStream = StreamAccessor::getStream(stream);
+
+    func(src, srcWhole, ofs.x, ofs.y, xmap, ymap,
+        dst, interpolation, gpuBorderType, borderValueFloat.val, cudaStream, cc);
+}
+
+#endif // HAVE_CUDA
--- a/modules/gpu/src/resize.cpp
+++ b/modules/gpu/src/resize.cpp
@ -0,0 +1,150 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+#ifndef HAVE_CUDA
+
+// Stub compiled when HAVE_CUDA is not defined: the body only calls throw_nogpu().
+void cv::gpu::resize(const GpuMat&, GpuMat&, Size, double, double, int, Stream&) { throw_nogpu(); }
+
+#else // HAVE_CUDA
+
+// Declaration only: resize_gpu<T> is defined outside this file. cv::gpu::resize in this file
+// picks the instantiation from the source depth and channel count.
+namespace cv { namespace gpu { namespace device
+{
+    namespace imgproc
+    {
+        // src is passed together with srcWhole and the (xoff, yoff) offset of src inside it.
+        // NOTE(review): cv::gpu::resize passes 1/fx and 1/fy for the fx/fy parameters here.
+        template <typename T>
+        void resize_gpu(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy,
+                        DevMem2Db dst, int interpolation, cudaStream_t stream);
+    }
+}}}
+
+// Resizes src into dst.
+//   src           - source image; depth up to CV_32F and at most 4 channels
+//   dst           - output, (re)created with the resulting size and the type of src
+//   dsize         - requested output size; when it equals Size() the size is derived from
+//                   fx/fy (both must then be > 0), otherwise fx/fy are recomputed from dsize
+//   interpolation - INTER_NEAREST, INTER_LINEAR or INTER_CUBIC
+//   s             - stream used for the copy, the NPP call or the device routine
+// When the output size equals the source size the image is simply copied.
+// CV_8UC1 (nearest/linear only) and CV_8UC4 are handled by nppiResize_8u_*; every other
+// supported type goes through device::imgproc::resize_gpu, which is given 1/fx and 1/fy.
+// Violated preconditions are reported through CV_Assert.
+void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)
+{
+    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
+    CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
+    CV_Assert(!(dsize == Size()) || (fx > 0 && fy > 0));
+
+    if (dsize == Size())
+        dsize = Size(saturate_cast<int>(src.cols * fx), saturate_cast<int>(src.rows * fy));
+    else
+    {
+        fx = static_cast<double>(dsize.width) / src.cols;
+        fy = static_cast<double>(dsize.height) / src.rows;
+    }
+
+    dst.create(dsize, src.type());
+
+    // Same size: nothing to interpolate, just copy (asynchronously when a stream is given).
+    if (dsize == src.size())
+    {
+        if (s)
+            s.enqueueCopy(src, dst);
+        else
+            src.copyTo(dst);
+        return;
+    }
+
+    // Guard against an empty output size (e.g. src.cols * fx rounding to 0, or a dsize with
+    // one zero side) before zero-sized regions are handed to NPP or to the device routine.
+    CV_Assert(dsize.area() > 0);
+
+    cudaStream_t stream = StreamAccessor::getStream(s);
+
+    // Whole allocation behind src and the offset of the ROI inside it.
+    Size wholeSize;
+    Point ofs;
+    src.locateROI(wholeSize, ofs);
+
+    // NPP is used for CV_8UC1 with nearest/linear interpolation and for CV_8UC4 with any
+    // of the accepted interpolation modes.
+    bool useNpp = (src.type() == CV_8UC1 || src.type() == CV_8UC4);
+    useNpp = useNpp && (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || src.type() == CV_8UC4);
+
+    if (useNpp)
+    {
+        typedef NppStatus (*func_t)(const Npp8u * pSrc, NppiSize oSrcSize, int nSrcStep, NppiRect oSrcROI, Npp8u * pDst, int nDstStep, NppiSize dstROISize,
+                                    double xFactor, double yFactor, int eInterpolation);
+
+        // Indexed by channels - 1; only the 1- and 4-channel entries are reachable here.
+        static const func_t funcs[4] = { nppiResize_8u_C1R, 0, 0, nppiResize_8u_C4R };
+
+        // Indexed by the OpenCV interpolation constant.
+        static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC, 0, NPPI_INTER_LANCZOS};
+
+        NppiSize srcsz;
+        srcsz.width  = wholeSize.width;
+        srcsz.height = wholeSize.height;
+
+        NppiRect srcrect;
+        srcrect.x = ofs.x;
+        srcrect.y = ofs.y;
+        srcrect.width  = src.cols;
+        srcrect.height = src.rows;
+
+        NppiSize dstsz;
+        dstsz.width  = dst.cols;
+        dstsz.height = dst.rows;
+
+        NppStreamHandler h(stream);
+
+        // NPP receives the start of the whole allocation plus the ROI rectangle.
+        nppSafeCall( funcs[src.channels() - 1](src.datastart, srcsz, static_cast<int>(src.step), srcrect,
+                dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, fx, fy, npp_inter[interpolation]) );
+
+        // With the default stream the call is made synchronous.
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+    else
+    {
+        using namespace ::cv::gpu::device::imgproc;
+
+        typedef void (*func_t)(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream);
+
+        // Dispatch table indexed as [depth][channels - 1]; a null entry marks an unsupported combination.
+        static const func_t funcs[6][4] =
+        {
+            {resize_gpu<uchar>      , 0 /*resize_gpu<uchar2>*/ , resize_gpu<uchar3>     , resize_gpu<uchar4>     },
+            {0 /*resize_gpu<schar>*/, 0 /*resize_gpu<char2>*/  , 0 /*resize_gpu<char3>*/, 0 /*resize_gpu<char4>*/},
+            {resize_gpu<ushort>     , 0 /*resize_gpu<ushort2>*/, resize_gpu<ushort3>    , resize_gpu<ushort4>    },
+            {resize_gpu<short>      , 0 /*resize_gpu<short2>*/ , resize_gpu<short3>     , resize_gpu<short4>     },
+            {0 /*resize_gpu<int>*/  , 0 /*resize_gpu<int2>*/   , 0 /*resize_gpu<int3>*/ , 0 /*resize_gpu<int4>*/ },
+            {resize_gpu<float>      , 0 /*resize_gpu<float2>*/ , resize_gpu<float3>     , resize_gpu<float4>     }
+        };
+
+        const func_t func = funcs[src.depth()][src.channels() - 1];
+        CV_Assert(func != 0);
+
+        // The device routine is given the inverse scale factors.
+        func(src, DevMem2Db(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y,
+            static_cast<float>(1.0 / fx), static_cast<float>(1.0 / fy), dst, interpolation, stream);
+    }
+}
+
+#endif // HAVE_CUDA
--- a/modules/gpu/src/warp.cpp
+++ b/modules/gpu/src/warp.cpp
@ -0,0 +1,463 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+#ifndef HAVE_CUDA
+
+// Stubs compiled when HAVE_CUDA is not defined: each body only calls throw_nogpu().
+void cv::gpu::warpAffine(const GpuMat&, GpuMat&, const Mat&, Size, int, int, Scalar, Stream&) { throw_nogpu(); }
+void cv::gpu::buildWarpAffineMaps(const Mat&, bool, Size, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
+
+void cv::gpu::warpPerspective(const GpuMat&, GpuMat&, const Mat&, Size, int, int, Scalar, Stream&) { throw_nogpu(); }
+void cv::gpu::buildWarpPerspectiveMaps(const Mat&, bool, Size, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
+
+#else // HAVE_CUDA
+
+// Declarations only: these routines are defined outside this file.
+namespace cv { namespace gpu { namespace device
+{
+    namespace imgproc
+    {
+        // Fills xmap/ymap from a 2x3 coefficient array (6 floats).
+        void buildWarpAffineMaps_gpu(float coeffs[2 * 3], DevMem2Df xmap, DevMem2Df ymap, cudaStream_t stream);
+
+        // Affine warp; src is passed together with srcWhole and the (xoff, yoff) offset of src inside it.
+        template <typename T>
+        void warpAffine_gpu(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation,
+                            int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+
+        // Fills xmap/ymap from a 3x3 coefficient array (9 floats).
+        void buildWarpPerspectiveMaps_gpu(float coeffs[3 * 3], DevMem2Df xmap, DevMem2Df ymap, cudaStream_t stream);
+
+        // Perspective warp; same parameter layout as warpAffine_gpu with a 3x3 coefficient array.
+        template <typename T>
+        void warpPerspective_gpu(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation,
+                            int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+    }
+}}}
+
+// Builds the x/y lookup maps for an affine transformation.
+//   M          - 2x3 single-channel transformation matrix
+//   inverse    - when true M is used as given; otherwise M is first inverted with
+//                invertAffineTransform
+//   dsize      - size of the generated maps
+//   xmap, ymap - outputs, (re)created as CV_32FC1 of size dsize
+//   stream     - its underlying cudaStream_t is forwarded to buildWarpAffineMaps_gpu
+// Violated preconditions are reported through CV_Assert.
+void cv::gpu::buildWarpAffineMaps(const Mat& M, bool inverse, Size dsize, GpuMat& xmap, GpuMat& ymap, Stream& stream)
+{
+    using namespace cv::gpu::device::imgproc;
+
+    CV_Assert(M.rows == 2 && M.cols == 3);
+    // convertTo() keeps the channel count of its source, so a multi-channel M would make it
+    // reallocate coeffsMat instead of writing into coeffs, leaving coeffs uninitialized.
+    CV_Assert(M.channels() == 1);
+
+    xmap.create(dsize, CV_32FC1);
+    ymap.create(dsize, CV_32FC1);
+
+    // coeffsMat is a header over the stack array, so convertTo() writes straight into coeffs.
+    float coeffs[2 * 3];
+    Mat coeffsMat(2, 3, CV_32F, (void*)coeffs);
+
+    if (inverse)
+        M.convertTo(coeffsMat, coeffsMat.type());
+    else
+    {
+        cv::Mat iM;
+        invertAffineTransform(M, iM);
+        iM.convertTo(coeffsMat, coeffsMat.type());
+    }
+
+    buildWarpAffineMaps_gpu(coeffs, xmap, ymap, StreamAccessor::getStream(stream));
+}
+
+// Builds the x/y lookup maps for a perspective transformation.
+//   M          - 3x3 single-channel transformation matrix
+//   inverse    - when true M is used as given; otherwise M is first inverted with invert()
+//   dsize      - size of the generated maps
+//   xmap, ymap - outputs, (re)created as CV_32FC1 of size dsize
+//   stream     - its underlying cudaStream_t is forwarded to buildWarpPerspectiveMaps_gpu
+// Violated preconditions are reported through CV_Assert.
+void cv::gpu::buildWarpPerspectiveMaps(const Mat& M, bool inverse, Size dsize, GpuMat& xmap, GpuMat& ymap, Stream& stream)
+{
+    using namespace cv::gpu::device::imgproc;
+
+    CV_Assert(M.rows == 3 && M.cols == 3);
+    // convertTo() keeps the channel count of its source, so a multi-channel M would make it
+    // reallocate coeffsMat instead of writing into coeffs, leaving coeffs uninitialized.
+    CV_Assert(M.channels() == 1);
+
+    xmap.create(dsize, CV_32FC1);
+    ymap.create(dsize, CV_32FC1);
+
+    // coeffsMat is a header over the stack array, so convertTo() writes straight into coeffs.
+    float coeffs[3 * 3];
+    Mat coeffsMat(3, 3, CV_32F, (void*)coeffs);
+
+    if (inverse)
+        M.convertTo(coeffsMat, coeffsMat.type());
+    else
+    {
+        cv::Mat iM;
+        invert(M, iM);
+        iM.convertTo(coeffsMat, coeffsMat.type());
+    }
+
+    buildWarpPerspectiveMaps_gpu(coeffs, xmap, ymap, StreamAccessor::getStream(stream));
+}
+
+// File-local helpers that wrap an NPP warp function behind a uniform call() signature.
+namespace
+{
+    // Maps an OpenCV depth constant to the corresponding NPP element type (npp_t) and,
+    // for the depths that declare it, the NPP complex type.
+    template<int DEPTH> struct NppTypeTraits;
+    template<> struct NppTypeTraits<CV_8U>  { typedef Npp8u npp_t; };
+    template<> struct NppTypeTraits<CV_8S>  { typedef Npp8s npp_t; };
+    template<> struct NppTypeTraits<CV_16U> { typedef Npp16u npp_t; };
+    template<> struct NppTypeTraits<CV_16S> { typedef Npp16s npp_t; typedef Npp16sc npp_complex_type; };
+    template<> struct NppTypeTraits<CV_32S> { typedef Npp32s npp_t; typedef Npp32sc npp_complex_type; };
+    template<> struct NppTypeTraits<CV_32F> { typedef Npp32f npp_t; typedef Npp32fc npp_complex_type; };
+    template<> struct NppTypeTraits<CV_64F> { typedef Npp64f npp_t; typedef Npp64fc npp_complex_type; };
+
+    // Function-pointer type of an NPP warp routine operating on elements of the given depth.
+    template <int DEPTH> struct NppWarpFunc
+    {
+        typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
+
+        typedef NppStatus (*func_t)(const npp_t* pSrc, NppiSize srcSize, int srcStep, NppiRect srcRoi, npp_t* pDst,
+                                    int dstStep, NppiRect dstRoi, const double coeffs[][3],
+                                    int interpolation);
+    };
+
+    // Binds one concrete NPP warp function (template argument func) to a static call().
+    template <int DEPTH, typename NppWarpFunc<DEPTH>::func_t func> struct NppWarp
+    {
+        typedef typename NppWarpFunc<DEPTH>::npp_t npp_t;
+
+        // Creates dst (dsize, type of src), zero-fills it and runs func over the whole destination.
+        // wholeSize/ofs describe the allocation behind src and the offset of src inside it.
+        // NOTE(review): interpolation indexes a 3-entry table and is not range-checked here;
+        // callers presumably pass only INTER_NEAREST/LINEAR/CUBIC - confirm at the call sites.
+        static void call(const cv::gpu::GpuMat& src, cv::Size wholeSize, cv::Point ofs, cv::gpu::GpuMat& dst,
+                         double coeffs[][3], cv::Size dsize, int interpolation, cudaStream_t stream)
+        {
+            static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC};
+
+            dst.create(dsize, src.type());
+            // NOTE(review): setTo() is not given the stream argument; its ordering relative to
+            // work issued on `stream` cannot be determined from this block - confirm.
+            dst.setTo(cv::Scalar::all(0));
+
+            // Size of the whole source allocation.
+            NppiSize srcsz;
+            srcsz.height = wholeSize.height;
+            srcsz.width = wholeSize.width;
+
+            // Source ROI expressed inside the whole allocation.
+            NppiRect srcroi;
+            srcroi.x = ofs.x;
+            srcroi.y = ofs.y;
+            srcroi.height = src.rows;
+            srcroi.width = src.cols;
+
+            // Destination ROI covers all of dst.
+            NppiRect dstroi;
+            dstroi.x = dstroi.y = 0;
+            dstroi.height = dst.rows;
+            dstroi.width = dst.cols;
+
+            cv::gpu::NppStreamHandler h(stream);
+
+            // The source pointer is the start of the whole allocation (datastart), matching srcsz/srcroi.
+            nppSafeCall( func((npp_t*)src.datastart, srcsz, static_cast<int>(src.step), srcroi,
+                              dst.ptr<npp_t>(), static_cast<int>(dst.step), dstroi, coeffs, npp_inter[interpolation]) );
+
+            // With the default stream the call is made synchronous.
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    };
+}
+
+// Applies the 2x3 affine transformation M to src and writes a dsize-sized result of the same type to dst.
+//
+//   flags       - interpolation (INTER_NEAREST, INTER_LINEAR or INTER_CUBIC), optionally OR-ed with WARP_INVERSE_MAP
+//   borderMode  - one of BORDER_REFLECT101, BORDER_REPLICATE, BORDER_CONSTANT, BORDER_REFLECT, BORDER_WRAP
+//   borderValue - border colour handed to the non-NPP kernels
+//   s           - stream the work is issued on
+//
+// Depending on depth / channel count / interpolation the work is dispatched either to NPP or to the
+// warpAffine_gpu kernels; combinations with no entry in the selected table fail a CV_Assert.
+void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags, int borderMode, Scalar borderValue, Stream& s)
+{
+    CV_Assert(M.rows == 2 && M.cols == 3);
+
+    int interpolation = flags & INTER_MAX;
+
+    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
+    CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
+    CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);
+
+    // src may be a ROI of a larger matrix: fetch the size of that matrix and the ROI offset inside it.
+    Size wholeSize;
+    Point ofs;
+    src.locateROI(wholeSize, ofs);
+
+    // Indexed as [depth][channels - 1][interpolation]; true means the NPP implementation is used
+    // for that combination (provided the border mode is BORDER_CONSTANT, see below).
+    static const bool useNppTab[6][4][3] =
+    {
+        {
+            {false, false, true},
+            {false, false, false},
+            {false, true, true},
+            {false, false, false}
+        },
+        {
+            {false, false, false},
+            {false, false, false},
+            {false, false, false},
+            {false, false, false}
+        },
+        {
+            {false, true, true},
+            {false, false, false},
+            {false, true, true},
+            {false, false, false}
+        },
+        {
+            {false, false, false},
+            {false, false, false},
+            {false, false, false},
+            {false, false, false}
+        },
+        {
+            {false, true, true},
+            {false, false, false},
+            {false, true, true},
+            {false, false, true}
+        },
+        {
+            {false, true, true},
+            {false, false, false},
+            {false, true, true},
+            {false, false, true}
+        }
+    };
+
+    // NPP is only considered for BORDER_CONSTANT.
+    // NOTE(review): borderValue is not passed to the NPP branch below, and the NPP helper `call`
+    // fills dst with zeros, so a non-zero constant border looks unsupported on that path - confirm.
+    bool useNpp = borderMode == BORDER_CONSTANT;
+    useNpp = useNpp && useNppTab[src.depth()][src.channels() - 1][interpolation];
+    #ifdef linux
+        // NPP bug on float data
+        useNpp = useNpp && src.depth() != CV_32F;
+    #endif
+
+    if (useNpp)
+    {
+        typedef void (*func_t)(const cv::gpu::GpuMat& src, cv::Size wholeSize, cv::Point ofs, cv::gpu::GpuMat& dst, double coeffs[][3], cv::Size dsize, int flags, cudaStream_t stream);
+
+        // Indexed as [WARP_INVERSE_MAP set][depth][channels - 1]; 0 marks an unsupported combination.
+        static const func_t funcs[2][6][4] =
+        {
+            {
+                {NppWarp<CV_8U, nppiWarpAffine_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpAffine_8u_C3R>::call, NppWarp<CV_8U, nppiWarpAffine_8u_C4R>::call},
+                {0, 0, 0, 0},
+                {NppWarp<CV_16U, nppiWarpAffine_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpAffine_16u_C3R>::call, NppWarp<CV_16U, nppiWarpAffine_16u_C4R>::call},
+                {0, 0, 0, 0},
+                {NppWarp<CV_32S, nppiWarpAffine_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpAffine_32s_C3R>::call, NppWarp<CV_32S, nppiWarpAffine_32s_C4R>::call},
+                {NppWarp<CV_32F, nppiWarpAffine_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpAffine_32f_C3R>::call, NppWarp<CV_32F, nppiWarpAffine_32f_C4R>::call}
+            },
+            {
+                {NppWarp<CV_8U, nppiWarpAffineBack_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpAffineBack_8u_C3R>::call, NppWarp<CV_8U, nppiWarpAffineBack_8u_C4R>::call},
+                {0, 0, 0, 0},
+                {NppWarp<CV_16U, nppiWarpAffineBack_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpAffineBack_16u_C3R>::call, NppWarp<CV_16U, nppiWarpAffineBack_16u_C4R>::call},
+                {0, 0, 0, 0},
+                {NppWarp<CV_32S, nppiWarpAffineBack_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpAffineBack_32s_C3R>::call, NppWarp<CV_32S, nppiWarpAffineBack_32s_C4R>::call},
+                {NppWarp<CV_32F, nppiWarpAffineBack_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpAffineBack_32f_C3R>::call, NppWarp<CV_32F, nppiWarpAffineBack_32f_C4R>::call}
+            }
+        };
+
+        // Convert M into the double[2][3] array that the NPP wrappers take; coeffsMat is only a
+        // header over the local array, so convertTo writes straight into coeffs.
+        double coeffs[2][3];
+        Mat coeffsMat(2, 3, CV_64F, (void*)coeffs);
+        M.convertTo(coeffsMat, coeffsMat.type());
+
+        const func_t func = funcs[(flags & WARP_INVERSE_MAP) != 0][src.depth()][src.channels() - 1];
+        CV_Assert(func != 0);
+
+        func(src, wholeSize, ofs, dst, coeffs, dsize, interpolation, StreamAccessor::getStream(s));
+    }
+    else
+    {
+        using namespace cv::gpu::device::imgproc;
+
+        typedef void (*func_t)(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[2 * 3], DevMem2Db dst, int interpolation,
+            int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+
+        // Indexed as [depth][channels - 1]; 0 marks a combination without a kernel instantiation.
+        static const func_t funcs[6][4] =
+        {
+            {warpAffine_gpu<uchar>      , 0 /*warpAffine_gpu<uchar2>*/ , warpAffine_gpu<uchar3>     , warpAffine_gpu<uchar4>     },
+            {0 /*warpAffine_gpu<schar>*/, 0 /*warpAffine_gpu<char2>*/  , 0 /*warpAffine_gpu<char3>*/, 0 /*warpAffine_gpu<char4>*/},
+            {warpAffine_gpu<ushort>     , 0 /*warpAffine_gpu<ushort2>*/, warpAffine_gpu<ushort3>    , warpAffine_gpu<ushort4>    },
+            {warpAffine_gpu<short>      , 0 /*warpAffine_gpu<short2>*/ , warpAffine_gpu<short3>     , warpAffine_gpu<short4>     },
+            {0 /*warpAffine_gpu<int>*/  , 0 /*warpAffine_gpu<int2>*/   , 0 /*warpAffine_gpu<int3>*/ , 0 /*warpAffine_gpu<int4>*/ },
+            {warpAffine_gpu<float>      , 0 /*warpAffine_gpu<float2>*/ , warpAffine_gpu<float3>     , warpAffine_gpu<float4>     }
+        };
+
+        const func_t func = funcs[src.depth()][src.channels() - 1];
+        CV_Assert(func != 0);
+
+        int gpuBorderType;
+        CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));
+
+        dst.create(dsize, src.type());
+
+        float coeffs[2 * 3];
+        Mat coeffsMat(2, 3, CV_32F, (void*)coeffs);
+
+        // With WARP_INVERSE_MAP the matrix is passed to the kernel as given; otherwise it is inverted first.
+        if (flags & WARP_INVERSE_MAP)
+            M.convertTo(coeffsMat, coeffsMat.type());
+        else
+        {
+            cv::Mat iM;
+            invertAffineTransform(M, iM);
+            iM.convertTo(coeffsMat, coeffsMat.type());
+        }
+
+        // The kernels take the border colour as an array of floats.
+        Scalar_<float> borderValueFloat;
+        borderValueFloat = borderValue;
+
+        // Device version encoded as major * 10 + minor, forwarded to the kernel launcher.
+        DeviceInfo info;
+        int cc = info.majorVersion() * 10 + info.minorVersion();
+
+        func(src, DevMem2Db(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, coeffs,
+            dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(s), cc);
+    }
+}
+
+// Applies the 3x3 perspective transformation M to src and writes a dsize-sized result of the same type to dst.
+//
+//   flags       - interpolation (INTER_NEAREST, INTER_LINEAR or INTER_CUBIC), optionally OR-ed with WARP_INVERSE_MAP
+//   borderMode  - one of BORDER_REFLECT101, BORDER_REPLICATE, BORDER_CONSTANT, BORDER_REFLECT, BORDER_WRAP
+//   borderValue - border colour handed to the non-NPP kernels
+//   s           - stream the work is issued on
+//
+// Depending on depth / channel count / interpolation the work is dispatched either to NPP or to the
+// warpPerspective_gpu kernels; combinations with no entry in the selected table fail a CV_Assert.
+void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags, int borderMode, Scalar borderValue, Stream& s)
+{
+    CV_Assert(M.rows == 3 && M.cols == 3);
+
+    int interpolation = flags & INTER_MAX;
+
+    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
+    CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
+    CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);
+
+    // src may be a ROI of a larger matrix: fetch the size of that matrix and the ROI offset inside it.
+    Size wholeSize;
+    Point ofs;
+    src.locateROI(wholeSize, ofs);
+
+    // Indexed as [depth][channels - 1][interpolation]; true means the NPP implementation is used
+    // for that combination (provided the border mode is BORDER_CONSTANT, see below).
+    static const bool useNppTab[6][4][3] =
+    {
+        {
+            {false, false, true},
+            {false, false, false},
+            {false, true, true},
+            {false, false, false}
+        },
+        {
+            {false, false, false},
+            {false, false, false},
+            {false, false, false},
+            {false, false, false}
+        },
+        {
+            {false, true, true},
+            {false, false, false},
+            {false, true, true},
+            {false, false, false}
+        },
+        {
+            {false, false, false},
+            {false, false, false},
+            {false, false, false},
+            {false, false, false}
+        },
+        {
+            {false, true, true},
+            {false, false, false},
+            {false, true, true},
+            {false, false, true}
+        },
+        {
+            {false, true, true},
+            {false, false, false},
+            {false, true, true},
+            {false, false, true}
+        }
+    };
+
+    // NPP is only considered for BORDER_CONSTANT.
+    // NOTE(review): borderValue is not passed to the NPP branch below, and the NPP helper `call`
+    // fills dst with zeros, so a non-zero constant border looks unsupported on that path - confirm.
+    bool useNpp = borderMode == BORDER_CONSTANT;
+    useNpp = useNpp && useNppTab[src.depth()][src.channels() - 1][interpolation];
+    #ifdef linux
+        // NPP bug on float data
+        useNpp = useNpp && src.depth() != CV_32F;
+    #endif
+
+    if (useNpp)
+    {
+        typedef void (*func_t)(const cv::gpu::GpuMat& src, cv::Size wholeSize, cv::Point ofs, cv::gpu::GpuMat& dst, double coeffs[][3], cv::Size dsize, int flags, cudaStream_t stream);
+
+        // Indexed as [WARP_INVERSE_MAP set][depth][channels - 1]; 0 marks an unsupported combination.
+        static const func_t funcs[2][6][4] =
+        {
+            {
+                {NppWarp<CV_8U, nppiWarpPerspective_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspective_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspective_8u_C4R>::call},
+                {0, 0, 0, 0},
+                {NppWarp<CV_16U, nppiWarpPerspective_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpPerspective_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspective_16u_C4R>::call},
+                {0, 0, 0, 0},
+                {NppWarp<CV_32S, nppiWarpPerspective_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspective_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspective_32s_C4R>::call},
+                {NppWarp<CV_32F, nppiWarpPerspective_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspective_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspective_32f_C4R>::call}
+            },
+            {
+                {NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C4R>::call},
+                {0, 0, 0, 0},
+                {NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C4R>::call},
+                {0, 0, 0, 0},
+                {NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C4R>::call},
+                {NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C4R>::call}
+            }
+        };
+
+        // Convert M into the double[3][3] array that the NPP wrappers take; coeffsMat is only a
+        // header over the local array, so convertTo writes straight into coeffs.
+        double coeffs[3][3];
+        Mat coeffsMat(3, 3, CV_64F, (void*)coeffs);
+        M.convertTo(coeffsMat, coeffsMat.type());
+
+        const func_t func = funcs[(flags & WARP_INVERSE_MAP) != 0][src.depth()][src.channels() - 1];
+        CV_Assert(func != 0);
+
+        func(src, wholeSize, ofs, dst, coeffs, dsize, interpolation, StreamAccessor::getStream(s));
+    }
+    else
+    {
+        using namespace cv::gpu::device::imgproc;
+
+        // The kernels receive all nine coefficients of the 3x3 matrix (the array bound is
+        // documentation only: as a parameter it is a plain float*).
+        typedef void (*func_t)(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float coeffs[3 * 3], DevMem2Db dst, int interpolation,
+            int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+
+        // Indexed as [depth][channels - 1]; 0 marks a combination without a kernel instantiation.
+        static const func_t funcs[6][4] =
+        {
+            {warpPerspective_gpu<uchar>      , 0 /*warpPerspective_gpu<uchar2>*/ , warpPerspective_gpu<uchar3>     , warpPerspective_gpu<uchar4>     },
+            {0 /*warpPerspective_gpu<schar>*/, 0 /*warpPerspective_gpu<char2>*/  , 0 /*warpPerspective_gpu<char3>*/, 0 /*warpPerspective_gpu<char4>*/},
+            {warpPerspective_gpu<ushort>     , 0 /*warpPerspective_gpu<ushort2>*/, warpPerspective_gpu<ushort3>    , warpPerspective_gpu<ushort4>    },
+            {warpPerspective_gpu<short>      , 0 /*warpPerspective_gpu<short2>*/ , warpPerspective_gpu<short3>     , warpPerspective_gpu<short4>     },
+            {0 /*warpPerspective_gpu<int>*/  , 0 /*warpPerspective_gpu<int2>*/   , 0 /*warpPerspective_gpu<int3>*/ , 0 /*warpPerspective_gpu<int4>*/ },
+            {warpPerspective_gpu<float>      , 0 /*warpPerspective_gpu<float2>*/ , warpPerspective_gpu<float3>     , warpPerspective_gpu<float4>     }
+        };
+
+        const func_t func = funcs[src.depth()][src.channels() - 1];
+        CV_Assert(func != 0);
+
+        int gpuBorderType;
+        CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));
+
+        dst.create(dsize, src.type());
+
+        float coeffs[3 * 3];
+        Mat coeffsMat(3, 3, CV_32F, (void*)coeffs);
+
+        // With WARP_INVERSE_MAP the matrix is passed to the kernel as given; otherwise it is inverted first.
+        if (flags & WARP_INVERSE_MAP)
+            M.convertTo(coeffsMat, coeffsMat.type());
+        else
+        {
+            cv::Mat iM;
+            invert(M, iM);
+            iM.convertTo(coeffsMat, coeffsMat.type());
+        }
+
+        // The kernels take the border colour as an array of floats.
+        Scalar_<float> borderValueFloat;
+        borderValueFloat = borderValue;
+
+        // Device version encoded as major * 10 + minor, forwarded to the kernel launcher.
+        DeviceInfo info;
+        int cc = info.majorVersion() * 10 + info.minorVersion();
+
+        func(src, DevMem2Db(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, coeffs,
+            dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(s), cc);
+    }
+}
+
+#endif // HAVE_CUDA
--- a/modules/gpu/test/interpolation.hpp
+++ b/modules/gpu/test/interpolation.hpp
@ -0,0 +1,120 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_TEST_INTERPOLATION_HPP__
+#define __OPENCV_TEST_INTERPOLATION_HPP__
+
+// Returns channel `c` of the pixel at (y, x) in `src`, resolving coordinates outside the image
+// according to `border_type`:
+//   - cv::BORDER_CONSTANT: outside pixels yield borderVal[c], saturated to T;
+//   - any other mode: both coordinates are remapped with cv::borderInterpolate before the read.
+template <typename T> T readVal(const cv::Mat& src, int y, int x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
+{
+    if (border_type != cv::BORDER_CONSTANT)
+    {
+        const int yy = cv::borderInterpolate(y, src.rows, border_type);
+        const int xx = cv::borderInterpolate(x, src.cols, border_type);
+
+        return src.at<T>(yy, xx * src.channels() + c);
+    }
+
+    const bool inside = y >= 0 && y < src.rows && x >= 0 && x < src.cols;
+    if (!inside)
+        return cv::saturate_cast<T>(borderVal.val[c]);
+
+    return src.at<T>(y, x * src.channels() + c);
+}
+
+// Reference nearest-neighbour sampler: floors both coordinates and reads that pixel through readVal.
+template <typename T> struct NearestInterpolator
+{
+    // Returns channel `c` of `src` at the floored position (y, x), with borders handled by readVal.
+    static T getValue(const cv::Mat& src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
+    {
+        const int row = cvFloor(y);
+        const int col = cvFloor(x);
+
+        return readVal<T>(src, row, col, c, border_type, borderVal);
+    }
+};
+
+// Reference bilinear sampler: blends the four pixels surrounding the (half-pixel shifted) position.
+template <typename T> struct LinearInterpolator
+{
+    // Returns channel `c` of `src` bilinearly interpolated at (y, x); neighbours are fetched
+    // through readVal, so `border_type` / `borderVal` decide what out-of-range pixels contribute.
+    static T getValue(const cv::Mat& src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
+    {
+        // Both coordinates are shifted by half a pixel before the neighbours are picked.
+        const float fx = x - 0.5f;
+        const float fy = y - 0.5f;
+
+        const int left = cvFloor(fx);
+        const int top = cvFloor(fy);
+        const int right = left + 1;
+        const int bottom = top + 1;
+
+        // Each weight is the distance to the opposite neighbour.
+        const float wLeft = right - fx;
+        const float wRight = fx - left;
+        const float wTop = bottom - fy;
+        const float wBottom = fy - top;
+
+        float acc = 0;
+
+        acc += readVal<T>(src, top, left, c, border_type, borderVal) * (wLeft * wTop);
+        acc += readVal<T>(src, top, right, c, border_type, borderVal) * (wRight * wTop);
+        acc += readVal<T>(src, bottom, left, c, border_type, borderVal) * (wLeft * wBottom);
+        acc += readVal<T>(src, bottom, right, c, border_type, borderVal) * (wRight * wBottom);
+
+        return cv::saturate_cast<T>(acc);
+    }
+};
+
+// Reference bicubic sampler working on the 4x4 neighbourhood around the rounded position.
+template <typename T> struct CubicInterpolator
+{
+    // Evaluates the 1-D cubic through the four samples p[0..3] at parameter x.
+    static float getValue(float p[4], float x)
+    {
+        // Nested (Horner-style) terms, innermost first.
+        const double inner = 3.0*(p[1] - p[2]) + p[3] - p[0];
+        const double middle = 2.0*p[0] - 5.0*p[1] + 4.0*p[2] - p[3] + x*inner;
+        const double outer = p[2] - p[0] + x*middle;
+
+        return p[1] + 0.5 * x * outer;
+    }
+
+    // 2-D version: interpolates every row of p along x, then the four row results along y.
+    static float getValue(float p[4][4], float x, float y)
+    {
+        float rows[4];
+
+        for (int i = 0; i < 4; ++i)
+            rows[i] = getValue(p[i], x);
+
+        return getValue(rows, y);
+    }
+
+    // Returns channel `c` of `src` interpolated at (y, x). The 4x4 window covers rows iy-2..iy+1 and
+    // columns ix-2..ix+1 around the rounded coordinates; pixels are fetched through readVal so
+    // `border_type` / `borderVal` apply to out-of-range neighbours.
+    static T getValue(const cv::Mat& src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
+    {
+        const int ix = cvRound(x);
+        const int iy = cvRound(y);
+
+        float window[4][4];
+
+        for (int r = 0; r < 4; ++r)
+        {
+            for (int k = 0; k < 4; ++k)
+                window[r][k] = readVal<T>(src, iy - 2 + r, ix - 2 + k, c, border_type, borderVal);
+        }
+
+        // Offsets from the window origin, scaled by 1/4.
+        const float tx = (x - ix + 2.0) / 4.0;
+        const float ty = (y - iy + 2.0) / 4.0;
+
+        return cv::saturate_cast<T>(getValue(window, tx, ty));
+    }
+};
+
+#endif // __OPENCV_TEST_INTERPOLATION_HPP__
--- a/modules/gpu/test/test_main.cpp
+++ b/modules/gpu/test/test_main.cpp
@ -39,7 +39,7 @@
 //
 //M*/

-#include "test_precomp.hpp"
+#include "precomp.hpp"

 #ifdef HAVE_CUDA

--- a/modules/gpu/test/test_precomp.cpp
+++ b/modules/gpu/test/test_precomp.cpp
@ -39,4 +39,4 @@
 //
 //M*/

-#include "test_precomp.hpp"
+#include "precomp.hpp"
--- a/modules/gpu/test/test_precomp.hpp
+++ b/modules/gpu/test/test_precomp.hpp
@ -47,10 +47,11 @@
 #include <iostream>
 #include <fstream>
 #include <sstream>
-#include <limits>
 #include <string>
+#include <limits>
 #include <algorithm>
 #include <iterator>
+
 #include "cvconfig.h"
 #include "opencv2/core/core.hpp"
 #include "opencv2/highgui/highgui.hpp"
@ -60,6 +61,8 @@
 #include "opencv2/ts/ts.hpp"
 #include "opencv2/ts/ts_perf.hpp"
 #include "opencv2/gpu/gpu.hpp"
-#include "test_gpu_base.hpp"
+
+#include "utility.hpp"
+#include "interpolation.hpp"

 #endif
--- a/modules/gpu/test/test_arithm.cpp
+++ b/modules/gpu/test/test_arithm.cpp
@ -39,7 +39,7 @@
 //
 //M*/

-#include "test_precomp.hpp"
+#include "precomp.hpp"

 #ifdef HAVE_CUDA

@ -117,7 +117,7 @@ INSTANTIATE_TEST_CASE_P(Arithm, Add, Combine(
                        ALL_DEVICES,
                        Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_16SC1, CV_16SC2, CV_16SC3, CV_16SC4, 
                               CV_32SC1, CV_32SC2, CV_32SC3, CV_32FC1, CV_32FC2, CV_32FC3, CV_32FC4),
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
 // subtract
@ -160,7 +160,7 @@ INSTANTIATE_TEST_CASE_P(Arithm, Subtract, Combine(
                        ALL_DEVICES,
                        Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_16SC1, CV_16SC2, CV_16SC3, CV_16SC4, 
                               CV_32SC1, CV_32SC2, CV_32SC3, CV_32FC1, CV_32FC2, CV_32FC3, CV_32FC4),
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
 // multiply
@ -203,7 +203,7 @@ INSTANTIATE_TEST_CASE_P(Arithm, Multiply, Combine(
                        ALL_DEVICES,
                        Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_16SC1, CV_16SC3, CV_16SC4, 
                               CV_32SC1, CV_32SC3, CV_32FC1, CV_32FC3, CV_32FC4),
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
 // divide
@ -246,7 +246,7 @@ INSTANTIATE_TEST_CASE_P(Arithm, Divide, Combine(
                        ALL_DEVICES,
                        Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_16SC1, CV_16SC3, CV_16SC4, 
                               CV_32SC1, CV_32SC3, CV_32FC1, CV_32FC3, CV_32FC4),
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
 // transpose
@ -272,7 +272,7 @@ TEST_P(Transpose, Accuracy)
 INSTANTIATE_TEST_CASE_P(Arithm, Transpose, Combine(
                        ALL_DEVICES,
                        Values(CV_8UC1, CV_8UC4, CV_8SC1, CV_8SC4, CV_16UC2, CV_16SC2, CV_32SC1, CV_32SC2, CV_32FC1, CV_32FC2, CV_64FC1),
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
 // absdiff
@ -314,7 +314,7 @@ TEST_P(Absdiff, Scalar)
 INSTANTIATE_TEST_CASE_P(Arithm, Absdiff, Combine(
                        ALL_DEVICES,
                        Values(CV_8UC1, CV_16UC1, CV_32SC1, CV_32FC1),
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
 // abs
@ -339,7 +339,7 @@ TEST_P(Abs, Array)
 INSTANTIATE_TEST_CASE_P(Arithm, Abs, Combine(
                        ALL_DEVICES,
                        Values(CV_16SC1, CV_32FC1),
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
 // Sqr
@ -365,7 +365,7 @@ TEST_P(Sqr, Array)
 INSTANTIATE_TEST_CASE_P(Arithm, Sqr, Combine(
                        ALL_DEVICES,
                        Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1),
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
 // Sqrt
@ -391,7 +391,7 @@ TEST_P(Sqrt, Array)
 INSTANTIATE_TEST_CASE_P(Arithm, Sqrt, Combine(
                        ALL_DEVICES,
                        Values(MatType(CV_32FC1)),
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
 // compare
@ -445,7 +445,7 @@ INSTANTIATE_TEST_CASE_P(Arithm, Compare, Combine(
                        ALL_DEVICES,
                        Values(CV_8UC1, CV_16UC1, CV_32SC1),
                        Values((int) cv::CMP_EQ, (int) cv::CMP_GT, (int) cv::CMP_GE, (int) cv::CMP_LT, (int) cv::CMP_LE, (int) cv::CMP_NE),
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
 // meanStdDev
@ -498,7 +498,7 @@ TEST_P(MeanStdDev, Accuracy)

 INSTANTIATE_TEST_CASE_P(Arithm, MeanStdDev, Combine(
                        ALL_DEVICES,
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
 // normDiff
@ -543,7 +543,7 @@ TEST_P(NormDiff, Accuracy)
 INSTANTIATE_TEST_CASE_P(Arithm, NormDiff, Combine(
                        ALL_DEVICES,
                        Values((int) cv::NORM_INF, (int) cv::NORM_L1, (int) cv::NORM_L2),
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
 // flip
@ -596,7 +596,7 @@ INSTANTIATE_TEST_CASE_P(Arithm, Flip, Combine(
                        ALL_DEVICES,
                        Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4),
                        Values((int)FLIP_BOTH, (int)FLIP_X, (int)FLIP_Y),
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
 // LUT
@ -648,7 +648,7 @@ TEST_P(LUT, Accuracy)
 INSTANTIATE_TEST_CASE_P(Arithm, LUT, Combine(
                        ALL_DEVICES,
                        Values(CV_8UC1, CV_8UC3),
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
 // exp
@ -695,7 +695,7 @@ TEST_P(Exp, Accuracy)

 INSTANTIATE_TEST_CASE_P(Arithm, Exp, Combine(
                        ALL_DEVICES,
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
 // pow
@ -754,7 +754,7 @@ TEST_P(Pow, Accuracy)
 INSTANTIATE_TEST_CASE_P(Arithm, Pow, Combine(
                        ALL_DEVICES,
                        Values(CV_32F, CV_32FC3),
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
 // log
@ -801,7 +801,7 @@ TEST_P(Log, Accuracy)

 INSTANTIATE_TEST_CASE_P(Arithm, Log, Combine(
                        ALL_DEVICES,
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
 // magnitude
@ -849,7 +849,7 @@ TEST_P(Magnitude, Accuracy)

 INSTANTIATE_TEST_CASE_P(Arithm, Magnitude, Combine(
                        ALL_DEVICES,
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
 // phase
@ -897,7 +897,7 @@ TEST_P(Phase, Accuracy)

 INSTANTIATE_TEST_CASE_P(Arithm, Phase, Combine(
                        ALL_DEVICES,
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
 // cartToPolar
@ -949,7 +949,7 @@ TEST_P(CartToPolar, Accuracy)

 INSTANTIATE_TEST_CASE_P(Arithm, CartToPolar, Combine(
                        ALL_DEVICES,
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
 // polarToCart
@ -1002,7 +1002,7 @@ TEST_P(PolarToCart, Accuracy)

 INSTANTIATE_TEST_CASE_P(Arithm, PolarToCart, Combine(
                        ALL_DEVICES,
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
 // minMax
@ -1078,7 +1078,7 @@ TEST_P(MinMax, Accuracy)
 INSTANTIATE_TEST_CASE_P(Arithm, MinMax, Combine(
                        ALL_DEVICES,
                        Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F),
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
 // minMaxLoc
@ -1167,7 +1167,7 @@ TEST_P(MinMaxLoc, Accuracy)
 INSTANTIATE_TEST_CASE_P(Arithm, MinMaxLoc, Combine(
                        ALL_DEVICES,
                        Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F),
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////
 // countNonZero
@ -1215,7 +1215,7 @@ TEST_P(CountNonZero, Accuracy)
 INSTANTIATE_TEST_CASE_P(Arithm, CountNonZero, Combine(
                        ALL_DEVICES,
                        Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F),
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 //////////////////////////////////////////////////////////////////////////////
 // sum
@ -1295,7 +1295,7 @@ TEST_P(Sum, Sqr)
 INSTANTIATE_TEST_CASE_P(Arithm, Sum, Combine(
                        ALL_DEVICES,
                        Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F),
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 //////////////////////////////////////////////////////////////////////////////
 // bitwise
@ -1560,7 +1560,7 @@ INSTANTIATE_TEST_CASE_P(Arithm, AddWeighted, Combine(
                        TYPES(CV_8U, CV_64F, 1, 1),
                        TYPES(CV_8U, CV_64F, 1, 1),
                        TYPES(CV_8U, CV_64F, 1, 1),
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 //////////////////////////////////////////////////////////////////////////////
 // reduce
@ -1624,7 +1624,7 @@ INSTANTIATE_TEST_CASE_P(Arithm, Reduce, Combine(
                        Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_32FC1, CV_32FC3, CV_32FC4),
                        Values(0, 1),
                        Values((int)CV_REDUCE_SUM, (int)CV_REDUCE_AVG, (int)CV_REDUCE_MAX, (int)CV_REDUCE_MIN),
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 //////////////////////////////////////////////////////////////////////////////
 // gemm
@ -1685,6 +1685,6 @@ INSTANTIATE_TEST_CASE_P(Arithm, GEMM, Combine(
                        ALL_DEVICES,
                        Values(CV_32FC1, CV_32FC2),
                        Values(0, (int) cv::GEMM_1_T, (int) cv::GEMM_2_T, (int) cv::GEMM_3_T),
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 #endif // HAVE_CUDA
--- a/modules/gpu/test/test_calib3d.cpp
+++ b/modules/gpu/test/test_calib3d.cpp
@ -39,7 +39,7 @@
 //
 //M*/

-#include "test_precomp.hpp"
+#include "precomp.hpp"

 #ifdef HAVE_CUDA

--- a/modules/gpu/test/test_copy_make_border.cpp
+++ b/modules/gpu/test/test_copy_make_border.cpp
@ -0,0 +1,90 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+#ifdef HAVE_CUDA
+
+// Fixture for cv::gpu::copyMakeBorder: one instance per combination of
+// (device, image size, matrix type, border width, border mode, ROI flag).
+PARAM_TEST_CASE(CopyMakeBorder, cv::gpu::DeviceInfo, cv::Size, MatType, int, Border, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo;  // device the test runs on
+    cv::Size size;                // size of the source image
+    int type;                     // matrix type of source and destination
+    int border;                   // border width, used for all four sides by the test body
+    int borderType;               // border extrapolation mode
+    bool useRoi;                  // forwarded to createMat/loadMat
+
+    virtual void SetUp()
+    {
+        // Pick the device first, then unpack the remaining test parameters.
+        devInfo = GET_PARAM(0);
+        cv::gpu::setDevice(devInfo.deviceID());
+
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        border = GET_PARAM(3);
+        borderType = GET_PARAM(4);
+        useRoi = GET_PARAM(5);
+    }
+};
+
+// Compares cv::gpu::copyMakeBorder against cv::copyMakeBorder; the results must match exactly.
+TEST_P(CopyMakeBorder, Accuracy)
+{
+    cv::Mat src = randomMat(size, type);
+    cv::Scalar borderValue = randomScalar(0, 255);
+
+    // CPU reference.
+    cv::Mat dst_gold;
+    cv::copyMakeBorder(src, dst_gold, border, border, border, border, borderType, borderValue);
+
+    // The destination is `border` pixels larger than the source on every side.
+    const cv::Size dstSize(size.width + 2 * border, size.height + 2 * border);
+
+    cv::gpu::GpuMat dst = createMat(dstSize, type, useRoi);
+    cv::gpu::copyMakeBorder(loadMat(src, useRoi), dst, border, border, border, border, borderType, borderValue);
+
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CopyMakeBorder, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    // matrix types
+    testing::Values(
+        MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
+        MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
+        MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
+    // border widths
+    testing::Values(1, 10, 50),
+    // border modes
+    testing::Values(
+        Border(cv::BORDER_REFLECT101),
+        Border(cv::BORDER_REPLICATE),
+        Border(cv::BORDER_CONSTANT),
+        Border(cv::BORDER_REFLECT),
+        Border(cv::BORDER_WRAP)),
+    WHOLE_SUBMAT));
+
+#endif // HAVE_CUDA
--- a/modules/gpu/test/test_features2d.cpp
+++ b/modules/gpu/test/test_features2d.cpp
@ -39,7 +39,7 @@
 //
 //M*/

-#include "test_precomp.hpp"
+#include "precomp.hpp"

 #ifdef HAVE_CUDA

--- a/modules/gpu/test/test_filters.cpp
+++ b/modules/gpu/test/test_filters.cpp
@ -39,7 +39,7 @@
 //
 //M*/

-#include "test_precomp.hpp"
+#include "precomp.hpp"

 #ifdef HAVE_CUDA

@ -132,7 +132,7 @@ TEST_P(Blur, Gray)
 INSTANTIATE_TEST_CASE_P(Filter, Blur, Combine(
                        ALL_DEVICES, 
                        Values(cv::Size(3, 3), cv::Size(5, 5), cv::Size(7, 7)),
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 /////////////////////////////////////////////////////////////////////////////////////////////////
 // sobel
@ -212,7 +212,7 @@ INSTANTIATE_TEST_CASE_P(Filter, Sobel, Combine(
                        Values(3, 5, 7), 
                        Values(0, 1, 2),
                        Values(0, 1, 2),
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 /////////////////////////////////////////////////////////////////////////////////////////////////
 // scharr
@ -289,7 +289,7 @@ INSTANTIATE_TEST_CASE_P(Filter, Scharr, Combine(
                        ALL_DEVICES, 
                        Values(0, 1),
                        Values(0, 1),
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 /////////////////////////////////////////////////////////////////////////////////////////////////
 // gaussianBlur
@ -361,7 +361,7 @@ TEST_P(GaussianBlur, Gray)
 INSTANTIATE_TEST_CASE_P(Filter, GaussianBlur, Combine(
                        ALL_DEVICES, 
                        Values(cv::Size(3, 3), cv::Size(5, 5), cv::Size(7, 7), cv::Size(9, 9), cv::Size(11, 11), cv::Size(13, 13), cv::Size(15, 15), cv::Size(17, 17), cv::Size(19, 19), cv::Size(21, 21), cv::Size(23, 23), cv::Size(25, 25), cv::Size(27, 27), cv::Size(29, 29), cv::Size(31, 31)),
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 /////////////////////////////////////////////////////////////////////////////////////////////////
 // laplacian
@ -426,7 +426,7 @@ TEST_P(Laplacian, Gray)
 INSTANTIATE_TEST_CASE_P(Filter, Laplacian, Combine(
                        ALL_DEVICES,
                        Values(1, 3),
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 /////////////////////////////////////////////////////////////////////////////////////////////////
 // erode
@ -492,7 +492,7 @@ TEST_P(Erode, Gray)

 INSTANTIATE_TEST_CASE_P(Filter, Erode, Combine(
                        ALL_DEVICES,
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 /////////////////////////////////////////////////////////////////////////////////////////////////
 // dilate
@ -558,7 +558,7 @@ TEST_P(Dilate, Gray)

 INSTANTIATE_TEST_CASE_P(Filter, Dilate, Combine(
                        ALL_DEVICES,
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 /////////////////////////////////////////////////////////////////////////////////////////////////
 // morphEx
@ -627,7 +627,7 @@ TEST_P(MorphEx, Gray)
 INSTANTIATE_TEST_CASE_P(Filter, MorphEx, Combine(
                        ALL_DEVICES,
                        Values((int)cv::MORPH_OPEN, (int)cv::MORPH_CLOSE, (int)cv::MORPH_GRADIENT, (int)cv::MORPH_TOPHAT, (int)cv::MORPH_BLACKHAT),
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 /////////////////////////////////////////////////////////////////////////////////////////////////
 // filter2D
@ -717,6 +717,6 @@ TEST_P(Filter2D, 32FC1)
 INSTANTIATE_TEST_CASE_P(Filter, Filter2D, Combine(
                        ALL_DEVICES,
                        Values(3, 5, 7, 11, 13, 15),
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 #endif // HAVE_CUDA
--- a/modules/gpu/test/test_hog.cpp
+++ b/modules/gpu/test/test_hog.cpp
@ -39,7 +39,7 @@
 //
 //M*/

-#include "test_precomp.hpp"
+#include "precomp.hpp"

 #ifdef HAVE_CUDA

--- a/modules/gpu/test/test_imgproc.cpp
+++ b/modules/gpu/test/test_imgproc.cpp
--- a/modules/gpu/test/test_matop.cpp
+++ b/modules/gpu/test/test_matop.cpp
@ -40,7 +40,7 @@
 //
 //M*/

-#include "test_precomp.hpp"
+#include "precomp.hpp"

 #ifdef HAVE_CUDA

@ -106,7 +106,7 @@ TEST_P(Merge, Accuracy)
 INSTANTIATE_TEST_CASE_P(MatOp, Merge, Combine(
                        ALL_DEVICES, 
                        ALL_TYPES,
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
 // split
@ -167,7 +167,7 @@ TEST_P(Split, Accuracy)
 INSTANTIATE_TEST_CASE_P(MatOp, Split, Combine(
                        ALL_DEVICES, 
                        ALL_TYPES,
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
 // split_merge_consistency
@ -328,7 +328,7 @@ TEST_P(SetTo, Masked)
 INSTANTIATE_TEST_CASE_P(MatOp, SetTo, Combine(
                        ALL_DEVICES, 
                        ALL_TYPES,
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
 // copyTo
@ -407,7 +407,7 @@ TEST_P(CopyTo, Masked)
 INSTANTIATE_TEST_CASE_P(MatOp, CopyTo, Combine(
                        ALL_DEVICES, 
                        ALL_TYPES,
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
 // convertTo
@ -491,7 +491,7 @@ INSTANTIATE_TEST_CASE_P(MatOp, ConvertTo, Combine(
                        ALL_DEVICES, 
                        TYPES(CV_8U, CV_64F, 1, 1),
                        TYPES(CV_8U, CV_64F, 1, 1),
-                        USE_ROI));
+                        WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
 // async
--- a/modules/gpu/test/test_nvidia.cpp
+++ b/modules/gpu/test/test_nvidia.cpp
@ -39,7 +39,7 @@
 //
 //M*/

-#include "test_precomp.hpp"
+#include "precomp.hpp"

 #ifdef HAVE_CUDA

--- a/modules/gpu/test/test_remap.cpp
+++ b/modules/gpu/test/test_remap.cpp
@ -0,0 +1,177 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+#ifdef HAVE_CUDA
+
+///////////////////////////////////////////////////////////////////
+// Gold implementation
+
+namespace
+{
+    // Reference remap for element type T.
+    // For every destination pixel (x, y) and channel c the source is sampled at row ymap(y, x),
+    // column xmap(y, x) through Interpolator<T>::getValue. dst is (re)allocated with the size of
+    // xmap and the type of src. xmap and ymap are read as single-channel float matrices.
+    template <typename T, template <typename> class Interpolator> void remapImpl(const cv::Mat& src, const cv::Mat& xmap, const cv::Mat& ymap, cv::Mat& dst, int borderType, cv::Scalar borderVal)
+    {
+        const int cn = src.channels();
+
+        cv::Size dsize = xmap.size();
+
+        dst.create(dsize, src.type());
+
+        for (int y = 0; y < dsize.height; ++y)
+        {
+            for (int x = 0; x < dsize.width; ++x)
+            {
+                for (int c = 0; c < cn; ++c)
+                    dst.at<T>(y, x * cn + c) = Interpolator<T>::getValue(src, ymap.at<float>(y, x), xmap.at<float>(y, x), c, borderType, borderVal);
+            }
+        }
+    }
+
+    // Dispatches to remapImpl by interpolation mode and source depth.
+    // The tables are indexed directly by `interpolation` (nearest, linear, cubic) and by
+    // src.depth() (unsigned char, signed char, unsigned short, short, int, float).
+    void remapGold(const cv::Mat& src, const cv::Mat& xmap, const cv::Mat& ymap, cv::Mat& dst, int interpolation, int borderType, cv::Scalar borderVal)
+    {
+        typedef void (*func_t)(const cv::Mat& src, const cv::Mat& xmap, const cv::Mat& ymap, cv::Mat& dst, int borderType, cv::Scalar borderVal);
+
+        static const func_t nearest_funcs[] =
+        {
+            remapImpl<unsigned char, NearestInterpolator>,
+            remapImpl<signed char, NearestInterpolator>,
+            remapImpl<unsigned short, NearestInterpolator>,
+            remapImpl<short, NearestInterpolator>,
+            remapImpl<int, NearestInterpolator>,
+            remapImpl<float, NearestInterpolator>
+        };
+
+        static const func_t linear_funcs[] =
+        {
+            remapImpl<unsigned char, LinearInterpolator>,
+            remapImpl<signed char, LinearInterpolator>,
+            remapImpl<unsigned short, LinearInterpolator>,
+            remapImpl<short, LinearInterpolator>,
+            remapImpl<int, LinearInterpolator>,
+            remapImpl<float, LinearInterpolator>
+        };
+
+        static const func_t cubic_funcs[] =
+        {
+            remapImpl<unsigned char, CubicInterpolator>,
+            remapImpl<signed char, CubicInterpolator>,
+            remapImpl<unsigned short, CubicInterpolator>,
+            remapImpl<short, CubicInterpolator>,
+            remapImpl<int, CubicInterpolator>,
+            remapImpl<float, CubicInterpolator>
+        };
+
+        static const func_t* funcs[] = {nearest_funcs, linear_funcs, cubic_funcs};
+
+        const int num_interpolations = static_cast<int>(sizeof(funcs) / sizeof(funcs[0]));
+        const int num_depths = static_cast<int>(sizeof(nearest_funcs) / sizeof(nearest_funcs[0]));
+
+        // Reject unsupported modes/depths (e.g. a 64-bit float source) instead of reading
+        // past the end of the dispatch tables.
+        CV_Assert(interpolation >= 0 && interpolation < num_interpolations);
+        CV_Assert(src.depth() >= 0 && src.depth() < num_depths);
+
+        // remapImpl reads both maps as float over xmap.size().
+        CV_Assert(xmap.type() == CV_32FC1 && ymap.type() == CV_32FC1 && xmap.size() == ymap.size());
+
+        funcs[interpolation][src.depth()](src, xmap, ymap, dst, borderType, borderVal);
+    }
+}
+
+///////////////////////////////////////////////////////////////////
+// Test
+
+// Fixture for cv::gpu::remap: one instance per combination of
+// (device, image size, matrix type, interpolation mode, border mode, ROI flag).
+// SetUp() also builds the coordinate maps used by the test body.
+PARAM_TEST_CASE(Remap, cv::gpu::DeviceInfo, cv::Size, MatType, Interpolation, Border, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int type;
+    int interpolation;
+    int borderType;
+    bool useRoi;
+
+    cv::Mat xmap;  // source x coordinate for every destination pixel (CV_32FC1)
+    cv::Mat ymap;  // source y coordinate for every destination pixel (CV_32FC1)
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        interpolation = GET_PARAM(3);
+        borderType = GET_PARAM(4);
+        useRoi = GET_PARAM(5);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+
+        // Rotation by pi/4 combined with a horizontal shift of half the image width.
+        // M has to be an ordinary local: a function-local static is initialised only on the
+        // first SetUp() call and would keep that first size.width for every later size.
+        const double alpha = CV_PI / 4;
+        const double M[2][3] = { {std::cos(alpha), -std::sin(alpha), size.width / 2.0},
+                                 {std::sin(alpha),  std::cos(alpha), 0.0}};
+
+        xmap.create(size, CV_32FC1);
+        ymap.create(size, CV_32FC1);
+
+        for (int y = 0; y < size.height; ++y)
+        {
+            for (int x = 0; x < size.width; ++x)
+            {
+                xmap.at<float>(y, x) = static_cast<float>(M[0][0] * x + M[0][1] * y + M[0][2]);
+                ymap.at<float>(y, x) = static_cast<float>(M[1][0] * x + M[1][1] * y + M[1][2]);
+            }
+        }
+    }
+};
+
+// Compares cv::gpu::remap against remapGold using the maps built in SetUp().
+// Tolerance: 1e-4 for CV_32F sources, 1.0 otherwise.
+TEST_P(Remap, Accuracy)
+{
+    cv::Mat src = randomMat(size, type);
+    cv::Scalar borderValue = randomScalar(0.0, 255.0);
+
+    // Reference result.
+    cv::Mat dst_gold;
+    remapGold(src, xmap, ymap, dst_gold, interpolation, borderType, borderValue);
+
+    // Device result; the destination has the size of the coordinate maps.
+    const cv::Size dstSize = xmap.size();
+    cv::gpu::GpuMat dst = createMat(dstSize, type, useRoi);
+    cv::gpu::remap(loadMat(src, useRoi), dst, loadMat(xmap, useRoi), loadMat(ymap, useRoi), interpolation, borderType, borderValue);
+
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-4 : 1.0);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Remap, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    // matrix types
+    testing::Values(
+        MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
+        MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
+    // interpolation modes
+    testing::Values(
+        Interpolation(cv::INTER_NEAREST),
+        Interpolation(cv::INTER_LINEAR),
+        Interpolation(cv::INTER_CUBIC)),
+    // border modes
+    testing::Values(
+        Border(cv::BORDER_REFLECT101),
+        Border(cv::BORDER_REPLICATE),
+        Border(cv::BORDER_CONSTANT),
+        Border(cv::BORDER_REFLECT),
+        Border(cv::BORDER_WRAP)),
+    WHOLE_SUBMAT));
+
+#endif // HAVE_CUDA
--- a/modules/gpu/test/test_resize.cpp
+++ b/modules/gpu/test/test_resize.cpp
@ -0,0 +1,202 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+#ifdef HAVE_CUDA
+
+///////////////////////////////////////////////////////////////////
+// Gold implementation
+
+namespace
+{
+    // Reference resize for element type T.
+    // The destination size is (src.cols * fx, src.rows * fy), each converted with saturate_cast<int>.
+    // Destination pixel (x, y) samples the source at row y / fy, column x / fx through
+    // Interpolator<T>::getValue with cv::BORDER_REPLICATE.
+    template <typename T, template <typename> class Interpolator> void resizeImpl(const cv::Mat& src, cv::Mat& dst, double fx, double fy)
+    {
+        const int cn = src.channels();
+
+        cv::Size dsize(cv::saturate_cast<int>(src.cols * fx), cv::saturate_cast<int>(src.rows * fy));
+
+        dst.create(dsize, src.type());
+
+        // Inverse scale factors map destination coordinates back to the source.
+        float ifx = static_cast<float>(1.0 / fx);
+        float ify = static_cast<float>(1.0 / fy);
+
+        for (int y = 0; y < dsize.height; ++y)
+        {
+            for (int x = 0; x < dsize.width; ++x)
+            {
+                for (int c = 0; c < cn; ++c)
+                    dst.at<T>(y, x * cn + c) = Interpolator<T>::getValue(src, y * ify, x * ifx, c, cv::BORDER_REPLICATE);
+            }
+        }
+    }
+
+    // Dispatches to resizeImpl by interpolation mode and source depth.
+    // The tables are indexed directly by `interpolation` (nearest, linear, cubic) and by
+    // src.depth() (unsigned char, signed char, unsigned short, short, int, float).
+    void resizeGold(const cv::Mat& src, cv::Mat& dst, double fx, double fy, int interpolation)
+    {
+        typedef void (*func_t)(const cv::Mat& src, cv::Mat& dst, double fx, double fy);
+
+        static const func_t nearest_funcs[] =
+        {
+            resizeImpl<unsigned char, NearestInterpolator>,
+            resizeImpl<signed char, NearestInterpolator>,
+            resizeImpl<unsigned short, NearestInterpolator>,
+            resizeImpl<short, NearestInterpolator>,
+            resizeImpl<int, NearestInterpolator>,
+            resizeImpl<float, NearestInterpolator>
+        };
+
+
+        static const func_t linear_funcs[] =
+        {
+            resizeImpl<unsigned char, LinearInterpolator>,
+            resizeImpl<signed char, LinearInterpolator>,
+            resizeImpl<unsigned short, LinearInterpolator>,
+            resizeImpl<short, LinearInterpolator>,
+            resizeImpl<int, LinearInterpolator>,
+            resizeImpl<float, LinearInterpolator>
+        };
+
+        static const func_t cubic_funcs[] =
+        {
+            resizeImpl<unsigned char, CubicInterpolator>,
+            resizeImpl<signed char, CubicInterpolator>,
+            resizeImpl<unsigned short, CubicInterpolator>,
+            resizeImpl<short, CubicInterpolator>,
+            resizeImpl<int, CubicInterpolator>,
+            resizeImpl<float, CubicInterpolator>
+        };
+
+        static const func_t* funcs[] = {nearest_funcs, linear_funcs, cubic_funcs};
+
+        const int num_interpolations = static_cast<int>(sizeof(funcs) / sizeof(funcs[0]));
+        const int num_depths = static_cast<int>(sizeof(nearest_funcs) / sizeof(nearest_funcs[0]));
+
+        // Reject unsupported modes/depths (e.g. a 64-bit float source) instead of reading
+        // past the end of the dispatch tables.
+        CV_Assert(interpolation >= 0 && interpolation < num_interpolations);
+        CV_Assert(src.depth() >= 0 && src.depth() < num_depths);
+
+        // resizeImpl divides by both scale factors.
+        CV_Assert(fx > 0 && fy > 0);
+
+        funcs[interpolation][src.depth()](src, dst, fx, fy);
+    }
+}
+
+///////////////////////////////////////////////////////////////////
+// Test
+
+// Fixture for cv::gpu::resize: one instance per combination of
+// (device, image size, matrix type, scale factor, interpolation mode, ROI flag).
+PARAM_TEST_CASE(Resize, cv::gpu::DeviceInfo, cv::Size, MatType, double, Interpolation, UseRoi)
+{
+    // Members are listed in the order of the test parameters.
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int type;
+    double coeff;       // scale factor, used for both axes by the test body
+    int interpolation;
+    bool useRoi;
+
+    virtual void SetUp()
+    {
+        // Pick the device first, then unpack the remaining test parameters.
+        devInfo = GET_PARAM(0);
+        cv::gpu::setDevice(devInfo.deviceID());
+
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        coeff = GET_PARAM(3);
+        interpolation = GET_PARAM(4);
+        useRoi = GET_PARAM(5);
+    }
+};
+
+// Compares cv::gpu::resize against resizeGold with the same factor on both axes.
+// Tolerance: 1e-2 for CV_32F sources, 1.0 otherwise.
+TEST_P(Resize, Accuracy)
+{
+    cv::Mat src = randomMat(size, type);
+
+    // Reference result.
+    cv::Mat dst_gold;
+    resizeGold(src, dst_gold, coeff, coeff, interpolation);
+
+    // Device result, written into a destination pre-allocated with the scaled size.
+    const cv::Size dstSize(cv::saturate_cast<int>(src.cols * coeff), cv::saturate_cast<int>(src.rows * coeff));
+    cv::gpu::GpuMat dst = createMat(dstSize, type, useRoi);
+    cv::gpu::resize(loadMat(src, useRoi), dst, cv::Size(), coeff, coeff, interpolation);
+
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-2 : 1.0);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Resize, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    // matrix types
+    testing::Values(
+        MatType(CV_8UC3),
+        MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
+        MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
+    // scale factors (two below 1, two above 1)
+    testing::Values(0.3, 0.5, 1.5, 2.0),
+    // interpolation modes
+    testing::Values(
+        Interpolation(cv::INTER_NEAREST),
+        Interpolation(cv::INTER_LINEAR),
+        Interpolation(cv::INTER_CUBIC)),
+    WHOLE_SUBMAT));
+
+///////////////////////////////////////////////////////////////////
+// Test NPP
+
+// Fixture for the ResizeNPP test: one instance per combination of
+// (device, matrix type, scale factor, interpolation mode).
+PARAM_TEST_CASE(ResizeNPP, cv::gpu::DeviceInfo, MatType, double, Interpolation)
+{
+    // Members are listed in the order of the test parameters.
+    cv::gpu::DeviceInfo devInfo;
+    int type;
+    double coeff;       // scale factor, used for both axes by the test body
+    int interpolation;
+
+    virtual void SetUp()
+    {
+        // Pick the device first, then unpack the remaining test parameters.
+        devInfo = GET_PARAM(0);
+        cv::gpu::setDevice(devInfo.deviceID());
+
+        type = GET_PARAM(1);
+        coeff = GET_PARAM(2);
+        interpolation = GET_PARAM(3);
+    }
+};
+
+// Resizes a real image on the device and checks it is similar (1e-1) to resizeGold.
+TEST_P(ResizeNPP, Accuracy)
+{
+    // NOTE(review): this combination returns early without running the comparison;
+    // the reason is not stated here - presumably unsupported or inaccurate, confirm.
+    const bool skipped = (type == CV_8UC1 && interpolation == cv::INTER_CUBIC);
+    if (skipped)
+        return;
+
+    cv::Mat src = readImageType("stereobp/aloe-L.png", type);
+
+    // Reference result.
+    cv::Mat dst_gold;
+    resizeGold(src, dst_gold, coeff, coeff, interpolation);
+
+    // Device result; the destination is left for cv::gpu::resize to allocate.
+    cv::gpu::GpuMat dst;
+    cv::gpu::resize(loadMat(src), dst, cv::Size(), coeff, coeff, interpolation);
+
+    EXPECT_MAT_SIMILAR(dst_gold, dst, 1e-1);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, ResizeNPP, testing::Combine(
+    ALL_DEVICES,
+    // matrix types
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4)),
+    // scale factors (two below 1, two above 1)
+    testing::Values(0.3, 0.5, 1.5, 2.0),
+    // interpolation modes
+    testing::Values(
+        Interpolation(cv::INTER_NEAREST),
+        Interpolation(cv::INTER_LINEAR),
+        Interpolation(cv::INTER_CUBIC))));
+
+#endif // HAVE_CUDA
--- a/modules/gpu/test/test_threshold.cpp
+++ b/modules/gpu/test/test_threshold.cpp
@ -0,0 +1,88 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+#ifdef HAVE_CUDA
+
+// Fixture for cv::gpu::threshold: one instance per combination of
+// (device, image size, matrix type, threshold operation, ROI flag).
+PARAM_TEST_CASE(Threshold, cv::gpu::DeviceInfo, cv::Size, MatType, ThreshOp, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo;  // device the test runs on
+    cv::Size size;                // size of the source image
+    int type;                     // matrix type of the source image
+    int threshOp;                 // threshold operation passed to both implementations
+    bool useRoi;                  // forwarded to createMat/loadMat
+
+    virtual void SetUp()
+    {
+        // Pick the device first, then unpack the remaining test parameters.
+        devInfo = GET_PARAM(0);
+        cv::gpu::setDevice(devInfo.deviceID());
+
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        threshOp = GET_PARAM(3);
+        useRoi = GET_PARAM(4);
+    }
+};
+
+// Compares cv::gpu::threshold against cv::threshold; the results must match exactly.
+TEST_P(Threshold, Accuracy)
+{
+    cv::Mat src = randomMat(size, type);
+
+    // maxVal is drawn from [20, 127]; the threshold is drawn from [0, maxVal].
+    double maxVal = randomDouble(20.0, 127.0);
+    double thresh = randomDouble(0.0, maxVal);
+
+    // CPU reference.
+    cv::Mat dst_gold;
+    cv::threshold(src, dst_gold, thresh, maxVal, threshOp);
+
+    // Device result, same size and type as the source.
+    cv::gpu::GpuMat dst = createMat(src.size(), src.type(), useRoi);
+    cv::gpu::threshold(loadMat(src, useRoi), dst, thresh, maxVal, threshOp);
+
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Threshold, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    // matrix types
+    testing::Values(MatType(CV_8UC1), MatType(CV_16SC1), MatType(CV_32FC1)),
+    // threshold operations
+    testing::Values(
+        ThreshOp(cv::THRESH_BINARY),
+        ThreshOp(cv::THRESH_BINARY_INV),
+        ThreshOp(cv::THRESH_TRUNC),
+        ThreshOp(cv::THRESH_TOZERO),
+        ThreshOp(cv::THRESH_TOZERO_INV)),
+    WHOLE_SUBMAT));
+
+#endif // HAVE_CUDA
--- a/modules/gpu/test/test_video.cpp
+++ b/modules/gpu/test/test_video.cpp
@ -39,7 +39,7 @@
 //
 //M*/

-#include "test_precomp.hpp"
+#include "precomp.hpp"

 #ifdef HAVE_CUDA

--- a/modules/gpu/test/test_warp_affine.cpp
+++ b/modules/gpu/test/test_warp_affine.cpp
@ -0,0 +1,275 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+#ifdef HAVE_CUDA
+
+namespace
+{
+    // Builds the 2x3 CV_64FC1 matrix shared by the affine tests: a rotation by
+    // `angle` (radians) with an x-translation of srcSize.width / 2 (integer
+    // division) and no y-translation.
+    cv::Mat createTransfomMatrix(cv::Size srcSize, double angle)
+    {
+        const double cosA = std::cos(angle);
+        const double sinA = std::sin(angle);
+
+        cv::Mat M(2, 3, CV_64FC1);
+
+        double* const row0 = M.ptr<double>(0);
+        row0[0] = cosA;
+        row0[1] = -sinA;
+        row0[2] = srcSize.width / 2;
+
+        double* const row1 = M.ptr<double>(1);
+        row1[0] = sinA;
+        row1[1] = cosA;
+        row1[2] = 0.0;
+
+        return M;
+    }
+}
+
+///////////////////////////////////////////////////////////////////
+// Test buildWarpAffineMaps
+
+PARAM_TEST_CASE(BuildWarpAffineMaps, cv::gpu::DeviceInfo, cv::Size, Inverse) // fixture for the cv::gpu::buildWarpAffineMaps test
+{
+    cv::gpu::DeviceInfo devInfo; // GPU selected in SetUp()
+    cv::Size size; // destination size the maps are built for
+    bool inverse; // passed to buildWarpAffineMaps and mirrored as WARP_INVERSE_MAP for the reference
+
+    virtual void SetUp() // unpacks the parameter tuple and selects the device
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        inverse = GET_PARAM(2);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+// Remapping through the GPU-built maps must reproduce cv::warpAffine exactly
+// (nearest-neighbour interpolation, constant border).
+TEST_P(BuildWarpAffineMaps, Accuracy)
+{
+    const cv::Mat transform = createTransfomMatrix(size, CV_PI / 4);
+
+    cv::gpu::GpuMat d_xmap, d_ymap;
+    cv::gpu::buildWarpAffineMaps(transform, inverse, size, d_xmap, d_ymap);
+
+    // The source has its own random size; only the maps/destination use `size`.
+    const cv::Mat image = randomMat(randomSize(200, 400), CV_8UC1);
+
+    cv::Mat remapped;
+    cv::remap(image, remapped, cv::Mat(d_xmap), cv::Mat(d_ymap), cv::INTER_NEAREST, cv::BORDER_CONSTANT);
+
+    const int inverseBit = inverse ? static_cast<int>(cv::WARP_INVERSE_MAP) : 0;
+    const int warpFlags = cv::INTER_NEAREST | inverseBit;
+
+    cv::Mat expected;
+    cv::warpAffine(image, expected, transform, size, warpFlags, cv::BORDER_CONSTANT);
+
+    EXPECT_MAT_NEAR(expected, remapped, 0.0);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, BuildWarpAffineMaps, testing::Combine( // every device x size x direct/inverse combination
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    DIRECT_INVERSE));
+
+///////////////////////////////////////////////////////////////////
+// Gold implementation
+
+namespace
+{
+    // Reference affine warp for element type T: every destination pixel (x, y) is
+    // mapped through the 2x3 matrix M to a source coordinate, which is then sampled
+    // channel by channel with Interpolator<T>::getValue.
+    template <typename T, template <typename> class Interpolator> void warpAffineImpl(const cv::Mat& src, const cv::Mat& M, cv::Size dsize, cv::Mat& dst, int borderType, cv::Scalar borderVal)
+    {
+        const int channels = src.channels();
+
+        dst.create(dsize, src.type());
+
+        // Matrix coefficients do not change inside the loops.
+        const double m00 = M.at<double>(0, 0), m01 = M.at<double>(0, 1), m02 = M.at<double>(0, 2);
+        const double m10 = M.at<double>(1, 0), m11 = M.at<double>(1, 1), m12 = M.at<double>(1, 2);
+
+        for (int row = 0; row < dsize.height; ++row)
+        {
+            T* const out = dst.ptr<T>(row);
+
+            for (int col = 0; col < dsize.width; ++col)
+            {
+                const float srcX = static_cast<float>(m00 * col + m01 * row + m02);
+                const float srcY = static_cast<float>(m10 * col + m11 * row + m12);
+
+                for (int ch = 0; ch < channels; ++ch)
+                    out[col * channels + ch] = Interpolator<T>::getValue(src, srcY, srcX, ch, borderType, borderVal);
+            }
+        }
+    }
+
+    // Reference (CPU) affine warp used as ground truth for cv::gpu::warpAffine.
+    //
+    // src           - source image; depth must be one of CV_8U .. CV_32F.
+    // M             - 2x3 CV_64F transform; if `inverse` is false it is inverted first.
+    // dsize         - size of the destination image.
+    // dst           - receives the result (same type as src).
+    // interpolation - cv::INTER_NEAREST, cv::INTER_LINEAR or cv::INTER_CUBIC.
+    // borderType / borderVal - forwarded unchanged to the interpolator.
+    void warpAffineGold(const cv::Mat& src, const cv::Mat& M, bool inverse, cv::Size dsize, cv::Mat& dst, int interpolation, int borderType, cv::Scalar borderVal)
+    {
+        typedef void (*func_t)(const cv::Mat& src, const cv::Mat& M, cv::Size dsize, cv::Mat& dst, int borderType, cv::Scalar borderVal);
+
+        // Each table is indexed by matrix depth (CV_8U .. CV_32F).
+        static const func_t nearest_funcs[] =
+        {
+            warpAffineImpl<unsigned char, NearestInterpolator>,
+            warpAffineImpl<signed char, NearestInterpolator>,
+            warpAffineImpl<unsigned short, NearestInterpolator>,
+            warpAffineImpl<short, NearestInterpolator>,
+            warpAffineImpl<int, NearestInterpolator>,
+            warpAffineImpl<float, NearestInterpolator>
+        };
+
+        static const func_t linear_funcs[] =
+        {
+            warpAffineImpl<unsigned char, LinearInterpolator>,
+            warpAffineImpl<signed char, LinearInterpolator>,
+            warpAffineImpl<unsigned short, LinearInterpolator>,
+            warpAffineImpl<short, LinearInterpolator>,
+            warpAffineImpl<int, LinearInterpolator>,
+            warpAffineImpl<float, LinearInterpolator>
+        };
+
+        static const func_t cubic_funcs[] =
+        {
+            warpAffineImpl<unsigned char, CubicInterpolator>,
+            warpAffineImpl<signed char, CubicInterpolator>,
+            warpAffineImpl<unsigned short, CubicInterpolator>,
+            warpAffineImpl<short, CubicInterpolator>,
+            warpAffineImpl<int, CubicInterpolator>,
+            warpAffineImpl<float, CubicInterpolator>
+        };
+
+        // Indexed by interpolation flag.
+        static const func_t* funcs[] = {nearest_funcs, linear_funcs, cubic_funcs};
+
+        // Reject anything the tables do not cover instead of reading past their end.
+        CV_Assert(interpolation == cv::INTER_NEAREST || interpolation == cv::INTER_LINEAR || interpolation == cv::INTER_CUBIC);
+        CV_Assert(src.depth() <= CV_32F);
+
+        if (inverse)
+            funcs[interpolation][src.depth()](src, M, dsize, dst, borderType, borderVal);
+        else
+        {
+            // The implementation maps destination -> source, so a forward transform is inverted.
+            cv::Mat iM;
+            cv::invertAffineTransform(M, iM);
+            funcs[interpolation][src.depth()](src, iM, dsize, dst, borderType, borderVal);
+        }
+    }
+}
+
+///////////////////////////////////////////////////////////////////
+// Test
+
+// Fixture for the cv::gpu::warpAffine accuracy test. The transform matrix is
+// built locally by the test body, so the fixture keeps no matrix member.
+PARAM_TEST_CASE(WarpAffine, cv::gpu::DeviceInfo, cv::Size, MatType, Inverse, Interpolation, Border, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo; // GPU selected in SetUp()
+    cv::Size size;               // source and destination size
+    int type;                    // matrix type of the source
+    bool inverse;                // adds WARP_INVERSE_MAP to the flags when true
+    int interpolation;           // interpolation flag
+    int borderType;              // border extrapolation mode
+    bool useRoi;                 // forwarded to createMat()/loadMat()
+
+    // Unpacks the parameter tuple and selects the device.
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        inverse = GET_PARAM(3);
+        interpolation = GET_PARAM(4);
+        borderType = GET_PARAM(5);
+        useRoi = GET_PARAM(6);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+// Compares cv::gpu::warpAffine with the reference warpAffineGold on a random image.
+TEST_P(WarpAffine, Accuracy)
+{
+    const cv::Mat input = randomMat(size, type);
+    const cv::Mat transform = createTransfomMatrix(size, CV_PI / 3);
+
+    const int inverseBit = inverse ? static_cast<int>(cv::WARP_INVERSE_MAP) : 0;
+    const int warpFlags = interpolation | inverseBit;
+
+    const cv::Scalar fill = randomScalar(0.0, 255.0);
+
+    // Device side: the destination is allocated first, then the source is uploaded.
+    cv::gpu::GpuMat d_result = createMat(size, type, useRoi);
+    const cv::gpu::GpuMat d_input = loadMat(input, useRoi);
+    cv::gpu::warpAffine(d_input, d_result, transform, size, warpFlags, borderType, fill);
+
+    // Host reference.
+    cv::Mat expected;
+    warpAffineGold(input, transform, inverse, size, expected, interpolation, borderType, fill);
+
+    // Float images get a tighter tolerance than integer ones.
+    const double tolerance = (input.depth() == CV_32F) ? 1e-1 : 1.0;
+    EXPECT_MAT_NEAR(expected, d_result, tolerance);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpAffine, testing::Combine( // runs the test for every combination of the value sets below
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
+    DIRECT_INVERSE,
+    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
+    testing::Values(Border(cv::BORDER_REFLECT101), Border(cv::BORDER_REPLICATE), Border(cv::BORDER_REFLECT), Border(cv::BORDER_WRAP)), // BORDER_CONSTANT is not in this list; the NPP test passes it to the reference
+    WHOLE_SUBMAT));
+
+///////////////////////////////////////////////////////////////////
+// Test NPP
+
+PARAM_TEST_CASE(WarpAffineNPP, cv::gpu::DeviceInfo, MatType, Inverse, Interpolation) // fixture for the warpAffine test that relies on the default border arguments
+{
+    cv::gpu::DeviceInfo devInfo; // GPU selected in SetUp()
+    int type; // matrix type the test image is converted to
+    bool inverse; // adds WARP_INVERSE_MAP to the flags when true
+    int interpolation; // interpolation flag
+
+    virtual void SetUp() // unpacks the parameter tuple and selects the device
+    {
+        devInfo = GET_PARAM(0);
+        type = GET_PARAM(1);
+        inverse = GET_PARAM(2);
+        interpolation = GET_PARAM(3);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+// Warps a real test image with cv::gpu::warpAffine using its default border
+// arguments and checks similarity against the reference computed with a zero
+// constant border.
+TEST_P(WarpAffineNPP, Accuracy)
+{
+    const cv::Mat image = readImageType("stereobp/aloe-L.png", type);
+    const cv::Size imageSize = image.size();
+    const cv::Mat transform = createTransfomMatrix(imageSize, CV_PI / 4);
+
+    const int inverseBit = inverse ? static_cast<int>(cv::WARP_INVERSE_MAP) : 0;
+    const int warpFlags = interpolation | inverseBit;
+
+    cv::gpu::GpuMat d_result;
+    cv::gpu::warpAffine(loadMat(image), d_result, transform, imageSize, warpFlags);
+
+    cv::Mat expected;
+    warpAffineGold(image, transform, inverse, imageSize, expected, interpolation, cv::BORDER_CONSTANT, cv::Scalar::all(0));
+
+    EXPECT_MAT_SIMILAR(expected, d_result, 2e-2);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpAffineNPP, testing::Combine( // runs the test for every combination of the value sets below
+    ALL_DEVICES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)), // 8-bit and float types only
+    DIRECT_INVERSE,
+    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC))));
+
+#endif // HAVE_CUDA
--- a/modules/gpu/test/test_warp_perspective.cpp
+++ b/modules/gpu/test/test_warp_perspective.cpp
@ -0,0 +1,275 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+#ifdef HAVE_CUDA
+
+namespace
+{
+    // Builds the 3x3 CV_64FC1 matrix shared by the perspective tests: a rotation
+    // by `angle` (radians) with an x-translation of srcSize.width / 2 (integer
+    // division), no y-translation, and a last row of (0, 0, 1).
+    cv::Mat createTransfomMatrix(cv::Size srcSize, double angle)
+    {
+        const double cosA = std::cos(angle);
+        const double sinA = std::sin(angle);
+
+        cv::Mat M(3, 3, CV_64FC1);
+
+        double* const row0 = M.ptr<double>(0);
+        row0[0] = cosA;
+        row0[1] = -sinA;
+        row0[2] = srcSize.width / 2;
+
+        double* const row1 = M.ptr<double>(1);
+        row1[0] = sinA;
+        row1[1] = cosA;
+        row1[2] = 0.0;
+
+        double* const row2 = M.ptr<double>(2);
+        row2[0] = 0.0;
+        row2[1] = 0.0;
+        row2[2] = 1.0;
+
+        return M;
+    }
+}
+
+///////////////////////////////////////////////////////////////////
+// Test buildWarpPerspectiveMaps
+
+PARAM_TEST_CASE(BuildWarpPerspectiveMaps, cv::gpu::DeviceInfo, cv::Size, Inverse) // fixture for the cv::gpu::buildWarpPerspectiveMaps test
+{
+    cv::gpu::DeviceInfo devInfo; // GPU selected in SetUp()
+    cv::Size size; // destination size the maps are built for
+    bool inverse; // passed to buildWarpPerspectiveMaps and mirrored as WARP_INVERSE_MAP for the reference
+
+    virtual void SetUp() // unpacks the parameter tuple and selects the device
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        inverse = GET_PARAM(2);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+// Remapping through the GPU-built maps must reproduce cv::warpPerspective exactly
+// (nearest-neighbour interpolation, constant border).
+TEST_P(BuildWarpPerspectiveMaps, Accuracy)
+{
+    const cv::Mat transform = createTransfomMatrix(size, CV_PI / 4);
+
+    cv::gpu::GpuMat d_xmap, d_ymap;
+    cv::gpu::buildWarpPerspectiveMaps(transform, inverse, size, d_xmap, d_ymap);
+
+    // The source has its own random size; only the maps/destination use `size`.
+    const cv::Mat image = randomMat(randomSize(200, 400), CV_8UC1);
+
+    cv::Mat remapped;
+    cv::remap(image, remapped, cv::Mat(d_xmap), cv::Mat(d_ymap), cv::INTER_NEAREST, cv::BORDER_CONSTANT);
+
+    const int inverseBit = inverse ? static_cast<int>(cv::WARP_INVERSE_MAP) : 0;
+    const int warpFlags = cv::INTER_NEAREST | inverseBit;
+
+    cv::Mat expected;
+    cv::warpPerspective(image, expected, transform, size, warpFlags, cv::BORDER_CONSTANT);
+
+    EXPECT_MAT_NEAR(expected, remapped, 0.0);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, BuildWarpPerspectiveMaps, testing::Combine( // every device x size x direct/inverse combination
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    DIRECT_INVERSE));
+
+///////////////////////////////////////////////////////////////////
+// Gold implementation
+
+namespace
+{
+    // Reference perspective warp for element type T: every destination pixel (x, y)
+    // is mapped through the 3x3 matrix M (with division by the third homogeneous
+    // coordinate) to a source coordinate, which is then sampled channel by channel
+    // with Interpolator<T>::getValue.
+    template <typename T, template <typename> class Interpolator> void warpPerspectiveImpl(const cv::Mat& src, const cv::Mat& M, cv::Size dsize, cv::Mat& dst, int borderType, cv::Scalar borderVal)
+    {
+        const int channels = src.channels();
+
+        dst.create(dsize, src.type());
+
+        // Matrix coefficients do not change inside the loops.
+        const double m00 = M.at<double>(0, 0), m01 = M.at<double>(0, 1), m02 = M.at<double>(0, 2);
+        const double m10 = M.at<double>(1, 0), m11 = M.at<double>(1, 1), m12 = M.at<double>(1, 2);
+        const double m20 = M.at<double>(2, 0), m21 = M.at<double>(2, 1), m22 = M.at<double>(2, 2);
+
+        for (int row = 0; row < dsize.height; ++row)
+        {
+            T* const out = dst.ptr<T>(row);
+
+            for (int col = 0; col < dsize.width; ++col)
+            {
+                // The divisor is deliberately rounded to float before the division.
+                const float w = static_cast<float>(m20 * col + m21 * row + m22);
+
+                const float srcX = static_cast<float>((m00 * col + m01 * row + m02) / w);
+                const float srcY = static_cast<float>((m10 * col + m11 * row + m12) / w);
+
+                for (int ch = 0; ch < channels; ++ch)
+                    out[col * channels + ch] = Interpolator<T>::getValue(src, srcY, srcX, ch, borderType, borderVal);
+            }
+        }
+    }
+
+    // Reference (CPU) perspective warp used as ground truth for cv::gpu::warpPerspective.
+    //
+    // src           - source image; depth must be one of CV_8U .. CV_32F.
+    // M             - 3x3 CV_64F transform; if `inverse` is false it is inverted first.
+    // dsize         - size of the destination image.
+    // dst           - receives the result (same type as src).
+    // interpolation - cv::INTER_NEAREST, cv::INTER_LINEAR or cv::INTER_CUBIC.
+    // borderType / borderVal - forwarded unchanged to the interpolator.
+    void warpPerspectiveGold(const cv::Mat& src, const cv::Mat& M, bool inverse, cv::Size dsize, cv::Mat& dst, int interpolation, int borderType, cv::Scalar borderVal)
+    {
+        typedef void (*func_t)(const cv::Mat& src, const cv::Mat& M, cv::Size dsize, cv::Mat& dst, int borderType, cv::Scalar borderVal);
+
+        // Each table is indexed by matrix depth (CV_8U .. CV_32F).
+        static const func_t nearest_funcs[] =
+        {
+            warpPerspectiveImpl<unsigned char, NearestInterpolator>,
+            warpPerspectiveImpl<signed char, NearestInterpolator>,
+            warpPerspectiveImpl<unsigned short, NearestInterpolator>,
+            warpPerspectiveImpl<short, NearestInterpolator>,
+            warpPerspectiveImpl<int, NearestInterpolator>,
+            warpPerspectiveImpl<float, NearestInterpolator>
+        };
+
+        static const func_t linear_funcs[] =
+        {
+            warpPerspectiveImpl<unsigned char, LinearInterpolator>,
+            warpPerspectiveImpl<signed char, LinearInterpolator>,
+            warpPerspectiveImpl<unsigned short, LinearInterpolator>,
+            warpPerspectiveImpl<short, LinearInterpolator>,
+            warpPerspectiveImpl<int, LinearInterpolator>,
+            warpPerspectiveImpl<float, LinearInterpolator>
+        };
+
+        static const func_t cubic_funcs[] =
+        {
+            warpPerspectiveImpl<unsigned char, CubicInterpolator>,
+            warpPerspectiveImpl<signed char, CubicInterpolator>,
+            warpPerspectiveImpl<unsigned short, CubicInterpolator>,
+            warpPerspectiveImpl<short, CubicInterpolator>,
+            warpPerspectiveImpl<int, CubicInterpolator>,
+            warpPerspectiveImpl<float, CubicInterpolator>
+        };
+
+        // Indexed by interpolation flag.
+        static const func_t* funcs[] = {nearest_funcs, linear_funcs, cubic_funcs};
+
+        // Reject anything the tables do not cover instead of reading past their end.
+        CV_Assert(interpolation == cv::INTER_NEAREST || interpolation == cv::INTER_LINEAR || interpolation == cv::INTER_CUBIC);
+        CV_Assert(src.depth() <= CV_32F);
+
+        if (inverse)
+            funcs[interpolation][src.depth()](src, M, dsize, dst, borderType, borderVal);
+        else
+        {
+            // The implementation maps destination -> source, so a forward transform is inverted.
+            cv::Mat iM;
+            cv::invert(M, iM);
+            funcs[interpolation][src.depth()](src, iM, dsize, dst, borderType, borderVal);
+        }
+    }
+}
+
+///////////////////////////////////////////////////////////////////
+// Test
+
+// Fixture for the cv::gpu::warpPerspective accuracy test. The transform matrix
+// is built locally by the test body, so the fixture keeps no matrix member.
+PARAM_TEST_CASE(WarpPerspective, cv::gpu::DeviceInfo, cv::Size, MatType, Inverse, Interpolation, Border, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo; // GPU selected in SetUp()
+    cv::Size size;               // source and destination size
+    int type;                    // matrix type of the source
+    bool inverse;                // adds WARP_INVERSE_MAP to the flags when true
+    int interpolation;           // interpolation flag
+    int borderType;              // border extrapolation mode
+    bool useRoi;                 // forwarded to createMat()/loadMat()
+
+    // Unpacks the parameter tuple and selects the device.
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        inverse = GET_PARAM(3);
+        interpolation = GET_PARAM(4);
+        borderType = GET_PARAM(5);
+        useRoi = GET_PARAM(6);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+// Compares cv::gpu::warpPerspective with the reference warpPerspectiveGold on a random image.
+TEST_P(WarpPerspective, Accuracy)
+{
+    const cv::Mat input = randomMat(size, type);
+    const cv::Mat transform = createTransfomMatrix(size, CV_PI / 3);
+
+    const int inverseBit = inverse ? static_cast<int>(cv::WARP_INVERSE_MAP) : 0;
+    const int warpFlags = interpolation | inverseBit;
+
+    const cv::Scalar fill = randomScalar(0.0, 255.0);
+
+    // Device side: the destination is allocated first, then the source is uploaded.
+    cv::gpu::GpuMat d_result = createMat(size, type, useRoi);
+    const cv::gpu::GpuMat d_input = loadMat(input, useRoi);
+    cv::gpu::warpPerspective(d_input, d_result, transform, size, warpFlags, borderType, fill);
+
+    // Host reference.
+    cv::Mat expected;
+    warpPerspectiveGold(input, transform, inverse, size, expected, interpolation, borderType, fill);
+
+    // Float images get a tighter tolerance than integer ones.
+    const double tolerance = (input.depth() == CV_32F) ? 1e-1 : 1.0;
+    EXPECT_MAT_NEAR(expected, d_result, tolerance);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpPerspective, testing::Combine( // runs the test for every combination of the value sets below
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
+    DIRECT_INVERSE,
+    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
+    testing::Values(Border(cv::BORDER_REFLECT101), Border(cv::BORDER_REPLICATE), Border(cv::BORDER_REFLECT), Border(cv::BORDER_WRAP)), // BORDER_CONSTANT is not in this list; the NPP test passes it to the reference
+    WHOLE_SUBMAT));
+
+///////////////////////////////////////////////////////////////////
+// Test NPP
+
+PARAM_TEST_CASE(WarpPerspectiveNPP, cv::gpu::DeviceInfo, MatType, Inverse, Interpolation) // fixture for the warpPerspective test that relies on the default border arguments
+{
+    cv::gpu::DeviceInfo devInfo; // GPU selected in SetUp()
+    int type; // matrix type the test image is converted to
+    bool inverse; // adds WARP_INVERSE_MAP to the flags when true
+    int interpolation; // interpolation flag
+
+    virtual void SetUp() // unpacks the parameter tuple and selects the device
+    {
+        devInfo = GET_PARAM(0);
+        type = GET_PARAM(1);
+        inverse = GET_PARAM(2);
+        interpolation = GET_PARAM(3);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+// Warps a real test image with cv::gpu::warpPerspective using its default border
+// arguments and checks similarity against the reference computed with a zero
+// constant border.
+TEST_P(WarpPerspectiveNPP, Accuracy)
+{
+    const cv::Mat image = readImageType("stereobp/aloe-L.png", type);
+    const cv::Size imageSize = image.size();
+    const cv::Mat transform = createTransfomMatrix(imageSize, CV_PI / 4);
+
+    const int inverseBit = inverse ? static_cast<int>(cv::WARP_INVERSE_MAP) : 0;
+    const int warpFlags = interpolation | inverseBit;
+
+    cv::gpu::GpuMat d_result;
+    cv::gpu::warpPerspective(loadMat(image), d_result, transform, imageSize, warpFlags);
+
+    cv::Mat expected;
+    warpPerspectiveGold(image, transform, inverse, imageSize, expected, interpolation, cv::BORDER_CONSTANT, cv::Scalar::all(0));
+
+    EXPECT_MAT_SIMILAR(expected, d_result, 2e-2);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpPerspectiveNPP, testing::Combine( // runs the test for every combination of the value sets below
+    ALL_DEVICES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)), // 8-bit and float types only
+    DIRECT_INVERSE,
+    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC))));
+
+#endif // HAVE_CUDA
--- a/modules/gpu/test/test_gpu_base.cpp
+++ b/modules/gpu/test/test_gpu_base.cpp
@ -39,36 +39,94 @@
 //
 //M*/

-#include "test_precomp.hpp"
+#include "precomp.hpp"

 using namespace std;
 using namespace cv;
 using namespace cv::gpu;
 using namespace cvtest;

-GpuMat loadMat(const Mat& m, bool useRoi)
+// Draws an int from the test suite's shared RNG via RNG::uniform(minVal, maxVal).
+int randomInt(int minVal, int maxVal)
+{
+    return TS::ptr()->get_rng().uniform(minVal, maxVal);
+}
+
+// Draws a double from the test suite's shared RNG via RNG::uniform(minVal, maxVal).
+double randomDouble(double minVal, double maxVal)
+{
+    return TS::ptr()->get_rng().uniform(minVal, maxVal);
+}
+
+// Returns a Size whose width and height are each drawn with randomInt(minVal, maxVal).
+// The two draws are sequenced explicitly (width first): written as constructor
+// arguments their evaluation order is unspecified, so the same RNG seed could
+// produce different sizes with different compilers.
+Size randomSize(int minVal, int maxVal)
+{
+    const int width = randomInt(minVal, maxVal);
+    const int height = randomInt(minVal, maxVal);
+    return cv::Size(width, height);
+}
+
+// Returns a Scalar whose four components are each drawn with randomDouble(minVal, maxVal).
+// The draws are sequenced explicitly (component 0 first): written as constructor
+// arguments their evaluation order is unspecified, so the same RNG seed could
+// produce differently ordered components with different compilers.
+Scalar randomScalar(double minVal, double maxVal)
+{
+    const double v0 = randomDouble(minVal, maxVal);
+    const double v1 = randomDouble(minVal, maxVal);
+    const double v2 = randomDouble(minVal, maxVal);
+    const double v3 = randomDouble(minVal, maxVal);
+    return Scalar(v0, v1, v2, v3);
+}
+
+// Convenience overload: forwards to the RNG-taking randomMat overload using the
+// test suite's shared RNG, with the trailing flag set to false.
+Mat randomMat(Size size, int type, double minVal, double maxVal)
+{
+    RNG& sharedRng = TS::ptr()->get_rng();
+    return randomMat(sharedRng, size, type, minVal, maxVal, false);
+}
+
+cv::gpu::GpuMat createMat(cv::Size size, int type, bool useRoi)
 {
-    Size size = m.size();
    Size size0 = size;

    if (useRoi)
    {
-        RNG& rng = TS::ptr()->get_rng();
-
-        size0.width += rng.uniform(5, 15);
-        size0.height += rng.uniform(5, 15);
+        size0.width += randomInt(5, 15);
+        size0.height += randomInt(5, 15);
    }
        
-    GpuMat d_m(size0, m.type());
+    GpuMat d_m(size0, type);
    
    if (size0 != size)
        d_m = d_m(Rect((size0.width - size.width) / 2, (size0.height - size.height) / 2, size.width, size.height));

-    d_m.upload(m);
+    return d_m;
+}

+GpuMat loadMat(const Mat& m, bool useRoi)
+{
+    GpuMat d_m = createMat(m.size(), m.type(), useRoi);
+    d_m.upload(m);
    return d_m;
 }

+// Debug helper: shows the reference image, the actual image and a binary mask
+// obtained by thresholding their absolute difference at eps, then blocks in
+// waitKey(). An argument that is not a host Mat is fetched with getGpuMat()
+// and downloaded first.
+void showDiff(InputArray gold_, InputArray actual_, double eps)
+{
+    Mat goldHost;
+    if (gold_.kind() != _InputArray::MAT)
+        gold_.getGpuMat().download(goldHost);
+    else
+        goldHost = gold_.getMat();
+
+    Mat actualHost;
+    if (actual_.kind() != _InputArray::MAT)
+        actual_.getGpuMat().download(actualHost);
+    else
+        actualHost = actual_.getMat();
+
+    Mat mask;
+    absdiff(goldHost, actualHost, mask);
+    threshold(mask, mask, eps, 255.0, cv::THRESH_BINARY);
+
+    namedWindow("gold", WINDOW_NORMAL);
+    namedWindow("actual", WINDOW_NORMAL);
+    namedWindow("diff", WINDOW_NORMAL);
+
+    imshow("gold", goldHost);
+    imshow("actual", actualHost);
+    imshow("diff", mask);
+
+    waitKey();
+}
+
 bool supportFeature(const DeviceInfo& info, FeatureSet feature)
 {
    return TargetArchs::builtWith(feature) && info.supports(feature);
@ -149,6 +207,24 @@ Mat readImage(const string& fileName, int flags)
    return imread(string(cvtest::TS::ptr()->get_data_path()) + fileName, flags);
 }

+// Reads a test image and converts it to the channel count and depth of `type`:
+// one channel loads as grayscale, anything else as colour; four channels get
+// an extra BGR -> BGRA conversion. The depth is converted last.
+Mat readImageType(const string& fname, int type)
+{
+    const int channels = CV_MAT_CN(type);
+
+    int loadFlags = IMREAD_COLOR;
+    if (channels == 1)
+        loadFlags = IMREAD_GRAYSCALE;
+
+    Mat image = readImage(fname, loadFlags);
+
+    if (channels == 4)
+    {
+        Mat withAlpha;
+        cvtColor(image, withAlpha, cv::COLOR_BGR2BGRA);
+        image = withAlpha;
+    }
+
+    image.convertTo(image, CV_MAT_DEPTH(type));
+    return image;
+}
+
+double checkNorm(const Mat& m) // infinity norm (NORM_INF) of a single matrix
+{
+    return norm(m, NORM_INF);
+}
+
 double checkNorm(const Mat& m1, const Mat& m2)
 {
    return norm(m1, m2, NORM_INF);
@ -173,3 +249,11 @@ void PrintTo(const UseRoi& useRoi, std::ostream* os)
    else
        (*os) << "whole matrix";
 }
+
+// Streams a human-readable label for an Inverse test parameter.
+void PrintTo(const Inverse& inverse, std::ostream* os)
+{
+    const char* const label = inverse ? "inverse" : "direct";
+    (*os) << label;
+}
--- a/modules/gpu/test/test_gpu_base.hpp
+++ b/modules/gpu/test/test_gpu_base.hpp
@ -39,11 +39,20 @@
 //
 //M*/

-#ifndef __OPENCV_TEST_GPU_BASE_HPP__
-#define __OPENCV_TEST_GPU_BASE_HPP__
+#ifndef __OPENCV_TEST_UTILITY_HPP__
+#define __OPENCV_TEST_UTILITY_HPP__

+int randomInt(int minVal, int maxVal); // int drawn from the test suite's shared RNG via uniform(minVal, maxVal)
+double randomDouble(double minVal, double maxVal); // double drawn from the test suite's shared RNG via uniform(minVal, maxVal)
+cv::Size randomSize(int minVal, int maxVal); // width and height each drawn with randomInt(minVal, maxVal)
+cv::Scalar randomScalar(double minVal, double maxVal); // four components, each drawn with randomDouble(minVal, maxVal)
+cv::Mat randomMat(cv::Size size, int type, double minVal = 0.0, double maxVal = 255.0); // random matrix generated with the test suite's shared RNG
+
+cv::gpu::GpuMat createMat(cv::Size size, int type, bool useRoi = false); // device matrix without upload; with useRoi it is a centered sub-rectangle of an allocation padded by randomInt(5, 15) in each dimension
 cv::gpu::GpuMat loadMat(const cv::Mat& m, bool useRoi = false);

+void showDiff(cv::InputArray gold, cv::InputArray actual, double eps); // debug helper: displays both images and their thresholded absolute difference, then waits for a key
+
 //! return true if device supports specified feature and gpu module was built with support the feature.
 bool supportFeature(const cv::gpu::DeviceInfo& info, cv::gpu::FeatureSet feature);

@ -54,22 +63,29 @@ std::vector<cv::gpu::DeviceInfo> devices(cv::gpu::FeatureSet feature);

 //! read image from testdata folder.
 cv::Mat readImage(const std::string& fileName, int flags = cv::IMREAD_COLOR);
+cv::Mat readImageType(const std::string& fname, int type); // reads an image from the testdata folder and converts it to the channel count and depth of `type`

+double checkNorm(const cv::Mat& m); // infinity norm (NORM_INF) of a single matrix
 double checkNorm(const cv::Mat& m1, const cv::Mat& m2);
 double checkSimilarity(const cv::Mat& m1, const cv::Mat& m2);

+//! expects the infinity norm of mat (as computed by checkNorm) to be at most eps.
+#define EXPECT_MAT_NORM(mat, eps) \
+    { \
+        EXPECT_LE(checkNorm(cv::Mat(mat)), eps); \
+    }
+
 #define EXPECT_MAT_NEAR(mat1, mat2, eps) \
    { \
        ASSERT_EQ(mat1.type(), mat2.type()); \
        ASSERT_EQ(mat1.size(), mat2.size()); \
-        EXPECT_LE(checkNorm(mat1, mat2), eps); \
+        EXPECT_LE(checkNorm(cv::Mat(mat1), cv::Mat(mat2)), eps); \
    }

 #define EXPECT_MAT_SIMILAR(mat1, mat2, eps) \
    { \
        ASSERT_EQ(mat1.type(), mat2.type()); \
        ASSERT_EQ(mat1.size(), mat2.size()); \
-        EXPECT_LE(checkSimilarity(mat1, mat2), eps); \
+        EXPECT_LE(checkSimilarity(cv::Mat(mat1), cv::Mat(mat2)), eps); \
    }

 namespace cv { namespace gpu 
@ -99,6 +115,19 @@ private:

 void PrintTo(const UseRoi& useRoi, std::ostream* os);

+//! bool wrapper used as a test parameter (direct vs. inverse transform); having a
+//! distinct type lets the PrintTo overload below print a readable label for it.
+class Inverse
+{
+public:
+    //! implicit on purpose: plain bool values convert to the parameter type.
+    inline Inverse(bool val = false) : val_(val) {}
+
+    inline operator bool() const { return val_; }
+
+private:
+    bool val_;
+};
+
+//! prints "inverse" or "direct" for the given parameter value.
+void PrintTo(const Inverse& inverse, std::ostream* os);
+
 CV_ENUM(CmpCode, cv::CMP_EQ, cv::CMP_GT, cv::CMP_GE, cv::CMP_LT, cv::CMP_LE, cv::CMP_NE)

 CV_ENUM(NormCode, cv::NORM_INF, cv::NORM_L1, cv::NORM_L2, cv::NORM_TYPE_MASK, cv::NORM_RELATIVE, cv::NORM_MINMAX)
@ -127,11 +156,19 @@ CV_ENUM(TemplateMethod, cv::TM_SQDIFF, cv::TM_SQDIFF_NORMED, cv::TM_CCORR, cv::T
 CV_FLAGS(DftFlags, cv::DFT_INVERSE, cv::DFT_SCALE, cv::DFT_ROWS, cv::DFT_COMPLEX_OUTPUT, cv::DFT_REAL_OUTPUT)

 #define PARAM_TEST_CASE(name, ...) struct name : testing::TestWithParam< std::tr1::tuple< __VA_ARGS__ > >
+
 #define GET_PARAM(k) std::tr1::get< k >(GetParam())
+
 #define ALL_DEVICES testing::ValuesIn(devices())
 #define DEVICES(feature) testing::ValuesIn(devices(feature))
+
 #define ALL_TYPES testing::ValuesIn(all_types())
 #define TYPES(depth_start, depth_end, cn_start, cn_end) testing::ValuesIn(types(depth_start, depth_end, cn_start, cn_end))
-#define USE_ROI testing::Values(false, true)

-#endif // __OPENCV_TEST_GPU_BASE_HPP__
+#define DIFFERENT_SIZES testing::Values(cv::Size(128, 128), cv::Size(113, 113)) // two square sizes: 128 and 113
+
+#define WHOLE_SUBMAT testing::Values(UseRoi(false), UseRoi(true)) // both values of the UseRoi parameter
+
+#define DIRECT_INVERSE testing::Values(Inverse(false), Inverse(true)) // both values of the Inverse parameter
+
+#endif // __OPENCV_TEST_UTILITY_HPP__