diff --git a/apps/sft/CMakeLists.txt b/apps/sft/CMakeLists.txt
index 8b950225c8..c7bd187a28 100644
--- a/apps/sft/CMakeLists.txt
+++ b/apps/sft/CMakeLists.txt
@@ -1,7 +1,7 @@
 set(name sft)
 set(the_target opencv_${name})
 
-set(OPENCV_${the_target}_DEPS opencv_core opencv_softcascade opencv_highgui opencv_imgproc opencv_ml)
+set(OPENCV_${the_target}_DEPS opencv_core opencv_softcascade opencv_highgui opencv_imgproc opencv_ml OPTIONAL opencv_gpu opencv_objdetect opencv_features2d)
 ocv_check_dependencies(${OPENCV_${the_target}_DEPS})
 
 if(NOT OCV_DEPENDENCIES_FOUND)
diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index d82211bf38..ee42816dbd 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -7,12 +7,11 @@
 //  copy or use the software.
 //
 //
-//                          License Agreement
+//                           License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -23,7 +22,7 @@
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
+//     and/or other materials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
diff --git a/modules/gpu/src/gpu_init.cpp b/modules/gpu/src/gpu_init.cpp
deleted file mode 100644
index 8ed93651ad..0000000000
--- a/modules/gpu/src/gpu_init.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2008-2012, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-namespace cv { namespace gpu
-{
-
-CV_INIT_ALGORITHM(SCascade, "CascadeDetector.SCascade",
-                  obj.info()->addParam(obj, "minScale", obj.minScale);
-                  obj.info()->addParam(obj, "maxScale", obj.maxScale);
-                  obj.info()->addParam(obj, "scales",   obj.scales));
-
-bool initModule_gpu(void)
-{
-    Ptr<Algorithm> sc = createSCascade();
-    return sc->info() != 0;
-}
-
-} }
\ No newline at end of file
diff --git a/modules/softcascade/CMakeLists.txt b/modules/softcascade/CMakeLists.txt
index fb48814cf5..f19241a4a2 100644
--- a/modules/softcascade/CMakeLists.txt
+++ b/modules/softcascade/CMakeLists.txt
@@ -1,3 +1,50 @@
+macro(ocv_glob_cuda_powered_module_sources)  # globs host + CUDA sources and registers them; extra args go to ocv_set_module_sources
+  file(GLOB_RECURSE lib_srcs "src/*.cpp")
+  file(GLOB_RECURSE lib_int_hdrs "src/*.hpp" "src/*.h")
+  file(GLOB lib_hdrs "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h")
+  file(GLOB lib_hdrs_detail "include/opencv2/${name}/detail/*.hpp" "include/opencv2/${name}/detail/*.h")
+  # Device (.cu) sources; the header/object lists below stay empty unless the CUDA branch runs.
+  file(GLOB_RECURSE lib_device_srcs "src/*.cu")
+  set(lib_device_hdrs "")
+  set(device_objs "")
+  # Compile device code only when CUDA is enabled and at least one .cu file was found.
+  if (HAVE_CUDA AND lib_device_srcs)
+    ocv_include_directories(${CUDA_INCLUDE_DIRS})
+    file(GLOB_RECURSE lib_device_hdrs "src/cuda/*.hpp")
+
+    ocv_cuda_compile(device_objs ${lib_device_srcs})
+    source_group("Src\\Cuda" FILES ${lib_device_srcs} ${lib_device_hdrs})
+    if (lib_device_hdrs)
+      list(REMOVE_ITEM lib_int_hdrs ${lib_device_hdrs})
+    endif()
+  endif()
+  # Register public headers and every source (host, device objects, device sources/headers).
+  ocv_set_module_sources(${ARGN} HEADERS ${lib_hdrs} ${lib_hdrs_detail}
+                         SOURCES ${lib_srcs} ${lib_int_hdrs} ${device_objs} ${lib_device_srcs} ${lib_device_hdrs})
+  # IDE source groups for the host-side files.
+  source_group("Src" FILES ${lib_srcs} ${lib_int_hdrs})
+  source_group("Include" FILES ${lib_hdrs})
+  source_group("Include\\detail" FILES ${lib_hdrs_detail})
+endmacro()
+
 set(the_description "Soft Cascade detection and training")
-ocv_define_module(softcascade opencv_core opencv_imgproc opencv_ml)
-ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4310)
\ No newline at end of file
+ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4310 -Wundef -Wmissing-declarations)
+set(cuda_deps "")
+set(cuda_include "")
+# Add the CUDA runtime and NPP libraries as dependencies only when CUDA is enabled.
+if (HAVE_CUDA)
+  set(cuda_deps ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
+endif()
+ocv_add_module(softcascade opencv_core opencv_imgproc opencv_ml OPTIONAL opencv_gpu ${cuda_deps})
+
+if(HAVE_CUDA)
+  ocv_module_include_directories(${CUDA_INCLUDE_DIRS})
+  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
+endif()
+
+ocv_glob_cuda_powered_module_sources()
+ocv_create_module()
+ocv_add_precompiled_headers(${the_module})
+
+ocv_add_accuracy_tests()
+ocv_add_perf_tests()
diff --git a/modules/softcascade/include/opencv2/softcascade.hpp b/modules/softcascade/include/opencv2/softcascade.hpp
index e0dbdf4500..e97ac4d786 100644
--- a/modules/softcascade/include/opencv2/softcascade.hpp
+++ b/modules/softcascade/include/opencv2/softcascade.hpp
@@ -212,6 +212,96 @@ public:
 
 CV_EXPORTS bool initModule_softcascade(void);
 
+// ======================== GPU version for soft cascade ===================== //
+
+class CV_EXPORTS ChannelsProcessor
+{
+public:
+    enum
+    {
+        GENERIC   = 1 << 4,
+        SEPARABLE = 2 << 4
+    };
+
+    // Appends specified number of HOG first-order features integrals into given vector.
+    // Param frame is an input 3-channel bgr image.
+    // Param channels is a GPU matrix of optionally shrunk channels.
+    // Param stream is a high-level CUDA stream abstraction used for asynchronous execution.
+    virtual void apply(InputArray frame, OutputArray channels, cv::gpu::Stream& stream = cv::gpu::Stream::Null()) = 0;
+
+    // Creates a specific preprocessor implementation.
+    // Param shrinkage is a resizing factor. Resize is applied before computing the integral sum.
+    // Param bins is a number of HOG-like channels.
+    // Param flags is a set of extra channel-computing flags (see the enum above); defaults to GENERIC.
+    static cv::Ptr<ChannelsProcessor> create(const int shrinkage, const int bins, const int flags = GENERIC);
+
+    virtual ~ChannelsProcessor();
+
+protected:
+    ChannelsProcessor();
+};
+
+// Implementation of soft (stage-less) cascaded detector.
+class CV_EXPORTS SCascade : public cv::Algorithm
+{
+public:
+
+    // Representation of a single detection result.
+    struct CV_EXPORTS Detection
+    {
+        ushort x;
+        ushort y;
+        ushort w;
+        ushort h;
+        float confidence;
+        int kind;
+
+        enum {PEDESTRIAN = 0};
+    };
+
+    enum { NO_REJECT = 1, DOLLAR = 2, /*PASCAL = 4,*/ DEFAULT = NO_REJECT, NMS_MASK = 0xF};
+
+    // An empty cascade will be created.
+    // Param minScale is a minimum scale relative to the original size of the image on which cascade will be applied.
+    // Param maxScale is a maximum scale relative to the original size of the image on which cascade will be applied.
+    // Param scales is a number of scales from minScale to maxScale.
+    // Param flags is a bitwise OR of a rejection flag (low bits, see NMS_MASK) and a ChannelsProcessor flag.
+    SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55,
+        const int flags = NO_REJECT | ChannelsProcessor::GENERIC);
+
+    virtual ~SCascade();
+
+    cv::AlgorithmInfo* info() const;
+
+    // Load cascade from FileNode.
+    // Param fn is a root node for cascade. Should be <cascade>.
+    virtual bool load(const FileNode& fn);
+
+    // Load cascade config.
+    virtual void read(const FileNode& fn);
+
+    // Return the matrix of detected objects.
+    // Param image is a frame on which detector will be applied.
+    // Param rois is a regions of interests mask generated by genRoi.
+    //    Only the objects that fall into one of the regions will be returned.
+    // Param objects is an output array of Detections represented as GpuMat of detections (SCascade::Detection)
+    //    The first element of the matrix is actually a count of detections.
+    // Param stream is a high-level CUDA stream abstraction used for asynchronous execution.
+    virtual void detect(InputArray image, InputArray rois, OutputArray objects, cv::gpu::Stream& stream = cv::gpu::Stream::Null()) const;
+
+private:
+
+    struct Fields;
+    Fields* fields;
+
+    double minScale;
+    double maxScale;
+    int scales;
+
+    int flags;
+};
+
+
 }} // namespace cv { namespace softcascade {
 
 #endif
\ No newline at end of file
diff --git a/modules/gpu/perf/perf_softcascade.cpp b/modules/softcascade/perf/perf_cuda_softcascade.cpp
similarity index 82%
rename from modules/gpu/perf/perf_softcascade.cpp
rename to modules/softcascade/perf/perf_cuda_softcascade.cpp
index 6cb3c63562..86b7c7dd07 100644
--- a/modules/gpu/perf/perf_softcascade.cpp
+++ b/modules/softcascade/perf/perf_cuda_softcascade.cpp
@@ -1,5 +1,7 @@
 #include "perf_precomp.hpp"
 
+using std::tr1::get;
+
 #define SC_PERF_TEST_P(fixture, name, params)  \
     class fixture##_##name : public fixture {\
      public:\
@@ -25,8 +27,8 @@ void fixture##_##name::__cpu() { FAIL() << "No such CPU implementation analogy";
 namespace {
     struct DetectionLess
     {
-        bool operator()(const cv::gpu::SCascade::Detection& a,
-            const cv::gpu::SCascade::Detection& b) const
+        bool operator()(const cv::softcascade::SCascade::Detection& a,
+            const cv::softcascade::SCascade::Detection& b) const
         {
             if (a.x != b.x)      return a.x < b.x;
             else if (a.y != b.y) return a.y < b.y;
@@ -39,7 +41,7 @@ namespace {
     {
         cv::Mat detections(objects);
 
-        typedef cv::gpu::SCascade::Detection Detection;
+        typedef cv::softcascade::SCascade::Detection Detection;
         Detection* begin = (Detection*)(detections.ptr<char>(0));
         Detection* end = (Detection*)(detections.ptr<char>(0) + detections.cols);
         std::sort(begin, end, DetectionLess());
@@ -60,18 +62,18 @@ SC_PERF_TEST_P(SCascadeTest, detect,
 
 RUN_GPU(SCascadeTest, detect)
 {
-    cv::Mat cpu = readImage (GET_PARAM(1));
+    cv::Mat cpu = cv::imread(getDataPath(get<1>(GetParam())));
     ASSERT_FALSE(cpu.empty());
     cv::gpu::GpuMat colored(cpu);
 
-    cv::gpu::SCascade cascade;
+    cv::softcascade::SCascade cascade;
 
-    cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ);
+    cv::FileStorage fs(getDataPath(get<0>(GetParam())), cv::FileStorage::READ);
     ASSERT_TRUE(fs.isOpened());
 
     ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
 
-    cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1);
+    cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::softcascade::SCascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1);
     rois.setTo(1);
 
     cascade.detect(colored, rois, objectBoxes);
@@ -118,13 +120,13 @@ SC_PERF_TEST_P(SCascadeTestRoi, detectInRoi,
 
 RUN_GPU(SCascadeTestRoi, detectInRoi)
 {
-    cv::Mat cpu = readImage (GET_PARAM(1));
+    cv::Mat cpu = cv::imread(getDataPath(get<1>(GetParam())));
     ASSERT_FALSE(cpu.empty());
     cv::gpu::GpuMat colored(cpu);
 
-    cv::gpu::SCascade cascade;
+    cv::softcascade::SCascade cascade;
 
-    cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ);
+    cv::FileStorage fs(getDataPath(get<0>(GetParam())), cv::FileStorage::READ);
     ASSERT_TRUE(fs.isOpened());
 
     ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
@@ -132,7 +134,7 @@ RUN_GPU(SCascadeTestRoi, detectInRoi)
     cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(colored.size(), CV_8UC1);
     rois.setTo(0);
 
-    int nroi = GET_PARAM(2);
+    int nroi = get<2>(GetParam());
     cv::RNG rng;
     for (int i = 0; i < nroi; ++i)
     {
@@ -163,13 +165,13 @@ SC_PERF_TEST_P(SCascadeTestRoi, detectEachRoi,
 
 RUN_GPU(SCascadeTestRoi, detectEachRoi)
 {
-    cv::Mat cpu = readImage (GET_PARAM(1));
+    cv::Mat cpu = cv::imread(getDataPath(get<1>(GetParam())));
     ASSERT_FALSE(cpu.empty());
     cv::gpu::GpuMat colored(cpu);
 
-    cv::gpu::SCascade cascade;
+    cv::softcascade::SCascade cascade;
 
-    cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ);
+    cv::FileStorage fs(getDataPath(get<0>(GetParam())), cv::FileStorage::READ);
     ASSERT_TRUE(fs.isOpened());
 
     ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
@@ -177,7 +179,7 @@ RUN_GPU(SCascadeTestRoi, detectEachRoi)
     cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(colored.size(), CV_8UC1);
     rois.setTo(0);
 
-    int idx = GET_PARAM(2);
+    int idx = get<2>(GetParam());
     cv::Rect r = getFromTable(idx);
     cv::gpu::GpuMat sub(rois, r);
     sub.setTo(1);
@@ -202,18 +204,18 @@ SC_PERF_TEST_P(SCascadeTest, detectStream,
 
 RUN_GPU(SCascadeTest, detectStream)
 {
-    cv::Mat cpu = readImage (GET_PARAM(1));
+    cv::Mat cpu = cv::imread(getDataPath(get<1>(GetParam())));
     ASSERT_FALSE(cpu.empty());
     cv::gpu::GpuMat colored(cpu);
 
-    cv::gpu::SCascade cascade;
+    cv::softcascade::SCascade cascade;
 
-    cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ);
+    cv::FileStorage fs(getDataPath(get<0>(GetParam())), cv::FileStorage::READ);
     ASSERT_TRUE(fs.isOpened());
 
     ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
 
-    cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1);
+    cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::softcascade::SCascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1);
     rois.setTo(1);
 
     cv::gpu::Stream s;
diff --git a/modules/gpu/src/cuda/icf-sc.cu b/modules/softcascade/src/cuda/icf-sc.cu
similarity index 98%
rename from modules/gpu/src/cuda/icf-sc.cu
rename to modules/softcascade/src/cuda/icf-sc.cu
index f6eb74422d..d339ef0d36 100644
--- a/modules/gpu/src/cuda/icf-sc.cu
+++ b/modules/softcascade/src/cuda/icf-sc.cu
@@ -43,12 +43,11 @@
 #include <opencv2/gpu/device/common.hpp>
 #include <opencv2/gpu/device/saturate_cast.hpp>
 
-#include <icf.hpp>
+#include <cuda_invoker.hpp>
 #include <float.h>
 #include <stdio.h>
 
-namespace cv { namespace gpu { namespace device {
-namespace icf {
+namespace cv { namespace softcascade { namespace device {
 
     template <int FACTOR>
     __device__ __forceinline__ uchar shrink(const uchar* ptr, const int pitch, const int y, const int x)
@@ -303,7 +302,7 @@ namespace icf {
                     excluded = excluded || (suppessed == i);
                 }
 
-            #if __CUDA_ARCH__ >= 120
+            #if defined __CUDA_ARCH__ && (__CUDA_ARCH__ >= 120)
                 if (__all(excluded)) break;
             #endif
             }
@@ -348,7 +347,7 @@ namespace icf {
     template<typename Policy>
     struct PrefixSum
     {
-    __device static void apply(float& impact)
+    __device_inline__ static void apply(float& impact)
         {
     #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
     #pragma unroll
@@ -442,6 +441,7 @@ namespace icf {
     {
         x += area.x;
         y += area.y;
+
         int a = tex2D(thogluv, x, y);
         int b = tex2D(thogluv, x + area.z, y);
         int c = tex2D(thogluv, x + area.z, y + area.w);
@@ -454,7 +454,7 @@ namespace icf {
 
 template<typename Policy>
 template<bool isUp>
-__device void CascadeInvoker<Policy>::detect(Detection* objects, const uint ndetections, uint* ctr, const int downscales) const
+__device_inline__ void CascadeInvoker<Policy>::detect(Detection* objects, const uint ndetections, uint* ctr, const int downscales) const
 {
     const int y = blockIdx.y * blockDim.y + threadIdx.y;
     const int x = blockIdx.x;
@@ -563,5 +563,4 @@ void CascadeInvoker<Policy>::operator()(const PtrStepSzb& roi, const PtrStepSzi&
 template void CascadeInvoker<GK107PolicyX4>::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv,
     PtrStepSz<uchar4> objects, const int downscales, const cudaStream_t& stream) const;
 
-}
 }}}
diff --git a/modules/gpu/src/icf.hpp b/modules/softcascade/src/cuda_invoker.hpp
similarity index 81%
rename from modules/gpu/src/icf.hpp
rename to modules/softcascade/src/cuda_invoker.hpp
index e4e3f9416e..958850f0c6 100644
--- a/modules/gpu/src/icf.hpp
+++ b/modules/softcascade/src/cuda_invoker.hpp
@@ -22,7 +22,7 @@
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
+//     and/or other materials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
@@ -46,15 +46,16 @@
 
 #include <opencv2/gpu/device/common.hpp>
 
+using namespace cv::gpu::device;
+
 #if defined __CUDACC__
-# define __device __device__ __forceinline__
+# define __device_inline__ __device__ __forceinline__
 #else
-# define __device
+# define __device_inline__
 #endif
 
 
-namespace cv { namespace gpu { namespace device {
-namespace icf {
+namespace cv { namespace softcascade { namespace device {
 
 struct Octave
 {
@@ -68,20 +69,19 @@ struct Octave
     : index(i), stages(s), shrinkage(sh), size(sz), scale(sc) {}
 };
 
-struct Level //is actually 24 bytes
+struct Level
 {
     int octave;
     int step;
 
     float relScale;
-    float scaling[2]; // calculated according to Dollal paper
+    float scaling[2];// calculated according to Dollar paper
 
-    // for 640x480 we can not get overflow
     uchar2 workRect;
     uchar2 objSize;
 
     Level(int idx, const Octave& oct, const float scale, const int w, const int h);
-    __device Level(){}
+    __device_inline__ Level(){}
 };
 
 struct Node
@@ -106,7 +106,7 @@ struct Detection
     int kind;
 
     Detection(){}
-    __device Detection(int _x, int _y, uchar _w, uchar _h, float c)
+    __device_inline__ Detection(int _x, int _y, uchar _w, uchar _h, float c)
     : x(_x), y(_y), w(_w), h(_h), confidence(c), kind(0) {};
 };
 
@@ -125,8 +125,8 @@ struct CascadeInvoker
 {
     CascadeInvoker(): levels(0), stages(0), nodes(0), leaves(0), scales(0) {}
 
-    CascadeInvoker(const PtrStepSzb& _levels, const PtrStepSzf& _stages,
-                   const PtrStepSzb& _nodes,  const PtrStepSzf& _leaves)
+    CascadeInvoker(const cv::gpu::PtrStepSzb& _levels, const cv::gpu::PtrStepSzf& _stages,
+                   const cv::gpu::PtrStepSzb& _nodes,  const cv::gpu::PtrStepSzf& _leaves)
     : levels((const Level*)_levels.ptr()),
       stages((const float*)_stages.ptr()),
       nodes((const Node*)_nodes.ptr()), leaves((const float*)_leaves.ptr()),
@@ -141,14 +141,13 @@ struct CascadeInvoker
 
     int scales;
 
-    void operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects,
+    void operator()(const cv::gpu::PtrStepSzb& roi, const cv::gpu::PtrStepSzi& hogluv, cv::gpu::PtrStepSz<uchar4> objects,
         const int downscales, const cudaStream_t& stream = 0) const;
 
     template<bool isUp>
-    __device void detect(Detection* objects, const unsigned int ndetections, unsigned int* ctr, const int downscales) const;
+    __device_inline__ void detect(Detection* objects, const unsigned int ndetections, unsigned int* ctr, const int downscales) const;
 };
 
-}
 }}}
 
 #endif
\ No newline at end of file
diff --git a/modules/gpu/src/softcascade.cpp b/modules/softcascade/src/detector_cuda.cpp
similarity index 74%
rename from modules/gpu/src/softcascade.cpp
rename to modules/softcascade/src/detector_cuda.cpp
index 5abcd6308f..6c920332fc 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/softcascade/src/detector_cuda.cpp
@@ -41,17 +41,18 @@
 //M*/
 
 #include "precomp.hpp"
+#include "opencv2/gpu/stream_accessor.hpp"
 
 #if !defined (HAVE_CUDA)
-cv::gpu::SCascade::SCascade(const double, const double, const int, const int) { throw_nogpu(); }
+cv::softcascade::SCascade::SCascade(const double, const double, const int, const int) { throw_nogpu(); }
 
-cv::gpu::SCascade::~SCascade() { throw_nogpu(); }
+cv::softcascade::SCascade::~SCascade() { throw_nogpu(); }
 
-bool cv::gpu::SCascade::load(const FileNode&) { throw_nogpu(); return false;}
+bool cv::softcascade::SCascade::load(const FileNode&) { throw_nogpu(); return false;}
 
-void cv::gpu::SCascade::detect(InputArray, InputArray, OutputArray, Stream&) const { throw_nogpu(); }
+void cv::softcascade::SCascade::detect(InputArray, InputArray, OutputArray, cv::gpu::Stream&) const { throw_nogpu(); }
 
-void cv::gpu::SCascade::read(const FileNode& fn) { Algorithm::read(fn); }
+void cv::softcascade::SCascade::read(const FileNode& fn) { Algorithm::read(fn); }
 
 cv::gpu::ChannelsProcessor::ChannelsProcessor() { throw_nogpu(); }
  cv::gpu::ChannelsProcessor::~ChannelsProcessor() { throw_nogpu(); }
@@ -60,9 +61,9 @@ cv::Ptr<cv::gpu::ChannelsProcessor> cv::gpu::ChannelsProcessor::create(const int
 { throw_nogpu(); return cv::Ptr<cv::gpu::ChannelsProcessor>(0); }
 
 #else
-# include "icf.hpp"
+# include "cuda_invoker.hpp"
 
-cv::gpu::device::icf::Level::Level(int idx, const Octave& oct, const float scale, const int w, const int h)
+cv::softcascade::device::Level::Level(int idx, const Octave& oct, const float scale, const int w, const int h)
 :  octave(idx), step(oct.stages), relScale(scale / oct.scale)
 {
     workRect.x = cvRound(w / (float)oct.shrinkage);
@@ -81,23 +82,20 @@ cv::gpu::device::icf::Level::Level(int idx, const Octave& oct, const float scale
     }
 }
 
-namespace cv { namespace gpu { namespace device {
+namespace cv { namespace softcascade { namespace device {
 
-namespace icf {
     void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
         const int fw, const int fh, const int bins, cudaStream_t stream);
 
-    void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections,
-        PtrStepSzb suppressed, cudaStream_t stream);
+    void suppress(const cv::gpu::PtrStepSzb& objects, cv::gpu::PtrStepSzb overlaps, cv::gpu::PtrStepSzi ndetections,
+        cv::gpu::PtrStepSzb suppressed, cudaStream_t stream);
 
-    void bgr2Luv(const PtrStepSzb& bgr, PtrStepSzb luv);
-    void gray2hog(const PtrStepSzb& gray, PtrStepSzb mag, const int bins);
+    void bgr2Luv(const cv::gpu::PtrStepSzb& bgr, cv::gpu::PtrStepSzb luv);
+    void gray2hog(const cv::gpu::PtrStepSzb& gray, cv::gpu::PtrStepSzb mag, const int bins);
     void shrink(const cv::gpu::PtrStepSzb& channels, cv::gpu::PtrStepSzb shrunk);
-}
-
 }}}
 
-struct cv::gpu::SCascade::Fields
+struct cv::softcascade::SCascade::Fields
 {
     static Fields* parseCascade(const FileNode &root, const float mins, const float maxs, const int totals, const int method)
     {
@@ -138,11 +136,9 @@ struct cv::gpu::SCascade::Fields
         FileNode fn = root[SC_OCTAVES];
         if (fn.empty()) return 0;
 
-        using namespace device::icf;
-
-        std::vector<Octave>  voctaves;
+        std::vector<device::Octave>  voctaves;
         std::vector<float>   vstages;
-        std::vector<Node>    vnodes;
+        std::vector<device::Node>    vnodes;
         std::vector<float>   vleaves;
 
         FileNodeIterator it = fn.begin(), it_end = fn.end();
@@ -158,7 +154,7 @@ struct cv::gpu::SCascade::Fields
             size.x = cvRound(origWidth * scale);
             size.y = cvRound(origHeight * scale);
 
-            Octave octave(octIndex, nweaks, shrinkage, size, scale);
+            device::Octave octave(octIndex, nweaks, shrinkage, size, scale);
             CV_Assert(octave.stages > 0);
             voctaves.push_back(octave);
 
@@ -227,7 +223,7 @@ struct cv::gpu::SCascade::Fields
                     rect.w = saturate_cast<uchar>(r.height);
 
                     unsigned int channel = saturate_cast<unsigned int>(feature_channels[featureIdx]);
-                    vnodes.push_back(Node(rect, channel, th));
+                    vnodes.push_back(device::Node(rect, channel, th));
                 }
 
                 intfns = octfn[SC_LEAF];
@@ -239,13 +235,13 @@ struct cv::gpu::SCascade::Fields
             }
         }
 
-        cv::Mat hoctaves(1, (int) (voctaves.size() * sizeof(Octave)), CV_8UC1, (uchar*)&(voctaves[0]));
+        cv::Mat hoctaves(1, (int) (voctaves.size() * sizeof(device::Octave)), CV_8UC1, (uchar*)&(voctaves[0]));
         CV_Assert(!hoctaves.empty());
 
         cv::Mat hstages(cv::Mat(vstages).reshape(1,1));
         CV_Assert(!hstages.empty());
 
-        cv::Mat hnodes(1, (int) (vnodes.size() * sizeof(Node)), CV_8UC1, (uchar*)&(vnodes[0]) );
+        cv::Mat hnodes(1, (int) (vnodes.size() * sizeof(device::Node)), CV_8UC1, (uchar*)&(vnodes[0]) );
         CV_Assert(!hnodes.empty());
 
         cv::Mat hleaves(cv::Mat(vleaves).reshape(1,1));
@@ -272,8 +268,7 @@ struct cv::gpu::SCascade::Fields
 
     int createLevels(const int fh, const int fw)
     {
-        using namespace device::icf;
-        std::vector<Level> vlevels;
+        std::vector<device::Level> vlevels;
         float logFactor = (::log(maxScale) - ::log(minScale)) / (totals -1);
 
         float scale = minScale;
@@ -286,7 +281,7 @@ struct cv::gpu::SCascade::Fields
             float logScale = ::log(scale);
             int fit = fitOctave(voctaves, logScale);
 
-            Level level(fit, voctaves[fit], scale, width, height);
+            device::Level level(fit, voctaves[fit], scale, width, height);
 
             if (!width || !height)
                 break;
@@ -300,7 +295,7 @@ struct cv::gpu::SCascade::Fields
             scale = ::std::min(maxScale, ::expf(::log(scale) + logFactor));
         }
 
-        cv::Mat hlevels = cv::Mat(1, (int) (vlevels.size() * sizeof(Level)), CV_8UC1, (uchar*)&(vlevels[0]) );
+        cv::Mat hlevels = cv::Mat(1, (int) (vlevels.size() * sizeof(device::Level)), CV_8UC1, (uchar*)&(vlevels[0]) );
         CV_Assert(!hlevels.empty());
         levels.upload(hlevels);
         downscales = dcs;
@@ -334,7 +329,7 @@ struct cv::gpu::SCascade::Fields
         preprocessor = ChannelsProcessor::create(shrinkage, 6, method);
     }
 
-    void detect(cv::gpu::GpuMat& objects, Stream& s) const
+    void detect(cv::gpu::GpuMat& objects, cv::gpu::Stream& s) const
     {
         if (s)
             s.enqueueMemSet(objects, 0);
@@ -343,16 +338,16 @@ struct cv::gpu::SCascade::Fields
 
         cudaSafeCall( cudaGetLastError());
 
-        device::icf::CascadeInvoker<device::icf::GK107PolicyX4> invoker
-        = device::icf::CascadeInvoker<device::icf::GK107PolicyX4>(levels, stages, nodes, leaves);
+        device::CascadeInvoker<device::GK107PolicyX4> invoker
+        = device::CascadeInvoker<device::GK107PolicyX4>(levels, stages, nodes, leaves);
 
-        cudaStream_t stream = StreamAccessor::getStream(s);
+        cudaStream_t stream = cv::gpu::StreamAccessor::getStream(s);
         invoker(mask, hogluv, objects, downscales, stream);
     }
 
-    void suppress(GpuMat& objects, Stream& s)
+    void suppress(cv::gpu::GpuMat& objects, cv::gpu::Stream& s)
     {
-        GpuMat ndetections = GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1));
+        cv::gpu::GpuMat ndetections = cv::gpu::GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1));
         ensureSizeIsEnough(objects.rows, objects.cols, CV_8UC1, overlaps);
 
         if (s)
@@ -366,20 +361,20 @@ struct cv::gpu::SCascade::Fields
             suppressed.setTo(0);
         }
 
-        cudaStream_t stream = StreamAccessor::getStream(s);
-        device::icf::suppress(objects, overlaps, ndetections, suppressed, stream);
+        cudaStream_t stream = cv::gpu::StreamAccessor::getStream(s);
+        device::suppress(objects, overlaps, ndetections, suppressed, stream);
     }
 
 private:
 
-    typedef std::vector<device::icf::Octave>::const_iterator  octIt_t;
-    static int fitOctave(const std::vector<device::icf::Octave>& octs, const float& logFactor)
+    typedef std::vector<device::Octave>::const_iterator  octIt_t;
+    static int fitOctave(const std::vector<device::Octave>& octs, const float& logFactor)
     {
         float minAbsLog = FLT_MAX;
         int res =  0;
         for (int oct = 0; oct < (int)octs.size(); ++oct)
         {
-            const device::icf::Octave& octave =octs[oct];
+            const device::Octave& octave =octs[oct];
             float logOctave = ::log(octave.scale);
             float logAbsScale = ::fabs(logFactor - logOctave);
 
@@ -410,37 +405,37 @@ public:
 
 
     // 160x120x10
-    GpuMat shrunk;
+    cv::gpu::GpuMat shrunk;
 
     // temporal mat for integral
-    GpuMat integralBuffer;
+    cv::gpu::GpuMat integralBuffer;
 
     // 161x121x10
-    GpuMat hogluv;
+    cv::gpu::GpuMat hogluv;
 
 
     // used for suppression
-    GpuMat suppressed;
+    cv::gpu::GpuMat suppressed;
     // used for area overlap computing during
-    GpuMat overlaps;
+    cv::gpu::GpuMat overlaps;
 
 
     // Cascade from xml
-    GpuMat octaves;
-    GpuMat stages;
-    GpuMat nodes;
-    GpuMat leaves;
-    GpuMat levels;
+    cv::gpu::GpuMat octaves;
+    cv::gpu::GpuMat stages;
+    cv::gpu::GpuMat nodes;
+    cv::gpu::GpuMat leaves;
+    cv::gpu::GpuMat levels;
 
 
     // For ROI
-    GpuMat mask;
-    GpuMat genRoiTmp;
+    cv::gpu::GpuMat mask;
+    cv::gpu::GpuMat genRoiTmp;
 
-//     GpuMat collected;
+//     cv::gpu::GpuMat collected;
 
 
-    std::vector<device::icf::Octave> voctaves;
+    std::vector<device::Octave> voctaves;
 
 //     DeviceInfo info;
 
@@ -453,19 +448,19 @@ public:
     };
 };
 
-cv::gpu::SCascade::SCascade(const double mins, const double maxs, const int sc, const int fl)
+cv::softcascade::SCascade::SCascade(const double mins, const double maxs, const int sc, const int fl)
 : fields(0),  minScale(mins), maxScale(maxs), scales(sc), flags(fl) {}
 
-cv::gpu::SCascade::~SCascade() { delete fields; }
+cv::softcascade::SCascade::~SCascade() { delete fields; }
 
-bool cv::gpu::SCascade::load(const FileNode& fn)
+bool cv::softcascade::SCascade::load(const FileNode& fn)
 {
     if (fields) delete fields;
     fields = Fields::parseCascade(fn, (float)minScale, (float)maxScale, scales, flags);
     return fields != 0;
 }
 
-void cv::gpu::SCascade::detect(InputArray _image, InputArray _rois, OutputArray _objects, Stream& s) const
+void cv::softcascade::SCascade::detect(InputArray _image, InputArray _rois, OutputArray _objects, cv::gpu::Stream& s) const
 {
     CV_Assert(fields);
 
@@ -473,11 +468,11 @@ void cv::gpu::SCascade::detect(InputArray _image, InputArray _rois, OutputArray
     int type = _image.type();
     CV_Assert(type == CV_8UC3 || type == CV_32SC1 || (!_rois.empty()));
 
-    const GpuMat image = _image.getGpuMat();
+    const cv::gpu::GpuMat image = _image.getGpuMat();
 
     if (_objects.empty()) _objects.create(1, 4096 * sizeof(Detection), CV_8UC1);
 
-    GpuMat rois = _rois.getGpuMat(), objects = _objects.getGpuMat();
+    cv::gpu::GpuMat rois = _rois.getGpuMat(), objects = _objects.getGpuMat();
 
     /// roi
     Fields& flds = *fields;
@@ -510,13 +505,13 @@ void cv::gpu::SCascade::detect(InputArray _image, InputArray _rois, OutputArray
 
     if ( (flags && NMS_MASK) != NO_REJECT)
     {
-        GpuMat spr(objects, cv::Rect(0, 0, flds.suppressed.cols, flds.suppressed.rows));
+        cv::gpu::GpuMat spr(objects, cv::Rect(0, 0, flds.suppressed.cols, flds.suppressed.rows));
         flds.suppress(objects, s);
         flds.suppressed.copyTo(spr);
     }
 }
 
-void cv::gpu::SCascade::read(const FileNode& fn)
+void cv::softcascade::SCascade::read(const FileNode& fn)
 {
     Algorithm::read(fn);
 }
@@ -528,7 +523,7 @@ using cv::OutputArray;
 using cv::gpu::Stream;
 using cv::gpu::GpuMat;
 
-inline void setZero(cv::gpu::GpuMat& m, Stream& s)
+inline void setZero(cv::gpu::GpuMat& m, cv::gpu::Stream& s)
 {
     if (s)
         s.enqueueMemSet(m, 0);
@@ -536,17 +531,17 @@ inline void setZero(cv::gpu::GpuMat& m, Stream& s)
         m.setTo(0);
 }
 
-struct GenricPreprocessor : public cv::gpu::ChannelsProcessor
+struct GenricPreprocessor : public cv::softcascade::ChannelsProcessor
 {
-    GenricPreprocessor(const int s, const int b) : cv::gpu::ChannelsProcessor(), shrinkage(s), bins(b) {}
+    GenricPreprocessor(const int s, const int b) : cv::softcascade::ChannelsProcessor(), shrinkage(s), bins(b) {}
     virtual ~GenricPreprocessor() {}
 
-    virtual void apply(InputArray _frame, OutputArray _shrunk, Stream& s = Stream::Null())
+    virtual void apply(InputArray _frame, OutputArray _shrunk, cv::gpu::Stream& s = cv::gpu::Stream::Null())
     {
-        const GpuMat frame = _frame.getGpuMat();
+        const cv::gpu::GpuMat frame = _frame.getGpuMat();
 
         _shrunk.create(frame.rows * (4 + bins) / shrinkage, frame.cols / shrinkage, CV_8UC1);
-        GpuMat shrunk = _shrunk.getGpuMat();
+        cv::gpu::GpuMat shrunk = _shrunk.getGpuMat();
 
         channels.create(frame.rows * (4 + bins), frame.cols, CV_8UC1);
         setZero(channels, s);
@@ -561,53 +556,53 @@ struct GenricPreprocessor : public cv::gpu::ChannelsProcessor
 
 private:
 
-    void createHogBins(Stream& s)
+    void createHogBins(cv::gpu::Stream& s)
     {
         static const int fw = gray.cols;
         static const int fh = gray.rows;
 
         fplane.create(fh * HOG_BINS, fw, CV_32FC1);
 
-        GpuMat dfdx(fplane, cv::Rect(0,  0, fw, fh));
-        GpuMat dfdy(fplane, cv::Rect(0, fh, fw, fh));
+        cv::gpu::GpuMat dfdx(fplane, cv::Rect(0,  0, fw, fh));
+        cv::gpu::GpuMat dfdy(fplane, cv::Rect(0, fh, fw, fh));
 
         cv::gpu::Sobel(gray, dfdx, CV_32F, 1, 0, sobelBuf, 3, 1, cv::BORDER_DEFAULT, -1, s);
         cv::gpu::Sobel(gray, dfdy, CV_32F, 0, 1, sobelBuf, 3, 1, cv::BORDER_DEFAULT, -1, s);
 
-        GpuMat mag(fplane, cv::Rect(0, 2 * fh, fw, fh));
-        GpuMat ang(fplane, cv::Rect(0, 3 * fh, fw, fh));
+        cv::gpu::GpuMat mag(fplane, cv::Rect(0, 2 * fh, fw, fh));
+        cv::gpu::GpuMat ang(fplane, cv::Rect(0, 3 * fh, fw, fh));
 
         cv::gpu::cartToPolar(dfdx, dfdy, mag, ang, true, s);
 
         // normalize magnitude to uchar interval and angles to 6 bins
-        GpuMat nmag(fplane, cv::Rect(0, 4 * fh, fw, fh));
-        GpuMat nang(fplane, cv::Rect(0, 5 * fh, fw, fh));
+        cv::gpu::GpuMat nmag(fplane, cv::Rect(0, 4 * fh, fw, fh));
+        cv::gpu::GpuMat nang(fplane, cv::Rect(0, 5 * fh, fw, fh));
 
         cv::gpu::multiply(mag, cv::Scalar::all(1.f / (8 *::log(2.0f))), nmag, 1, -1, s);
         cv::gpu::multiply(ang, cv::Scalar::all(1.f / 60.f),     nang, 1, -1, s);
 
         //create uchar magnitude
-        GpuMat cmag(channels, cv::Rect(0, fh * HOG_BINS, fw, fh));
+        cv::gpu::GpuMat cmag(channels, cv::Rect(0, fh * HOG_BINS, fw, fh));
         if (s)
             s.enqueueConvert(nmag, cmag, CV_8UC1);
         else
             nmag.convertTo(cmag, CV_8UC1);
 
         cudaStream_t stream = cv::gpu::StreamAccessor::getStream(s);
-        cv::gpu::device::icf::fillBins(channels, nang, fw, fh, HOG_BINS, stream);
+        cv::softcascade::device::fillBins(channels, nang, fw, fh, HOG_BINS, stream);
     }
 
-    void createLuvBins(const cv::gpu::GpuMat& colored, Stream& s)
+    void createLuvBins(const cv::gpu::GpuMat& colored, cv::gpu::Stream& s)
     {
         static const int fw = colored.cols;
         static const int fh = colored.rows;
 
         cv::gpu::cvtColor(colored, luv, CV_BGR2Luv, s);
 
-        std::vector<GpuMat> splited;
+        std::vector<cv::gpu::GpuMat> splited;
         for(int i = 0; i < LUV_BINS; ++i)
         {
-            splited.push_back(GpuMat(channels, cv::Rect(0, fh * (7 + i), fw, fh)));
+            splited.push_back(cv::gpu::GpuMat(channels, cv::Rect(0, fh * (7 + i), fw, fh)));
         }
 
         cv::gpu::split(luv, splited, s);
@@ -618,62 +613,62 @@ private:
     const int shrinkage;
     const int bins;
 
-    GpuMat gray;
-    GpuMat luv;
-    GpuMat channels;
+    cv::gpu::GpuMat gray;
+    cv::gpu::GpuMat luv;
+    cv::gpu::GpuMat channels;
 
     // preallocated buffer for floating point operations
-    GpuMat fplane;
-    GpuMat sobelBuf;
+    cv::gpu::GpuMat fplane;
+    cv::gpu::GpuMat sobelBuf;
 };
 
 
-struct SeparablePreprocessor : public cv::gpu::ChannelsProcessor
+struct SeparablePreprocessor : public cv::softcascade::ChannelsProcessor
 {
-    SeparablePreprocessor(const int s, const int b) : cv::gpu::ChannelsProcessor(), shrinkage(s), bins(b) {}
+    SeparablePreprocessor(const int s, const int b) : cv::softcascade::ChannelsProcessor(), shrinkage(s), bins(b) {}
     virtual ~SeparablePreprocessor() {}
 
-    virtual void apply(InputArray _frame, OutputArray _shrunk, Stream& s = Stream::Null())
+    virtual void apply(InputArray _frame, OutputArray _shrunk, cv::gpu::Stream& s = cv::gpu::Stream::Null())
     {
-        const GpuMat frame = _frame.getGpuMat();
+        const cv::gpu::GpuMat frame = _frame.getGpuMat();
         cv::gpu::GaussianBlur(frame, bgr, cv::Size(3, 3), -1.0);
 
         _shrunk.create(frame.rows * (4 + bins) / shrinkage, frame.cols / shrinkage, CV_8UC1);
-        GpuMat shrunk = _shrunk.getGpuMat();
+        cv::gpu::GpuMat shrunk = _shrunk.getGpuMat();
 
         channels.create(frame.rows * (4 + bins), frame.cols, CV_8UC1);
         setZero(channels, s);
 
         cv::gpu::cvtColor(bgr, gray, CV_BGR2GRAY);
-        cv::gpu::device::icf::gray2hog(gray, channels(cv::Rect(0, 0, bgr.cols, bgr.rows * (bins + 1))), bins);
+        cv::softcascade::device::gray2hog(gray, channels(cv::Rect(0, 0, bgr.cols, bgr.rows * (bins + 1))), bins);
 
         cv::gpu::GpuMat luv(channels, cv::Rect(0, bgr.rows * (bins + 1), bgr.cols, bgr.rows * 3));
-        cv::gpu::device::icf::bgr2Luv(bgr, luv);
-        cv::gpu::device::icf::shrink(channels, shrunk);
+        cv::softcascade::device::bgr2Luv(bgr, luv);
+        cv::softcascade::device::shrink(channels, shrunk);
     }
 
 private:
     const int shrinkage;
     const int bins;
 
-    GpuMat bgr;
-    GpuMat gray;
-    GpuMat channels;
+    cv::gpu::GpuMat bgr;
+    cv::gpu::GpuMat gray;
+    cv::gpu::GpuMat channels;
 };
 
 }
 
-cv::Ptr<cv::gpu::ChannelsProcessor> cv::gpu::ChannelsProcessor::create(const int s, const int b, const int m)
+cv::Ptr<cv::softcascade::ChannelsProcessor> cv::softcascade::ChannelsProcessor::create(const int s, const int b, const int m) // factory: s = shrinkage, b = bins, m = mode
 {
-    CV_Assert((m && SEPARABLE) || (m && GENERIC));
+    CV_Assert((m & SEPARABLE) || (m & GENERIC)); // test the mode bits with '&'; logical '&&' ignored which bit was set
 
-    if (m && GENERIC)
-        return cv::Ptr<cv::gpu::ChannelsProcessor>(new GenricPreprocessor(s, b));
+    if (m & GENERIC) // GENERIC requested; otherwise fall through to the separable implementation
+        return cv::Ptr<cv::softcascade::ChannelsProcessor>(new GenricPreprocessor(s, b));
 
-    return cv::Ptr<cv::gpu::ChannelsProcessor>(new SeparablePreprocessor(s, b));
+    return cv::Ptr<cv::softcascade::ChannelsProcessor>(new SeparablePreprocessor(s, b));
 }
 
-cv::gpu::ChannelsProcessor::ChannelsProcessor() { }
-cv::gpu::ChannelsProcessor::~ChannelsProcessor() { }
+cv::softcascade::ChannelsProcessor::ChannelsProcessor() { }
+cv::softcascade::ChannelsProcessor::~ChannelsProcessor() { }
 
 #endif
diff --git a/modules/softcascade/src/softcascade_init.cpp b/modules/softcascade/src/softcascade_init.cpp
index 48ad46ab2d..902ad48a1d 100644
--- a/modules/softcascade/src/softcascade_init.cpp
+++ b/modules/softcascade/src/softcascade_init.cpp
@@ -51,11 +51,16 @@ CV_INIT_ALGORITHM(Detector, "SoftCascade.Detector",
                   obj.info()->addParam(obj, "scales",      obj.scales);
                   obj.info()->addParam(obj, "rejCriteria", obj.rejCriteria));
 
+CV_INIT_ALGORITHM(SCascade, "CascadeDetector.SCascade",
+                  obj.info()->addParam(obj, "minScale", obj.minScale);
+                  obj.info()->addParam(obj, "maxScale", obj.maxScale);
+                  obj.info()->addParam(obj, "scales",   obj.scales));
 
 bool initModule_softcascade(void)
 {
+    Ptr<Algorithm> sc = createSCascade();
     Ptr<Algorithm> sc1 = createDetector();
-    return (sc1->info() != 0);
+    return (sc1->info() != 0) && (sc->info() != 0);
 }
 
 } }
\ No newline at end of file
diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/softcascade/test/test_cuda_softcascade.cpp
similarity index 80%
rename from modules/gpu/test/test_softcascade.cpp
rename to modules/softcascade/test/test_cuda_softcascade.cpp
index c08dc06c8f..f97a26ad30 100644
--- a/modules/gpu/test/test_softcascade.cpp
+++ b/modules/softcascade/test/test_cuda_softcascade.cpp
@@ -41,10 +41,9 @@
 //M*/
 
 #include "test_precomp.hpp"
+#include "opencv2/core/gpumat.hpp"
 
-#ifdef HAVE_CUDA
-
-using cv::gpu::GpuMat;
+using std::tr1::get;
 
 // show detection results on input image with cv::imshow
 //#define SHOW_DETECTIONS
@@ -59,7 +58,7 @@ using cv::gpu::GpuMat;
 
 static std::string path(std::string relative)
 {
-    return cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/" + relative;
+    return cvtest::TS::ptr()->get_data_path() + "cascadeandhog/" + relative;
 }
 
 TEST(SCascadeTest, readCascade)
@@ -67,7 +66,7 @@ TEST(SCascadeTest, readCascade)
     std::string xml = path("cascades/inria_caltech-17.01.2013.xml");
     cv::FileStorage fs(xml, cv::FileStorage::READ);
 
-    cv::gpu::SCascade cascade;
+    cv::softcascade::SCascade cascade;
 
     ASSERT_TRUE(fs.isOpened());
     ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
@@ -75,7 +74,7 @@ TEST(SCascadeTest, readCascade)
 
 namespace
 {
-    typedef cv::gpu::SCascade::Detection Detection;
+    typedef cv::softcascade::SCascade::Detection Detection;
 
     cv::Rect getFromTable(int idx)
     {
@@ -97,7 +96,6 @@ namespace
         return rois[idx];
     }
 
-
     void print(std::ostream &out, const Detection& d)
     {
     #if defined SHOW_DETECTIONS
@@ -156,36 +154,36 @@ namespace
 #endif
 }
 
-PARAM_TEST_CASE(SCascadeTestRoi, cv::gpu::DeviceInfo, std::string, std::string, int)
+class SCascadeTestRoi : public ::testing::TestWithParam<std::tr1::tuple<cv::gpu::DeviceInfo, std::string, std::string, int> >
 {
     virtual void SetUp()
     {
-        cv::gpu::setDevice(GET_PARAM(0).deviceID());
+        cv::gpu::setDevice(get<0>(GetParam()).deviceID());
     }
 };
 
-GPU_TEST_P(SCascadeTestRoi, Detect)
+TEST_P(SCascadeTestRoi, Detect)
 {
-    cv::Mat coloredCpu = cv::imread(path(GET_PARAM(2)));
+    cv::Mat coloredCpu = cv::imread(path(get<2>(GetParam())));
     ASSERT_FALSE(coloredCpu.empty());
 
-    cv::gpu::SCascade cascade;
+    cv::softcascade::SCascade cascade;
 
-    cv::FileStorage fs(path(GET_PARAM(1)), cv::FileStorage::READ);
+    cv::FileStorage fs(path(get<1>(GetParam())), cv::FileStorage::READ);
     ASSERT_TRUE(fs.isOpened());
 
     ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
 
-    GpuMat colored(coloredCpu), objectBoxes(1, 16384, CV_8UC1), rois(colored.size(), CV_8UC1);
+    cv::gpu::GpuMat colored(coloredCpu), objectBoxes(1, 16384, CV_8UC1), rois(colored.size(), CV_8UC1);
     rois.setTo(0);
 
-    int nroi = GET_PARAM(3);
+    int nroi = get<3>(GetParam());
     cv::Mat result(coloredCpu);
     cv::RNG rng;
     for (int i = 0; i < nroi; ++i)
     {
         cv::Rect r = getFromTable(rng(10));
-        GpuMat sub(rois, r);
+        cv::gpu::GpuMat sub(rois, r);
         sub.setTo(1);
         cv::rectangle(result, r, cv::Scalar(0, 0, 255, 255), 1);
     }
@@ -194,7 +192,7 @@ GPU_TEST_P(SCascadeTestRoi, Detect)
     cascade.detect(colored, rois, objectBoxes);
 
     cv::Mat dt(objectBoxes);
-    typedef cv::gpu::SCascade::Detection Detection;
+    typedef cv::softcascade::SCascade::Detection Detection;
 
     Detection* dts = ((Detection*)dt.data) + 1;
     int* count = dt.ptr<int>(0);
@@ -211,15 +209,13 @@ GPU_TEST_P(SCascadeTestRoi, Detect)
     SHOW(result);
 }
 
-INSTANTIATE_TEST_CASE_P(GPU_SoftCascade, SCascadeTestRoi, testing::Combine(
+INSTANTIATE_TEST_CASE_P(cuda_accelerated, SCascadeTestRoi, testing::Combine( // device x cascade xml x image x number of ROIs
     ALL_DEVICES,
     testing::Values(std::string("cascades/inria_caltech-17.01.2013.xml"),
                     std::string("cascades/sc_cvpr_2012_to_opencv_new_format.xml")),
     testing::Values(std::string("images/image_00000000_0.png")),
     testing::Range(0, 5)));
 
-////////////////////////////////////////
-
 namespace {
 
 struct Fixture
@@ -232,23 +228,24 @@ struct Fixture
 };
 }
 
-PARAM_TEST_CASE(SCascadeTestAll, cv::gpu::DeviceInfo, Fixture)
+typedef std::tr1::tuple<cv::gpu::DeviceInfo, Fixture> SCascadeTestAllFixture;
+class SCascadeTestAll : public ::testing::TestWithParam<SCascadeTestAllFixture>
 {
-
+protected:
     std::string xml;
     int expected;
 
     virtual void SetUp()
     {
-        cv::gpu::setDevice(GET_PARAM(0).deviceID());
-        xml = path(GET_PARAM(1).path);
-        expected = GET_PARAM(1).expected;
+        cv::gpu::setDevice(get<0>(GetParam()).deviceID());
+        xml = path(get<1>(GetParam()).path);
+        expected = get<1>(GetParam()).expected;
     }
 };
 
-GPU_TEST_P(SCascadeTestAll, detect)
+TEST_P(SCascadeTestAll, detect)
 {
-    cv::gpu::SCascade cascade;
+    cv::softcascade::SCascade cascade;
 
     cv::FileStorage fs(xml, cv::FileStorage::READ);
     ASSERT_TRUE(fs.isOpened());
@@ -258,12 +255,12 @@ GPU_TEST_P(SCascadeTestAll, detect)
     cv::Mat coloredCpu = cv::imread(path("images/image_00000000_0.png"));
     ASSERT_FALSE(coloredCpu.empty());
 
-    GpuMat colored(coloredCpu), objectBoxes, rois(colored.size(), CV_8UC1);
+    cv::gpu::GpuMat colored(coloredCpu), objectBoxes, rois(colored.size(), CV_8UC1);
     rois.setTo(1);
 
     cascade.detect(colored, rois, objectBoxes);
 
-    typedef cv::gpu::SCascade::Detection Detection;
+    typedef cv::softcascade::SCascade::Detection Detection;
     cv::Mat dt(objectBoxes);
 
 
@@ -283,9 +280,9 @@ GPU_TEST_P(SCascadeTestAll, detect)
     ASSERT_EQ(*count, expected);
 }
 
-GPU_TEST_P(SCascadeTestAll, detectStream)
+TEST_P(SCascadeTestAll, detectStream)
 {
-    cv::gpu::SCascade cascade;
+    cv::softcascade::SCascade cascade;
 
     cv::FileStorage fs(xml, cv::FileStorage::READ);
     ASSERT_TRUE(fs.isOpened());
@@ -295,7 +292,7 @@ GPU_TEST_P(SCascadeTestAll, detectStream)
     cv::Mat coloredCpu = cv::imread(path("images/image_00000000_0.png"));
     ASSERT_FALSE(coloredCpu.empty());
 
-    GpuMat colored(coloredCpu), objectBoxes(1, 100000, CV_8UC1), rois(colored.size(), CV_8UC1);
+    cv::gpu::GpuMat colored(coloredCpu), objectBoxes(1, 100000, CV_8UC1), rois(colored.size(), CV_8UC1);
     rois.setTo(cv::Scalar::all(1));
 
     cv::gpu::Stream s;
@@ -304,14 +301,12 @@ GPU_TEST_P(SCascadeTestAll, detectStream)
     cascade.detect(colored, rois, objectBoxes, s);
     s.waitForCompletion();
 
-    typedef cv::gpu::SCascade::Detection Detection;
+    typedef cv::softcascade::SCascade::Detection Detection;
     cv::Mat detections(objectBoxes);
     int a = *(detections.ptr<int>(0));
     ASSERT_EQ(a, expected);
 }
 
-INSTANTIATE_TEST_CASE_P(GPU_SoftCascade, SCascadeTestAll, testing::Combine( ALL_DEVICES,
+INSTANTIATE_TEST_CASE_P(cuda_accelerated, SCascadeTestAll, testing::Combine( ALL_DEVICES,
                     testing::Values(Fixture("cascades/inria_caltech-17.01.2013.xml", 7),
-                                    Fixture("cascades/sc_cvpr_2012_to_opencv_new_format.xml", 1291))));
-
-#endif
+                                    Fixture("cascades/sc_cvpr_2012_to_opencv_new_format.xml", 1291))));
\ No newline at end of file
diff --git a/modules/softcascade/test/test_main.cpp b/modules/softcascade/test/test_main.cpp
index d3999d4833..ab0e8615bd 100644
--- a/modules/softcascade/test/test_main.cpp
+++ b/modules/softcascade/test/test_main.cpp
@@ -42,4 +42,4 @@
 
 #include "test_precomp.hpp"
 
-CV_TEST_MAIN("cv")
+CV_TEST_MAIN("cv")
\ No newline at end of file
diff --git a/modules/softcascade/test/utility.cpp b/modules/softcascade/test/utility.cpp
new file mode 100644
index 0000000000..cb3b1fbf49
--- /dev/null
+++ b/modules/softcascade/test/utility.cpp
@@ -0,0 +1,109 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+
+#ifdef HAVE_CUDA
+
+
+using namespace std;
+using namespace cv;
+using namespace cv::gpu;
+using namespace cvtest;
+using namespace testing;
+using namespace testing::internal;
+
+//////////////////////////////////////////////////////////////////////
+// Gpu devices
+
+bool supportFeature(const DeviceInfo& info, FeatureSet feature) // true only when both checks below pass
+{
+    return TargetArchs::builtWith(feature) && info.supports(feature); // build-side check first, then the device itself
+}
+
+DeviceManager& DeviceManager::instance() // access point for the single shared DeviceManager
+{
+    static DeviceManager obj; // function-local static: constructed the first time instance() is called
+    return obj;
+}
+
+void DeviceManager::load(int i) // replaces the device list with the single device of index i
+{
+    devices_.clear(); // note: the list is emptied before i is validated
+    devices_.reserve(1);
+
+    std::ostringstream msg; // shared buffer for whichever error message is raised below
+
+    if (i < 0 || i >= getCudaEnabledDeviceCount()) // i must be a valid index into the CUDA-enabled devices
+    {
+        msg << "Incorrect device number - " << i;
+        CV_Error(CV_StsBadArg, msg.str());
+    }
+
+    DeviceInfo info(i);
+
+    if (!info.isCompatible()) // refuse a device that reports itself as not compatible
+    {
+        msg << "Device " << i << " [" << info.name() << "] is NOT compatible with current GPU module build";
+        CV_Error(CV_StsBadArg, msg.str());
+    }
+
+    devices_.push_back(info); // only reached when both checks passed
+}
+
+void DeviceManager::loadAll() // replaces the device list with every compatible CUDA-enabled device
+{
+    int deviceCount = getCudaEnabledDeviceCount();
+
+    devices_.clear();
+    devices_.reserve(deviceCount); // upper bound: incompatible devices are skipped below
+
+    for (int i = 0; i < deviceCount; ++i)
+    {
+        DeviceInfo info(i);
+        if (info.isCompatible()) // unlike load(), an incompatible device is skipped silently, not reported
+        {
+            devices_.push_back(info);
+        }
+    }
+}
+
+#endif // HAVE_CUDA
diff --git a/modules/softcascade/test/utility.hpp b/modules/softcascade/test/utility.hpp
new file mode 100644
index 0000000000..e6b840c534
--- /dev/null
+++ b/modules/softcascade/test/utility.hpp
@@ -0,0 +1,73 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_SOFTCASCADE_TEST_UTILITY_HPP__
+#define __OPENCV_SOFTCASCADE_TEST_UTILITY_HPP__
+
+#include "opencv2/core/core.hpp"
+#include "opencv2/core/gpumat.hpp"
+#include "opencv2/ts/ts.hpp"
+#include "opencv2/ts/ts_perf.hpp"
+
+//////////////////////////////////////////////////////////////////////
+// Gpu devices
+//! Returns true if the device supports the specified feature and the gpu module was built with support for that feature.
+bool supportFeature(const cv::gpu::DeviceInfo& info, cv::gpu::FeatureSet feature);
+
+class DeviceManager // holds the list of CUDA devices the tests are instantiated for
+{
+public:
+    static DeviceManager& instance(); // returns the single shared manager
+
+    void load(int i); // replace the list with device i only; raises CV_Error for a bad or incompatible index
+    void loadAll(); // replace the list with every compatible device
+
+    const std::vector<cv::gpu::DeviceInfo>& values() const { return devices_; } // read-only view of the current list
+
+private:
+    std::vector<cv::gpu::DeviceInfo> devices_; // filled by load() / loadAll()
+    DeviceManager() {loadAll();} // private: obtain the object via instance(); starts with all compatible devices
+};
+
+#define ALL_DEVICES testing::ValuesIn(DeviceManager::instance().values())
+
+
+#endif // __OPENCV_SOFTCASCADE_TEST_UTILITY_HPP__
diff --git a/samples/gpu/CMakeLists.txt b/samples/gpu/CMakeLists.txt
index 6abb7e5af8..6d20fc34d4 100644
--- a/samples/gpu/CMakeLists.txt
+++ b/samples/gpu/CMakeLists.txt
@@ -1,7 +1,7 @@
 SET(OPENCV_GPU_SAMPLES_REQUIRED_DEPS opencv_core opencv_flann opencv_imgproc opencv_highgui
                                      opencv_ml opencv_video opencv_objdetect opencv_features2d
                                      opencv_calib3d opencv_legacy opencv_contrib opencv_gpu
-                                     opencv_nonfree)
+                                     opencv_nonfree opencv_softcascade)
 
 ocv_check_dependencies(${OPENCV_GPU_SAMPLES_REQUIRED_DEPS})
 
diff --git a/samples/gpu/softcascade.cpp b/samples/gpu/softcascade.cpp
index 5f1adaf6cf..e3683583a9 100644
--- a/samples/gpu/softcascade.cpp
+++ b/samples/gpu/softcascade.cpp
@@ -1,4 +1,5 @@
 #include <opencv2/gpu/gpu.hpp>
+#include <opencv2/softcascade/softcascade.hpp>
 #include <opencv2/highgui/highgui.hpp>
 #include <iostream>
 
@@ -46,7 +47,7 @@ int main(int argc, char** argv)
     float maxScale =  parser.get<float>("max_scale");
     int scales     =  parser.get<int>("total_scales");
 
-    using cv::gpu::SCascade;
+    using cv::softcascade::SCascade;
     SCascade cascade(minScale, maxScale, scales);
 
     if (!cascade.load(fs.getFirstTopLevelNode()))
@@ -79,7 +80,7 @@ int main(int argc, char** argv)
         cascade.detect(dframe, roi, objects);
 
         cv::Mat dt(objects);
-        typedef cv::gpu::SCascade::Detection Detection;
+        typedef cv::softcascade::SCascade::Detection Detection;
 
         Detection* dts = ((Detection*)dt.data) + 1;
         int* count = dt.ptr<int>(0);