NVIDIA_OPTICAL_FLOW_2_0_INTEGRATION

5 years ago · 582fe44b7a
parent 0b6b8ff9dd
commit 582fe44b7a
9 changed files with 1238 additions and 110 deletions
--- a/modules/cudaoptflow/CMakeLists.txt
+++ b/modules/cudaoptflow/CMakeLists.txt
@ -8,21 +8,21 @@ ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-d

 ocv_define_module(cudaoptflow opencv_video opencv_optflow opencv_cudaarithm opencv_cudawarping opencv_cudaimgproc OPTIONAL opencv_cudalegacy WRAP python)

-set(NVIDIA_OPTICAL_FLOW_1_0_HEADERS_COMMIT "79c6cee80a2df9a196f20afd6b598a9810964c32")
-set(NVIDIA_OPTICAL_FLOW_1_0_HEADERS_MD5 "ca5acedee6cb45d0ec610a6732de5c15")
-set(NVIDIA_OPTICAL_FLOW_1_0_HEADERS_PATH "${OpenCV_BINARY_DIR}/3rdparty/NVIDIAOpticalFlowSDK_1_0_Headers")
-ocv_download(FILENAME "${NVIDIA_OPTICAL_FLOW_1_0_HEADERS_COMMIT}.zip"
-               HASH ${NVIDIA_OPTICAL_FLOW_1_0_HEADERS_MD5}
+set(NVIDIA_OPTICAL_FLOW_2_0_HEADERS_COMMIT "edb50da3cf849840d680249aa6dbef248ebce2ca")
+set(NVIDIA_OPTICAL_FLOW_2_0_HEADERS_MD5 "a73cd48b18dcc0cc8933b30796074191")
+set(NVIDIA_OPTICAL_FLOW_2_0_HEADERS_PATH "${OpenCV_BINARY_DIR}/3rdparty/NVIDIAOpticalFlowSDK_2_0_Headers")
+ocv_download(FILENAME "${NVIDIA_OPTICAL_FLOW_2_0_HEADERS_COMMIT}.zip"
+               HASH ${NVIDIA_OPTICAL_FLOW_2_0_HEADERS_MD5}
               URL
                 "https://github.com/NVIDIA/NVIDIAOpticalFlowSDK/archive/"
-               DESTINATION_DIR "${NVIDIA_OPTICAL_FLOW_1_0_HEADERS_PATH}"
-               STATUS NVIDIA_OPTICAL_FLOW_1_0_HEADERS_DOWNLOAD_SUCCESS
+               DESTINATION_DIR "${NVIDIA_OPTICAL_FLOW_2_0_HEADERS_PATH}"
+               STATUS NVIDIA_OPTICAL_FLOW_2_0_HEADERS_DOWNLOAD_SUCCESS
               ID "NVIDIA_OPTICAL_FLOW"
               UNPACK RELATIVE_URL)

-if(NOT NVIDIA_OPTICAL_FLOW_1_0_HEADERS_DOWNLOAD_SUCCESS)
-  message(STATUS "Failed to download NVIDIA_Optical_Flow_1_0 Headers")
+if(NOT NVIDIA_OPTICAL_FLOW_2_0_HEADERS_DOWNLOAD_SUCCESS)
+  message(STATUS "Failed to download NVIDIA_Optical_Flow_2_0 Headers")
 else()
  add_definitions(-DHAVE_NVIDIA_OPTFLOW=1)
-  ocv_include_directories(SYSTEM "${NVIDIA_OPTICAL_FLOW_1_0_HEADERS_PATH}/NVIDIAOpticalFlowSDK-${NVIDIA_OPTICAL_FLOW_1_0_HEADERS_COMMIT}")
+  ocv_include_directories(SYSTEM "${NVIDIA_OPTICAL_FLOW_2_0_HEADERS_PATH}/NVIDIAOpticalFlowSDK-${NVIDIA_OPTICAL_FLOW_2_0_HEADERS_COMMIT}")
 endif()
--- a/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp
+++ b/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp
@ -392,9 +392,9 @@ public:
 /** @brief Class for computing the optical flow vectors between two images using NVIDIA Optical Flow hardware and Optical Flow SDK 1.0.
@note
 - A sample application demonstrating the use of NVIDIA Optical Flow can be found at
-opencv_source_code/samples/gpu/nvidia_optical_flow.cpp
+opencv_contrib_source_code/modules/cudaoptflow/samples/nvidia_optical_flow.cpp
 - An example application comparing accuracy and performance of NVIDIA Optical Flow with other optical flow algorithms in OpenCV can be found at
-opencv_source_code/samples/gpu/optical_flow.cpp
+opencv_contrib_source_code/modules/cudaoptflow/samples/optical_flow.cpp
 */

 class CV_EXPORTS_W NvidiaOpticalFlow_1_0 : public NvidiaHWOpticalFlow
@ -417,18 +417,16 @@ public:
    * using nearest neighbour upsampling method.

    @param flow Buffer of type CV_16FC2 containing flow vectors generated by calc().
-    @param width Width of the input image in pixels for which these flow vectors were generated.
-    @param height Height of the input image in pixels for which these flow vectors were generated.
+    @param imageSize Size of the input image in pixels for which these flow vectors were generated.
    @param gridSize Granularity of the optical flow vectors returned by calc() function. Can be queried using getGridSize().
    @param upsampledFlow Buffer of type CV_32FC2, containing upsampled flow vectors, each flow vector for 1 pixel, in the pitch-linear layout.
    */
-    CV_WRAP virtual void upSampler(InputArray flow, int width, int height,
+    CV_WRAP virtual void upSampler(InputArray flow, cv::Size imageSize,
        int gridSize, InputOutputArray upsampledFlow) = 0;

    /** @brief Instantiate NVIDIA Optical Flow

-    @param width Width of input image in pixels.
-    @param height Height of input image in pixels.
+    @param imageSize Size of input image in pixels.
    @param perfPreset Optional parameter. Refer [NV OF SDK documentation](https://developer.nvidia.com/opticalflow-sdk) for details about presets.
                      Defaults to NV_OF_PERF_LEVEL_SLOW.
    @param enableTemporalHints Optional parameter. Flag to enable temporal hints. When set to true, the hardware uses the flow vectors
@ -445,10 +443,142 @@ public:
                        If output stream is not set, the execute function will use default stream which is NULL stream;
    */
    CV_WRAP static Ptr<NvidiaOpticalFlow_1_0> create(
-        int width,
-        int height,
+        cv::Size imageSize,
        cv::cuda::NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL perfPreset
-        = cv::cuda::NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_SLOW,
+        = cv::cuda::NvidiaOpticalFlow_1_0::NV_OF_PERF_LEVEL_SLOW,
+        bool enableTemporalHints = false,
+        bool enableExternalHints = false,
+        bool enableCostBuffer = false,
+        int gpuId = 0,
+        Stream& inputStream = Stream::Null(),
+        Stream& outputStream = Stream::Null());
+};
+
+/** @brief Class for computing the optical flow vectors between two images using NVIDIA Optical Flow hardware and Optical Flow SDK 2.0.
+@note
+- A sample application demonstrating the use of NVIDIA Optical Flow can be found at
+opencv_contrib_source_code/modules/cudaoptflow/samples/nvidia_optical_flow.cpp
+- An example application comparing accuracy and performance of NVIDIA Optical Flow with other optical flow algorithms in OpenCV can be found at
+opencv_contrib_source_code/modules/cudaoptflow/samples/optical_flow.cpp
+*/
+
+class CV_EXPORTS_W NvidiaOpticalFlow_2_0 : public NvidiaHWOpticalFlow
+{
+public:
+    /**
+    * Supported optical flow performance levels.
+    */
+    enum NVIDIA_OF_PERF_LEVEL
+    {
+        NV_OF_PERF_LEVEL_UNDEFINED,
+        NV_OF_PERF_LEVEL_SLOW = 5,                   /**< Slow perf level results in lowest performance and best quality */
+        NV_OF_PERF_LEVEL_MEDIUM = 10,                /**< Medium perf level results in low performance and medium quality */
+        NV_OF_PERF_LEVEL_FAST = 20,                  /**< Fast perf level results in high performance and low quality */
+        NV_OF_PERF_LEVEL_MAX
+    };
+
+    /**
+    * Supported grid size for output buffer.
+    */
+    enum NVIDIA_OF_OUTPUT_VECTOR_GRID_SIZE
+    {
+        NV_OF_OUTPUT_VECTOR_GRID_SIZE_UNDEFINED,
+        NV_OF_OUTPUT_VECTOR_GRID_SIZE_1 = 1,          /**< Output buffer grid size is 1x1  */
+        NV_OF_OUTPUT_VECTOR_GRID_SIZE_2 = 2,          /**< Output buffer grid size is 2x2  */
+        NV_OF_OUTPUT_VECTOR_GRID_SIZE_4 = 4,          /**< Output buffer grid size is 4x4  */
+        NV_OF_OUTPUT_VECTOR_GRID_SIZE_MAX
+    };
+
+    /**
+    * Supported grid size for hint buffer.
+    */
+    enum NVIDIA_OF_HINT_VECTOR_GRID_SIZE
+    {
+        NV_OF_HINT_VECTOR_GRID_SIZE_UNDEFINED,
+        NV_OF_HINT_VECTOR_GRID_SIZE_1 = 1,            /**< Hint buffer grid size is 1x1.*/
+        NV_OF_HINT_VECTOR_GRID_SIZE_2 = 2,            /**< Hint buffer grid size is 2x2.*/
+        NV_OF_HINT_VECTOR_GRID_SIZE_4 = 4,            /**< Hint buffer grid size is 4x4.*/
+        NV_OF_HINT_VECTOR_GRID_SIZE_8 = 8,            /**< Hint buffer grid size is 8x8.*/
+        NV_OF_HINT_VECTOR_GRID_SIZE_MAX
+    };
+
+    /** @brief convertToFloat() helper function converts the hardware-generated flow vectors to floating point representation (1 flow vector for gridSize).
+    * gridSize can be queried via function getGridSize().
+
+    @param flow Buffer of type CV_16FC2 containing flow vectors generated by calc().
+    @param floatFlow Buffer of type CV_32FC2, containing flow vectors in floating point representation, each flow vector for 1 pixel per gridSize, in the pitch-linear layout.
+    */
+    CV_WRAP virtual void convertToFloat(InputArray flow, InputOutputArray floatFlow) = 0;
+
+    /** @brief Instantiate NVIDIA Optical Flow
+
+    @param imageSize Size of input image in pixels.
+    @param perfPreset Optional parameter. Refer [NV OF SDK documentation](https://developer.nvidia.com/opticalflow-sdk) for details about presets.
+                      Defaults to NV_OF_PERF_LEVEL_SLOW.
+    @param outputGridSize Optional parameter. Refer [NV OF SDK documentation](https://developer.nvidia.com/opticalflow-sdk) for details about output grid sizes.
+                          Defaults to NV_OF_OUTPUT_VECTOR_GRID_SIZE_1.
+    @param hintGridSize Optional parameter. Refer [NV OF SDK documentation](https://developer.nvidia.com/opticalflow-sdk) for details about hint grid sizes.
+                        Defaults to NV_OF_HINT_VECTOR_GRID_SIZE_1.
+    @param enableTemporalHints Optional parameter. Flag to enable temporal hints. When set to true, the hardware uses the flow vectors
+                               generated in previous call to calc() as internal hints for the current call to calc().
+                               Useful when computing flow vectors between successive video frames. Defaults to false.
+    @param enableExternalHints Optional Parameter. Flag to enable passing external hints buffer to calc(). Defaults to false.
+    @param enableCostBuffer Optional Parameter. Flag to enable cost buffer output from calc(). Defaults to false.
+    @param gpuId Optional parameter to select the GPU ID on which the optical flow should be computed. Useful in multi-GPU systems. Defaults to 0.
+    @param inputStream Optical flow algorithm may optionally involve cuda preprocessing on the input buffers.
+                       The input cuda stream can be used to pipeline and synchronize the cuda preprocessing tasks with OF HW engine.
+                       If input stream is not set, the execute function will use default stream which is NULL stream;
+    @param outputStream Optical flow algorithm may optionally involve cuda post processing on the output flow vectors.
+                        The output cuda stream can be used to pipeline and synchronize the cuda post processing tasks with OF HW engine.
+                        If output stream is not set, the execute function will use default stream which is NULL stream;
+    */
+    CV_WRAP static Ptr<NvidiaOpticalFlow_2_0> create(
+        cv::Size imageSize,
+        cv::cuda::NvidiaOpticalFlow_2_0::NVIDIA_OF_PERF_LEVEL perfPreset
+        = cv::cuda::NvidiaOpticalFlow_2_0::NV_OF_PERF_LEVEL_SLOW,
+        cv::cuda::NvidiaOpticalFlow_2_0::NVIDIA_OF_OUTPUT_VECTOR_GRID_SIZE outputGridSize
+        = cv::cuda::NvidiaOpticalFlow_2_0::NV_OF_OUTPUT_VECTOR_GRID_SIZE_1,
+        cv::cuda::NvidiaOpticalFlow_2_0::NVIDIA_OF_HINT_VECTOR_GRID_SIZE hintGridSize
+        = cv::cuda::NvidiaOpticalFlow_2_0::NV_OF_HINT_VECTOR_GRID_SIZE_1,
+        bool enableTemporalHints = false,
+        bool enableExternalHints = false,
+        bool enableCostBuffer = false,
+        int gpuId = 0,
+        Stream& inputStream = Stream::Null(),
+        Stream& outputStream = Stream::Null());
+
+    /**  @brief Instantiate NVIDIA Optical Flow with ROI Feature
+
+    @param imageSize Size of input image in pixels.
+    @param roiData Pointer to ROI data.
+    @param perfPreset Optional parameter. Refer [NV OF SDK documentation](https://developer.nvidia.com/opticalflow-sdk) for details about presets.
+                      Defaults to NV_OF_PERF_LEVEL_SLOW.
+    @param outputGridSize Optional parameter. Refer [NV OF SDK documentation](https://developer.nvidia.com/opticalflow-sdk) for details about output grid sizes.
+                          Defaults to NV_OF_OUTPUT_VECTOR_GRID_SIZE_1.
+    @param hintGridSize Optional parameter. Refer [NV OF SDK documentation](https://developer.nvidia.com/opticalflow-sdk) for details about hint grid sizes.
+                        Defaults to NV_OF_HINT_VECTOR_GRID_SIZE_1.
+    @param enableTemporalHints Optional parameter. Flag to enable temporal hints. When set to true, the hardware uses the flow vectors
+                               generated in previous call to calc() as internal hints for the current call to calc().
+                               Useful when computing flow vectors between successive video frames. Defaults to false.
+    @param enableExternalHints Optional Parameter. Flag to enable passing external hints buffer to calc(). Defaults to false.
+    @param enableCostBuffer Optional Parameter. Flag to enable cost buffer output from calc(). Defaults to false.
+    @param gpuId Optional parameter to select the GPU ID on which the optical flow should be computed. Useful in multi-GPU systems. Defaults to 0.
+    @param inputStream Optical flow algorithm may optionally involve cuda preprocessing on the input buffers.
+                       The input cuda stream can be used to pipeline and synchronize the cuda preprocessing tasks with OF HW engine.
+                       If input stream is not set, the execute function will use default stream which is NULL stream;
+    @param outputStream Optical flow algorithm may optionally involve cuda post processing on the output flow vectors.
+                        The output cuda stream can be used to pipeline and synchronize the cuda post processing tasks with OF HW engine.
+                        If output stream is not set, the execute function will use default stream which is NULL stream;
+    */
+    CV_WRAP static Ptr<NvidiaOpticalFlow_2_0> create(
+        cv::Size imageSize,
+        std::vector<Rect> roiData,
+        cv::cuda::NvidiaOpticalFlow_2_0::NVIDIA_OF_PERF_LEVEL perfPreset
+        = cv::cuda::NvidiaOpticalFlow_2_0::NV_OF_PERF_LEVEL_SLOW,
+        cv::cuda::NvidiaOpticalFlow_2_0::NVIDIA_OF_OUTPUT_VECTOR_GRID_SIZE outputGridSize
+        = cv::cuda::NvidiaOpticalFlow_2_0::NV_OF_OUTPUT_VECTOR_GRID_SIZE_1,
+        cv::cuda::NvidiaOpticalFlow_2_0::NVIDIA_OF_HINT_VECTOR_GRID_SIZE hintGridSize
+        = cv::cuda::NvidiaOpticalFlow_2_0::NV_OF_HINT_VECTOR_GRID_SIZE_1,
        bool enableTemporalHints = false,
        bool enableExternalHints = false,
        bool enableCostBuffer = false,
--- a/modules/cudaoptflow/misc/python/test/test_nvidiaopticalflow.py
+++ b/modules/cudaoptflow/misc/python/test/test_nvidiaopticalflow.py
@ -0,0 +1,36 @@
+import os
+import cv2 as cv
+import numpy as np
+
+from tests_common import NewOpenCVTests, unittest
+
+class nvidiaopticalflow_test(NewOpenCVTests):
+    def setUp(self):
+        super(nvidiaopticalflow_test, self).setUp()
+        if not cv.cuda.getCudaEnabledDeviceCount():
+            self.skipTest("No CUDA-capable device is detected")
+
+    @unittest.skipIf('OPENCV_TEST_DATA_PATH' not in os.environ,
+                        "OPENCV_TEST_DATA_PATH is not defined")
+    def test_calc(self):
+        frame1 = os.environ['OPENCV_TEST_DATA_PATH'] + '/gpu/opticalflow/frame0.png'
+        frame2 = os.environ['OPENCV_TEST_DATA_PATH'] + '/gpu/opticalflow/frame1.png'
+
+        npMat1 = cv.cvtColor(cv.imread(frame1),cv.COLOR_BGR2GRAY)
+        npMat2 = cv.cvtColor(cv.imread(frame2),cv.COLOR_BGR2GRAY)
+
+        cuMat1 = cv.cuda_GpuMat(npMat1)
+        cuMat2 = cv.cuda_GpuMat(npMat2)
+        try:
+            nvof = cv.cuda_NvidiaOpticalFlow_1_0.create(cuMat1.shape[1], cuMat1.shape[0], 5, False, False, False, 0)
+            flow = nvof.calc(cuMat1, cuMat2, None)
+            self.assertTrue(flow.shape[1] > 0 and flow.shape[0] > 0)
+            flowUpSampled = nvof.upSampler(flow[0], cuMat1.shape[1], cuMat1.shape[0], nvof.getGridSize(), None)
+            nvof.collectGarbage()
+        except cv.error as e:
+            if e.code == cv.Error.StsBadFunc or e.code == cv.Error.StsBadArg or e.code == cv.Error.StsNullPtr:
+                self.skipTest("Algorithm is not supported in the current environment")
+        self.assertTrue(flowUpSampled.shape[1] > 0 and flowUpSampled.shape[0] > 0)
+
+if __name__ == '__main__':
+    NewOpenCVTests.bootstrap()
--- a/modules/cudaoptflow/perf/perf_optflow.cpp
+++ b/modules/cudaoptflow/perf/perf_optflow.cpp
@ -339,13 +339,8 @@ PERF_TEST_P(ImagePair, NvidiaOpticalFlow_1_0,

    const cv::Mat frame1 = readImage(GetParam().second, cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(frame1.empty());
-
-    const int width = frame0.size().width;
-    const int height = frame0.size().height;
-    const bool enableTemporalHints = false;
-    const bool enableExternalHints = false;
-    const bool enableCostBuffer = false;
-    const int gpuid = 0;
+    Stream inputStream;
+    Stream outputStream;

    if (PERF_RUN_CUDA())
    {
@ -355,9 +350,9 @@ PERF_TEST_P(ImagePair, NvidiaOpticalFlow_1_0,
        cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0> d_nvof;
        try
        {
-            d_nvof = cv::cuda::NvidiaOpticalFlow_1_0::create(width, height,
+            d_nvof = cv::cuda::NvidiaOpticalFlow_1_0::create(frame0.size(),
                cv::cuda::NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_FAST,
-                enableTemporalHints, enableExternalHints, enableCostBuffer, gpuid);
+                false, false, false, 0, inputStream, outputStream);
        }
        catch (const cv::Exception& e)
        {
@ -376,6 +371,63 @@ PERF_TEST_P(ImagePair, NvidiaOpticalFlow_1_0,

        CUDA_SANITY_CHECK(u, 1e-10);
        CUDA_SANITY_CHECK(v, 1e-10);
+
+        d_nvof->collectGarbage();
+    }
+}
+
+//////////////////////////////////////////////////////
+// NvidiaOpticalFlow_2_0
+
+PERF_TEST_P(ImagePair, NvidiaOpticalFlow_2_0,
+    Values<pair_string>(make_pair("gpu/opticalflow/frame0.png", "gpu/opticalflow/frame1.png")))
+{
+    declare.time(10);
+
+    const cv::Mat frame0 = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame0.empty());
+
+    const cv::Mat frame1 = readImage(GetParam().second, cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame1.empty());
+
+    const cv::cuda::NvidiaOpticalFlow_2_0::NVIDIA_OF_OUTPUT_VECTOR_GRID_SIZE outGridSize
+        = cv::cuda::NvidiaOpticalFlow_2_0::NVIDIA_OF_OUTPUT_VECTOR_GRID_SIZE::NV_OF_OUTPUT_VECTOR_GRID_SIZE_1;
+    const cv::cuda::NvidiaOpticalFlow_2_0::NVIDIA_OF_HINT_VECTOR_GRID_SIZE hintGridSize
+        = cv::cuda::NvidiaOpticalFlow_2_0::NVIDIA_OF_HINT_VECTOR_GRID_SIZE::NV_OF_HINT_VECTOR_GRID_SIZE_1;
+    Stream inputStream;
+    Stream outputStream;
+
+    if (PERF_RUN_CUDA())
+    {
+        const cv::cuda::GpuMat d_frame0(frame0);
+        const cv::cuda::GpuMat d_frame1(frame1);
+        cv::cuda::GpuMat d_flow;
+        cv::Ptr<cv::cuda::NvidiaOpticalFlow_2_0> d_nvof;
+        try
+        {
+            d_nvof = cv::cuda::NvidiaOpticalFlow_2_0::create(frame0.size(),
+                cv::cuda::NvidiaOpticalFlow_2_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_FAST, outGridSize, hintGridSize,
+              false, false, false, 0, inputStream, outputStream);
+        }
+        catch (const cv::Exception& e)
+        {
+            if (e.code == Error::StsBadFunc || e.code == Error::StsBadArg || e.code == Error::StsNullPtr)
+                throw SkipTestException("Current configuration is not supported");
+            throw;
+        }
+
+        TEST_CYCLE() d_nvof->calc(d_frame0, d_frame1, d_flow);
+
+        cv::cuda::GpuMat flow[2];
+        cv::cuda::split(d_flow, flow);
+
+        cv::cuda::GpuMat u = flow[0];
+        cv::cuda::GpuMat v = flow[1];
+
+        CUDA_SANITY_CHECK(u, 1e-10);
+        CUDA_SANITY_CHECK(v, 1e-10);
+
+        d_nvof->collectGarbage();
    }
 }

--- a/modules/cudaoptflow/samples/nvidia_optical_flow.cpp
+++ b/modules/cudaoptflow/samples/nvidia_optical_flow.cpp
@ -2,6 +2,7 @@
 #include <iostream>
 #include <fstream>
 #include <iomanip>
+#include <iterator>

 #include "opencv2/core.hpp"
 #include "opencv2/core/utility.hpp"
@ -11,7 +12,6 @@
 #include "opencv2/cudaarithm.hpp"
 #include "opencv2/video/tracking.hpp"

-using namespace std;
 using namespace cv;
 using namespace cv::cuda;

@ -131,12 +131,88 @@ static void drawOpticalFlow(const Mat_<float>& flowx, const Mat_<float>& flowy
    }
 }

+/*
+ROI config file format.
+numrois 3
+roi0 640 96 1152 192
+roi1 640 64 896 864
+roi2 640 960 256 32
+*/
+bool parseROI(std::string ROIFileName, std::vector<Rect>& roiData)
+{
+    std::string str;
+    uint32_t nRois = 0;
+    std::ifstream hRoiFile;
+    hRoiFile.open(ROIFileName, std::ios::in);
+
+    if (hRoiFile.is_open())
+    {
+        while (std::getline(hRoiFile, str))
+        {
+            std::istringstream iss(str);
+            std::vector<std::string> tokens{ std::istream_iterator<std::string>{iss},
+                std::istream_iterator<std::string>{} };
+
+            if (tokens.size() == 0) continue; // if empty line, coninue
+
+            transform(tokens[0].begin(), tokens[0].end(), tokens[0].begin(), ::tolower);
+            if (tokens[0] == "numrois")
+            {
+                nRois = atoi(tokens[1].data());
+            }
+            else if (tokens[0].rfind("roi", 0) == 0)
+            {
+                cv::Rect roi;
+                roi.x = atoi(tokens[1].data());
+                roi.y = atoi(tokens[2].data());
+                roi.width = atoi(tokens[3].data());
+                roi.height = atoi(tokens[4].data());
+                roiData.push_back(roi);
+            }
+            else if (tokens[0].rfind("#", 0) == 0)
+            {
+                continue;
+            }
+            else
+            {
+                std::cout << "Unidentified keyword in roi config file " << tokens[0] << std::endl;
+                hRoiFile.close();
+                return false;
+            }
+        }
+    }
+    else
+    {
+        std::cout << "Unable to open ROI file " << std::endl;
+        return false;
+    }
+    if (nRois != roiData.size())
+    {
+        std::cout << "NumRois(" << nRois << ")and specified roi rects (" << roiData.size() << ")are not matching " << std::endl;
+        hRoiFile.close();
+        return false;
+    }
+    hRoiFile.close();
+    return true;
+}
+
 int main(int argc, char **argv)
 {
-    std::unordered_map<std::string, NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL> presetMap = {
-        { "slow", NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_SLOW },
-        { "medium", NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_MEDIUM },
-        { "fast", NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_FAST } };
+    std::unordered_map<std::string, NvidiaOpticalFlow_2_0::NVIDIA_OF_PERF_LEVEL> presetMap = {
+        { "slow", NvidiaOpticalFlow_2_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_SLOW },
+        { "medium", NvidiaOpticalFlow_2_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_MEDIUM },
+        { "fast", NvidiaOpticalFlow_2_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_FAST } };
+
+    std::unordered_map<int, NvidiaOpticalFlow_2_0::NVIDIA_OF_OUTPUT_VECTOR_GRID_SIZE> outputGridSize = {
+        { 1, NvidiaOpticalFlow_2_0::NVIDIA_OF_OUTPUT_VECTOR_GRID_SIZE::NV_OF_OUTPUT_VECTOR_GRID_SIZE_1 },
+        { 2, NvidiaOpticalFlow_2_0::NVIDIA_OF_OUTPUT_VECTOR_GRID_SIZE::NV_OF_OUTPUT_VECTOR_GRID_SIZE_2 },
+        { 4, NvidiaOpticalFlow_2_0::NVIDIA_OF_OUTPUT_VECTOR_GRID_SIZE::NV_OF_OUTPUT_VECTOR_GRID_SIZE_4 } };
+
+    std::unordered_map<int, NvidiaOpticalFlow_2_0::NVIDIA_OF_HINT_VECTOR_GRID_SIZE> hintGridSize = {
+        { 1, NvidiaOpticalFlow_2_0::NVIDIA_OF_HINT_VECTOR_GRID_SIZE::NV_OF_HINT_VECTOR_GRID_SIZE_1 },
+        { 2, NvidiaOpticalFlow_2_0::NVIDIA_OF_HINT_VECTOR_GRID_SIZE::NV_OF_HINT_VECTOR_GRID_SIZE_2 },
+        { 4, NvidiaOpticalFlow_2_0::NVIDIA_OF_HINT_VECTOR_GRID_SIZE::NV_OF_HINT_VECTOR_GRID_SIZE_4 },
+        { 8, NvidiaOpticalFlow_2_0::NVIDIA_OF_HINT_VECTOR_GRID_SIZE::NV_OF_HINT_VECTOR_GRID_SIZE_8 } };

    try
    {
@ -145,7 +221,10 @@ int main(int argc, char **argv)
            "{ r right  | ../data/basketball2.png | specify right image }"
            "{ g gpuid  | 0 | cuda device index}"
            "{ p preset | slow | perf preset for OF algo [ options : slow, medium, fast ]}"
+            "{ og outputGridSize | 1 | Output grid size of OF vector [ options : 1, 2, 4 ]}"
+            "{ hg hintGridSize | 1 | Hint grid size of OF vector [ options : 1, 2, 4, 8 ]}"
            "{ o output | OpenCVNvOF.flo | output flow vector file in middlebury format}"
+            "{ rc roiConfigFile | | Region of Interest config file }"
            "{ th enableTemporalHints | false | Enable temporal hints}"
            "{ eh enableExternalHints | false | Enable external hints}"
            "{ cb enableCostBuffer | false | Enable output cost buffer}"
@ -159,60 +238,93 @@ int main(int argc, char **argv)
            return 0;
        }

-        string pathL = cmd.get<string>("left");
-        string pathR = cmd.get<string>("right");
-        string preset = cmd.get<string>("preset");
-        string output = cmd.get<string>("output");
+        std::string pathL = cmd.get<std::string>("left");
+        std::string pathR = cmd.get<std::string>("right");
+        std::string preset = cmd.get<std::string>("preset");
+        std::string output = cmd.get<std::string>("output");
+        std::string roiConfiFile = cmd.get<std::string>("roiConfigFile");
        bool enableExternalHints = cmd.get<bool>("enableExternalHints");
        bool enableTemporalHints = cmd.get<bool>("enableTemporalHints");
        bool enableCostBuffer = cmd.get<bool>("enableCostBuffer");
        int gpuId = cmd.get<int>("gpuid");
+        int outputBufferGridSize = cmd.get<int>("outputGridSize");
+        int hintBufferGridSize = cmd.get<int>("hintGridSize");

-        if (pathL.empty()) cout << "Specify left image path\n";
-        if (pathR.empty()) cout << "Specify right image path\n";
-        if (preset.empty()) cout << "Specify perf preset for OpticalFlow algo\n";
+        if (pathL.empty()) std::cout << "Specify left image path" << std::endl;
+        if (pathR.empty()) std::cout << "Specify right image path" << std::endl;
+        if (preset.empty()) std::cout << "Specify perf preset for OpticalFlow algo" << std::endl;
        if (pathL.empty() || pathR.empty()) return 0;

-        auto search = presetMap.find(preset);
-        if (search == presetMap.end())
+        auto p = presetMap.find(preset);
+        if (p == presetMap.end())
        {
            std::cout << "Invalid preset level : " << preset << std::endl;
            return 0;
        }
-        NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL perfPreset = search->second;
+        NvidiaOpticalFlow_2_0::NVIDIA_OF_PERF_LEVEL perfPreset = p->second;
+
+        auto o = outputGridSize.find(outputBufferGridSize);
+        if (o == outputGridSize.end())
+        {
+            std::cout << "Invalid output grid size: " << outputBufferGridSize << std::endl;
+            return 0;
+        }
+        NvidiaOpticalFlow_2_0::NVIDIA_OF_OUTPUT_VECTOR_GRID_SIZE outBufGridSize = o->second;
+
+        NvidiaOpticalFlow_2_0::NVIDIA_OF_HINT_VECTOR_GRID_SIZE hintBufGridSize =
+            NvidiaOpticalFlow_2_0::NV_OF_HINT_VECTOR_GRID_SIZE_UNDEFINED;
+        if (enableExternalHints)
+        {
+            auto h = hintGridSize.find(hintBufferGridSize);
+            if (h == hintGridSize.end())
+            {
+                std::cout << "Invalid hint grid size: " << hintBufferGridSize << std::endl;
+                return 0;
+            }
+            hintBufGridSize = h->second;
+        }
+
+        std::vector<Rect> roiData;
+
+        if (!roiConfiFile.empty())
+        {
+            if (!parseROI(roiConfiFile, roiData))
+            {
+                std::cout << "Wrong Region of Interest config file, proceeding without ROI" << std::endl;
+            }
+        }

        Mat frameL = imread(pathL, IMREAD_GRAYSCALE);
        Mat frameR = imread(pathR, IMREAD_GRAYSCALE);
-        if (frameL.empty()) cout << "Can't open '" << pathL << "'\n";
-        if (frameR.empty()) cout << "Can't open '" << pathR << "'\n";
+        if (frameL.empty()) std::cout << "Can't open '" << pathL << "'" << std::endl;
+        if (frameR.empty()) std::cout << "Can't open '" << pathR << "'" << std::endl;
        if (frameL.empty() || frameR.empty()) return -1;

-        Ptr<NvidiaOpticalFlow_1_0> nvof = NvidiaOpticalFlow_1_0::create(
-            frameL.size().width, frameL.size().height, perfPreset,
+        Ptr<NvidiaOpticalFlow_2_0> nvof = NvidiaOpticalFlow_2_0::create(
+            frameL.size(), roiData, perfPreset, outBufGridSize, hintBufGridSize,
            enableTemporalHints, enableExternalHints, enableCostBuffer, gpuId);

-        Mat flowx, flowy, flowxy, upsampledFlowXY, image;
+        Mat flowx, flowy, flowxy, floatFlow, image;

        nvof->calc(frameL, frameR, flowxy);

-        nvof->upSampler(flowxy, frameL.size().width, frameL.size().height,
-            nvof->getGridSize(), upsampledFlowXY);
+        nvof->convertToFloat(flowxy, floatFlow);

-        if (output.size() != 0)
+        if (!output.empty())
        {
-            if (!writeOpticalFlow(output, upsampledFlowXY))
-                cout << "Failed to save Flow Vector" << endl;
+            if (!writeOpticalFlow(output, floatFlow))
+                std::cout << "Failed to save Flow Vector" << std::endl;
            else
-                cout << "Flow vector saved as '" << output << "'\n";
+                std::cout << "Flow vector saved as '" << output << "'" << std::endl;
        }

        Mat planes[] = { flowx, flowy };
-        split(upsampledFlowXY, planes);
+        split(floatFlow, planes);
        flowx = planes[0]; flowy = planes[1];

        drawOpticalFlow(flowx, flowy, image, 10);

-        imshow("Colorize image",image);
+        imshow("Colorize image", image);
        waitKey(0);
        nvof->collectGarbage();
    }
--- a/modules/cudaoptflow/samples/optical_flow.cpp
+++ b/modules/cudaoptflow/samples/optical_flow.cpp
@ -183,8 +183,11 @@ int main(int argc, const char* argv[])
    Ptr<cuda::DensePyrLKOpticalFlow> lk = cuda::DensePyrLKOpticalFlow::create(Size(7, 7));
    Ptr<cuda::FarnebackOpticalFlow> farn = cuda::FarnebackOpticalFlow::create();
    Ptr<cuda::OpticalFlowDual_TVL1> tvl1 = cuda::OpticalFlowDual_TVL1::create();
-    Ptr<cuda::NvidiaOpticalFlow_1_0> nvof = cuda::NvidiaOpticalFlow_1_0::create(frame0.size().width, frame0.size().height,
+    Ptr<cuda::NvidiaOpticalFlow_1_0> nvof_1_0 = cuda::NvidiaOpticalFlow_1_0::create(frame0.size(),
        NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_FAST, false, false, false, 0, inputStream, outputStream);
+    Ptr<cuda::NvidiaOpticalFlow_2_0> nvof_2_0 = cuda::NvidiaOpticalFlow_2_0::create(frame0.size(),
+        NvidiaOpticalFlow_2_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_FAST, NvidiaOpticalFlow_2_0::NVIDIA_OF_OUTPUT_VECTOR_GRID_SIZE::NV_OF_OUTPUT_VECTOR_GRID_SIZE_1,
+        NvidiaOpticalFlow_2_0::NVIDIA_OF_HINT_VECTOR_GRID_SIZE::NV_OF_HINT_VECTOR_GRID_SIZE_UNDEFINED, false, false, false, 0, inputStream, outputStream);

    {
        GpuMat d_frame0f;
@ -242,16 +245,32 @@ int main(int argc, const char* argv[])
        //Hence it is expected to be more than what is displayed in the NVIDIA Optical Flow SDK documentation.
        const int64 start = getTickCount();

-        nvof->calc(d_frame0, d_frame1, d_flowxy);
+        nvof_1_0->calc(d_frame0, d_frame1, d_flowxy);

        const double timeSec = (getTickCount() - start) / getTickFrequency();
-        cout << "NVIDIAOpticalFlow : " << timeSec << " sec" << endl;
+        cout << "NVIDIAOpticalFlow_1_0 : " << timeSec << " sec" << endl;

-        nvof->upSampler(d_flowxy, frame0.size().width, frame0.size().height,
-            nvof->getGridSize(), d_flow);
+        nvof_1_0->upSampler(d_flowxy, frame0.size(), nvof_1_0->getGridSize(), d_flow);

-        showFlow("NVIDIAOpticalFlow", d_flow);
-        nvof->collectGarbage();
+        showFlow("NVIDIAOpticalFlow_1_0", d_flow);
+        nvof_1_0->collectGarbage();
+    }
+
+    {
+        //The timing displayed below includes the time taken to copy the input buffers to the OF CUDA input buffers
+        //and to copy the output buffers from the OF CUDA output buffer to the output buffer.
+        //Hence it is expected to be more than what is displayed in the NVIDIA Optical Flow SDK documentation.
+        const int64 start = getTickCount();
+
+        nvof_2_0->calc(d_frame0, d_frame1, d_flowxy);
+
+        const double timeSec = (getTickCount() - start) / getTickFrequency();
+        cout << "NVIDIAOpticalFlow_2_0 : " << timeSec << " sec" << endl;
+
+        nvof_2_0->convertToFloat(d_flowxy, d_flow);
+
+        showFlow("NVIDIAOpticalFlow_2_0", d_flow);
+        nvof_2_0->collectGarbage();
    }

    imshow("Frame 0", frame0);
--- a/modules/cudaoptflow/src/cuda/nvidiaOpticalFlow.cu
+++ b/modules/cudaoptflow/src/cuda/nvidiaOpticalFlow.cu
@ -0,0 +1,99 @@
+//
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+//M*/
+#if !defined CUDA_DISABLER
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+typedef unsigned char   uint8_t;
+typedef unsigned short  uint16_t;
+typedef unsigned int    uint32_t;
+typedef   signed short  int16_t;
+typedef   signed int    int32_t;
+
+#define BLOCKDIM_X 32
+#define BLOCKDIM_Y 16
+
+// data required to do 2x upsampling.  Same can be used for 4x upsampling also
+#define SMEM_COLS  ((BLOCKDIM_X)/2)
+#define SMEM_ROWS  ((BLOCKDIM_Y)/2)
+
+namespace cv { namespace cuda { namespace device { namespace optflow_nvidia
+{
+static const char *_cudaGetErrorEnum(cudaError_t error) { return cudaGetErrorName(error); }
+
+template <typename T>
+void check(T result, char const *const func, const char *const file,
+    int const line) {
+    if (result) {
+        fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
+            static_cast<uint32_t>(result), _cudaGetErrorEnum(result), func);
+        // Make sure we call CUDA Device Reset before exiting
+        exit(EXIT_FAILURE);
+    }
+}
+#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
+
+template <class T>
+static __device__ void ReadDevPtrData(void* devptr, uint32_t x0, uint32_t y0, uint32_t src_w, uint32_t src_h, uint32_t src_pitch,
+                                      T src[][SMEM_COLS], uint32_t i, uint32_t j)
+{
+    uint32_t shift = (sizeof(T) == sizeof(int32_t)) ? 2 : 1;
+    src[j][i] = *(T*)((uint8_t*)devptr + y0 * src_pitch + (x0 << shift));
+}
+
+
+extern "C"
+__global__ void NearestNeighborFlowKernel(cudaSurfaceObject_t srcSurf, void* srcDevPtr, uint32_t src_w, uint32_t src_pitch, uint32_t src_h,
+                                          cudaSurfaceObject_t dstSurf, void* dstDevPtr, uint32_t dst_w, uint32_t dst_pitch, uint32_t dst_h,
+                                          uint32_t nScaleFactor)
+{
+    int x = blockDim.x * blockIdx.x + threadIdx.x;
+    int y = blockDim.y * blockIdx.y + threadIdx.y;
+
+    int x0 = x / nScaleFactor;
+    int y0 = y / nScaleFactor;
+
+    __shared__ short2 src[SMEM_ROWS][SMEM_COLS];
+
+    int i = threadIdx.x / nScaleFactor;
+    int j = threadIdx.y / nScaleFactor;
+
+    if ((x % nScaleFactor == 0) && (y % nScaleFactor == 0))
+    {
+        ReadDevPtrData<short2>(srcDevPtr, x0, y0, src_w, src_h, src_pitch, src, i, j);
+    }
+    __syncthreads();
+
+    if (x < dst_w && y < dst_h)
+    {
+        if (dstDevPtr == NULL)
+        {
+            surf2Dwrite<short2>(src[j][i], dstSurf, x * sizeof(short2), y, cudaBoundaryModeClamp);
+        }
+        else
+        {
+            *(short2*)((uint8_t*)dstDevPtr + y * dst_pitch + (x << 2)) = src[j][i];
+        }
+    }
+}
+
+void FlowUpsample(void* srcDevPtr, uint32_t nSrcWidth, uint32_t nSrcPitch, uint32_t nSrcHeight,
+                  void* dstDevPtr, uint32_t nDstWidth, uint32_t nDstPitch, uint32_t nDstHeight,
+                  uint32_t nScaleFactor)
+{
+
+        dim3 blockDim(BLOCKDIM_X, BLOCKDIM_Y);
+        dim3 gridDim((nDstWidth + blockDim.x - 1) / blockDim.x, (nDstHeight + blockDim.y - 1) / blockDim.y);
+        NearestNeighborFlowKernel << <gridDim, blockDim >> > (0, srcDevPtr, nSrcWidth, nSrcPitch, nSrcHeight,
+            0, dstDevPtr, nDstWidth, nDstPitch, nDstHeight,
+            nScaleFactor);
+
+        checkCudaErrors(cudaGetLastError());
+}}}}}
+
+#endif
--- a/modules/cudaoptflow/src/nvidiaOpticalFlow.cpp
+++ b/modules/cudaoptflow/src/nvidiaOpticalFlow.cpp
@ -8,11 +8,27 @@

 #if !defined HAVE_CUDA || defined(CUDA_DISABLER)

-cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0> cv::cuda::NvidiaOpticalFlow_1_0::create(int, int, NVIDIA_OF_PERF_LEVEL, bool, bool, bool, int, Stream&, Stream&) { throw_no_cuda(); return cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0>(); }
+cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0> cv::cuda::NvidiaOpticalFlow_1_0::create
+    (int, int, NVIDIA_OF_PERF_LEVEL, bool, bool, bool, int, Stream&, Stream&) {
+    throw_no_cuda(); return cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0>(); }
+
+cv::Ptr<cv::cuda::NvidiaOpticalFlow_2_0> cv::cuda::NvidiaOpticalFlow_2_0::create(
+    int, int, NVIDIA_OF_PERF_LEVEL, NVIDIA_OF_OUTPUT_VECTOR_GRID_SIZE, NVIDIA_OF_HINT_VECTOR_GRID_SIZE,
+    bool, int, cv::Rect*, bool, bool, bool, int, Stream&, Stream&) {
+    throw_no_cuda(); return cv::Ptr<cv::cuda::NvidiaOpticalFlow_2_0>();
+}

 #elif !defined HAVE_NVIDIA_OPTFLOW

-cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0> cv::cuda::NvidiaOpticalFlow_1_0::create(int, int, NVIDIA_OF_PERF_LEVEL, bool, bool, bool, int, Stream&, Stream&)
+cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0> cv::cuda::NvidiaOpticalFlow_1_0::create(
+    int, int, NVIDIA_OF_PERF_LEVEL, bool, bool, bool, int, Stream&, Stream&)
+{
+    CV_Error(cv::Error::HeaderIsNull, "OpenCV was build without NVIDIA OpticalFlow support");
+}
+
+cv::Ptr<cv::cuda::NvidiaOpticalFlow_2_0> cv::cuda::NvidiaOpticalFlow_2_0::create(
+    int, int, NVIDIA_OF_PERF_LEVEL, NVIDIA_OF_OUTPUT_VECTOR_GRID_SIZE, NVIDIA_OF_HINT_VECTOR_GRID_SIZE,
+    bool, int, cv::Rect*, bool, bool, bool, int, , Stream&, Stream&)
 {
    CV_Error(cv::Error::HeaderIsNull, "OpenCV was build without NVIDIA OpticalFlow support");
 }
@ -22,6 +38,13 @@ cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0> cv::cuda::NvidiaOpticalFlow_1_0::create
 #include "nvOpticalFlowCommon.h"
 #include "nvOpticalFlowCuda.h"

+namespace cv { namespace cuda { namespace device { namespace optflow_nvidia
+{
+void FlowUpsample(void* srcDevPtr, uint32_t nSrcWidth, uint32_t nSrcPitch, uint32_t nSrcHeight,
+    void* dstDevPtr, uint32_t nDstWidth, uint32_t nDstPitch, uint32_t nDstHeight,
+    uint32_t nScaleFactor);
+}}}}
+
 #if defined(_WIN32) || defined(_WIN64)
 #include <Windows.h>
 #else
@ -123,7 +146,7 @@ class LoadNvidiaModules
 private:
    typedef int(*PFNCudaCuCtxGetCurrent)(CUcontext*);
    typedef NV_OF_STATUS(NVOFAPI *PFNNvOFAPICreateInstanceCuda)
-        (uint32_t apiVer, NV_OF_CUDA_API_FUNCTION_LIST* cudaOf);
+        (int apiVer, NV_OF_CUDA_API_FUNCTION_LIST* cudaOf);

    PFNCudaCuCtxGetCurrent m_cudaDriverAPIGetCurrentCtx;
    PFNNvOFAPICreateInstanceCuda m_NvOFAPICreateInstanceCuda;
@ -249,9 +272,9 @@ private:
    NV_OF_BUFFER_DESCRIPTOR m_hintBufferDesc;
    NV_OF_BUFFER_DESCRIPTOR m_costBufferDesc;

-    uint32_t m_outputElementSize;
-    uint32_t m_costBufElementSize;
-    uint32_t m_hintBufElementSize;
+    int m_outputElementSize;
+    int m_costBufElementSize;
+    int m_hintBufElementSize;

    NV_OF_INIT_PARAMS m_initParams;

@ -288,7 +311,7 @@ protected:
    std::mutex m_lock;

 public:
-    NvidiaOpticalFlowImpl(int width, int height, NV_OF_PERF_LEVEL perfPreset, bool bEnableTemporalHints,
+    NvidiaOpticalFlowImpl(cv::Size imageSize, NV_OF_PERF_LEVEL perfPreset, bool bEnableTemporalHints,
        bool bEnableExternalHints, bool bEnableCostBuffer, int gpuId, Stream inputStream, Stream outputStream);

    virtual void calc(InputArray inputImage, InputArray referenceImage,
@ -298,17 +321,17 @@ public:

    virtual void collectGarbage();

-    virtual void upSampler(InputArray flow, int width, int height,
+    virtual void upSampler(InputArray flow, cv::Size imageSize,
        int gridSize, InputOutputArray upsampledFlow);

    virtual int getGridSize() const { return m_gridSize; }
 };

 NvidiaOpticalFlowImpl::NvidiaOpticalFlowImpl(
-    int width, int height, NV_OF_PERF_LEVEL perfPreset, bool bEnableTemporalHints,
+    cv::Size imageSize, NV_OF_PERF_LEVEL perfPreset, bool bEnableTemporalHints,
    bool bEnableExternalHints, bool bEnableCostBuffer, int gpuId,
    Stream inputStream, Stream outputStream) :
-    m_width(width), m_height(height), m_preset(perfPreset),
+    m_width(imageSize.width), m_height(imageSize.height), m_preset(perfPreset),
    m_enableTemporalHints((NV_OF_BOOL)bEnableTemporalHints),
    m_enableExternalHints((NV_OF_BOOL)bEnableExternalHints),
    m_enableCostBuffer((NV_OF_BOOL)bEnableCostBuffer), m_gpuId(gpuId),
@ -371,7 +394,7 @@ NvidiaOpticalFlowImpl::NvidiaOpticalFlowImpl(
        m_costBufferDesc.height = nOutHeight;
        m_costBufferDesc.bufferFormat = NV_OF_BUFFER_FORMAT_UINT;
        m_costBufferDesc.bufferUsage = NV_OF_BUFFER_USAGE_COST;
-        m_costBufElementSize = sizeof(uint32_t);
+        m_costBufElementSize = sizeof(int);
    }

    m_ofAPI.reset(new NV_OF_CUDA_API_FUNCTION_LIST());
@ -600,7 +623,7 @@ void NvidiaOpticalFlowImpl::collectGarbage()
    }
 }

-void NvidiaOpticalFlowImpl::upSampler(InputArray _flow, int width, int height,
+void NvidiaOpticalFlowImpl::upSampler(InputArray _flow, cv::Size imageSize,
    int gridSize, InputOutputArray upsampledFlow)
 {
    Mat flow;
@ -622,26 +645,26 @@ void NvidiaOpticalFlowImpl::upSampler(InputArray _flow, int width, int height,

    std::unique_ptr<float[]> flowVectors = nullptr;
    const NV_OF_FLOW_VECTOR* _flowVectors = static_cast<const NV_OF_FLOW_VECTOR*>((const void*)flow.data);
-    flowVectors.reset(new float[2 * width * height]);
-    for (int y = 0; y < height; ++y)
+    flowVectors.reset(new float[2 * imageSize.width * imageSize.height]);
+    for (int y = 0; y < imageSize.height; ++y)
    {
-        for (int x = 0; x < width; ++x)
+        for (int x = 0; x < imageSize.width; ++x)
        {
-            uint32_t blockIdX = x / gridSize;
-            uint32_t blockIdY = y / gridSize;
-            uint32_t widthInBlocks = ((width + gridSize - 1) / gridSize);
-            uint32_t heightInBlocks = ((height + gridSize - 1) / gridSize);;
+            int blockIdX = x / gridSize;
+            int blockIdY = y / gridSize;
+            int widthInBlocks = ((imageSize.width + gridSize - 1) / gridSize);
+            int heightInBlocks = ((imageSize.height + gridSize - 1) / gridSize);;
            if ((blockIdX < widthInBlocks) && (blockIdY < heightInBlocks))
            {
-                flowVectors[(y * 2 * width) + 2 * x] = (float)
+                flowVectors[(y * 2 * imageSize.width) + 2 * x] = (float)
                    (_flowVectors[blockIdX + (blockIdY * widthInBlocks)].flowx / (float)(1 << 5));
-                flowVectors[(y * 2 * width) + 2 * x + 1] = (float)
+                flowVectors[(y * 2 * imageSize.width) + 2 * x + 1] = (float)
                    (_flowVectors[blockIdX + (blockIdY * widthInBlocks)].flowy / (float)(1 << 5));
            }
        }
    }

-    Mat output(Size(width, height), CV_32FC2, flowVectors.get());
+    Mat output(Size(imageSize.width, imageSize.height), CV_32FC2, flowVectors.get());
    if (upsampledFlow.isMat())
    {
        output.copyTo(upsampledFlow);
@ -656,18 +679,603 @@ void NvidiaOpticalFlowImpl::upSampler(InputArray _flow, int width, int height,
        CV_Error(Error::StsBadArg,
            "Incorrect flow buffer passed for upsampled flow. Pass either Mat or GpuMat");
    }
+}
+
+class NvidiaOpticalFlowImpl_2 : public cv::cuda::NvidiaOpticalFlow_2_0
+{
+private:
+    int m_width;
+    int m_height;
+    NV_OF_PERF_LEVEL m_preset;
+    NV_OF_OUTPUT_VECTOR_GRID_SIZE m_gridSize;
+    NV_OF_HINT_VECTOR_GRID_SIZE m_hintGridSize;
+    bool m_enableROI;
+    std::vector<Rect> m_roiDataRect;
+    NV_OF_ROI_RECT* m_roiData;
+    bool m_enableTemporalHints;
+    bool m_enableExternalHints;
+    bool m_enableCostBuffer;
+    int m_gpuId;
+    Stream m_inputStream;
+    Stream m_outputStream;
+
+    CUcontext m_cuContext;
+    int m_scaleFactor;
+    NV_OF_BUFFER_FORMAT m_format;
+    NV_OF_OUTPUT_VECTOR_GRID_SIZE m_hwGridSize;
+
+    NV_OF_BUFFER_DESCRIPTOR m_inputBufferDesc;
+    NV_OF_BUFFER_DESCRIPTOR m_outputBufferDesc;
+    NV_OF_BUFFER_DESCRIPTOR m_hintBufferDesc;
+    NV_OF_BUFFER_DESCRIPTOR m_costBufferDesc;
+
+    int m_outputElementSize;
+    int m_costBufElementSize;
+    int m_hintBufElementSize;
+
+    NV_OF_INIT_PARAMS m_initParams;
+
+    std::unique_ptr<NV_OF_CUDA_API_FUNCTION_LIST> m_ofAPI;
+    NvOFHandle m_hOF; //nvof handle
+
+    NvOFGPUBufferHandle m_hInputBuffer;
+    NvOFGPUBufferHandle m_hReferenceBuffer;
+    NvOFGPUBufferHandle m_hOutputBuffer;
+    NvOFGPUBufferHandle m_hOutputUpScaledBuffer;
+    NvOFGPUBufferHandle m_hHintBuffer;
+    NvOFGPUBufferHandle m_hCostBuffer;
+
+    CUdeviceptr m_frame0cuDevPtr;
+    CUdeviceptr m_frame1cuDevPtr;
+    CUdeviceptr m_flowXYcuDevPtr;
+    CUdeviceptr m_flowXYUpScaledcuDevPtr;
+    CUdeviceptr m_hintcuDevPtr;
+    CUdeviceptr m_costcuDevPtr;
+
+    NV_OF_CUDA_BUFFER_STRIDE_INFO m_inputBufferStrideInfo;
+    NV_OF_CUDA_BUFFER_STRIDE_INFO m_referenceBufferStrideInfo;
+    NV_OF_CUDA_BUFFER_STRIDE_INFO m_outputBufferStrideInfo;
+    NV_OF_CUDA_BUFFER_STRIDE_INFO m_outputUpScaledBufferStrideInfo;
+    NV_OF_CUDA_BUFFER_STRIDE_INFO m_hintBufferStrideInfo;
+    NV_OF_CUDA_BUFFER_STRIDE_INFO m_costBufferStrideInfo;
+
+    NV_OF_CUDA_API_FUNCTION_LIST* GetAPI()
+    {
+        std::lock_guard<std::mutex> lock(m_lock);
+        return  m_ofAPI.get();
+    }
+
+    NvOFHandle GetHandle() { return m_hOF; }
+
+protected:
+    std::mutex m_lock;
+
+public:
+    NvidiaOpticalFlowImpl_2(cv::Size imageSize, NV_OF_PERF_LEVEL perfPreset,
+        NV_OF_OUTPUT_VECTOR_GRID_SIZE outputGridSize, NV_OF_HINT_VECTOR_GRID_SIZE hintGridSize,
+        bool bEnableROI, std::vector<Rect> roiData, bool bEnableTemporalHints,
+        bool bEnableExternalHints, bool bEnableCostBuffer, int gpuId, Stream inputStream, Stream outputStream);
+
+    virtual void calc(InputArray inputImage, InputArray referenceImage,
+        InputOutputArray flow, Stream& stream = Stream::Null(),
+        InputArray hint = cv::noArray(), OutputArray cost = cv::noArray());
+
+    virtual void collectGarbage();
+
+    virtual void convertToFloat(InputArray flow, InputOutputArray floatFlow);
+
+    virtual int getGridSize() const { return m_gridSize; }
+};
+
+NvidiaOpticalFlowImpl_2::NvidiaOpticalFlowImpl_2(
+    cv::Size imageSize, NV_OF_PERF_LEVEL perfPreset,
+    NV_OF_OUTPUT_VECTOR_GRID_SIZE outputGridSize, NV_OF_HINT_VECTOR_GRID_SIZE hintGridSize,
+    bool bEnableROI, std::vector<Rect> roiData, bool bEnableTemporalHints,
+    bool bEnableExternalHints, bool bEnableCostBuffer, int gpuId, Stream inputStream, Stream outputStream) :
+    m_width(imageSize.width), m_height(imageSize.height), m_preset(perfPreset),
+    m_gridSize(outputGridSize), m_hintGridSize(hintGridSize),
+    m_enableROI(bEnableROI), m_roiDataRect(roiData),
+    m_enableTemporalHints((NV_OF_BOOL)bEnableTemporalHints),
+    m_enableExternalHints((NV_OF_BOOL)bEnableExternalHints),
+    m_enableCostBuffer((NV_OF_BOOL)bEnableCostBuffer), m_gpuId(gpuId),
+    m_inputStream(inputStream), m_outputStream(outputStream),
+    m_cuContext(nullptr), m_scaleFactor(1), m_format(NV_OF_BUFFER_FORMAT_GRAYSCALE8),
+    m_hwGridSize((NV_OF_OUTPUT_VECTOR_GRID_SIZE)0)
+{
+    LoadNvidiaModules& LoadNvidiaModulesObj = LoadNvidiaModules::Init();
+
+    int nGpu = 0;
+
+    cuSafeCall(cudaGetDeviceCount(&nGpu));
+    if (m_gpuId < 0 || m_gpuId >= nGpu)
+    {
+        CV_Error(Error::StsBadArg, "Invalid GPU Ordinal");
+    }
+
+    cuSafeCall(cudaSetDevice(m_gpuId));
+    cuSafeCall(cudaFree(m_cuContext));
+
+    cuSafeCall(LoadNvidiaModulesObj.GetCudaLibraryFunctionPtr()(&m_cuContext));
+
+    if (m_gridSize != (NV_OF_OUTPUT_VECTOR_GRID_SIZE)NV_OF_OUTPUT_VECTOR_GRID_SIZE_1 &&
+        m_gridSize != (NV_OF_OUTPUT_VECTOR_GRID_SIZE)NV_OF_OUTPUT_VECTOR_GRID_SIZE_2 &&
+        m_gridSize != (NV_OF_OUTPUT_VECTOR_GRID_SIZE)NV_OF_OUTPUT_VECTOR_GRID_SIZE_4)
+    {
+        CV_Error(Error::StsBadArg, "Unsupported output grid size");
+    }
+
+    if (m_enableExternalHints)
+    {
+        if (m_hintGridSize != (NV_OF_HINT_VECTOR_GRID_SIZE)NV_OF_HINT_VECTOR_GRID_SIZE_1 &&
+            m_hintGridSize != (NV_OF_HINT_VECTOR_GRID_SIZE)NV_OF_HINT_VECTOR_GRID_SIZE_2 &&
+            m_hintGridSize != (NV_OF_HINT_VECTOR_GRID_SIZE)NV_OF_HINT_VECTOR_GRID_SIZE_4 &&
+            m_hintGridSize != (NV_OF_HINT_VECTOR_GRID_SIZE)NV_OF_HINT_VECTOR_GRID_SIZE_8)
+        {
+            CV_Error(Error::StsBadArg, "Unsupported hint grid size");
+        }
+    }
+
+    m_ofAPI.reset(new NV_OF_CUDA_API_FUNCTION_LIST());
+
+    NVOF_API_CALL(LoadNvidiaModulesObj.GetOFLibraryFunctionPtr()(NV_OF_API_VERSION, m_ofAPI.get()));
+    NVOF_API_CALL(GetAPI()->nvCreateOpticalFlowCuda(m_cuContext, &m_hOF));
+
+    m_roiData = (NV_OF_ROI_RECT*)m_roiDataRect.data();
+
+    uint32_t size = 0;
+    if (m_enableROI)
+    {
+        NVOF_API_CALL(GetAPI()->nvOFGetCaps(GetHandle(), NV_OF_CAPS_SUPPORT_ROI, nullptr, &size));
+        std::unique_ptr<uint32_t[]> val1(new uint32_t[size]);
+        NVOF_API_CALL(GetAPI()->nvOFGetCaps(GetHandle(), NV_OF_CAPS_SUPPORT_ROI, val1.get(), &size));
+        if (val1[0] != NV_OF_TRUE)
+        {
+            m_enableROI = false;
+            m_roiData = nullptr;
+            CV_Error(Error::StsBadFunc, "ROI not supported on this GPU");
+        }
+    }
+    size = 0;
+    NVOF_API_CALL(GetAPI()->nvOFGetCaps(GetHandle(), NV_OF_CAPS_SUPPORTED_OUTPUT_GRID_SIZES, nullptr, &size));
+    std::unique_ptr<uint32_t[]> val2(new uint32_t[size]);
+    NVOF_API_CALL(GetAPI()->nvOFGetCaps(GetHandle(), NV_OF_CAPS_SUPPORTED_OUTPUT_GRID_SIZES, val2.get(), &size));
+    for (uint32_t i = 0; i < size; i++)
+    {
+        if (m_gridSize != val2[i])
+        {
+            size = 0;
+            NVOF_API_CALL(GetAPI()->nvOFGetCaps(GetHandle(), NV_OF_CAPS_SUPPORTED_OUTPUT_GRID_SIZES, nullptr, &size));
+            std::unique_ptr<uint32_t[]> val3(new uint32_t[size]);
+            NVOF_API_CALL(GetAPI()->nvOFGetCaps(GetHandle(), NV_OF_CAPS_SUPPORTED_OUTPUT_GRID_SIZES, val3.get(), &size));
+
+            m_hwGridSize = (NV_OF_OUTPUT_VECTOR_GRID_SIZE)NV_OF_OUTPUT_VECTOR_GRID_SIZE_MAX;
+            for (uint32_t i = 0; i < size; i++)
+            {
+                if (m_gridSize == val3[i])
+                {
+                    m_hwGridSize = m_gridSize;
+                    break;
+                }
+                if (m_gridSize < val3[i] && val3[i] < m_hwGridSize)
+                {
+                    m_hwGridSize = (NV_OF_OUTPUT_VECTOR_GRID_SIZE)val3[i];
+                }
+            }
+            if (m_hwGridSize >= (NV_OF_OUTPUT_VECTOR_GRID_SIZE)NV_OF_OUTPUT_VECTOR_GRID_SIZE_MAX)
+            {
+                CV_Error(Error::StsBadArg, "Invalid Grid Size");
+            }
+            else
+            {
+                m_scaleFactor = m_hwGridSize / m_gridSize;
+            }
+        }
+        else
+        {
+            m_hwGridSize = m_gridSize;
+        }
+    }
+
+    auto nOutWidth = (m_width + m_hwGridSize - 1) / m_hwGridSize;
+    auto nOutHeight = (m_height + m_hwGridSize - 1) / m_hwGridSize;
+
+    auto outBufFmt = NV_OF_BUFFER_FORMAT_SHORT2;
+
+    memset(&m_inputBufferDesc, 0, sizeof(m_inputBufferDesc));
+    m_inputBufferDesc.width = m_width;
+    m_inputBufferDesc.height = m_height;
+    m_inputBufferDesc.bufferFormat = m_format;
+    m_inputBufferDesc.bufferUsage = NV_OF_BUFFER_USAGE_INPUT;
+
+    memset(&m_outputBufferDesc, 0, sizeof(m_outputBufferDesc));
+    m_outputBufferDesc.width = nOutWidth;
+    m_outputBufferDesc.height = nOutHeight;
+    m_outputBufferDesc.bufferFormat = outBufFmt;
+    m_outputBufferDesc.bufferUsage = NV_OF_BUFFER_USAGE_OUTPUT;
+    m_outputElementSize = sizeof(NV_OF_FLOW_VECTOR);
+
+    if (m_enableExternalHints)
+    {
+        memset(&m_hintBufferDesc, 0, sizeof(m_hintBufferDesc));
+        m_hintBufferDesc.width = nOutWidth;
+        m_hintBufferDesc.height = nOutHeight;
+        m_hintBufferDesc.bufferFormat = outBufFmt;
+        m_hintBufferDesc.bufferUsage = NV_OF_BUFFER_USAGE_HINT;
+        m_hintBufElementSize = m_outputElementSize;
+    }
+
+    if (m_enableCostBuffer)
+    {
+        memset(&m_costBufferDesc, 0, sizeof(m_costBufferDesc));
+        m_costBufferDesc.width = nOutWidth;
+        m_costBufferDesc.height = nOutHeight;
+        m_costBufferDesc.bufferFormat = NV_OF_BUFFER_FORMAT_UINT8;
+        m_costBufferDesc.bufferUsage = NV_OF_BUFFER_USAGE_COST;
+        m_costBufElementSize = sizeof(int);
+    }
+
+    memset(&m_initParams, 0, sizeof(m_initParams));
+    m_initParams.width = m_inputBufferDesc.width;
+    m_initParams.height = m_inputBufferDesc.height;
+    m_initParams.enableExternalHints = (NV_OF_BOOL)m_enableExternalHints;
+    m_initParams.enableOutputCost = (NV_OF_BOOL)m_enableCostBuffer;
+    m_initParams.hintGridSize = (NV_OF_BOOL)m_enableExternalHints == NV_OF_TRUE ?
+        m_hintGridSize : (NV_OF_HINT_VECTOR_GRID_SIZE)NV_OF_HINT_VECTOR_GRID_SIZE_UNDEFINED;
+    m_initParams.outGridSize = (NV_OF_OUTPUT_VECTOR_GRID_SIZE)m_hwGridSize;
+    m_initParams.mode = NV_OF_MODE_OPTICALFLOW;
+    m_initParams.perfLevel = m_preset;
+    m_initParams.enableRoi = (NV_OF_BOOL)m_enableROI;
+
+    NVOF_API_CALL(GetAPI()->nvOFInit(GetHandle(), &m_initParams));
+
+    if (m_inputStream || m_outputStream)
+    {
+        NVOF_API_CALL(GetAPI()->nvOFSetIOCudaStreams(GetHandle(),
+            StreamAccessor::getStream(m_inputStream), StreamAccessor::getStream(m_outputStream)));
+    }
+
+    //Input Buffer 1
+    NVOF_API_CALL(GetAPI()->nvOFCreateGPUBufferCuda(GetHandle(),
+        &m_inputBufferDesc, NV_OF_CUDA_BUFFER_TYPE_CUDEVICEPTR, &m_hInputBuffer));
+    m_frame0cuDevPtr = GetAPI()->nvOFGPUBufferGetCUdeviceptr(m_hInputBuffer);
+    NVOF_API_CALL(GetAPI()->nvOFGPUBufferGetStrideInfo(
+        m_hInputBuffer, &m_inputBufferStrideInfo));
+
+    //Input Buffer 2
+    NVOF_API_CALL(GetAPI()->nvOFCreateGPUBufferCuda(GetHandle(),
+        &m_inputBufferDesc, NV_OF_CUDA_BUFFER_TYPE_CUDEVICEPTR, &m_hReferenceBuffer));
+    m_frame1cuDevPtr = GetAPI()->nvOFGPUBufferGetCUdeviceptr(m_hReferenceBuffer);
+    NVOF_API_CALL(GetAPI()->nvOFGPUBufferGetStrideInfo(
+        m_hReferenceBuffer, &m_referenceBufferStrideInfo));
+
+    //Output Buffer
+    NVOF_API_CALL(GetAPI()->nvOFCreateGPUBufferCuda(GetHandle(),
+        &m_outputBufferDesc, NV_OF_CUDA_BUFFER_TYPE_CUDEVICEPTR, &m_hOutputBuffer));
+    m_flowXYcuDevPtr = GetAPI()->nvOFGPUBufferGetCUdeviceptr(m_hOutputBuffer);
+    NVOF_API_CALL(GetAPI()->nvOFGPUBufferGetStrideInfo(
+        m_hOutputBuffer, &m_outputBufferStrideInfo));
+
+    if (m_scaleFactor > 1)
+    {
+        m_outputBufferDesc.width = (m_width + m_gridSize - 1) / m_gridSize;;
+        m_outputBufferDesc.height = (m_height + m_gridSize - 1) / m_gridSize;;
+
+        //Output UpScaled Buffer
+        NVOF_API_CALL(GetAPI()->nvOFCreateGPUBufferCuda(GetHandle(),
+            &m_outputBufferDesc, NV_OF_CUDA_BUFFER_TYPE_CUDEVICEPTR, &m_hOutputUpScaledBuffer));
+        m_flowXYUpScaledcuDevPtr = GetAPI()->nvOFGPUBufferGetCUdeviceptr(m_hOutputUpScaledBuffer);
+        NVOF_API_CALL(GetAPI()->nvOFGPUBufferGetStrideInfo(
+            m_hOutputUpScaledBuffer, &m_outputUpScaledBufferStrideInfo));
+    }
+
+    //Hint Buffer
+    if (m_enableExternalHints)
+    {
+        NVOF_API_CALL(GetAPI()->nvOFCreateGPUBufferCuda(GetHandle(),
+            &m_hintBufferDesc, NV_OF_CUDA_BUFFER_TYPE_CUDEVICEPTR, &m_hHintBuffer));
+        m_hintcuDevPtr = GetAPI()->nvOFGPUBufferGetCUdeviceptr(m_hHintBuffer);
+        NVOF_API_CALL(GetAPI()->nvOFGPUBufferGetStrideInfo(
+            m_hHintBuffer, &m_hintBufferStrideInfo));
+    }
+
+    //Cost Buffer
+    if (m_enableCostBuffer)
+    {
+        NVOF_API_CALL(GetAPI()->nvOFCreateGPUBufferCuda(GetHandle(),
+            &m_costBufferDesc, NV_OF_CUDA_BUFFER_TYPE_CUDEVICEPTR, &m_hCostBuffer));
+        m_costcuDevPtr = GetAPI()->nvOFGPUBufferGetCUdeviceptr(m_hCostBuffer);
+        NVOF_API_CALL(GetAPI()->nvOFGPUBufferGetStrideInfo(
+            m_hCostBuffer, &m_costBufferStrideInfo));
+    }
+}
+
+void NvidiaOpticalFlowImpl_2::calc(InputArray _frame0, InputArray _frame1, InputOutputArray _flow,
+    Stream& stream, InputArray hint, OutputArray cost)
+{
+    CV_UNUSED(stream);
+    GpuMat frame0GpuMat(_frame0.size(), _frame0.type(), (void*)m_frame0cuDevPtr,
+        m_inputBufferStrideInfo.strideInfo[0].strideXInBytes);
+    GpuMat frame1GpuMat(_frame1.size(), _frame1.type(), (void*)m_frame1cuDevPtr,
+        m_referenceBufferStrideInfo.strideInfo[0].strideXInBytes);
+    GpuMat flowXYGpuMat(Size((m_width + m_hwGridSize - 1) / m_hwGridSize,
+        (m_height + m_hwGridSize - 1) / m_hwGridSize), CV_16SC2,
+        (void*)m_flowXYcuDevPtr, m_outputBufferStrideInfo.strideInfo[0].strideXInBytes);
+    GpuMat flowXYGpuMatUpScaled(Size((m_width + m_gridSize - 1) / m_gridSize,
+        (m_height + m_gridSize - 1) / m_gridSize), CV_16SC2,
+        (void*)m_flowXYUpScaledcuDevPtr, m_outputUpScaledBufferStrideInfo.strideInfo[0].strideXInBytes);
+
+    //check whether frame0 is Mat or GpuMat
+    if (_frame0.isMat())
+    {
+        //Get Mats from InputArrays
+        Mat __frame0 = _frame0.getMat();
+        frame0GpuMat.upload(__frame0, m_inputStream);
+    }
+    else if (_frame0.isGpuMat())
+    {
+        //Get GpuMats from InputArrays
+        GpuMat __frame0 = _frame0.getGpuMat();
+        __frame0.copyTo(frame0GpuMat, m_inputStream);
+    }
+    else
+    {
+        CV_Error(Error::StsBadArg,
+            "Incorrect input. Pass input image (frame0) as Mat or GpuMat");
+    }
+
+    //check whether frame1 is Mat or GpuMat
+    if (_frame1.isMat())
+    {
+        //Get Mats from InputArrays
+        Mat __frame1 = _frame1.getMat();
+        frame1GpuMat.upload(__frame1, m_inputStream);
+    }
+    else if (_frame1.isGpuMat())
+    {
+        //Get GpuMats from InputArrays
+        GpuMat __frame1 = _frame1.getGpuMat();
+        __frame1.copyTo(frame1GpuMat, m_inputStream);
+    }
+    else
+    {
+        CV_Error(Error::StsBadArg,
+            "Incorrect input. Pass reference image (frame1) as Mat or GpuMat");
+    }
+
+    if (m_enableExternalHints)
+    {
+        GpuMat hintGpuMat(hint.size(), hint.type(), (void*)m_hintcuDevPtr,
+            m_hintBufferStrideInfo.strideInfo[0].strideXInBytes);
+
+        if (hint.isMat())
+        {
+            //Get Mat from InputArray hint
+            Mat _hint = hint.getMat();
+            hintGpuMat.upload(_hint, m_inputStream);
+        }
+        else if (hint.isGpuMat())
+        {
+            //Get GpuMat from InputArray hint
+            GpuMat _hint = hint.getGpuMat();
+            _hint.copyTo(hintGpuMat, m_inputStream);
+        }
+        else
+        {
+            CV_Error(Error::StsBadArg, "Incorrect hint buffer passed. Pass Mat or GpuMat");
+        }
+    }
+
+    //Execute Call
+    NV_OF_EXECUTE_INPUT_PARAMS exeInParams;
+    NV_OF_EXECUTE_OUTPUT_PARAMS exeOutParams;
+    memset(&exeInParams, 0, sizeof(exeInParams));
+    exeInParams.inputFrame = m_hInputBuffer;
+    exeInParams.referenceFrame = m_hReferenceBuffer;
+    exeInParams.disableTemporalHints = (NV_OF_BOOL)m_enableTemporalHints == NV_OF_TRUE ?
+        NV_OF_FALSE : NV_OF_TRUE;
+    exeInParams.externalHints = m_initParams.enableExternalHints == NV_OF_TRUE ?
+        m_hHintBuffer : nullptr;
+    exeInParams.numRois = m_initParams.enableRoi == NV_OF_TRUE ? m_roiDataRect.size() : 0;
+    exeInParams.roiData = m_initParams.enableRoi == NV_OF_TRUE ? m_roiData : nullptr;
+    memset(&exeOutParams, 0, sizeof(exeOutParams));
+    exeOutParams.outputBuffer = m_hOutputBuffer;
+    exeOutParams.outputCostBuffer = m_initParams.enableOutputCost == NV_OF_TRUE ?
+        m_hCostBuffer : nullptr;
+    NVOF_API_CALL(GetAPI()->nvOFExecute(GetHandle(), &exeInParams, &exeOutParams));
+
+    if (m_scaleFactor > 1)
+    {
+        uint32_t nSrcWidth = flowXYGpuMat.size().width;
+        uint32_t nSrcHeight = flowXYGpuMat.size().height;
+        uint32_t nSrcPitch = m_outputBufferStrideInfo.strideInfo[0].strideXInBytes;
+        uint32_t nDstWidth = flowXYGpuMatUpScaled.size().width;
+        uint32_t nDstHeight = flowXYGpuMatUpScaled.size().height;
+        uint32_t nDstPitch = m_outputUpScaledBufferStrideInfo.strideInfo[0].strideXInBytes;
+        cv::cuda::device::optflow_nvidia::FlowUpsample((void*)m_flowXYcuDevPtr, nSrcWidth, nSrcPitch,
+            nSrcHeight, (void*)m_flowXYUpScaledcuDevPtr, nDstWidth, nDstPitch, nDstHeight, m_scaleFactor);
+
+        if (_flow.isMat())
+            flowXYGpuMatUpScaled.download(_flow, m_outputStream);
+        else if (_flow.isGpuMat())
+            flowXYGpuMatUpScaled.copyTo(_flow, m_outputStream);
+        else
+            CV_Error(Error::StsBadArg, "Incorrect flow buffer passed. Pass Mat or GpuMat");
+    }
+    else
+    {
+        if (_flow.isMat())
+            flowXYGpuMat.download(_flow, m_outputStream);
+        else if (_flow.isGpuMat())
+            flowXYGpuMat.copyTo(_flow, m_outputStream);
+        else
+            CV_Error(Error::StsBadArg, "Incorrect flow buffer passed. Pass Mat or GpuMat");
+    }
+
+    if (m_enableCostBuffer)
+    {
+        GpuMat costGpuMat(Size((m_width + m_hwGridSize - 1) / m_hwGridSize,
+            (m_height + m_hwGridSize - 1) / m_hwGridSize), CV_8SC1, (void*)m_costcuDevPtr,
+            m_costBufferStrideInfo.strideInfo[0].strideXInBytes);
+
+        if (cost.isMat())
+            costGpuMat.download(cost, m_outputStream);
+        else if (cost.isGpuMat())
+            costGpuMat.copyTo(cost, m_outputStream);
+        else
+            CV_Error(Error::StsBadArg, "Incorrect cost buffer passed. Pass Mat or GpuMat");
+    }
+    m_outputStream.waitForCompletion();
+}
+
+void NvidiaOpticalFlowImpl_2::collectGarbage()
+{
+    if (m_enableROI)
+    {
+        m_roiData = nullptr;
+    }
+    if (m_hInputBuffer)
+    {
+        NVOF_API_CALL(GetAPI()->nvOFDestroyGPUBufferCuda(m_hInputBuffer));
+    }
+    if (m_hReferenceBuffer)
+    {
+        NVOF_API_CALL(GetAPI()->nvOFDestroyGPUBufferCuda(m_hReferenceBuffer));
+    }
+    if (m_hOutputBuffer)
+    {
+        NVOF_API_CALL(GetAPI()->nvOFDestroyGPUBufferCuda(m_hOutputBuffer));
+    }
+    if (m_scaleFactor > 1 && m_hOutputUpScaledBuffer)
+    {
+        NVOF_API_CALL(GetAPI()->nvOFDestroyGPUBufferCuda(m_hOutputUpScaledBuffer));
+    }
+    if (m_enableExternalHints)
+    {
+        if (m_hHintBuffer)
+        {
+            NVOF_API_CALL(GetAPI()->nvOFDestroyGPUBufferCuda(m_hHintBuffer));
+        }
+    }
+    if (m_enableCostBuffer)
+    {
+        if (m_hCostBuffer)
+        {
+            NVOF_API_CALL(GetAPI()->nvOFDestroyGPUBufferCuda(m_hCostBuffer));
+        }
+    }
+    if (m_inputStream)
+    {
+        m_inputStream.waitForCompletion();
+    }
+    if (m_outputStream)
+    {
+        m_outputStream.waitForCompletion();
+    }
+    if (m_hOF)
+    {
+        NVOF_API_CALL(GetAPI()->nvOFDestroy(m_hOF));
+    }
+}
+
+void NvidiaOpticalFlowImpl_2::convertToFloat(InputArray _flow, InputOutputArray floatFlow)
+{
+    Mat flow;
+    if (_flow.isMat())
+    {
+        Mat __flow = _flow.getMat();
+        __flow.copyTo(flow);
+    }
+    else if (_flow.isGpuMat())
+    {
+        GpuMat __flow = _flow.getGpuMat();
+        __flow.download(flow);
+    }
+    else
+    {
+        CV_Error(Error::StsBadArg,
+            "Incorrect flow buffer passed. Pass either Mat or GpuMat");
+    }
+
+    int width = flow.size().width;
+    int height = flow.size().height;
+
+    Mat output(Size(width, height), CV_32FC2);
+    for (int y = 0; y < height; ++y)
+    {
+        for (int x = 0; x < (int)(width * sizeof(int16_t)); ++x)
+        {
+            output.at<float>(y, x) = (float)(flow.at<int16_t>(y, x) / (float)(1 << 5));
+        }
+    }
+
+    if (floatFlow.isMat())
+    {
+        output.copyTo(floatFlow);
+    }
+    else if (floatFlow.isGpuMat())
+    {
+        GpuMat _output(output);
+        _output.copyTo(floatFlow);
+    }
+    else
+    {
+        CV_Error(Error::StsBadArg,
+            "Incorrect flow buffer passed for upsampled flow. Pass either Mat or GpuMat");
+    }
 }}

 Ptr<cv::cuda::NvidiaOpticalFlow_1_0> cv::cuda::NvidiaOpticalFlow_1_0::create(
-    int width, int height, NVIDIA_OF_PERF_LEVEL perfPreset,
+    cv::Size imageSize, NVIDIA_OF_PERF_LEVEL perfPreset,
    bool bEnableTemporalHints, bool bEnableExternalHints,
    bool bEnableCostBuffer, int gpuId,
    Stream& inputStream, Stream& outputStream)
 {
    return makePtr<NvidiaOpticalFlowImpl>(
-        width,
-        height,
+        imageSize,
+        (NV_OF_PERF_LEVEL)perfPreset,
+        bEnableTemporalHints,
+        bEnableExternalHints,
+        bEnableCostBuffer,
+        gpuId,
+        inputStream,
+        outputStream);
+}
+
+Ptr<cv::cuda::NvidiaOpticalFlow_2_0> cv::cuda::NvidiaOpticalFlow_2_0::create(
+    cv::Size imageSize, NVIDIA_OF_PERF_LEVEL perfPreset,
+    NVIDIA_OF_OUTPUT_VECTOR_GRID_SIZE outputGridSize, NVIDIA_OF_HINT_VECTOR_GRID_SIZE hintGridSize,
+    bool bEnableTemporalHints, bool bEnableExternalHints, bool bEnableCostBuffer,
+    int gpuId, Stream& inputStream, Stream& outputStream)
+{
+    std::vector<Rect> roi(0);
+    return makePtr<NvidiaOpticalFlowImpl_2>(
+        imageSize,
+        (NV_OF_PERF_LEVEL)perfPreset,
+        (NV_OF_OUTPUT_VECTOR_GRID_SIZE)outputGridSize,
+        (NV_OF_HINT_VECTOR_GRID_SIZE)hintGridSize,
+        false,
+        roi,
+        bEnableTemporalHints,
+        bEnableExternalHints,
+        bEnableCostBuffer,
+        gpuId,
+        inputStream,
+        outputStream);
+}
+
+Ptr<cv::cuda::NvidiaOpticalFlow_2_0> cv::cuda::NvidiaOpticalFlow_2_0::create(
+  cv::Size imageSize, std::vector<Rect> roiData, NVIDIA_OF_PERF_LEVEL perfPreset,
+  NVIDIA_OF_OUTPUT_VECTOR_GRID_SIZE outputGridSize, NVIDIA_OF_HINT_VECTOR_GRID_SIZE hintGridSize,
+  bool bEnableTemporalHints, bool bEnableExternalHints, bool bEnableCostBuffer,
+  int gpuId, Stream& inputStream, Stream& outputStream)
+{
+    return makePtr<NvidiaOpticalFlowImpl_2>(
+        imageSize,
        (NV_OF_PERF_LEVEL)perfPreset,
+        (NV_OF_OUTPUT_VECTOR_GRID_SIZE)outputGridSize,
+        (NV_OF_HINT_VECTOR_GRID_SIZE)hintGridSize,
+        true,
+        roiData,
        bEnableTemporalHints,
        bEnableExternalHints,
        bEnableCostBuffer,
--- a/modules/cudaoptflow/test/test_optflow.cpp
+++ b/modules/cudaoptflow/test/test_optflow.cpp
@ -495,19 +495,11 @@ CUDA_TEST_P(NvidiaOpticalFlow_1_0, Regression)
    cv::Mat frame1 = readImage("opticalflow/frame1.png", cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(frame1.empty());

-    const int width = frame0.size().width;
-    const int height = frame0.size().height;
-    const bool enableTemporalHints = false;
-    const bool enableExternalHints = false;
-    const bool enableCostBuffer = false;
-    const int gpuid = 0;
-
    cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0> d_nvof;
    try
    {
-        d_nvof = cv::cuda::NvidiaOpticalFlow_1_0::create(width, height,
-            cv::cuda::NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_SLOW,
-            enableTemporalHints, enableExternalHints, enableCostBuffer, gpuid);
+        d_nvof = cv::cuda::NvidiaOpticalFlow_1_0::create(frame0.size(),
+            cv::cuda::NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_SLOW);
    }
    catch (const cv::Exception& e)
    {
@ -519,7 +511,7 @@ CUDA_TEST_P(NvidiaOpticalFlow_1_0, Regression)

    Mat flow, upsampledFlow;
    d_nvof->calc(loadMat(frame0), loadMat(frame1), flow);
-    d_nvof->upSampler(flow, width, height, gridSize, upsampledFlow);
+    d_nvof->upSampler(flow, frame0.size(), gridSize, upsampledFlow);

    std::string fname(cvtest::TS::ptr()->get_data_path());
    fname += "opticalflow/nvofGolden.flo";
@ -527,6 +519,7 @@ CUDA_TEST_P(NvidiaOpticalFlow_1_0, Regression)
    ASSERT_FALSE(golden.empty());

    EXPECT_MAT_SIMILAR(golden, upsampledFlow, 1e-10);
+    d_nvof->collectGarbage();
 }

 CUDA_TEST_P(NvidiaOpticalFlow_1_0, OpticalFlowNan)
@ -539,19 +532,11 @@ CUDA_TEST_P(NvidiaOpticalFlow_1_0, OpticalFlowNan)

    cv::Mat r_frame0, r_frame1;

-    const int width = frame0.size().width;
-    const int height = frame0.size().height;
-    const bool enableTemporalHints = false;
-    const bool enableExternalHints = false;
-    const bool enableCostBuffer = false;
-    const int gpuid = 0;
-
    cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0> d_nvof;
    try
    {
-        d_nvof = cv::cuda::NvidiaOpticalFlow_1_0::create(width, height,
-            cv::cuda::NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_SLOW,
-            enableTemporalHints, enableExternalHints, enableCostBuffer, gpuid);
+        d_nvof = cv::cuda::NvidiaOpticalFlow_1_0::create(frame0.size(),
+            cv::cuda::NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_SLOW);
    }
    catch (const cv::Exception& e)
    {
@ -569,9 +554,96 @@ CUDA_TEST_P(NvidiaOpticalFlow_1_0, OpticalFlowNan)

    EXPECT_TRUE(cv::checkRange(flowx));
    EXPECT_TRUE(cv::checkRange(flowy));
+    d_nvof->collectGarbage();
 };

 INSTANTIATE_TEST_CASE_P(CUDA_OptFlow, NvidiaOpticalFlow_1_0, ALL_DEVICES);

+//////////////////////////////////////////////////////
+// NvidiaOpticalFlow_2_0
+
+struct NvidiaOpticalFlow_2_0 : testing::TestWithParam<cv::cuda::DeviceInfo>
+{
+    cv::cuda::DeviceInfo devInfo;
+
+    virtual void SetUp()
+    {
+        devInfo = GetParam();
+
+        cv::cuda::setDevice(devInfo.deviceID());
+    }
+};
+
+CUDA_TEST_P(NvidiaOpticalFlow_2_0, Regression)
+{
+    cv::Mat frame0 = readImage("opticalflow/frame0.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame0.empty());
+
+    cv::Mat frame1 = readImage("opticalflow/frame1.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame1.empty());
+
+    cv::Ptr<cv::cuda::NvidiaOpticalFlow_2_0> d_nvof;
+    try
+    {
+        d_nvof = cv::cuda::NvidiaOpticalFlow_2_0::create(frame0.size(),
+            cv::cuda::NvidiaOpticalFlow_2_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_SLOW);
+    }
+    catch (const cv::Exception& e)
+    {
+        if (e.code == Error::StsBadFunc || e.code == Error::StsBadArg || e.code == Error::StsNullPtr)
+            throw SkipTestException("Current configuration is not supported");
+        throw;
+    }
+
+    Mat flow, upsampledFlow;
+    d_nvof->calc(loadMat(frame0), loadMat(frame1), flow);
+    d_nvof->convertToFloat(flow, upsampledFlow);
+
+    std::string fname(cvtest::TS::ptr()->get_data_path());
+    fname += "opticalflow/nvofGolden_2.flo";
+    cv::Mat golden = cv::readOpticalFlow(fname.c_str());
+    ASSERT_FALSE(golden.empty());
+
+    EXPECT_MAT_SIMILAR(golden, upsampledFlow, 1e-10);
+    d_nvof->collectGarbage();
+}
+
+CUDA_TEST_P(NvidiaOpticalFlow_2_0, OpticalFlowNan)
+{
+    cv::Mat frame0 = readImage("opticalflow/rubberwhale1.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame0.empty());
+
+    cv::Mat frame1 = readImage("opticalflow/rubberwhale2.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame1.empty());
+
+    cv::Mat r_frame0, r_frame1;
+
+    cv::Ptr<cv::cuda::NvidiaOpticalFlow_2_0> d_nvof;
+    try
+    {
+        d_nvof = cv::cuda::NvidiaOpticalFlow_2_0::create(frame0.size(),
+            cv::cuda::NvidiaOpticalFlow_2_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_SLOW);
+    }
+    catch (const cv::Exception& e)
+    {
+        if (e.code == Error::StsBadFunc || e.code == Error::StsBadArg || e.code == Error::StsNullPtr)
+            throw SkipTestException("Current configuration is not supported");
+        throw;
+    }
+
+    Mat flow, flowx, flowy;
+    d_nvof->calc(loadMat(frame0), loadMat(frame1), flow);
+
+    Mat planes[] = { flowx, flowy };
+    split(flow, planes);
+    flowx = planes[0]; flowy = planes[1];
+
+    EXPECT_TRUE(cv::checkRange(flowx));
+    EXPECT_TRUE(cv::checkRange(flowy));
+    d_nvof->collectGarbage();
+};
+
+INSTANTIATE_TEST_CASE_P(CUDA_OptFlow, NvidiaOpticalFlow_2_0, ALL_DEVICES);
+
 }} // namespace
 #endif // HAVE_CUDA