From 233612efd7925bd022777d297fdf65215f16dcc8 Mon Sep 17 00:00:00 2001
From: Maksim Shabunin <maksim.shabunin@itseez.com>
Date: Fri, 8 Apr 2016 16:03:51 +0300
Subject: [PATCH] Reworked HAL dft/dct interface, added replacement
 documentation

---
 modules/core/include/opencv2/core/hal/hal.hpp |  32 +-
 .../core/include/opencv2/core/hal/interface.h |  25 +-
 modules/core/src/dxt.cpp                      | 326 ++++++++----------
 modules/core/src/hal_replacement.hpp          | 105 +++++-
 modules/imgproc/src/templmatch.cpp            |  19 +-
 5 files changed, 273 insertions(+), 234 deletions(-)
diff --git a/modules/core/include/opencv2/core/hal/hal.hpp b/modules/core/include/opencv2/core/hal/hal.hpp
index 6b9f93dbff..5b01cbe4cd 100644
--- a/modules/core/include/opencv2/core/hal/hal.hpp
+++ b/modules/core/include/opencv2/core/hal/hal.hpp
@@ -187,24 +187,28 @@ CV_EXPORTS void addWeighted32s( const int* src1, size_t step1, const int* src2,
 CV_EXPORTS void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scalars );
 CV_EXPORTS void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scalars );
 
-struct DftContext
+struct CV_EXPORTS DFT1D // abstract 1D DFT: obtain an instance from create(), run it with apply()
 {
-    void * impl;
-    bool useReplacement;
-    DftContext() : impl(0), useReplacement(false) {}
+    static Ptr<DFT1D> create(int len, int count, int depth, int flags, bool * useBuffer = 0); // flags: combination of CV_HAL_DFT_*; useBuffer is handed to the chosen implementation's init as its "temporary buffer needed" flag
+    virtual void apply(const uchar *src, uchar *dst) = 0; // runs the configured transform from src into dst
+    virtual ~DFT1D() {}
 };
 
-CV_EXPORTS void dftInit2D(DftContext & c, int _width, int _height, int _depth, int _src_channels, int _dst_channels, int flags, int _nonzero_rows = 0);
-CV_EXPORTS void dft2D(const DftContext & c, const void * src, int src_step, void * dst, int dst_step);
-CV_EXPORTS void dftFree2D(DftContext & c);
-
-CV_EXPORTS void dftInit1D(DftContext & c, int len, int count, int depth, int flags, bool * useBuffer = 0);
-CV_EXPORTS void dft1D(const DftContext & c, const void * src, void * dst);
-CV_EXPORTS void dftFree1D(DftContext & c);
+struct CV_EXPORTS DFT2D // abstract 2D DFT: obtain an instance from create(), run it with apply()
+{
+    static Ptr<DFT2D> create(int width, int height, int depth, // image dimensions and depth
+                             int src_channels, int dst_channels, // channel counts of the input and output images
+                             int flags, int nonzero_rows = 0); // flags: combination of CV_HAL_DFT_*; nonzero_rows: number of nonzero rows in the image
+    virtual void apply(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step) = 0; // source/destination image data and step
+    virtual ~DFT2D() {}
+};
 
-CV_EXPORTS void dctInit2D(DftContext & c, int width, int height, int depth, int flags);
-CV_EXPORTS void dct2D(const DftContext & c, const void * src, int src_step, void * dst, int dst_step);
-CV_EXPORTS void dctFree2D(DftContext & c);
+struct CV_EXPORTS DCT2D // abstract 2D DCT: obtain an instance from create(), run it with apply()
+{
+    static Ptr<DCT2D> create(int width, int height, int depth, int flags); // flags: combination of CV_HAL_DFT_* values
+    virtual void apply(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step) = 0; // source/destination image data and step
+    virtual ~DCT2D() {}
+};
 
 //! @} core_hal
 
diff --git a/modules/core/include/opencv2/core/hal/interface.h b/modules/core/include/opencv2/core/hal/interface.h
index 0da68f18cd..2bb7b19f21 100644
--- a/modules/core/include/opencv2/core/hal/interface.h
+++ b/modules/core/include/opencv2/core/hal/interface.h
@@ -11,21 +11,11 @@
 #define CV_HAL_ERROR_UNKNOWN -1
 //! @}
 
-
-#define CV_HAL_DFT_INVERSE        1
-#define CV_HAL_DFT_SCALE          2
-#define CV_HAL_DFT_ROWS           4
-#define CV_HAL_DFT_COMPLEX_OUTPUT 16
-#define CV_HAL_DFT_REAL_OUTPUT    32
-#define CV_HAL_DFT_TWO_STAGE      64
-#define CV_HAL_DFT_STAGE_COLS    128
-#define CV_HAL_DFT_IS_CONTINUOUS 512
-#define CV_HAL_DFT_IS_INPLACE 1024
-
 #ifdef __cplusplus
 #include <cstddef>
 #else
 #include <stddef.h>
+#include <stdbool.h>
 #endif
 
 //! @name Data types
@@ -155,6 +145,19 @@ typedef signed char schar;
 #define CV_HAL_BORDER_ISOLATED 16
 //! @}
 
+//! @name DFT flags
+//! @{
+#define CV_HAL_DFT_INVERSE        1
+#define CV_HAL_DFT_SCALE          2
+#define CV_HAL_DFT_ROWS           4
+#define CV_HAL_DFT_COMPLEX_OUTPUT 16
+#define CV_HAL_DFT_REAL_OUTPUT    32
+#define CV_HAL_DFT_TWO_STAGE      64
+#define CV_HAL_DFT_STAGE_COLS    128
+#define CV_HAL_DFT_IS_CONTINUOUS 512
+#define CV_HAL_DFT_IS_INPLACE 1024
+//! @}
+
 //! @}
 
 #endif
diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp
index 1ea5496753..2cff51d5a3 100644
--- a/modules/core/src/dxt.cpp
+++ b/modules/core/src/dxt.cpp
@@ -1553,7 +1553,7 @@ class Dft_C_IPPLoop_Invoker : public ParallelLoopBody
 {
 public:
 
-    Dft_C_IPPLoop_Invoker(uchar * _src, int _src_step, uchar * _dst, int _dst_step, int _width,
+    Dft_C_IPPLoop_Invoker(const uchar * _src, int _src_step, uchar * _dst, int _dst_step, int _width,
                           const Dft& _ippidft, int _norm_flag, bool *_ok) :
         ParallelLoopBody(),
         src(_src), src_step(_src_step), dst(_dst), dst_step(_dst_step), width(_width),
@@ -1617,7 +1617,7 @@ public:
     }
 
 private:
-    uchar * src;
+    const uchar * src;
     int src_step;
     uchar * dst;
     int dst_step;
@@ -1634,7 +1634,7 @@ class Dft_R_IPPLoop_Invoker : public ParallelLoopBody
 {
 public:
 
-    Dft_R_IPPLoop_Invoker(uchar * _src, int _src_step, uchar * _dst, int _dst_step, int _width,
+    Dft_R_IPPLoop_Invoker(const uchar * _src, int _src_step, uchar * _dst, int _dst_step, int _width,
                           const Dft& _ippidft, int _norm_flag, bool *_ok) :
         ParallelLoopBody(),
         src(_src), src_step(_src_step), dst(_dst), dst_step(_dst_step), width(_width),
@@ -1698,7 +1698,7 @@ public:
     }
 
 private:
-    uchar * src;
+    const uchar * src;
     int src_step;
     uchar * dst;
     int dst_step;
@@ -1711,7 +1711,7 @@ private:
 };
 
 template <typename Dft>
-bool Dft_C_IPPLoop(uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, const Dft& ippidft, int norm_flag)
+bool Dft_C_IPPLoop(const uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, const Dft& ippidft, int norm_flag)
 {
     bool ok;
     parallel_for_(Range(0, height), Dft_C_IPPLoop_Invoker<Dft>(src, src_step, dst, dst_step, width, ippidft, norm_flag, &ok), (width * height)/(double)(1<<16) );
@@ -1719,7 +1719,7 @@ bool Dft_C_IPPLoop(uchar * src, int src_step, uchar * dst, int dst_step, int wid
 }
 
 template <typename Dft>
-bool Dft_R_IPPLoop(uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, const Dft& ippidft, int norm_flag)
+bool Dft_R_IPPLoop(const uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, const Dft& ippidft, int norm_flag)
 {
     bool ok;
     parallel_for_(Range(0, height), Dft_R_IPPLoop_Invoker<Dft>(src, src_step, dst, dst_step, width, ippidft, norm_flag, &ok), (width * height)/(double)(1<<16) );
@@ -1750,7 +1750,7 @@ private:
     ippiDFT_R_Func func;
 };
 
-static bool ippi_DFT_C_32F(uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, bool inv, int norm_flag)
+static bool ippi_DFT_C_32F(const uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, bool inv, int norm_flag)
 {
     IppStatus status;
     Ipp8u* pBuffer = 0;
@@ -1804,7 +1804,7 @@ static bool ippi_DFT_C_32F(uchar * src, int src_step, uchar * dst, int dst_step,
     return false;
 }
 
-static bool ippi_DFT_R_32F(uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, bool inv, int norm_flag)
+static bool ippi_DFT_R_32F(const uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, bool inv, int norm_flag)
 {
     IppStatus status;
     Ipp8u* pBuffer = 0;
@@ -2611,11 +2611,11 @@ inline DftDims determineDims(int rows, int cols, bool isRowWise, bool isContinuo
     return InvalidDim;
 }
 
-class OcvDftImpl
+class OcvDftImpl : public hal::DFT2D
 {
 protected:
-    hal::DftContext contextA;
-    hal::DftContext contextB;
+    Ptr<hal::DFT1D> contextA;
+    Ptr<hal::DFT1D> contextB;
     bool needBufferA;
     bool needBufferB;
     bool inv;
@@ -2763,7 +2763,7 @@ public:
                     count = height;
                 }
                 needBufferA = isInplace;
-                hal::dftInit1D(contextA, len, count, depth, f, &needBufferA);
+                contextA = hal::DFT1D::create(len, count, depth, f, &needBufferA);
                 if (needBufferA)
                     tmp_bufA.allocate(len * complex_elem_size);
             }
@@ -2773,7 +2773,7 @@ public:
                 count = width;
                 f |= CV_HAL_DFT_STAGE_COLS;
                 needBufferB = isInplace;
-                hal::dftInit1D(contextB, len, count, depth, f, &needBufferB);
+                contextB = hal::DFT1D::create(len, count, depth, f, &needBufferB);
                 if (needBufferB)
                     tmp_bufB.allocate(len * complex_elem_size);
 
@@ -2783,7 +2783,7 @@ public:
         }
     }
 
-    void run(uchar * src, int src_step, uchar * dst, int dst_step)
+    void apply(const uchar * src, size_t src_step, uchar * dst, size_t dst_step)
     {
 #if defined USE_IPP_DFT
         if (useIpp)
@@ -2860,17 +2860,9 @@ public:
         }
     }
 
-    void free()
-    {
-        if (useIpp)
-            return;
-        hal::dftFree1D(contextA);
-        hal::dftFree1D(contextB);
-    }
-
 protected:
 
-    void rowDft(uchar* src_data, int src_step, uchar* dst_data, int dst_step, bool isComplex, bool isLastStage)
+    void rowDft(const uchar* src_data, int src_step, uchar* dst_data, int dst_step, bool isComplex, bool isLastStage)
     {
         int len, count;
         if (width == 1 && !isRowTransform )
@@ -2909,7 +2901,7 @@ protected:
             if( needBufferA )
                 dptr = tmp_bufA;
 
-            hal::dft1D(contextA, sptr, dptr);
+            contextA->apply(sptr, dptr);
 
             if( needBufferA )
                 memcpy( dptr0, dptr + dptr_offset, dst_full_len );
@@ -2924,7 +2916,7 @@ protected:
             complementComplexOutput(depth, dst_data, dst_step, len, nz, 1);
     }
 
-    void colDft(uchar* src_data, int src_step, uchar* dst_data, int dst_step, int stage_src_channels, int stage_dst_channels, bool isLastStage)
+    void colDft(const uchar* src_data, int src_step, uchar* dst_data, int dst_step, int stage_src_channels, int stage_dst_channels, bool isLastStage)
     {
         int len = height;
         int count = width;
@@ -2983,8 +2975,8 @@ protected:
             }
 
             if( even )
-                hal::dft1D(contextB, buf1, dbuf1);
-            hal::dft1D(contextB, buf0, dbuf0);
+                contextB->apply(buf1, dbuf1);
+            contextB->apply(buf0, dbuf0);
 
             if( stage_dst_channels == 1 )
             {
@@ -3032,12 +3024,12 @@ protected:
             if( i+1 < b )
             {
                 CopyFrom2Columns( sptr0, src_step, buf0, buf1, len, complex_elem_size );
-                hal::dft1D(contextB, buf1, dbuf1);
+                contextB->apply(buf1, dbuf1);
             }
             else
                 CopyColumn( sptr0, src_step, buf0, complex_elem_size, len, complex_elem_size );
 
-            hal::dft1D(contextB, buf0, dbuf0);
+            contextB->apply(buf0, dbuf0);
 
             if( i+1 < b )
                 CopyTo2Columns( dbuf0, dbuf1, dptr0, dst_step, len, complex_elem_size );
@@ -3051,7 +3043,7 @@ protected:
     }
 };
 
-class OcvDftBasicImpl
+class OcvDftBasicImpl : public hal::DFT1D
 {
 public:
     OcvDftOptions opt;
@@ -3068,11 +3060,6 @@ public:
     {
         opt.factors = _factors;
     }
-    OcvDftBasicImpl & operator=(const OcvDftBasicImpl & other)
-    {
-        this->opt = other.opt;
-        return *this;
-    }
     void init(int len, int count, int depth, int flags, bool *needBuffer)
     {
         int prev_len = opt.n;
@@ -3211,7 +3198,7 @@ public:
         }
     }
 
-    void run(const void * src, void * dst)
+    void apply(const uchar *src, uchar *dst)
     {
         opt.dft_func(opt, src, dst);
     }
@@ -3219,126 +3206,113 @@ public:
     void free() {}
 };
 
-namespace hal {
-
-//================== 1D ======================
-
-void dftInit1D(DftContext & context, int len, int count, int depth, int flags, bool *needBuffer)
+struct ReplacementDFT1D : public hal::DFT1D // forwards 1D DFT calls to the cv_hal_dft*1D replacement hooks
 {
-    int res = cv_hal_dftInit1D(&context.impl, len, count, depth, flags, needBuffer);
-    if (res == CV_HAL_ERROR_OK)
+    cvhalDFT *context; // opaque context filled in by cv_hal_dftInit1D
+    bool isInitialized; // true only when cv_hal_dftInit1D returned CV_HAL_ERROR_OK
+
+    ReplacementDFT1D() : context(0), isInitialized(false) {}
+    bool init(int len, int count, int depth, int flags, bool *needBuffer) // returns false when the replacement did not report CV_HAL_ERROR_OK
     {
-        context.useReplacement = true;
-        return;
+        int res = cv_hal_dftInit1D(&context, len, count, depth, flags, needBuffer);
+        isInitialized = (res == CV_HAL_ERROR_OK);
+        return isInitialized;
     }
-
-    context.useReplacement = false;
-    OcvDftBasicImpl * c = (OcvDftBasicImpl*)context.impl;
-    if (!c)
+    void apply(const uchar *src, uchar *dst) // does nothing unless init() succeeded
     {
-        c = new OcvDftBasicImpl();
-        context.impl = (void*)c;
+        if (isInitialized)
+        {
+            CALL_HAL(dft1D, cv_hal_dft1D, context, src, dst);
+        }
     }
-    c->init(len, count, depth, flags, needBuffer);
-}
-
-void dft1D(const DftContext & context, const void * src, void * dst)
-{
-    if (context.useReplacement)
+    ~ReplacementDFT1D() // releases the context through the 1D free hook, only if init() succeeded
     {
-        int res = cv_hal_dft1D(context.impl, src, dst);
-        if (res != CV_HAL_ERROR_OK)
+        if (isInitialized)
         {
-            CV_Error( CV_StsNotImplemented, "Custom HAL implementation failed to call dftRun");
+            CALL_HAL(dftFree1D, cv_hal_dftFree1D, context);
         }
-        return;
     }
-    OcvDftBasicImpl * c = (OcvDftBasicImpl*)context.impl;
-    c->run(src, dst);
-}
+};
 
-void dftFree1D(DftContext & context)
+struct ReplacementDFT2D : public hal::DFT2D // forwards 2D DFT calls to the cv_hal_dft*2D replacement hooks
 {
-    if (context.useReplacement)
+    cvhalDFT *context; // opaque context filled in by cv_hal_dftInit2D
+    bool isInitialized; // true only when cv_hal_dftInit2D returned CV_HAL_ERROR_OK
+
+    ReplacementDFT2D() : context(0), isInitialized(false) {}
+    bool init(int width, int height, int depth,
+              int src_channels, int dst_channels,
+              int flags, int nonzero_rows) // returns false when the replacement did not report CV_HAL_ERROR_OK
+    {
+        int res = cv_hal_dftInit2D(&context, width, height, depth, src_channels, dst_channels, flags, nonzero_rows);
+        isInitialized = (res == CV_HAL_ERROR_OK);
+        return isInitialized;
+    }
+    void apply(const uchar *src, size_t src_step, uchar *dst, size_t dst_step) // does nothing unless init() succeeded
     {
-        int res = cv_hal_dftFree1D(context.impl);
-        if (res != CV_HAL_ERROR_OK)
+        if (isInitialized)
         {
-            CV_Error( CV_StsNotImplemented, "Custom HAL implementation failed to call dftFree");
+            CALL_HAL(dft2D, cv_hal_dft2D, context, src, src_step, dst, dst_step);
         }
-        return;
     }
-
-    OcvDftBasicImpl * c = (OcvDftBasicImpl*)context.impl;
-    if (c)
+    ~ReplacementDFT2D() // releases the context, only if init() succeeded
     {
-        c->free();
-        delete c;
-        context.impl = 0;
+        if (isInitialized)
+        {
+            CALL_HAL(dftFree2D, cv_hal_dftFree2D, context); // must pair with cv_hal_dftInit2D: the 2D free hook, not the 1D one
+        }
     }
-}
+};
 
+namespace hal {
 
-//================== 2D ======================
+//================== 1D ======================
 
-void dftInit2D(DftContext & c,
-             int _width, int _height, int _depth, int _src_channels, int _dst_channels,
-             int flags,
-             int _nonzero_rows)
+Ptr<DFT1D> DFT1D::create(int len, int count, int depth, int flags, bool *needBuffer) // factory: tries ReplacementDFT1D first, otherwise returns an OcvDftBasicImpl
 {
-    int res = cv_hal_dftInit2D(&c.impl, _width, _height, _depth, _src_channels, _dst_channels, flags, _nonzero_rows);
-    if (res == CV_HAL_ERROR_OK)
     {
-        c.useReplacement = true;
-        return;
+        ReplacementDFT1D *impl = new ReplacementDFT1D();
+        if (impl->init(len, count, depth, flags, needBuffer)) // replacement accepted this configuration
+        {
+            return Ptr<DFT1D>(impl);
+        }
+        delete impl; // replacement declined; fall through to the built-in implementation
+    }
+    {
+        OcvDftBasicImpl *impl = new OcvDftBasicImpl();
+        impl->init(len, count, depth, flags, needBuffer);
+        return Ptr<DFT1D>(impl);
     }
-    c.useReplacement = false;
-
-    if( _width == 1 && _nonzero_rows > 0 )
-        CV_Error( CV_StsNotImplemented,
-        "This mode (using nonzero_rows with a single-column matrix) breaks the function's logic, so it is prohibited.\n"
-        "For fast convolution/correlation use 2-column matrix or single-row matrix instead" );
-
-    OcvDftImpl * d = new OcvDftImpl();
-    d->init(_width, _height, _depth, _src_channels, _dst_channels, flags, _nonzero_rows);
-    c.impl = (void*)d;
 }
 
-void dft2D(const DftContext & c,
-         const void * src, int src_step, void * dst, int dst_step)
+//================== 2D ======================
+
+Ptr<DFT2D> DFT2D::create(int width, int height, int depth,
+                         int src_channels, int dst_channels,
+                         int flags, int nonzero_rows) // factory: tries ReplacementDFT2D first, otherwise returns an OcvDftImpl
 {
-    if (c.useReplacement)
     {
-        int res = cv_hal_dft2D(c.impl, (uchar*)src, src_step, (uchar*)dst, dst_step);
-        if (res != CV_HAL_ERROR_OK)
+        ReplacementDFT2D *impl = new ReplacementDFT2D();
+        if (impl->init(width, height, depth, src_channels, dst_channels, flags, nonzero_rows)) // replacement accepted this configuration
         {
-            CV_Error( CV_StsNotImplemented, "Custom HAL implementation failed to call dftRun2D");
+            return Ptr<DFT2D>(impl);
         }
-        return;
+        delete impl; // replacement declined; fall through to the built-in implementation
     }
-    OcvDftImpl * d = (OcvDftImpl*)c.impl;
-    d->run((uchar*)src, src_step, (uchar*)dst, dst_step);
-}
-
-void dftFree2D(DftContext & c)
-{
-    if (c.useReplacement)
     {
-        int res = cv_hal_dftFree2D(c.impl);
-        if (res != CV_HAL_ERROR_OK)
+        if(width == 1 && nonzero_rows > 0 ) // checked only on the built-in path, after the replacement declined
         {
-            CV_Error( CV_StsNotImplemented, "Custom HAL implementation failed to call dftFree2D");
+            CV_Error( CV_StsNotImplemented,
+            "This mode (using nonzero_rows with a single-column matrix) breaks the function's logic, so it is prohibited.\n"
+            "For fast convolution/correlation use 2-column matrix or single-row matrix instead" );
         }
-        return;
+        OcvDftImpl *impl = new OcvDftImpl();
+        impl->init(width, height, depth, src_channels, dst_channels, flags, nonzero_rows);
+        return Ptr<DFT2D>(impl);
     }
-    OcvDftImpl * d = (OcvDftImpl*)c.impl;
-    d->free();
-    delete d;
-    c.impl = 0;
 }
 
 } // cv::hal::
-
 } // cv::
 
 
@@ -3382,10 +3356,8 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
         f |= CV_HAL_DFT_SCALE;
     if (src.data == dst.data)
         f |= CV_HAL_DFT_IS_INPLACE;
-    hal::DftContext c;
-    hal::dftInit2D(c, src.cols, src.rows, depth, src.channels(), dst.channels(), f, nonzero_rows);
-    hal::dft2D(c, src.data, (int)src.step, dst.data, (int)dst.step);
-    hal::dftFree2D(c);
+    Ptr<hal::DFT2D> c = hal::DFT2D::create(src.cols, src.rows, depth, src.channels(), dst.channels(), f, nonzero_rows);
+    c->apply(src.data, src.step, dst.data, dst.step);
 }
 
 
@@ -3607,7 +3579,7 @@ namespace cv
    http://www.ece.utexas.edu/~bevans/courses/ee381k/lectures/09_DCT/lecture9/:
 */
 template<typename T> static void
-DCT( const OcvDftOptions & c, const T* src, int src_step, T* dft_src, T* dft_dst, T* dst, int dst_step,
+DCT( const OcvDftOptions & c, const T* src, size_t src_step, T* dft_src, T* dft_dst, T* dst, size_t dst_step,
      const Complex<T>* dct_wave )
 {
     static const T sin_45 = (T)0.70710678118654752440084436210485;
@@ -3650,7 +3622,7 @@ DCT( const OcvDftOptions & c, const T* src, int src_step, T* dft_src, T* dft_dst
 
 
 template<typename T> static void
-IDCT( const OcvDftOptions & c, const T* src, int src_step, T* dft_src, T* dft_dst, T* dst, int dst_step,
+IDCT( const OcvDftOptions & c, const T* src, size_t src_step, T* dft_src, T* dft_dst, T* dst, size_t dst_step,
       const Complex<T>* dct_wave)
 {
     static const T sin_45 = (T)0.70710678118654752440084436210485;
@@ -3768,29 +3740,29 @@ DCTInit( int n, int elem_size, void* _wave, int inv )
 }
 
 
-typedef void (*DCTFunc)(const OcvDftOptions & c, const void* src, int src_step, void* dft_src,
-                        void* dft_dst, void* dst, int dst_step, const void* dct_wave);
+typedef void (*DCTFunc)(const OcvDftOptions & c, const void* src, size_t src_step, void* dft_src,
+                        void* dft_dst, void* dst, size_t dst_step, const void* dct_wave);
 
-static void DCT_32f(const OcvDftOptions & c, const float* src, int src_step, float* dft_src, float* dft_dst,
-                    float* dst, int dst_step, const Complexf* dct_wave)
+static void DCT_32f(const OcvDftOptions & c, const float* src, size_t src_step, float* dft_src, float* dft_dst,
+                    float* dst, size_t dst_step, const Complexf* dct_wave)
 {
     DCT(c, src, src_step, dft_src, dft_dst, dst, dst_step, dct_wave);
 }
 
-static void IDCT_32f(const OcvDftOptions & c, const float* src, int src_step, float* dft_src, float* dft_dst,
-                    float* dst, int dst_step, const Complexf* dct_wave)
+static void IDCT_32f(const OcvDftOptions & c, const float* src, size_t src_step, float* dft_src, float* dft_dst,
+                    float* dst, size_t dst_step, const Complexf* dct_wave)
 {
     IDCT(c, src, src_step, dft_src, dft_dst, dst, dst_step, dct_wave);
 }
 
-static void DCT_64f(const OcvDftOptions & c, const double* src, int src_step, double* dft_src, double* dft_dst,
-                    double* dst, int dst_step, const Complexd* dct_wave)
+static void DCT_64f(const OcvDftOptions & c, const double* src, size_t src_step, double* dft_src, double* dft_dst,
+                    double* dst, size_t dst_step, const Complexd* dct_wave)
 {
     DCT(c, src, src_step, dft_src, dft_dst, dst, dst_step, dct_wave);
 }
 
-static void IDCT_64f(const OcvDftOptions & c, const double* src, int src_step, double* dft_src, double* dft_dst,
-                     double* dst, int dst_step, const Complexd* dct_wave)
+static void IDCT_64f(const OcvDftOptions & c, const double* src, size_t src_step, double* dft_src, double* dft_dst,
+                     double* dst, size_t dst_step, const Complexd* dct_wave)
 {
     IDCT(c, src, src_step, dft_src, dft_dst, dst, dst_step, dct_wave);
 }
@@ -4058,7 +4030,7 @@ static bool ippi_DCT_32f(const uchar * src, int src_step, uchar * dst, int dst_s
 
 namespace cv {
 
-class OcvDctImpl
+class OcvDctImpl : public hal::DCT2D
 {
 public:
     OcvDftOptions opt;
@@ -4110,7 +4082,7 @@ public:
             end_stage = 1;
         }
     }
-    void run(uchar * src, int src_step, uchar * dst, int dst_step)
+    void apply(const uchar *src, size_t src_step, uchar *dst, size_t dst_step)
     {
         CV_IPP_RUN(IPP_VERSION_X100 >= 700 && depth == CV_32F, ippi_DCT_32f(src, src_step, dst, dst_step, width, height, isInverse, isRowTransform))
 
@@ -4183,69 +4155,65 @@ public:
                 prev_len = len;
             }
             // otherwise reuse the tables calculated on the previous stage
-            for(int i = 0; i < count; i++ )
+            for(unsigned i = 0; i < static_cast<unsigned>(count); i++ )
             {
-                dct_func( opt, sptr + i*sstep0, (int)sstep1, src_dft_buf, dst_dft_buf,
-                          dptr + i*dstep0, (int)dstep1, dct_wave);
+                dct_func( opt, sptr + i*sstep0, sstep1, src_dft_buf, dst_dft_buf,
+                          dptr + i*dstep0, dstep1, dct_wave);
             }
             src = dst;
             src_step = dst_step;
         }
-
     }
-    void free() {}
 };
 
-namespace hal {
-
-void dctInit2D(DftContext & c, int width, int height, int depth, int flags)
+struct ReplacementDCT2D : public hal::DCT2D // forwards 2D DCT calls to the cv_hal_dct*2D replacement hooks
 {
-    int res = cv_hal_dctInit2D(&c.impl, width, height, depth, flags);
-    if (res == CV_HAL_ERROR_OK)
+    cvhalDFT *context; // opaque context filled in by cv_hal_dctInit2D
+    bool isInitialized; // true only when cv_hal_dctInit2D returned CV_HAL_ERROR_OK
+
+    ReplacementDCT2D() : context(0), isInitialized(false) {}
+    bool init(int width, int height, int depth, int flags) // returns false when the replacement did not report CV_HAL_ERROR_OK
     {
-        c.useReplacement = true;
-        return;
+        int res = cv_hal_dctInit2D(&context, width, height, depth, flags); // go through the overridable cv_hal_ name, not the hal_ni_ stub
+        isInitialized = (res == CV_HAL_ERROR_OK);
+        return isInitialized;
     }
-    c.useReplacement = false;
-    OcvDctImpl * impl = new OcvDctImpl();
-    impl->init(width, height, depth, flags);
-    c.impl = impl;
-}
-
-void dct2D(const DftContext & c, const void * src, int src_step, void * dst, int dst_step)
-{
-    if (c.useReplacement)
+    void apply(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step) // does nothing unless init() succeeded
     {
-        int res = cv_hal_dct2D(c.impl, src, src_step, dst, dst_step);
-        if (res != CV_HAL_ERROR_OK)
+        if (isInitialized)
         {
-            CV_Error( CV_StsNotImplemented, "Custom HAL implementation failed to call dctRun");
+            CALL_HAL(dct2D, cv_hal_dct2D, context, src_data, src_step, dst_data, dst_step);
         }
-        return;
     }
-    OcvDctImpl * impl = (OcvDctImpl*)c.impl;
-    impl->run((uchar*)src, src_step, (uchar*)dst, dst_step);
-}
+    ~ReplacementDCT2D() // releases the context, only if init() succeeded
+    {
+        if (isInitialized)
+        {
+            CALL_HAL(dctFree2D, cv_hal_dctFree2D, context);
+        }
+    }
+};
+
+namespace hal {
 
-void dctFree2D(DftContext & c)
+Ptr<DCT2D> DCT2D::create(int width, int height, int depth, int flags) // factory: tries ReplacementDCT2D first, otherwise returns an OcvDctImpl
 {
-    if (c.useReplacement)
     {
-        int res = cv_hal_dctFree2D(c.impl);
-        if (res != CV_HAL_ERROR_OK)
+        ReplacementDCT2D *impl = new ReplacementDCT2D();
+        if (impl->init(width, height, depth, flags)) // replacement accepted this configuration
         {
-            CV_Error( CV_StsNotImplemented, "Custom HAL implementation failed to call dctFree");
+            return Ptr<DCT2D>(impl);
         }
-        return;
+        delete impl; // replacement declined; fall through to the built-in implementation
+    }
+    {
+        OcvDctImpl *impl = new OcvDctImpl();
+        impl->init(width, height, depth, flags);
+        return Ptr<DCT2D>(impl);
     }
-    OcvDctImpl * impl = (OcvDctImpl*)c.impl;
-    impl->free();
-    delete impl;
-    c.impl = 0;
 }
 
 } // cv::hal::
-
 } // cv::
 
 void cv::dct( InputArray _src0, OutputArray _dst, int flags )
@@ -4265,10 +4233,8 @@ void cv::dct( InputArray _src0, OutputArray _dst, int flags )
     if (src.isContinuous() && dst.isContinuous())
         f |= CV_HAL_DFT_IS_CONTINUOUS;
 
-    hal::DftContext c;
-    hal::dctInit2D(c, src.cols, src.rows, depth, f);
-    hal::dct2D(c, (void*)src.data, (int)src.step, (void*)dst.data, (int)dst.step);
-    hal::dctFree2D(c);
+    Ptr<hal::DCT2D> c = hal::DCT2D::create(src.cols, src.rows, depth, f);
+    c->apply(src.data, src.step, dst.data, dst.step);
 }
 
 
diff --git a/modules/core/src/hal_replacement.hpp b/modules/core/src/hal_replacement.hpp
index bbf32f39d8..93476c4594 100644
--- a/modules/core/src/hal_replacement.hpp
+++ b/modules/core/src/hal_replacement.hpp
@@ -376,38 +376,109 @@ inline int hal_ni_merge64s(const int64 **src_data, int64 *dst_data, int len, int
 #define cv_hal_merge64s hal_ni_merge64s
 //! @endcond
 
-//! @}
-
-#if defined __GNUC__
-#  pragma GCC diagnostic pop
-#elif defined _MSC_VER
-#  pragma warning( pop )
-#endif
+/**
+@brief Dummy structure storing DFT/DCT context
+
+Users can convert this pointer to any type they want. Initialisation and destruction should be performed in the Init and Free function implementations respectively.
+Example:
+@code{.cpp}
+int my_hal_dftInit2D(cvhalDFT **context, ...) {
+    *context = reinterpret_cast<cvhalDFT*>(new MyFilterData());
+    return CV_HAL_ERROR_OK; // ... after initialising the new context
+}
+
+int my_hal_dftFree2D(cvhalDFT *context) {
+    delete reinterpret_cast<MyFilterData*>(context);
+    return CV_HAL_ERROR_OK;
+}
+@endcode
+ */
+struct cvhalDFT {};
 
-inline int hal_ni_dftInit1D(void**, int, int, int, int, bool*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_dft1D(const void*, const void*, void*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_dftFree1D(void*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+@param context double pointer to context storing all necessary data
+@param len transformed array length
+@param count estimated transformation count
+@param depth array type (CV_32F or CV_64F)
+@param flags algorithm options (combination of CV_HAL_DFT_INVERSE, CV_HAL_DFT_SCALE, ...)
+@param needBuffer pointer to boolean variable, if valid pointer provided, then variable value should be set to true to signal that additional memory buffer is needed for operations
+ */
+inline int hal_ni_dftInit1D(cvhalDFT **context, int len, int count, int depth, int flags, bool *needBuffer) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+@param context pointer to context storing all necessary data
+@param src source data
+@param dst destination data
+ */
+inline int hal_ni_dft1D(cvhalDFT *context, const uchar *src, uchar *dst) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+@param context pointer to context storing all necessary data
+ */
+inline int hal_ni_dftFree1D(cvhalDFT *context) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 
+//! @cond IGNORED
 #define cv_hal_dftInit1D hal_ni_dftInit1D
 #define cv_hal_dft1D hal_ni_dft1D
 #define cv_hal_dftFree1D hal_ni_dftFree1D
+//! @endcond
 
-inline int hal_ni_dftInit2D(void **, int, int, int, int, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_dft2D(const void *, const void *, int, void *, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_dftFree2D(void *) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+@param context double pointer to context storing all necessary data
+@param width,height image dimensions
+@param depth image type (CV_32F or CV_64F)
+@param src_channels number of channels in input image
+@param dst_channels number of channels in output image
+@param flags algorithm options (combination of CV_HAL_DFT_INVERSE, ...)
+@param nonzero_rows number of nonzero rows in image, can be used for optimization
+ */
+inline int hal_ni_dftInit2D(cvhalDFT **context, int width, int height, int depth, int src_channels, int dst_channels, int flags, int nonzero_rows) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+@param context pointer to context storing all necessary data
+@param src_data,src_step source image data and step
+@param dst_data,dst_step destination image data and step
+ */
+inline int hal_ni_dft2D(cvhalDFT *context, const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+@param context pointer to context storing all necessary data
+ */
+inline int hal_ni_dftFree2D(cvhalDFT *context) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 
+//! @cond IGNORED
 #define cv_hal_dftInit2D hal_ni_dftInit2D
 #define cv_hal_dft2D hal_ni_dft2D
 #define cv_hal_dftFree2D hal_ni_dftFree2D
+//! @endcond
 
+/**
+@param context double pointer to context storing all necessary data
+@param width,height image dimensions
+@param depth image type (CV_32F or CV_64F)
+@param flags algorithm options (combination of CV_HAL_DFT_INVERSE, ...)
+ */
+inline int hal_ni_dctInit2D(cvhalDFT **context, int width, int height, int depth, int flags) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+@param context pointer to context storing all necessary data
+@param src_data,src_step source image data and step
+@param dst_data,dst_step destination image data and step
+ */
+inline int hal_ni_dct2D(cvhalDFT *context, const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+@param context pointer to context storing all necessary data
+ */
+inline int hal_ni_dctFree2D(cvhalDFT *context) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 
-inline int hal_ni_dctInit2D(void **, int, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_dct2D(const void *, const void *, int, void *, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_dctFree2D(void *) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-
+//! @cond IGNORED
 #define cv_hal_dctInit2D hal_ni_dctInit2D
 #define cv_hal_dct2D hal_ni_dct2D
 #define cv_hal_dctFree2D hal_ni_dctFree2D
+//! @endcond
+
+//! @}
+
+#if defined __GNUC__
+#  pragma GCC diagnostic pop
+#elif defined _MSC_VER
+#  pragma warning( pop )
+#endif
 
 #include "custom_hal.hpp"
 
diff --git a/modules/imgproc/src/templmatch.cpp b/modules/imgproc/src/templmatch.cpp
index 4e89582798..019c41f33b 100644
--- a/modules/imgproc/src/templmatch.cpp
+++ b/modules/imgproc/src/templmatch.cpp
@@ -700,8 +700,7 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
 
     buf.resize(bufSize);
 
-    hal::DftContext c;
-    hal::dftInit2D(c, dftsize.width, dftsize.height, dftTempl.depth(), 1, 1, CV_HAL_DFT_IS_INPLACE, templ.rows);
+    Ptr<hal::DFT2D> c = hal::DFT2D::create(dftsize.width, dftsize.height, dftTempl.depth(), 1, 1, CV_HAL_DFT_IS_INPLACE, templ.rows);
 
     // compute DFT of each template plane
     for( k = 0; k < tcn; k++ )
@@ -726,11 +725,9 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
             Mat part(dst, Range(0, templ.rows), Range(templ.cols, dst.cols));
             part = Scalar::all(0);
         }
-        hal::dft2D(c, dst.data, (int)dst.step, dst.data, (int)dst.step);
+        c->apply(dst.data, (int)dst.step, dst.data, (int)dst.step);
     }
 
-    hal::dftFree2D(c);
-
     int tileCountX = (corr.cols + blocksize.width - 1)/blocksize.width;
     int tileCountY = (corr.rows + blocksize.height - 1)/blocksize.height;
     int tileCount = tileCountX * tileCountY;
@@ -747,11 +744,11 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
     }
     borderType |= BORDER_ISOLATED;
 
-    hal::DftContext cF, cR;
+    Ptr<hal::DFT2D> cF, cR;
     int f = CV_HAL_DFT_IS_INPLACE;
     int f_inv = f | CV_HAL_DFT_INVERSE | CV_HAL_DFT_SCALE;
-    hal::dftInit2D(cF, dftsize.width, dftsize.height, maxDepth, 1, 1, f, blocksize.height + templ.rows - 1);
-    hal::dftInit2D(cR, dftsize.width, dftsize.height, maxDepth, 1, 1, f_inv, blocksize.height);
+    cF = hal::DFT2D::create(dftsize.width, dftsize.height, maxDepth, 1, 1, f, blocksize.height + templ.rows - 1);
+    cR = hal::DFT2D::create(dftsize.width, dftsize.height, maxDepth, 1, 1, f_inv, blocksize.height);
 
     // calculate correlation by blocks
     for( i = 0; i < tileCount; i++ )
@@ -791,7 +788,7 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
                                x1-x0, dst.cols-dst1.cols-(x1-x0), borderType);
 
             if (bsz.height == blocksize.height)
-                hal::dft2D(cF, dftImg.data, (int)dftImg.step, dftImg.data, (int)dftImg.step);
+                cF->apply(dftImg.data, (int)dftImg.step, dftImg.data, (int)dftImg.step);
             else
                 dft( dftImg, dftImg, 0, dsz.height );
 
@@ -800,7 +797,7 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
             mulSpectrums(dftImg, dftTempl1, dftImg, 0, true);
 
             if (bsz.height == blocksize.height)
-                hal::dft2D(cR, dftImg.data, (int)dftImg.step, dftImg.data, (int)dftImg.step);
+                cR->apply(dftImg.data, (int)dftImg.step, dftImg.data, (int)dftImg.step);
             else
                 dft( dftImg, dftImg, DFT_INVERSE + DFT_SCALE, bsz.height );
 
@@ -834,8 +831,6 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
             }
         }
     }
-    hal::dftFree2D(cF);
-    hal::dftFree2D(cR);
 }
 
 static void matchTemplateMask( InputArray _img, InputArray _templ, OutputArray _result, int method, InputArray _mask )