From e087cc8fd10d0cabf39c3d8d2980f7b34251232d Mon Sep 17 00:00:00 2001 From: llh721113 Date: Mon, 26 Jun 2023 20:37:34 +0800 Subject: [PATCH] feat: NDSRVP Filter --- 3rdparty/ndsrvp/include/imgproc.hpp | 30 +++ 3rdparty/ndsrvp/src/cvutils.cpp | 34 +++ 3rdparty/ndsrvp/src/cvutils.hpp | 160 ++++++++++++++ 3rdparty/ndsrvp/src/filter.cpp | 321 ++++++++++++++++++++++++++++ 4 files changed, 545 insertions(+) create mode 100644 3rdparty/ndsrvp/src/filter.cpp diff --git a/3rdparty/ndsrvp/include/imgproc.hpp b/3rdparty/ndsrvp/include/imgproc.hpp index 94104f0b71..db0ee05132 100644 --- a/3rdparty/ndsrvp/include/imgproc.hpp +++ b/3rdparty/ndsrvp/include/imgproc.hpp @@ -5,6 +5,8 @@ #ifndef OPENCV_NDSRVP_IMGPROC_HPP #define OPENCV_NDSRVP_IMGPROC_HPP +struct cvhalFilter2D; + namespace cv { namespace ndsrvp { @@ -71,6 +73,34 @@ int threshold(const uchar* src_data, size_t src_step, #undef cv_hal_threshold #define cv_hal_threshold (cv::ndsrvp::threshold) +// ################ filter ################ + +int filterInit(cvhalFilter2D **context, + uchar *kernel_data, size_t kernel_step, + int kernel_type, int kernel_width, + int kernel_height, int max_width, int max_height, + int src_type, int dst_type, int borderType, + double delta, int anchor_x, int anchor_y, + bool allowSubmatrix, bool allowInplace); + +#undef cv_hal_filterInit +#define cv_hal_filterInit (cv::ndsrvp::filterInit) + +int filter(cvhalFilter2D *context, + const uchar *src_data, size_t src_step, + uchar *dst_data, size_t dst_step, + int width, int height, + int full_width, int full_height, + int offset_x, int offset_y); + +#undef cv_hal_filter +#define cv_hal_filter (cv::ndsrvp::filter) + +int filterFree(cvhalFilter2D *context); + +#undef cv_hal_filterFree +#define cv_hal_filterFree (cv::ndsrvp::filterFree) + } // namespace ndsrvp } // namespace cv diff --git a/3rdparty/ndsrvp/src/cvutils.cpp b/3rdparty/ndsrvp/src/cvutils.cpp index 48e025488f..6afac5136d 100644 --- a/3rdparty/ndsrvp/src/cvutils.cpp +++ b/3rdparty/ndsrvp/src/cvutils.cpp @@ -73,6 +73,40 @@ int borderInterpolate(int p, int len, int borderType) return p; } +int16x4_t borderInterpolate_vector(int16x4_t vp, short len, int borderType) +{ + int16x4_t vzero = (int16x4_t){0, 0, 0, 0}; + int16x4_t vone = (int16x4_t){1, 1, 1, 1}; + int16x4_t vlen = (int16x4_t){len, len, len, len}; + if(borderType == CV_HAL_BORDER_REPLICATE) + vp = (int16x4_t)__nds__bpick(0, __nds__bpick((long)(vlen - 1), (long)vp, (long)(vp >= vlen)), (long)(vp < 0)); + else if(borderType == CV_HAL_BORDER_REFLECT || borderType == CV_HAL_BORDER_REFLECT_101) + { + int16x4_t vdelta = (borderType == CV_HAL_BORDER_REFLECT_101) ? vone : vzero; + if(len == 1) + return vzero; + do + { + int16x4_t vneg = -vp - 1 + vdelta; + int16x4_t vpos = vlen - 1 - (vp - vlen) - vdelta; + vp = (int16x4_t)__nds__bpick((long)vneg, __nds__bpick((long)vpos, (long)vp, (long)(vp >= vlen)), (long)(vp < 0)); + } + while( (long)(vp >= vlen) || (long)(vp < 0) ); + } + else if(borderType == CV_HAL_BORDER_WRAP) + { + ndsrvp_assert(len > 0); + int16x4_t vneg = vp - ((vp - vlen + 1) / vlen) * vlen; + int16x4_t vpos = vp % vlen; + vp = (int16x4_t)__nds__bpick((long)vneg, __nds__bpick((long)vpos, (long)vp, (long)(vp >= vlen)), (long)(vp < 0)); + } + else if(borderType == CV_HAL_BORDER_CONSTANT) + vp = (int16x4_t)__nds__bpick((long)-vone, (long)vp, (long)(vp < 0 || vp >= vlen)); + else + ndsrvp_error(Error::StsBadArg, "borderInterpolate_vector(): Unknown/unsupported border type"); + return vp; +} + } // namespace ndsrvp } // namespace cv diff --git a/3rdparty/ndsrvp/src/cvutils.hpp b/3rdparty/ndsrvp/src/cvutils.hpp index 8cf1476ed6..78bb11d95f 100644 --- a/3rdparty/ndsrvp/src/cvutils.hpp +++ b/3rdparty/ndsrvp/src/cvutils.hpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -26,16 +27,26 @@ namespace ndsrvp { void* fastMalloc(size_t size); void fastFree(void* ptr); int borderInterpolate(int p, int len, int borderType); +int16x4_t borderInterpolate_vector(int16x4_t vp, short len, int borderType); #ifndef MAX # define MAX(a,b) ((a) < (b) ? (b) : (a)) #endif +#ifndef MIN +# define MIN(a,b) ((a) > (b) ? (b) : (a)) +#endif + #define CV_MAT_CN_MASK ((CV_CN_MAX - 1) << CV_CN_SHIFT) #define CV_MAT_CN(flags) ((((flags) & CV_MAT_CN_MASK) >> CV_CN_SHIFT) + 1) +#define CV_ELEM_SIZE1(type) ((0x28442211 >> CV_MAT_DEPTH(type)*4) & 15) +#define CV_ELEM_SIZE(type) (CV_MAT_CN(type)*CV_ELEM_SIZE1(type)) + #define CV_MALLOC_ALIGN 64 +inline size_t getElemSize(int type) { return (size_t)CV_ELEM_SIZE(type); } + // error codes enum Error{ @@ -69,6 +80,135 @@ inline int32x2_t vclip(int32x2_t x, int32x2_t a, int32x2_t b) return (int32x2_t)__nds__bpick((long)a, __nds__bpick((long)(b - 1), (long)x, (long)(x < b)), (long)(x >= a)); } +// expand + +/* + [0] [1] [2] [3] [4] [5] [6] [7] +810 [ 0 ] [ 1 ] [ 4 ] [ 5 ] +832 [ 2 ] [ 3 ] [ 6 ] [ 7 ] +bb [ 0 ] [ 1 ] [ 2 ] [ 3 ] +tt [ 4 ] [ 5 ] [ 6 ] [ 7 ] +*/ + +inline void ndsrvp_u8_u16_expand8(const unsigned long vs, ushort* dst) +{ + unsigned long vs810 = __nds__zunpkd810(vs); + unsigned long vs832 = __nds__zunpkd832(vs); + *(unsigned long*)dst = __nds__pkbb32(vs832, vs810); + *(unsigned long*)(dst + 4) = __nds__pktt32(vs832, vs810); +} + +/* + [0] [1] [2] [3] [4] [5] [6] [7] +820 [ 0 ] [ 2 ] [ 4 ] [ 6 ] +831 [ 1 ] [ 3 ] [ 5 ] [ 7 ] +bb [ 0 ] [ 2 ] [ 1 ] [ 3 ] +tt [ 4 ] [ 6 ] [ 5 ] [ 7 ] +*/ + +inline void ndsrvp_u8_u16_eswap8(const unsigned long vs, ushort* dst) +{ + unsigned long vs820 = __nds__zunpkd820(vs); + unsigned long vs831 = __nds__zunpkd831(vs); + *(unsigned long*)dst = __nds__pkbb32(vs831, vs820); + *(unsigned long*)(dst + 4) = __nds__pktt32(vs831, vs820); +} + +/* + [0] [1] [2] [3] [4] [5] [6] [7] +820 [ 0 ] [ 2 ] [ 4 ] [ 6 ] +831 [ 1 ] [ 3 ] [ 5 ] [ 7 ] +bb [ 0 ] [ 2 ] [ 1 ] [ 3 ] +tt [ 4 ] [ 6 ] [ 5 ] [ 7 ] +bbbb[ 0 ] [ 1 ] +bbtt[ 2 ] [ 3 ] +ttbb[ 4 ] [ 5 ] +tttt[ 6 ] [ 7 ] +*/ + + +inline void ndsrvp_u8_u32_expand8(const unsigned long vs, uint* dst) +{ + unsigned long vs820 = __nds__zunpkd820(vs); + unsigned long vs831 = __nds__zunpkd831(vs); + unsigned long vsbb = __nds__pkbb32(vs831, vs820); + unsigned long vstt = __nds__pktt32(vs831, vs820); + *(unsigned long*)dst = __nds__pkbb16(0, vsbb); + *(unsigned long*)(dst + 2) = __nds__pktt16(0, vsbb); + *(unsigned long*)(dst + 4) = __nds__pkbb16(0, vstt); + *(unsigned long*)(dst + 6) = __nds__pktt16(0, vstt); +} + +// float replacement + +inline void ndsrvp_f32_add8(const float* a, const float* b, float* c) +{ + c[0] = a[0] + b[0]; + c[1] = a[1] + b[1]; + c[2] = a[2] + b[2]; + c[3] = a[3] + b[3]; + c[4] = a[4] + b[4]; + c[5] = a[5] + b[5]; + c[6] = a[6] + b[6]; + c[7] = a[7] + b[7]; +} + +/* + [1] [8] [23] + [24] [8] +*/ + +inline void ndsrvp_f32_u8_mul8(const float* a, const unsigned long b, float* c) // experimental, not bit exact +{ + const int mask_frac = 0x007FFFFF; + const int mask_sign = 0x7FFFFFFF; + const int mask_lead = 0x40000000; + const int ofs_exp = 23; + + uint32x2_t va01 = *(uint32x2_t*)a; + uint32x2_t va23 = *(uint32x2_t*)(a + 2); + uint32x2_t va45 = *(uint32x2_t*)(a + 4); + uint32x2_t va67 = *(uint32x2_t*)(a + 6); + + uint32x2_t vaexp01 = va01 >> ofs_exp; + uint32x2_t vaexp23 = va23 >> ofs_exp; + uint32x2_t vaexp45 = va45 >> ofs_exp; + uint32x2_t vaexp67 = va67 >> ofs_exp; + + uint32x2_t vafrac01 = ((va01 << 7) & mask_sign) | mask_lead; + uint32x2_t vafrac23 = ((va23 << 7) & mask_sign) | mask_lead; + uint32x2_t vafrac45 = ((va45 << 7) & mask_sign) | mask_lead; + uint32x2_t vafrac67 = ((va67 << 7) & mask_sign) | mask_lead; + + int16x4_t vb[2]; // fake signed for signed multiply + ndsrvp_u8_u16_eswap8(b, (ushort*)vb); + + vafrac01 = (uint32x2_t)__nds__kmmwb2_u((long)vafrac01, (unsigned long)vb[0]); + vafrac23 = (uint32x2_t)__nds__kmmwt2_u((long)vafrac23, (unsigned long)vb[0]); + vafrac45 = (uint32x2_t)__nds__kmmwb2_u((long)vafrac45, (unsigned long)vb[1]); + vafrac67 = (uint32x2_t)__nds__kmmwt2_u((long)vafrac67, (unsigned long)vb[1]); + + uint32x2_t vaclz01 = __nds__v_clz32(vafrac01) - 8; + uint32x2_t vaclz23 = __nds__v_clz32(vafrac23) - 8; + uint32x2_t vaclz45 = __nds__v_clz32(vafrac45) - 8; + uint32x2_t vaclz67 = __nds__v_clz32(vafrac67) - 8; + + vaexp01 += 8 - vaclz01; + vaexp23 += 8 - vaclz23; + vaexp45 += 8 - vaclz45; + vaexp67 += 8 - vaclz67; + + vafrac01 <<= vaclz01; + vafrac23 <<= vaclz23; + vafrac45 <<= vaclz45; + vafrac67 <<= vaclz67; + + *(uint32x2_t*)c = (vaexp01 << ofs_exp) | (vafrac01 & mask_frac); + *(uint32x2_t*)(c + 2) = (vaexp23 << ofs_exp) | (vafrac23 & mask_frac); + *(uint32x2_t*)(c + 4) = (vaexp45 << ofs_exp) | (vafrac45 & mask_frac); + *(uint32x2_t*)(c + 6) = (vaexp67 << ofs_exp) | (vafrac67 & mask_frac); +} + // saturate template static inline _Tp saturate_cast(int v) { return _Tp(v); } @@ -94,6 +234,26 @@ template<> inline short saturate_cast(double v) { return saturate_cas template<> inline int saturate_cast(float v) { return (int)lrintf(v); } template<> inline int saturate_cast(double v) { return (int)lrint(v); } +inline double cast_ptr_to_double(const uchar* v, int depth) { + switch (depth) { + case CV_8U: return (double)*(uchar*)v; + case CV_8S: return (double)*(char*)v; + case CV_16U: return (double)*(ushort*)v; + case CV_16S: return (double)*(short*)v; + case CV_32S: return (double)*(int*)v; + case CV_32F: return (double)*(float*)v; + case CV_64F: return (double)*(double*)v; + case CV_16F: return (double)*(float*)v; + default: return 0; + } +} + +template +inline _Tp data_at(const uchar* data, int step, int y, int x, int cn) +{ + return ((_Tp*)(data + y * step))[x * cn]; +} + // align inline long align(size_t v, int n) diff --git a/3rdparty/ndsrvp/src/filter.cpp b/3rdparty/ndsrvp/src/filter.cpp new file mode 100644 index 0000000000..89508eea11 --- /dev/null +++ b/3rdparty/ndsrvp/src/filter.cpp @@ -0,0 +1,321 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "ndsrvp_hal.hpp" +#include "opencv2/imgproc/hal/interface.h" +#include "cvutils.hpp" + +namespace cv { + +namespace ndsrvp { + +class FilterData +{ +public: + FilterData(uchar *_kernel_data, size_t _kernel_step, int _kernel_type, int _src_type, int _dst_type, int _borderType, + int _kernel_width, int _kernel_height, int _max_width, int _max_height, double _delta, int _anchor_x, int _anchor_y) + : kernel_data(_kernel_data), kernel_step(_kernel_step), kernel_type(_kernel_type), src_type(_src_type), dst_type(_dst_type), borderType(_borderType), + kernel_width(_kernel_width), kernel_height(_kernel_height), max_width(_max_width), max_height(_max_height), delta(_delta), anchor_x(_anchor_x), anchor_y(_anchor_y) + { + } + + uchar *kernel_data; + size_t kernel_step; // bytes between rows(height) + int kernel_type, src_type, dst_type, borderType; + int kernel_width, kernel_height; + int max_width, max_height; + double delta; + int anchor_x, anchor_y; + std::vector coords; + std::vector coeffs; + int nz; + std::vector padding; +}; + +static int countNonZero(const FilterData* ctx) +{ + int i, j, nz = 0; + const uchar* ker_row = ctx->kernel_data; + for( i = 0; i < ctx->kernel_height; i++, ker_row += ctx->kernel_step ) + { + for( j = 0; j < ctx->kernel_width; j++ ) + { + if( ((float*)ker_row)[j] != 0.0 ) + nz++; + } + } + return nz; +} + +static void preprocess2DKernel(FilterData* ctx) +{ + int i, j, k, nz = countNonZero(ctx), ktype = ctx->kernel_type; + if(nz == 0) + nz = 1; // (0, 0) == 0 by default + ndsrvp_assert( ktype == CV_32F ); + + ctx->coords.resize(nz * 2); + ctx->coeffs.resize(nz); + + const uchar* ker_row = ctx->kernel_data; + for( i = k = 0; i < ctx->kernel_height; i++, ker_row += ctx->kernel_step ) + { + for( j = 0; j < ctx->kernel_width; j++ ) + { + float val = ((float*)ker_row)[j]; + if( val == 0.0 ) + continue; + ctx->coords[k * 2] = j; + ctx->coords[k * 2 + 1] = i; + ctx->coeffs[k++] = val; + } + } + + ctx->nz = k; +} + +int filterInit(cvhalFilter2D **context, + uchar *kernel_data, size_t kernel_step, + int kernel_type, int kernel_width, + int kernel_height, int max_width, int max_height, + int src_type, int dst_type, int borderType, + double delta, int anchor_x, int anchor_y, + bool allowSubmatrix, bool allowInplace) +{ + int sdepth = CV_MAT_DEPTH(src_type), ddepth = CV_MAT_DEPTH(dst_type); + int cn = CV_MAT_CN(src_type), kdepth = kernel_type; + + (void)allowSubmatrix; + (void)allowInplace; + + if(delta - (int)delta != 0.0) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + if(kdepth != CV_32F || (sdepth != CV_8U && sdepth != CV_16U) || ddepth != sdepth) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + FilterData *ctx = new FilterData(kernel_data, kernel_step, kernel_type, src_type, dst_type, borderType, + kernel_width, kernel_height, max_width, max_height, delta, anchor_x, anchor_y); + + *context = (cvhalFilter2D*)ctx; + + ndsrvp_assert(cn == CV_MAT_CN(dst_type) && ddepth >= sdepth); + + preprocess2DKernel(ctx); + + return CV_HAL_ERROR_OK; +} + +int filter(cvhalFilter2D *context, + const uchar *src_data, size_t src_step, + uchar *dst_data, size_t dst_step, + int width, int height, + int full_width, int full_height, + int offset_x, int offset_y) +{ + FilterData *ctx = (FilterData*)context; + + int cn = CV_MAT_CN(ctx->src_type); + int cnes = CV_ELEM_SIZE(ctx->src_type); + int ddepth = CV_MAT_DEPTH(ctx->dst_type); + float delta_sat = (uchar)(ctx->delta); + if(ddepth == CV_8U) + delta_sat = (float)saturate_cast(ctx->delta); + else if(ddepth == CV_16U) + delta_sat = (float)saturate_cast(ctx->delta); + + // fetch original image data + const uchar *ogn_data = src_data - offset_y * src_step - offset_x * cnes; + int ogn_step = src_step; + + // ROI fully used in the computation + int cal_width = width + ctx->kernel_width - 1; + int cal_height = height + ctx->kernel_height - 1; + int cal_x = offset_x - ctx->anchor_x; + int cal_y = offset_y - ctx->anchor_y; + + // calculate source border + ctx->padding.resize(cal_width * cal_height * cnes); + uchar* pad_data = &ctx->padding[0]; + int pad_step = cal_width * cnes; + + uchar* pad_ptr; + const uchar* ogn_ptr; + std::vector vec_zeros(cnes, 0); + for(int i = 0; i < cal_height; i++) + { + int y = borderInterpolate(i + cal_y, full_height, ctx->borderType); + if(y < 0) { + memset(pad_data + i * pad_step, 0, cnes * cal_width); + continue; + } + + // left border + int j = 0; + int16x4_t vj = {0, 1, 2, 3}; + vj += saturate_cast(cal_x); + for(; j + cal_x < -4; j += 4, vj += 4) + { + int16x4_t vx = borderInterpolate_vector(vj, full_width, ctx->borderType); + for(int k = 0; k < 4; k++) { + if(vx[k] < 0) // border constant return value -1 + ogn_ptr = &vec_zeros[0]; + else + ogn_ptr = ogn_data + y * ogn_step + vx[k] * cnes; + pad_ptr = pad_data + i * pad_step + (j + k) * cnes; + memcpy(pad_ptr, ogn_ptr, cnes); + } + } + for(; j + cal_x < 0; j++) + { + int x = borderInterpolate(j + cal_x, full_width, ctx->borderType); + if(x < 0) // border constant return value -1 + ogn_ptr = &vec_zeros[0]; + else + ogn_ptr = ogn_data + y * ogn_step + x * cnes; + pad_ptr = pad_data + i * pad_step + j * cnes; + memcpy(pad_ptr, ogn_ptr, cnes); + } + + // center + int rborder = MIN(cal_width, full_width - cal_x); + ogn_ptr = ogn_data + y * ogn_step + (j + cal_x) * cnes; + pad_ptr = pad_data + i * pad_step + j * cnes; + memcpy(pad_ptr, ogn_ptr, cnes * (rborder - j)); + + // right border + j = rborder; + vj = (int16x4_t){0, 1, 2, 3} + saturate_cast(cal_x + rborder); + for(; j <= cal_width - 4; j += 4, vj += 4) + { + int16x4_t vx = borderInterpolate_vector(vj, full_width, ctx->borderType); + for(int k = 0; k < 4; k++) { + if(vx[k] < 0) // border constant return value -1 + ogn_ptr = &vec_zeros[0]; + else + ogn_ptr = ogn_data + y * ogn_step + vx[k] * cnes; + pad_ptr = pad_data + i * pad_step + (j + k) * cnes; + memcpy(pad_ptr, ogn_ptr, cnes); + } + } + for(; j < cal_width; j++) + { + int x = borderInterpolate(j + cal_x, full_width, ctx->borderType); + if(x < 0) // border constant return value -1 + ogn_ptr = &vec_zeros[0]; + else + ogn_ptr = ogn_data + y * ogn_step + x * cnes; + pad_ptr = pad_data + i * pad_step + j * cnes; + memcpy(pad_ptr, ogn_ptr, cnes); + } + } + + // prepare the pointers + int i, k, count, nz = ctx->nz; + const uchar* ker_pts = &ctx->coords[0]; + const float* ker_cfs = &ctx->coeffs[0]; + + if( ddepth == CV_8U ) + { + std::vector src_ptrarr; + src_ptrarr.resize(nz); + uchar** src_ptrs = &src_ptrarr[0]; + uchar* dst_row = dst_data; + uchar* pad_row = pad_data; + + for( count = 0; count < height; count++, dst_row += dst_step, pad_row += pad_step ) + { + for( k = 0; k < nz; k++ ) + src_ptrs[k] = (uchar*)pad_row + ker_pts[k * 2 + 1] * pad_step + ker_pts[k * 2] * cnes; + + i = 0; + for( ; i <= width * cnes - 8; i += 8 ) + { + float vs0[8] = {delta_sat, delta_sat, delta_sat, delta_sat, delta_sat, delta_sat, delta_sat, delta_sat}; + for( k = 0; k < nz; k++ ) { + float vker_cfs[8] = {ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k]}; + // experimental code + // ndsrvp_f32_u8_mul8(vker_cfs, *(unsigned long*)(src_ptrs[k] + i), vker_cfs); + // ndsrvp_f32_add8(vs0, vker_cfs, vs0); + vs0[0] += vker_cfs[0] * src_ptrs[k][i]; + vs0[1] += vker_cfs[1] * src_ptrs[k][i + 1]; + vs0[2] += vker_cfs[2] * src_ptrs[k][i + 2]; + vs0[3] += vker_cfs[3] * src_ptrs[k][i + 3]; + vs0[4] += vker_cfs[4] * src_ptrs[k][i + 4]; + vs0[5] += vker_cfs[5] * src_ptrs[k][i + 5]; + vs0[6] += vker_cfs[6] * src_ptrs[k][i + 6]; + vs0[7] += vker_cfs[7] * src_ptrs[k][i + 7]; + } + dst_row[i] = saturate_cast(vs0[0]); + dst_row[i + 1] = saturate_cast(vs0[1]); + dst_row[i + 2] = saturate_cast(vs0[2]); + dst_row[i + 3] = saturate_cast(vs0[3]); + dst_row[i + 4] = saturate_cast(vs0[4]); + dst_row[i + 5] = saturate_cast(vs0[5]); + dst_row[i + 6] = saturate_cast(vs0[6]); + dst_row[i + 7] = saturate_cast(vs0[7]); + } + for( ; i < width * cnes; i++ ) + { + float s0 = delta_sat; + for( k = 0; k < nz; k++ ) { + s0 += ker_cfs[k] * src_ptrs[k][i]; + } + dst_row[i] = saturate_cast(s0); + } + } + } + else if( ddepth == CV_16U ) + { + std::vector src_ptrarr; + src_ptrarr.resize(nz); + ushort** src_ptrs = &src_ptrarr[0]; + uchar* dst_row = dst_data; + uchar* pad_row = pad_data; + + for( count = 0; count < height; count++, dst_row += dst_step, pad_row += pad_step ) + { + for( k = 0; k < nz; k++ ) + src_ptrs[k] = (ushort*)((uchar*)pad_row + ker_pts[k * 2 + 1] * pad_step + ker_pts[k * 2] * cnes); + + i = 0; + for( ; i <= width * cn - 4; i += 4 ) + { + float vs0[8] = {delta_sat, delta_sat, delta_sat, delta_sat}; + for( k = 0; k < nz; k++ ) { + float vker_cfs[8] = {ker_cfs[k], ker_cfs[k], ker_cfs[k], ker_cfs[k]}; + vs0[0] += vker_cfs[0] * src_ptrs[k][i]; + vs0[1] += vker_cfs[1] * src_ptrs[k][i + 1]; + vs0[2] += vker_cfs[2] * src_ptrs[k][i + 2]; + vs0[3] += vker_cfs[3] * src_ptrs[k][i + 3]; + } + ushort* dst_row_ptr = (ushort*)dst_row; + dst_row_ptr[i] = saturate_cast(vs0[0]); + dst_row_ptr[i + 1] = saturate_cast(vs0[1]); + dst_row_ptr[i + 2] = saturate_cast(vs0[2]); + dst_row_ptr[i + 3] = saturate_cast(vs0[3]); + } + for( ; i < width * cn; i++ ) + { + float s0 = delta_sat; + for( k = 0; k < nz; k++ ) { + s0 += ker_cfs[k] * src_ptrs[k][i]; + } + ((ushort*)dst_row)[i] = saturate_cast(s0); + } + } + } + + return CV_HAL_ERROR_OK; +} + +int filterFree(cvhalFilter2D *context) { + FilterData *ctx = (FilterData*)context; + delete ctx; + return CV_HAL_ERROR_OK; +} + +} // namespace ndsrvp + +} // namespace cv