/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, 2017, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/* ////////////////////////////////////////////////////////////////////
//
// Geometrical transforms on images and matrices: rotation, zoom etc.
//
// */
#include "precomp.hpp"
#include "opencl_kernels_imgproc.hpp"
#include "hal_replacement.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include "opencv2/core/openvx/ovx_defs.hpp"
#include "resize.hpp"
#include "opencv2/core/softfloat.hpp"
#include "fixedpoint.inl.hpp"
using namespace cv;
namespace
{
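// fixedtype selects a fixed-point accumulator wide enough for the given element type:
// uint8_t maps to 16-bit unsigned fixed point, int8_t/int16_t to 32-bit signed and
// uint16_t to 32-bit unsigned fixed point; everything wider falls back to 64-bit fixed point.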
template <typename ET, bool needsign> struct fixedtype { typedef fixedpoint64 type; };
template <> struct fixedtype<uint32_t, false> { typedef ufixedpoint64 type; };
template <bool needsign> struct fixedtype<int16_t, needsign> { typedef fixedpoint32 type; };
template <> struct fixedtype<uint16_t, false> { typedef ufixedpoint32 type; };
template <bool needsign> struct fixedtype<int8_t, needsign> { typedef fixedpoint32 type; };
template <> struct fixedtype<uint8_t, false> { typedef ufixedpoint16 type; };
//FT is fixedtype<ET, needsign>::type
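// Generic horizontal pass. For every destination x in [dst_min, dst_max) it reads n source
// pixels (cn interleaved channels each) starting at ofst[x] and accumulates them with the n
// fixed-point coefficients stored in m. Destination pixels left of dst_min or right of dst_max
// map outside the source row and are filled by replicating the leftmost/rightmost source pixel.
// When mulall is false, zero coefficients are skipped (used when the source row is narrower
// than the interpolation kernel).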
template <typename ET, typename FT, int n, bool mulall>
static void hlineResize(ET* src, int cn, int *ofst, FT* m, FT* dst, int dst_min, int dst_max, int dst_width)
{
int i = 0;
for (; i < dst_min; i++, m += n) // Points that fall left from src image so became equal to leftmost src point
{
for (int j = 0; j < cn; j++, dst++)
{
*dst = src[j];
}
}
for (; i < dst_max; i++, m += n)
{
ET* src_ofst = src + cn*ofst[i];
for (int j = 0; j < cn; j++, dst++)
{
*dst = (mulall || !m[0].isZero()) ? m[0] * src_ofst[j] : FT::zero();
for (int k = 1; k < n; k++)
{
*dst = *dst + ((mulall || !m[k].isZero()) ? m[k] * src_ofst[j+k*cn] : FT::zero());
}
}
}
ET* src_last = src + cn*ofst[dst_width - 1];
for (; i < dst_width; i++) // Points that fall right from src image so became equal to rightmost src point
{
for (int j = 0; j < cn; j++, dst++)
{
*dst = src_last[j];
}
}
}
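// hline wraps hlineResize in a struct so that it can be partially specialized: the
// specializations below unroll the per-channel loop for kernel lengths 2 (linear) and
// 4 (cubic) with 1 to 4 interleaved channels.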
template <typename ET, typename FT, int n, bool mulall, int cncnt> struct hline
{
static void ResizeCn(ET* src, int cn, int *ofst, FT* m, FT* dst, int dst_min, int dst_max, int dst_width)
{
hlineResize<ET, FT, n, mulall>(src, cn, ofst, m, dst, dst_min, dst_max, dst_width);
}
};
template <typename ET, typename FT> struct hline<ET, FT, 2, true, 1>
{
static void ResizeCn(ET* src, int, int *ofst, FT* m, FT* dst, int dst_min, int dst_max, int dst_width)
{
int i = 0;
FT src0(src[0]);
for (; i < dst_min; i++, m += 2) // Points that fall left from src image so became equal to leftmost src point
{
*(dst++) = src0;
}
for (; i < dst_max; i++, m += 2)
{
ET* px = src + ofst[i];
*(dst++) = m[0] * px[0] + m[1] * px[1];
}
src0 = (src + ofst[dst_width - 1])[0];
for (; i < dst_width; i++) // Points that fall right from src image so became equal to rightmost src point
{
*(dst++) = src0;
}
}
};
template <typename ET, typename FT> struct hline<ET, FT, 2, true, 2>
{
static void ResizeCn(ET* src, int, int *ofst, FT* m, FT* dst, int dst_min, int dst_max, int dst_width)
{
int i = 0;
FT src0(src[0]), src1(src[1]);
for (; i < dst_min; i++, m += 2) // Points that fall left from src image so became equal to leftmost src point
{
*(dst++) = src0;
*(dst++) = src1;
}
for (; i < dst_max; i++, m += 2)
{
ET* px = src + 2*ofst[i];
*(dst++) = m[0] * px[0] + m[1] * px[2];
*(dst++) = m[0] * px[1] + m[1] * px[3];
}
src0 = (src + 2*ofst[dst_width - 1])[0];
src1 = (src + 2*ofst[dst_width - 1])[1];
for (; i < dst_width; i++) // Points that fall right from src image so became equal to rightmost src point
{
*(dst++) = src0;
*(dst++) = src1;
}
}
};
template <typename ET, typename FT> struct hline<ET, FT, 2, true, 3>
{
static void ResizeCn(ET* src, int, int *ofst, FT* m, FT* dst, int dst_min, int dst_max, int dst_width)
{
int i = 0;
FT src0(src[0]), src1(src[1]), src2(src[2]);
for (; i < dst_min; i++, m += 2) // Points that fall left from src image so became equal to leftmost src point
{
*(dst++) = src0;
*(dst++) = src1;
*(dst++) = src2;
}
for (; i < dst_max; i++, m += 2)
{
ET* px = src + 3*ofst[i];
*(dst++) = m[0] * px[0] + m[1] * px[3];
*(dst++) = m[0] * px[1] + m[1] * px[4];
*(dst++) = m[0] * px[2] + m[1] * px[5];
}
src0 = (src + 3*ofst[dst_width - 1])[0];
src1 = (src + 3*ofst[dst_width - 1])[1];
src2 = (src + 3*ofst[dst_width - 1])[2];
for (; i < dst_width; i++) // Points that fall right from src image so became equal to rightmost src point
{
*(dst++) = src0;
*(dst++) = src1;
*(dst++) = src2;
}
}
};
template <typename ET, typename FT> struct hline<ET, FT, 2, true, 4>
{
static void ResizeCn(ET* src, int, int *ofst, FT* m, FT* dst, int dst_min, int dst_max, int dst_width)
{
int i = 0;
FT src0(src[0]), src1(src[1]), src2(src[2]), src3(src[3]);
for (; i < dst_min; i++, m += 2) // Points that fall left from src image so became equal to leftmost src point
{
*(dst++) = src0;
*(dst++) = src1;
*(dst++) = src2;
*(dst++) = src3;
}
for (; i < dst_max; i++, m += 2)
{
ET* px = src + 4*ofst[i];
*(dst++) = m[0] * px[0] + m[1] * px[4];
*(dst++) = m[0] * px[1] + m[1] * px[5];
*(dst++) = m[0] * px[2] + m[1] * px[6];
*(dst++) = m[0] * px[3] + m[1] * px[7];
}
src0 = (src + 4*ofst[dst_width - 1])[0];
src1 = (src + 4*ofst[dst_width - 1])[1];
src2 = (src + 4*ofst[dst_width - 1])[2];
src3 = (src + 4*ofst[dst_width - 1])[3];
for (; i < dst_width; i++) // Points that fall right from src image so became equal to rightmost src point
{
*(dst++) = src0;
*(dst++) = src1;
*(dst++) = src2;
*(dst++) = src3;
}
}
};
template <typename ET, typename FT> struct hline<ET, FT, 4, true, 1>
{
static void ResizeCn(ET* src, int, int *ofst, FT* m, FT* dst, int dst_min, int dst_max, int dst_width)
{
int i = 0;
FT src0(src[0]);
for (; i < dst_min; i++, m += 4) // Points that fall left from src image so became equal to leftmost src point
{
*(dst++) = src0;
}
for (; i < dst_max; i++, m += 4)
{
ET* px = src + ofst[i];
*(dst++) = m[0] * px[0] + m[1] * px[1] + m[2] * px[2] + m[3] * px[3];
}
src0 = (src + ofst[dst_width - 1])[0];
for (; i < dst_width; i++) // Points that fall right from src image so became equal to rightmost src point
{
*(dst++) = src0;
}
}
};
template <typename ET, typename FT> struct hline<ET, FT, 4, true, 2>
{
static void ResizeCn(ET* src, int, int *ofst, FT* m, FT* dst, int dst_min, int dst_max, int dst_width)
{
int i = 0;
FT src0(src[0]), src1(src[1]);
for (; i < dst_min; i++, m += 4) // Points that fall left from src image so became equal to leftmost src point
{
*(dst++) = src0;
*(dst++) = src1;
}
for (; i < dst_max; i++, m += 4)
{
ET* px = src + 2*ofst[i];
*(dst++) = m[0] * px[0] + m[1] * px[2] + m[2] * px[4] + m[3] * px[6];
*(dst++) = m[0] * px[1] + m[1] * px[3] + m[2] * px[5] + m[3] * px[7];
}
src0 = (src + 2*ofst[dst_width - 1])[0];
src1 = (src + 2*ofst[dst_width - 1])[1];
for (; i < dst_width; i++) // Points that fall right from src image so became equal to rightmost src point
{
*(dst++) = src0;
*(dst++) = src1;
}
}
};
template <typename ET, typename FT> struct hline<ET, FT, 4, true, 3>
{
static void ResizeCn(ET* src, int, int *ofst, FT* m, FT* dst, int dst_min, int dst_max, int dst_width)
{
int i = 0;
FT src0(src[0]), src1(src[1]), src2(src[2]);
for (; i < dst_min; i++, m += 4) // Points that fall left from src image so became equal to leftmost src point
{
*(dst++) = src0;
*(dst++) = src1;
*(dst++) = src2;
}
for (; i < dst_max; i++, m += 4)
{
ET* px = src + 3*ofst[i];
*(dst++) = m[0] * px[0] + m[1] * px[3] + m[2] * px[6] + m[3] * px[ 9];
*(dst++) = m[0] * px[1] + m[1] * px[4] + m[2] * px[7] + m[3] * px[10];
*(dst++) = m[0] * px[2] + m[1] * px[5] + m[2] * px[8] + m[3] * px[11];
}
src0 = (src + 3*ofst[dst_width - 1])[0];
src1 = (src + 3*ofst[dst_width - 1])[1];
src2 = (src + 3*ofst[dst_width - 1])[2];
for (; i < dst_width; i++) // Points that fall right from src image so became equal to rightmost src point
{
*(dst++) = src0;
*(dst++) = src1;
*(dst++) = src2;
}
}
};
template <typename ET, typename FT> struct hline<ET, FT, 4, true, 4>
{
static void ResizeCn(ET* src, int, int *ofst, FT* m, FT* dst, int dst_min, int dst_max, int dst_width)
{
int i = 0;
FT src0(src[0]), src1(src[1]), src2(src[2]), src3(src[3]);
for (; i < dst_min; i++, m += 4) // Points that fall left from src image so became equal to leftmost src point
{
*(dst++) = src0;
*(dst++) = src1;
*(dst++) = src2;
*(dst++) = src3;
}
for (; i < dst_max; i++, m += 4)
{
ET* px = src + 4*ofst[i];
*(dst++) = m[0] * px[0] + m[1] * px[4] + m[2] * px[ 8] + m[3] * px[12];
*(dst++) = m[0] * px[1] + m[1] * px[5] + m[2] * px[ 9] + m[3] * px[13];
*(dst++) = m[0] * px[2] + m[1] * px[6] + m[2] * px[10] + m[3] * px[14];
*(dst++) = m[0] * px[3] + m[1] * px[7] + m[2] * px[11] + m[3] * px[15];
}
src0 = (src + 4*ofst[dst_width - 1])[0];
src1 = (src + 4*ofst[dst_width - 1])[1];
src2 = (src + 4*ofst[dst_width - 1])[2];
src3 = (src + 4*ofst[dst_width - 1])[3];
for (; i < dst_width; i++) // Points that fall right from src image so became equal to rightmost src point
{
*(dst++) = src0;
*(dst++) = src1;
*(dst++) = src2;
*(dst++) = src3;
}
}
};
template <typename ET, typename FT, int n, bool mulall, int cncnt>
static void hlineResizeCn(ET* src, int cn, int *ofst, FT* m, FT* dst, int dst_min, int dst_max, int dst_width)
{
hline<ET, FT, n, mulall, cncnt>::ResizeCn(src, cn, ofst, m, dst, dst_min, dst_max, dst_width);
}
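// The specializations below vectorize the horizontal pass with the universal intrinsics
// (v_uint16x8, v_uint32x4, ...): 8-bit linear resize for 1, 2 and 4 channels and 16-bit
// linear resize for 1 channel. The scalar loops before and after the vector loops handle
// the replicated borders and the leftover tail.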
template <>
void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 1>(uint8_t* src, int, int *ofst, ufixedpoint16* m, ufixedpoint16* dst, int dst_min, int dst_max, int dst_width)
{
int i = 0;
ufixedpoint16 src_0(src[0]);
v_uint16x8 v_src_0 = v_setall_u16(*((uint16_t*)&src_0));
for (; i < dst_min - 7; i += 8, m += 16, dst += 8) // Points that fall left from src image so became equal to leftmost src point
{
v_store((uint16_t*)dst, v_src_0);
}
for (; i < dst_min; i++, m += 2)
{
*(dst++) = src_0;
}
for (; i < dst_max - 7 && ofst[i + 7] + 15 <= ofst[dst_width - 1]; i += 8, m += 16, dst += 8)
{
v_uint32x4 v_src01 = v_combine_low(v_reinterpret_as_u32(v_load_expand(src + ofst[i ])), v_reinterpret_as_u32(v_load_expand(src + ofst[i + 1])));
v_uint32x4 v_src23 = v_combine_low(v_reinterpret_as_u32(v_load_expand(src + ofst[i + 2])), v_reinterpret_as_u32(v_load_expand(src + ofst[i + 3])));
v_uint32x4 v_src45 = v_combine_low(v_reinterpret_as_u32(v_load_expand(src + ofst[i + 4])), v_reinterpret_as_u32(v_load_expand(src + ofst[i + 5])));
v_uint32x4 v_src67 = v_combine_low(v_reinterpret_as_u32(v_load_expand(src + ofst[i + 6])), v_reinterpret_as_u32(v_load_expand(src + ofst[i + 7])));
v_uint32x4 v_zip02, v_zip13, v_zip46, v_zip57;
v_zip(v_src01, v_src23, v_zip02, v_zip13);
v_zip(v_src45, v_src67, v_zip46, v_zip57);
v_uint32x4 v_src0, v_src1;
v_zip(v_combine_low(v_zip02, v_zip46), v_combine_low(v_zip13, v_zip57), v_src0, v_src1);
v_int16x8 v_mul0 = v_load((int16_t*)m);
v_int16x8 v_mul1 = v_load((int16_t*)m + 8);
v_uint32x4 v_res0 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src0), v_mul0));
v_uint32x4 v_res1 = v_reinterpret_as_u32(v_dotprod(v_reinterpret_as_s16(v_src1), v_mul1));
v_store((uint16_t*)dst, v_pack(v_res0, v_res1));
}
for (; i < dst_max; i += 1, m += 2)
{
uint8_t* px = src + ofst[i];
*(dst++) = m[0] * px[0] + m[1] * px[1];
}
src_0 = (src + ofst[dst_width - 1])[0];
v_src_0 = v_setall_u16(*((uint16_t*)&src_0));
for (; i < dst_width - 7; i += 8, dst += 8) // Points that fall right from src image so became equal to rightmost src point
{
v_store((uint16_t*)dst, v_src_0);
}
for (; i < dst_width; i++)
{
*(dst++) = src_0;
}
}
template <>
void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 2>(uint8_t* src, int, int *ofst, ufixedpoint16* m, ufixedpoint16* dst, int dst_min, int dst_max, int dst_width)
{
int i = 0;
ufixedpoint16 srccn[8] = { src[0], src[1], src[0], src[1], src[0], src[1], src[0], src[1] };
v_uint16x8 v_srccn = v_load((uint16_t*)srccn);
for (; i < dst_min - 3; i += 4, m += 8, dst += 8) // Points that fall left from src image so became equal to leftmost src point
{
v_store((uint16_t*)dst, v_srccn);
}
for (; i < dst_min; i++, m += 2)
{
*(dst++) = srccn[0];
*(dst++) = srccn[1];
}
for (; i < dst_max - 3 && ofst[i + 3] + 7 <= ofst[dst_width - 1]; i += 4, m += 8, dst += 8)
{
v_uint32x4 v_src0 = v_combine_low(v_reinterpret_as_u32(v_load_expand(src + 2 * ofst[i ])), v_reinterpret_as_u32(v_load_expand(src + 2 * ofst[i + 1])));
v_uint32x4 v_src1 = v_combine_low(v_reinterpret_as_u32(v_load_expand(src + 2 * ofst[i + 2])), v_reinterpret_as_u32(v_load_expand(src + 2 * ofst[i + 3])));
v_uint32x4 v_zip0, v_zip1;
v_zip(v_src0, v_src1, v_zip0, v_zip1);
v_zip(v_zip0, v_zip1, v_src0, v_src1);
v_int16x8 v_src0123, v_src4567;
v_zip(v_reinterpret_as_s16(v_src0), v_reinterpret_as_s16(v_src1), v_src0123, v_src4567);
v_uint32x4 v_mul = v_load((uint32_t*)m);//AaBbCcDd
v_zip(v_mul, v_mul, v_zip0, v_zip1);//AaAaBbBb CcCcDdDd
v_uint32x4 v_res0 = v_reinterpret_as_u32(v_dotprod(v_src0123, v_reinterpret_as_s16(v_zip0)));
v_uint32x4 v_res1 = v_reinterpret_as_u32(v_dotprod(v_src4567, v_reinterpret_as_s16(v_zip1)));
v_store((uint16_t*)dst, v_pack(v_res0, v_res1));//AB1AB2CD1CD2
}
for (; i < dst_max; i += 1, m += 2)
{
uint8_t* px = src + 2 * ofst[i];
*(dst++) = m[0] * px[0] + m[1] * px[2];
*(dst++) = m[0] * px[1] + m[1] * px[3];
}
srccn[0] = (src + 2 * ofst[dst_width - 1])[0]; srccn[1] = (src + 2 * ofst[dst_width - 1])[1]; srccn[2] = (src + 2 * ofst[dst_width - 1])[0]; srccn[3] = (src + 2 * ofst[dst_width - 1])[1];
srccn[4] = (src + 2 * ofst[dst_width - 1])[0]; srccn[5] = (src + 2 * ofst[dst_width - 1])[1]; srccn[6] = (src + 2 * ofst[dst_width - 1])[0]; srccn[7] = (src + 2 * ofst[dst_width - 1])[1];
v_srccn = v_load((uint16_t*)srccn);
for (; i < dst_width - 3; i += 4, dst += 8) // Points that fall right from src image so became equal to rightmost src point
{
v_store((uint16_t*)dst, v_srccn);
}
for (; i < dst_width; i++)
{
*(dst++) = srccn[0];
*(dst++) = srccn[1];
}
}
template <>
void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 4>(uint8_t* src, int, int *ofst, ufixedpoint16* m, ufixedpoint16* dst, int dst_min, int dst_max, int dst_width)
{
int i = 0;
ufixedpoint16 srccn[8] = { src[0], src[1], src[2], src[3], src[0], src[1], src[2], src[3] };
v_uint16x8 v_srccn = v_load((uint16_t*)srccn);
for (; i < dst_min - 1; i += 2, m += 4, dst += 8) // Points that fall left from src image so became equal to leftmost src point
{
v_store((uint16_t*)dst, v_srccn);
}
if (i < dst_min) // Points that fall left from src image so became equal to leftmost src point
{
*(dst++) = srccn[0];
*(dst++) = srccn[1];
*(dst++) = srccn[2];
*(dst++) = srccn[3];
i++; m += 2;
}
for (; i < dst_max - 1 && ofst[i + 1] + 3 <= ofst[dst_width - 1]; i += 2, m += 4, dst += 8)
{
v_int16x8 v_src01 = v_reinterpret_as_s16(v_load_expand(src + 4 * ofst[i ]));
v_int16x8 v_src23 = v_reinterpret_as_s16(v_load_expand(src + 4 * ofst[i + 1]));
v_int16x8 v_tmp0, v_tmp1;
v_recombine(v_src01, v_src23, v_tmp0, v_tmp1);
v_zip(v_tmp0, v_tmp1, v_src01, v_src23);
v_int16x8 v_mul01 = v_reinterpret_as_s16(v_setall_u32(((uint32_t*)m)[0]));//AaAaAaAa
v_int16x8 v_mul23 = v_reinterpret_as_s16(v_setall_u32(((uint32_t*)m)[1]));//BbBbBbBb
v_uint32x4 v_res0 = v_reinterpret_as_u32(v_dotprod(v_src01, v_mul01));
v_uint32x4 v_res1 = v_reinterpret_as_u32(v_dotprod(v_src23, v_mul23));
v_store((uint16_t*)dst, v_pack(v_res0, v_res1));//AB1AB2CD1CD2
}
for (; i < dst_max; i += 1, m += 2)
{
uint8_t* px = src + 4 * ofst[i];
*(dst++) = m[0] * px[0] + m[1] * px[4];
*(dst++) = m[0] * px[1] + m[1] * px[5];
*(dst++) = m[0] * px[2] + m[1] * px[6];
*(dst++) = m[0] * px[3] + m[1] * px[7];
}
srccn[0] = (src + 4 * ofst[dst_width - 1])[0]; srccn[1] = (src + 4 * ofst[dst_width - 1])[1]; srccn[2] = (src + 4 * ofst[dst_width - 1])[2]; srccn[3] = (src + 4 * ofst[dst_width - 1])[3];
srccn[4] = (src + 4 * ofst[dst_width - 1])[0]; srccn[5] = (src + 4 * ofst[dst_width - 1])[1]; srccn[6] = (src + 4 * ofst[dst_width - 1])[2]; srccn[7] = (src + 4 * ofst[dst_width - 1])[3];
v_srccn = v_load((uint16_t*)srccn);
for (; i < dst_width - 1; i += 2, dst += 8) // Points that fall right from src image so became equal to rightmost src point
{
v_store((uint16_t*)dst, v_srccn);
}
if (i < dst_width)
{
*(dst++) = srccn[0];
*(dst++) = srccn[1];
*(dst++) = srccn[2];
*(dst++) = srccn[3];
}
}
template <>
void hlineResizeCn<uint16_t, ufixedpoint32, 2, true, 1>(uint16_t* src, int, int *ofst, ufixedpoint32* m, ufixedpoint32* dst, int dst_min, int dst_max, int dst_width)
{
int i = 0;
ufixedpoint32 src_0(src[0]);
v_uint32x4 v_src_0 = v_setall_u32(*((uint32_t*)&src_0));
for (; i < dst_min - 3; i += 4, m += 8, dst += 4) // Points that fall left from src image so became equal to leftmost src point
{
v_store((uint32_t*)dst, v_src_0);
}
for (; i < dst_min; i++, m += 2)
{
*(dst++) = src_0;
}
for (; i < dst_max - 3 && ofst[i + 3] + 8 <= ofst[dst_width - 1]; i += 4, m += 8, dst += 4)
{
v_uint32x4 v_src0 = v_combine_low(v_load_expand(src + ofst[i]), v_load_expand(src + ofst[i + 1]));
v_uint32x4 v_mul0 = v_load((uint32_t*)m);
v_uint32x4 v_src1 = v_combine_low(v_load_expand(src + ofst[i + 2]), v_load_expand(src + ofst[i + 3]));
v_uint32x4 v_mul1 = v_load((uint32_t*)m + 4);
v_uint32x4 v_res0 = v_src0 * v_mul0;//a1a2b1b2
v_uint32x4 v_res1 = v_src1 * v_mul1;//c1c2d1d2
v_uint32x4 v_tmp0, v_tmp1;
v_recombine(v_res0, v_res1, v_tmp0, v_tmp1);//a1a2c1c2 b1b2d1d2
v_zip(v_tmp0, v_tmp1, v_res0, v_res1);//a1b1a2b2 c1d1c2d2
v_recombine(v_res0, v_res1, v_tmp0, v_tmp1);//a1b1c1d1 a2b2c2d2
v_store((uint32_t*)dst, v_tmp0 + v_tmp1);//abcd
}
for (; i < dst_max; i += 1, m += 2)
{
uint16_t* px = src + ofst[i];
*(dst++) = m[0] * px[0] + m[1] * px[1];
}
src_0 = (src + ofst[dst_width - 1])[0];
v_src_0 = v_setall_u32(*((uint32_t*)&src_0));
for (; i < dst_width - 3; i += 4, dst += 4)
{
v_store((uint32_t*)dst, v_src_0);
}
for (; i < dst_width; i++)
{
*(dst++) = src_0;
}
}
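// vlineSet converts one buffered line of fixed-point values to the destination element type.
// It is used for output rows that lie entirely above or below the source image, where no
// vertical blending is required.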
template <typename ET, typename FT>
void vlineSet(FT* src, ET* dst, int dst_width)
{
for (int i = 0; i < dst_width; i++)
dst[i] = src[i];
}
template <>
void vlineSet<uint8_t, ufixedpoint16>(ufixedpoint16* src, uint8_t* dst, int dst_width)
{
static const v_uint16x8 v_fixedRound = v_setall_u16((uint16_t)((1U << 8) >> 1));
int i = 0;
for (; i < dst_width - 15; i += 16, src += 16, dst += 16)
{
v_uint16x8 v_src0 = v_load((uint16_t*)src);
v_uint16x8 v_src1 = v_load((uint16_t*)src + 8);
v_uint16x8 v_res0 = (v_src0 + v_fixedRound) >> 8;
v_uint16x8 v_res1 = (v_src1 + v_fixedRound) >> 8;
v_store(dst, v_pack(v_res0, v_res1));
}
for (; i < dst_width; i++)
*(dst++) = *(src++);
}
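// vlineResize performs the vertical pass: it blends n horizontally-resized lines, spaced
// src_step elements apart in the line buffer, with the fixed-point coefficients m and writes
// the final destination row.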
template <typename ET, typename FT, int n>
void vlineResize(FT* src, size_t src_step, FT* m, ET* dst, int dst_width)
{
for (int i = 0; i < dst_width; i++)
{
typename FT::WT res = src[i] * m[0];
for (int k = 1; k < n; k++)
res = res + src[i + k*src_step] * m[k];
dst[i] = res;
}
}
template <>
void vlineResize<uint8_t, ufixedpoint16, 2>(ufixedpoint16* src, size_t src_step, ufixedpoint16* m, uint8_t* dst, int dst_width)
{
static const v_int32x4 v_fixedRound = v_setall_s32((int32_t)((1 << 16) >> 1));
static const v_int16x8 v_128 = v_reinterpret_as_s16(v_setall_u16((uint16_t)1<<15));
static const v_int8x16 v_128_16 = v_reinterpret_as_s8 (v_setall_u8 ((uint8_t) 1<<7));
int i = 0;
ufixedpoint16* src1 = src + src_step;
v_int16x8 v_mul = v_reinterpret_as_s16(v_setall_u32(((uint32_t*)m)[0]));
for (; i < dst_width - 15; i += 16, src += 16, src1 += 16, dst += 16)
{
v_int16x8 v_src00 = v_load((int16_t*)src);
v_int16x8 v_src10 = v_load((int16_t*)src1);
v_int16x8 v_tmp0, v_tmp1;
v_zip(v_add_wrap(v_src00,v_128), v_add_wrap(v_src10,v_128), v_tmp0, v_tmp1);
v_int32x4 v_res0 = v_dotprod(v_tmp0, v_mul);
v_int32x4 v_res1 = v_dotprod(v_tmp1, v_mul);
v_int16x8 v_src01 = v_load((int16_t*)src + 8);
v_int16x8 v_src11 = v_load((int16_t*)src1 + 8);
v_zip(v_add_wrap(v_src01,v_128), v_add_wrap(v_src11,v_128), v_tmp0, v_tmp1);
v_int32x4 v_res2 = v_dotprod(v_tmp0, v_mul);
v_int32x4 v_res3 = v_dotprod(v_tmp1, v_mul);
v_int8x16 v_res = v_pack(v_pack((v_res0 + v_fixedRound) >> 16,
(v_res1 + v_fixedRound) >> 16),
v_pack((v_res2 + v_fixedRound) >> 16,
(v_res3 + v_fixedRound) >> 16));
v_store(dst, v_reinterpret_as_u8(v_sub_wrap(v_res, v_128_16)));
}
for (; i < dst_width; i++)
{
*(dst++) = (uint8_t)(*(src++) * m[0] + *(src1++) * m[1]);
}
}
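// interpolationLinear computes, for each destination coordinate, the source offset and the two
// fixed-point weights of linear interpolation. softdouble arithmetic keeps the coefficients
// bit-exact across platforms. getMinMax() reports the destination range that maps strictly
// inside the source; outside that range the border pixel is replicated instead.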
template <typename ET> class interpolationLinear
{
public:
static const int len = 2;
static const bool needsign = false;
interpolationLinear(double inv_scale, int srcsize, int dstsize) : scale(softdouble::one() / softdouble(inv_scale)), maxsize(srcsize), minofst(0), maxofst(dstsize) {}
void getCoeffs(int val, int* offset, typename fixedtype<ET, needsign>::type* coeffs)
{
typedef typename fixedtype<ET, needsign>::type fixedpoint;
softdouble fval = scale*(softdouble(val)+softdouble(0.5))-softdouble(0.5);
int ival = cvFloor(fval);
if (ival >= 0 && maxsize > 1)
{
if (ival < maxsize - 1)
{
*offset = ival;
coeffs[1] = fval - softdouble(ival);
coeffs[0] = fixedpoint::one() - coeffs[1];
}
else
{
*offset = maxsize - 1;
maxofst = min(maxofst, val);
}
}
else
{
minofst = max(minofst, val + 1);
}
}
void getMinMax(int &min, int &max)
{
min = minofst;
max = maxofst;
}
protected:
softdouble scale;
int maxsize;
int minofst, maxofst;
};
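// resize_bitExactInvoker resizes a horizontal stripe of destination rows. It keeps a ring
// buffer (linebuf) of interp_y_len horizontally-resized source lines and, for each output row,
// runs hResize only for the source lines that are not already buffered before applying the
// vertical pass.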
template <typename ET, typename FT, int interp_y_len>
class resize_bitExactInvoker :
public ParallelLoopBody
{
public:
typedef FT fixedpoint;
typedef void(*hResizeFunc)(ET* src, int cn, int *ofst, fixedpoint* m, fixedpoint* dst, int dst_min, int dst_max, int dst_width);
resize_bitExactInvoker(const uchar* _src, size_t _src_step, int _src_width, int _src_height,
uchar* _dst, size_t _dst_step, int _dst_width, int _dst_height,
int _cn, int *_xoffsets, int *_yoffsets, fixedpoint *_xcoeffs, fixedpoint *_ycoeffs,
int _min_x, int _max_x, int _min_y, int _max_y, hResizeFunc _hResize) : ParallelLoopBody(),
src(_src), src_step(_src_step), src_width(_src_width), src_height(_src_height),
dst(_dst), dst_step(_dst_step), dst_width(_dst_width), dst_height(_dst_height),
cn(_cn), xoffsets(_xoffsets), yoffsets(_yoffsets), xcoeffs(_xcoeffs), ycoeffs(_ycoeffs),
min_x(_min_x), max_x(_max_x), min_y(_min_y), max_y(_max_y), hResize(_hResize) {}
virtual void operator() (const Range& range) const
{
AutoBuffer<fixedpoint> linebuf(interp_y_len * dst_width * cn);
int last_eval = - interp_y_len;
int evalbuf_start = 0;
int rmin_y = max(min_y, range.start);
int rmax_y = min(max_y, range.end);
if (range.start < min_y)
{
last_eval = 1 - interp_y_len;
evalbuf_start = 1;
hResize((ET*)src, cn, xoffsets, xcoeffs, (fixedpoint*)linebuf, min_x, max_x, dst_width);
}
int dy = range.start;
for (; dy < rmin_y; dy++)
vlineSet<ET, FT>((fixedpoint*)linebuf, (ET*)(dst + dst_step * dy), dst_width*cn);
for (; dy < rmax_y; dy++)
{
int &iy = yoffsets[dy];
int i;
for (i = max(iy, last_eval + interp_y_len); i < min(iy + interp_y_len, src_height); i++, evalbuf_start = (evalbuf_start + 1) % interp_y_len)
hResize((ET*)(src + i * src_step), cn, xoffsets, xcoeffs, (fixedpoint*)linebuf + evalbuf_start*(dst_width * cn), min_x, max_x, dst_width);
evalbuf_start = (evalbuf_start + max(iy, src_height - interp_y_len) - max(last_eval, src_height - interp_y_len)) % interp_y_len;
last_eval = iy;
fixedpoint curcoeffs[interp_y_len];
for (i = 0; i < evalbuf_start; i++)
curcoeffs[i] = ycoeffs[ dy*interp_y_len - evalbuf_start + interp_y_len + i];
for (; i < interp_y_len; i++)
curcoeffs[i] = ycoeffs[ dy*interp_y_len - evalbuf_start + i];
vlineResize<ET, FT, interp_y_len>((fixedpoint*)linebuf, dst_width*cn, curcoeffs, (ET*)(dst + dst_step * dy), dst_width*cn);
}
fixedpoint *endline = (fixedpoint*)linebuf;
if (last_eval + interp_y_len > src_height)
endline += dst_width*cn*((evalbuf_start + src_height - 1 - last_eval) % interp_y_len);
else
hResize((ET*)(src + (src_height - 1) * src_step), cn, xoffsets, xcoeffs, endline, min_x, max_x, dst_width);
for (; dy < range.end; dy++)
vlineSet<ET, FT>(endline, (ET*)(dst + dst_step * dy), dst_width*cn);
}
private:
const uchar* src;
size_t src_step;
int src_width, src_height;
uchar* dst;
size_t dst_step;
int dst_width, dst_height, cn;
int *xoffsets, *yoffsets;
fixedpoint *xcoeffs, *ycoeffs;
int min_x, max_x, min_y, max_y;
hResizeFunc hResize;
resize_bitExactInvoker(const resize_bitExactInvoker&);
resize_bitExactInvoker& operator=(const resize_bitExactInvoker&);
};
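// resize_bitExact is the driver: it selects the horizontal resize routine by channel count,
// precomputes per-column and per-row offsets and fixed-point coefficients for the given
// interpolation policy, and runs resize_bitExactInvoker in parallel over the destination rows.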
template <typename ET, typename interpolation>
void resize_bitExact(const uchar* src, size_t src_step, int src_width, int src_height,
uchar* dst, size_t dst_step, int dst_width, int dst_height,
int cn, double inv_scale_x, double inv_scale_y)
{
typedef typename fixedtype<ET, interpolation::needsign>::type fixedpoint;
void(*hResize)(ET* src, int cn, int *ofst, fixedpoint* m, fixedpoint* dst, int dst_min, int dst_max, int dst_width);
switch (cn)
{
case 1: hResize = src_width > interpolation::len ? hlineResizeCn<ET, fixedpoint, interpolation::len, true, 1> : hlineResizeCn<ET, fixedpoint, interpolation::len, false, 1>; break;
case 2: hResize = src_width > interpolation::len ? hlineResizeCn<ET, fixedpoint, interpolation::len, true, 2> : hlineResizeCn<ET, fixedpoint, interpolation::len, false, 2>; break;
case 3: hResize = src_width > interpolation::len ? hlineResizeCn<ET, fixedpoint, interpolation::len, true, 3> : hlineResizeCn<ET, fixedpoint, interpolation::len, false, 3>; break;
case 4: hResize = src_width > interpolation::len ? hlineResizeCn<ET, fixedpoint, interpolation::len, true, 4> : hlineResizeCn<ET, fixedpoint, interpolation::len, false, 4>; break;
default: hResize = src_width > interpolation::len ? hlineResize<ET, fixedpoint, interpolation::len, true> : hlineResize<ET, fixedpoint, interpolation::len, false> ; break;
}
interpolation interp_x(inv_scale_x, src_width, dst_width);
interpolation interp_y(inv_scale_y, src_height, dst_height);
AutoBuffer<uchar> buf( dst_width * sizeof(int) +
dst_height * sizeof(int) +
dst_width * interp_x.len*sizeof(fixedpoint) +
dst_height * interp_y.len * sizeof(fixedpoint) );
int* xoffsets = (int*)((uchar*)buf);
int* yoffsets = xoffsets + dst_width;
fixedpoint* xcoeffs = (fixedpoint*)(yoffsets + dst_height);
fixedpoint* ycoeffs = xcoeffs + dst_width * interp_x.len;
int min_x, max_x, min_y, max_y;
for (int dx = 0; dx < dst_width; dx++)
interp_x.getCoeffs(dx, xoffsets+dx, xcoeffs+dx*interp_x.len);
interp_x.getMinMax(min_x, max_x);
for (int dy = 0; dy < dst_height; dy++)
interp_y.getCoeffs(dy, yoffsets+dy, ycoeffs+dy*interp_y.len);
interp_y.getMinMax(min_y, max_y);
resize_bitExactInvoker<ET, fixedpoint, interpolation::len> invoker(src, src_step, src_width, src_height, dst, dst_step, dst_width, dst_height, cn,
xoffsets, yoffsets, xcoeffs, ycoeffs, min_x, max_x, min_y, max_y, hResize);
Range range(0, dst_height);
parallel_for_(range, invoker, dst_width * dst_height / (double)(1 << 16));
}
typedef void(*be_resize_func)(const uchar* src, size_t src_step, int src_width, int src_height,
uchar* dst, size_t dst_step, int dst_width, int dst_height,
int cn, double inv_scale_x, double inv_scale_y);
}
namespace cv
{
/************** interpolation formulas and tables ***************/
const int INTER_RESIZE_COEF_BITS=11;
const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;
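// 4-tap bicubic kernel with A = -0.75 (the constant used for INTER_CUBIC); the last
// coefficient is chosen so that the four weights sum to exactly 1.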
static inline void interpolateCubic( float x, float* coeffs )
{
const float A = -0.75f;
coeffs[0] = ((A*(x + 1) - 5*A)*(x + 1) + 8*A)*(x + 1) - 4*A;
coeffs[1] = ((A + 2)*x - (A + 3))*x*x + 1;
coeffs[2] = ((A + 2)*(1 - x) - (A + 3))*(1 - x)*(1 - x) + 1;
coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
}
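// 8-tap Lanczos (a = 4) kernel. The weights are computed from a precomputed table of sin/cos
// of multiples of pi/4 combined with sin/cos of the base angle (angle-addition identities)
// and then normalized so that the eight coefficients sum to 1.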
static inline void interpolateLanczos4( float x, float* coeffs )
{
static const double s45 = 0.70710678118654752440084436210485;
static const double cs[][2]=
{{1, 0}, {-s45, -s45}, {0, 1}, {s45, -s45}, {-1, 0}, {s45, s45}, {0, -1}, {-s45, s45}};
if( x < FLT_EPSILON )
{
for( int i = 0; i < 8; i++ )
coeffs[i] = 0;
coeffs[3] = 1;
return;
}
float sum = 0;
double y0=-(x+3)*CV_PI*0.25, s0 = std::sin(y0), c0= std::cos(y0);
for(int i = 0; i < 8; i++ )
{
double y = -(x+3-i)*CV_PI*0.25;
coeffs[i] = (float)((cs[i][0]*s0 + cs[i][1]*c0)/(y*y));
sum += coeffs[i];
}
sum = 1.f/sum;
for(int i = 0; i < 8; i++ )
coeffs[i] *= sum;
}
template<typename ST, typename DT> struct Cast
{
typedef ST type1;
typedef DT rtype;
DT operator()(ST val) const { return saturate_cast<DT>(val); }
};
template<typename ST, typename DT, int bits> struct FixedPtCast
{
typedef ST type1;
typedef DT rtype;
enum { SHIFT = bits, DELTA = 1 << (bits-1) };
DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }
};
/****************************************************************************************\
* Resize *
\****************************************************************************************/
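// resizeNNInvoker copies rows for nearest-neighbour resize in parallel. x_ofs holds the byte
// offset of the source pixel for every destination column, and the switch specializes the
// inner copy loop by pixel size (1, 2, 3, 4, 6, 8 or 12 bytes, with a generic int-wise copy
// as fallback).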
class resizeNNInvoker :
public ParallelLoopBody
{
public:
resizeNNInvoker(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
ify(_ify)
{
}
virtual void operator() (const Range& range) const
{
Size ssize = src.size(), dsize = dst.size();
int y, x, pix_size = (int)src.elemSize();
for( y = range.start; y < range.end; y++ )
{
uchar* D = dst.data + dst.step*y;
int sy = std::min(cvFloor(y*ify), ssize.height-1);
const uchar* S = src.ptr(sy);
switch( pix_size )
{
case 1:
for( x = 0; x <= dsize.width - 2; x += 2 )
{
uchar t0 = S[x_ofs[x]];
uchar t1 = S[x_ofs[x+1]];
D[x] = t0;
D[x+1] = t1;
}
for( ; x < dsize.width; x++ )
D[x] = S[x_ofs[x]];
break;
case 2:
for( x = 0; x < dsize.width; x++ )
*(ushort*)(D + x*2) = *(ushort*)(S + x_ofs[x]);
break;
case 3:
for( x = 0; x < dsize.width; x++, D += 3 )
{
const uchar* _tS = S + x_ofs[x];
D[0] = _tS[0]; D[1] = _tS[1]; D[2] = _tS[2];
}
break;
case 4:
for( x = 0; x < dsize.width; x++ )
*(int*)(D + x*4) = *(int*)(S + x_ofs[x]);
break;
case 6:
for( x = 0; x < dsize.width; x++, D += 6 )
{
const ushort* _tS = (const ushort*)(S + x_ofs[x]);
ushort* _tD = (ushort*)D;
_tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
}
break;
case 8:
for( x = 0; x < dsize.width; x++, D += 8 )
{
const int* _tS = (const int*)(S + x_ofs[x]);
int* _tD = (int*)D;
_tD[0] = _tS[0]; _tD[1] = _tS[1];
}
break;
case 12:
for( x = 0; x < dsize.width; x++, D += 12 )
{
const int* _tS = (const int*)(S + x_ofs[x]);
int* _tD = (int*)D;
_tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
}
break;
default:
for( x = 0; x < dsize.width; x++, D += pix_size )
{
const int* _tS = (const int*)(S + x_ofs[x]);
int* _tD = (int*)D;
for( int k = 0; k < pix_size4; k++ )
_tD[k] = _tS[k];
}
}
}
}
private:
const Mat src;
Mat dst;
int* x_ofs, pix_size4;
double ify;
resizeNNInvoker(const resizeNNInvoker&);
resizeNNInvoker& operator=(const resizeNNInvoker&);
};
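// resizeNN precomputes the per-column source byte offsets and dispatches the row loop either
// to the AVX2 / SSE4.1 optimized paths (2- and 4-byte pixels, when supported) or to the
// generic parallel invoker above.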
static void
resizeNN( const Mat& src, Mat& dst, double fx, double fy )
{
Size ssize = src.size(), dsize = dst.size();
AutoBuffer<int> _x_ofs(dsize.width);
int* x_ofs = _x_ofs;
int pix_size = (int)src.elemSize();
int pix_size4 = (int)(pix_size / sizeof(int));
double ifx = 1./fx, ify = 1./fy;
int x;
for( x = 0; x < dsize.width; x++ )
{
int sx = cvFloor(x*ifx);
x_ofs[x] = std::min(sx, ssize.width-1)*pix_size;
}
Range range(0, dsize.height);
#if CV_TRY_AVX2
if(CV_CPU_HAS_SUPPORT_AVX2 && ((pix_size == 2) || (pix_size == 4)))
{
if(pix_size == 2)
opt_AVX2::resizeNN2_AVX2(range, src, dst, x_ofs, pix_size4, ify);
else
opt_AVX2::resizeNN4_AVX2(range, src, dst, x_ofs, pix_size4, ify);
}
else
#endif
#if CV_TRY_SSE4_1
if(CV_CPU_HAS_SUPPORT_SSE4_1 && ((pix_size == 2) || (pix_size == 4)))
{
if(pix_size == 2)
opt_SSE4_1::resizeNN2_SSE4_1(range, src, dst, x_ofs, pix_size4, ify);
else
opt_SSE4_1::resizeNN4_SSE4_1(range, src, dst, x_ofs, pix_size4, ify);
}
else
#endif
{
resizeNNInvoker invoker(src, dst, x_ofs, pix_size4, ify);
parallel_for_(range, invoker, dst.total()/(double)(1<<16));
}
}
struct VResizeNoVec
{
int operator()(const uchar**, uchar*, const uchar*, int ) const { return 0; }
};
struct HResizeNoVec
{
int operator()(const uchar**, uchar**, int, const int*,
const uchar*, int, int, int, int, int) const { return 0; }
};
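// Vectorized vertical resize functors follow. Each operator() returns the number of elements
// it processed, so the caller can finish the remaining tail with scalar code; the NoVec
// versions above are the fall-through stubs that process nothing.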
#if CV_SSE2
struct VResizeLinearVec_32s8u
{
int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
{
if( !checkHardwareSupport(CV_CPU_SSE2) )
return 0;
const int** src = (const int**)_src;
const short* beta = (const short*)_beta;
const int *S0 = src[0], *S1 = src[1];
int x = 0;
__m128i b0 = _mm_set1_epi16(beta[0]), b1 = _mm_set1_epi16(beta[1]);
__m128i delta = _mm_set1_epi16(2);
if( (((size_t)S0|(size_t)S1)&15) == 0 )
for( ; x <= width - 16; x += 16 )
{
__m128i x0, x1, x2, y0, y1, y2;
x0 = _mm_load_si128((const __m128i*)(S0 + x));
x1 = _mm_load_si128((const __m128i*)(S0 + x + 4));
y0 = _mm_load_si128((const __m128i*)(S1 + x));
y1 = _mm_load_si128((const __m128i*)(S1 + x + 4));
x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4));
y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4));
x1 = _mm_load_si128((const __m128i*)(S0 + x + 8));
x2 = _mm_load_si128((const __m128i*)(S0 + x + 12));
y1 = _mm_load_si128((const __m128i*)(S1 + x + 8));
y2 = _mm_load_si128((const __m128i*)(S1 + x + 12));
x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4));
y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4));
x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 ));
x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 ));
x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2);
_mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1));
}
else
for( ; x <= width - 16; x += 16 )
{
__m128i x0, x1, x2, y0, y1, y2;
x0 = _mm_loadu_si128((const __m128i*)(S0 + x));
x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4));
y0 = _mm_loadu_si128((const __m128i*)(S1 + x));
y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4));
x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4));
y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4));
x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 8));
x2 = _mm_loadu_si128((const __m128i*)(S0 + x + 12));
y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 8));
y2 = _mm_loadu_si128((const __m128i*)(S1 + x + 12));
x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4));
y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4));
x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 ));
x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 ));
x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2);
_mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1));
}
for( ; x < width - 4; x += 4 )
{
__m128i x0, y0;
x0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S0 + x)), 4);
y0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S1 + x)), 4);
x0 = _mm_packs_epi32(x0, x0);
y0 = _mm_packs_epi32(y0, y0);
x0 = _mm_adds_epi16(_mm_mulhi_epi16(x0, b0), _mm_mulhi_epi16(y0, b1));
x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
x0 = _mm_packus_epi16(x0, x0);
*(int*)(dst + x) = _mm_cvtsi128_si32(x0);
}
return x;
}
};
template<int shiftval> struct VResizeLinearVec_32f16
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
if( !checkHardwareSupport(CV_CPU_SSE2) )
return 0;
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1];
ushort* dst = (ushort*)_dst;
int x = 0;
__m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]);
__m128i preshift = _mm_set1_epi32(shiftval);
__m128i postshift = _mm_set1_epi16((short)shiftval);
if( (((size_t)S0|(size_t)S1)&15) == 0 )
for( ; x <= width - 16; x += 16 )
{
__m128 x0, x1, y0, y1;
__m128i t0, t1, t2;
x0 = _mm_load_ps(S0 + x);
x1 = _mm_load_ps(S0 + x + 4);
y0 = _mm_load_ps(S1 + x);
y1 = _mm_load_ps(S1 + x + 4);
x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift);
x0 = _mm_load_ps(S0 + x + 8);
x1 = _mm_load_ps(S0 + x + 12);
y0 = _mm_load_ps(S1 + x + 8);
y1 = _mm_load_ps(S1 + x + 12);
x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift);
_mm_storeu_si128( (__m128i*)(dst + x), t0);
_mm_storeu_si128( (__m128i*)(dst + x + 8), t1);
}
else
for( ; x <= width - 16; x += 16 )
{
__m128 x0, x1, y0, y1;
__m128i t0, t1, t2;
x0 = _mm_loadu_ps(S0 + x);
x1 = _mm_loadu_ps(S0 + x + 4);
y0 = _mm_loadu_ps(S1 + x);
y1 = _mm_loadu_ps(S1 + x + 4);
x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift);
x0 = _mm_loadu_ps(S0 + x + 8);
x1 = _mm_loadu_ps(S0 + x + 12);
y0 = _mm_loadu_ps(S1 + x + 8);
y1 = _mm_loadu_ps(S1 + x + 12);
x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift);
_mm_storeu_si128( (__m128i*)(dst + x), t0);
_mm_storeu_si128( (__m128i*)(dst + x + 8), t1);
}
for( ; x < width - 4; x += 4 )
{
__m128 x0, y0;
__m128i t0;
x0 = _mm_loadu_ps(S0 + x);
y0 = _mm_loadu_ps(S1 + x);
x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
t0 = _mm_add_epi16(_mm_packs_epi32(t0, t0), postshift);
_mm_storel_epi64( (__m128i*)(dst + x), t0);
}
return x;
}
};
typedef VResizeLinearVec_32f16<SHRT_MIN> VResizeLinearVec_32f16u;
typedef VResizeLinearVec_32f16<0> VResizeLinearVec_32f16s;
struct VResizeLinearVec_32f
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
if( !checkHardwareSupport(CV_CPU_SSE) )
return 0;
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1];
float* dst = (float*)_dst;
int x = 0;
__m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]);
if( (((size_t)S0|(size_t)S1)&15) == 0 )
for( ; x <= width - 8; x += 8 )
{
__m128 x0, x1, y0, y1;
x0 = _mm_load_ps(S0 + x);
x1 = _mm_load_ps(S0 + x + 4);
y0 = _mm_load_ps(S1 + x);
y1 = _mm_load_ps(S1 + x + 4);
x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
_mm_storeu_ps( dst + x, x0);
_mm_storeu_ps( dst + x + 4, x1);
}
else
for( ; x <= width - 8; x += 8 )
{
__m128 x0, x1, y0, y1;
x0 = _mm_loadu_ps(S0 + x);
x1 = _mm_loadu_ps(S0 + x + 4);
y0 = _mm_loadu_ps(S1 + x);
y1 = _mm_loadu_ps(S1 + x + 4);
x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
_mm_storeu_ps( dst + x, x0);
_mm_storeu_ps( dst + x + 4, x1);
}
return x;
}
};
struct VResizeCubicVec_32s8u
{
int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
{
if( !checkHardwareSupport(CV_CPU_SSE2) )
return 0;
const int** src = (const int**)_src;
const short* beta = (const short*)_beta;
const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
int x = 0;
float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE);
__m128 b0 = _mm_set1_ps(beta[0]*scale), b1 = _mm_set1_ps(beta[1]*scale),
b2 = _mm_set1_ps(beta[2]*scale), b3 = _mm_set1_ps(beta[3]*scale);
if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&15) == 0 )
for( ; x <= width - 8; x += 8 )
{
__m128i x0, x1, y0, y1;
__m128 s0, s1, f0, f1;
x0 = _mm_load_si128((const __m128i*)(S0 + x));
x1 = _mm_load_si128((const __m128i*)(S0 + x + 4));
y0 = _mm_load_si128((const __m128i*)(S1 + x));
y1 = _mm_load_si128((const __m128i*)(S1 + x + 4));
s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0);
s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0);
f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1);
f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1);
s0 = _mm_add_ps(s0, f0);
s1 = _mm_add_ps(s1, f1);
x0 = _mm_load_si128((const __m128i*)(S2 + x));
x1 = _mm_load_si128((const __m128i*)(S2 + x + 4));
y0 = _mm_load_si128((const __m128i*)(S3 + x));
y1 = _mm_load_si128((const __m128i*)(S3 + x + 4));
f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2);
f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2);
s0 = _mm_add_ps(s0, f0);
s1 = _mm_add_ps(s1, f1);
f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3);
f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3);
s0 = _mm_add_ps(s0, f0);
s1 = _mm_add_ps(s1, f1);
x0 = _mm_cvtps_epi32(s0);
x1 = _mm_cvtps_epi32(s1);
x0 = _mm_packs_epi32(x0, x1);
_mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0));
}
else
for( ; x <= width - 8; x += 8 )
{
__m128i x0, x1, y0, y1;
__m128 s0, s1, f0, f1;
x0 = _mm_loadu_si128((const __m128i*)(S0 + x));
x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4));
y0 = _mm_loadu_si128((const __m128i*)(S1 + x));
y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4));
s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0);
s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0);
f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1);
f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1);
s0 = _mm_add_ps(s0, f0);
s1 = _mm_add_ps(s1, f1);
x0 = _mm_loadu_si128((const __m128i*)(S2 + x));
x1 = _mm_loadu_si128((const __m128i*)(S2 + x + 4));
y0 = _mm_loadu_si128((const __m128i*)(S3 + x));
y1 = _mm_loadu_si128((const __m128i*)(S3 + x + 4));
f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2);
f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2);
s0 = _mm_add_ps(s0, f0);
s1 = _mm_add_ps(s1, f1);
f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3);
f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3);
s0 = _mm_add_ps(s0, f0);
s1 = _mm_add_ps(s1, f1);
x0 = _mm_cvtps_epi32(s0);
x1 = _mm_cvtps_epi32(s1);
x0 = _mm_packs_epi32(x0, x1);
_mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0));
}
return x;
}
};
template<int shiftval> struct VResizeCubicVec_32f16
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
if( !checkHardwareSupport(CV_CPU_SSE2) )
return 0;
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
ushort* dst = (ushort*)_dst;
int x = 0;
__m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]),
b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]);
__m128i preshift = _mm_set1_epi32(shiftval);
__m128i postshift = _mm_set1_epi16((short)shiftval);
for( ; x <= width - 8; x += 8 )
{
__m128 x0, x1, y0, y1, s0, s1;
__m128i t0, t1;
x0 = _mm_loadu_ps(S0 + x);
x1 = _mm_loadu_ps(S0 + x + 4);
y0 = _mm_loadu_ps(S1 + x);
y1 = _mm_loadu_ps(S1 + x + 4);
s0 = _mm_mul_ps(x0, b0);
s1 = _mm_mul_ps(x1, b0);
y0 = _mm_mul_ps(y0, b1);
y1 = _mm_mul_ps(y1, b1);
s0 = _mm_add_ps(s0, y0);
s1 = _mm_add_ps(s1, y1);
x0 = _mm_loadu_ps(S2 + x);
x1 = _mm_loadu_ps(S2 + x + 4);
y0 = _mm_loadu_ps(S3 + x);
y1 = _mm_loadu_ps(S3 + x + 4);
x0 = _mm_mul_ps(x0, b2);
x1 = _mm_mul_ps(x1, b2);
y0 = _mm_mul_ps(y0, b3);
y1 = _mm_mul_ps(y1, b3);
s0 = _mm_add_ps(s0, x0);
s1 = _mm_add_ps(s1, x1);
s0 = _mm_add_ps(s0, y0);
s1 = _mm_add_ps(s1, y1);
t0 = _mm_add_epi32(_mm_cvtps_epi32(s0), preshift);
t1 = _mm_add_epi32(_mm_cvtps_epi32(s1), preshift);
t0 = _mm_add_epi16(_mm_packs_epi32(t0, t1), postshift);
_mm_storeu_si128( (__m128i*)(dst + x), t0);
}
return x;
}
};
typedef VResizeCubicVec_32f16<SHRT_MIN> VResizeCubicVec_32f16u;
typedef VResizeCubicVec_32f16<0> VResizeCubicVec_32f16s;
struct VResizeCubicVec_32f
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
if( !checkHardwareSupport(CV_CPU_SSE) )
return 0;
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
float* dst = (float*)_dst;
int x = 0;
__m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]),
b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]);
for( ; x <= width - 8; x += 8 )
{
__m128 x0, x1, y0, y1, s0, s1;
x0 = _mm_loadu_ps(S0 + x);
x1 = _mm_loadu_ps(S0 + x + 4);
y0 = _mm_loadu_ps(S1 + x);
y1 = _mm_loadu_ps(S1 + x + 4);
s0 = _mm_mul_ps(x0, b0);
s1 = _mm_mul_ps(x1, b0);
y0 = _mm_mul_ps(y0, b1);
y1 = _mm_mul_ps(y1, b1);
s0 = _mm_add_ps(s0, y0);
s1 = _mm_add_ps(s1, y1);
x0 = _mm_loadu_ps(S2 + x);
x1 = _mm_loadu_ps(S2 + x + 4);
y0 = _mm_loadu_ps(S3 + x);
y1 = _mm_loadu_ps(S3 + x + 4);
x0 = _mm_mul_ps(x0, b2);
x1 = _mm_mul_ps(x1, b2);
y0 = _mm_mul_ps(y0, b3);
y1 = _mm_mul_ps(y1, b3);
s0 = _mm_add_ps(s0, x0);
s1 = _mm_add_ps(s1, x1);
s0 = _mm_add_ps(s0, y0);
s1 = _mm_add_ps(s1, y1);
_mm_storeu_ps( dst + x, s0);
_mm_storeu_ps( dst + x + 4, s1);
}
return x;
}
};
#if CV_TRY_SSE4_1
struct VResizeLanczos4Vec_32f16u
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::VResizeLanczos4Vec_32f16u_SSE41(_src, _dst, _beta, width);
else return 0;
}
};
#else
typedef VResizeNoVec VResizeLanczos4Vec_32f16u;
#endif
struct VResizeLanczos4Vec_32f16s
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
*S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
short * dst = (short*)_dst;
int x = 0;
__m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]),
v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]),
v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]),
v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]);
for( ; x <= width - 8; x += 8 )
{
__m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x));
v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x)));
v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x)));
v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x)));
v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x)));
v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x)));
v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x)));
v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x)));
__m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4));
v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4)));
v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4)));
v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4)));
v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4)));
v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4)));
v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4)));
v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4)));
__m128i v_dsti0 = _mm_cvtps_epi32(v_dst0);
__m128i v_dsti1 = _mm_cvtps_epi32(v_dst1);
_mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(v_dsti0, v_dsti1));
}
return x;
}
};
struct VResizeLanczos4Vec_32f
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
*S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
float* dst = (float*)_dst;
int x = 0;
__m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]),
v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]),
v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]),
v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]);
for( ; x <= width - 4; x += 4 )
{
__m128 v_dst = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x));
v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x)));
v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x)));
v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x)));
v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x)));
v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x)));
v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x)));
v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x)));
_mm_storeu_ps(dst + x, v_dst);
}
return x;
}
};
#elif CV_NEON
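// NEON counterparts of the vertical resize functors defined above for SSE.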
struct VResizeLinearVec_32s8u
{
int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
{
const int** src = (const int**)_src, *S0 = src[0], *S1 = src[1];
const short* beta = (const short*)_beta;
int x = 0;
int16x8_t v_b0 = vdupq_n_s16(beta[0]), v_b1 = vdupq_n_s16(beta[1]), v_delta = vdupq_n_s16(2);
for( ; x <= width - 16; x += 16)
{
int32x4_t v_src00 = vshrq_n_s32(vld1q_s32(S0 + x), 4), v_src10 = vshrq_n_s32(vld1q_s32(S1 + x), 4);
int32x4_t v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 4), 4), v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 4), 4);
int16x8_t v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01));
int16x8_t v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11));
int16x8_t v_dst0 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1),
vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1));
v_dst0 = vshrq_n_s16(vaddq_s16(v_dst0, v_delta), 2);
v_src00 = vshrq_n_s32(vld1q_s32(S0 + x + 8), 4);
v_src10 = vshrq_n_s32(vld1q_s32(S1 + x + 8), 4);
v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 12), 4);
v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 12), 4);
v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01));
v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11));
int16x8_t v_dst1 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1),
vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1));
v_dst1 = vshrq_n_s16(vaddq_s16(v_dst1, v_delta), 2);
vst1q_u8(dst + x, vcombine_u8(vqmovun_s16(v_dst0), vqmovun_s16(v_dst1)));
}
return x;
}
};
struct VResizeLinearVec_32f16u
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1];
ushort* dst = (ushort*)_dst;
int x = 0;
float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);
for( ; x <= width - 8; x += 8 )
{
float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);
float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1);
float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1);
vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)),
vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
}
return x;
}
};
struct VResizeLinearVec_32f16s
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1];
short* dst = (short*)_dst;
int x = 0;
float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);
for( ; x <= width - 8; x += 8 )
{
float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);
float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1);
float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1);
vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)),
vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
}
return x;
}
};
struct VResizeLinearVec_32f
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1];
float* dst = (float*)_dst;
int x = 0;
float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);
for( ; x <= width - 8; x += 8 )
{
float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);
vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1));
vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1));
}
return x;
}
};
typedef VResizeNoVec VResizeCubicVec_32s8u;
struct VResizeCubicVec_32f16u
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
ushort* dst = (ushort*)_dst;
int x = 0;
float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);
for( ; x <= width - 8; x += 8 )
{
float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
v_b1, vld1q_f32(S1 + x)),
v_b2, vld1q_f32(S2 + x)),
v_b3, vld1q_f32(S3 + x));
float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
v_b1, vld1q_f32(S1 + x + 4)),
v_b2, vld1q_f32(S2 + x + 4)),
v_b3, vld1q_f32(S3 + x + 4));
vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)),
vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
}
return x;
}
};
struct VResizeCubicVec_32f16s
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
short* dst = (short*)_dst;
int x = 0;
float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);
for( ; x <= width - 8; x += 8 )
{
float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
v_b1, vld1q_f32(S1 + x)),
v_b2, vld1q_f32(S2 + x)),
v_b3, vld1q_f32(S3 + x));
float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
v_b1, vld1q_f32(S1 + x + 4)),
v_b2, vld1q_f32(S2 + x + 4)),
v_b3, vld1q_f32(S3 + x + 4));
vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)),
vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
}
return x;
}
};
struct VResizeCubicVec_32f
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
float* dst = (float*)_dst;
int x = 0;
float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);
for( ; x <= width - 8; x += 8 )
{
vst1q_f32(dst + x, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
v_b1, vld1q_f32(S1 + x)),
v_b2, vld1q_f32(S2 + x)),
v_b3, vld1q_f32(S3 + x)));
vst1q_f32(dst + x + 4, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
v_b1, vld1q_f32(S1 + x + 4)),
v_b2, vld1q_f32(S2 + x + 4)),
v_b3, vld1q_f32(S3 + x + 4)));
}
return x;
}
};
struct VResizeLanczos4Vec_32f16u
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
*S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
ushort * dst = (ushort*)_dst;
int x = 0;
float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]),
v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]),
v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]);
for( ; x <= width - 8; x += 8 )
{
float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
v_b1, vld1q_f32(S1 + x)),
v_b2, vld1q_f32(S2 + x)),
v_b3, vld1q_f32(S3 + x));
float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)),
v_b5, vld1q_f32(S5 + x)),
v_b6, vld1q_f32(S6 + x)),
v_b7, vld1q_f32(S7 + x));
float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1);
v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
v_b1, vld1q_f32(S1 + x + 4)),
v_b2, vld1q_f32(S2 + x + 4)),
v_b3, vld1q_f32(S3 + x + 4));
v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)),
v_b5, vld1q_f32(S5 + x + 4)),
v_b6, vld1q_f32(S6 + x + 4)),
v_b7, vld1q_f32(S7 + x + 4));
v_dst1 = vaddq_f32(v_dst0, v_dst1);
vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst)),
vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
}
return x;
}
};
struct VResizeLanczos4Vec_32f16s
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
*S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
short * dst = (short*)_dst;
int x = 0;
float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]),
v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]),
v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]);
for( ; x <= width - 8; x += 8 )
{
float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
v_b1, vld1q_f32(S1 + x)),
v_b2, vld1q_f32(S2 + x)),
v_b3, vld1q_f32(S3 + x));
float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)),
v_b5, vld1q_f32(S5 + x)),
v_b6, vld1q_f32(S6 + x)),
v_b7, vld1q_f32(S7 + x));
float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1);
v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
v_b1, vld1q_f32(S1 + x + 4)),
v_b2, vld1q_f32(S2 + x + 4)),
v_b3, vld1q_f32(S3 + x + 4));
v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)),
v_b5, vld1q_f32(S5 + x + 4)),
v_b6, vld1q_f32(S6 + x + 4)),
v_b7, vld1q_f32(S7 + x + 4));
v_dst1 = vaddq_f32(v_dst0, v_dst1);
vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst)),
vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
}
return x;
}
};
struct VResizeLanczos4Vec_32f
{
int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
{
const float** src = (const float**)_src;
const float* beta = (const float*)_beta;
const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
*S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
float* dst = (float*)_dst;
int x = 0;
float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]),
v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]),
v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]);
for( ; x <= width - 4; x += 4 )
{
float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
v_b1, vld1q_f32(S1 + x)),
v_b2, vld1q_f32(S2 + x)),
v_b3, vld1q_f32(S3 + x));
float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)),
v_b5, vld1q_f32(S5 + x)),
v_b6, vld1q_f32(S6 + x)),
v_b7, vld1q_f32(S7 + x));
vst1q_f32(dst + x, vaddq_f32(v_dst0, v_dst1));
}
return x;
}
};
#else
typedef VResizeNoVec VResizeLinearVec_32s8u;
typedef VResizeNoVec VResizeLinearVec_32f16u;
typedef VResizeNoVec VResizeLinearVec_32f16s;
typedef VResizeNoVec VResizeLinearVec_32f;
typedef VResizeNoVec VResizeCubicVec_32s8u;
typedef VResizeNoVec VResizeCubicVec_32f16u;
typedef VResizeNoVec VResizeCubicVec_32f16s;
typedef VResizeNoVec VResizeCubicVec_32f;
typedef VResizeNoVec VResizeLanczos4Vec_32f16u;
typedef VResizeNoVec VResizeLanczos4Vec_32f16s;
typedef VResizeNoVec VResizeLanczos4Vec_32f;
#endif
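// No SIMD horizontal pass is provided here; the scalar loops in the HResize* templates below do the work.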
typedef HResizeNoVec HResizeLinearVec_8u32s;
typedef HResizeNoVec HResizeLinearVec_16u32f;
typedef HResizeNoVec HResizeLinearVec_16s32f;
typedef HResizeNoVec HResizeLinearVec_32f;
typedef HResizeNoVec HResizeLinearVec_64f;
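// Horizontal pass of the linear resize: every destination column dx blends the two
// neighbouring source samples at xofs[dx] and xofs[dx]+cn with the alpha[] weights into
// the wider intermediate type WT; columns past xmax fall outside the source and simply
// replicate the border sample (scaled by ONE).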
template<typename T, typename WT, typename AT, int ONE, class VecOp>
struct HResizeLinear
{
typedef T value_type;
typedef WT buf_type;
typedef AT alpha_type;
void operator()(const T** src, WT** dst, int count,
const int* xofs, const AT* alpha,
int swidth, int dwidth, int cn, int xmin, int xmax ) const
{
int dx, k;
VecOp vecOp;
int dx0 = vecOp((const uchar**)src, (uchar**)dst, count,
xofs, (const uchar*)alpha, swidth, dwidth, cn, xmin, xmax );
for( k = 0; k <= count - 2; k++ )
{
const T *S0 = src[k], *S1 = src[k+1];
WT *D0 = dst[k], *D1 = dst[k+1];
for( dx = dx0; dx < xmax; dx++ )
{
int sx = xofs[dx];
WT a0 = alpha[dx*2], a1 = alpha[dx*2+1];
WT t0 = S0[sx]*a0 + S0[sx + cn]*a1;
WT t1 = S1[sx]*a0 + S1[sx + cn]*a1;
D0[dx] = t0; D1[dx] = t1;
}
for( ; dx < dwidth; dx++ )
{
int sx = xofs[dx];
D0[dx] = WT(S0[sx]*ONE); D1[dx] = WT(S1[sx]*ONE);
}
}
for( ; k < count; k++ )
{
const T *S = src[k];
WT *D = dst[k];
for( dx = 0; dx < xmax; dx++ )
{
int sx = xofs[dx];
D[dx] = S[sx]*alpha[dx*2] + S[sx+cn]*alpha[dx*2+1];
}
for( ; dx < dwidth; dx++ )
D[dx] = WT(S[xofs[dx]]*ONE);
}
}
};
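// Vertical pass of the linear resize: blends two horizontally resized rows S0 and S1 with
// the beta[] weights and converts the accumulator back to the destination type via CastOp;
// the SIMD VecOp handles the bulk of the row first.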
template<typename T, typename WT, typename AT, class CastOp, class VecOp>
struct VResizeLinear
{
typedef T value_type;
typedef WT buf_type;
typedef AT alpha_type;
void operator()(const WT** src, T* dst, const AT* beta, int width ) const
{
WT b0 = beta[0], b1 = beta[1];
const WT *S0 = src[0], *S1 = src[1];
CastOp castOp;
VecOp vecOp;
int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
#if CV_ENABLE_UNROLLED
for( ; x <= width - 4; x += 4 )
{
WT t0, t1;
t0 = S0[x]*b0 + S1[x]*b1;
t1 = S0[x+1]*b0 + S1[x+1]*b1;
dst[x] = castOp(t0); dst[x+1] = castOp(t1);
t0 = S0[x+2]*b0 + S1[x+2]*b1;
t1 = S0[x+3]*b0 + S1[x+3]*b1;
dst[x+2] = castOp(t0); dst[x+3] = castOp(t1);
}
#endif
for( ; x < width; x++ )
dst[x] = castOp(S0[x]*b0 + S1[x]*b1);
}
};
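// Fixed-point specialization for 8-bit images: the intermediate rows are fixed-point ints,
// so the vertical blend pre-shifts by 4 and 16 bits and rounds with (+2, >>2), which avoids
// 32-bit overflow instead of using the generic FixedPtCast path.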
template<>
struct VResizeLinear<uchar, int, short, FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>, VResizeLinearVec_32s8u>
{
typedef uchar value_type;
typedef int buf_type;
typedef short alpha_type;
void operator()(const buf_type** src, value_type* dst, const alpha_type* beta, int width ) const
{
alpha_type b0 = beta[0], b1 = beta[1];
const buf_type *S0 = src[0], *S1 = src[1];
VResizeLinearVec_32s8u vecOp;
int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
#if CV_ENABLE_UNROLLED
for( ; x <= width - 4; x += 4 )
{
dst[x+0] = uchar(( ((b0 * (S0[x+0] >> 4)) >> 16) + ((b1 * (S1[x+0] >> 4)) >> 16) + 2)>>2);
dst[x+1] = uchar(( ((b0 * (S0[x+1] >> 4)) >> 16) + ((b1 * (S1[x+1] >> 4)) >> 16) + 2)>>2);
dst[x+2] = uchar(( ((b0 * (S0[x+2] >> 4)) >> 16) + ((b1 * (S1[x+2] >> 4)) >> 16) + 2)>>2);
dst[x+3] = uchar(( ((b0 * (S0[x+3] >> 4)) >> 16) + ((b1 * (S1[x+3] >> 4)) >> 16) + 2)>>2);
}
#endif
for( ; x < width; x++ )
dst[x] = uchar(( ((b0 * (S0[x] >> 4)) >> 16) + ((b1 * (S1[x] >> 4)) >> 16) + 2)>>2);
}
};
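// Horizontal pass of the bicubic resize: each destination sample is a weighted sum of four
// consecutive source samples; near the image borders the source index is clamped channel-wise
// so out-of-range taps reuse the nearest valid pixel.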
template<typename T, typename WT, typename AT>
struct HResizeCubic
{
typedef T value_type;
typedef WT buf_type;
typedef AT alpha_type;
void operator()(const T** src, WT** dst, int count,
const int* xofs, const AT* alpha,
int swidth, int dwidth, int cn, int xmin, int xmax ) const
{
for( int k = 0; k < count; k++ )
{
const T *S = src[k];
WT *D = dst[k];
int dx = 0, limit = xmin;
for(;;)
{
for( ; dx < limit; dx++, alpha += 4 )
{
int j, sx = xofs[dx] - cn;
WT v = 0;
for( j = 0; j < 4; j++ )
{
int sxj = sx + j*cn;
if( (unsigned)sxj >= (unsigned)swidth )
{
while( sxj < 0 )
sxj += cn;
while( sxj >= swidth )
sxj -= cn;
}
v += S[sxj]*alpha[j];
}
D[dx] = v;
}
if( limit == dwidth )
break;
for( ; dx < xmax; dx++, alpha += 4 )
{
int sx = xofs[dx];
D[dx] = S[sx-cn]*alpha[0] + S[sx]*alpha[1] +
S[sx+cn]*alpha[2] + S[sx+cn*2]*alpha[3];
}
limit = dwidth;
}
alpha -= dwidth*4;
}
}
};
template<typename T, typename WT, typename AT, class CastOp, class VecOp>
struct VResizeCubic
{
typedef T value_type;
typedef WT buf_type;
typedef AT alpha_type;
void operator()(const WT** src, T* dst, const AT* beta, int width ) const
{
WT b0 = beta[0], b1 = beta[1], b2 = beta[2], b3 = beta[3];
const WT *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
CastOp castOp;
VecOp vecOp;
int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
for( ; x < width; x++ )
dst[x] = castOp(S0[x]*b0 + S1[x]*b1 + S2[x]*b2 + S3[x]*b3);
}
};
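// Horizontal pass of the Lanczos4 resize: same structure as HResizeCubic but with an
// 8-tap kernel (alpha[0..7]) centred on xofs[dx].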
template<typename T, typename WT, typename AT>
struct HResizeLanczos4
{
typedef T value_type;
typedef WT buf_type;
typedef AT alpha_type;
void operator()(const T** src, WT** dst, int count,
const int* xofs, const AT* alpha,
int swidth, int dwidth, int cn, int xmin, int xmax ) const
{
for( int k = 0; k < count; k++ )
{
const T *S = src[k];
WT *D = dst[k];
int dx = 0, limit = xmin;
for(;;)
{
for( ; dx < limit; dx++, alpha += 8 )
{
int j, sx = xofs[dx] - cn*3;
WT v = 0;
for( j = 0; j < 8; j++ )
{
int sxj = sx + j*cn;
if( (unsigned)sxj >= (unsigned)swidth )
{
while( sxj < 0 )
sxj += cn;
while( sxj >= swidth )
sxj -= cn;
}
v += S[sxj]*alpha[j];
}
D[dx] = v;
}
if( limit == dwidth )
break;
for( ; dx < xmax; dx++, alpha += 8 )
{
int sx = xofs[dx];
D[dx] = S[sx-cn*3]*alpha[0] + S[sx-cn*2]*alpha[1] +
S[sx-cn]*alpha[2] + S[sx]*alpha[3] +
S[sx+cn]*alpha[4] + S[sx+cn*2]*alpha[5] +
S[sx+cn*3]*alpha[6] + S[sx+cn*4]*alpha[7];
}
limit = dwidth;
}
alpha -= dwidth*8;
}
}
};
template<typename T, typename WT, typename AT, class CastOp, class VecOp>
struct VResizeLanczos4
{
typedef T value_type;
typedef WT buf_type;
typedef AT alpha_type;
void operator()(const WT** src, T* dst, const AT* beta, int width ) const
{
CastOp castOp;
VecOp vecOp;
int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
#if CV_ENABLE_UNROLLED
for( ; x <= width - 4; x += 4 )
{
WT b = beta[0];
const WT* S = src[0];
WT s0 = S[x]*b, s1 = S[x+1]*b, s2 = S[x+2]*b, s3 = S[x+3]*b;
for( int k = 1; k < 8; k++ )
{
b = beta[k]; S = src[k];
s0 += S[x]*b; s1 += S[x+1]*b;
s2 += S[x+2]*b; s3 += S[x+3]*b;
}
dst[x] = castOp(s0); dst[x+1] = castOp(s1);
dst[x+2] = castOp(s2); dst[x+3] = castOp(s3);
}
#endif
for( ; x < width; x++ )
{
dst[x] = castOp(src[0][x]*beta[0] + src[1][x]*beta[1] +
src[2][x]*beta[2] + src[3][x]*beta[3] + src[4][x]*beta[4] +
src[5][x]*beta[5] + src[6][x]*beta[6] + src[7][x]*beta[7]);
}
}
};
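// Clamps x into [a, b): values below a map to a, values at or above b map to b-1.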
static inline int clip(int x, int a, int b)
{
return x >= a ? (x < b ? x : b-1) : a;
}
static const int MAX_ESIZE=16;
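// Generic separable resize engine: for each destination row, the ksize source rows it needs
// are horizontally resized into a small set of intermediate buffers (rows already computed
// for the previous destination row are reused via prev_sy), then the vertical filter produces
// the output row; destination rows are distributed across threads by parallel_for_.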
template <typename HResize, typename VResize>
class resizeGeneric_Invoker :
public ParallelLoopBody
{
public:
typedef typename HResize::value_type T;
typedef typename HResize::buf_type WT;
typedef typename HResize::alpha_type AT;
resizeGeneric_Invoker(const Mat& _src, Mat &_dst, const int *_xofs, const int *_yofs,
const AT* _alpha, const AT* __beta, const Size& _ssize, const Size &_dsize,
int _ksize, int _xmin, int _xmax) :
ParallelLoopBody(), src(_src), dst(_dst), xofs(_xofs), yofs(_yofs),
alpha(_alpha), _beta(__beta), ssize(_ssize), dsize(_dsize),
ksize(_ksize), xmin(_xmin), xmax(_xmax)
{
CV_Assert(ksize <= MAX_ESIZE);
}
virtual void operator() (const Range& range) const
{
int dy, cn = src.channels();
HResize hresize;
VResize vresize;
int bufstep = (int)alignSize(dsize.width, 16);
AutoBuffer<WT> _buffer(bufstep*ksize);
const T* srows[MAX_ESIZE]={0};
WT* rows[MAX_ESIZE]={0};
int prev_sy[MAX_ESIZE];
for(int k = 0; k < ksize; k++ )
{
prev_sy[k] = -1;
rows[k] = (WT*)_buffer + bufstep*k;
}
const AT* beta = _beta + ksize * range.start;
for( dy = range.start; dy < range.end; dy++, beta += ksize )
{
int sy0 = yofs[dy], k0=ksize, k1=0, ksize2 = ksize/2;
for(int k = 0; k < ksize; k++ )
{
int sy = clip(sy0 - ksize2 + 1 + k, 0, ssize.height);
for( k1 = std::max(k1, k); k1 < ksize; k1++ )
{
if( k1 < MAX_ESIZE && sy == prev_sy[k1] ) // if the sy-th row has been computed already, reuse it.
{
if( k1 > k )
memcpy( rows[k], rows[k1], bufstep*sizeof(rows[0][0]) );
break;
}
}
if( k1 == ksize )
k0 = std::min(k0, k); // remember the first row that needs to be computed
srows[k] = src.template ptr<T>(sy);
prev_sy[k] = sy;
}
if( k0 < ksize )
hresize( (const T**)(srows + k0), (WT**)(rows + k0), ksize - k0, xofs, (const AT*)(alpha),
ssize.width, dsize.width, cn, xmin, xmax );
vresize( (const WT**)rows, (T*)(dst.data + dst.step*dy), beta, dsize.width );
}
}
private:
Mat src;
Mat dst;
const int* xofs, *yofs;
const AT* alpha, *_beta;
Size ssize, dsize;
const int ksize, xmin, xmax;
resizeGeneric_Invoker& operator = (const resizeGeneric_Invoker&);
};
template<class HResize, class VResize>
static void resizeGeneric_( const Mat& src, Mat& dst,
const int* xofs, const void* _alpha,
const int* yofs, const void* _beta,
int xmin, int xmax, int ksize )
{
typedef typename HResize::alpha_type AT;
const AT* beta = (const AT*)_beta;
Size ssize = src.size(), dsize = dst.size();
int cn = src.channels();
ssize.width *= cn;
dsize.width *= cn;
xmin *= cn;
xmax *= cn;
    // image resize is a separable operation, performed as a horizontal pass followed by a
    // vertical one; the destination rows are processed in parallel
Range range(0, dsize.height);
resizeGeneric_Invoker<HResize, VResize> invoker(src, dst, xofs, yofs, (const AT*)_alpha, beta,
ssize, dsize, ksize, xmin, xmax);
parallel_for_(range, invoker, dst.total()/(double)(1<<16));
}
template <typename T, typename WT>
struct ResizeAreaFastNoVec
{
ResizeAreaFastNoVec(int, int) { }
ResizeAreaFastNoVec(int, int, int, int) { }
int operator() (const T*, T*, int) const
{ return 0; }
};
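// Vectorized kernels for the fast INTER_AREA path (2x decimation in both directions):
// each destination pixel is the rounded average of a 2x2 block of source pixels,
// using integer (+2, >>2) rounding for 8u/16u/16s and a 0.25 multiply for 32f.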
#if CV_NEON
class ResizeAreaFastVec_SIMD_8u
{
public:
ResizeAreaFastVec_SIMD_8u(int _cn, int _step) :
cn(_cn), step(_step)
{
}
int operator() (const uchar* S, uchar* D, int w) const
{
int dx = 0;
const uchar* S0 = S, * S1 = S0 + step;
uint16x8_t v_2 = vdupq_n_u16(2);
if (cn == 1)
{
for ( ; dx <= w - 16; dx += 16, S0 += 32, S1 += 32, D += 16)
{
uint8x16x2_t v_row0 = vld2q_u8(S0), v_row1 = vld2q_u8(S1);
uint16x8_t v_dst0 = vaddl_u8(vget_low_u8(v_row0.val[0]), vget_low_u8(v_row0.val[1]));
v_dst0 = vaddq_u16(v_dst0, vaddl_u8(vget_low_u8(v_row1.val[0]), vget_low_u8(v_row1.val[1])));
v_dst0 = vshrq_n_u16(vaddq_u16(v_dst0, v_2), 2);
uint16x8_t v_dst1 = vaddl_u8(vget_high_u8(v_row0.val[0]), vget_high_u8(v_row0.val[1]));
v_dst1 = vaddq_u16(v_dst1, vaddl_u8(vget_high_u8(v_row1.val[0]), vget_high_u8(v_row1.val[1])));
v_dst1 = vshrq_n_u16(vaddq_u16(v_dst1, v_2), 2);
vst1q_u8(D, vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1)));
}
}
else if (cn == 4)
{
for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
{
uint8x16_t v_row0 = vld1q_u8(S0), v_row1 = vld1q_u8(S1);
uint16x8_t v_row00 = vmovl_u8(vget_low_u8(v_row0));
uint16x8_t v_row01 = vmovl_u8(vget_high_u8(v_row0));
uint16x8_t v_row10 = vmovl_u8(vget_low_u8(v_row1));
uint16x8_t v_row11 = vmovl_u8(vget_high_u8(v_row1));
uint16x4_t v_p0 = vadd_u16(vadd_u16(vget_low_u16(v_row00), vget_high_u16(v_row00)),
vadd_u16(vget_low_u16(v_row10), vget_high_u16(v_row10)));
uint16x4_t v_p1 = vadd_u16(vadd_u16(vget_low_u16(v_row01), vget_high_u16(v_row01)),
vadd_u16(vget_low_u16(v_row11), vget_high_u16(v_row11)));
uint16x8_t v_dst = vshrq_n_u16(vaddq_u16(vcombine_u16(v_p0, v_p1), v_2), 2);
vst1_u8(D, vmovn_u16(v_dst));
}
}
return dx;
}
private:
int cn, step;
};
class ResizeAreaFastVec_SIMD_16u
{
public:
ResizeAreaFastVec_SIMD_16u(int _cn, int _step) :
cn(_cn), step(_step)
{
}
int operator() (const ushort * S, ushort * D, int w) const
{
int dx = 0;
const ushort * S0 = S, * S1 = (const ushort *)((const uchar *)(S0) + step);
uint32x4_t v_2 = vdupq_n_u32(2);
if (cn == 1)
{
for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
{
uint16x8x2_t v_row0 = vld2q_u16(S0), v_row1 = vld2q_u16(S1);
uint32x4_t v_dst0 = vaddl_u16(vget_low_u16(v_row0.val[0]), vget_low_u16(v_row0.val[1]));
v_dst0 = vaddq_u32(v_dst0, vaddl_u16(vget_low_u16(v_row1.val[0]), vget_low_u16(v_row1.val[1])));
v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_2), 2);
uint32x4_t v_dst1 = vaddl_u16(vget_high_u16(v_row0.val[0]), vget_high_u16(v_row0.val[1]));
v_dst1 = vaddq_u32(v_dst1, vaddl_u16(vget_high_u16(v_row1.val[0]), vget_high_u16(v_row1.val[1])));
v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_2), 2);
vst1q_u16(D, vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1)));
}
}
else if (cn == 4)
{
for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
{
uint16x8_t v_row0 = vld1q_u16(S0), v_row1 = vld1q_u16(S1);
uint32x4_t v_dst = vaddq_u32(vaddl_u16(vget_low_u16(v_row0), vget_high_u16(v_row0)),
vaddl_u16(vget_low_u16(v_row1), vget_high_u16(v_row1)));
vst1_u16(D, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_2), 2)));
}
}
return dx;
}
private:
int cn, step;
};
class ResizeAreaFastVec_SIMD_16s
{
public:
ResizeAreaFastVec_SIMD_16s(int _cn, int _step) :
cn(_cn), step(_step)
{
}
int operator() (const short * S, short * D, int w) const
{
int dx = 0;
const short * S0 = S, * S1 = (const short *)((const uchar *)(S0) + step);
int32x4_t v_2 = vdupq_n_s32(2);
if (cn == 1)
{
for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
{
int16x8x2_t v_row0 = vld2q_s16(S0), v_row1 = vld2q_s16(S1);
int32x4_t v_dst0 = vaddl_s16(vget_low_s16(v_row0.val[0]), vget_low_s16(v_row0.val[1]));
v_dst0 = vaddq_s32(v_dst0, vaddl_s16(vget_low_s16(v_row1.val[0]), vget_low_s16(v_row1.val[1])));
v_dst0 = vshrq_n_s32(vaddq_s32(v_dst0, v_2), 2);
int32x4_t v_dst1 = vaddl_s16(vget_high_s16(v_row0.val[0]), vget_high_s16(v_row0.val[1]));
v_dst1 = vaddq_s32(v_dst1, vaddl_s16(vget_high_s16(v_row1.val[0]), vget_high_s16(v_row1.val[1])));
v_dst1 = vshrq_n_s32(vaddq_s32(v_dst1, v_2), 2);
vst1q_s16(D, vcombine_s16(vmovn_s32(v_dst0), vmovn_s32(v_dst1)));
}
}
else if (cn == 4)
{
for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
{
int16x8_t v_row0 = vld1q_s16(S0), v_row1 = vld1q_s16(S1);
int32x4_t v_dst = vaddq_s32(vaddl_s16(vget_low_s16(v_row0), vget_high_s16(v_row0)),
vaddl_s16(vget_low_s16(v_row1), vget_high_s16(v_row1)));
vst1_s16(D, vmovn_s32(vshrq_n_s32(vaddq_s32(v_dst, v_2), 2)));
}
}
return dx;
}
private:
int cn, step;
};
struct ResizeAreaFastVec_SIMD_32f
{
ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) :
cn(_cn), step(_step)
{
fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4);
}
int operator() (const float * S, float * D, int w) const
{
if (!fast_mode)
return 0;
const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step);
int dx = 0;
float32x4_t v_025 = vdupq_n_f32(0.25f);
if (cn == 1)
{
for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
{
float32x4x2_t v_row0 = vld2q_f32(S0), v_row1 = vld2q_f32(S1);
float32x4_t v_dst0 = vaddq_f32(v_row0.val[0], v_row0.val[1]);
float32x4_t v_dst1 = vaddq_f32(v_row1.val[0], v_row1.val[1]);
vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025));
}
}
else if (cn == 4)
{
for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
{
float32x4_t v_dst0 = vaddq_f32(vld1q_f32(S0), vld1q_f32(S0 + 4));
float32x4_t v_dst1 = vaddq_f32(vld1q_f32(S1), vld1q_f32(S1 + 4));
vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025));
}
}
return dx;
}
private:
int cn;
bool fast_mode;
int step;
};
#elif CV_SSE2
class ResizeAreaFastVec_SIMD_8u
{
public:
ResizeAreaFastVec_SIMD_8u(int _cn, int _step) :
cn(_cn), step(_step)
{
use_simd = checkHardwareSupport(CV_CPU_SSE2);
}
int operator() (const uchar* S, uchar* D, int w) const
{
if (!use_simd)
return 0;
int dx = 0;
const uchar* S0 = S;
const uchar* S1 = S0 + step;
__m128i zero = _mm_setzero_si128();
__m128i delta2 = _mm_set1_epi16(2);
if (cn == 1)
{
__m128i masklow = _mm_set1_epi16(0x00ff);
for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
{
__m128i r0 = _mm_loadu_si128((const __m128i*)S0);
__m128i r1 = _mm_loadu_si128((const __m128i*)S1);
__m128i s0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_and_si128(r0, masklow));
__m128i s1 = _mm_add_epi16(_mm_srli_epi16(r1, 8), _mm_and_si128(r1, masklow));
s0 = _mm_add_epi16(_mm_add_epi16(s0, s1), delta2);
s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
_mm_storel_epi64((__m128i*)D, s0);
}
}
else if (cn == 3)
for ( ; dx <= w - 11; dx += 6, S0 += 12, S1 += 12, D += 6)
{
__m128i r0 = _mm_loadu_si128((const __m128i*)S0);
__m128i r1 = _mm_loadu_si128((const __m128i*)S1);
__m128i r0_16l = _mm_unpacklo_epi8(r0, zero);
__m128i r0_16h = _mm_unpacklo_epi8(_mm_srli_si128(r0, 6), zero);
__m128i r1_16l = _mm_unpacklo_epi8(r1, zero);
__m128i r1_16h = _mm_unpacklo_epi8(_mm_srli_si128(r1, 6), zero);
__m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 6));
__m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 6));
s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
_mm_storel_epi64((__m128i*)D, s0);
s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 6));
s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 6));
s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
_mm_storel_epi64((__m128i*)(D+3), s0);
}
else
{
CV_Assert(cn == 4);
int v[] = { 0, 0, -1, -1 };
__m128i mask = _mm_loadu_si128((const __m128i*)v);
for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
{
__m128i r0 = _mm_loadu_si128((const __m128i*)S0);
__m128i r1 = _mm_loadu_si128((const __m128i*)S1);
__m128i r0_16l = _mm_unpacklo_epi8(r0, zero);
__m128i r0_16h = _mm_unpackhi_epi8(r0, zero);
__m128i r1_16l = _mm_unpacklo_epi8(r1, zero);
__m128i r1_16h = _mm_unpackhi_epi8(r1, zero);
__m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 8));
__m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 8));
s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
__m128i res0 = _mm_srli_epi16(s0, 2);
s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 8));
s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 8));
s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
__m128i res1 = _mm_srli_epi16(s0, 2);
s0 = _mm_packus_epi16(_mm_or_si128(_mm_andnot_si128(mask, res0),
_mm_and_si128(mask, _mm_slli_si128(res1, 8))), zero);
_mm_storel_epi64((__m128i*)(D), s0);
}
}
return dx;
}
private:
int cn;
bool use_simd;
int step;
};
class ResizeAreaFastVec_SIMD_16u
{
public:
ResizeAreaFastVec_SIMD_16u(int _cn, int _step) :
cn(_cn), step(_step)
{
use_simd = checkHardwareSupport(CV_CPU_SSE2);
}
int operator() (const ushort* S, ushort* D, int w) const
{
if (!use_simd)
return 0;
int dx = 0;
const ushort* S0 = (const ushort*)S;
const ushort* S1 = (const ushort*)((const uchar*)(S) + step);
__m128i masklow = _mm_set1_epi32(0x0000ffff);
__m128i zero = _mm_setzero_si128();
__m128i delta2 = _mm_set1_epi32(2);
#define _mm_packus_epi32(a, zero) _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(a, 16), 16), zero)
if (cn == 1)
{
for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
{
__m128i r0 = _mm_loadu_si128((const __m128i*)S0);
__m128i r1 = _mm_loadu_si128((const __m128i*)S1);
__m128i s0 = _mm_add_epi32(_mm_srli_epi32(r0, 16), _mm_and_si128(r0, masklow));
__m128i s1 = _mm_add_epi32(_mm_srli_epi32(r1, 16), _mm_and_si128(r1, masklow));
s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2);
s0 = _mm_srli_epi32(s0, 2);
s0 = _mm_packus_epi32(s0, zero);
_mm_storel_epi64((__m128i*)D, s0);
}
}
else if (cn == 3)
for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3)
{
__m128i r0 = _mm_loadu_si128((const __m128i*)S0);
__m128i r1 = _mm_loadu_si128((const __m128i*)S1);
__m128i r0_16l = _mm_unpacklo_epi16(r0, zero);
__m128i r0_16h = _mm_unpacklo_epi16(_mm_srli_si128(r0, 6), zero);
__m128i r1_16l = _mm_unpacklo_epi16(r1, zero);
__m128i r1_16h = _mm_unpacklo_epi16(_mm_srli_si128(r1, 6), zero);
__m128i s0 = _mm_add_epi32(r0_16l, r0_16h);
__m128i s1 = _mm_add_epi32(r1_16l, r1_16h);
s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1));
s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero);
_mm_storel_epi64((__m128i*)D, s0);
}
else
{
CV_Assert(cn == 4);
for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
{
__m128i r0 = _mm_loadu_si128((const __m128i*)S0);
__m128i r1 = _mm_loadu_si128((const __m128i*)S1);
__m128i r0_32l = _mm_unpacklo_epi16(r0, zero);
__m128i r0_32h = _mm_unpackhi_epi16(r0, zero);
__m128i r1_32l = _mm_unpacklo_epi16(r1, zero);
__m128i r1_32h = _mm_unpackhi_epi16(r1, zero);
__m128i s0 = _mm_add_epi32(r0_32l, r0_32h);
__m128i s1 = _mm_add_epi32(r1_32l, r1_32h);
s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2));
s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero);
_mm_storel_epi64((__m128i*)D, s0);
}
}
#undef _mm_packus_epi32
return dx;
}
private:
int cn;
int step;
bool use_simd;
};
class ResizeAreaFastVec_SIMD_16s
{
public:
ResizeAreaFastVec_SIMD_16s(int _cn, int _step) :
cn(_cn), step(_step)
{
use_simd = checkHardwareSupport(CV_CPU_SSE2);
}
int operator() (const short* S, short* D, int w) const
{
if (!use_simd)
return 0;
int dx = 0;
const short* S0 = (const short*)S;
const short* S1 = (const short*)((const uchar*)(S) + step);
__m128i masklow = _mm_set1_epi32(0x0000ffff);
__m128i zero = _mm_setzero_si128();
__m128i delta2 = _mm_set1_epi32(2);
if (cn == 1)
{
for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
{
__m128i r0 = _mm_loadu_si128((const __m128i*)S0);
__m128i r1 = _mm_loadu_si128((const __m128i*)S1);
__m128i s0 = _mm_add_epi32(_mm_srai_epi32(r0, 16),
_mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r0, masklow), 16), 16));
__m128i s1 = _mm_add_epi32(_mm_srai_epi32(r1, 16),
_mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r1, masklow), 16), 16));
s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2);
s0 = _mm_srai_epi32(s0, 2);
s0 = _mm_packs_epi32(s0, zero);
_mm_storel_epi64((__m128i*)D, s0);
}
}
else if (cn == 3)
for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3)
{
__m128i r0 = _mm_loadu_si128((const __m128i*)S0);
__m128i r1 = _mm_loadu_si128((const __m128i*)S1);
__m128i r0_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16);
__m128i r0_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r0, 6)), 16);
__m128i r1_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16);
__m128i r1_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r1, 6)), 16);
__m128i s0 = _mm_add_epi32(r0_16l, r0_16h);
__m128i s1 = _mm_add_epi32(r1_16l, r1_16h);
s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1));
s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero);
_mm_storel_epi64((__m128i*)D, s0);
}
else
{
CV_Assert(cn == 4);
for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
{
__m128i r0 = _mm_loadu_si128((const __m128i*)S0);
__m128i r1 = _mm_loadu_si128((const __m128i*)S1);
__m128i r0_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16);
__m128i r0_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r0), 16);
__m128i r1_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16);
__m128i r1_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r1), 16);
__m128i s0 = _mm_add_epi32(r0_32l, r0_32h);
__m128i s1 = _mm_add_epi32(r1_32l, r1_32h);
s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2));
s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero);
_mm_storel_epi64((__m128i*)D, s0);
}
}
return dx;
}
private:
int cn;
int step;
bool use_simd;
};
struct ResizeAreaFastVec_SIMD_32f
{
ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) :
cn(_cn), step(_step)
{
fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4);
fast_mode = fast_mode && checkHardwareSupport(CV_CPU_SSE2);
}
int operator() (const float * S, float * D, int w) const
{
if (!fast_mode)
return 0;
const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step);
int dx = 0;
__m128 v_025 = _mm_set1_ps(0.25f);
if (cn == 1)
{
const int shuffle_lo = _MM_SHUFFLE(2, 0, 2, 0), shuffle_hi = _MM_SHUFFLE(3, 1, 3, 1);
for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
{
__m128 v_row00 = _mm_loadu_ps(S0), v_row01 = _mm_loadu_ps(S0 + 4),
v_row10 = _mm_loadu_ps(S1), v_row11 = _mm_loadu_ps(S1 + 4);
__m128 v_dst0 = _mm_add_ps(_mm_shuffle_ps(v_row00, v_row01, shuffle_lo),
_mm_shuffle_ps(v_row00, v_row01, shuffle_hi));
__m128 v_dst1 = _mm_add_ps(_mm_shuffle_ps(v_row10, v_row11, shuffle_lo),
_mm_shuffle_ps(v_row10, v_row11, shuffle_hi));
_mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025));
}
}
else if (cn == 4)
{
for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
{
__m128 v_dst0 = _mm_add_ps(_mm_loadu_ps(S0), _mm_loadu_ps(S0 + 4));
__m128 v_dst1 = _mm_add_ps(_mm_loadu_ps(S1), _mm_loadu_ps(S1 + 4));
_mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025));
}
}
return dx;
}
private:
int cn;
bool fast_mode;
int step;
};
#else
typedef ResizeAreaFastNoVec<uchar, uchar> ResizeAreaFastVec_SIMD_8u;
typedef ResizeAreaFastNoVec<ushort, ushort> ResizeAreaFastVec_SIMD_16u;
typedef ResizeAreaFastNoVec<short, short> ResizeAreaFastVec_SIMD_16s;
typedef ResizeAreaFastNoVec<float, float> ResizeAreaFastVec_SIMD_32f;
#endif
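// Wrapper combining a SIMD kernel with a scalar tail for the 2x2 area decimation; only
// scale 2x2 with 1, 3 or 4 channels uses this fast path, everything else returns 0 so the
// caller falls back to the generic area code.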
template<typename T, typename SIMDVecOp>
struct ResizeAreaFastVec
{
ResizeAreaFastVec(int _scale_x, int _scale_y, int _cn, int _step) :
scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step), vecOp(_cn, _step)
{
fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4);
}
int operator() (const T* S, T* D, int w) const
{
if (!fast_mode)
return 0;
const T* nextS = (const T*)((const uchar*)S + step);
int dx = vecOp(S, D, w);
if (cn == 1)
for( ; dx < w; ++dx )
{
int index = dx*2;
D[dx] = (T)((S[index] + S[index+1] + nextS[index] + nextS[index+1] + 2) >> 2);
}
else if (cn == 3)
for( ; dx < w; dx += 3 )
{
int index = dx*2;
D[dx] = (T)((S[index] + S[index+3] + nextS[index] + nextS[index+3] + 2) >> 2);
D[dx+1] = (T)((S[index+1] + S[index+4] + nextS[index+1] + nextS[index+4] + 2) >> 2);
D[dx+2] = (T)((S[index+2] + S[index+5] + nextS[index+2] + nextS[index+5] + 2) >> 2);
}
else
{
CV_Assert(cn == 4);
for( ; dx < w; dx += 4 )
{
int index = dx*2;
D[dx] = (T)((S[index] + S[index+4] + nextS[index] + nextS[index+4] + 2) >> 2);
D[dx+1] = (T)((S[index+1] + S[index+5] + nextS[index+1] + nextS[index+5] + 2) >> 2);
D[dx+2] = (T)((S[index+2] + S[index+6] + nextS[index+2] + nextS[index+6] + 2) >> 2);
D[dx+3] = (T)((S[index+3] + S[index+7] + nextS[index+3] + nextS[index+7] + 2) >> 2);
}
}
return dx;
}
private:
int scale_x, scale_y;
int cn;
bool fast_mode;
int step;
SIMDVecOp vecOp;
};
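// Parallel body of the fast INTER_AREA path: each destination pixel averages its
// scale_x x scale_y block of source pixels; destination columns and rows whose block
// extends past the source border are averaged over the pixels that actually exist.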
template <typename T, typename WT, typename VecOp>
class resizeAreaFast_Invoker :
public ParallelLoopBody
{
public:
resizeAreaFast_Invoker(const Mat &_src, Mat &_dst,
int _scale_x, int _scale_y, const int* _ofs, const int* _xofs) :
ParallelLoopBody(), src(_src), dst(_dst), scale_x(_scale_x),
scale_y(_scale_y), ofs(_ofs), xofs(_xofs)
{
}
virtual void operator() (const Range& range) const
{
Size ssize = src.size(), dsize = dst.size();
int cn = src.channels();
int area = scale_x*scale_y;
float scale = 1.f/(area);
int dwidth1 = (ssize.width/scale_x)*cn;
dsize.width *= cn;
ssize.width *= cn;
int dy, dx, k = 0;
VecOp vop(scale_x, scale_y, src.channels(), (int)src.step/*, area_ofs*/);
for( dy = range.start; dy < range.end; dy++ )
{
T* D = (T*)(dst.data + dst.step*dy);
int sy0 = dy*scale_y;
int w = sy0 + scale_y <= ssize.height ? dwidth1 : 0;
if( sy0 >= ssize.height )
{
for( dx = 0; dx < dsize.width; dx++ )
D[dx] = 0;
continue;
}
dx = vop(src.template ptr<T>(sy0), D, w);
for( ; dx < w; dx++ )
{
const T* S = src.template ptr<T>(sy0) + xofs[dx];
WT sum = 0;
k = 0;
#if CV_ENABLE_UNROLLED
for( ; k <= area - 4; k += 4 )
sum += S[ofs[k]] + S[ofs[k+1]] + S[ofs[k+2]] + S[ofs[k+3]];
#endif
for( ; k < area; k++ )
sum += S[ofs[k]];
D[dx] = saturate_cast<T>(sum * scale);
}
for( ; dx < dsize.width; dx++ )
{
WT sum = 0;
int count = 0, sx0 = xofs[dx];
if( sx0 >= ssize.width )
D[dx] = 0;
for( int sy = 0; sy < scale_y; sy++ )
{
if( sy0 + sy >= ssize.height )
break;
const T* S = src.template ptr<T>(sy0 + sy) + sx0;
for( int sx = 0; sx < scale_x*cn; sx += cn )
{
if( sx0 + sx >= ssize.width )
break;
sum += S[sx];
count++;
}
}
D[dx] = saturate_cast<T>((float)sum/count);
}
}
}
private:
Mat src;
Mat dst;
int scale_x, scale_y;
const int *ofs, *xofs;
};
template<typename T, typename WT, typename VecOp>
static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int* xofs,
int scale_x, int scale_y )
{
Range range(0, dst.rows);
resizeAreaFast_Invoker<T, WT, VecOp> invoker(src, dst, scale_x,
scale_y, ofs, xofs);
parallel_for_(range, invoker, dst.total()/(double)(1<<16));
}
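// One entry of the INTER_AREA weight table: destination index di accumulates
// source index si with weight alpha.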
struct DecimateAlpha
{
int si, di;
float alpha;
};
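// Parallel body of the true INTER_AREA path: for each band of destination rows, the source
// rows listed in ytab are first expanded horizontally through xtab into buf, then accumulated
// into sum with the row weight beta and flushed to the destination when the destination row
// index changes.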
template<typename T, typename WT> class ResizeArea_Invoker :
public ParallelLoopBody
{
public:
ResizeArea_Invoker( const Mat& _src, Mat& _dst,
const DecimateAlpha* _xtab, int _xtab_size,
const DecimateAlpha* _ytab, int _ytab_size,
const int* _tabofs )
{
src = &_src;
dst = &_dst;
xtab0 = _xtab;
xtab_size0 = _xtab_size;
ytab = _ytab;
ytab_size = _ytab_size;
tabofs = _tabofs;
}
virtual void operator() (const Range& range) const
{
Size dsize = dst->size();
int cn = dst->channels();
dsize.width *= cn;
AutoBuffer<WT> _buffer(dsize.width*2);
const DecimateAlpha* xtab = xtab0;
int xtab_size = xtab_size0;
WT *buf = _buffer, *sum = buf + dsize.width;
int j_start = tabofs[range.start], j_end = tabofs[range.end], j, k, dx, prev_dy = ytab[j_start].di;
for( dx = 0; dx < dsize.width; dx++ )
sum[dx] = (WT)0;
for( j = j_start; j < j_end; j++ )
{
WT beta = ytab[j].alpha;
int dy = ytab[j].di;
int sy = ytab[j].si;
{
const T* S = src->template ptr<T>(sy);
for( dx = 0; dx < dsize.width; dx++ )
buf[dx] = (WT)0;
if( cn == 1 )
for( k = 0; k < xtab_size; k++ )
{
int dxn = xtab[k].di;
WT alpha = xtab[k].alpha;
buf[dxn] += S[xtab[k].si]*alpha;
}
else if( cn == 2 )
for( k = 0; k < xtab_size; k++ )
{
int sxn = xtab[k].si;
int dxn = xtab[k].di;
WT alpha = xtab[k].alpha;
WT t0 = buf[dxn] + S[sxn]*alpha;
WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
buf[dxn] = t0; buf[dxn+1] = t1;
}
else if( cn == 3 )
for( k = 0; k < xtab_size; k++ )
{
int sxn = xtab[k].si;
int dxn = xtab[k].di;
WT alpha = xtab[k].alpha;
WT t0 = buf[dxn] + S[sxn]*alpha;
WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
WT t2 = buf[dxn+2] + S[sxn+2]*alpha;
buf[dxn] = t0; buf[dxn+1] = t1; buf[dxn+2] = t2;
}
else if( cn == 4 )
{
for( k = 0; k < xtab_size; k++ )
{
int sxn = xtab[k].si;
int dxn = xtab[k].di;
WT alpha = xtab[k].alpha;
WT t0 = buf[dxn] + S[sxn]*alpha;
WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
buf[dxn] = t0; buf[dxn+1] = t1;
t0 = buf[dxn+2] + S[sxn+2]*alpha;
t1 = buf[dxn+3] + S[sxn+3]*alpha;
buf[dxn+2] = t0; buf[dxn+3] = t1;
}
}
else
{
for( k = 0; k < xtab_size; k++ )
{
int sxn = xtab[k].si;
int dxn = xtab[k].di;
WT alpha = xtab[k].alpha;
for( int c = 0; c < cn; c++ )
buf[dxn + c] += S[sxn + c]*alpha;
}
}
}
if( dy != prev_dy )
{
T* D = dst->template ptr<T>(prev_dy);
for( dx = 0; dx < dsize.width; dx++ )
{
D[dx] = saturate_cast<T>(sum[dx]);
sum[dx] = beta*buf[dx];
}
prev_dy = dy;
}
else
{
for( dx = 0; dx < dsize.width; dx++ )
sum[dx] += beta*buf[dx];
}
}
{
T* D = dst->template ptr<T>(prev_dy);
for( dx = 0; dx < dsize.width; dx++ )
D[dx] = saturate_cast<T>(sum[dx]);
}
}
private:
const Mat* src;
Mat* dst;
const DecimateAlpha* xtab0;
const DecimateAlpha* ytab;
int xtab_size0, ytab_size;
const int* tabofs;
};
template <typename T, typename WT>
static void resizeArea_( const Mat& src, Mat& dst,
const DecimateAlpha* xtab, int xtab_size,
const DecimateAlpha* ytab, int ytab_size,
const int* tabofs )
{
parallel_for_(Range(0, dst.rows),
ResizeArea_Invoker<T, WT>(src, dst, xtab, xtab_size, ytab, ytab_size, tabofs),
dst.total()/((double)(1 << 16)));
}
typedef void (*ResizeFunc)( const Mat& src, Mat& dst,
const int* xofs, const void* alpha,
const int* yofs, const void* beta,
int xmin, int xmax, int ksize );
typedef void (*ResizeAreaFastFunc)( const Mat& src, Mat& dst,
const int* ofs, const int *xofs,
int scale_x, int scale_y );
typedef void (*ResizeAreaFunc)( const Mat& src, Mat& dst,
const DecimateAlpha* xtab, int xtab_size,
const DecimateAlpha* ytab, int ytab_size,
const int* yofs);
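// Builds the DecimateAlpha table for one axis of true area interpolation: every destination
// index gets the list of overlapping source indices with weights proportional to the overlap,
// normalized so that the weights of each destination cell sum to 1.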
static int computeResizeAreaTab( int ssize, int dsize, int cn, double scale, DecimateAlpha* tab )
{
int k = 0;
for(int dx = 0; dx < dsize; dx++ )
{
double fsx1 = dx * scale;
double fsx2 = fsx1 + scale;
double cellWidth = std::min(scale, ssize - fsx1);
int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
sx2 = std::min(sx2, ssize - 1);
sx1 = std::min(sx1, sx2);
if( sx1 - fsx1 > 1e-3 )
{
assert( k < ssize*2 );
tab[k].di = dx * cn;
tab[k].si = (sx1 - 1) * cn;
tab[k++].alpha = (float)((sx1 - fsx1) / cellWidth);
}
for(int sx = sx1; sx < sx2; sx++ )
{
assert( k < ssize*2 );
tab[k].di = dx * cn;
tab[k].si = sx * cn;
tab[k++].alpha = float(1.0 / cellWidth);
}
if( fsx2 - sx2 > 1e-3 )
{
assert( k < ssize*2 );
tab[k].di = dx * cn;
tab[k].si = sx2 * cn;
tab[k++].alpha = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth);
}
}
return k;
}
#ifdef HAVE_OPENCL
static void ocl_computeResizeAreaTabs(int ssize, int dsize, double scale, int * const map_tab,
float * const alpha_tab, int * const ofs_tab)
{
int k = 0, dx = 0;
for ( ; dx < dsize; dx++)
{
ofs_tab[dx] = k;
double fsx1 = dx * scale;
double fsx2 = fsx1 + scale;
double cellWidth = std::min(scale, ssize - fsx1);
int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
sx2 = std::min(sx2, ssize - 1);
sx1 = std::min(sx1, sx2);
if (sx1 - fsx1 > 1e-3)
{
map_tab[k] = sx1 - 1;
alpha_tab[k++] = (float)((sx1 - fsx1) / cellWidth);
}
for (int sx = sx1; sx < sx2; sx++)
{
map_tab[k] = sx;
alpha_tab[k++] = float(1.0 / cellWidth);
}
if (fsx2 - sx2 > 1e-3)
{
map_tab[k] = sx2;
alpha_tab[k++] = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth);
}
}
ofs_tab[dx] = k;
}
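// OpenCL resize: handles INTER_NEAREST, INTER_LINEAR (optionally through an image sampler)
// and INTER_AREA for up to 4 channels; returns false so the caller falls back to the CPU path
// for anything else.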
static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
double fx, double fy, int interpolation)
{
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
double inv_fx = 1.0 / fx, inv_fy = 1.0 / fy;
float inv_fxf = (float)inv_fx, inv_fyf = (float)inv_fy;
    int iscale_x = saturate_cast<int>(inv_fx), iscale_y = saturate_cast<int>(inv_fy);
bool is_area_fast = std::abs(inv_fx - iscale_x) < DBL_EPSILON &&
std::abs(inv_fy - iscale_y) < DBL_EPSILON;
    // when both scale factors equal 2, INTER_AREA (fast) would give
    // the same result as INTER_LINEAR
if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 )
/*interpolation = INTER_AREA*/(void)0; // INTER_AREA is slower
if( !(cn <= 4 &&
(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR ||
(interpolation == INTER_AREA && inv_fx >= 1 && inv_fy >= 1) )) )
return false;
UMat src = _src.getUMat();
_dst.create(dsize, type);
UMat dst = _dst.getUMat();
Size ssize = src.size();
ocl::Kernel k;
size_t globalsize[] = { (size_t)dst.cols, (size_t)dst.rows };
ocl::Image2D srcImage;
// See if this could be done with a sampler. We stick with integer
// datatypes because the observed error is low.
bool useSampler = (interpolation == INTER_LINEAR && ocl::Device::getDefault().imageSupport() &&
ocl::Image2D::canCreateAlias(src) && depth <= 4 &&
ocl::Image2D::isFormatSupported(depth, cn, true) &&
src.offset==0);
if (useSampler)
{
int wdepth = std::max(depth, CV_32S);
char buf[2][32];
cv::String compileOpts = format("-D USE_SAMPLER -D depth=%d -D T=%s -D T1=%s "
"-D convertToDT=%s -D cn=%d",
depth, ocl::typeToStr(type), ocl::typeToStr(depth),
ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
cn);
k.create("resizeSampler", ocl::imgproc::resize_oclsrc, compileOpts);
if (k.empty())
useSampler = false;
else
{
// Convert the input into an OpenCL image type, using normalized channel data types
// and aliasing the UMat.
srcImage = ocl::Image2D(src, true, true);
k.args(srcImage, ocl::KernelArg::WriteOnly(dst),
(float)inv_fx, (float)inv_fy);
}
}
if (interpolation == INTER_LINEAR && !useSampler)
{
char buf[2][32];
// integer path is slower because of CPU part, so it's disabled
if (depth == CV_8U && ((void)0, 0))
{
AutoBuffer<uchar> _buffer((dsize.width + dsize.height)*(sizeof(int) + sizeof(short)*2));
int* xofs = (int*)(uchar*)_buffer, * yofs = xofs + dsize.width;
short* ialpha = (short*)(yofs + dsize.height), * ibeta = ialpha + dsize.width*2;
float fxx, fyy;
int sx, sy;
for (int dx = 0; dx < dsize.width; dx++)
{
fxx = (float)((dx+0.5)*inv_fx - 0.5);
sx = cvFloor(fxx);
fxx -= sx;
if (sx < 0)
fxx = 0, sx = 0;
if (sx >= ssize.width-1)
fxx = 0, sx = ssize.width-1;
xofs[dx] = sx;
ialpha[dx*2 + 0] = saturate_cast<short>((1.f - fxx) * INTER_RESIZE_COEF_SCALE);
ialpha[dx*2 + 1] = saturate_cast<short>(fxx * INTER_RESIZE_COEF_SCALE);
}
for (int dy = 0; dy < dsize.height; dy++)
{
fyy = (float)((dy+0.5)*inv_fy - 0.5);
sy = cvFloor(fyy);
fyy -= sy;
yofs[dy] = sy;
ibeta[dy*2 + 0] = saturate_cast<short>((1.f - fyy) * INTER_RESIZE_COEF_SCALE);
ibeta[dy*2 + 1] = saturate_cast<short>(fyy * INTER_RESIZE_COEF_SCALE);
}
int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
UMat coeffs;
Mat(1, static_cast<int>(_buffer.size()), CV_8UC1, (uchar *)_buffer).copyTo(coeffs);
k.create("resizeLN", ocl::imgproc::resize_oclsrc,
format("-D INTER_LINEAR_INTEGER -D depth=%d -D T=%s -D T1=%s "
"-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
"-D INTER_RESIZE_COEF_BITS=%d",
depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
cn, INTER_RESIZE_COEF_BITS));
if (k.empty())
return false;
k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
ocl::KernelArg::PtrReadOnly(coeffs));
}
else
{
int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
k.create("resizeLN", ocl::imgproc::resize_oclsrc,
format("-D INTER_LINEAR -D depth=%d -D T=%s -D T1=%s "
"-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
"-D INTER_RESIZE_COEF_BITS=%d",
depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
cn, INTER_RESIZE_COEF_BITS));
if (k.empty())
return false;
k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
(float)inv_fx, (float)inv_fy);
}
}
else if (interpolation == INTER_NEAREST)
{
k.create("resizeNN", ocl::imgproc::resize_oclsrc,
format("-D INTER_NEAREST -D T=%s -D T1=%s -D cn=%d",
ocl::vecopTypeToStr(type), ocl::vecopTypeToStr(depth), cn));
if (k.empty())
return false;
k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
(float)inv_fx, (float)inv_fy);
}
else if (interpolation == INTER_AREA)
{
int wdepth = std::max(depth, is_area_fast ? CV_32S : CV_32F);
int wtype = CV_MAKE_TYPE(wdepth, cn);
char cvt[2][40];
String buildOption = format("-D INTER_AREA -D T=%s -D T1=%s -D WTV=%s -D convertToWTV=%s -D cn=%d",
ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
ocl::convertTypeStr(depth, wdepth, cn, cvt[0]), cn);
UMat alphaOcl, tabofsOcl, mapOcl;
UMat dmap, smap;
if (is_area_fast)
{
int wdepth2 = std::max(CV_32F, depth), wtype2 = CV_MAKE_TYPE(wdepth2, cn);
buildOption = buildOption + format(" -D convertToT=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST"
" -D XSCALE=%d -D YSCALE=%d -D SCALE=%ff",
ocl::convertTypeStr(wdepth2, depth, cn, cvt[0]),
ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1]),
iscale_x, iscale_y, 1.0f / (iscale_x * iscale_y));
k.create("resizeAREA_FAST", ocl::imgproc::resize_oclsrc, buildOption);
if (k.empty())
return false;
}
else
{
buildOption = buildOption + format(" -D convertToT=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0]));
k.create("resizeAREA", ocl::imgproc::resize_oclsrc, buildOption);
if (k.empty())
return false;
int xytab_size = (ssize.width + ssize.height) << 1;
int tabofs_size = dsize.height + dsize.width + 2;
AutoBuffer<int> _xymap_tab(xytab_size), _xyofs_tab(tabofs_size);
AutoBuffer<float> _xyalpha_tab(xytab_size);
int * xmap_tab = _xymap_tab, * ymap_tab = _xymap_tab + (ssize.width << 1);
float * xalpha_tab = _xyalpha_tab, * yalpha_tab = _xyalpha_tab + (ssize.width << 1);
int * xofs_tab = _xyofs_tab, * yofs_tab = _xyofs_tab + dsize.width + 1;
ocl_computeResizeAreaTabs(ssize.width, dsize.width, inv_fx, xmap_tab, xalpha_tab, xofs_tab);
ocl_computeResizeAreaTabs(ssize.height, dsize.height, inv_fy, ymap_tab, yalpha_tab, yofs_tab);
// loading precomputed arrays to GPU
Mat(1, xytab_size, CV_32FC1, (void *)_xyalpha_tab).copyTo(alphaOcl);
Mat(1, xytab_size, CV_32SC1, (void *)_xymap_tab).copyTo(mapOcl);
Mat(1, tabofs_size, CV_32SC1, (void *)_xyofs_tab).copyTo(tabofsOcl);
}
ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst);
if (is_area_fast)
k.args(srcarg, dstarg);
else
k.args(srcarg, dstarg, inv_fxf, inv_fyf, ocl::KernelArg::PtrReadOnly(tabofsOcl),
ocl::KernelArg::PtrReadOnly(mapOcl), ocl::KernelArg::PtrReadOnly(alphaOcl));
return k.run(2, globalsize, NULL, false);
}
return k.run(2, globalsize, 0, false);
}
#endif
#ifdef HAVE_IPP
#define IPP_RESIZE_PARALLEL 1
#ifdef HAVE_IPP_IW
class ipp_resizeParallel: public ParallelLoopBody
{
public:
ipp_resizeParallel(::ipp::IwiImage &src, ::ipp::IwiImage &dst, bool &ok):
m_src(src), m_dst(dst), m_ok(ok) {}
~ipp_resizeParallel()
{
}
void Init(IppiInterpolationType inter)
{
iwiResize.InitAlloc(m_src.m_size, m_dst.m_size, m_src.m_dataType, m_src.m_channels, inter, ::ipp::IwiResizeParams(0, 0, 0.75, 4), ippBorderRepl);
m_ok = true;
}
virtual void operator() (const Range& range) const
{
CV_INSTRUMENT_REGION_IPP()
if(!m_ok)
return;
try
{
::ipp::IwiTile tile = ::ipp::IwiRoi(0, range.start, m_dst.m_size.width, range.end - range.start);
CV_INSTRUMENT_FUN_IPP(iwiResize, m_src, m_dst, ippBorderRepl, tile);
}
catch(::ipp::IwException)
{
m_ok = false;
return;
}
}
private:
::ipp::IwiImage &m_src;
::ipp::IwiImage &m_dst;
mutable ::ipp::IwiResize iwiResize;
volatile bool &m_ok;
const ipp_resizeParallel& operator= (const ipp_resizeParallel&);
};
class ipp_resizeAffineParallel: public ParallelLoopBody
{
public:
ipp_resizeAffineParallel(::ipp::IwiImage &src, ::ipp::IwiImage &dst, bool &ok):
m_src(src), m_dst(dst), m_ok(ok) {}
~ipp_resizeAffineParallel()
{
}
void Init(IppiInterpolationType inter, double scaleX, double scaleY)
{
double shift = (inter == ippNearest)?-1e-10:-0.5;
double coeffs[2][3] = {
{scaleX, 0, shift+0.5*scaleX},
{0, scaleY, shift+0.5*scaleY}
};
iwiWarpAffine.InitAlloc(m_src.m_size, m_dst.m_size, m_src.m_dataType, m_src.m_channels, coeffs, iwTransForward, inter, ::ipp::IwiWarpAffineParams(0, 0, 0.75), ippBorderRepl);
m_ok = true;
}
virtual void operator() (const Range& range) const
{
CV_INSTRUMENT_REGION_IPP()
if(!m_ok)
return;
try
{
::ipp::IwiTile tile = ::ipp::IwiRoi(0, range.start, m_dst.m_size.width, range.end - range.start);
CV_INSTRUMENT_FUN_IPP(iwiWarpAffine, m_src, m_dst, tile);
}
catch(::ipp::IwException)
{
m_ok = false;
return;
}
}
private:
::ipp::IwiImage &m_src;
::ipp::IwiImage &m_dst;
mutable ::ipp::IwiWarpAffine iwiWarpAffine;
volatile bool &m_ok;
const ipp_resizeAffineParallel& operator= (const ipp_resizeAffineParallel&);
};
#endif
static bool ipp_resize(const uchar * src_data, size_t src_step, int src_width, int src_height,
uchar * dst_data, size_t dst_step, int dst_width, int dst_height, double inv_scale_x, double inv_scale_y,
int depth, int channels, int interpolation)
{
#ifdef HAVE_IPP_IW
CV_INSTRUMENT_REGION_IPP()
IppDataType ippDataType = ippiGetDataType(depth);
IppiInterpolationType ippInter = ippiGetInterpolation(interpolation);
if(ippInter < 0)
return false;
    // These IPP resize modes do not match OpenCV results exactly; skip them unless non-exact IPP is allowed
if (!cv::ipp::useIPP_NE())
{
if (ippInter == ippNearest || ippInter == ippSuper || (ippDataType == ipp8u && ippInter == ippLinear))
return false;
}
if(ippInter != ippLinear && ippDataType == ipp64f)
return false;
#if IPP_VERSION_X100 < 201801
    // Accuracy degradation on linear downscale by an integer power-of-two factor
if (ippDataType != ipp64f && ippInter == ippLinear && inv_scale_x < 1 && inv_scale_y < 1) // if downscale
{
int scale_x = (int)(1 / inv_scale_x);
int scale_y = (int)(1 / inv_scale_y);
if (1 / inv_scale_x - scale_x < DBL_EPSILON && 1 / inv_scale_y - scale_y < DBL_EPSILON) // if integer
{
if (!(scale_x&(scale_x - 1)) && !(scale_y&(scale_y - 1))) // if power of 2
return false;
}
}
#endif
bool affine = false;
const double IPP_RESIZE_EPS = (depth == CV_64F)?0:1e-10;
double ex = fabs((double)dst_width / src_width - inv_scale_x) / inv_scale_x;
double ey = fabs((double)dst_height / src_height - inv_scale_y) / inv_scale_y;
// Use affine transform resize to allow sub-pixel accuracy
if(ex > IPP_RESIZE_EPS || ey > IPP_RESIZE_EPS)
affine = true;
// Affine doesn't support Lanczos and Super interpolations
if(affine && (ippInter == ippLanczos || ippInter == ippSuper))
return false;
try
{
::ipp::IwiImage iwSrc(::ipp::IwiSize(src_width, src_height), ippDataType, channels, 0, (void*)src_data, src_step);
::ipp::IwiImage iwDst(::ipp::IwiSize(dst_width, dst_height), ippDataType, channels, 0, (void*)dst_data, dst_step);
bool ok;
int threads = ippiSuggestThreadsNum(iwDst, 1+((double)(src_width*src_height)/(dst_width*dst_height)));
Range range(0, dst_height);
ipp_resizeParallel invokerGeneral(iwSrc, iwDst, ok);
ipp_resizeAffineParallel invokerAffine(iwSrc, iwDst, ok);
ParallelLoopBody *pInvoker = NULL;
if(affine)
{
pInvoker = &invokerAffine;
invokerAffine.Init(ippInter, inv_scale_x, inv_scale_y);
}
else
{
pInvoker = &invokerGeneral;
invokerGeneral.Init(ippInter);
}
if(IPP_RESIZE_PARALLEL && threads > 1)
parallel_for_(range, *pInvoker, threads*4);
else
pInvoker->operator()(range);
if(!ok)
return false;
}
catch(::ipp::IwException)
{
return false;
}
return true;
#else
CV_UNUSED(src_data); CV_UNUSED(src_step); CV_UNUSED(src_width); CV_UNUSED(src_height); CV_UNUSED(dst_data); CV_UNUSED(dst_step);
CV_UNUSED(dst_width); CV_UNUSED(dst_height); CV_UNUSED(inv_scale_x); CV_UNUSED(inv_scale_y); CV_UNUSED(depth);
CV_UNUSED(channels); CV_UNUSED(interpolation);
return false;
#endif
}
#endif
//==================================================================================================
namespace hal {
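// CPU resize dispatcher: after the HAL and IPP hooks it selects the bit-exact linear, nearest,
// fast-area, true-area or generic separable (linear/cubic/Lanczos4) implementation and
// precomputes the per-column/per-row offset and coefficient tables used by the chosen kernels.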
void resize(int src_type,
const uchar * src_data, size_t src_step, int src_width, int src_height,
uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
double inv_scale_x, double inv_scale_y, int interpolation)
{
CV_INSTRUMENT_REGION()
CV_Assert((dst_width * dst_height > 0) || (inv_scale_x > 0 && inv_scale_y > 0));
if (inv_scale_x < DBL_EPSILON || inv_scale_y < DBL_EPSILON)
{
inv_scale_x = static_cast<double>(dst_width) / src_width;
inv_scale_y = static_cast<double>(dst_height) / src_height;
}
CALL_HAL(resize, cv_hal_resize, src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, inv_scale_x, inv_scale_y, interpolation);
int depth = CV_MAT_DEPTH(src_type), cn = CV_MAT_CN(src_type);
Size dsize = Size(saturate_cast<int>(src_width*inv_scale_x),
saturate_cast<int>(src_height*inv_scale_y));
CV_Assert( dsize.area() > 0 );
CV_IPP_RUN_FAST(ipp_resize(src_data, src_step, src_width, src_height, dst_data, dst_step, dsize.width, dsize.height, inv_scale_x, inv_scale_y, depth, cn, interpolation))
static ResizeFunc linear_tab[] =
{
resizeGeneric_<
HResizeLinear<uchar, int, short,
INTER_RESIZE_COEF_SCALE,
HResizeLinearVec_8u32s>,
VResizeLinear<uchar, int, short,
FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
VResizeLinearVec_32s8u> >,
0,
resizeGeneric_<
HResizeLinear<ushort, float, float, 1,
HResizeLinearVec_16u32f>,
VResizeLinear<ushort, float, float, Cast<float, ushort>,
VResizeLinearVec_32f16u> >,
resizeGeneric_<
HResizeLinear<short, float, float, 1,
HResizeLinearVec_16s32f>,
VResizeLinear<short, float, float, Cast<float, short>,
VResizeLinearVec_32f16s> >,
0,
resizeGeneric_<
HResizeLinear<float, float, float, 1,
HResizeLinearVec_32f>,
VResizeLinear<float, float, float, Cast<float, float>,
VResizeLinearVec_32f> >,
resizeGeneric_<
HResizeLinear<double, double, float, 1,
HResizeNoVec>,
VResizeLinear<double, double, float, Cast<double, double>,
VResizeNoVec> >,
0
};
static ResizeFunc cubic_tab[] =
{
resizeGeneric_<
HResizeCubic<uchar, int, short>,
VResizeCubic<uchar, int, short,
FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
VResizeCubicVec_32s8u> >,
0,
resizeGeneric_<
HResizeCubic<ushort, float, float>,
VResizeCubic<ushort, float, float, Cast<float, ushort>,
VResizeCubicVec_32f16u> >,
resizeGeneric_<
HResizeCubic<short, float, float>,
VResizeCubic<short, float, float, Cast<float, short>,
VResizeCubicVec_32f16s> >,
0,
resizeGeneric_<
HResizeCubic<float, float, float>,
VResizeCubic<float, float, float, Cast<float, float>,
VResizeCubicVec_32f> >,
resizeGeneric_<
HResizeCubic<double, double, float>,
VResizeCubic<double, double, float, Cast<double, double>,
VResizeNoVec> >,
0
};
static ResizeFunc lanczos4_tab[] =
{
resizeGeneric_<HResizeLanczos4<uchar, int, short>,
VResizeLanczos4<uchar, int, short,
FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
VResizeNoVec> >,
0,
resizeGeneric_<HResizeLanczos4<ushort, float, float>,
VResizeLanczos4<ushort, float, float, Cast<float, ushort>,
VResizeLanczos4Vec_32f16u> >,
resizeGeneric_<HResizeLanczos4<short, float, float>,
VResizeLanczos4<short, float, float, Cast<float, short>,
VResizeLanczos4Vec_32f16s> >,
0,
resizeGeneric_<HResizeLanczos4<float, float, float>,
VResizeLanczos4<float, float, float, Cast<float, float>,
VResizeLanczos4Vec_32f> >,
resizeGeneric_<HResizeLanczos4<double, double, float>,
VResizeLanczos4<double, double, float, Cast<double, double>,
VResizeNoVec> >,
0
};
static ResizeAreaFastFunc areafast_tab[] =
{
resizeAreaFast_<uchar, int, ResizeAreaFastVec<uchar, ResizeAreaFastVec_SIMD_8u> >,
0,
resizeAreaFast_<ushort, float, ResizeAreaFastVec<ushort, ResizeAreaFastVec_SIMD_16u> >,
resizeAreaFast_<short, float, ResizeAreaFastVec<short, ResizeAreaFastVec_SIMD_16s> >,
0,
resizeAreaFast_<float, float, ResizeAreaFastVec_SIMD_32f>,
resizeAreaFast_<double, double, ResizeAreaFastNoVec<double, double> >,
0
};
static ResizeAreaFunc area_tab[] =
{
resizeArea_<uchar, float>, 0, resizeArea_<ushort, float>,
resizeArea_<short, float>, 0, resizeArea_<float, float>,
resizeArea_<double, double>, 0
};
static be_resize_func linear_exact_tab[] =
{
resize_bitExact<uchar, interpolationLinear<uchar> >,
resize_bitExact<schar, interpolationLinear<schar> >,
resize_bitExact<ushort, interpolationLinear<ushort> >,
resize_bitExact<short, interpolationLinear<short> >,
resize_bitExact<int, interpolationLinear<int> >,
0,
0,
0
};
double scale_x = 1./inv_scale_x, scale_y = 1./inv_scale_y;
int iscale_x = saturate_cast<int>(scale_x);
int iscale_y = saturate_cast<int>(scale_y);
bool is_area_fast = std::abs(scale_x - iscale_x) < DBL_EPSILON &&
std::abs(scale_y - iscale_y) < DBL_EPSILON;
Mat src(Size(src_width, src_height), src_type, const_cast<uchar*>(src_data), src_step);
Mat dst(dsize, src_type, dst_data, dst_step);
if (interpolation == INTER_LINEAR_EXACT)
{
        // when both inv_scale_x and inv_scale_y equal 0.5,
        // INTER_AREA (fast) is bit-exact with INTER_LINEAR
if (is_area_fast && iscale_x == 2 && iscale_y == 2 && cn != 2)//Area resize implementation for 2-channel images isn't bit-exact
interpolation = INTER_AREA;
else
{
be_resize_func func = linear_exact_tab[depth];
CV_Assert(func != 0);
func(src_data, src_step, src_width, src_height,
dst_data, dst_step, dst_width, dst_height,
cn, inv_scale_x, inv_scale_y);
return;
}
}
if( interpolation == INTER_NEAREST )
{
resizeNN( src, dst, inv_scale_x, inv_scale_y );
return;
}
int k, sx, sy, dx, dy;
{
        // when both scale_x and scale_y equal 2,
        // INTER_AREA (fast) gives the same result as INTER_LINEAR
if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 )
interpolation = INTER_AREA;
// true "area" interpolation is only implemented for the case (scale_x <= 1 && scale_y <= 1).
// In other cases it is emulated using some variant of bilinear interpolation
if( interpolation == INTER_AREA && scale_x >= 1 && scale_y >= 1 )
{
if( is_area_fast )
{
int area = iscale_x*iscale_y;
size_t srcstep = src_step / src.elemSize1();
AutoBuffer<int> _ofs(area + dsize.width*cn);
int* ofs = _ofs;
int* xofs = ofs + area;
ResizeAreaFastFunc func = areafast_tab[depth];
CV_Assert( func != 0 );
for( sy = 0, k = 0; sy < iscale_y; sy++ )
for( sx = 0; sx < iscale_x; sx++ )
ofs[k++] = (int)(sy*srcstep + sx*cn);
for( dx = 0; dx < dsize.width; dx++ )
{
int j = dx * cn;
sx = iscale_x * j;
for( k = 0; k < cn; k++ )
xofs[j + k] = sx + k;
}
func( src, dst, ofs, xofs, iscale_x, iscale_y );
return;
}
ResizeAreaFunc func = area_tab[depth];
CV_Assert( func != 0 && cn <= 4 );
AutoBuffer<DecimateAlpha> _xytab((src_width + src_height)*2);
DecimateAlpha* xtab = _xytab, *ytab = xtab + src_width*2;
int xtab_size = computeResizeAreaTab(src_width, dsize.width, cn, scale_x, xtab);
int ytab_size = computeResizeAreaTab(src_height, dsize.height, 1, scale_y, ytab);
AutoBuffer<int> _tabofs(dsize.height + 1);
int* tabofs = _tabofs;
for( k = 0, dy = 0; k < ytab_size; k++ )
{
if( k == 0 || ytab[k].di != ytab[k-1].di )
{
assert( ytab[k].di == dy );
tabofs[dy++] = k;
}
}
tabofs[dy] = ytab_size;
func( src, dst, xtab, xtab_size, ytab, ytab_size, tabofs );
return;
}
}
int xmin = 0, xmax = dsize.width, width = dsize.width*cn;
bool area_mode = interpolation == INTER_AREA;
bool fixpt = depth == CV_8U;
float fx, fy;
ResizeFunc func=0;
int ksize=0, ksize2;
if( interpolation == INTER_CUBIC )
ksize = 4, func = cubic_tab[depth];
else if( interpolation == INTER_LANCZOS4 )
ksize = 8, func = lanczos4_tab[depth];
else if( interpolation == INTER_LINEAR || interpolation == INTER_AREA )
ksize = 2, func = linear_tab[depth];
else
CV_Error( CV_StsBadArg, "Unknown interpolation method" );
ksize2 = ksize/2;
CV_Assert( func != 0 );
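    // single working buffer holding the horizontal/vertical source offsets (xofs, yofs) and the
    // interpolation coefficients (alpha, beta); ialpha/ibeta reinterpret the same storage as
    // 16-bit fixed-point coefficients for the 8-bit (fixpt) path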
AutoBuffer<uchar> _buffer((width + dsize.height)*(sizeof(int) + sizeof(float)*ksize));
int* xofs = (int*)(uchar*)_buffer;
int* yofs = xofs + width;
float* alpha = (float*)(yofs + dsize.height);
short* ialpha = (short*)alpha;
float* beta = alpha + width*ksize;
short* ibeta = ialpha + width*ksize;
float cbuf[MAX_ESIZE] = {0};
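    // horizontal pass setup: for every destination column compute the anchor source column
    // and the ksize filter coefficients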
for( dx = 0; dx < dsize.width; dx++ )
{
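        // pixel-center alignment: fx = (dx + 0.5)*scale_x - 0.5 maps the destination pixel
        // center into source coordinates; sx is its integer part and fx the fractional offset
        // (the INTER_AREA emulation branch below computes the weight from the overlap of the
        // destination pixel with the source pixels instead)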
if( !area_mode )
{
fx = (float)((dx+0.5)*scale_x - 0.5);
sx = cvFloor(fx);
fx -= sx;
}
else
{
sx = cvFloor(dx*scale_x);
fx = (float)((dx+1) - (sx+1)*inv_scale_x);
fx = fx <= 0 ? 0.f : fx - cvFloor(fx);
}
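        // border handling: xmin..xmax is the range of destination columns whose filter support
        // lies fully inside the image; for linear/area the out-of-range tap is snapped to the
        // edge pixel, while cubic/Lanczos handle the border inside the row filters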
if( sx < ksize2-1 )
{
xmin = dx+1;
if( sx < 0 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4))
fx = 0, sx = 0;
}
if( sx + ksize2 >= src_width )
{
xmax = std::min( xmax, dx );
if( sx >= src_width-1 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4))
fx = 0, sx = src_width-1;
}
for( k = 0, sx *= cn; k < cn; k++ )
xofs[dx*cn + k] = sx + k;
if( interpolation == INTER_CUBIC )
interpolateCubic( fx, cbuf );
else if( interpolation == INTER_LANCZOS4 )
interpolateLanczos4( fx, cbuf );
else
{
cbuf[0] = 1.f - fx;
cbuf[1] = fx;
}
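        // for 8-bit images (fixpt) the coefficients are stored as 16-bit fixed-point values
        // scaled by INTER_RESIZE_COEF_SCALE and replicated for every channel; otherwise they
        // are kept as floats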
if( fixpt )
{
for( k = 0; k < ksize; k++ )
ialpha[dx*cn*ksize + k] = saturate_cast<short>(cbuf[k]*INTER_RESIZE_COEF_SCALE);
for( ; k < cn*ksize; k++ )
ialpha[dx*cn*ksize + k] = ialpha[dx*cn*ksize + k - ksize];
}
else
{
for( k = 0; k < ksize; k++ )
alpha[dx*cn*ksize + k] = cbuf[k];
for( ; k < cn*ksize; k++ )
alpha[dx*cn*ksize + k] = alpha[dx*cn*ksize + k - ksize];
}
}
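    // vertical pass setup: for every destination row compute the anchor source row (yofs)
    // and the ksize vertical coefficients (beta/ibeta), analogous to the horizontal pass above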
for( dy = 0; dy < dsize.height; dy++ )
{
if( !area_mode )
{
fy = (float)((dy+0.5)*scale_y - 0.5);
sy = cvFloor(fy);
fy -= sy;
}
else
{
sy = cvFloor(dy*scale_y);
fy = (float)((dy+1) - (sy+1)*inv_scale_y);
fy = fy <= 0 ? 0.f : fy - cvFloor(fy);
}
yofs[dy] = sy;
if( interpolation == INTER_CUBIC )
interpolateCubic( fy, cbuf );
else if( interpolation == INTER_LANCZOS4 )
interpolateLanczos4( fy, cbuf );
else
{
cbuf[0] = 1.f - fy;
cbuf[1] = fy;
}
if( fixpt )
{
for( k = 0; k < ksize; k++ )
ibeta[dy*ksize + k] = saturate_cast<short>(cbuf[k]*INTER_RESIZE_COEF_SCALE);
}
else
{
for( k = 0; k < ksize; k++ )
beta[dy*ksize + k] = cbuf[k];
}
}
func( src, dst, xofs, fixpt ? (void*)ialpha : (void*)alpha, yofs,
fixpt ? (void*)ibeta : (void*)beta, xmin, xmax, ksize );
}
} // cv::hal::
} // cv::
//==================================================================================================
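// Illustrative usage of the public API below (not part of the implementation): the output
// size can be given either explicitly or via scale factors, e.g.
//     cv::resize(src, dst, cv::Size(640, 480));                   // explicit destination size
//     cv::resize(src, dst, cv::Size(), 0.5, 0.5, cv::INTER_AREA); // scale both axes by 0.5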
void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
double inv_scale_x, double inv_scale_y, int interpolation )
{
CV_INSTRUMENT_REGION()
Size ssize = _src.size();
CV_Assert( ssize.width > 0 && ssize.height > 0 );
CV_Assert( dsize.area() > 0 || (inv_scale_x > 0 && inv_scale_y > 0) );
if( dsize.area() == 0 )
{
dsize = Size(saturate_cast<int>(ssize.width*inv_scale_x),
saturate_cast<int>(ssize.height*inv_scale_y));
CV_Assert( dsize.area() > 0 );
}
else
{
inv_scale_x = (double)dsize.width/ssize.width;
inv_scale_y = (double)dsize.height/ssize.height;
}
    if (interpolation == INTER_LINEAR_EXACT && (_src.depth() == CV_32F || _src.depth() == CV_64F))
        interpolation = INTER_LINEAR; // the bit-exact path doesn't support this depth; fall back to generic resize
CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() && _src.cols() > 10 && _src.rows() > 10,
ocl_resize(_src, _dst, dsize, inv_scale_x, inv_scale_y, interpolation))
Mat src = _src.getMat();
_dst.create(dsize, src.type());
Mat dst = _dst.getMat();
if (dsize == ssize)
{
        // Source and destination are of the same size. Use a simple copy.
src.copyTo(dst);
return;
}
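    // delegate to hal::resize above, which receives raw pointers/steps and the effective
    // scale factors and dispatches to the interpolation-specific kernels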
hal::resize(src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows, inv_scale_x, inv_scale_y, interpolation);
}
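// Legacy C API wrapper: the output size is taken from the destination array, and the
// scale factors are derived from it.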
CV_IMPL void
cvResize( const CvArr* srcarr, CvArr* dstarr, int method )
{
cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
CV_Assert( src.type() == dst.type() );
cv::resize( src, dst, dst.size(), (double)dst.cols/src.cols,
(double)dst.rows/src.rows, method );
}
/* End of file. */