@ -44,13 +44,11 @@ public:
std : : vector < int > all_ndims ;
std : : vector < std : : vector < int > > orig_shapes ;
std : : vector < std : : vector < size_t > > orig_steps ;
std : : vector < char * > ptrs ;
std : : vector < std : : vector < int > > shapes ;
std : : vector < std : : vector < size_t > > steps ;
std : : vector < size_t > elemsize ;
NaryEltwiseHelper ( ) {
}
NaryEltwiseHelper ( ) { }
void init ( const std : : vector < Mat > & inputs , const std : : vector < Mat > & outputs )
{
@ -59,7 +57,6 @@ public:
all_ndims . clear ( ) ;
orig_shapes . clear ( ) ;
orig_steps . clear ( ) ;
ptrs . clear ( ) ;
shapes . clear ( ) ;
steps . clear ( ) ;
elemsize . clear ( ) ;
@ -81,7 +78,6 @@ public:
shapes = std : : vector < std : : vector < int > > ( narrays , std : : vector < int > ( max_ndims , 0 ) ) ;
steps = std : : vector < std : : vector < size_t > > ( narrays , std : : vector < size_t > ( max_ndims , 0 ) ) ;
ptrs = std : : vector < char * > ( narrays , nullptr ) ;
for ( i = 0 ; i < = ninputs ; i + + ) {
all_ndims . push_back ( i = = 0 ? out_ndims : inp_ndims [ i - 1 ] ) ;
@ -183,6 +179,7 @@ public:
this - > shapes [ k ] [ i ] = 1 ;
}
}
return true ;
}
} ;
@ -288,7 +285,7 @@ public:
# ifdef HAVE_VULKAN
if ( backendId = = DNN_BACKEND_VKCOM )
return op = = OPERATION : : ADD | | op = = OPERATION : : PROD | | op = = OPERATION : : SUB | |
op = = OPERATION : : DIV ;
op = = OPERATION : : DIV ;
# endif
if ( backendId = = DNN_BACKEND_CUDA ) {
@ -333,8 +330,16 @@ public:
inputs_arr . getMatVector ( inputs ) ;
outputs_arr . getMatVector ( outputs ) ;
if ( op ! = OPERATION : : POW ) {
for ( size_t i = 0 ; i < inputs . size ( ) ; i + + ) {
if ( inputs [ i ] . depth ( ) ! = outputs [ 0 ] . depth ( ) ) {
CV_Error ( Error : : BadDepth , cv : : format ( " NaryEltwiseLayer: Data type mismatch, input %zu of type %d, output of type %d " , i , inputs [ i ] . depth ( ) , outputs [ 0 ] . depth ( ) ) ) ;
}
}
}
helper . init ( inputs , outputs ) ;
CV_Assert ( helper . prepare_for_broadcast_op ( ) ) ;
CV_CheckTrue ( helper . prepare_for_broadcast_op ( ) , " NaryEltwiseLayer: Preparation for broadcasting failed " ) ;
}
bool getMemoryShapes ( const std : : vector < MatShape > & inputs ,
@ -342,168 +347,234 @@ public:
std : : vector < MatShape > & outputs ,
std : : vector < MatShape > & internals ) const CV_OVERRIDE
{
if ( inputs . size ( ) = = 1 ) {
outputs . assign ( 1 , inputs . front ( ) ) ;
} else {
MatShape outShape = findCommonShape ( inputs ) ;
outputs . assign ( 1 , outShape ) ;
}
return false ;
}
template < typename T , typename Functor >
void binary_forward_impl (
int ndims , const std : : vector < int > & shape ,
void binary_forward_impl ( const Functor & op , int ndims , const std : : vector < int > & shape ,
const char * data1 , const std : : vector < size_t > & step1 ,
const char * data2 , const std : : vector < size_t > & step2 ,
char * data , const std : : vector < size_t > & step ,
const Functor & op )
{
char * data , const std : : vector < size_t > & step , size_t block_size ) {
assert ( ndims > = 2 ) ;
size_t dp1 = step1 [ ndims - 1 ] / sizeof ( T ) ;
size_t dp2 = step2 [ ndims - 1 ] / sizeof ( T ) ;
size_t dp = step [ ndims - 1 ] / sizeof ( T ) ;
int k , n1 = shape [ ndims - 1 ] , n2 = shape [ ndims - 2 ] ;
size_t plane_idx , nplanes = 1 ;
for ( k = 0 ; k < ndims - 2 ; k + + ) nplanes * = shape [ k ] ;
for ( plane_idx = 0 ; plane_idx < nplanes ; plane_idx + + ) {
size_t dp1 = step1 . back ( ) / sizeof ( T ) ;
size_t dp2 = step2 . back ( ) / sizeof ( T ) ;
size_t dp = step . back ( ) / sizeof ( T ) ;
int plane_size = shape . back ( ) ;
int nplanes = std : : accumulate ( shape . begin ( ) , shape . end ( ) - 1 , 1 , std : : multiplies < int > ( ) ) ;
if ( nplanes = = 1 ) { // parallelize within the plane
const T * ptr1 = ( const T * ) data1 ;
const T * ptr2 = ( const T * ) data2 ;
T * ptr = ( T * ) data ;
auto worker = [ & ] ( const Range & r ) {
if ( dp1 = = 1 & & dp2 = = 1 & & dp = = 1 ) {
for ( int i = r . start ; i < r . end ; i + + ) {
ptr [ i ] = op ( ptr1 [ i ] , ptr2 [ i ] ) ;
}
} else if ( dp1 = = 1 & & dp2 = = 0 & & dp = = 1 ) {
T x2 = * ptr2 ;
for ( int i = r . start ; i < r . end ; i + + ) {
ptr [ i ] = op ( ptr1 [ i ] , x2 ) ;
}
} else if ( dp1 = = 0 & & dp2 = = 1 & & dp = = 1 ) {
T x1 = * ptr1 ;
for ( int i = r . start ; i < r . end ; i + + ) {
ptr [ i ] = op ( x1 , ptr2 [ i ] ) ;
}
} else {
for ( int i = r . start ; i < r . end ; i + + , ptr1 + = dp1 , ptr2 + = dp2 , ptr + = dp ) {
* ptr = op ( * ptr1 , * ptr2 ) ;
}
}
} ;
double nstripes = plane_size * ( 1.0 / double ( block_size ) ) ;
parallel_for_ ( Range ( 0 , plane_size ) , worker , nstripes ) ;
} else { // parallelize across planes
auto worker = [ & ] ( const Range & r ) {
for ( int plane_idx = r . start ; plane_idx < r . end ; plane_idx + + ) {
const char * ptr1_ = data1 ;
const char * ptr2_ = data2 ;
char * ptr_ = data ;
size_t idx = plane_idx ;
for ( k = ndims - 3 ; k > = 0 ; k - - ) {
size_t next_idx = idx / shape [ k ] ;
int i_k = ( int ) ( idx - next_idx * shape [ k ] ) ;
ptr1_ + = i_k * step1 [ k ] ;
ptr2_ + = i_k * step2 [ k ] ;
ptr_ + = i_k * step [ k ] ;
for ( int k = ndims - 2 ; k > = 0 ; k - - ) {
size_t next_idx = idx / shape [ k ] ;
size_ t i_k = ( int ) ( idx - next_idx * shape [ k ] ) ;
ptr1_ + = i_k * step1 [ k ] ;
ptr2_ + = i_k * step2 [ k ] ;
ptr_ + = i_k * step [ k ] ;
idx = next_idx ;
}
for ( int i2 = 0 ; i2 < n2 ; i2 + + , ptr1_ + = step1 [ ndims - 2 ] ,
ptr2_ + = step2 [ ndims - 2 ] ,
ptr_ + = step [ ndims - 2 ] )
{
const T * ptr1 = ( const T * ) ptr1_ ;
const T * ptr2 = ( const T * ) ptr2_ ;
T * ptr = ( T * ) ptr_ ;
if ( dp1 = = 1 & & dp2 = = 1 & & dp = = 1 ) {
for ( int i1 = 0 ; i1 < n1 ; i1 + + )
ptr [ i1 ] = op ( ptr1 [ i1 ] , ptr2 [ i1 ] ) ;
for ( int i = 0 ; i < plane_size ; i + + ) {
ptr [ i ] = op ( ptr1 [ i ] , ptr2 [ i ] ) ;
}
} else if ( dp1 = = 1 & & dp2 = = 0 & & dp = = 1 ) {
T x2 = * ptr2 ;
for ( int i1 = 0 ; i1 < n1 ; i1 + + )
ptr [ i1 ] = op ( ptr1 [ i1 ] , x2 ) ;
for ( int i = 0 ; i < plane_size ; i + + ) {
ptr [ i ] = op ( ptr1 [ i ] , x2 ) ;
}
} else if ( dp1 = = 0 & & dp2 = = 1 & & dp = = 1 ) {
T x1 = * ptr1 ;
for ( int i1 = 0 ; i1 < n1 ; i1 + + )
ptr [ i1 ] = op ( x1 , ptr2 [ i1 ] ) ;
for ( int i = 0 ; i < plane_size ; i + + ) {
ptr [ i ] = op ( x1 , ptr2 [ i ] ) ;
}
} else {
for ( int i1 = 0 ; i1 < n1 ; i1 + + , ptr1 + = dp1 , ptr2 + = dp2 , ptr + = dp )
for ( int i = 0 ; i < plane_size ; i + + , ptr1 + = dp1 , ptr2 + = dp2 , ptr + = dp ) {
* ptr = op ( * ptr1 , * ptr2 ) ;
}
}
}
} ;
double nstripes = nplanes * ( 1.0 / double ( block_size ) ) ;
parallel_for_ ( Range ( 0 , nplanes ) , worker , nstripes ) ;
}
}
/*
Elementwise binary operator ( like + , - , x , / , etc . ) which takes two operands
*/
template < typename T , typename Functor >
void binary_forward ( const Functor & f , const std : : vector < Mat > & inputs , std : : vector < Mat > & outputs )
{
void binary_forward ( const Functor & f , const std : : vector < Mat > & inputs , std : : vector < Mat > & outputs , size_t block_size = 6e6 ) {
const Mat & a = inputs [ 0 ] ;
const Mat & b = inputs [ 1 ] ;
Mat & out = outputs [ 0 ] ;
CV_Assert ( helper . shapes . size ( ) = = 3 & & helper . steps . size ( ) = = 3 ) ;
binary_forward_impl < T , Functor > (
helper . max_ndims , helper . shapes [ 0 ] , a . ptr < char > ( ) , helper . steps [ 1 ] ,
b . ptr < char > ( ) , helper . steps [ 2 ] , out . ptr < char > ( ) , helper . steps [ 0 ] ,
f ) ;
binary_forward_impl < T , Functor > ( f , helper . max_ndims , helper . shapes [ 0 ] , a . ptr < char > ( ) , helper . steps [ 1 ] ,
b . ptr < char > ( ) , helper . steps [ 2 ] , out . ptr < char > ( ) , helper . steps [ 0 ] , block_size ) ;
}
template < typename T , typename Functor >
void nary_forward_impl (
const Functor & f , const T scale , int ninputs , int ndims , const std : : vector < int > & shape ,
const char * * inp , char * out ,
const std : : vector < std : : vector < size_t > > & steps , std : : vector < char * > & ptrs )
{
void nary_forward_impl ( const Functor & op , const T scale , int ninputs , int ndims , const std : : vector < int > & shape ,
const char * * inp , char * out , const std : : vector < std : : vector < size_t > > & steps , size_t block_size ) {
CV_Assert ( ndims > = 2 ) ;
size_t dp = steps [ 0 ] [ ndims - 1 ] / sizeof ( T ) ;
size_t dp1 = steps [ 1 ] [ ndims - 1 ] / sizeof ( T ) ;
size_t dp2 = steps [ 2 ] [ ndims - 1 ] / sizeof ( T ) ;
enum { BLOCK_SIZE = 1024 } ;
T blck [ BLOCK_SIZE ] ;
size_t dp = steps [ 0 ] . back ( ) / sizeof ( T ) ;
size_t dp1 = steps [ 1 ] . back ( ) / sizeof ( T ) ;
size_t dp2 = steps [ 2 ] . back ( ) / sizeof ( T ) ;
int k , i , di1 = 0 , n1 = shape [ ndims - 1 ] , n2 = shape [ ndims - 2 ] ;
int second = ninputs = = 1 ? 1 : 2 ;
size_t plane_idx , nplanes = 1 ;
for ( k = 0 ; k < ndims - 2 ; k + + ) nplanes * = shape [ k ] ;
int plane_size = shape . back ( ) ;
int nplanes = std : : accumulate ( shape . begin ( ) , shape . end ( ) - 1 , 1 , std : : multiplies < int > ( ) ) ;
for ( plane_idx = 0 ; plane_idx < nplanes ; plane_idx + + ) {
if ( nplanes = = 1 ) { // parallelize within the plane
AutoBuffer < char > buf_ptrs ( steps . size ( ) ) ;
auto ptrs = ( char * * ) buf_ptrs . data ( ) ;
ptrs [ 0 ] = out ;
for ( i = 0 ; i < ninputs ; i + + ) ptrs [ i + 1 ] = ( char * ) inp [ i ] ;
size_t idx = plane_idx ;
for ( k = ndims - 3 ; k > = 0 ; k - - ) {
size_t next_idx = idx / shape [ k ] ;
int i_k = ( int ) ( idx - next_idx * shape [ k ] ) ;
for ( i = 0 ; i < ninputs ; i + + )
ptrs [ i ] + = i_k * steps [ i ] [ k ] ;
idx = next_idx ;
for ( int i = 0 ; i < ninputs ; i + + ) {
ptrs [ i + 1 ] = ( char * ) inp [ i ] ;
}
const T * ptr1 = ( const T * ) ( ptrs [ 1 ] ) ;
const T * ptr2 = ( const T * ) ( ptrs [ 2 ] ) ;
T * ptr = ( T * ) ( ptrs [ 0 ] ) ;
auto worker = [ & ] ( const Range & r ) {
if ( dp = = 1 & & dp1 = = 1 & & dp2 = = 1 ) {
for ( int i = r . start ; i < r . end ; i + + ) {
ptr [ i ] = op ( ptr1 [ i ] , ptr2 [ i ] ) ;
}
for ( int j = 2 ; j < ninputs ; j + + ) {
int dpj = steps [ j + 1 ] . back ( ) ;
const T * ptrj = ( const T * ) ( ptrs [ j + 1 ] ) ;
if ( dpj = = 1 ) {
for ( int i = r . start ; i < r . end ; i + + ) {
ptr [ i ] = saturate_cast < T > ( op ( ptr [ i ] , ptrj [ i ] ) * scale ) ;
}
for ( int i2 = 0 ; i2 < n2 ; i2 + + )
{
const T * ptr1 = ( const T * ) ( ptrs [ 1 ] + steps [ 1 ] [ ndims - 2 ] * i2 ) ;
const T * ptr2 = ( const T * ) ( ptrs [ second ] + steps [ second ] [ ndims - 2 ] * i2 ) ;
T * ptr = ( T * ) ( ptrs [ 0 ] + steps [ 0 ] [ ndims - 2 ] * i2 ) ;
if ( ninputs < = 2 ) {
if ( dp1 = = 1 & & dp2 = = 1 ) {
for ( int i1 = 0 ; i1 < n1 ; i1 + + )
ptr [ i1 ] = saturate_cast < T > ( f ( ptr1 [ i1 ] , ptr2 [ i1 ] ) * scale ) ;
} else {
for ( int i1 = 0 ; i1 < n1 ; i1 + + , ptr1 + = dp1 , ptr2 + = dp2 , ptr + = dp )
* ptr = saturate_cast < T > ( f ( * ptr1 , * ptr2 ) * scale ) ;
for ( int i = r . start ; i < r . end ; i + + , ptrj + = dpj ) {
ptr [ i ] = saturate_cast < T > ( op ( ptr [ i ] , * ptrj ) * scale ) ;
}
}
} else {
for ( int i1 = 0 ; i1 < n1 ; i1 + = di1 , ptr + = di1 ) {
di1 = BLOCK_SIZE < n1 - i1 ? BLOCK_SIZE : n1 - i1 ;
if ( dp1 = = 1 & & dp2 = = 1 ) {
for ( int j = 0 ; j < di1 ; j + + )
blck [ j ] = f ( ptr1 [ j ] , ptr2 [ j ] ) ;
ptr1 + = di1 ;
ptr2 + = di1 ;
} else {
for ( int j = 0 ; j < di1 ; j + + , ptr1 + = dp1 , ptr2 + = dp2 )
blck [ j ] = f ( * ptr1 , * ptr2 ) ;
}
for ( i = 2 ; i < ninputs ; i + + ) {
int dp_i = steps [ i + 1 ] [ ndims - 1 ] / sizeof ( T ) ;
const T * ptr_i = ( const T * ) ( ptrs [ i + 1 ] +
steps [ i + 1 ] [ ndims - 2 ] * i2 ) + i1 * dp_i ;
if ( dp_i = = 1 ) {
if ( i < ninputs - 1 ) {
for ( int j = 0 ; j < di1 ; j + + )
blck [ j ] = f ( blck [ j ] , ptr_i [ j ] ) ;
} else {
for ( int j = 0 ; j < di1 ; j + + )
ptr [ j ] = saturate_cast < T > ( f ( blck [ j ] , ptr_i [ j ] ) * scale ) ;
}
} else {
if ( i < ninputs - 1 ) {
for ( int j = 0 ; j < di1 ; j + + , ptr_i + = dp_i )
blck [ j ] = f ( blck [ j ] , * ptr_i ) ;
auto * tmp = ptr ;
for ( int i = r . start ; i < r . end ; i + + , ptr + = dp , ptr1 + = dp1 , ptr2 + = dp2 ) {
* ptr = op ( * ptr1 , * ptr2 ) ;
}
ptr = tmp ;
for ( int j = 2 ; j < ninputs ; j + + ) {
int dpj = steps [ j + 1 ] . back ( ) ;
const T * ptr_j = ( const T * ) ( ptrs [ j + 1 ] ) ;
for ( int i = r . start ; i < r . end ; i + + , ptr + = dp , ptr_j + = dpj ) {
* ptr = saturate_cast < T > ( op ( * ptr , * ptr_j ) * scale ) ;
}
}
}
} ;
double nstripes = plane_size * ( 1.0 / double ( block_size ) ) ;
parallel_for_ ( Range ( 0 , plane_size ) , worker , nstripes ) ;
} else { // parallelize across the plane
auto worker = [ & ] ( const Range & r ) {
AutoBuffer < char > buf_ptrs ( steps . size ( ) ) ;
auto ptrs = ( char * * ) buf_ptrs . data ( ) ;
for ( int plane_idx = r . start ; plane_idx < r . end ; plane_idx + + ) {
ptrs [ 0 ] = out ;
for ( int i = 0 ; i < ninputs ; i + + ) ptrs [ i + 1 ] = ( char * ) inp [ i ] ;
size_t idx = plane_idx ;
for ( int k = ndims - 2 ; k > = 0 ; k - - ) {
size_t next_idx = idx / shape [ k ] ;
int i_k = ( int ) ( idx - next_idx * shape [ k ] ) ;
for ( int i = 0 ; i < = ninputs ; i + + ) {
ptrs [ i ] + = i_k * steps [ i ] [ k ] ;
}
idx = next_idx ;
}
const T * ptr1 = ( const T * ) ( ptrs [ 1 ] ) ;
const T * ptr2 = ( const T * ) ( ptrs [ 2 ] ) ;
T * ptr = ( T * ) ( ptrs [ 0 ] ) ;
if ( dp = = 1 & & dp1 = = 1 & & dp2 = = 1 ) {
for ( int i = 0 ; i < plane_size ; i + + ) {
ptr [ i ] = saturate_cast < T > ( op ( ptr1 [ i ] , ptr2 [ i ] ) * scale ) ;
}
for ( int j = 2 ; j < ninputs ; j + + ) {
int dpj = steps [ j + 1 ] . back ( ) ;
const T * ptrj = ( const T * ) ( ptrs [ j + 1 ] ) ;
if ( dpj = = 1 ) {
for ( int i = 0 ; i < plane_size ; i + + ) {
ptr [ i ] = op ( ptr [ i ] , saturate_cast < T > ( ptrj [ i ] * scale ) ) ;
}
} else {
for ( int j = 0 ; j < di1 ; j + + , ptr_i + = dp_i )
ptr [ j ] = saturate_cast < T > ( f ( blck [ j ] , * ptr_i ) * scale ) ;
for ( int i = 0 ; i < plane_size ; i + + , ptrj + = dpj ) {
ptr [ i ] = op ( ptr [ i ] , saturate_cast < T > ( * ptrj * scale ) ) ;
}
}
}
} else {
auto * tmp = ptr ;
for ( int i = 0 ; i < plane_size ; i + + , ptr + = dp , ptr1 + = dp1 , ptr2 + = dp2 ) {
* ptr = saturate_cast < T > ( op ( * ptr1 , * ptr2 ) * scale ) ;
}
ptr = tmp ;
for ( int j = 2 ; j < ninputs ; j + + ) {
int dpj = steps [ j + 1 ] . back ( ) ;
const T * ptrj = ( const T * ) ( ptrs [ j + 1 ] ) ;
for ( int i = 0 ; i < plane_size ; i + + , ptr + = dp , ptrj + = dpj ) {
* ptr = op ( * ptr , saturate_cast < T > ( * ptrj * scale ) ) ;
}
}
}
}
} ;
double nstripes = nplanes * ( 1.0 / double ( block_size ) ) ;
parallel_for_ ( Range ( 0 , nplanes ) , worker , nstripes ) ;
}
}
/*
Elementwise nary operator ( like sum , mean , etc . ) which takes at least one operand
*/
template < typename T , typename Functor >
void nary_forward (
const Functor & f , T scale ,
const std : : vector < Mat > & inputs , std : : vector < Mat > & outputs
)
{
void nary_forward ( const Functor & f , T scale ,
const std : : vector < Mat > & inputs , std : : vector < Mat > & outputs ,
size_t block_size = 6e6 ) {
// collect all input info
std : : vector < const char * > v_inp ;
std : : transform ( inputs . begin ( ) , inputs . end ( ) , std : : back_inserter ( v_inp ) , [ ] ( const Mat & m ) { return m . template ptr < const char > ( ) ; } ) ;
@ -512,13 +583,14 @@ public:
// collect output info
char * out = outputs [ 0 ] . ptr < char > ( ) ;
nary_forward_impl < T > (
f , scale , helper . ninputs , helper . max_ndims , helper . shapes [ 0 ] , inp , out , helper . steps , helper . ptrs ) ;
nary_forward_impl < T , Functor > ( f , scale , helper . ninputs , helper . max_ndims , helper . shapes [ 0 ] , inp , out , helper . steps , block_size ) ;
}
/*
Elementwise ternary operator ( like where ) which takes three operands
*/
template < typename T , typename Functor >
void trinary_forward ( const Functor & f , const std : : vector < Mat > & inputs , std : : vector < Mat > & outputs )
{
void ternary_forward ( const Functor & f , const std : : vector < Mat > & inputs , std : : vector < Mat > & outputs , size_t block_size = 6e6 ) {
const Mat & a = inputs [ 0 ] ;
const Mat & b = inputs [ 1 ] ;
const Mat & c = inputs [ 2 ] ;
@ -526,70 +598,113 @@ public:
CV_Assert ( helper . shapes . size ( ) = = 4 & & helper . steps . size ( ) = = 4 ) ;
trinary_forward_impl < T , Functor > (
helper . max_ndims , helper . shapes [ 0 ] , a . ptr < char > ( ) , helper . steps [ 1 ] , b . ptr < char > ( ) , helper . steps [ 2 ] ,
c . ptr < char > ( ) , helper . steps [ 3 ] , out . ptr < char > ( ) , helper . steps [ 0 ] ,
f ) ;
ternary_forward_impl < T , Functor > ( f , helper . max_ndims , helper . shapes [ 0 ] ,
a . ptr < char > ( ) , helper . steps [ 1 ] ,
b . ptr < char > ( ) , helper . steps [ 2 ] ,
c . ptr < char > ( ) , helper . steps [ 3 ] ,
out . ptr < char > ( ) , helper . steps [ 0 ] , block_size ) ;
}
template < typename T , typename Functor >
void tri nary_forward_impl (
int ndims , const std : : vector < int > & shape ,
void te rnary_forward_impl (
const Functor & op , int ndims , const std : : vector < int > & shape ,
const char * data1 , const std : : vector < size_t > & step1 ,
const char * data2 , const std : : vector < size_t > & step2 ,
const char * data3 , const std : : vector < size_t > & step3 ,
char * data , const std : : vector < size_t > & step ,
const Functor & op )
{
assert ( ndims > = 2 ) ;
size_t dp1 = step1 [ ndims - 1 ] / sizeof ( T ) ;
size_t dp2 = step2 [ ndims - 1 ] / sizeof ( T ) ;
size_t dp3 = step3 [ ndims - 1 ] / sizeof ( T ) ;
size_t dp = step [ ndims - 1 ] / sizeof ( T ) ;
int k , n1 = shape [ ndims - 1 ] , n2 = shape [ ndims - 2 ] ;
size_t plane_idx , nplanes = 1 ;
for ( k = 0 ; k < ndims - 2 ; k + + ) nplanes * = shape [ k ] ;
for ( plane_idx = 0 ; plane_idx < nplanes ; plane_idx + + )
{
char * data , const std : : vector < size_t > & step , size_t block_size ) {
CV_Assert ( ndims > = 2 ) ;
size_t dp1 = step1 . back ( ) / sizeof ( T ) ;
size_t dp2 = step2 . back ( ) / sizeof ( T ) ;
size_t dp3 = step3 . back ( ) / sizeof ( T ) ;
size_t dp = step . back ( ) / sizeof ( T ) ;
int plane_size = shape . back ( ) ;
int nplanes = std : : accumulate ( shape . begin ( ) , shape . end ( ) - 1 , 1 , std : : multiplies < int > ( ) ) ;
if ( nplanes = = 1 ) { // parallelize within the plane
const T * ptr1 = ( const T * ) data1 ;
const T * ptr2 = ( const T * ) data2 ;
const T * ptr3 = ( const T * ) data3 ;
T * ptr = ( T * ) data ;
auto worker = [ & ] ( const Range & r ) {
if ( dp1 = = 1 & & dp2 = = 1 & & dp3 = = 1 & & dp = = 1 ) {
for ( int i = r . start ; i < r . end ; i + + ) {
ptr [ i ] = op ( ptr1 [ i ] , ptr2 [ i ] , ptr3 [ i ] ) ;
}
} else if ( dp1 = = 0 & & dp2 = = 1 & & dp3 = = 1 & & dp = = 1 ) {
T x1 = * ptr1 ;
for ( int i = r . start ; i < r . end ; i + + ) {
ptr [ i ] = op ( x1 , ptr2 [ i ] , ptr3 [ i ] ) ;
}
} else if ( dp1 = = 1 & & dp2 = = 0 & & dp3 = = 1 & & dp = = 1 ) {
T x2 = * ptr2 ;
for ( int i = r . start ; i < r . end ; i + + ) {
ptr [ i ] = op ( ptr1 [ i ] , x2 , ptr3 [ i ] ) ;
}
} else if ( dp1 = = 1 & & dp2 = = 1 & & dp3 = = 1 & & dp = = 1 ) {
T x3 = * ptr3 ;
for ( int i = r . start ; i < r . end ; i + + ) {
ptr [ i ] = op ( ptr1 [ i ] , ptr2 [ i ] , x3 ) ;
}
} else {
for ( int i = r . start ; i < r . end ; i + + , ptr1 + = dp1 , ptr2 + = dp2 , ptr3 + = dp3 , ptr + = dp ) {
* ptr = op ( * ptr1 , * ptr2 , * ptr3 ) ;
}
}
} ;
double nstripes = plane_size * ( 1.0 / double ( block_size ) ) ;
parallel_for_ ( Range ( 0 , plane_size ) , worker , nstripes ) ;
} else { // parallelize across planes
auto worker = [ & ] ( const Range & r ) {
for ( int plane_idx = r . start ; plane_idx < r . end ; plane_idx + + ) {
const char * ptr1_ = data1 ;
const char * ptr2_ = data2 ;
const char * ptr3_ = data3 ;
char * ptr_ = data ;
size_t idx = plane_idx ;
for ( k = ndims - 3 ; k > = 0 ; k - - )
for ( int k = ndims - 2 ; k > = 0 ; k - - )
{
size_t next_idx = idx / shape [ k ] ;
int i_k = ( int ) ( idx - next_idx * shape [ k ] ) ;
ptr1_ + = i_k * step1 [ k ] ;
ptr2_ + = i_k * step2 [ k ] ;
ptr3_ + = i_k * step3 [ k ] ;
ptr_ + = i_k * step [ k ] ;
size_t next_idx = idx / shape [ k ] ;
int i_k = ( int ) ( idx - next_idx * shape [ k ] ) ;
ptr1_ + = i_k * step1 [ k ] ;
ptr2_ + = i_k * step2 [ k ] ;
ptr3_ + = i_k * step3 [ k ] ;
ptr_ + = i_k * step [ k ] ;
idx = next_idx ;
}
for ( int i2 = 0 ; i2 < n2 ; i2 + + , ptr1_ + = step1 [ ndims - 2 ] ,
ptr2_ + = step2 [ ndims - 2 ] ,
ptr3_ + = step3 [ ndims - 2 ] ,
ptr_ + = step [ ndims - 2 ] )
{
const T * ptr1 = ( const T * ) ptr1_ ;
const T * ptr2 = ( const T * ) ptr2_ ;
const T * ptr3 = ( const T * ) ptr3_ ;
const T * ptr1 = ( const T * ) ptr1_ ;
const T * ptr2 = ( const T * ) ptr2_ ;
const T * ptr3 = ( const T * ) ptr3_ ;
T * ptr = ( T * ) ptr_ ;
if ( dp1 = = 1 & & dp2 = = 1 & & dp3 = = 1 & & dp = = 1 )
{
for ( int i1 = 0 ; i1 < n1 ; i1 + + )
ptr [ i1 ] = op ( ptr1 [ i1 ] , ptr2 [ i1 ] , ptr3 [ i1 ] ) ;
if ( dp1 = = 1 & & dp2 = = 1 & & dp3 = = 1 & & dp = = 1 ) {
for ( int i = 0 ; i < plane_size ; i + + ) {
ptr [ i ] = op ( ptr1 [ i ] , ptr2 [ i ] , ptr3 [ i ] ) ;
}
else
{
for ( int i1 = 0 ; i1 < n1 ; i1 + + , ptr1 + = dp1 , ptr2 + = dp2 , ptr3 + = dp3 , ptr + = dp )
} else if ( dp1 = = 0 & & dp2 = = 1 & & dp3 = = 1 & & dp = = 1 ) {
T x1 = * ptr1 ;
for ( int i = 0 ; i < plane_size ; i + + ) {
ptr [ i ] = op ( x1 , ptr2 [ i ] , ptr3 [ i ] ) ;
}
} else if ( dp1 = = 1 & & dp2 = = 0 & & dp3 = = 1 & & dp = = 1 ) {
T x2 = * ptr2 ;
for ( int i = 0 ; i < plane_size ; i + + ) {
ptr [ i ] = op ( ptr1 [ i ] , x2 , ptr3 [ i ] ) ;
}
} else if ( dp1 = = 1 & & dp2 = = 1 & & dp3 = = 1 & & dp = = 1 ) {
T x3 = * ptr3 ;
for ( int i = 0 ; i < plane_size ; i + + ) {
ptr [ i ] = op ( ptr1 [ i ] , ptr2 [ i ] , x3 ) ;
}
} else {
for ( int i = 0 ; i < plane_size ; i + + , ptr1 + = dp1 , ptr2 + = dp2 , ptr3 + = dp3 , ptr + = dp ) {
* ptr = op ( * ptr1 , * ptr2 , * ptr3 ) ;
}
}
}
} ;
double nstripes = nplanes * ( 1.0 / double ( block_size ) ) ;
parallel_for_ ( Range ( 0 , nplanes ) , worker , nstripes ) ;
}
}
void forward ( InputArrayOfArrays inputs_arr , OutputArrayOfArrays outputs_arr , OutputArrayOfArrays internals_arr ) CV_OVERRIDE
@ -608,143 +723,147 @@ public:
inputs_arr . getMatVector ( inputs ) ;
outputs_arr . getMatVector ( outputs ) ;
// TODO: assert types
if ( inputs . size ( ) = = 1 ) {
inputs [ 0 ] . copyTo ( outputs [ 0 ] ) ;
return ;
}
typeDispatch ( outputs [ 0 ] . type ( ) , inputs . size ( ) , inputs , outputs ) ;
}
template < typename T , typename . . . Args >
inline void opDispatch ( size_t ninputs , Args & & . . . args )
{
switch ( op )
{
case OPERATION : : EQUAL :
{
if ( ninputs = = 2 ) { // Operators that take two operands
switch ( op ) {
case OPERATION : : AND : {
auto op_and = [ ] ( const uint8_t & a , const uint8_t & b ) { return a & b ; } ;
binary_forward < T > ( op_and , std : : forward < Args > ( args ) . . . ) ;
break ;
}
case OPERATION : : EQUAL : {
auto equal = [ ] ( const T & a , const T & b ) { return a = = b ; } ;
binary_forward < T > ( equal , std : : forward < Args > ( args ) . . . ) ;
break ;
}
case OPERATION : : GREATER :
{
case OPERATION : : GREATER : {
auto greater = [ ] ( const T & a , const T & b ) { return a > b ; } ;
binary_forward < T > ( greater , std : : forward < Args > ( args ) . . . ) ;
break ;
}
case OPERATION : : GREATER_EQUAL :
{
case OPERATION : : GREATER_EQUAL : {
auto greater_equal = [ ] ( const T & a , const T & b ) { return a > = b ; } ;
binary_forward < T > ( greater_equal , std : : forward < Args > ( args ) . . . ) ;
break ;
}
case OPERATION : : LESS :
{
case OPERATION : : LESS : {
auto less = [ ] ( const T & a , const T & b ) { return a < b ; } ;
binary_forward < T > ( less , std : : forward < Args > ( args ) . . . ) ;
break ;
}
case OPERATION : : LESS_EQUAL :
{
case OPERATION : : LESS_EQUAL : {
auto less_equal = [ ] ( const T & a , const T & b ) { return a < = b ; } ;
binary_forward < T > ( less_equal , std : : forward < Args > ( args ) . . . ) ;
break ;
}
case OPERATION : : POW :
{
case OPERATION : : OR : {
auto op_or = [ ] ( const uint8_t & a , const uint8_t & b ) { return a | b ; } ;
binary_forward < T > ( op_or , std : : forward < Args > ( args ) . . . ) ;
break ;
}
case OPERATION : : POW : {
auto pow = [ ] ( const T & a , const T & b ) { return std : : pow ( a , b ) ; } ;
binary_forward < T > ( pow , std : : forward < Args > ( args ) . . . ) ;
binary_forward < T > ( pow , std : : forward < Args > ( args ) . . . , 1e5 ) ;
break ;
}
case OPERATION : : BITSHIFT :
{
case OPERATION : : XOR : {
auto op_xor = [ ] ( const uint8_t & a , const uint8_t & b ) { return a ^ b ; } ;
binary_forward < T > ( op_xor , std : : forward < Args > ( args ) . . . ) ;
break ;
}
case OPERATION : : BITSHIFT : {
auto bitshift = [ ] ( const uint8_t & a , const uint8_t & b ) { return a < < b ; } ;
binary_forward < T > ( bitshift , std : : forward < Args > ( args ) . . . ) ;
break ;
}
case OPERATION : : MAX :
{
case OPERATION : : MAX : {
auto max = [ ] ( const T & a , const T & b ) { return std : : max ( a , b ) ; } ;
nary_forward < T > ( max , T { 1 } , std : : forward < Args > ( args ) . . . ) ;
bi nary_forward< T > ( max , std : : forward < Args > ( args ) . . . ) ;
break ;
}
case OPERATION : : MEAN :
{
case OPERATION : : MEAN : {
auto mean = [ ] ( const T & a , const T & b ) { return ( a + b ) / T { 2 } ; } ;
nary_forward < T > ( mean , T { 1 } / ninputs , std : : forward < Args > ( args ) . . . ) ;
bi nary_forward< T > ( mean , std : : forward < Args > ( args ) . . . ) ;
break ;
}
case OPERATION : : MIN :
{
case OPERATION : : MIN : {
auto min = [ ] ( const T & a , const T & b ) { return std : : min ( a , b ) ; } ;
nary_forward < T > ( min , T { 1 } , std : : forward < Args > ( args ) . . . ) ;
bi nary_forward< T > ( min , std : : forward < Args > ( args ) . . . ) ;
break ;
}
case OPERATION : : MOD :
{
case OPERATION : : MOD : {
auto mod = [ ] ( const T & a , const T & b ) { return static_cast < T > ( _mod ( int ( a ) , int ( b ) ) ) ; } ;
binary_forward < T > ( mod , std : : forward < Args > ( args ) . . . ) ;
break ;
}
case OPERATION : : FMOD :
{
case OPERATION : : FMOD : {
auto fmod = [ ] ( const T & a , const T & b ) { return std : : fmod ( a , b ) ; } ;
binary_forward < T > ( fmod , std : : forward < Args > ( args ) . . . ) ;
break ;
}
case OPERATION : : PROD :
{
case OPERATION : : PROD : {
auto prod = [ ] ( const T & a , const T & b ) { return a * b ; } ;
binary_forward < T > ( prod , std : : forward < Args > ( args ) . . . ) ;
break ;
}
case OPERATION : : SUB :
{
case OPERATION : : SUB : {
auto sub = [ ] ( const T & a , const T & b ) { return a - b ; } ;
binary_forward < T > ( sub , std : : forward < Args > ( args ) . . . ) ;
break ;
}
case OPERATION : : SUM :
{
auto sum = [ ] ( const T & a , const T & b ) { return a + b ; } ;
nary_forward < T > ( sum , T { 1 } , std : : forward < Args > ( args ) . . . ) ;
break ;
}
case OPERATION : : ADD :
{
auto add = [ ] ( const T & a , const T & b ) { return a + b ; } ;
binary_forward < T > ( add , std : : forward < Args > ( args ) . . . ) ;
case OPERATION : : SUM : {
auto sum = [ ] ( const T & a , const T & b ) { return a + b ; } ;
binary_forward < T > ( sum , std : : forward < Args > ( args ) . . . ) ;
break ;
}
case OPERATION : : DIV :
{
case OPERATION : : DIV : {
auto div = [ ] ( const T & a , const T & b ) { return a / b ; } ;
binary_forward < T > ( div , std : : forward < Args > ( args ) . . . ) ;
break ;
}
case OPERATION : : AND :
default : CV_Error ( Error : : StsBadArg , " Unsupported operation " ) ;
}
} else if ( ninputs = = 3 & & op = = OPERATION : : WHERE ) { // Operators that take three operands
auto where = [ ] ( const T & a , const T & b , const T & c ) { return a ? b : c ; } ;
ternary_forward < T > ( where , std : : forward < Args > ( args ) . . . ) ;
} else { // Operators that can take multiple (>= 3) operands
switch ( op )
{
auto op_and = [ ] ( const uint8_t & a , const uint8_t & b ) { return a & b ; } ;
binary_forward < T > ( op_and , std : : forward < Args > ( args ) . . . ) ;
case OPERATION : : MAX : {
auto max = [ ] ( const T & a , const T & b ) { return std : : max ( a , b ) ; } ;
nary_forward < T > ( max , T { 1 } , std : : forward < Args > ( args ) . . . ) ;
break ;
}
case OPERATION : : OR :
{
auto op_or = [ ] ( const uint8_t & a , const uint8_t & b ) { return a | b ; } ;
bi nary_forward< T > ( op_or , std : : forward < Args > ( args ) . . . ) ;
case OPERATION : : MEAN : {
// Sum up inputs and then calculate mean by scale = 1 / ninputs
auto sum = [ ] ( const T & a , const T & b ) { return a + b ; } ;
nary_forward < T > ( sum , T { 1 } / ninputs , std : : forward < Args > ( args ) . . . ) ;
break ;
}
case OPERATION : : XOR :
{
auto op_xor = [ ] ( const uint8_t & a , const uint8_t & b ) { return a ^ b ; } ;
binary_forward < T > ( op_xor , std : : forward < Args > ( args ) . . . ) ;
case OPERATION : : MIN : {
auto min = [ ] ( const T & a , const T & b ) { return std : : min ( a , b ) ; } ;
nary_forward < T > ( min , T { 1 } , std : : forward < Args > ( args ) . . . ) ;
break ;
}
case OPERATION : : WHERE :
{
auto op_where = [ ] ( const T & a , const T & b , const T & c ) { return a ? b : c ; } ;
trinary_forward < T > ( op_where , std : : forward < Args > ( args ) . . . ) ;
case OPERATION : : SUM : {
auto sum = [ ] ( const T & a , const T & b ) { return a + b ; } ;
nary_forward < T > ( sum , T { 1 } , std : : forward < Args > ( args ) . . . ) ;
break ;
}
default :
CV_Error ( Error : : StsBadArg , " Unsupported operation. " ) ;
}
} ;
}
@ -811,6 +930,9 @@ public:
case OPERATION : : FMOD :
op_ = cuda4dnn : : EltwiseOpType : : FMOD ;
break ;
case OPERATION : : POW :
op_ = cuda4dnn : : EltwiseOpType : : POW ;
break ;
default : return Ptr < BackendNode > ( ) ; // return empty cuda_node if the EltwiseOpType is unsupported type.
} ;
@ -881,6 +1003,15 @@ public:
# ifdef HAVE_DNN_NGRAPH
virtual Ptr < BackendNode > initNgraph ( const std : : vector < Ptr < BackendWrapper > > & inputs , const std : : vector < Ptr < BackendNode > > & nodes ) CV_OVERRIDE
{
// In case only one input
if ( inputs . size ( ) = = 1 ) {
auto & ieInpNode = nodes [ 0 ] . dynamicCast < InfEngineNgraphNode > ( ) - > node ;
ngraph : : OutputVector inp { ieInpNode } ;
auto blank = std : : make_shared < ov : : op : : v0 : : Concat > ( inp , 0 ) ;
return Ptr < BackendNode > ( new InfEngineNgraphNode ( blank ) ) ;
}
// TODO: Support multiple (>=3) inputs
CV_Assert ( inputs . size ( ) = = 2 ) ;
auto & inp0 = nodes [ 0 ] . dynamicCast < InfEngineNgraphNode > ( ) - > node ;
auto & inp1 = nodes [ 1 ] . dynamicCast < InfEngineNgraphNode > ( ) - > node ;