@@ -66,9 +66,28 @@ public:
        DIV = 3
    } op;
    std::vector<float> coeffs;
    bool variableChannels;

    enum OutputChannelsMode
    {
        ELTWISE_CHANNNELS_SAME = 0,          //!< number of channels from inputs must be the same and equal to output's number of channels
        ELTWISE_CHANNNELS_INPUT_0,           //!< number of channels from inputs may be different,
                                             //!< output's number of channels is equal to number of channels of first input
                                             //!< number of channels of other inputs should not be greater than number of channels of first input
        ELTWISE_CHANNNELS_INPUT_0_TRUNCATE,  //!< number of channels from inputs may be different,
                                             //!< output's number of channels is equal to number of channels of first input
                                             //!< there is restriction on number of channels of other inputs
                                             //!< extra channels of other inputs is ignored
        ELTWISE_CHANNNELS_USE_MAX,           //!< number of channels from inputs may be different,
                                             //!< output's number of channels is equal to maximal number of input channels
                                             //!< @note supported operation: `SUM`
    } channelsModeInput;
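    // Illustrative example (a sketch, assuming two inputs of shape [1, 4, H, W] and [1, 6, H, W]
    // with matching spatial sizes), how the modes above resolve:
    //   ELTWISE_CHANNNELS_SAME             -> rejected, channel counts must match (4 != 6)
    //   ELTWISE_CHANNNELS_INPUT_0          -> rejected, other inputs must not be wider than the first (6 > 4)
    //   ELTWISE_CHANNNELS_INPUT_0_TRUNCATE -> output has 4 channels, extra channels of the wider input are ignored
    //   ELTWISE_CHANNNELS_USE_MAX          -> output has 6 channels (SUM only)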
    mutable OutputChannelsMode channelsMode;  //!< "optimized" channels mode (switch to ELTWISE_CHANNNELS_SAME if number of input channels are equal)
    mutable /*size_t*/ int outputChannels;

    EltwiseLayerImpl(const LayerParams& params)
        : outputChannels(0)
    {
        setParamsFrom(params);
        op = SUM;
@@ -97,6 +116,35 @@ public:
                coeffs[i] = paramCoeff.get<float>(i);
            }
        }

        channelsModeInput = ELTWISE_CHANNNELS_SAME;
        if (params.has("output_channels_mode"))
        {
            String v = toLowerCase(params.get<String>("output_channels_mode"));
            if (v == "same")
            {
                channelsModeInput = ELTWISE_CHANNNELS_SAME;
            }
            else if (v == "input_0")
            {
                channelsModeInput = ELTWISE_CHANNNELS_INPUT_0;
            }
            else if (v == "input_0_truncate")
            {
                channelsModeInput = ELTWISE_CHANNNELS_INPUT_0_TRUNCATE;
            }
            else if (v == "max_input_channels")
            {
                channelsModeInput = ELTWISE_CHANNNELS_USE_MAX;
                if (op != SUM)
                    CV_Error(cv::Error::StsBadArg, "[" + type + "]:(" + name + ") 'max' channels mode is limited to SUM operation only");
            }
            else
                CV_Error(cv::Error::StsBadArg, "[" + type + "]:(" + name + ") unknown channels mode: \"" + v + "\"");
        }
        channelsMode = channelsModeInput;

        // TODO Must have checks for other unknown options
    }
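    // Usage sketch (illustrative only, assuming a hypothetical cv::dnn::Net object `net`
    // built through the C++ API): the channels mode is selected via the string parameter
    // parsed above, for example
    //   LayerParams lp;
    //   lp.type = "Eltwise";
    //   lp.set("operation", "sum");
    //   lp.set("output_channels_mode", "input_0_truncate");
    //   net.addLayerToPrev("fuse", "Eltwise", lp);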
    virtual bool supportBackend(int backendId) CV_OVERRIDE
@@ -104,7 +152,7 @@ public:
        return backendId == DNN_BACKEND_OPENCV ||
               backendId == DNN_BACKEND_HALIDE ||
               ((((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && (preferableTarget != DNN_TARGET_OPENCL || coeffs.empty()))
               || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && !variableChannels));
               || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && channelsMode == ELTWISE_CHANNNELS_SAME));
    }

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
@@ -119,212 +167,320 @@ public:
        int dims = inputs[0].size();
        // Number of channels in output shape is determined by the first input tensor.
        bool variableChannels = false;
        int numChannels = inputs[0][1];
        for (int i = 1; i < inputs.size(); i++)
        for (size_t i = 1; i < inputs.size(); i++)
        {
            CV_Assert(inputs[0][0] == inputs[i][0]);
            CV_Assert(inputs[0][0] == inputs[i][0]);  // batch sizes are equal

            // It's allowed for channels axis to be different.
            for (int j = 2; j < dims; j++)
            int input_channels = inputs[i][1];
            if (numChannels != input_channels)
                variableChannels = true;

            if (channelsModeInput == ELTWISE_CHANNNELS_SAME)
            {
                CV_Assert(numChannels == input_channels);
            }
            else if (channelsModeInput == ELTWISE_CHANNNELS_INPUT_0)
            {
                CV_Assert(numChannels >= input_channels);
            }
            else if (channelsModeInput == ELTWISE_CHANNNELS_INPUT_0_TRUNCATE)
            {
                // nothing to check
            }
            else if (channelsModeInput == ELTWISE_CHANNNELS_USE_MAX)
            {
                numChannels = std::max(numChannels, input_channels);
            }
            else
            {
                CV_Assert(0 && "Internal error");
            }

            for (size_t j = 2; j < dims; j++)
                CV_Assert(inputs[0][j] == inputs[i][j]);
        }

        channelsMode = variableChannels ? channelsModeInput : ELTWISE_CHANNNELS_SAME;
        outputChannels = numChannels;

        outputs.assign(1, inputs[0]);
        outputs[0][1] = numChannels;
        return false;
    }
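    // Worked example (illustrative): with channelsModeInput == ELTWISE_CHANNNELS_USE_MAX and
    // hypothetical input shapes [2, 4, 14, 14] and [2, 6, 14, 14], the loop keeps
    // numChannels = max(4, 6) = 6, so outputs[0] becomes [2, 6, 14, 14]; batch and spatial
    // dimensions must already match or the CV_Assert checks above fail.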
    void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
    {
        std::vector<Mat> inputs;
        inputs_arr.getMatVector(inputs);
        variableChannels = false;
        for (int i = 1; i < inputs.size(); ++i)
        {
            if (inputs[i].size[1] != inputs[0].size[1])
            {
                variableChannels = true;
                break;
            }
        }
    }
    class EltwiseInvoker : public ParallelLoopBody
    {
    public:
        EltwiseLayerImpl& self;
        std::vector<const Mat*> srcs;
        std::vector<int> srcNumChannels;
        int nsrcs;
        Mat* dst;
        std::vector<float> coeffs;
        EltwiseOp op;
        int nstripes;
        const ActivationLayer* activ;
        int channels;
        size_t planeSize;

        EltwiseInvoker() : nsrcs(0), dst(0), op(PROD), nstripes(0), activ(0), channels(0), planeSize(0) {}
        EltwiseInvoker(EltwiseLayerImpl& self_)
            : self(self_)
            , nsrcs(0), dst(0), nstripes(0), activ(0), channels(0)
            , planeSize(0)
        {}

        static void run(const Mat* srcs, int nsrcs, Mat& dst,
                        const std::vector<float>& coeffs, EltwiseOp op,
                        const ActivationLayer* activ, int nstripes)
    public:
        static void run(EltwiseLayerImpl& self,
                        const Mat* srcs, int nsrcs, Mat& dst,
                        int nstripes)
        {
            const EltwiseOp op = self.op;
            CV_Check(dst.dims, 1 < dst.dims && dst.dims <= 5, ""); CV_CheckTypeEQ(dst.type(), CV_32FC1, ""); CV_Assert(dst.isContinuous());
            CV_Assert(coeffs.empty() || coeffs.size() == (size_t)nsrcs);
            CV_Assert(self.coeffs.empty() || self.coeffs.size() == (size_t)nsrcs);
            CV_CheckGE(nsrcs, 2, "");

            CV_Assert(self.outputChannels == dst.size[1]);

            EltwiseInvoker p;
            EltwiseInvoker p(self);
            p.srcs.resize(nsrcs);
            p.coeffs = coeffs;
            p.srcNumChannels.resize(nsrcs);
            p.coeffs = self.coeffs;  // can be sorted

            bool sortInputs = false;
            for (int i = 0; i < nsrcs; i++)
            {
                p.srcs[i] = srcs + i;
                CV_Assert(srcs[i].type() == dst.type() &&
                          srcs[i].isContinuous());
                // Sort srcs and coefficients in the order by number of channels
                for (int j = i; j >= 1 && p.srcs[j - 1]->size[1] < p.srcs[j]->size[1]; j--)
                p.srcs[i] = &srcs[i];
                CV_CheckEQ(srcs[i].dims, dst.dims, "");
                CV_Assert(srcs[i].isContinuous());
                CV_Assert(srcs[i].type() == dst.type());
                p.srcNumChannels[i] = (srcs[i].dims >= 4) ? srcs[i].size[1] : 1;

                if (self.channelsMode == ELTWISE_CHANNNELS_SAME)
                {
                    std::swap(p.srcs[j - 1], p.srcs[j]);
                    if (!p.coeffs.empty())
                        std::swap(p.coeffs[j - 1], p.coeffs[j]);
                    CV_Assert(srcs[i].size == dst.size);
                }
                else if (self.channelsMode == ELTWISE_CHANNNELS_INPUT_0)
                {
                    if (i == 0)
                        CV_Assert(srcs[0].size == dst.size);
                    CV_Assert(self.outputChannels >= p.srcNumChannels[i]);
                    sortInputs = true;
                }
                else if (self.channelsMode == ELTWISE_CHANNNELS_INPUT_0_TRUNCATE)
                {
                    if (i == 0)
                        CV_Assert(srcs[0].size == dst.size);
                    sortInputs = true;
                }
                else if (self.channelsMode == ELTWISE_CHANNNELS_USE_MAX)
                {
                    CV_Assert(op == SUM);
                    CV_Assert(self.outputChannels >= p.srcNumChannels[i]);
                    sortInputs = true;
                }
                else
                {
                    CV_Assert(0 && "Internal error");
                }

                if (sortInputs)
                {
                    // Sort srcs and coefficients in the desc order by number of channels
                    for (int j = i; j >= 1; j--)
                    {
                        if (std::min(self.outputChannels, p.srcs[j - 1]->size[1]) < std::min(self.outputChannels, p.srcs[j]->size[1]))
                        {
                            std::swap(p.srcs[j - 1], p.srcs[j]);
                            std::swap(p.srcNumChannels[j - 1], p.srcNumChannels[j]);
                            if (!p.coeffs.empty())
                                std::swap(p.coeffs[j - 1], p.coeffs[j]);
                        }
                        else
                            break;
                    }
                }
            }

            p.nsrcs = nsrcs;
            p.dst = &dst;
            p.op = op;
            p.nstripes = nstripes;
            p.channels = (dst.dims >= 4 ? dst.size[1] : 1);

            p.planeSize = dst.total(dst.dims >= 4 ? 2 : 1);
            CV_Assert(dst.total() == dst.size[0] * p.channels * p.planeSize);
            CV_CheckEQ(dst.total(), dst.size[0] * p.channels * p.planeSize, "");

            bool simpleCoeffs = true;
            if (op == SUM && !coeffs.empty())
            if (op == SUM && !p.coeffs.empty())
            {
                CV_Assert(coeffs.size() == (size_t)nsrcs);
                CV_CheckEQ(p.coeffs.size(), (size_t)nsrcs, "");

                for (size_t i = 0; i < coeffs.size(); i++)
                    if (coeffs[i] != 1)
                for (size_t i = 0; i < p.coeffs.size(); i++)
                {
                    if (p.coeffs[i] != 1)
                    {
                        simpleCoeffs = false;
                        break;
                    }
                }
            }
            if (simpleCoeffs)
                p.coeffs.clear();
            p.activ = activ;
            p.activ = self.activ.get();

            parallel_for_(Range(0, nstripes), p, nstripes);
        }
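        // Note (illustrative sketch of the invariant): after the descending sort above, srcs[0]
        // is always the input with the most effective channels, e.g. inputs with 4, 6 and 2
        // channels end up ordered 6, 4, 2 together with their coefficients. operator() relies
        // on this: it reads srcs[0] for every output channel c and skips narrower inputs once
        // c >= srcNumChannels[i].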
        void operator()(const Range& r) const CV_OVERRIDE
        {
            const EltwiseOp op = self.op;
            size_t total = dst->size[0] * planeSize;
            size_t stripeSize = (total + nstripes - 1) / nstripes;
            size_t stripeStart = r.start * stripeSize;
            size_t stripeEnd = std::min(r.end * stripeSize, total);
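            // Illustrative partitioning: with total = 10 and nstripes = 4, stripeSize = 3 and
            // the stripes cover offsets [0,3), [3,6), [6,9), [9,10).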
            int c, j, k, n;
            const float* coeffsptr = !coeffs.empty() ? &coeffs[0] : 0;
            float* dstptr0 = dst->ptr<float>();
            int blockSize0 = 1 << 12, blockSize;
            int blockSize0 = 1 << 12;

            for (size_t ofs = stripeStart; ofs < stripeEnd; ofs += blockSize)
            for (size_t ofs = stripeStart; ofs < stripeEnd; )
            {
                int sampleIdx = (int)(ofs / planeSize);
                int delta = (int)ofs - sampleIdx * planeSize;
                blockSize = std::min(blockSize0, std::min((int)(stripeEnd - ofs), (int)planeSize - delta));
                int blockSize = std::min(blockSize0, std::min((int)(stripeEnd - ofs), (int)planeSize - delta));
                if (blockSize <= 0)
                    break;
                ofs += blockSize;

                for (c = 0; c < channels; c++)
                for (int c = 0; c < channels; c++)
                {
                    size_t globalDelta = delta + (sampleIdx * channels + c) * planeSize;
                    const float* srcptr0 = srcs[0]->ptr<float>() + globalDelta;
                    float* dstptr = dstptr0 + globalDelta;
                    // This code assumes that srcs are sorted in descending order by channels.
                    for (n = 1; n < nsrcs && c < srcs[n]->size[1]; ++n) {}
                    size_t dstIdx = delta + (sampleIdx * channels + c) * planeSize;
                    float* dstptr = dstptr0 + dstIdx;

                    if (n == 1)
                    // process first two inputs
                    {
                        if (!coeffsptr)
                        const float* srcptr0 = srcs[0]->ptr<float>() + dstIdx;

                        const int inputIdx = 1;
                        int src1_channels = srcNumChannels[inputIdx];
                        if (c >= src1_channels)
                        {
                            for (j = 0; j < blockSize; j++)
                            // no data from second input
                            if (!coeffsptr || coeffsptr[0] == 1.0f)
                            {
                                for (int j = 0; j < blockSize; j++)
                                {
                                    dstptr[j] = srcptr0[j];
                                }
                            }
                            else
                            {
                                dstptr[j] = srcptr0[j];
                                float c0 = coeffsptr[0];
                                for (int j = 0; j < blockSize; j++)
                                {
                                    dstptr[j] = c0 * srcptr0[j];
                                }
                            }
                        }
                        else
                        {
                            float c0 = coeffsptr[0];
                            for (j = 0; j < blockSize; j++)
                            size_t srcIdx = delta + (sampleIdx * src1_channels + c) * planeSize;
                            const float* srcptrI = srcs[inputIdx]->ptr<float>() + srcIdx;

                            if (op == PROD)
                            {
                                dstptr[j] = c0 * srcptr0[j];
                                for (int j = 0; j < blockSize; j++)
                                {
                                    dstptr[j] = srcptr0[j] * srcptrI[j];
                                }
                            }
                        }
                    }
                    else if (op == PROD)
                    {
                        for (k = 1; k < n; k++)
                        {
                            const float* srcptr1 = srcs[k]->ptr<float>() + globalDelta;
                            for (j = 0; j < blockSize; j++)
                            else if (op == DIV)
                            {
                                dstptr[j] = srcptr0[j] * srcptr1[j];
                                for (int j = 0; j < blockSize; j++)
                                {
                                    dstptr[j] = srcptr0[j] / srcptrI[j];
                                }
                            }
                            srcptr0 = (const float*)dstptr;
                            else if (op == MAX)
                            {
                                for (int j = 0; j < blockSize; j++)
                                {
                                    dstptr[j] = std::max(srcptr0[j], srcptrI[j]);
                                }
                            }
                            else if (op == SUM)
                            {
                                if (!coeffsptr || (coeffsptr[0] == 1.0f && coeffsptr[1] == 1.0f))
                                {
                                    for (int j = 0; j < blockSize; j++)
                                    {
                                        dstptr[j] = srcptr0[j] + srcptrI[j];
                                    }
                                }
                                else
                                {
                                    float c0 = coeffsptr[0];
                                    float c1 = coeffsptr[1];
                                    for (int j = 0; j < blockSize; j++)
                                    {
                                        dstptr[j] = c0 * srcptr0[j] + c1 * srcptrI[j];
                                    }
                                }
                            }
                            else
                                CV_Error(Error::StsInternal, "");
                        }
                    }
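                    // Note (illustrative): at this point dstptr already holds the combination of
                    // the first two inputs, so the loop below only needs an in-place update
                    // (*=, /=, max, +=) for each remaining input.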
                    else if (op == DIV)
                    // aggregate other inputs (3+)
                    for (size_t inputIdx = 2; inputIdx < nsrcs; inputIdx++)
                    {
                        for (k = 1; k < n; k++)
                        int srcI_channels = srcNumChannels[inputIdx];
                        if (c >= srcI_channels)
                            continue;  // no data from second input
                        size_t srcIdx = delta + (sampleIdx * srcI_channels + c) * planeSize;
                        const float* srcptrI = srcs[inputIdx]->ptr<float>() + srcIdx;

                        if (op == PROD)
                        {
                            const float* srcptr1 = srcs[k]->ptr<float>() + globalDelta;
                            for (j = 0; j < blockSize; j++)
                            for (int j = 0; j < blockSize; j++)
                            {
                                dstptr[j] = srcptr0[j] / srcptr1[j];
                                dstptr[j] *= srcptrI[j];
                            }
                            srcptr0 = (const float*)dstptr;
                        }
                    }
                    else if (op == MAX)
                    {
                        for (k = 1; k < n; k++)
                        else if (op == DIV)
                        {
                            const float* srcptr1 = srcs[k]->ptr<float>() + globalDelta;
                            for (j = 0; j < blockSize; j++)
                            for (int j = 0; j < blockSize; j++)
                            {
                                dstptr[j] = std::max(srcptr0[j], srcptr1[j]);
                                dstptr[j] /= srcptrI[j];
                            }
                            srcptr0 = (const float*)dstptr;
                        }
                    }
                    else if (!coeffsptr)
                    {
                        for (k = 1; k < n; k++)
                        else if (op == MAX)
                        {
                            const float* srcptr1 = srcs[k]->ptr<float>() + globalDelta;
                            for (j = 0; j < blockSize; j++)
                            for (int j = 0; j < blockSize; j++)
                            {
                                dstptr[j] = srcptr0[j] + srcptr1[j];
                                dstptr[j] = std::max(dstptr[j], srcptrI[j]);
                            }
                            srcptr0 = (const float*)dstptr;
                        }
                    }
                    else
                    {
                        float c0 = coeffsptr[0];
                        for (k = 1; k < n; k++)
                        else if (op == SUM)
                        {
                            const float* srcptr1 = srcs[k]->ptr<float>() + globalDelta;
                            float c1 = coeffsptr[k];
                            for (j = 0; j < blockSize; j++)
                            if (!coeffsptr || coeffsptr[inputIdx] == 1.0f)
                            {
                                dstptr[j] = c0 * srcptr0[j] + c1 * srcptr1[j];
                                for (int j = 0; j < blockSize; j++)
                                {
                                    dstptr[j] += srcptrI[j];
                                }
                            }
                            else
                            {
                                float cI = coeffsptr[inputIdx];
                                for (int j = 0; j < blockSize; j++)
                                {
                                    dstptr[j] += cI * srcptrI[j];
                                }
                            }
                            srcptr0 = (const float*)dstptr;
                            c0 = 1;
                        }
                        else
                            CV_Error(Error::StsInternal, "");
                    }
                }
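                // Index math example (illustrative, assuming sampleIdx = 0, c = 1, planeSize = H*W,
                // a 4-channel output and a 2-channel input): dstIdx addresses plane 1 of the
                // 4-channel output, while srcIdx = delta + (0*2 + 1)*planeSize addresses plane 1
                // of the narrower input; for channels c >= 2 that input contributes nothing.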
@@ -343,7 +499,7 @@ public:
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

        if ((inputs_.depth() == CV_16S && op != SUM) || variableChannels)
        if ((inputs_.depth() == CV_16S && op != SUM) || (channelsMode != ELTWISE_CHANNNELS_SAME))
            return false;

        inputs_.getUMatVector(inputs);
@@ -446,8 +602,9 @@ public:
        CV_Assert(outputs.size() == 1);
        const int nstripes = getNumThreads();
        EltwiseInvoker::run(&inputs[0], (int)inputs.size(), outputs[0],
                            coeffs, op, activ.get(), nstripes);
        EltwiseInvoker::run(*this,
                            &inputs[0], (int)inputs.size(), outputs[0],
                            nstripes);
    }
    virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &input) CV_OVERRIDE
@@ -558,6 +715,7 @@ public:
        CV_UNUSED(outputs);  // suppress unused variable warning
        CV_Assert(inputs.size());

        // FIXIT: handle inputs with different number of channels
        long flops = inputs.size() * total(inputs[0]);
        return flops;
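        // Rough estimate example (illustrative): for three inputs with inputs[0] of shape
        // [1, 64, 56, 56], flops = 3 * 1*64*56*56 = 602112 element operations; inputs with
        // fewer channels are still counted at the full output size (see FIXIT above).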