update convolution opencl kernels in dnn module (#11762)

* optimize ocl kernel enqueue in fc layer

Signed-off-by: Li Peng <peng.li@intel.com>

* use CV_LOG_INFO in convolution auto tuning

Signed-off-by: Li Peng <peng.li@intel.com>

* update convolution IDLF kernel

extend parameter tuning range, also cleanup
ocl kernel implementation

Signed-off-by: Li Peng <peng.li@intel.com>

* update in-memory convolution cache config

fp16 and fp32 cache config are stored separately

Signed-off-by: Li Peng <peng.li@intel.com>
pull/11831/head^2
Li, Peng 7 years ago committed by Vadim Pisarevsky
parent a2bc075924
commit ab8022f74e
  1. 4
      modules/dnn/src/layers/fully_connected_layer.cpp
  2. 1166
      modules/dnn/src/ocl4dnn/include/default_kernel_config.hpp
  3. 148
      modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
  4. 181
      modules/dnn/src/opencl/conv_layer_spatial.cl

@ -310,7 +310,6 @@ public:
innerProductOp = Ptr<OCL4DNNInnerProduct<float> >(new OCL4DNNInnerProduct<float>(config));
}
UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
for (size_t i = 0; i < inputs.size(); i++)
{
MatShape inshape, outshape;
@ -320,7 +319,6 @@ public:
UMat srcMat, dstMat;
srcMat = inputs[i].reshape(1, inshape.size(), &inshape[0]);
dstMat = outputs[i].reshape(1, outshape.size(), &outshape[0]);
dstMat.setTo(0.0f);
if (!innerProductOp->Forward(srcMat, (use_half) ? half_blobs[0] : umat_blobs[0],
(bias) ? (use_half ? half_blobs[1] : umat_blobs[1]) : UMat(),
@ -332,6 +330,7 @@ public:
if (!use_half && bias && (outerSize > 1))
{
UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
UMat& biases = umat_blobs[1];
cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0);
}
@ -354,6 +353,7 @@ public:
if (bias)
{
UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
UMat& biases = umat_blobs[1];
cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0);
}

File diff suppressed because it is too large Load Diff

@ -55,6 +55,7 @@
#include "../include/math_functions.hpp"
#include "../include/default_kernel_config.hpp"
#include "opencv2/dnn/shape_utils.hpp"
#include "opencv2/core/utils/logger.hpp"
#if defined WIN32 || defined _WIN32
#include <windows.h>
@ -87,10 +88,13 @@ static void initializeGlobalBuiltinConfigurations(const std::string& cache_path)
{
CV_Assert(defaultConfigLoaded == false);
CV_Assert(kernelConfigMap.empty());
const size_t numConfigs = sizeof(default_kernel_config_intel)/sizeof(default_kernel_config_intel[0])/2;
/* fp32 config */
size_t numConfigs = sizeof(default_kernel_config_intel_fp32) /
sizeof(default_kernel_config_intel_fp32[0]) / 2;
for (size_t i = 0; i < numConfigs; i++)
{
std::string key = std::string("Intel(R) Corporation_") + default_kernel_config_intel[2 * i];
std::string key = std::string("Intel(R) Corporation_") + default_kernel_config_intel_fp32[2 * i];
if (!cache_path.empty())
{
std::string cacheFile = cache_path + sanitize(key);
@ -100,9 +104,29 @@ static void initializeGlobalBuiltinConfigurations(const std::string& cache_path)
}
std::pair<std::string, std::string> entry(
key,
default_kernel_config_intel[2 * i + 1]);
default_kernel_config_intel_fp32[2 * i + 1]);
kernelConfigMap.insert(entry);
}
/* fp16 config */
numConfigs = sizeof(default_kernel_config_intel_fp16) /
sizeof(default_kernel_config_intel_fp16[0]) / 2;
for (size_t i = 0; i < numConfigs; i++)
{
std::string key = std::string("Intel(R) Corporation_") + default_kernel_config_intel_fp16[2 * i];
if (!cache_path.empty())
{
std::string cacheFile = cache_path + sanitize(key);
std::ifstream cachedKernel(cacheFile.c_str());
if (cachedKernel)
continue; // external configuration found, skip builtin
}
std::pair<std::string, std::string> entry(
key,
default_kernel_config_intel_fp16[2 * i + 1]);
kernelConfigMap.insert(entry);
}
defaultConfigLoaded = true;
}
@ -311,40 +335,38 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
// options
options_ << " -cl-fast-relaxed-math -D KERNEL_IDLF -D convolve_simd=" << kernel_name_;
options_ << " -cl-mad-enable";
if (clOptionSupport("-cl-no-subgroup-ifp"))
options_ << " -cl-no-subgroup-ifp ";
// defs
int32_t output_width = output_w_;
int32_t output_height = output_h_;
int32_t output_block_width = blockM;
int32_t output_block_height = blockK;
const int32_t last_block_width = (output_width % output_block_width == 0) ?
output_block_width : output_width % output_block_width;
const int32_t last_block_height = (output_height % output_block_height == 0) ?
output_block_height : output_height % output_block_height;
int tile_x = alignSize((output_block_width - 1) * stride_w_ + kernel_w_ * dilation_w_, 4);
int tile_x = (output_block_width - 1) * stride_w_ + kernel_w_ * dilation_w_;
int tile_y = (output_block_height - 1) * stride_h_ + kernel_h_ * dilation_h_;
int tile_y_stride = (4 * simd_size) / tile_x;
int invec_size = divUp(tile_y, tile_y_stride);
int invec_size = tile_y;
addDef("SIMD_SIZE", simd_size);
addDef("filter_qualifier", "__global");
addDef("OUT_BLOCK_WIDTH", output_block_width);
addDef("OUT_BLOCK_HEIGHT", output_block_height);
addDef("LAST_BLOCK_WIDTH", last_block_width);
addDef("LAST_BLOCK_HEIGHT", last_block_height);
addDef("INPUT_DEPTH", channels_ / group_);
addDef("TOTAL_INPUT_DEPTH_SIZE", channels_);
addDef("TOTAL_OUTPUT_DEPTH", num_output_);
addDef("NUM_FILTERS", M_);
addDef("TILE_X", tile_x);
addDef("TILE_Y", tile_y);
addDef("TILE_Y_STRIDE", tile_y_stride);
addDef("INVEC_SIZE", invec_size);
addDef("ALIGNED_NUM_FILTERS", (int)alignSize(M_, simd_size));
addDef("OUT_BLOCK_SIZE", (output_block_width*output_block_height));
addDef("APPLY_BIAS", bias_term_);
addDef("WEIGHT_PREF", ((kernel_w_ * kernel_h_) == 1) ? 1 : 8);
addDef("INPUT_PITCH", (width_ * height_));
addDef("OUTPUT_PITCH", (output_w_ * output_h_));
addDef("LEFT_FILTERS", ((int)alignSize(M_, simd_size) - M_));
addDef("INPUT_WIDTH", width_);
addDef("INPUT_HEIGHT", height_);
addDef("FILTERS_IN_GROUP", ((int)alignSize(M_, simd_size) / simd_size));
setFusionDefine(fused_activ_, fused_eltwise_);
src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
@ -567,13 +589,6 @@ void OCL4DNNConvSpatial<Dtype>::calculateBenchmark(const UMat &bottom, UMat &ver
return;
}
#define dbg
#ifdef dbg
#define dbgPrint(x) (x)
#else
#define dbgPrint(x)
#endif
// For large enough input size, we do not need to tune kernels for different
// size. The reason is with large input size, there will be enough work items
// to feed al the EUs.
@ -584,6 +599,7 @@ void OCL4DNNConvSpatial<Dtype>::calculateBenchmark(const UMat &bottom, UMat &ver
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::generateKey()
{
std::string precision = (use_half_) ? "FP16" : "FP32";
std::stringstream keyBuilder;
// FIXME: to support fuse?
keyBuilder << "k" << kernel_w_ << "x" << kernel_h_ << "_"
@ -597,7 +613,8 @@ void OCL4DNNConvSpatial<Dtype>::generateKey()
<< "num" << num_ << "_"
<< "M" << M_ << "_"
<< "activ" << fused_activ_ << "_"
<< "eltwise" << fused_eltwise_;
<< "eltwise" << fused_eltwise_ << "_"
<< precision;
key_ = ocl::Device::getDefault().vendorName() + "_EU" + cv::format("%d", ocl::Device::getDefault().maxComputeUnits()) + "_" + keyBuilder.str();
@ -616,11 +633,6 @@ std::string OCL4DNNConvSpatial<Dtype>::generateSpecificKey(int32_t type, int32_t
<< "_" << blockHeight
<< "_" << blockDepth;
if (!use_half_)
keyBuilder << "_float";
else
keyBuilder << "_half";
return keyBuilder.str();
}
@ -1164,7 +1176,7 @@ float OCL4DNNConvSpatial<float>::timedConvolve(const UMat &bottom, UMat &top,
cv::ocl::Timer timer(queue);
timer.start();
bool res = true;;
dbgPrint(std::cout << "Benchmarking kernel: " << config->kernelName << std::endl);
CV_LOG_INFO(NULL, "Benchmarking kernel: " << config->kernelName);
tuned_ = true;
int loop_cnt = 4;
for (int i = 0; i < loop_cnt; i++) {
@ -1181,7 +1193,6 @@ float OCL4DNNConvSpatial<float>::timedConvolve(const UMat &bottom, UMat &top,
}
float elapsedTime = timer.durationNS() * 1e-6 / loop_cnt;
#ifdef dbg
double out_w = output_w_;
double out_h = output_h_;
double out_z = M_;
@ -1189,16 +1200,8 @@ float OCL4DNNConvSpatial<float>::timedConvolve(const UMat &bottom, UMat &top,
double k_h = kernel_h_;
double k_z = channels_;
double totalFlops = ((k_w*k_h*k_z -1)*2)*(out_w*out_h*out_z)*num_;
std::cout << "\tEstimated Gflops:" << (totalFlops * 1e-9)
<< std::endl;
std::cout << "\tEstimated GFLOPS/S: " << ((totalFlops * 1e-9)*(1000.0/elapsedTime))
<< std::endl;
#if 0
std::cout << "Estimated utilization: " <<
((((totalFlops/1000)/1000)/1000)*(1000.0/elapsedTime))/880.0
<< std::endl;
#endif
#endif
CV_LOG_INFO(NULL, "\tEstimated Gflops:" << (totalFlops * 1e-9));
CV_LOG_INFO(NULL, "\tEstimated GFLOPS/S: " << ((totalFlops * 1e-9)*(1000.0/elapsedTime)));
return elapsedTime;
}
@ -1254,18 +1257,18 @@ bool OCL4DNNConvSpatial<float>::verifyResult(const UMat &bottom,
if (use_half_ && error_factor > 0.1 * fabs(verify_data[offset]) &&
error_factor > 0.04 && !(fabs(verify_data[offset]) < 1.e-3 && error_factor < 1.e-4))
{
dbgPrint(printf("test verification failed @ image %d group %d"
"out_ch %d h %d w %d got %G expected %G\n",
n, g, out_ch, h, w, data[offset], verify_data[offset]));
CV_LOG_ERROR(NULL, "test verification failed @ image " << n << " group " << g
<< " out_ch " << out_ch << " h " << h << " w " << w
<< " got " << data[offset] << " expected " << verify_data[offset]);
verificationFail = 1;
goto out;
}
else if (!use_half_ && error_factor > 0.1 * fabs(verify_data[offset]) &&
!(fabs(verify_data[offset]) < 1.e-3 && error_factor < 1.e-4))
{
dbgPrint(printf("test verification failed @ image %d group %d"
"out_ch %d h %d w %d got %G expected %G\n",
n, g, out_ch, h, w, data[offset], verify_data[offset]));
CV_LOG_ERROR(NULL, "test verification failed @ image " << n << " group " << g
<< " out_ch " << out_ch << " h " << h << " w " << w
<< " got " << data[offset] << " expected " << verify_data[offset]);
verificationFail = 1;
goto out;
}
@ -1546,17 +1549,11 @@ void OCL4DNNConvSpatial<float>::generate_idlf_tuneritems(std::vector< cv::Ptr<tu
return;
int actual_tile_x = kernel_w_ * dilation_w_ + (blockM - 1) * stride_w_ ;
int tile_x = alignSize(actual_tile_x, 4);
int tile_y = kernel_h_ * dilation_h_ + (blockK - 1) * stride_h_;
if (tile_x > (4 * simd_size))
return;
if ((blockM * blockK + divUp(tile_x * tile_y, simd_size)) > block_size_max)
int tile_x = alignSize(actual_tile_x, simd_size);
if (tile_x > simd_size)
return;
int tile_y_stride = (4 * simd_size) / tile_x;
int invec_size = divUp(tile_y, tile_y_stride);
if (invec_size > 4)
if (blockM * blockK > block_size_max)
return;
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_INTEL_IDLF, blockM, blockK, simd_size));
@ -1599,11 +1596,7 @@ void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerPar
for (uint32_t height = height_max; height > 0; height--)
{
generate_idlf_tuneritems(tunerItems, width, height, simd_size);
if (tunerItems.size() >= 8 && height == 2)
break;
}
if (tunerItems.size() >= 12 && width == 2)
break;
}
}
}
@ -1690,10 +1683,8 @@ void OCL4DNNConvSpatial<float>::setupConvolution(const UMat &bottom,
if (kernelQueue[x]->tested == false) {
bool verified = verifyResult(bottom, top, weight, bias, numImages, kernelQueue[x], verifyTop);
if (verified == false) {
dbgPrint(std::cout << "Kernel "
<< kernelQueue[x]->kernelName
<< " failed verification" << std::endl);
dbgPrint(std::cout << "kernelQueue[x]->workItem_output[0]: "
CV_LOG_ERROR(NULL, "Kernel " << kernelQueue[x]->kernelName << " failed verification");
CV_LOG_ERROR(NULL, "kernelQueue[x]->workItem_output[0]: "
<< kernelQueue[x]->workItem_output[0] << " "
<< "kernelQueue[x]->workItem_output[1]: "
<< kernelQueue[x]->workItem_output[1] << " "
@ -1714,11 +1705,9 @@ void OCL4DNNConvSpatial<float>::setupConvolution(const UMat &bottom,
<< "kernelQueue[x]->local_work_size[2]: "
<< kernelQueue[x]->local_work_size[2] << " "
<< kernelQueue[x]->swizzle_weights << " "
<< kernelQueue[x]->use_null_local << std::endl);
<< kernelQueue[x]->use_null_local);
} else {
dbgPrint(std::cout << "Kernel "
<< kernelQueue[x]->kernelName
<< " pass verification" << std::endl);
CV_LOG_INFO(NULL, "Kernel " << kernelQueue[x]->kernelName << " pass verification");
}
}
#endif
@ -1747,19 +1736,28 @@ void OCL4DNNConvSpatial<float>::setupConvolution(const UMat &bottom,
break;
} else {
kernelQueue[fastestKernel]->tested = true;
dbgPrint(std::cout << "Kernel " <<
kernelQueue[fastestKernel]->kernelName <<
" failed verification" << std::endl);
CV_LOG_ERROR(NULL, "Kernel " << kernelQueue[fastestKernel]->kernelName <<
" failed verification");
failures++;
}
}
}
if (verification) {
dbgPrint(std::cout << "Kernel <" << kernelQueue[kernel_index_]->kernelName <<
"> passed verification" << std::endl);
dbgPrint(std::cout << "Convolution Time:" << kernelQueue[kernel_index_]->executionTime << std::endl);
CV_LOG_INFO(NULL, "Kernel <" << kernelQueue[kernel_index_]->kernelName <<
"> passed verification");
CV_LOG_INFO(NULL, "Convolution Time:" << kernelQueue[kernel_index_]->executionTime);
double out_w = output_w_;
double out_h = output_h_;
double out_z = M_;
double k_w = kernel_w_;
double k_h = kernel_h_;
double k_z = channels_;
float elapsedTime = kernelQueue[kernel_index_]->executionTime;
double totalFlops = ((k_w*k_h*k_z -1)*2)*(out_w*out_h*out_z)*num_;
CV_LOG_INFO(NULL, "\tEstimated Gflops:" << (totalFlops * 1e-9));
CV_LOG_INFO(NULL, "\tEstimated GFLOPS/S: " << ((totalFlops * 1e-9)*(1000.0/elapsedTime)));
} else {
dbgPrint(std::cout << "fallback to basic kernel" << std::endl);
CV_LOG_INFO(NULL, "fallback to basic kernel");
options_.str(""); options_.clear(); // clear contents and state flags
createBasicKernel(1, 1, 1);
kernel_index_ = kernelQueue.size() - 1;

@ -206,8 +206,6 @@ __kernel void ConvolveBasic(
#elif defined KERNEL_IDLF
#define VLOAD4(_v, _p) do { _v = vload4(0, _p); } while(0)
// Each work-item computes a OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT region of one output map.
// Each work-group (which will be mapped to 1 SIMD16/SIMD8 EU thread) will compute 16/8 different feature maps, but each feature map is for the same region of the input image.
// NDRange: (output_width+pad)/ OUT_BLOCK_WIDTH, (output_height+pad)/OUT_BLOCK_HEIGHT, NUM_FILTERS/OUT_BLOCK_DEPTH
@ -219,124 +217,76 @@ __kernel void
convolve_simd(
ELTWISE_DATA_ARG
FUSED_ARG
__global Dtype* inputs_base,
filter_qualifier Dtype* weights_base,
__global Dtype* inputs,
__global Dtype* weights,
BIAS_KERNEL_ARG
__global Dtype* outputs_base,
__global Dtype* outputs,
const ushort input_width,
const ushort input_height,
const ushort output_width,
const ushort output_height)
{
__global Dtype* outputs = outputs_base;
__global Dtype* inputs = inputs_base;
filter_qualifier Dtype* weights = weights_base;
unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH; // oc = Output Column
unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row
unsigned int fm = get_global_id(2); // fm = Feature Map = od = Output Depth
unsigned int fmg = get_group_id(2);
unsigned int lid = get_local_id(2);
Dtype out[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT];
int in_addr;
Dtype out[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT] = { 0.0f };
// find weights address of given neuron (lid is index)
unsigned int weight_addr = (fmg % (ALIGNED_NUM_FILTERS/SIMD_SIZE)) * INPUT_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE + lid;
unsigned int weight_addr = (fmg % FILTERS_IN_GROUP) *
INPUT_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE + lid;
for(int i=0;i<OUT_BLOCK_SIZE;i++) {
out[i]=0.0f;
}
unsigned int num_in_batch = fm / ALIGNED_NUM_FILTERS;
unsigned int num_in_batch = ( fm ) / ALIGNED_NUM_FILTERS;
unsigned int input_batch_offset = num_in_batch * INPUT_PITCH * TOTAL_INPUT_DEPTH_SIZE;
unsigned int input_batch_offset = num_in_batch * input_height * input_width * TOTAL_INPUT_DEPTH_SIZE;
int curr_local_y = ( lid / ( TILE_X / 4 ) );
int curr_local_x = ( lid % ( TILE_X / 4 ) ) * 4;
int curr_y = or * STRIDE_Y + curr_local_y;
int curr_x = oc * STRIDE_X + curr_local_x;
int curr_y = or * STRIDE_Y;
int curr_x = oc * STRIDE_X + lid;
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
int saved_y = curr_y;
#endif
in_addr = input_batch_offset
+ (curr_y - INPUT_PAD_H) * input_width // y tile offset
int in_addr = input_batch_offset
+ (curr_y - INPUT_PAD_H) * INPUT_WIDTH // y tile offset
+ curr_x - INPUT_PAD_W; // x tile offset
union {
Dtype4 in_vec[INVEC_SIZE];
Dtype in_array[INVEC_SIZE * 4];
} in_buf;
Dtype in_buf[INVEC_SIZE];
for(int kd = 0; kd < INPUT_DEPTH; kd++)
{
int in_offset = in_addr;
int reg = 0;
LOOP(INVEC_SIZE, reg,
__attribute__((opencl_unroll_hint(INVEC_SIZE)))
for (int reg = 0; reg < INVEC_SIZE; reg++)
{
if (curr_local_y + reg * TILE_Y_STRIDE < TILE_Y || INVEC_SIZE * TILE_Y_STRIDE <= (TILE_Y + 2) || reg < INVEC_SIZE - 1) {
in_buf[reg] = inputs[in_offset];
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + 3 >= INPUT_PAD_W && curr_x < input_width + INPUT_PAD_W) {
if (curr_x < INPUT_PAD_W) {
in_buf.in_vec[reg].s0 = 0;
if (curr_x + 1 >= INPUT_PAD_W && curr_x + 1 < input_width + INPUT_PAD_W)
in_buf.in_vec[reg].s1 = *(inputs + in_offset + 1);
else
in_buf.in_vec[reg].s1 = 0;
if (curr_x + 2 >= INPUT_PAD_W && curr_x + 2 < input_width + INPUT_PAD_W)
in_buf.in_vec[reg].s2 = *(inputs + in_offset + 2);
else
in_buf.in_vec[reg].s2 = 0;
if (curr_x + 3 < input_width + INPUT_PAD_W)
in_buf.in_vec[reg].s3 = *(inputs + in_offset + 3);
else
in_buf.in_vec[reg].s3 = 0;
} else {
VLOAD4(in_buf.in_vec[reg], inputs + in_offset);
if (curr_x + 1 >= input_width + INPUT_PAD_W)
in_buf.in_vec[reg].s1 = 0;
if (curr_x + 2 >= input_width + INPUT_PAD_W)
in_buf.in_vec[reg].s2 = 0;
if (curr_x + 3 >= input_width + INPUT_PAD_W)
in_buf.in_vec[reg].s3 = 0;
}
} else {
in_buf.in_vec[reg] = 0;
}
curr_y += TILE_Y_STRIDE;
#else
VLOAD4(in_buf.in_vec[reg], inputs + in_offset);
if (!(curr_y >= INPUT_PAD_H && curr_y < INPUT_HEIGHT + INPUT_PAD_H &&
curr_x >= INPUT_PAD_W && curr_x < INPUT_WIDTH + INPUT_PAD_W))
{
in_buf[reg] = 0;
}
#endif
curr_y += 1;
in_offset += INPUT_WIDTH;
}
in_offset += input_width * TILE_Y_STRIDE;
});
in_addr += input_height * input_width;
in_addr += INPUT_PITCH;
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
curr_y = saved_y;
#endif
#if KERNEL_WIDTH * KERNEL_HEIGHT != 1
#define WEIGHT_PREF 8
#else
#define WEIGHT_PREF 1
#endif
union {
Dtype w[WEIGHT_PREF];
#if KERNEL_WIDTH * KERNEL_HEIGHT != 1
INT_TYPE8 ui8;
#endif
} weight_buf;
Dtype weight_buf[WEIGHT_PREF];
int w_idx=0;
unsigned int orig_weight_addr = weight_addr;
#if KERNEL_WIDTH * KERNEL_HEIGHT != 1
weight_buf.ui8 = SUB_GROUP_BLOCK_READ8((__global INT_TYPE *)&weights[weight_addr]);
weight_addr += SIMD_SIZE * WEIGHT_PREF;
#else
weight_buf.w[0] = as_Dtype(SUB_GROUP_BLOCK_READ((__global INT_TYPE *)&weights[weight_addr]));
weight_addr += SIMD_SIZE * 1;
#endif
for (int i = 0; i < WEIGHT_PREF; i++)
{
weight_buf[i] = weights[weight_addr];
weight_addr += SIMD_SIZE;
}
#define BLOCK_IN(n) sub_group_broadcast( in_buf.in_array[((n)%4) + ((n) / (TILE_Y_STRIDE * TILE_X)) * 4], (((n) % (TILE_Y_STRIDE * TILE_X))/4))
#define BLOCK_IN(n, c) intel_sub_group_shuffle(in_buf[n], (c))
int kr = 0; // kr = Kernel Row
LOOP(KERNEL_HEIGHT, kr,// LOOP is a macro that unrolls the loop.
@ -344,51 +294,29 @@ convolve_simd(
int kc = 0; // kc = Kernel Column
LOOP(KERNEL_WIDTH, kc,
{
for(int br=0; br < OUT_BLOCK_HEIGHT; br++) {
for(int bc=0; bc < OUT_BLOCK_WIDTH; bc++) {
Dtype input = BLOCK_IN((br * STRIDE_Y + kr * DILATION_Y) * TILE_X + bc * STRIDE_X + kc * DILATION_X);
out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_buf.w[w_idx % WEIGHT_PREF], input, out[br * OUT_BLOCK_WIDTH + bc]);
}
for (int br=0; br < OUT_BLOCK_HEIGHT; br++)
{
for(int bc=0; bc < OUT_BLOCK_WIDTH; bc++)
{
Dtype input = BLOCK_IN((br * STRIDE_Y + kr * DILATION_Y), bc * STRIDE_X + kc * DILATION_X);
out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_buf[w_idx % WEIGHT_PREF], input, out[br * OUT_BLOCK_WIDTH + bc]);
}
#if KERNEL_WIDTH * KERNEL_HEIGHT > WEIGHT_PREF
// We assume KERNEL_W is equal to KERNEL_H here.
if ((w_idx + 1) % WEIGHT_PREF == 0
#if KERNEL_WIDTH * KERNEL_HEIGHT % 8 != 0
&& ((w_idx + 1) <= (KERNEL_WIDTH * KERNEL_HEIGHT - WEIGHT_PREF))
#endif
) {
weight_buf.ui8 = SUB_GROUP_BLOCK_READ8((__global INT_TYPE *)&weights[weight_addr]);
weight_addr += SIMD_SIZE * WEIGHT_PREF; // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details.
}
#if KERNEL_WIDTH*KERNEL_HEIGHT % 8 == 0
// need to do nothing
#else
else if ((w_idx + 1) % WEIGHT_PREF == 0 && ((w_idx + 1) > (KERNEL_WIDTH * KERNEL_HEIGHT - WEIGHT_PREF)))
#if KERNEL_WIDTH * KERNEL_HEIGHT % 8 == 1
weight_buf.w[0] = weights[weight_addr];
#elif KERNEL_WIDTH * KERNEL_HEIGHT % 8 == 2
weight_buf.ui8.s01 = SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)&weights[weight_addr]);
#elif KERNEL_WIDTH * KERNEL_HEIGHT % 8 <= 4
weight_buf.ui8.s0123 = SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)&weights[weight_addr]);
#else
weight_buf.ui8 = SUB_GROUP_BLOCK_READ8((__global INT_TYPE *)&weights[weight_addr]);
#endif
#endif
#endif
weight_buf[w_idx % WEIGHT_PREF] = weights[weight_addr];
weight_addr += SIMD_SIZE;
++w_idx;
});
});
weight_addr = orig_weight_addr + KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE;
}
// dead code to work around possible compiler bug.
if (ALIGNED_NUM_FILTERS != NUM_FILTERS && fm > 0xfffffffeul) {
outputs[0] = BLOCK_IN(fm % SIMD_SIZE);
weight_addr -= WEIGHT_PREF * SIMD_SIZE;
}
fm = fm % ALIGNED_NUM_FILTERS;
if ((ALIGNED_NUM_FILTERS == NUM_FILTERS || fm < NUM_FILTERS)) {
unsigned int out_addr = ( num_in_batch * TOTAL_OUTPUT_DEPTH + fm ) * output_width * output_height;
#if LEFT_FILTERS > 0
if (fm < NUM_FILTERS)
#endif
{
unsigned int out_addr = (num_in_batch * TOTAL_OUTPUT_DEPTH + fm) * OUTPUT_PITCH;
out_addr += or * output_width + oc;
// we need this address calculation for biases because we support views and batching
#if APPLY_BIAS
@ -396,13 +324,16 @@ convolve_simd(
#else
Dtype bias = 0;
#endif
for(unsigned int r = 0; r < OUT_BLOCK_HEIGHT; r++) {
for(unsigned int r = 0; r < OUT_BLOCK_HEIGHT; r++)
{
if (r + or >= output_height) break;
for(unsigned int c = 0; c < OUT_BLOCK_WIDTH; c++) {
for(unsigned int c = 0; c < OUT_BLOCK_WIDTH; c++)
{
if (c + oc >= output_width) break;
// this does a scattered write to SIMD_SIZE different feature maps, so that data within one map is contiguous, thus ready for input to next layer.
// this does a scattered write to SIMD_SIZE different feature maps,
// so that data within one map is contiguous, thus ready for input to next layer.
ACTIVATION_FUNCTION(outputs, out_addr + r * output_width + c, bias + out[r * OUT_BLOCK_WIDTH + c], fm);
}
}
}

Loading…
Cancel
Save