update convolution opencl kernels in dnn module (#11762)

* optimize ocl kernel enqueue in fc layer

Signed-off-by: Li Peng <peng.li@intel.com>

* use CV_LOG_INFO in convolution auto tuning

Signed-off-by: Li Peng <peng.li@intel.com>

* update convolution IDLF kernel

extend parameter tuning range, also cleanup
ocl kernel implementation

Signed-off-by: Li Peng <peng.li@intel.com>

* update in-memory convolution cache config

fp16 and fp32 cache config are stored separately

Signed-off-by: Li Peng <peng.li@intel.com>
pull/11831/head^2
Li, Peng 7 years ago committed by Vadim Pisarevsky
parent a2bc075924
commit ab8022f74e
  1. 4
      modules/dnn/src/layers/fully_connected_layer.cpp
  2. 1166
      modules/dnn/src/ocl4dnn/include/default_kernel_config.hpp
  3. 148
      modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
  4. 181
      modules/dnn/src/opencl/conv_layer_spatial.cl

@ -310,7 +310,6 @@ public:
innerProductOp = Ptr<OCL4DNNInnerProduct<float> >(new OCL4DNNInnerProduct<float>(config)); innerProductOp = Ptr<OCL4DNNInnerProduct<float> >(new OCL4DNNInnerProduct<float>(config));
} }
UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
for (size_t i = 0; i < inputs.size(); i++) for (size_t i = 0; i < inputs.size(); i++)
{ {
MatShape inshape, outshape; MatShape inshape, outshape;
@ -320,7 +319,6 @@ public:
UMat srcMat, dstMat; UMat srcMat, dstMat;
srcMat = inputs[i].reshape(1, inshape.size(), &inshape[0]); srcMat = inputs[i].reshape(1, inshape.size(), &inshape[0]);
dstMat = outputs[i].reshape(1, outshape.size(), &outshape[0]); dstMat = outputs[i].reshape(1, outshape.size(), &outshape[0]);
dstMat.setTo(0.0f);
if (!innerProductOp->Forward(srcMat, (use_half) ? half_blobs[0] : umat_blobs[0], if (!innerProductOp->Forward(srcMat, (use_half) ? half_blobs[0] : umat_blobs[0],
(bias) ? (use_half ? half_blobs[1] : umat_blobs[1]) : UMat(), (bias) ? (use_half ? half_blobs[1] : umat_blobs[1]) : UMat(),
@ -332,6 +330,7 @@ public:
if (!use_half && bias && (outerSize > 1)) if (!use_half && bias && (outerSize > 1))
{ {
UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
UMat& biases = umat_blobs[1]; UMat& biases = umat_blobs[1];
cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0); cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0);
} }
@ -354,6 +353,7 @@ public:
if (bias) if (bias)
{ {
UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
UMat& biases = umat_blobs[1]; UMat& biases = umat_blobs[1];
cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0); cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0);
} }

File diff suppressed because it is too large Load Diff

@ -55,6 +55,7 @@
#include "../include/math_functions.hpp" #include "../include/math_functions.hpp"
#include "../include/default_kernel_config.hpp" #include "../include/default_kernel_config.hpp"
#include "opencv2/dnn/shape_utils.hpp" #include "opencv2/dnn/shape_utils.hpp"
#include "opencv2/core/utils/logger.hpp"
#if defined WIN32 || defined _WIN32 #if defined WIN32 || defined _WIN32
#include <windows.h> #include <windows.h>
@ -87,10 +88,13 @@ static void initializeGlobalBuiltinConfigurations(const std::string& cache_path)
{ {
CV_Assert(defaultConfigLoaded == false); CV_Assert(defaultConfigLoaded == false);
CV_Assert(kernelConfigMap.empty()); CV_Assert(kernelConfigMap.empty());
const size_t numConfigs = sizeof(default_kernel_config_intel)/sizeof(default_kernel_config_intel[0])/2;
/* fp32 config */
size_t numConfigs = sizeof(default_kernel_config_intel_fp32) /
sizeof(default_kernel_config_intel_fp32[0]) / 2;
for (size_t i = 0; i < numConfigs; i++) for (size_t i = 0; i < numConfigs; i++)
{ {
std::string key = std::string("Intel(R) Corporation_") + default_kernel_config_intel[2 * i]; std::string key = std::string("Intel(R) Corporation_") + default_kernel_config_intel_fp32[2 * i];
if (!cache_path.empty()) if (!cache_path.empty())
{ {
std::string cacheFile = cache_path + sanitize(key); std::string cacheFile = cache_path + sanitize(key);
@ -100,9 +104,29 @@ static void initializeGlobalBuiltinConfigurations(const std::string& cache_path)
} }
std::pair<std::string, std::string> entry( std::pair<std::string, std::string> entry(
key, key,
default_kernel_config_intel[2 * i + 1]); default_kernel_config_intel_fp32[2 * i + 1]);
kernelConfigMap.insert(entry); kernelConfigMap.insert(entry);
} }
/* fp16 config */
numConfigs = sizeof(default_kernel_config_intel_fp16) /
sizeof(default_kernel_config_intel_fp16[0]) / 2;
for (size_t i = 0; i < numConfigs; i++)
{
std::string key = std::string("Intel(R) Corporation_") + default_kernel_config_intel_fp16[2 * i];
if (!cache_path.empty())
{
std::string cacheFile = cache_path + sanitize(key);
std::ifstream cachedKernel(cacheFile.c_str());
if (cachedKernel)
continue; // external configuration found, skip builtin
}
std::pair<std::string, std::string> entry(
key,
default_kernel_config_intel_fp16[2 * i + 1]);
kernelConfigMap.insert(entry);
}
defaultConfigLoaded = true; defaultConfigLoaded = true;
} }
@ -311,40 +335,38 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
// options // options
options_ << " -cl-fast-relaxed-math -D KERNEL_IDLF -D convolve_simd=" << kernel_name_; options_ << " -cl-fast-relaxed-math -D KERNEL_IDLF -D convolve_simd=" << kernel_name_;
options_ << " -cl-mad-enable";
if (clOptionSupport("-cl-no-subgroup-ifp")) if (clOptionSupport("-cl-no-subgroup-ifp"))
options_ << " -cl-no-subgroup-ifp "; options_ << " -cl-no-subgroup-ifp ";
// defs // defs
int32_t output_width = output_w_;
int32_t output_height = output_h_;
int32_t output_block_width = blockM; int32_t output_block_width = blockM;
int32_t output_block_height = blockK; int32_t output_block_height = blockK;
const int32_t last_block_width = (output_width % output_block_width == 0) ? int tile_x = (output_block_width - 1) * stride_w_ + kernel_w_ * dilation_w_;
output_block_width : output_width % output_block_width;
const int32_t last_block_height = (output_height % output_block_height == 0) ?
output_block_height : output_height % output_block_height;
int tile_x = alignSize((output_block_width - 1) * stride_w_ + kernel_w_ * dilation_w_, 4);
int tile_y = (output_block_height - 1) * stride_h_ + kernel_h_ * dilation_h_; int tile_y = (output_block_height - 1) * stride_h_ + kernel_h_ * dilation_h_;
int tile_y_stride = (4 * simd_size) / tile_x; int invec_size = tile_y;
int invec_size = divUp(tile_y, tile_y_stride);
addDef("SIMD_SIZE", simd_size); addDef("SIMD_SIZE", simd_size);
addDef("filter_qualifier", "__global");
addDef("OUT_BLOCK_WIDTH", output_block_width); addDef("OUT_BLOCK_WIDTH", output_block_width);
addDef("OUT_BLOCK_HEIGHT", output_block_height); addDef("OUT_BLOCK_HEIGHT", output_block_height);
addDef("LAST_BLOCK_WIDTH", last_block_width);
addDef("LAST_BLOCK_HEIGHT", last_block_height);
addDef("INPUT_DEPTH", channels_ / group_); addDef("INPUT_DEPTH", channels_ / group_);
addDef("TOTAL_INPUT_DEPTH_SIZE", channels_); addDef("TOTAL_INPUT_DEPTH_SIZE", channels_);
addDef("TOTAL_OUTPUT_DEPTH", num_output_); addDef("TOTAL_OUTPUT_DEPTH", num_output_);
addDef("NUM_FILTERS", M_); addDef("NUM_FILTERS", M_);
addDef("TILE_X", tile_x); addDef("TILE_X", tile_x);
addDef("TILE_Y", tile_y); addDef("TILE_Y", tile_y);
addDef("TILE_Y_STRIDE", tile_y_stride);
addDef("INVEC_SIZE", invec_size); addDef("INVEC_SIZE", invec_size);
addDef("ALIGNED_NUM_FILTERS", (int)alignSize(M_, simd_size)); addDef("ALIGNED_NUM_FILTERS", (int)alignSize(M_, simd_size));
addDef("OUT_BLOCK_SIZE", (output_block_width*output_block_height)); addDef("OUT_BLOCK_SIZE", (output_block_width*output_block_height));
addDef("APPLY_BIAS", bias_term_); addDef("APPLY_BIAS", bias_term_);
addDef("WEIGHT_PREF", ((kernel_w_ * kernel_h_) == 1) ? 1 : 8);
addDef("INPUT_PITCH", (width_ * height_));
addDef("OUTPUT_PITCH", (output_w_ * output_h_));
addDef("LEFT_FILTERS", ((int)alignSize(M_, simd_size) - M_));
addDef("INPUT_WIDTH", width_);
addDef("INPUT_HEIGHT", height_);
addDef("FILTERS_IN_GROUP", ((int)alignSize(M_, simd_size) / simd_size));
setFusionDefine(fused_activ_, fused_eltwise_); setFusionDefine(fused_activ_, fused_eltwise_);
src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc; src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
@ -567,13 +589,6 @@ void OCL4DNNConvSpatial<Dtype>::calculateBenchmark(const UMat &bottom, UMat &ver
return; return;
} }
#define dbg
#ifdef dbg
#define dbgPrint(x) (x)
#else
#define dbgPrint(x)
#endif
// For large enough input size, we do not need to tune kernels for different // For large enough input size, we do not need to tune kernels for different
// size. The reason is with large input size, there will be enough work items // size. The reason is with large input size, there will be enough work items
// to feed al the EUs. // to feed al the EUs.
@ -584,6 +599,7 @@ void OCL4DNNConvSpatial<Dtype>::calculateBenchmark(const UMat &bottom, UMat &ver
template<typename Dtype> template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::generateKey() void OCL4DNNConvSpatial<Dtype>::generateKey()
{ {
std::string precision = (use_half_) ? "FP16" : "FP32";
std::stringstream keyBuilder; std::stringstream keyBuilder;
// FIXME: to support fuse? // FIXME: to support fuse?
keyBuilder << "k" << kernel_w_ << "x" << kernel_h_ << "_" keyBuilder << "k" << kernel_w_ << "x" << kernel_h_ << "_"
@ -597,7 +613,8 @@ void OCL4DNNConvSpatial<Dtype>::generateKey()
<< "num" << num_ << "_" << "num" << num_ << "_"
<< "M" << M_ << "_" << "M" << M_ << "_"
<< "activ" << fused_activ_ << "_" << "activ" << fused_activ_ << "_"
<< "eltwise" << fused_eltwise_; << "eltwise" << fused_eltwise_ << "_"
<< precision;
key_ = ocl::Device::getDefault().vendorName() + "_EU" + cv::format("%d", ocl::Device::getDefault().maxComputeUnits()) + "_" + keyBuilder.str(); key_ = ocl::Device::getDefault().vendorName() + "_EU" + cv::format("%d", ocl::Device::getDefault().maxComputeUnits()) + "_" + keyBuilder.str();
@ -616,11 +633,6 @@ std::string OCL4DNNConvSpatial<Dtype>::generateSpecificKey(int32_t type, int32_t
<< "_" << blockHeight << "_" << blockHeight
<< "_" << blockDepth; << "_" << blockDepth;
if (!use_half_)
keyBuilder << "_float";
else
keyBuilder << "_half";
return keyBuilder.str(); return keyBuilder.str();
} }
@ -1164,7 +1176,7 @@ float OCL4DNNConvSpatial<float>::timedConvolve(const UMat &bottom, UMat &top,
cv::ocl::Timer timer(queue); cv::ocl::Timer timer(queue);
timer.start(); timer.start();
bool res = true;; bool res = true;;
dbgPrint(std::cout << "Benchmarking kernel: " << config->kernelName << std::endl); CV_LOG_INFO(NULL, "Benchmarking kernel: " << config->kernelName);
tuned_ = true; tuned_ = true;
int loop_cnt = 4; int loop_cnt = 4;
for (int i = 0; i < loop_cnt; i++) { for (int i = 0; i < loop_cnt; i++) {
@ -1181,7 +1193,6 @@ float OCL4DNNConvSpatial<float>::timedConvolve(const UMat &bottom, UMat &top,
} }
float elapsedTime = timer.durationNS() * 1e-6 / loop_cnt; float elapsedTime = timer.durationNS() * 1e-6 / loop_cnt;
#ifdef dbg
double out_w = output_w_; double out_w = output_w_;
double out_h = output_h_; double out_h = output_h_;
double out_z = M_; double out_z = M_;
@ -1189,16 +1200,8 @@ float OCL4DNNConvSpatial<float>::timedConvolve(const UMat &bottom, UMat &top,
double k_h = kernel_h_; double k_h = kernel_h_;
double k_z = channels_; double k_z = channels_;
double totalFlops = ((k_w*k_h*k_z -1)*2)*(out_w*out_h*out_z)*num_; double totalFlops = ((k_w*k_h*k_z -1)*2)*(out_w*out_h*out_z)*num_;
std::cout << "\tEstimated Gflops:" << (totalFlops * 1e-9) CV_LOG_INFO(NULL, "\tEstimated Gflops:" << (totalFlops * 1e-9));
<< std::endl; CV_LOG_INFO(NULL, "\tEstimated GFLOPS/S: " << ((totalFlops * 1e-9)*(1000.0/elapsedTime)));
std::cout << "\tEstimated GFLOPS/S: " << ((totalFlops * 1e-9)*(1000.0/elapsedTime))
<< std::endl;
#if 0
std::cout << "Estimated utilization: " <<
((((totalFlops/1000)/1000)/1000)*(1000.0/elapsedTime))/880.0
<< std::endl;
#endif
#endif
return elapsedTime; return elapsedTime;
} }
@ -1254,18 +1257,18 @@ bool OCL4DNNConvSpatial<float>::verifyResult(const UMat &bottom,
if (use_half_ && error_factor > 0.1 * fabs(verify_data[offset]) && if (use_half_ && error_factor > 0.1 * fabs(verify_data[offset]) &&
error_factor > 0.04 && !(fabs(verify_data[offset]) < 1.e-3 && error_factor < 1.e-4)) error_factor > 0.04 && !(fabs(verify_data[offset]) < 1.e-3 && error_factor < 1.e-4))
{ {
dbgPrint(printf("test verification failed @ image %d group %d" CV_LOG_ERROR(NULL, "test verification failed @ image " << n << " group " << g
"out_ch %d h %d w %d got %G expected %G\n", << " out_ch " << out_ch << " h " << h << " w " << w
n, g, out_ch, h, w, data[offset], verify_data[offset])); << " got " << data[offset] << " expected " << verify_data[offset]);
verificationFail = 1; verificationFail = 1;
goto out; goto out;
} }
else if (!use_half_ && error_factor > 0.1 * fabs(verify_data[offset]) && else if (!use_half_ && error_factor > 0.1 * fabs(verify_data[offset]) &&
!(fabs(verify_data[offset]) < 1.e-3 && error_factor < 1.e-4)) !(fabs(verify_data[offset]) < 1.e-3 && error_factor < 1.e-4))
{ {
dbgPrint(printf("test verification failed @ image %d group %d" CV_LOG_ERROR(NULL, "test verification failed @ image " << n << " group " << g
"out_ch %d h %d w %d got %G expected %G\n", << " out_ch " << out_ch << " h " << h << " w " << w
n, g, out_ch, h, w, data[offset], verify_data[offset])); << " got " << data[offset] << " expected " << verify_data[offset]);
verificationFail = 1; verificationFail = 1;
goto out; goto out;
} }
@ -1546,17 +1549,11 @@ void OCL4DNNConvSpatial<float>::generate_idlf_tuneritems(std::vector< cv::Ptr<tu
return; return;
int actual_tile_x = kernel_w_ * dilation_w_ + (blockM - 1) * stride_w_ ; int actual_tile_x = kernel_w_ * dilation_w_ + (blockM - 1) * stride_w_ ;
int tile_x = alignSize(actual_tile_x, 4); int tile_x = alignSize(actual_tile_x, simd_size);
int tile_y = kernel_h_ * dilation_h_ + (blockK - 1) * stride_h_; if (tile_x > simd_size)
if (tile_x > (4 * simd_size))
return;
if ((blockM * blockK + divUp(tile_x * tile_y, simd_size)) > block_size_max)
return; return;
int tile_y_stride = (4 * simd_size) / tile_x; if (blockM * blockK > block_size_max)
int invec_size = divUp(tile_y, tile_y_stride);
if (invec_size > 4)
return; return;
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_INTEL_IDLF, blockM, blockK, simd_size)); tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_INTEL_IDLF, blockM, blockK, simd_size));
@ -1599,11 +1596,7 @@ void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerPar
for (uint32_t height = height_max; height > 0; height--) for (uint32_t height = height_max; height > 0; height--)
{ {
generate_idlf_tuneritems(tunerItems, width, height, simd_size); generate_idlf_tuneritems(tunerItems, width, height, simd_size);
if (tunerItems.size() >= 8 && height == 2)
break;
} }
if (tunerItems.size() >= 12 && width == 2)
break;
} }
} }
} }
@ -1690,10 +1683,8 @@ void OCL4DNNConvSpatial<float>::setupConvolution(const UMat &bottom,
if (kernelQueue[x]->tested == false) { if (kernelQueue[x]->tested == false) {
bool verified = verifyResult(bottom, top, weight, bias, numImages, kernelQueue[x], verifyTop); bool verified = verifyResult(bottom, top, weight, bias, numImages, kernelQueue[x], verifyTop);
if (verified == false) { if (verified == false) {
dbgPrint(std::cout << "Kernel " CV_LOG_ERROR(NULL, "Kernel " << kernelQueue[x]->kernelName << " failed verification");
<< kernelQueue[x]->kernelName CV_LOG_ERROR(NULL, "kernelQueue[x]->workItem_output[0]: "
<< " failed verification" << std::endl);
dbgPrint(std::cout << "kernelQueue[x]->workItem_output[0]: "
<< kernelQueue[x]->workItem_output[0] << " " << kernelQueue[x]->workItem_output[0] << " "
<< "kernelQueue[x]->workItem_output[1]: " << "kernelQueue[x]->workItem_output[1]: "
<< kernelQueue[x]->workItem_output[1] << " " << kernelQueue[x]->workItem_output[1] << " "
@ -1714,11 +1705,9 @@ void OCL4DNNConvSpatial<float>::setupConvolution(const UMat &bottom,
<< "kernelQueue[x]->local_work_size[2]: " << "kernelQueue[x]->local_work_size[2]: "
<< kernelQueue[x]->local_work_size[2] << " " << kernelQueue[x]->local_work_size[2] << " "
<< kernelQueue[x]->swizzle_weights << " " << kernelQueue[x]->swizzle_weights << " "
<< kernelQueue[x]->use_null_local << std::endl); << kernelQueue[x]->use_null_local);
} else { } else {
dbgPrint(std::cout << "Kernel " CV_LOG_INFO(NULL, "Kernel " << kernelQueue[x]->kernelName << " pass verification");
<< kernelQueue[x]->kernelName
<< " pass verification" << std::endl);
} }
} }
#endif #endif
@ -1747,19 +1736,28 @@ void OCL4DNNConvSpatial<float>::setupConvolution(const UMat &bottom,
break; break;
} else { } else {
kernelQueue[fastestKernel]->tested = true; kernelQueue[fastestKernel]->tested = true;
dbgPrint(std::cout << "Kernel " << CV_LOG_ERROR(NULL, "Kernel " << kernelQueue[fastestKernel]->kernelName <<
kernelQueue[fastestKernel]->kernelName << " failed verification");
" failed verification" << std::endl);
failures++; failures++;
} }
} }
} }
if (verification) { if (verification) {
dbgPrint(std::cout << "Kernel <" << kernelQueue[kernel_index_]->kernelName << CV_LOG_INFO(NULL, "Kernel <" << kernelQueue[kernel_index_]->kernelName <<
"> passed verification" << std::endl); "> passed verification");
dbgPrint(std::cout << "Convolution Time:" << kernelQueue[kernel_index_]->executionTime << std::endl); CV_LOG_INFO(NULL, "Convolution Time:" << kernelQueue[kernel_index_]->executionTime);
double out_w = output_w_;
double out_h = output_h_;
double out_z = M_;
double k_w = kernel_w_;
double k_h = kernel_h_;
double k_z = channels_;
float elapsedTime = kernelQueue[kernel_index_]->executionTime;
double totalFlops = ((k_w*k_h*k_z -1)*2)*(out_w*out_h*out_z)*num_;
CV_LOG_INFO(NULL, "\tEstimated Gflops:" << (totalFlops * 1e-9));
CV_LOG_INFO(NULL, "\tEstimated GFLOPS/S: " << ((totalFlops * 1e-9)*(1000.0/elapsedTime)));
} else { } else {
dbgPrint(std::cout << "fallback to basic kernel" << std::endl); CV_LOG_INFO(NULL, "fallback to basic kernel");
options_.str(""); options_.clear(); // clear contents and state flags options_.str(""); options_.clear(); // clear contents and state flags
createBasicKernel(1, 1, 1); createBasicKernel(1, 1, 1);
kernel_index_ = kernelQueue.size() - 1; kernel_index_ = kernelQueue.size() - 1;

@ -206,8 +206,6 @@ __kernel void ConvolveBasic(
#elif defined KERNEL_IDLF #elif defined KERNEL_IDLF
#define VLOAD4(_v, _p) do { _v = vload4(0, _p); } while(0)
// Each work-item computes a OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT region of one output map. // Each work-item computes a OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT region of one output map.
// Each work-group (which will be mapped to 1 SIMD16/SIMD8 EU thread) will compute 16/8 different feature maps, but each feature map is for the same region of the input image. // Each work-group (which will be mapped to 1 SIMD16/SIMD8 EU thread) will compute 16/8 different feature maps, but each feature map is for the same region of the input image.
// NDRange: (output_width+pad)/ OUT_BLOCK_WIDTH, (output_height+pad)/OUT_BLOCK_HEIGHT, NUM_FILTERS/OUT_BLOCK_DEPTH // NDRange: (output_width+pad)/ OUT_BLOCK_WIDTH, (output_height+pad)/OUT_BLOCK_HEIGHT, NUM_FILTERS/OUT_BLOCK_DEPTH
@ -219,124 +217,76 @@ __kernel void
convolve_simd( convolve_simd(
ELTWISE_DATA_ARG ELTWISE_DATA_ARG
FUSED_ARG FUSED_ARG
__global Dtype* inputs_base, __global Dtype* inputs,
filter_qualifier Dtype* weights_base, __global Dtype* weights,
BIAS_KERNEL_ARG BIAS_KERNEL_ARG
__global Dtype* outputs_base, __global Dtype* outputs,
const ushort input_width, const ushort input_width,
const ushort input_height, const ushort input_height,
const ushort output_width, const ushort output_width,
const ushort output_height) const ushort output_height)
{ {
__global Dtype* outputs = outputs_base;
__global Dtype* inputs = inputs_base;
filter_qualifier Dtype* weights = weights_base;
unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH; // oc = Output Column unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH; // oc = Output Column
unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row
unsigned int fm = get_global_id(2); // fm = Feature Map = od = Output Depth unsigned int fm = get_global_id(2); // fm = Feature Map = od = Output Depth
unsigned int fmg = get_group_id(2); unsigned int fmg = get_group_id(2);
unsigned int lid = get_local_id(2); unsigned int lid = get_local_id(2);
Dtype out[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT]; Dtype out[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT] = { 0.0f };
int in_addr;
// find weights address of given neuron (lid is index) // find weights address of given neuron (lid is index)
unsigned int weight_addr = (fmg % (ALIGNED_NUM_FILTERS/SIMD_SIZE)) * INPUT_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE + lid; unsigned int weight_addr = (fmg % FILTERS_IN_GROUP) *
INPUT_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE + lid;
for(int i=0;i<OUT_BLOCK_SIZE;i++) { unsigned int num_in_batch = fm / ALIGNED_NUM_FILTERS;
out[i]=0.0f;
}
unsigned int num_in_batch = ( fm ) / ALIGNED_NUM_FILTERS; unsigned int input_batch_offset = num_in_batch * INPUT_PITCH * TOTAL_INPUT_DEPTH_SIZE;
unsigned int input_batch_offset = num_in_batch * input_height * input_width * TOTAL_INPUT_DEPTH_SIZE; int curr_y = or * STRIDE_Y;
int curr_x = oc * STRIDE_X + lid;
int curr_local_y = ( lid / ( TILE_X / 4 ) );
int curr_local_x = ( lid % ( TILE_X / 4 ) ) * 4;
int curr_y = or * STRIDE_Y + curr_local_y;
int curr_x = oc * STRIDE_X + curr_local_x;
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 #if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
int saved_y = curr_y; int saved_y = curr_y;
#endif #endif
in_addr = input_batch_offset int in_addr = input_batch_offset
+ (curr_y - INPUT_PAD_H) * input_width // y tile offset + (curr_y - INPUT_PAD_H) * INPUT_WIDTH // y tile offset
+ curr_x - INPUT_PAD_W; // x tile offset + curr_x - INPUT_PAD_W; // x tile offset
union {
Dtype4 in_vec[INVEC_SIZE]; Dtype in_buf[INVEC_SIZE];
Dtype in_array[INVEC_SIZE * 4];
} in_buf;
for(int kd = 0; kd < INPUT_DEPTH; kd++) for(int kd = 0; kd < INPUT_DEPTH; kd++)
{ {
int in_offset = in_addr; int in_offset = in_addr;
int reg = 0; __attribute__((opencl_unroll_hint(INVEC_SIZE)))
LOOP(INVEC_SIZE, reg, for (int reg = 0; reg < INVEC_SIZE; reg++)
{ {
if (curr_local_y + reg * TILE_Y_STRIDE < TILE_Y || INVEC_SIZE * TILE_Y_STRIDE <= (TILE_Y + 2) || reg < INVEC_SIZE - 1) { in_buf[reg] = inputs[in_offset];
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 #if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + 3 >= INPUT_PAD_W && curr_x < input_width + INPUT_PAD_W) { if (!(curr_y >= INPUT_PAD_H && curr_y < INPUT_HEIGHT + INPUT_PAD_H &&
if (curr_x < INPUT_PAD_W) { curr_x >= INPUT_PAD_W && curr_x < INPUT_WIDTH + INPUT_PAD_W))
in_buf.in_vec[reg].s0 = 0; {
if (curr_x + 1 >= INPUT_PAD_W && curr_x + 1 < input_width + INPUT_PAD_W) in_buf[reg] = 0;
in_buf.in_vec[reg].s1 = *(inputs + in_offset + 1); }
else
in_buf.in_vec[reg].s1 = 0;
if (curr_x + 2 >= INPUT_PAD_W && curr_x + 2 < input_width + INPUT_PAD_W)
in_buf.in_vec[reg].s2 = *(inputs + in_offset + 2);
else
in_buf.in_vec[reg].s2 = 0;
if (curr_x + 3 < input_width + INPUT_PAD_W)
in_buf.in_vec[reg].s3 = *(inputs + in_offset + 3);
else
in_buf.in_vec[reg].s3 = 0;
} else {
VLOAD4(in_buf.in_vec[reg], inputs + in_offset);
if (curr_x + 1 >= input_width + INPUT_PAD_W)
in_buf.in_vec[reg].s1 = 0;
if (curr_x + 2 >= input_width + INPUT_PAD_W)
in_buf.in_vec[reg].s2 = 0;
if (curr_x + 3 >= input_width + INPUT_PAD_W)
in_buf.in_vec[reg].s3 = 0;
}
} else {
in_buf.in_vec[reg] = 0;
}
curr_y += TILE_Y_STRIDE;
#else
VLOAD4(in_buf.in_vec[reg], inputs + in_offset);
#endif #endif
curr_y += 1;
in_offset += INPUT_WIDTH;
} }
in_offset += input_width * TILE_Y_STRIDE;
}); in_addr += INPUT_PITCH;
in_addr += input_height * input_width;
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 #if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
curr_y = saved_y; curr_y = saved_y;
#endif #endif
#if KERNEL_WIDTH * KERNEL_HEIGHT != 1 Dtype weight_buf[WEIGHT_PREF];
#define WEIGHT_PREF 8
#else
#define WEIGHT_PREF 1
#endif
union {
Dtype w[WEIGHT_PREF];
#if KERNEL_WIDTH * KERNEL_HEIGHT != 1
INT_TYPE8 ui8;
#endif
} weight_buf;
int w_idx=0; int w_idx=0;
unsigned int orig_weight_addr = weight_addr; for (int i = 0; i < WEIGHT_PREF; i++)
#if KERNEL_WIDTH * KERNEL_HEIGHT != 1 {
weight_buf.ui8 = SUB_GROUP_BLOCK_READ8((__global INT_TYPE *)&weights[weight_addr]); weight_buf[i] = weights[weight_addr];
weight_addr += SIMD_SIZE * WEIGHT_PREF; weight_addr += SIMD_SIZE;
#else }
weight_buf.w[0] = as_Dtype(SUB_GROUP_BLOCK_READ((__global INT_TYPE *)&weights[weight_addr]));
weight_addr += SIMD_SIZE * 1;
#endif
#define BLOCK_IN(n) sub_group_broadcast( in_buf.in_array[((n)%4) + ((n) / (TILE_Y_STRIDE * TILE_X)) * 4], (((n) % (TILE_Y_STRIDE * TILE_X))/4)) #define BLOCK_IN(n, c) intel_sub_group_shuffle(in_buf[n], (c))
int kr = 0; // kr = Kernel Row int kr = 0; // kr = Kernel Row
LOOP(KERNEL_HEIGHT, kr,// LOOP is a macro that unrolls the loop. LOOP(KERNEL_HEIGHT, kr,// LOOP is a macro that unrolls the loop.
@ -344,51 +294,29 @@ convolve_simd(
int kc = 0; // kc = Kernel Column int kc = 0; // kc = Kernel Column
LOOP(KERNEL_WIDTH, kc, LOOP(KERNEL_WIDTH, kc,
{ {
for(int br=0; br < OUT_BLOCK_HEIGHT; br++) { for (int br=0; br < OUT_BLOCK_HEIGHT; br++)
for(int bc=0; bc < OUT_BLOCK_WIDTH; bc++) { {
Dtype input = BLOCK_IN((br * STRIDE_Y + kr * DILATION_Y) * TILE_X + bc * STRIDE_X + kc * DILATION_X); for(int bc=0; bc < OUT_BLOCK_WIDTH; bc++)
out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_buf.w[w_idx % WEIGHT_PREF], input, out[br * OUT_BLOCK_WIDTH + bc]); {
} Dtype input = BLOCK_IN((br * STRIDE_Y + kr * DILATION_Y), bc * STRIDE_X + kc * DILATION_X);
out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_buf[w_idx % WEIGHT_PREF], input, out[br * OUT_BLOCK_WIDTH + bc]);
} }
#if KERNEL_WIDTH * KERNEL_HEIGHT > WEIGHT_PREF
// We assume KERNEL_W is equal to KERNEL_H here.
if ((w_idx + 1) % WEIGHT_PREF == 0
#if KERNEL_WIDTH * KERNEL_HEIGHT % 8 != 0
&& ((w_idx + 1) <= (KERNEL_WIDTH * KERNEL_HEIGHT - WEIGHT_PREF))
#endif
) {
weight_buf.ui8 = SUB_GROUP_BLOCK_READ8((__global INT_TYPE *)&weights[weight_addr]);
weight_addr += SIMD_SIZE * WEIGHT_PREF; // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details.
} }
#if KERNEL_WIDTH*KERNEL_HEIGHT % 8 == 0 weight_buf[w_idx % WEIGHT_PREF] = weights[weight_addr];
// need to do nothing weight_addr += SIMD_SIZE;
#else
else if ((w_idx + 1) % WEIGHT_PREF == 0 && ((w_idx + 1) > (KERNEL_WIDTH * KERNEL_HEIGHT - WEIGHT_PREF)))
#if KERNEL_WIDTH * KERNEL_HEIGHT % 8 == 1
weight_buf.w[0] = weights[weight_addr];
#elif KERNEL_WIDTH * KERNEL_HEIGHT % 8 == 2
weight_buf.ui8.s01 = SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)&weights[weight_addr]);
#elif KERNEL_WIDTH * KERNEL_HEIGHT % 8 <= 4
weight_buf.ui8.s0123 = SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)&weights[weight_addr]);
#else
weight_buf.ui8 = SUB_GROUP_BLOCK_READ8((__global INT_TYPE *)&weights[weight_addr]);
#endif
#endif
#endif
++w_idx; ++w_idx;
}); });
}); });
weight_addr = orig_weight_addr + KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE; weight_addr -= WEIGHT_PREF * SIMD_SIZE;
}
// dead code to work around possible compiler bug.
if (ALIGNED_NUM_FILTERS != NUM_FILTERS && fm > 0xfffffffeul) {
outputs[0] = BLOCK_IN(fm % SIMD_SIZE);
} }
fm = fm % ALIGNED_NUM_FILTERS; fm = fm % ALIGNED_NUM_FILTERS;
if ((ALIGNED_NUM_FILTERS == NUM_FILTERS || fm < NUM_FILTERS)) { #if LEFT_FILTERS > 0
unsigned int out_addr = ( num_in_batch * TOTAL_OUTPUT_DEPTH + fm ) * output_width * output_height; if (fm < NUM_FILTERS)
#endif
{
unsigned int out_addr = (num_in_batch * TOTAL_OUTPUT_DEPTH + fm) * OUTPUT_PITCH;
out_addr += or * output_width + oc; out_addr += or * output_width + oc;
// we need this address calculation for biases because we support views and batching // we need this address calculation for biases because we support views and batching
#if APPLY_BIAS #if APPLY_BIAS
@ -396,13 +324,16 @@ convolve_simd(
#else #else
Dtype bias = 0; Dtype bias = 0;
#endif #endif
for(unsigned int r = 0; r < OUT_BLOCK_HEIGHT; r++) {
for(unsigned int r = 0; r < OUT_BLOCK_HEIGHT; r++)
{
if (r + or >= output_height) break; if (r + or >= output_height) break;
for(unsigned int c = 0; c < OUT_BLOCK_WIDTH; c++) { for(unsigned int c = 0; c < OUT_BLOCK_WIDTH; c++)
{
if (c + oc >= output_width) break; if (c + oc >= output_width) break;
// this does a scattered write to SIMD_SIZE different feature maps, so that data within one map is contiguous, thus ready for input to next layer. // this does a scattered write to SIMD_SIZE different feature maps,
// so that data within one map is contiguous, thus ready for input to next layer.
ACTIVATION_FUNCTION(outputs, out_addr + r * output_width + c, bias + out[r * OUT_BLOCK_WIDTH + c], fm); ACTIVATION_FUNCTION(outputs, out_addr + r * output_width + c, bias + out[r * OUT_BLOCK_WIDTH + c], fm);
} }
} }
} }

Loading…
Cancel
Save