|
|
|
@ -60,6 +60,8 @@ |
|
|
|
|
#if defined WIN32 || defined _WIN32 |
|
|
|
|
#include <windows.h> |
|
|
|
|
#include <direct.h> |
|
|
|
|
#undef min |
|
|
|
|
#undef max |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
namespace cv { namespace dnn { namespace ocl4dnn { |
|
|
|
@ -68,6 +70,30 @@ typedef std::map<std::string, std::string> kernel_hash_t; |
|
|
|
|
static kernel_hash_t kernelConfigMap; |
|
|
|
|
static bool defaultConfigLoaded = false; |
|
|
|
|
|
|
|
|
|
static bool enableWorkaroundIDLF() |
|
|
|
|
{ |
|
|
|
|
static bool param = utils::getConfigurationParameterSizeT("OPENCV_OCL4DNN_WORKAROUND_IDLF", true); |
|
|
|
|
return param; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
static bool dumpFailedResult() |
|
|
|
|
{ |
|
|
|
|
static bool param = utils::getConfigurationParameterSizeT("OPENCV_OCL4DNN_DUMP_FAILED_RESULT", false); |
|
|
|
|
return param; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
static size_t testAllKernels() |
|
|
|
|
{ |
|
|
|
|
static size_t param = utils::getConfigurationParameterSizeT("OPENCV_OCL4DNN_TEST_ALL_KERNELS", 0); |
|
|
|
|
return param; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
static bool raiseOnCheckError() |
|
|
|
|
{ |
|
|
|
|
static bool param = utils::getConfigurationParameterBool("OPENCV_OCL4DNN_TUNING_RAISE_CHECK_ERROR", false); |
|
|
|
|
return param; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
static std::string sanitize(const std::string& s) |
|
|
|
|
{ |
|
|
|
|
std::string s_ = s; |
|
|
|
@ -1221,9 +1247,6 @@ bool OCL4DNNConvSpatial<float>::verifyResult(const UMat &bottom, |
|
|
|
|
kernelConfig* config, |
|
|
|
|
UMat &verifyTop) |
|
|
|
|
{ |
|
|
|
|
|
|
|
|
|
uint32_t verificationFail = 0; |
|
|
|
|
|
|
|
|
|
if (config->verified) |
|
|
|
|
return true; |
|
|
|
|
else if (config->tested) |
|
|
|
@ -1236,6 +1259,8 @@ bool OCL4DNNConvSpatial<float>::verifyResult(const UMat &bottom, |
|
|
|
|
convolve(bottom, top, weight, bias, numImages, config); |
|
|
|
|
tuned_ = saved_tuned; |
|
|
|
|
|
|
|
|
|
config->tested = true; |
|
|
|
|
|
|
|
|
|
UMat new_top, new_verify_top; |
|
|
|
|
Mat mat_top, mat_verify_top; |
|
|
|
|
if (use_half_) |
|
|
|
@ -1254,41 +1279,88 @@ bool OCL4DNNConvSpatial<float>::verifyResult(const UMat &bottom, |
|
|
|
|
const float* data = mat_top.ptr<float>(); |
|
|
|
|
const float* verify_data = mat_verify_top.ptr<float>(); |
|
|
|
|
|
|
|
|
|
for (int32_t n = 0; n < num_; ++n) { |
|
|
|
|
for (int32_t g = 0; g < group_; ++g) { |
|
|
|
|
int32_t output_image_offset = n * top_dim_ + output_w_ * output_h_ * M_ * g; |
|
|
|
|
for (int out_ch = 0; out_ch < M_ && !verificationFail; out_ch++) |
|
|
|
|
for (int h = 0; h < output_h_ && !verificationFail; h++) |
|
|
|
|
for (int w = 0; w < output_w_; w++) { |
|
|
|
|
size_t offset = output_image_offset + out_ch * output_w_ * output_h_ + h * output_w_ + w; |
|
|
|
|
|
|
|
|
|
float error_factor = fabs(data[offset] - verify_data[offset]); |
|
|
|
|
if (use_half_ && error_factor > 0.1 * fabs(verify_data[offset]) && |
|
|
|
|
error_factor > 0.04 && !(fabs(verify_data[offset]) < 1.e-3 && error_factor < 1.e-4)) |
|
|
|
|
{ |
|
|
|
|
CV_LOG_ERROR(NULL, "test verification failed @ image " << n << " group " << g |
|
|
|
|
<< " out_ch " << out_ch << " h " << h << " w " << w |
|
|
|
|
<< " got " << data[offset] << " expected " << verify_data[offset]); |
|
|
|
|
verificationFail = 1; |
|
|
|
|
goto out; |
|
|
|
|
} |
|
|
|
|
else if (!use_half_ && error_factor > 0.1 * fabs(verify_data[offset]) && |
|
|
|
|
!(fabs(verify_data[offset]) < 1.e-3 && error_factor < 1.e-4)) |
|
|
|
|
{ |
|
|
|
|
CV_LOG_ERROR(NULL, "test verification failed @ image " << n << " group " << g |
|
|
|
|
<< " out_ch " << out_ch << " h " << h << " w " << w |
|
|
|
|
<< " got " << data[offset] << " expected " << verify_data[offset]); |
|
|
|
|
verificationFail = 1; |
|
|
|
|
goto out; |
|
|
|
|
int error_slice_offset = 0; |
|
|
|
|
int error_slice = 0; |
|
|
|
|
float relative_eps = use_half_ ? 0.1f : 0.01f; |
|
|
|
|
|
|
|
|
|
size_t errors = 0; |
|
|
|
|
|
|
|
|
|
double rel_err = norm(mat_top.reshape(1, 1), mat_verify_top.reshape(1, 1), NORM_L1 | NORM_RELATIVE); |
|
|
|
|
if (rel_err >= relative_eps) |
|
|
|
|
{ |
|
|
|
|
for (int32_t n = 0; n < num_; ++n) { |
|
|
|
|
for (int32_t g = 0; g < group_; ++g) { |
|
|
|
|
int32_t output_image_offset = n * top_dim_ + output_w_ * output_h_ * M_ * g; |
|
|
|
|
for (int out_ch = 0; out_ch < M_; out_ch++) |
|
|
|
|
for (int h = 0; h < output_h_; h++) |
|
|
|
|
for (int w = 0; w < output_w_; w++) { |
|
|
|
|
size_t offset = output_image_offset + out_ch * output_w_ * output_h_ + h * output_w_ + w; |
|
|
|
|
|
|
|
|
|
bool has_error = !(data[offset] == data[offset]); // is NaN
|
|
|
|
|
if (!has_error) |
|
|
|
|
{ |
|
|
|
|
float error_factor = std::fabs(data[offset] - verify_data[offset]); |
|
|
|
|
float base_value_abs = std::max(1e-3f, std::fabs(verify_data[offset])); |
|
|
|
|
has_error = error_factor > relative_eps * base_value_abs; |
|
|
|
|
} |
|
|
|
|
if (has_error) |
|
|
|
|
{ |
|
|
|
|
if (errors == 0) |
|
|
|
|
{ |
|
|
|
|
error_slice = (int)(offset / (output_w_ * output_h_)); |
|
|
|
|
error_slice_offset = (int)(offset % (output_w_ * output_h_)); |
|
|
|
|
CV_LOG_ERROR(NULL, "Kernel: " << config->kernelName); |
|
|
|
|
} |
|
|
|
|
if (errors < 10) |
|
|
|
|
CV_LOG_ERROR(NULL, "test verification failed @ image " << n << " group " << g |
|
|
|
|
<< " out_ch " << out_ch << " h " << h << " w " << w |
|
|
|
|
<< " (offset: " << offset << ")" |
|
|
|
|
<< " got " << data[offset] << " expected " << verify_data[offset]); |
|
|
|
|
errors++; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
out: |
|
|
|
|
if (verificationFail == 1) |
|
|
|
|
|
|
|
|
|
if (errors) |
|
|
|
|
{ |
|
|
|
|
if (dumpFailedResult()) |
|
|
|
|
{ |
|
|
|
|
try |
|
|
|
|
{ |
|
|
|
|
int n_outputs = (int)(mat_top.size[0]*mat_top.size[1]); |
|
|
|
|
int slice_size = (int)(mat_top.total() / n_outputs); |
|
|
|
|
Rect roi(0, 0, slice_size, n_outputs); |
|
|
|
|
roi.width = std::min(roi.width, 32); |
|
|
|
|
roi.height = std::min(roi.height, 16); |
|
|
|
|
roi.x = std::max(0, std::min(slice_size - roi.width, error_slice_offset - roi.width/2)); |
|
|
|
|
roi.y = std::max(0, std::min(n_outputs - roi.height, error_slice - roi.height/2)); |
|
|
|
|
std::cout << "roi = " << roi << " errors=" << errors << std::endl; |
|
|
|
|
std::cout << "mat_top = " << shape(mat_top) << std::endl |
|
|
|
|
<< mat_top.reshape(1, 1).reshape(1, n_outputs)(roi) << std::endl; |
|
|
|
|
std::cout << "verify_top = " << shape(mat_verify_top) << std::endl |
|
|
|
|
<< mat_verify_top.reshape(1, 1).reshape(1, n_outputs)(roi) << std::endl; |
|
|
|
|
} |
|
|
|
|
catch (const std::exception& e) |
|
|
|
|
{ |
|
|
|
|
CV_LOG_ERROR(NULL, "Results dump failed: " << e.what()); |
|
|
|
|
} |
|
|
|
|
catch (...) |
|
|
|
|
{ |
|
|
|
|
CV_LOG_ERROR(NULL, "Results dump failed") |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if (raiseOnCheckError()) |
|
|
|
|
CV_Error_(Error::StsError, ("ocl4dnn tuning verification failed: %s (errors %lld)", config->kernelName.c_str(), (long long int)errors)); |
|
|
|
|
return false; |
|
|
|
|
} |
|
|
|
|
else |
|
|
|
|
{ |
|
|
|
|
config->verified = true; |
|
|
|
|
return true; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
template<typename Dtype> |
|
|
|
@ -1408,6 +1480,17 @@ bool OCL4DNNConvSpatial<float>::createIDLFKernel(int32_t blockWidth, |
|
|
|
|
|
|
|
|
|
setupKernel(); |
|
|
|
|
|
|
|
|
|
if (enableWorkaroundIDLF() && ocl::Device::getDefault().intelSubgroupsSupport()) |
|
|
|
|
{ |
|
|
|
|
// Issues are observed with these kernels: 3x1 (covered by tests), 2x1, 4x1, 5x1, 3x2
|
|
|
|
|
// kernels 1x3, 3x3, 2x3 are good
|
|
|
|
|
if (pad_h_ != 0 && kernel_w_ <= simd_size && kernel_h_ <= 2) |
|
|
|
|
{ |
|
|
|
|
CV_LOG_INFO(NULL, "DNN(workaround): skip IDLF kernel: " << kernel_name_); |
|
|
|
|
return false; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
ocl::Program program = compileKernel(); |
|
|
|
|
if (program.ptr()) |
|
|
|
|
{ |
|
|
|
@ -1623,13 +1706,38 @@ void OCL4DNNConvSpatial<float>::useFirstAvailable(const UMat &bottom, |
|
|
|
|
generateTunerItems(tunerItems); |
|
|
|
|
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_BASIC, 1, 1, 1)); |
|
|
|
|
|
|
|
|
|
for (int i = 0; i < tunerItems.size(); i++) { |
|
|
|
|
for (int i = 0; i < tunerItems.size(); i++) |
|
|
|
|
{ |
|
|
|
|
if (createConvolutionKernel(tunerItems[i]->kernelType, |
|
|
|
|
tunerItems[i]->blockWidth, |
|
|
|
|
tunerItems[i]->blockHeight, |
|
|
|
|
tunerItems[i]->blockDepth)) { |
|
|
|
|
tunerItems[i]->blockDepth)) |
|
|
|
|
{ |
|
|
|
|
int kernelIdx = kernelQueue.size() - 1; |
|
|
|
|
if (verifyResult(bottom, top, weight, bias, numImages, kernelQueue[kernelIdx], verifyTop)) { |
|
|
|
|
kernelConfig* config = kernelQueue[kernelIdx].get(); |
|
|
|
|
bool failed = false; |
|
|
|
|
const size_t testCount = testAllKernels(); |
|
|
|
|
for(int t = 0; t < testCount; t++) |
|
|
|
|
{ |
|
|
|
|
try |
|
|
|
|
{ |
|
|
|
|
config->tested = false; |
|
|
|
|
config->verified = false; |
|
|
|
|
if (!verifyResult(bottom, top, weight, bias, numImages, config, verifyTop)) |
|
|
|
|
{ |
|
|
|
|
CV_LOG_ERROR(NULL, "Failed on test iteration: " << t); |
|
|
|
|
failed = true; |
|
|
|
|
break; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
catch (...) |
|
|
|
|
{ |
|
|
|
|
CV_LOG_ERROR(NULL, "Failed on test iteration: " << t); |
|
|
|
|
throw; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
if (!failed && verifyResult(bottom, top, weight, bias, numImages, config, verifyTop)) |
|
|
|
|
{ |
|
|
|
|
bestKernelConfig = kernelQueue[kernelIdx]; |
|
|
|
|
if (bestKernelConfig->kernelType != KERNEL_TYPE_INTEL_IDLF && |
|
|
|
|
bestKernelConfig->kernelType != KERNEL_TYPE_GEMM_LIKE) |
|
|
|
@ -1685,42 +1793,50 @@ void OCL4DNNConvSpatial<float>::setupConvolution(const UMat &bottom, |
|
|
|
|
tunerItems[i]->blockHeight, |
|
|
|
|
tunerItems[i]->blockDepth); |
|
|
|
|
|
|
|
|
|
for (int32_t x = 0; x < kernelQueue.size(); x++) { |
|
|
|
|
kernelQueue[x]->executionTime = timedConvolve(bottom, top, weight, bias, numImages, |
|
|
|
|
kernelQueue[x]); |
|
|
|
|
#ifdef TEST_ALL_KERNELS |
|
|
|
|
if (kernelQueue[x]->tested == false) { |
|
|
|
|
bool verified = verifyResult(bottom, top, weight, bias, numImages, kernelQueue[x], verifyTop); |
|
|
|
|
if (verified == false) { |
|
|
|
|
CV_LOG_ERROR(NULL, "Kernel " << kernelQueue[x]->kernelName << " failed verification"); |
|
|
|
|
CV_LOG_ERROR(NULL, "kernelQueue[x]->workItem_output[0]: " |
|
|
|
|
<< kernelQueue[x]->workItem_output[0] << " " |
|
|
|
|
<< "kernelQueue[x]->workItem_output[1]: " |
|
|
|
|
<< kernelQueue[x]->workItem_output[1] << " " |
|
|
|
|
<< "kernelQueue[x]->workItem_output[2]: " |
|
|
|
|
<< kernelQueue[x]->workItem_output[2] << " " |
|
|
|
|
<< "kernelQueue[x]->kernelType: " |
|
|
|
|
<< kernelQueue[x]->kernelType << " " |
|
|
|
|
<< "kernelQueue[x]->global_work_size[0]: " |
|
|
|
|
<< kernelQueue[x]->global_work_size[0] << " " |
|
|
|
|
<< "kernelQueue[x]->global_work_size[1]: " |
|
|
|
|
<< kernelQueue[x]->global_work_size[1] << " " |
|
|
|
|
<< "kernelQueue[x]->global_work_size[2]: " |
|
|
|
|
<< kernelQueue[x]->global_work_size[2] << " " |
|
|
|
|
<< "kernelQueue[x]->local_work_size[0]: " |
|
|
|
|
<< kernelQueue[x]->local_work_size[0] << " " |
|
|
|
|
<< "kernelQueue[x]->local_work_size[1]: " |
|
|
|
|
<< kernelQueue[x]->local_work_size[1] << " " |
|
|
|
|
<< "kernelQueue[x]->local_work_size[2]: " |
|
|
|
|
<< kernelQueue[x]->local_work_size[2] << " " |
|
|
|
|
<< kernelQueue[x]->swizzle_weights << " " |
|
|
|
|
<< kernelQueue[x]->use_null_local); |
|
|
|
|
} else { |
|
|
|
|
CV_LOG_INFO(NULL, "Kernel " << kernelQueue[x]->kernelName << " pass verification"); |
|
|
|
|
const size_t testCount = testAllKernels(); |
|
|
|
|
for (int32_t x = 0; x < kernelQueue.size(); x++) |
|
|
|
|
{ |
|
|
|
|
kernelConfig* config = kernelQueue[x]; |
|
|
|
|
config->executionTime = timedConvolve(bottom, top, weight, bias, numImages, config); |
|
|
|
|
for(int t = 0; t < testCount; t++) |
|
|
|
|
{ |
|
|
|
|
try |
|
|
|
|
{ |
|
|
|
|
config->tested = false; |
|
|
|
|
config->verified = false; |
|
|
|
|
bool verified = verifyResult(bottom, top, weight, bias, numImages, config, verifyTop); |
|
|
|
|
if (verified == false) |
|
|
|
|
{ |
|
|
|
|
CV_LOG_ERROR(NULL, "Kernel " << config->kernelName << " failed verification"); |
|
|
|
|
CV_LOG_ERROR(NULL, "workItem=" |
|
|
|
|
<< config->workItem_output[0] << "," |
|
|
|
|
<< config->workItem_output[1] << "," |
|
|
|
|
<< config->workItem_output[2] << " " |
|
|
|
|
<< "kernelType: " << config->kernelType << " " |
|
|
|
|
<< "global_work_size=" |
|
|
|
|
<< config->global_work_size[0] << "," |
|
|
|
|
<< config->global_work_size[1] << "," |
|
|
|
|
<< config->global_work_size[2] << " " |
|
|
|
|
<< "local_work_size=" |
|
|
|
|
<< config->local_work_size[0] << "," |
|
|
|
|
<< config->local_work_size[1] << "," |
|
|
|
|
<< config->local_work_size[2] << " " |
|
|
|
|
<< config->swizzle_weights << " " |
|
|
|
|
<< config->use_null_local); |
|
|
|
|
} |
|
|
|
|
else |
|
|
|
|
{ |
|
|
|
|
CV_LOG_VERBOSE(NULL, "Kernel " << config->kernelName << " pass verification"); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
catch (...) |
|
|
|
|
{ |
|
|
|
|
CV_LOG_ERROR(NULL, "Failed on test iteration: " << t); |
|
|
|
|
throw; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
int32_t failures = 0; |
|
|
|
|
bool verification = false; |
|
|
|
|
if (kernelQueue.size()) { |
|
|
|
@ -1739,12 +1855,10 @@ void OCL4DNNConvSpatial<float>::setupConvolution(const UMat &bottom, |
|
|
|
|
// Test fastest kernel
|
|
|
|
|
bool verified = verifyResult(bottom, top, weight, bias, numImages, kernelQueue[fastestKernel], verifyTop); |
|
|
|
|
if (verified == true) { |
|
|
|
|
kernelQueue[fastestKernel]->verified = true; |
|
|
|
|
kernel_index_ = fastestKernel; |
|
|
|
|
verification = true; |
|
|
|
|
break; |
|
|
|
|
} else { |
|
|
|
|
kernelQueue[fastestKernel]->tested = true; |
|
|
|
|
CV_LOG_ERROR(NULL, "Kernel " << kernelQueue[fastestKernel]->kernelName << |
|
|
|
|
" failed verification"); |
|
|
|
|
failures++; |
|
|
|
|