Merge pull request #11397 from pengli:dnn_half

pull/11536/head
Alexander Alekhin 7 years ago
commit f0a4ec90b1
  1. 7
      modules/dnn/perf/perf_net.cpp
  2. 134
      modules/dnn/src/dnn.cpp
  3. 13
      modules/dnn/src/layers/batch_norm_layer.cpp
  4. 2
      modules/dnn/src/layers/blank_layer.cpp
  5. 12
      modules/dnn/src/layers/concat_layer.cpp
  6. 13
      modules/dnn/src/layers/convolution_layer.cpp
  7. 50
      modules/dnn/src/layers/detection_output_layer.cpp
  8. 81
      modules/dnn/src/layers/elementwise_layers.cpp
  9. 15
      modules/dnn/src/layers/eltwise_layer.cpp
  10. 2
      modules/dnn/src/layers/flatten_layer.cpp
  11. 21
      modules/dnn/src/layers/fully_connected_layer.cpp
  12. 4
      modules/dnn/src/layers/lrn_layer.cpp
  13. 21
      modules/dnn/src/layers/mvn_layer.cpp
  14. 5
      modules/dnn/src/layers/normalize_bbox_layer.cpp
  15. 6
      modules/dnn/src/layers/permute_layer.cpp
  16. 4
      modules/dnn/src/layers/pooling_layer.cpp
  17. 13
      modules/dnn/src/layers/prior_box_layer.cpp
  18. 5
      modules/dnn/src/layers/proposal_layer.cpp
  19. 4
      modules/dnn/src/layers/region_layer.cpp
  20. 5
      modules/dnn/src/layers/reorg_layer.cpp
  21. 2
      modules/dnn/src/layers/reshape_layer.cpp
  22. 10
      modules/dnn/src/layers/slice_layer.cpp
  23. 41
      modules/dnn/src/layers/softmax_layer.cpp
  24. 24
      modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
  25. 212
      modules/dnn/src/ocl4dnn/src/math_functions.cpp
  26. 143
      modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
  27. 14
      modules/dnn/src/ocl4dnn/src/ocl4dnn_inner_product.cpp
  28. 5
      modules/dnn/src/ocl4dnn/src/ocl4dnn_lrn.cpp
  29. 19
      modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp
  30. 8
      modules/dnn/src/ocl4dnn/src/ocl4dnn_softmax.cpp
  31. 33
      modules/dnn/src/opencl/activations.cl
  32. 29
      modules/dnn/src/opencl/batchnorm.cl
  33. 41
      modules/dnn/src/opencl/concat.cl
  34. 61
      modules/dnn/src/opencl/conv_layer_spatial.cl
  35. 22
      modules/dnn/src/opencl/eltwise.cl
  36. 1342
      modules/dnn/src/opencl/gemm_buffer.cl
  37. 383
      modules/dnn/src/opencl/gemm_image.cl
  38. 10
      modules/dnn/src/opencl/math.cl
  39. 96
      modules/dnn/src/opencl/matvec_mul.cl
  40. 86
      modules/dnn/src/opencl/mvn.cl
  41. 18
      modules/dnn/src/opencl/ocl4dnn_lrn.cl
  42. 5
      modules/dnn/src/opencl/ocl4dnn_pooling.cl
  43. 4
      modules/dnn/src/opencl/permute.cl
  44. 27
      modules/dnn/src/opencl/prior_box.cl
  45. 4
      modules/dnn/src/opencl/reorg.cl
  46. 6
      modules/dnn/src/opencl/slice.cl
  47. 8
      modules/dnn/src/opencl/softmax.cl
  48. 25
      modules/dnn/src/opencl/softmax_loss.cl
  49. 1
      modules/dnn/src/precomp.hpp
  50. 30
      modules/dnn/test/test_backends.cpp
  51. 42
      modules/dnn/test/test_caffe_importer.cpp
  52. 34
      modules/dnn/test/test_tf_importer.cpp

@ -121,7 +121,9 @@ PERF_TEST_P_(DNNTestNetwork, Inception_5h)
PERF_TEST_P_(DNNTestNetwork, ENet) PERF_TEST_P_(DNNTestNetwork, ENet)
{ {
if (backend == DNN_BACKEND_INFERENCE_ENGINE) throw SkipTestException(""); if ((backend == DNN_BACKEND_INFERENCE_ENGINE) ||
(backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16))
throw SkipTestException("");
processNet("dnn/Enet-model-best.net", "", "enet.yml", processNet("dnn/Enet-model-best.net", "", "enet.yml",
Mat(cv::Size(512, 256), CV_32FC3)); Mat(cv::Size(512, 256), CV_32FC3));
} }
@ -232,7 +234,8 @@ const tuple<DNNBackend, DNNTarget> testCases[] = {
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16), tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16),
#endif #endif
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_CPU), tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_CPU),
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL) tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL),
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL_FP16)
}; };
INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, testing::ValuesIn(testCases)); INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, testing::ValuesIn(testCases));

@ -499,7 +499,7 @@ public:
} }
} }
void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst, bool forceCreate) void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst, bool forceCreate, bool use_half)
{ {
if (!DNN_DISABLE_MEMORY_OPTIMIZATIONS && !forceCreate) if (!DNN_DISABLE_MEMORY_OPTIMIZATIONS && !forceCreate)
{ {
@ -540,14 +540,14 @@ public:
{ {
// if dst already has been allocated with total(shape) elements, // if dst already has been allocated with total(shape) elements,
// it won't be recrreated and pointer of dst.data remains the same. // it won't be recrreated and pointer of dst.data remains the same.
dst.create(shape, CV_32F); dst.create(shape, use_half ? CV_16S : CV_32F);
addHost(lp, dst); addHost(lp, dst);
} }
} }
void allocateBlobsForLayer(LayerData &ld, const LayerShapes& layerShapes, void allocateBlobsForLayer(LayerData &ld, const LayerShapes& layerShapes,
std::vector<LayerPin>& pinsForInternalBlobs, std::vector<LayerPin>& pinsForInternalBlobs,
bool forceCreate = false) bool forceCreate = false, bool use_half = false)
{ {
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
@ -618,7 +618,7 @@ public:
reuse(ld.inputBlobsId[0], blobPin); reuse(ld.inputBlobsId[0], blobPin);
} }
else else
reuseOrCreate(shapes[index], blobPin, *blobs[index], forceCreate); reuseOrCreate(shapes[index], blobPin, *blobs[index], forceCreate, use_half);
} }
} }
} }
@ -656,7 +656,7 @@ static Ptr<BackendWrapper> wrapMat(int backendId, int targetId, cv::Mat& m)
{ {
if (targetId == DNN_TARGET_CPU) if (targetId == DNN_TARGET_CPU)
return Ptr<BackendWrapper>(); return Ptr<BackendWrapper>();
else if (targetId == DNN_TARGET_OPENCL) else if (IS_DNN_OPENCL_TARGET(targetId))
return OpenCLBackendWrapper::create(m); return OpenCLBackendWrapper::create(m);
else else
CV_Error(Error::StsNotImplemented, "Unknown target identifier"); CV_Error(Error::StsNotImplemented, "Unknown target identifier");
@ -721,6 +721,7 @@ struct Net::Impl
bool netWasAllocated; bool netWasAllocated;
bool fusion; bool fusion;
std::vector<int64> layersTimings; std::vector<int64> layersTimings;
Mat output_blob;
Ptr<BackendWrapper> wrap(Mat& host) Ptr<BackendWrapper> wrap(Mat& host)
{ {
@ -737,7 +738,7 @@ struct Net::Impl
Ptr<BackendWrapper> baseBuffer = backendWrappers[data]; Ptr<BackendWrapper> baseBuffer = backendWrappers[data];
if (preferableBackend == DNN_BACKEND_DEFAULT) if (preferableBackend == DNN_BACKEND_DEFAULT)
{ {
CV_Assert(preferableTarget == DNN_TARGET_OPENCL); CV_Assert(IS_DNN_OPENCL_TARGET(preferableTarget));
return OpenCLBackendWrapper::create(baseBuffer, host); return OpenCLBackendWrapper::create(baseBuffer, host);
} }
else if (preferableBackend == DNN_BACKEND_HALIDE) else if (preferableBackend == DNN_BACKEND_HALIDE)
@ -849,7 +850,7 @@ struct Net::Impl
if (!netWasAllocated || this->blobsToKeep != blobsToKeep_) if (!netWasAllocated || this->blobsToKeep != blobsToKeep_)
{ {
if (preferableBackend == DNN_BACKEND_DEFAULT && preferableTarget == DNN_TARGET_OPENCL) if (preferableBackend == DNN_BACKEND_DEFAULT && IS_DNN_OPENCL_TARGET(preferableTarget))
#ifndef HAVE_OPENCL #ifndef HAVE_OPENCL
{ {
CV_LOG_WARNING(NULL, "DNN: OpenCL target is not available in this OpenCV build, switching to CPU."); CV_LOG_WARNING(NULL, "DNN: OpenCL target is not available in this OpenCV build, switching to CPU.");
@ -1034,7 +1035,7 @@ struct Net::Impl
{ {
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
if (preferableBackend == DNN_BACKEND_DEFAULT) if (preferableBackend == DNN_BACKEND_DEFAULT)
CV_Assert(preferableTarget == DNN_TARGET_CPU || preferableTarget == DNN_TARGET_OPENCL); CV_Assert(preferableTarget == DNN_TARGET_CPU || IS_DNN_OPENCL_TARGET(preferableTarget));
else if (preferableBackend == DNN_BACKEND_HALIDE) else if (preferableBackend == DNN_BACKEND_HALIDE)
initHalideBackend(); initHalideBackend();
else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE) else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE)
@ -1369,7 +1370,9 @@ struct Net::Impl
std::vector<LayerPin> pinsForInternalBlobs; std::vector<LayerPin> pinsForInternalBlobs;
blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs, blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs,
preferableBackend == DNN_BACKEND_INFERENCE_ENGINE); preferableBackend == DNN_BACKEND_INFERENCE_ENGINE,
preferableBackend == DNN_BACKEND_DEFAULT &&
preferableTarget == DNN_TARGET_OPENCL_FP16);
ld.outputBlobsWrappers.resize(ld.outputBlobs.size()); ld.outputBlobsWrappers.resize(ld.outputBlobs.size());
for (int i = 0; i < ld.outputBlobs.size(); ++i) for (int i = 0; i < ld.outputBlobs.size(); ++i)
{ {
@ -1439,7 +1442,7 @@ struct Net::Impl
// some other layers. // some other layers.
// TODO: OpenCL target support more fusion styles. // TODO: OpenCL target support more fusion styles.
if ( preferableBackend == DNN_BACKEND_DEFAULT && preferableTarget == DNN_TARGET_OPENCL && if ( preferableBackend == DNN_BACKEND_DEFAULT && IS_DNN_OPENCL_TARGET(preferableTarget) &&
(!cv::ocl::useOpenCL() || (ld.layerInstance->type != "Convolution" && (!cv::ocl::useOpenCL() || (ld.layerInstance->type != "Convolution" &&
ld.layerInstance->type != "MVN")) ) ld.layerInstance->type != "MVN")) )
continue; continue;
@ -1478,8 +1481,8 @@ struct Net::Impl
continue; // Go to the next layer. continue; // Go to the next layer.
// For now, OpenCL target support fusion with activation of ReLU/ChannelsPReLU/Power/Tanh // For now, OpenCL target support fusion with activation of ReLU/ChannelsPReLU/Power/Tanh
if ( preferableTarget != DNN_TARGET_OPENCL || if ( !IS_DNN_OPENCL_TARGET(preferableTarget) ||
(preferableTarget == DNN_TARGET_OPENCL && (IS_DNN_OPENCL_TARGET(preferableTarget) &&
nextData && nextData &&
((nextData->type == "ReLU") || ((nextData->type == "ReLU") ||
(nextData->type == "ChannelsPReLU") || (nextData->type == "ChannelsPReLU") ||
@ -1502,7 +1505,7 @@ struct Net::Impl
ld.outputBlobs = layers[lpNext.lid].outputBlobs; ld.outputBlobs = layers[lpNext.lid].outputBlobs;
ld.outputBlobsWrappers = layers[lpNext.lid].outputBlobsWrappers; ld.outputBlobsWrappers = layers[lpNext.lid].outputBlobsWrappers;
if ( preferableTarget == DNN_TARGET_OPENCL ) if ( IS_DNN_OPENCL_TARGET(preferableTarget) )
{ {
if ( !activData->consumers.empty() ) if ( !activData->consumers.empty() )
{ {
@ -1514,7 +1517,7 @@ struct Net::Impl
} }
// fuse convlution layer followed by eltwise + relu // fuse convlution layer followed by eltwise + relu
if ( preferableTarget == DNN_TARGET_OPENCL ) if ( IS_DNN_OPENCL_TARGET(preferableTarget) )
{ {
Ptr<EltwiseLayer> nextEltwiseLayer; Ptr<EltwiseLayer> nextEltwiseLayer;
if( nextData ) if( nextData )
@ -1727,6 +1730,13 @@ struct Net::Impl
for(int i = 0; i < layers[0].outputBlobs.size(); i++) for(int i = 0; i < layers[0].outputBlobs.size(); i++)
{ {
CV_Assert(layers[0].outputBlobs[i].total()); CV_Assert(layers[0].outputBlobs[i].total());
if (layers[0].outputBlobs[i].depth() == CV_32F &&
preferableBackend == DNN_BACKEND_DEFAULT &&
preferableTarget == DNN_TARGET_OPENCL_FP16)
{
Mat mat = layers[0].outputBlobs[i].clone();
convertFp16(mat, layers[0].outputBlobs[i]);
}
inputShapes.push_back(shape(layers[0].outputBlobs[i])); inputShapes.push_back(shape(layers[0].outputBlobs[i]));
} }
LayersShapesMap layersShapes; LayersShapesMap layersShapes;
@ -1772,7 +1782,7 @@ struct Net::Impl
{ {
if( !ld.skip ) if( !ld.skip )
{ {
if (preferableBackend == DNN_BACKEND_DEFAULT && preferableTarget == DNN_TARGET_OPENCL) if (preferableBackend == DNN_BACKEND_DEFAULT && IS_DNN_OPENCL_TARGET(preferableTarget))
{ {
std::vector<UMat> umat_outputBlobs = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers); std::vector<UMat> umat_outputBlobs = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
layer->forward(OpenCLBackendWrapper::getUMatVector(ld.inputBlobsWrappers), layer->forward(OpenCLBackendWrapper::getUMatVector(ld.inputBlobsWrappers),
@ -1937,7 +1947,14 @@ struct Net::Impl
// Transfer data to CPU if it's require. // Transfer data to CPU if it's require.
ld.outputBlobsWrappers[pin.oid]->copyToHost(); ld.outputBlobsWrappers[pin.oid]->copyToHost();
} }
return ld.outputBlobs[pin.oid];
if (ld.outputBlobs[pin.oid].depth() == CV_16S)
{
convertFp16(ld.outputBlobs[pin.oid], output_blob);
return output_blob;
}
else
return ld.outputBlobs[pin.oid];
} }
Mat getBlob(String outputName) Mat getBlob(String outputName)
@ -2080,7 +2097,7 @@ void Net::forward(OutputArrayOfArrays outputBlobs, const String& outputName)
if (outputBlobs.isUMat()) if (outputBlobs.isUMat())
{ {
outputBlobs.assign(ld.outputBlobs[pin.oid].getUMat(ACCESS_RW)); outputBlobs.assign(impl->getBlob(layerName).getUMat(ACCESS_RW));
} }
else if (outputBlobs.isMat()) else if (outputBlobs.isMat())
{ {
@ -2096,17 +2113,33 @@ void Net::forward(OutputArrayOfArrays outputBlobs, const String& outputName)
ld.outputBlobsWrappers[i]->copyToHost(); ld.outputBlobsWrappers[i]->copyToHost();
} }
} }
std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj(); if (ld.outputBlobs[0].depth() == CV_32F)
outputvec = ld.outputBlobs; {
std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
outputvec = ld.outputBlobs;
} else {
std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
outputvec.resize(ld.outputBlobs.size());
for (int i = 0; i < outputvec.size(); i++)
convertFp16(ld.outputBlobs[i], outputvec[i]);
}
} }
else if (outputBlobs.isUMatVector()) else if (outputBlobs.isUMatVector())
{ {
std::vector<UMat> & outputvec = *(std::vector<UMat> *)outputBlobs.getObj(); std::vector<UMat> & outputvec = *(std::vector<UMat> *)outputBlobs.getObj();
if (impl->preferableBackend == DNN_BACKEND_DEFAULT && if (impl->preferableBackend == DNN_BACKEND_DEFAULT &&
impl->preferableTarget == DNN_TARGET_OPENCL) IS_DNN_OPENCL_TARGET(impl->preferableTarget))
{ {
outputvec = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers); if (impl->preferableTarget == DNN_TARGET_OPENCL)
outputvec = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
else if (impl->preferableTarget == DNN_TARGET_OPENCL_FP16)
{
std::vector<UMat> out_vec = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
outputvec.resize(out_vec.size());
for (int i = 0; i < out_vec.size(); i++)
convertFp16(out_vec[i], outputvec[i]);
}
} }
else else
{ {
@ -2194,6 +2227,16 @@ void Net::setPreferableTarget(int targetId)
if( impl->preferableTarget != targetId ) if( impl->preferableTarget != targetId )
{ {
impl->preferableTarget = targetId; impl->preferableTarget = targetId;
if (IS_DNN_OPENCL_TARGET(targetId))
{
#ifndef HAVE_OPENCL
impl->preferableTarget = DNN_TARGET_CPU;
#else
bool fp16 = ocl::Device::getDefault().isExtensionSupported("cl_khr_fp16");
if (!fp16 && targetId == DNN_TARGET_OPENCL_FP16)
impl->preferableTarget = DNN_TARGET_OPENCL;
#endif
}
impl->netWasAllocated = false; impl->netWasAllocated = false;
impl->clear(); impl->clear();
} }
@ -2222,7 +2265,17 @@ void Net::setInput(InputArray blob, const String& name)
ld.outputBlobs.resize( std::max(pin.oid+1, (int)ld.requiredOutputs.size()) ); ld.outputBlobs.resize( std::max(pin.oid+1, (int)ld.requiredOutputs.size()) );
ld.outputBlobsWrappers.resize(ld.outputBlobs.size()); ld.outputBlobsWrappers.resize(ld.outputBlobs.size());
MatShape prevShape = shape(ld.outputBlobs[pin.oid]); MatShape prevShape = shape(ld.outputBlobs[pin.oid]);
Mat blob_ = blob.getMat(); Mat blob_;
if (impl->preferableBackend == DNN_BACKEND_DEFAULT &&
impl->preferableTarget == DNN_TARGET_OPENCL_FP16)
{
Mat blob_mat = blob.getMat();
convertFp16(blob_mat, blob_);
}
else
{
blob_ = blob.getMat();
}
bool oldShape = prevShape == shape(blob_); bool oldShape = prevShape == shape(blob_);
if (oldShape) if (oldShape)
{ {
@ -2747,6 +2800,43 @@ void Layer::forward_fallback(InputArrayOfArrays inputs_arr, OutputArrayOfArrays
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
if (preferableTarget == DNN_TARGET_OPENCL_FP16 && inputs_arr.depth() == CV_16S)
{
std::vector<UMat> inputs;
std::vector<UMat> outputs;
std::vector<UMat> internals;
std::vector<UMat> orig_inputs;
std::vector<UMat> orig_outputs;
std::vector<UMat> orig_internals;
inputs_arr.getUMatVector(orig_inputs);
outputs_arr.getUMatVector(orig_outputs);
internals_arr.getUMatVector(orig_internals);
inputs.resize(orig_inputs.size());
for (size_t i = 0; i < orig_inputs.size(); i++)
convertFp16(orig_inputs[i], inputs[i]);
outputs.resize(orig_outputs.size());
for (size_t i = 0; i < orig_outputs.size(); i++)
outputs[i].create(shape(orig_outputs[i]), CV_32F);
internals.resize(orig_internals.size());
for (size_t i = 0; i < orig_internals.size(); i++)
internals[i].create(shape(orig_internals[i]), CV_32F);
forward(inputs, outputs, internals);
for (size_t i = 0; i < outputs.size(); i++)
convertFp16(outputs[i], orig_outputs[i]);
// sync results back
outputs_arr.assign(orig_outputs);
internals_arr.assign(orig_internals);
return;
}
std::vector<Mat> inpvec; std::vector<Mat> inpvec;
std::vector<Mat> outputs; std::vector<Mat> outputs;
std::vector<Mat> internals; std::vector<Mat> internals;

@ -120,12 +120,16 @@ public:
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
bool use_half = (inputs_.depth() == CV_16S);
inputs_.getUMatVector(inputs); inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs); outputs_.getUMatVector(outputs);
CV_Assert(blobs.size() >= 2); CV_Assert(blobs.size() >= 2);
CV_Assert(inputs.size() == 1); CV_Assert(inputs.size() == 1);
if (use_half && inputs[0].dims == 2)
return false;
if (umat_weight.empty()) if (umat_weight.empty())
{ {
umat_weight = weights_.getUMat(ACCESS_READ); umat_weight = weights_.getUMat(ACCESS_READ);
@ -139,6 +143,7 @@ public:
int rows = inpBlob.dims > 2 ? inpBlob.size[2] : 1; int rows = inpBlob.dims > 2 ? inpBlob.size[2] : 1;
int cols = inpBlob.dims > 2 ? inpBlob.size[3] : 1; int cols = inpBlob.dims > 2 ? inpBlob.size[3] : 1;
String opts = (use_half) ? " -DDtype=half" : " -DDtype=float";
for (size_t ii = 0; ii < outputs.size(); ii++) for (size_t ii = 0; ii < outputs.size(); ii++)
{ {
if (inpBlob.dims == 2) if (inpBlob.dims == 2)
@ -154,8 +159,12 @@ public:
UMat src = inputs[ii].reshape(1, s.size(), &s[0]); UMat src = inputs[ii].reshape(1, s.size(), &s[0]);
UMat dst = outputs[ii].reshape(1, s.size(), &s[0]); UMat dst = outputs[ii].reshape(1, s.size(), &s[0]);
int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 4 : 1); int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 4 : 1);
String buildopt = format("-DNUM=%d", number); String buildopt = format("-DNUM=%d", number) + opts;
String kname = format("batch_norm%d", number); String kname = format("batch_norm%d", number);
if (number == 1)
buildopt += format(" -Dconvert_T=convert_%s", use_half ? "half" : "float");
else
buildopt += format(" -Dconvert_T=convert_%s%d", use_half ? "half" : "float", number);
ocl::Kernel kernel(kname.c_str(), ocl::dnn::batchnorm_oclsrc, buildopt); ocl::Kernel kernel(kname.c_str(), ocl::dnn::batchnorm_oclsrc, buildopt);
if (kernel.empty()) if (kernel.empty())
return false; return false;
@ -181,7 +190,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

@ -95,7 +95,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

@ -128,14 +128,14 @@ public:
for( i = 0; i < ninputs; i++ ) for( i = 0; i < ninputs; i++ )
{ {
Mat& inp = *inputs[i]; Mat& inp = *inputs[i];
CV_Assert( inp.isContinuous() && inp.type() == CV_32F && CV_Assert( inp.isContinuous() && (inp.type() == CV_32F || inp.type() == CV_16S) &&
inp.dims == 4 && inp.size[0] == output.size[0] && inp.dims == 4 && inp.size[0] == output.size[0] &&
inp.size[2] == output.size[2] && inp.size[2] == output.size[2] &&
inp.size[3] == output.size[3] ); inp.size[3] == output.size[3] );
nchannels += inp.size[1]; nchannels += inp.size[1];
} }
CV_Assert( nchannels == output.size[1] ); CV_Assert( nchannels == output.size[1] );
CV_Assert( output.isContinuous() && output.type() == CV_32F ); CV_Assert( output.isContinuous() && (output.type() == CV_32F || output.type() == CV_16S) );
cc.chptrs.resize(nchannels*batchsz); cc.chptrs.resize(nchannels*batchsz);
@ -186,6 +186,7 @@ public:
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs); inps.getUMatVector(inputs);
outs.getUMatVector(outputs); outs.getUMatVector(outputs);
@ -199,11 +200,12 @@ public:
int num_concats = total(shape(inputs[0]), 0, cAxis); int num_concats = total(shape(inputs[0]), 0, cAxis);
int offset_concat_axis = 0; int offset_concat_axis = 0;
UMat& outMat = outputs[0]; UMat& outMat = outputs[0];
String buildopt = String("-DDtype=") + ocl::typeToStr(inputs[0].type()) + String(" "); String buildopt = format(" -DDtype=%s", (use_half) ? "half" : "float");
String kname = format("concat_%s", use_half ? "half" : "float");
for (size_t i = 0; i < inputs.size(); i++) for (size_t i = 0; i < inputs.size(); i++)
{ {
ocl::Kernel kernel("concat", ocl::dnn::concat_oclsrc, buildopt); ocl::Kernel kernel(kname.c_str(), ocl::dnn::concat_oclsrc, buildopt);
if (kernel.empty()) if (kernel.empty())
return false; return false;
@ -235,7 +237,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

@ -94,7 +94,7 @@ public:
CV_Assert(blobs[0].dims == 4 && blobs[0].size[3] == kernel.width && blobs[0].size[2] == kernel.height); CV_Assert(blobs[0].dims == 4 && blobs[0].size[3] == kernel.width && blobs[0].size[2] == kernel.height);
const Mat &input = *inputs[0]; const Mat &input = *inputs[0];
CV_Assert(input.dims == 4 && (input.type() == CV_32F || input.type() == CV_64F)); CV_Assert(input.dims == 4 && (input.type() == CV_32F || input.type() == CV_64F || input.type() == CV_16S));
for (size_t i = 0; i < inputs.size(); i++) for (size_t i = 0; i < inputs.size(); i++)
{ {
CV_Assert(inputs[i]->type() == input.type()); CV_Assert(inputs[i]->type() == input.type());
@ -288,7 +288,7 @@ public:
newActiv = true; newActiv = true;
activType = OCL4DNN_CONV_FUSED_ACTIV_NONE; activType = OCL4DNN_CONV_FUSED_ACTIV_NONE;
if (preferableTarget == DNN_TARGET_OPENCL) if (IS_DNN_OPENCL_TARGET(preferableTarget))
{ {
Ptr<PowerLayer> activ_power = activ.dynamicCast<PowerLayer>(); Ptr<PowerLayer> activ_power = activ.dynamicCast<PowerLayer>();
if (!activ_power.empty()) if (!activ_power.empty())
@ -842,6 +842,7 @@ public:
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs); inps.getUMatVector(inputs);
outs.getUMatVector(outputs); outs.getUMatVector(outputs);
@ -860,6 +861,7 @@ public:
config.dilation = dilation; config.dilation = dilation;
config.group = inputs[0].size[1] / umat_blobs[0].size[1]; config.group = inputs[0].size[1] / umat_blobs[0].size[1];
config.bias_term = (hasBias()) ? true : false; config.bias_term = (hasBias()) ? true : false;
config.use_half = use_half;
convolutionOp = Ptr<OCL4DNNConvSpatial<float> >(new OCL4DNNConvSpatial<float>(config)); convolutionOp = Ptr<OCL4DNNConvSpatial<float> >(new OCL4DNNConvSpatial<float>(config));
} }
@ -964,7 +966,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))
@ -1360,6 +1362,9 @@ public:
std::vector<UMat> outputs; std::vector<UMat> outputs;
std::vector<UMat> internals; std::vector<UMat> internals;
if (inputs_.depth() == CV_16S)
return false;
inputs_.getUMatVector(inputs); inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs); outputs_.getUMatVector(outputs);
internals_.getUMatVector(internals); internals_.getUMatVector(internals);
@ -1450,7 +1455,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

@ -307,8 +307,24 @@ public:
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
inps.getUMatVector(inputs); bool use_half = (inps.depth() == CV_16S);
outs.getUMatVector(outputs); if (use_half)
{
std::vector<UMat> orig_inputs;
std::vector<UMat> orig_outputs;
inps.getUMatVector(orig_inputs);
outs.getUMatVector(orig_outputs);
inputs.resize(orig_inputs.size());
for (size_t i = 0; i < orig_inputs.size(); i++)
convertFp16(orig_inputs[i], inputs[i]);
}
else
{
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
}
std::vector<LabelBBox> allDecodedBBoxes; std::vector<LabelBBox> allDecodedBBoxes;
std::vector<Mat> allConfidenceScores; std::vector<Mat> allConfidenceScores;
@ -342,7 +358,13 @@ public:
{ {
// Set confidences to zeros. // Set confidences to zeros.
Range ranges[] = {Range::all(), Range::all(), Range::all(), Range(2, 3)}; Range ranges[] = {Range::all(), Range::all(), Range::all(), Range(2, 3)};
outputs[0](ranges).setTo(0); if (use_half)
{
std::vector<UMat> orig_outputs;
outs.getUMatVector(orig_outputs);
orig_outputs[0](ranges).setTo(0);
} else
outputs[0](ranges).setTo(0);
return true; return true;
} }
int outputShape[] = {1, 1, (int)numKept, 7}; int outputShape[] = {1, 1, (int)numKept, 7};
@ -360,9 +382,23 @@ public:
} }
CV_Assert(count == numKept); CV_Assert(count == numKept);
} }
outputs.clear();
outputs.push_back(umat); if (use_half)
outs.assign(outputs); {
UMat half_umat;
convertFp16(umat, half_umat);
std::vector<UMat> orig_outputs;
outs.getUMatVector(orig_outputs);
orig_outputs.clear();
orig_outputs.push_back(half_umat);
outs.assign(orig_outputs);
} else {
outputs.clear();
outputs.push_back(umat);
outs.assign(outputs);
}
return true; return true;
} }
#endif #endif
@ -372,7 +408,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

@ -176,7 +176,7 @@ public:
{ {
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_OCL_RUN((this->preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(this->preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
func.applyOCL(inputs_arr, outputs_arr, internals_arr)) func.applyOCL(inputs_arr, outputs_arr, internals_arr))
@ -223,7 +223,12 @@ public:
#ifdef HAVE_OPENCL #ifdef HAVE_OPENCL
static String oclGetTMacro(const UMat &m) static String oclGetTMacro(const UMat &m)
{ {
return String("-DT=") + ocl::typeToStr(m.type()) + String(" "); String str_name = ocl::typeToStr(m.type());
if (str_name == "short")
str_name = "half";
return format("-DT=%s -Dconvert_T=convert_%s ", str_name.c_str(), str_name.c_str());
} }
#endif #endif
@ -516,8 +521,28 @@ struct SigmoidFunctor
#ifdef HAVE_OPENCL #ifdef HAVE_OPENCL
bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals) bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
{ {
// TODO: implement OCL version std::vector<UMat> inputs;
return false; std::vector<UMat> outputs;
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
String buildopt = oclGetTMacro(inputs[0]);
for (size_t i = 0; i < inputs.size(); i++)
{
UMat& src = inputs[i];
UMat& dst = outputs[i];
ocl::Kernel kernel("SigmoidForward", ocl::dnn::activations_oclsrc, buildopt);
kernel.set(0, (int)src.total());
kernel.set(1, ocl::KernelArg::PtrReadOnly(src));
kernel.set(2, ocl::KernelArg::PtrWriteOnly(dst));
size_t gSize = src.total();
CV_Assert(kernel.run(1, &gSize, NULL, false));
}
return true;
} }
#endif #endif
@ -561,8 +586,28 @@ struct ELUFunctor
#ifdef HAVE_OPENCL #ifdef HAVE_OPENCL
bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals) bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
{ {
// TODO: implement OCL version std::vector<UMat> inputs;
return false; std::vector<UMat> outputs;
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
String buildopt = oclGetTMacro(inputs[0]);
for (size_t i = 0; i < inputs.size(); i++)
{
UMat& src = inputs[i];
UMat& dst = outputs[i];
ocl::Kernel kernel("ELUForward", ocl::dnn::activations_oclsrc, buildopt);
kernel.set(0, (int)src.total());
kernel.set(1, ocl::KernelArg::PtrReadOnly(src));
kernel.set(2, ocl::KernelArg::PtrWriteOnly(dst));
size_t gSize = src.total();
CV_Assert(kernel.run(1, &gSize, NULL, false));
}
return true;
} }
#endif #endif
@ -604,8 +649,28 @@ struct AbsValFunctor
#ifdef HAVE_OPENCL #ifdef HAVE_OPENCL
bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals) bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
{ {
// TODO: implement OCL version std::vector<UMat> inputs;
return false; std::vector<UMat> outputs;
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
String buildopt = oclGetTMacro(inputs[0]);
for (size_t i = 0; i < inputs.size(); i++)
{
UMat& src = inputs[i];
UMat& dst = outputs[i];
ocl::Kernel kernel("AbsValForward", ocl::dnn::activations_oclsrc, buildopt);
kernel.set(0, (int)src.total());
kernel.set(1, ocl::KernelArg::PtrReadOnly(src));
kernel.set(2, ocl::KernelArg::PtrWriteOnly(dst));
size_t gSize = src.total();
CV_Assert(kernel.run(1, &gSize, NULL, false));
}
return true;
} }
#endif #endif

@ -271,6 +271,9 @@ public:
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
if (inputs_.depth() == CV_16S && op != SUM)
return false;
inputs_.getUMatVector(inputs); inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs); outputs_.getUMatVector(outputs);
@ -284,10 +287,15 @@ public:
{ {
size_t localsize[] = { 128 }; size_t localsize[] = { 128 };
size_t globalsize[] = { (size_t)channels / 4 * localsize[0] }; size_t globalsize[] = { (size_t)channels / 4 * localsize[0] };
String opts;
if (inputs_.depth() == CV_16S)
opts = " -DDtype=half -DDtype4=half4 -DDtype8=half8";
else
opts = " -DDtype=float -DDtype4=float4 -DDtype8=float8";
for (int i = 0; i < (inputs.size() - 1); ++i) for (int i = 0; i < (inputs.size() - 1); ++i)
{ {
String buildopt = format("-DLOOP=%d", i); String buildopt = format("-DLOOP=%d", i) + opts;
ocl::Kernel kernel("op_sum4", ocl::dnn::eltwise_oclsrc, buildopt); ocl::Kernel kernel("op_sum4", ocl::dnn::eltwise_oclsrc, buildopt);
int idx = 0; int idx = 0;
UMat inpMat = (i == 0) ? inputs[0] : UMat(); UMat inpMat = (i == 0) ? inputs[0] : UMat();
@ -306,6 +314,9 @@ public:
} }
else else
{ {
if (inputs_.depth() == CV_16S)
return false;
float coeff1 = coeffs.empty() ? 1.f : coeffs[0]; float coeff1 = coeffs.empty() ? 1.f : coeffs[0];
float coeff2 = coeffs.empty() ? 1.f : coeffs[1]; float coeff2 = coeffs.empty() ? 1.f : coeffs[1];
UMat mul0, mul1; UMat mul0, mul1;
@ -343,7 +354,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

@ -140,7 +140,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
outputs_arr.isUMatVector() && outputs_arr.isUMatVector() &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

@ -64,6 +64,7 @@ public:
#ifdef HAVE_OPENCL #ifdef HAVE_OPENCL
Ptr<OCL4DNNInnerProduct<float> > innerProductOp; Ptr<OCL4DNNInnerProduct<float> > innerProductOp;
std::vector<UMat> umat_blobs; std::vector<UMat> umat_blobs;
std::vector<UMat> half_blobs;
#endif #endif
FullyConnectedLayerImpl(const LayerParams& params) FullyConnectedLayerImpl(const LayerParams& params)
@ -277,6 +278,7 @@ public:
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs); inps.getUMatVector(inputs);
outs.getUMatVector(outputs); outs.getUMatVector(outputs);
@ -293,6 +295,17 @@ public:
config.bias_term = bias; config.bias_term = bias;
config.M = outerSize; config.M = outerSize;
config.K = innerSize; config.K = innerSize;
config.use_half = use_half;
if (use_half)
{
half_blobs.resize(umat_blobs.size());
for (int i = 0; i < umat_blobs.size(); i++)
{
if (!umat_blobs[i].empty())
convertFp16(umat_blobs[i], half_blobs[i]);
}
}
innerProductOp = Ptr<OCL4DNNInnerProduct<float> >(new OCL4DNNInnerProduct<float>(config)); innerProductOp = Ptr<OCL4DNNInnerProduct<float> >(new OCL4DNNInnerProduct<float>(config));
} }
@ -309,13 +322,15 @@ public:
dstMat = outputs[i].reshape(1, outshape.size(), &outshape[0]); dstMat = outputs[i].reshape(1, outshape.size(), &outshape[0]);
dstMat.setTo(0.0f); dstMat.setTo(0.0f);
if (!innerProductOp->Forward(srcMat, umat_blobs[0], (bias) ? umat_blobs[1] : UMat(), dstMat)) if (!innerProductOp->Forward(srcMat, (use_half) ? half_blobs[0] : umat_blobs[0],
(bias) ? (use_half ? half_blobs[1] : umat_blobs[1]) : UMat(),
dstMat))
{ {
ret = false; ret = false;
break; break;
} }
if (bias && (outerSize > 1)) if (!use_half && bias && (outerSize > 1))
{ {
UMat& biases = umat_blobs[1]; UMat& biases = umat_blobs[1];
cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0); cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0);
@ -353,7 +368,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

@ -106,6 +106,7 @@ public:
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs); inps.getUMatVector(inputs);
outs.getUMatVector(outputs); outs.getUMatVector(outputs);
@ -128,6 +129,7 @@ public:
config.height = inputs[0].size[2]; config.height = inputs[0].size[2];
config.width = inputs[0].size[3]; config.width = inputs[0].size[3];
config.norm_by_size = normBySize; config.norm_by_size = normBySize;
config.use_half = use_half;
lrnOp = Ptr<OCL4DNNLRN<float> >(new OCL4DNNLRN<float>(config)); lrnOp = Ptr<OCL4DNNLRN<float> >(new OCL4DNNLRN<float>(config));
} }
@ -146,7 +148,7 @@ public:
CV_Assert(inputs_arr.total() == outputs_arr.total()); CV_Assert(inputs_arr.total() == outputs_arr.total());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

@ -102,6 +102,9 @@ public:
{ {
UMat bnorm_weight = scale.empty() ? UMat() : scale.getUMat(ACCESS_READ); UMat bnorm_weight = scale.empty() ? UMat() : scale.getUMat(ACCESS_READ);
UMat bnorm_bias = shift.empty() ? UMat() : shift.getUMat(ACCESS_READ); UMat bnorm_bias = shift.empty() ? UMat() : shift.getUMat(ACCESS_READ);
bool use_half = (inputs[0].depth() == CV_16S);
String opts = format(" -DT=%s -DT4=%s -Dconvert_T=%s", use_half ? "half" : "float",
use_half ? "half4" : "float4", use_half ? "convert_half4" : "convert_float4");
int splitDim = (acrossChannels) ? 1 : 2; int splitDim = (acrossChannels) ? 1 : 2;
for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++) for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++)
@ -111,12 +114,11 @@ public:
int newRows = total(shape(inpMat), 0, splitDim); int newRows = total(shape(inpMat), 0, splitDim);
MatShape s = shape(newRows, inpMat.total() / newRows); MatShape s = shape(newRows, inpMat.total() / newRows);
UMat oneMat = UMat::ones(s[1], 1, CV_32F); UMat meanMat = UMat(s[0], 1, (use_half) ? CV_16S : CV_32F);
UMat meanMat = UMat(s[0], 1, CV_32F);
UMat tmpMat = UMat(s[0], s[1], CV_32F); UMat tmpMat = UMat(s[0], s[1], CV_32F);
float alpha = 1.0f / s[1]; float alpha = 1.0f / s[1];
String buildopt = "-DNUM=4"; String buildopt = "-DNUM=4" + opts;
ocl::Kernel k("mean_fuse4", ocl::dnn::mvn_oclsrc, buildopt); ocl::Kernel k("mean_fuse4", ocl::dnn::mvn_oclsrc, buildopt);
size_t localsize[] = { 128 }; size_t localsize[] = { 128 };
size_t globalsize[] = { (size_t)s[0] / 4 * localsize[0] }; size_t globalsize[] = { (size_t)s[0] / 4 * localsize[0] };
@ -167,13 +169,14 @@ public:
int row_size = total(shape(inputs[0]), 0, splitDim); int row_size = total(shape(inputs[0]), 0, splitDim);
int plane_size = total(shape(inputs[0]), splitDim); int plane_size = total(shape(inputs[0]), splitDim);
if (normVariance && (row_size % 4 == 0) && (plane_size % 4 == 0)) if (normVariance && (row_size % 4 == 0) && (plane_size % 4 == 0))
{ return fast_forward_ocl(inputs, outputs);
bool ret = fast_forward_ocl(inputs, outputs);
return ret; if (inputs[0].depth() == CV_16S)
} return false;
UMat bnorm_weight = scale.empty() ? UMat() : scale.getUMat(ACCESS_READ); UMat bnorm_weight = scale.empty() ? UMat() : scale.getUMat(ACCESS_READ);
UMat bnorm_bias = shift.empty() ? UMat() : shift.getUMat(ACCESS_READ); UMat bnorm_bias = shift.empty() ? UMat() : shift.getUMat(ACCESS_READ);
String opts = format(" -DT=float -DT4=float4 -Dconvert_T=convert_float4");
for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++) for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++)
{ {
@ -195,7 +198,7 @@ public:
int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 4 : 1); int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 4 : 1);
size_t global[] = { (size_t)s[0], (size_t)(s[1] / number) }; size_t global[] = { (size_t)s[0], (size_t)(s[1] / number) };
String buildopt = format("-DNUM=%d", number); String buildopt = format("-DNUM=%d", number) + opts;
if (normVariance) if (normVariance)
{ {
String kname = format("calc_mean%d", number); String kname = format("calc_mean%d", number);
@ -249,7 +252,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

@ -87,6 +87,9 @@ public:
std::vector<UMat> outputs; std::vector<UMat> outputs;
std::vector<UMat> internals; std::vector<UMat> internals;
if (inputs_.depth() == CV_16S)
return false;
inputs_.getUMatVector(inputs); inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs); outputs_.getUMatVector(outputs);
internals_.getUMatVector(internals); internals_.getUMatVector(internals);
@ -162,7 +165,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

@ -288,9 +288,11 @@ public:
if (!_needsPermute) if (!_needsPermute)
return false; return false;
bool use_half = (inps.depth() == CV_16S);
String opts = format("-DDtype=%s", use_half ? "half" : "float");
for (size_t i = 0; i < inputs.size(); i++) for (size_t i = 0; i < inputs.size(); i++)
{ {
ocl::Kernel kernel("permute", ocl::dnn::permute_oclsrc); ocl::Kernel kernel("permute", ocl::dnn::permute_oclsrc, opts);
kernel.set(0, (int)_count); kernel.set(0, (int)_count);
kernel.set(1, ocl::KernelArg::PtrReadOnly(inputs[i])); kernel.set(1, ocl::KernelArg::PtrReadOnly(inputs[i]));
@ -313,7 +315,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

@ -147,6 +147,7 @@ public:
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs); inps.getUMatVector(inputs);
outs.getUMatVector(outputs); outs.getUMatVector(outputs);
@ -164,6 +165,7 @@ public:
(type == AVE ? LIBDNN_POOLING_METHOD_AVE : (type == AVE ? LIBDNN_POOLING_METHOD_AVE :
LIBDNN_POOLING_METHOD_STO); LIBDNN_POOLING_METHOD_STO);
config.avePoolPaddedArea = avePoolPaddedArea; config.avePoolPaddedArea = avePoolPaddedArea;
config.use_half = use_half;
poolOp = Ptr<OCL4DNNPool<float> >(new OCL4DNNPool<float>(config)); poolOp = Ptr<OCL4DNNPool<float> >(new OCL4DNNPool<float>(config));
} }
@ -189,7 +191,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

@ -316,6 +316,7 @@ public:
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs); inps.getUMatVector(inputs);
outs.getUMatVector(outputs); outs.getUMatVector(outputs);
@ -340,9 +341,15 @@ public:
heights.copyTo(umat_heights); heights.copyTo(umat_heights);
} }
String opts;
if (use_half)
opts = "-DDtype=half -DDtype4=half4 -Dconvert_T=convert_half4";
else
opts = "-DDtype=float -DDtype4=float4 -Dconvert_T=convert_float4";
size_t nthreads = _layerHeight * _layerWidth; size_t nthreads = _layerHeight * _layerWidth;
ocl::Kernel kernel("prior_box", ocl::dnn::prior_box_oclsrc, opts);
ocl::Kernel kernel("prior_box", ocl::dnn::prior_box_oclsrc);
kernel.set(0, (int)nthreads); kernel.set(0, (int)nthreads);
kernel.set(1, (float)_stepX); kernel.set(1, (float)_stepX);
kernel.set(2, (float)_stepY); kernel.set(2, (float)_stepY);
@ -375,7 +382,7 @@ public:
// set the variance. // set the variance.
{ {
ocl::Kernel kernel("set_variance", ocl::dnn::prior_box_oclsrc); ocl::Kernel kernel("set_variance", ocl::dnn::prior_box_oclsrc, opts);
int offset = total(shape(outputs[0]), 2); int offset = total(shape(outputs[0]), 2);
size_t nthreads = _layerHeight * _layerWidth * _numPriors; size_t nthreads = _layerHeight * _layerWidth * _numPriors;
kernel.set(0, (int)nthreads); kernel.set(0, (int)nthreads);
@ -395,7 +402,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

@ -158,6 +158,9 @@ public:
std::vector<UMat> outputs; std::vector<UMat> outputs;
std::vector<UMat> internals; std::vector<UMat> internals;
if (inputs_.depth() == CV_16S)
return false;
inputs_.getUMatVector(inputs); inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs); outputs_.getUMatVector(outputs);
internals_.getUMatVector(internals); internals_.getUMatVector(internals);
@ -237,7 +240,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

@ -127,7 +127,7 @@ public:
std::vector<UMat> outputs; std::vector<UMat> outputs;
// TODO: implement a logistic activation to classification scores. // TODO: implement a logistic activation to classification scores.
if (useLogistic) if (useLogistic || inps.depth() == CV_16S)
return false; return false;
inps.getUMatVector(inputs); inps.getUMatVector(inputs);
@ -191,7 +191,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

@ -96,9 +96,10 @@ public:
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs); inps.getUMatVector(inputs);
outs.getUMatVector(outputs); outs.getUMatVector(outputs);
String buildopt = String("-DDtype=") + ocl::typeToStr(inputs[0].type()) + String(" "); String buildopt= format("-DDtype=%s ", use_half ? "half" : "float");
for (size_t i = 0; i < inputs.size(); i++) for (size_t i = 0; i < inputs.size(); i++)
{ {
@ -134,7 +135,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

@ -219,7 +219,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

@ -181,6 +181,7 @@ public:
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
bool use_half = (inputs_.depth() == CV_16S);
inputs_.getUMatVector(inputs); inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs); outputs_.getUMatVector(outputs);
@ -188,6 +189,11 @@ public:
(total(shape(outputs[0]), 2) % 4 != 0)) (total(shape(outputs[0]), 2) % 4 != 0))
return false; return false;
String opts;
if (use_half)
opts = "-DDtype=half -DDtype4=half4 -DDtype8=half8";
else
opts = "-DDtype=float -DDtype4=float4 -DDtype8=float8";
const UMat& inpMat = inputs[0]; const UMat& inpMat = inputs[0];
for (size_t i = 0; i < outputs.size(); i++) for (size_t i = 0; i < outputs.size(); i++)
{ {
@ -196,7 +202,7 @@ public:
int rows = outputs[i].size[2]; int rows = outputs[i].size[2];
int cols = outputs[i].size[3]; int cols = outputs[i].size[3];
ocl::Kernel kernel("slice", ocl::dnn::slice_oclsrc); ocl::Kernel kernel("slice", ocl::dnn::slice_oclsrc, opts);
size_t local[] = { 128 }; size_t local[] = { 128 };
size_t global[] = { (size_t)groups * channels / 4 * local[0] }; size_t global[] = { (size_t)groups * channels / 4 * local[0] };
int idx = 0; int idx = 0;
@ -222,7 +228,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

@ -99,15 +99,16 @@ public:
softmaxOp.release(); softmaxOp.release();
} }
bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays itns) bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
{ {
std::vector<UMat> inputs; std::vector<UMat> inputs;
std::vector<UMat> outputs; std::vector<UMat> outputs;
std::vector<UMat> internals; std::vector<UMat> internals;
inps.getUMatVector(inputs); bool use_half = (inputs_.depth() == CV_16S);
outs.getUMatVector(outputs); inputs_.getUMatVector(inputs);
itns.getUMatVector(internals); outputs_.getUMatVector(outputs);
internals_.getUMatVector(internals);
if (softmaxOp.empty()) if (softmaxOp.empty())
{ {
@ -117,6 +118,7 @@ public:
config.axis = axisRaw; config.axis = axisRaw;
config.channels = inputs[0].size[axisRaw]; config.channels = inputs[0].size[axisRaw];
config.logsoftmax = logSoftMax; config.logsoftmax = logSoftMax;
config.use_half = use_half;
softmaxOp = Ptr<OCL4DNNSoftmax<float> >(new OCL4DNNSoftmax<float>(config)); softmaxOp = Ptr<OCL4DNNSoftmax<float> >(new OCL4DNNSoftmax<float>(config));
} }
@ -128,15 +130,13 @@ public:
return true; return true;
UMat& bufMat = internals[0]; UMat& bufMat = internals[0];
src.copyTo(dstMat);
int axis = clamp(axisRaw, src.dims); int axis = clamp(axisRaw, src.dims);
MatShape s = shape(src); MatShape s = shape(src);
size_t outerSize = total(s, 0, axis); size_t outerSize = total(s, 0, axis);
size_t channels = src.size[axis]; size_t channels = src.size[axis];
size_t innerSize = total(s, axis + 1); size_t innerSize = total(s, axis + 1);
String buildOpts = String("-DT=") + ocl::typeToStr(src.type()); String buildOpts = format("-DT=%s", use_half ? "half" : "float");
ocl::Kernel kmax, ksub, ksum, kdiv; ocl::Kernel kmax, ksub, ksum, kdiv;
if (!kmax.create("kernel_channel_max", ocl::dnn::softmax_oclsrc, buildOpts)) if (!kmax.create("kernel_channel_max", ocl::dnn::softmax_oclsrc, buildOpts))
@ -152,38 +152,31 @@ public:
if (!kdiv.create("kernel_channel_div", ocl::dnn::softmax_oclsrc, buildOpts)) if (!kdiv.create("kernel_channel_div", ocl::dnn::softmax_oclsrc, buildOpts))
return false; return false;
size_t wgSize = ocl::Device::getDefault().maxWorkGroupSize();
size_t bufSize = internals[0].total(); size_t bufSize = internals[0].total();
size_t totalSize = src.total(); size_t totalSize = src.total();
// adjust local/global size size_t internal_globalSize[1] = { bufSize };
size_t internal_localSize[1] = { (bufSize == 1) ? 1 : wgSize }; size_t total_globalSize[1] = { totalSize };
size_t internal_globalSize[1] = { divUp(bufSize, (unsigned int)internal_localSize[0]) * internal_localSize[0] };
// adjust local/global size (total)
size_t total_localSize[1] = { (totalSize == 1) ? 1 : wgSize };
size_t total_globalSize[1] = { divUp(totalSize, (unsigned int)total_localSize[0]) * total_localSize[0] };
kmax.args((int)outerSize, (int)channels, (int)innerSize, kmax.args((int)outerSize, (int)channels, (int)innerSize,
ocl::KernelArg::PtrReadOnly(dstMat), ocl::KernelArg::PtrReadWrite(bufMat)); ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrReadWrite(bufMat));
if (!kmax.run(1, internal_globalSize, internal_localSize, false)) if (!kmax.run(1, internal_globalSize, NULL, false))
return false; return false;
ksub.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize, ksub.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize,
ocl::KernelArg::PtrReadOnly(bufMat), ocl::KernelArg::PtrReadWrite(dstMat)); ocl::KernelArg::PtrReadOnly(bufMat),
if (!ksub.run(1, total_globalSize, total_localSize, false)) ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrWriteOnly(dstMat));
if (!ksub.run(1, total_globalSize, NULL, false))
return false; return false;
cv::exp(dstMat, dstMat);
ksum.args((int)outerSize, (int)channels, (int)innerSize, ksum.args((int)outerSize, (int)channels, (int)innerSize,
ocl::KernelArg::PtrReadOnly(dstMat), ocl::KernelArg::PtrReadWrite(bufMat)); ocl::KernelArg::PtrReadOnly(dstMat), ocl::KernelArg::PtrReadWrite(bufMat));
if (!ksum.run(1, internal_globalSize, internal_localSize, false)) if (!ksum.run(1, internal_globalSize, NULL, false))
return false; return false;
kdiv.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize, kdiv.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize,
ocl::KernelArg::PtrReadOnly(bufMat), ocl::KernelArg::PtrReadWrite(dstMat)); ocl::KernelArg::PtrReadOnly(bufMat), ocl::KernelArg::PtrReadWrite(dstMat));
if (!kdiv.run(1, total_globalSize, total_localSize, false)) if (!kdiv.run(1, total_globalSize, NULL, false))
return false; return false;
return true; return true;
@ -195,7 +188,7 @@ public:
CV_TRACE_FUNCTION(); CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr)) forward_ocl(inputs_arr, outputs_arr, internals_arr))

@ -59,7 +59,8 @@ struct OCL4DNNConvConfig
stride(1, 1), stride(1, 1),
dilation(1, 1), dilation(1, 1),
group(1), group(1),
bias_term(false) bias_term(false),
use_half(false)
{} {}
MatShape in_shape; MatShape in_shape;
MatShape out_shape; MatShape out_shape;
@ -69,6 +70,7 @@ struct OCL4DNNConvConfig
Size dilation; Size dilation;
int group; // = 1; int group; // = 1;
bool bias_term; // = false; bool bias_term; // = false;
bool use_half; // = false;
}; };
typedef enum { typedef enum {
@ -272,6 +274,8 @@ class OCL4DNNConvSpatial
int32_t group_; int32_t group_;
bool bias_term_; bool bias_term_;
UMat swizzled_weights_umat; UMat swizzled_weights_umat;
UMat weights_half;
UMat bias_half;
UMat bottom_data2_; UMat bottom_data2_;
int32_t bottom_index_; int32_t bottom_index_;
@ -327,6 +331,7 @@ class OCL4DNNConvSpatial
ocl4dnnFusedActiv_t fused_activ_; ocl4dnnFusedActiv_t fused_activ_;
float power_; float power_;
bool fused_eltwise_; bool fused_eltwise_;
bool use_half_;
}; };
typedef enum { typedef enum {
@ -345,7 +350,8 @@ struct OCL4DNNPoolConfig
channels(0), channels(0),
pool_method(LIBDNN_POOLING_METHOD_MAX), pool_method(LIBDNN_POOLING_METHOD_MAX),
global_pooling(false), global_pooling(false),
avePoolPaddedArea(false) avePoolPaddedArea(true),
use_half(false)
{} {}
MatShape in_shape; MatShape in_shape;
MatShape out_shape; MatShape out_shape;
@ -358,6 +364,7 @@ struct OCL4DNNPoolConfig
ocl4dnnPoolingMethod_t pool_method; // = LIBDNN_POOLING_METHOD_MAX; ocl4dnnPoolingMethod_t pool_method; // = LIBDNN_POOLING_METHOD_MAX;
bool global_pooling; // = false; bool global_pooling; // = false;
bool avePoolPaddedArea; bool avePoolPaddedArea;
bool use_half;
}; };
template<typename Dtype> template<typename Dtype>
@ -391,13 +398,14 @@ class OCL4DNNPool
int32_t pooled_height_; int32_t pooled_height_;
int32_t pooled_width_; int32_t pooled_width_;
bool avePoolPaddedArea; bool avePoolPaddedArea;
bool use_half;
}; };
struct OCL4DNNInnerProductConfig struct OCL4DNNInnerProductConfig
{ {
OCL4DNNInnerProductConfig() : OCL4DNNInnerProductConfig() :
num_output(0), M(0), K(0), num_output(0), M(0), K(0),
bias_term(false), transpose(false), phase_test(true) bias_term(false), transpose(false), phase_test(true), use_half(false)
{} {}
int num_output; int num_output;
int M; int M;
@ -405,6 +413,7 @@ struct OCL4DNNInnerProductConfig
bool bias_term; bool bias_term;
bool transpose; // = false; bool transpose; // = false;
bool phase_test; // = true; bool phase_test; // = true;
bool use_half; // = false;
}; };
template<typename Dtype> template<typename Dtype>
@ -428,6 +437,7 @@ class OCL4DNNInnerProduct
bool transpose_; bool transpose_;
bool image_copied_; bool image_copied_;
bool phase_test_; bool phase_test_;
bool use_half_;
}; };
typedef enum { typedef enum {
@ -441,7 +451,7 @@ struct OCL4DNNLRNConfig
lrn_type(LRNParameter_NormRegion_ACROSS_CHANNELS), lrn_type(LRNParameter_NormRegion_ACROSS_CHANNELS),
phase_test(true), phase_test(true),
local_size(0), alpha(0.f), beta(0.f), k(0.f), norm_by_size(false), local_size(0), alpha(0.f), beta(0.f), k(0.f), norm_by_size(false),
batch_size(0), channels(0), height(0), width(0) batch_size(0), channels(0), height(0), width(0), use_half(false)
{} {}
MatShape in_shape; MatShape in_shape;
LRNParameter_NormRegion_WITHIN_CHANNEL_t lrn_type; LRNParameter_NormRegion_WITHIN_CHANNEL_t lrn_type;
@ -455,6 +465,7 @@ struct OCL4DNNLRNConfig
int32_t channels; int32_t channels;
int32_t height; int32_t height;
int32_t width; int32_t width;
bool use_half;
}; };
template<typename Dtype> template<typename Dtype>
@ -477,16 +488,18 @@ class OCL4DNNLRN
int32_t height_; int32_t height_;
int32_t width_; int32_t width_;
bool norm_by_size_; bool norm_by_size_;
bool use_half_;
}; };
struct OCL4DNNSoftmaxConfig struct OCL4DNNSoftmaxConfig
{ {
OCL4DNNSoftmaxConfig() : axis(0), channels(0), logsoftmax(false) OCL4DNNSoftmaxConfig() : axis(0), channels(0), logsoftmax(false), use_half(false)
{} {}
MatShape in_shape; MatShape in_shape;
int axis; int axis;
int channels; int channels;
bool logsoftmax; bool logsoftmax;
bool use_half;
}; };
template<typename Dtype> template<typename Dtype>
@ -506,6 +519,7 @@ class OCL4DNNSoftmax
bool use_slm_; bool use_slm_;
bool log_softmax_; bool log_softmax_;
UMat scale_data_; UMat scale_data_;
bool use_half_;
}; };
}}} // namespace cv::dnn::ocl4dnn }}} // namespace cv::dnn::ocl4dnn

@ -48,6 +48,12 @@
namespace cv { namespace dnn { namespace ocl4dnn { namespace cv { namespace dnn { namespace ocl4dnn {
// Element type the GEMM image kernels are compiled for. The numeric value is
// what gets formatted into the "-DTYPE=%d" build option, so the explicit
// values must stay as they are.
enum gemm_data_type_t { TYPE_FLOAT = 1, TYPE_HALF = 2 };
// Create and copy buffer to image for GEMM's matrix A and B. // Create and copy buffer to image for GEMM's matrix A and B.
// Will return image to caller if the input image is NULL. Otherwise, // Will return image to caller if the input image is NULL. Otherwise,
// will use the image directly. It's caller's responsibility to // will use the image directly. It's caller's responsibility to
@ -60,6 +66,7 @@ ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset,
int width, int ld) int width, int ld)
{ {
ocl::Image2D image; ocl::Image2D image;
String opts = format("-DTYPE=%d", TYPE_FLOAT);
if (!is_matrix_a && transpose) if (!is_matrix_a && transpose)
{ {
@ -73,7 +80,8 @@ ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset,
UMat mat(height, width, CV_32FC1); UMat mat(height, width, CV_32FC1);
image = ocl::Image2D(mat); image = ocl::Image2D(mat);
ocl::Kernel oclk_gemm_copy("gemm_buffer_copy_image_transpose_float", ocl::dnn::gemm_image_oclsrc); ocl::Kernel oclk_gemm_copy("gemm_buffer_copy_image_transpose_float",
ocl::dnn::gemm_image_oclsrc, opts);
size_t global_copy[2]; size_t global_copy[2];
global_copy[0] = width; global_copy[0] = width;
@ -96,7 +104,7 @@ ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset,
image = ocl::Image2D(mat); image = ocl::Image2D(mat);
ocl::Kernel oclk_gemm_copy("gemm_buffer_copy_image_no_transpose_float", ocl::Kernel oclk_gemm_copy("gemm_buffer_copy_image_no_transpose_float",
ocl::dnn::gemm_image_oclsrc); ocl::dnn::gemm_image_oclsrc, opts);
size_t global_copy[2]; size_t global_copy[2];
global_copy[0] = padded_width; global_copy[0] = padded_width;
@ -129,7 +137,7 @@ enum gemm_type_t
GEMM_TYPE_FAST_IMAGE_32_1, GEMM_TYPE_FAST_IMAGE_32_1,
GEMM_TYPE_FAST_IMAGE_32_2, GEMM_TYPE_FAST_IMAGE_32_2,
GEMM_TYPE_FAST_IMAGE_B_IMAGE, GEMM_TYPE_FAST_IMAGE_B_IMAGE,
GEMM_TYPE_MAX GEMM_TYPE_FAST_BUFFER
}; };
template<typename Dtype> template<typename Dtype>
@ -145,6 +153,8 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
CHECK_EQ(gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 || gemm_type == GEMM_TYPE_FAST_IMAGE_32_2 || CHECK_EQ(gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 || gemm_type == GEMM_TYPE_FAST_IMAGE_32_2 ||
gemm_type == GEMM_TYPE_FAST_IMAGE_B_IMAGE, true) << "Invalid fast image gemm type." << std::endl; gemm_type == GEMM_TYPE_FAST_IMAGE_B_IMAGE, true) << "Invalid fast image gemm type." << std::endl;
bool halfPrecisionMode = (A.depth() == CV_16S);
if (is_image_a) if (is_image_a)
{ {
CHECK_EQ(offA, 0) << "Invalid input image offset." << std::endl; CHECK_EQ(offA, 0) << "Invalid input image offset." << std::endl;
@ -157,6 +167,7 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
return false; return false;
} }
String opts = format("-DTYPE=%d", halfPrecisionMode ? TYPE_HALF : TYPE_FLOAT);
int widthA = (TransA == CblasNoTrans) ? K : M; int widthA = (TransA == CblasNoTrans) ? K : M;
int heightA = (TransA == CblasNoTrans) ? M : K; int heightA = (TransA == CblasNoTrans) ? M : K;
int widthB = (TransB == CblasNoTrans) ? N : K; int widthB = (TransB == CblasNoTrans) ? N : K;
@ -178,7 +189,7 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
int blockC_width = blocksize; int blockC_width = blocksize;
int blockC_height = blocksize; int blockC_height = blocksize;
int use_buffer_indicator = 8; int use_buffer_indicator = (halfPrecisionMode) ? 16 : 8;
// To fix the edge problem caused by the sub group block read. // To fix the edge problem caused by the sub group block read.
// we have to pad the image if it's not multiple of tile. // we have to pad the image if it's not multiple of tile.
// just padding one line is enough as the sub group block read // just padding one line is enough as the sub group block read
@ -221,9 +232,13 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
else else
kernel_name += "1"; kernel_name += "1";
kernel_name += "_float"; if (halfPrecisionMode) {
kernel_name += "_half";
} else {
kernel_name += "_float";
}
ocl::Kernel oclk_gemm_float(kernel_name.c_str(), ocl::dnn::gemm_image_oclsrc); ocl::Kernel oclk_gemm_float(kernel_name.c_str(), ocl::dnn::gemm_image_oclsrc, opts);
if (oclk_gemm_float.empty()) if (oclk_gemm_float.empty())
return false; return false;
@ -255,6 +270,10 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
bool padding_A = false; bool padding_A = false;
bool padding_B = false; bool padding_B = false;
if (halfPrecisionMode && is_image_b) {
padding_A = true;
}
if (!is_image_a && !is_image_b) if (!is_image_a && !is_image_b)
{ {
if (M * K < N * K) if (M * K < N * K)
@ -265,17 +284,19 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
if (!is_image_a) if (!is_image_a)
{ {
ImA = ocl4dnnGEMMCopyBufferToImage<Dtype>(A, blockA_offset, if (!halfPrecisionMode)
true, TransA != CblasNoTrans, ImA = ocl4dnnGEMMCopyBufferToImage<Dtype>(A, blockA_offset,
padding_A, imageA_h, imageA_w, true, TransA != CblasNoTrans,
blockA_height, blockA_width, ldA); padding_A, imageA_h, imageA_w,
blockA_height, blockA_width, ldA);
} }
if (!is_image_b) if (!is_image_b)
{ {
ImB = ocl4dnnGEMMCopyBufferToImage<Dtype>(B, blockB_offset, if (!halfPrecisionMode)
false, false, ImB = ocl4dnnGEMMCopyBufferToImage<Dtype>(B, blockB_offset,
padding_B, imageB_h, imageB_w, false, false,
blockB_height, blockB_width, ldB); padding_B, imageB_h, imageB_w,
blockB_height, blockB_width, ldB);
} }
} else { } else {
// We will use normal read_imagef to read image B when B has transpose. // We will use normal read_imagef to read image B when B has transpose.
@ -283,32 +304,48 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
if (!is_image_a) if (!is_image_a)
{ {
bool padding; bool padding;
padding = !is_image_b; padding = !is_image_b || halfPrecisionMode;
ImA = ocl4dnnGEMMCopyBufferToImage<Dtype>(A, blockA_offset, if (!halfPrecisionMode)
true, TransA != CblasNoTrans, ImA = ocl4dnnGEMMCopyBufferToImage<Dtype>(A, blockA_offset,
padding, imageA_h, imageA_w, true, TransA != CblasNoTrans,
blockA_height, blockA_width, ldA); padding, imageA_h, imageA_w,
blockA_height, blockA_width, ldA);
} }
if (!is_image_b && (K % use_buffer_indicator != 0)) if (!is_image_b && (K % use_buffer_indicator != 0))
{ {
ImB = ocl4dnnGEMMCopyBufferToImage<Dtype>(B, blockB_offset, if (!halfPrecisionMode)
false, true, false, imageB_h, imageB_w, ImB = ocl4dnnGEMMCopyBufferToImage<Dtype>(B, blockB_offset,
blockB_height, blockB_width, ldB); false, true, false,
imageB_h, imageB_w,
blockB_height, blockB_width, ldB);
} }
} }
size_t global[2]; size_t global[2];
if (gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 || gemm_type == GEMM_TYPE_FAST_IMAGE_B_IMAGE) if (gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 || gemm_type == GEMM_TYPE_FAST_IMAGE_B_IMAGE)
{ {
global[0] = (size_t)( blockC_width + 7 ) & ~7; if (halfPrecisionMode) {
global[0] = (size_t)( blockC_width + 15 ) & ~15;
} else {
global[0] = (size_t)( blockC_width + 7 ) & ~7;
}
} else { } else {
global[0] = (size_t)( (blockC_width / 2 ) + 7 ) ^ ~7; if (halfPrecisionMode) {
global[0] = (size_t)( (blockC_width / 2 ) + 15 ) ^ ~15;
} else {
global[0] = (size_t)( (blockC_width / 2 ) + 7 ) ^ ~7;
}
} }
global[1] = (size_t)(blockC_height + 31) / 32; global[1] = (size_t)(blockC_height + 31) / 32;
size_t local[2]; size_t local[2];
local[0] = 8; if (halfPrecisionMode)
{
local[0] = 16;
} else {
local[0] = 8;
}
local[1] = 1; local[1] = 1;
cl_uint arg_idx = 0; cl_uint arg_idx = 0;
@ -385,6 +422,101 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
return true; return true;
} }
// GEMM on plain buffers: builds a "gemm_buffer_*" kernel from gemm_buffer_oclsrc
// for the requested transpose combination and element type, computes the
// work sizes and launches it on A, B and C.
//
// TransA/TransB   transpose flags; kernel names are only formed for a
//                 non-transposed A ("NN", "NT", "NT_M_2/4/8").
// M, N, K         matrix dimensions handed to the kernel as-is.
// alpha, beta     scaling factors, passed to the kernel as float.
// A, B, C         operands; A.depth() == CV_16S selects the "_half" kernels.
// offA/offB/offC  offsets forwarded to the kernel.
// gemm_type       must be GEMM_TYPE_FAST_BUFFER.
//
// Returns true only when the kernel was created and every launch succeeded.
// Returns false when the kernel is unavailable, a launch fails, or no launch
// was issued at all (strided path with K <= 0).
template<typename Dtype>
static bool ocl4dnnFastBufferGEMM(const CBLAS_TRANSPOSE TransA,
                                  const CBLAS_TRANSPOSE TransB, const int32_t M,
                                  const int32_t N, const int32_t K, const Dtype alpha,
                                  const UMat A, const int32_t offA, const UMat B,
                                  const int32_t offB, const Dtype beta, UMat C,
                                  const int32_t offC, enum gemm_type_t gemm_type)
{
    CHECK_EQ(gemm_type == GEMM_TYPE_FAST_BUFFER, true)
             << "Invalid fast buffer gemm type." << std::endl;

    const bool halfPrecisionMode = (A.depth() == CV_16S);
    const bool is_nn = (TransA == CblasNoTrans && TransB == CblasNoTrans);
    const bool is_nt = (TransA == CblasNoTrans && TransB != CblasNoTrans);
    const bool is_small_batch = (M == 2 || M == 4 || M == 8);

    size_t sub_group_size = 8;
    String kernel_name("gemm_buffer_");
    if (is_nn) {
        kernel_name += "NN";
        if (halfPrecisionMode) {
            sub_group_size = 16;
        }
    } else if (is_nt) {
        if (M == 2)
            kernel_name += "NT_M_2";
        else if (M == 4)
            kernel_name += "NT_M_4";
        else if (M == 8)
            kernel_name += "NT_M_8";
        else
            kernel_name += "NT";
    }
    // NOTE(review): for a transposed A no layout suffix is appended, so the
    // name is just "gemm_buffer__<type>"; presumably no such kernel exists and
    // the empty() check below rejects the request - confirm against the .cl file.

    kernel_name += halfPrecisionMode ? "_half" : "_float";

    String opts = format("-DTYPE=%d", halfPrecisionMode ? TYPE_HALF : TYPE_FLOAT);
    ocl::Kernel oclk_gemm_float(kernel_name.c_str(), ocl::dnn::gemm_buffer_oclsrc, opts);
    // Same guard as the image based GEMM path: do not set arguments on, or
    // launch, a kernel that failed to build or does not exist.
    if (oclk_gemm_float.empty())
        return false;

    size_t local[2] = {};
    size_t global[2] = {};
    if (is_nt && is_small_batch) {
        // Dedicated NT_M_2/4/8 kernels: work-group width depends on M.
        if (M == 8)
            local[0] = 16;
        else if (M == 4)
            local[0] = 32;
        else
            local[0] = 64;
        local[1] = 1;

        if (M == 8)
            global[0] = N * local[0];
        else
            global[0] = (N + 3) / 4 * local[0];
        global[1] = 1;
    } else {
        const size_t lx = sub_group_size;
        const size_t ly = (is_nt && halfPrecisionMode) ? 2 : 4;
        const int dx = is_nt ? 1 : 4;
        const int dy = 8;
        const size_t gx = (size_t)(N + dx - 1) / dx;
        const size_t gy = (size_t)(M + dy - 1) / dy;
        // Round the global size up to a whole number of work-groups.
        global[0] = (gx + lx - 1) / lx * lx;
        global[1] = (gy + ly - 1) / ly * ly;
        local[0] = lx;
        local[1] = ly;
    }

    int arg_idx = 0;
    oclk_gemm_float.set(arg_idx++, ocl::KernelArg::PtrReadOnly(A));
    oclk_gemm_float.set(arg_idx++, offA);
    oclk_gemm_float.set(arg_idx++, ocl::KernelArg::PtrReadOnly(B));
    oclk_gemm_float.set(arg_idx++, offB);
    oclk_gemm_float.set(arg_idx++, ocl::KernelArg::PtrWriteOnly(C));
    oclk_gemm_float.set(arg_idx++, offC);
    oclk_gemm_float.set(arg_idx++, M);
    oclk_gemm_float.set(arg_idx++, N);
    oclk_gemm_float.set(arg_idx++, K);
    oclk_gemm_float.set(arg_idx++, static_cast<float>(alpha));
    oclk_gemm_float.set(arg_idx++, static_cast<float>(beta));

    if (is_nt) {
        // NT kernels take no extra argument and are launched once.
        return oclk_gemm_float.run(2, global, local, false);
    }

    // Remaining kernels take a trailing start_index argument and are launched
    // once per step of 256 across K. Start from false so that "no launch at
    // all" is not reported as success, and stop at the first failed launch so
    // a later successful one cannot hide it.
    bool ret = false;
    const int stride = 256;
    for (int start_index = 0; start_index < K; start_index += stride) {
        oclk_gemm_float.set(arg_idx, start_index);
        ret = oclk_gemm_float.run(2, global, local, false);
        if (!ret)
            break;
    }
    return ret;
}
template<typename Dtype> template<typename Dtype>
bool ocl4dnnGEMMCommon(const CBLAS_TRANSPOSE TransB, bool ocl4dnnGEMMCommon(const CBLAS_TRANSPOSE TransB,
const int32_t M, const int32_t N, const int32_t K, const int32_t M, const int32_t N, const int32_t K,
@ -392,7 +524,8 @@ bool ocl4dnnGEMMCommon(const CBLAS_TRANSPOSE TransB,
const UMat B_image, UMat C, const UMat B_image, UMat C,
const size_t max_image_size) const size_t max_image_size)
{ {
gemm_type_t gemm_type = GEMM_TYPE_FAST_IMAGE_32_1; bool halfPrecisionMode = (A.depth() == CV_16S);
gemm_type_t gemm_type = halfPrecisionMode ? GEMM_TYPE_FAST_BUFFER : GEMM_TYPE_FAST_IMAGE_32_1;
if (gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 || if (gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 ||
gemm_type == GEMM_TYPE_FAST_IMAGE_32_2) gemm_type == GEMM_TYPE_FAST_IMAGE_32_2)
@ -409,6 +542,11 @@ bool ocl4dnnGEMMCommon(const CBLAS_TRANSPOSE TransB,
GEMM_TYPE_FAST_IMAGE_B_IMAGE, GEMM_TYPE_FAST_IMAGE_B_IMAGE,
max_image_size); max_image_size);
} }
else if (gemm_type == GEMM_TYPE_FAST_BUFFER)
{
return ocl4dnnFastBufferGEMM<Dtype>(CblasNoTrans, TransB, M, N, K,
1.f, A, 0, B, 0, 0.f, C, 0, gemm_type);
}
return false; return false;
} }
@ -436,10 +574,17 @@ bool ocl4dnnGEMV<float>(const CBLAS_TRANSPOSE TransA,
const int32_t offy) const int32_t offy)
{ {
bool ret = false; bool ret = false;
bool use_half = (A.depth() == CV_16S);
String opts;
if (use_half)
opts = format("-DDtype=%s -DDtype4=%s -Dconvert_Dtype=convert_%s", "half", "half4", "half");
else
opts = format("-DDtype=%s -DDtype4=%s -Dconvert_Dtype=convert_%s", "float", "float4", "float");
if (TransA == CblasNoTrans) if (TransA == CblasNoTrans)
{ {
ocl::Kernel k(CL_KERNEL_SELECT("matvec_mul4"), cv::ocl::dnn::matvec_mul_oclsrc); String kname = format("matvec_mul4_%s", use_half ? "half" : "float");
ocl::Kernel k(kname.c_str(), cv::ocl::dnn::matvec_mul_oclsrc, opts);
if (k.empty()) if (k.empty())
return false; return false;
@ -469,7 +614,8 @@ bool ocl4dnnGEMV<float>(const CBLAS_TRANSPOSE TransA,
if ((row_size % 4) != 0 && ret) if ((row_size % 4) != 0 && ret)
{ {
ocl::Kernel k_1(CL_KERNEL_SELECT("matvec_mul1"), cv::ocl::dnn::matvec_mul_oclsrc); String kname = format("matvec_mul1_%s", use_half ? "half" : "float");
ocl::Kernel k_1(kname.c_str(), cv::ocl::dnn::matvec_mul_oclsrc, opts);
size_t localsize[] = { 128 }; size_t localsize[] = { 128 };
size_t globalsize[] = { row_size % 4 * localsize[0] }; size_t globalsize[] = { row_size % 4 * localsize[0] };
uint row_offset = row_size - (row_size % 4); uint row_offset = row_size - (row_size % 4);
@ -499,7 +645,15 @@ bool ocl4dnnAXPY(const int32_t N, const Dtype alpha,
const UMat X, const int32_t offX, UMat Y, const UMat X, const int32_t offX, UMat Y,
const int32_t offY) const int32_t offY)
{ {
ocl::Kernel oclk_axpy(CL_KERNEL_SELECT("axpy"), cv::ocl::dnn::math_oclsrc); bool use_half = (X.depth() == CV_16S);
String opts;
if (use_half)
opts = "-DDtype=half -DDtype4=half4 -Dconvert_Dtype=convert_half";
else
opts = "-DDtype=float -DDtype4=float4 -Dconvert_Dtype=convert_float";
String kname = format("axpy_%s", use_half ? "half" : "float");
ocl::Kernel oclk_axpy(kname.c_str(), cv::ocl::dnn::math_oclsrc, opts);
if (oclk_axpy.empty()) if (oclk_axpy.empty())
return false; return false;

@ -54,6 +54,7 @@
#include "opencl_kernels_dnn.hpp" #include "opencl_kernels_dnn.hpp"
#include "../include/math_functions.hpp" #include "../include/math_functions.hpp"
#include "../include/default_kernel_config.hpp" #include "../include/default_kernel_config.hpp"
#include "opencv2/dnn/shape_utils.hpp"
#if defined WIN32 || defined _WIN32 #if defined WIN32 || defined _WIN32
#include <windows.h> #include <windows.h>
@ -85,6 +86,7 @@ OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
max_value_ = 0; max_value_ = 0;
prev_kernel_type_ = -1; prev_kernel_type_ = -1;
tuned_ = false; tuned_ = false;
use_half_ = config.use_half;
// assumption: spatial dimension is 2. // assumption: spatial dimension is 2.
kernel_h_ = config.kernel.height; kernel_h_ = config.kernel.height;
@ -204,18 +206,40 @@ void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bo
return; return;
} }
// Numeric tag for the element type the convolution kernels are built for.
// collectCommonInformation() passes one of these values to addDef("TYPE", ...)
// together with the matching Dtype* / as_Dtype* definitions.
typedef enum {
// Kernels are built with the float family of Dtype definitions.
TYPE_FLOAT = 1,
// Kernels are built with the half family of Dtype definitions (use_half_ set).
TYPE_HALF = 2
} ocl4dnnConvSpatialType_t;
template<typename Dtype> template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::collectCommonInformation() void OCL4DNNConvSpatial<Dtype>::collectCommonInformation()
{ {
addDef("Dtype", "float"); if (use_half_)
addDef("Dtype2", "float2"); {
addDef("Dtype4", "float4"); addDef("TYPE", TYPE_HALF);
addDef("Dtype8", "float8"); addDef("Dtype", "half");
addDef("Dtype16", "float16"); addDef("Dtype2", "half2");
addDef("as_Dtype", "as_float"); addDef("Dtype4", "half4");
addDef("as_Dtype2", "as_float2"); addDef("Dtype8", "half8");
addDef("as_Dtype4", "as_float4"); addDef("Dtype16", "half16");
addDef("as_Dtype8", "as_float8"); addDef("as_Dtype", "as_half");
addDef("as_Dtype2", "as_half2");
addDef("as_Dtype4", "as_half4");
addDef("as_Dtype8", "as_half8");
}
else
{
addDef("TYPE", TYPE_FLOAT);
addDef("Dtype", "float");
addDef("Dtype2", "float2");
addDef("Dtype4", "float4");
addDef("Dtype8", "float8");
addDef("Dtype16", "float16");
addDef("as_Dtype", "as_float");
addDef("as_Dtype2", "as_float2");
addDef("as_Dtype4", "as_float4");
addDef("as_Dtype8", "as_float8");
}
} }
typedef enum { typedef enum {
@ -477,10 +501,16 @@ bool OCL4DNNConvSpatial<Dtype>::Forward(const UMat& bottom,
fused_eltwise_ = false; fused_eltwise_ = false;
} }
prepareKernel(bottom, top, weight, bias, numImages); if (use_half_ && bias_half.empty() && !bias.empty())
convertFp16((UMat&)bias, bias_half);
if (use_half_ && weights_half.empty())
convertFp16((UMat&)weight, weights_half);
prepareKernel(bottom, top, weight, (use_half_) ? bias_half : bias, numImages);
if (bestKernelConfig.empty()) if (bestKernelConfig.empty())
return false; return false;
return convolve(bottom, top, weight, bias, numImages, bestKernelConfig); return convolve(bottom, top, weight, (use_half_) ? bias_half : bias, numImages, bestKernelConfig);
} }
template<typename Dtype> template<typename Dtype>
@ -556,6 +586,12 @@ std::string OCL4DNNConvSpatial<Dtype>::generateSpecificKey(int32_t type, int32_t
<< "_" << blockWidth << "_" << blockWidth
<< "_" << blockHeight << "_" << blockHeight
<< "_" << blockDepth; << "_" << blockDepth;
if (!use_half_)
keyBuilder << "_float";
else
keyBuilder << "_half";
return keyBuilder.str(); return keyBuilder.str();
} }
@ -637,9 +673,13 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
if (swizzled_weights_umat.empty()) if (swizzled_weights_umat.empty())
swizzled_weights_umat.create(1, (int)alignSize(num_output_, 16) * channels_ * swizzled_weights_umat.create(1, (int)alignSize(num_output_, 16) * channels_ *
kernel_h_ * (int)alignSize(kernel_w_, 2), CV_32FC1); kernel_h_ * (int)alignSize(kernel_w_, 2),
(use_half_) ? CV_16SC1 : CV_32FC1);
UMat swizzled_weights_tmp;
if (use_half_)
swizzled_weights_tmp.create(shape(swizzled_weights_umat), CV_32F);
ocl::Queue queue = ocl::Queue::getDefault();
if (!interleave) { if (!interleave) {
cl_uint argIdx = 0; cl_uint argIdx = 0;
int32_t channels = channels_ / group_; int32_t channels = channels_ / group_;
@ -650,7 +690,10 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
return false; return false;
oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat)); if (use_half_)
oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_tmp));
else
oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat));
oclk_copy_weight.set(argIdx++, kernel_w_); oclk_copy_weight.set(argIdx++, kernel_w_);
oclk_copy_weight.set(argIdx++, kernel_h_); oclk_copy_weight.set(argIdx++, kernel_h_);
oclk_copy_weight.set(argIdx++, channels); oclk_copy_weight.set(argIdx++, channels);
@ -669,7 +712,11 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
// assumption: kernel dimension is 2 // assumption: kernel dimension is 2
Mat weightMat = weight.getMat(ACCESS_READ); Mat weightMat = weight.getMat(ACCESS_READ);
Dtype* cpu_weight = (Dtype *)weightMat.ptr<float>(); Dtype* cpu_weight = (Dtype *)weightMat.ptr<float>();
Mat swizzledWeightMat = swizzled_weights_umat.getMat(ACCESS_WRITE); Mat swizzledWeightMat;
if (use_half_)
swizzledWeightMat = swizzled_weights_tmp.getMat(ACCESS_WRITE);
else
swizzledWeightMat = swizzled_weights_umat.getMat(ACCESS_WRITE);
Dtype* cpu_swizzled_weight = (Dtype *)swizzledWeightMat.ptr<float>(); Dtype* cpu_swizzled_weight = (Dtype *)swizzledWeightMat.ptr<float>();
int interleavedRows = (kernel_w_ / 2) * 2; int interleavedRows = (kernel_w_ / 2) * 2;
@ -694,6 +741,10 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
rowAlignment); rowAlignment);
free(tmpSwizzledWeight); free(tmpSwizzledWeight);
} }
if (use_half_)
convertFp16(swizzled_weights_tmp, swizzled_weights_umat);
return true; return true;
} }
@ -727,9 +778,10 @@ void OCL4DNNConvSpatial<float>::CreateSubBuffer(const UMat& buffer, UMat& sub_bu
cl_mem sub_mem; cl_mem sub_mem;
cl_buffer_region region; cl_buffer_region region;
cl_int err; cl_int err;
size_t element_size = (use_half_) ? sizeof(short) : sizeof(float);
region.origin = offset * sizeof(float); region.origin = offset * element_size;
region.size = size * sizeof(float); region.size = size * element_size;
sub_mem = clCreateSubBuffer((cl_mem)buffer.handle(ACCESS_READ), sub_mem = clCreateSubBuffer((cl_mem)buffer.handle(ACCESS_READ),
write_only ? CL_MEM_WRITE_ONLY : CL_MEM_READ_ONLY, write_only ? CL_MEM_WRITE_ONLY : CL_MEM_READ_ONLY,
CL_BUFFER_CREATE_TYPE_REGION, &region, &err); CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
@ -739,8 +791,9 @@ void OCL4DNNConvSpatial<float>::CreateSubBuffer(const UMat& buffer, UMat& sub_bu
return; return;
} }
int step = sizeof(float), rows = size, cols = 1; int step = element_size, rows = size, cols = 1;
ocl::convertFromBuffer(sub_mem, step, rows, cols, CV_32FC1, sub_buffer); ocl::convertFromBuffer(sub_mem, step, rows, cols,
(use_half_) ? CV_16SC1 : CV_32FC1, sub_buffer);
//decrease ocl mem refcount //decrease ocl mem refcount
clReleaseMemObject(sub_mem); clReleaseMemObject(sub_mem);
@ -978,7 +1031,10 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
cl_uint argIdx = 0; cl_uint argIdx = 0;
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx); setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); if (use_half_)
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weights_half));
else
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
if (bias_term_) if (bias_term_)
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias)); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
@ -1018,7 +1074,10 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx); setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
kernel.set(argIdx++, image_offset); kernel.set(argIdx++, image_offset);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); if (use_half_)
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weights_half));
else
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
kernel.set(argIdx++, kernel_offset); kernel.set(argIdx++, kernel_offset);
if (bias_term_) if (bias_term_)
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias)); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
@ -1132,14 +1191,27 @@ bool OCL4DNNConvSpatial<float>::verifyResult(const UMat &bottom,
return false; return false;
int32_t sz[4] = {numImages, num_output_, output_h_, output_w_}; int32_t sz[4] = {numImages, num_output_, output_h_, output_w_};
top.zeros(4, sz, CV_32FC1); top.zeros(4, sz, (use_half_) ? CV_16SC1 : CV_32FC1);
bool saved_tuned = tuned_; bool saved_tuned = tuned_;
tuned_ = false; tuned_ = false;
convolve(bottom, top, weight, bias, numImages, config); convolve(bottom, top, weight, bias, numImages, config);
tuned_ = saved_tuned; tuned_ = saved_tuned;
float *data = (float *)top.getMat(ACCESS_READ).ptr<float>(); UMat new_top, new_verify_top;
float *verify_data = (float *)verifyTop.getMat(ACCESS_READ).ptr<float>(); float *data, *verify_data;
if (use_half_)
{
convertFp16(top, new_top);
convertFp16(verifyTop, new_verify_top);
data = (float *)new_top.getMat(ACCESS_READ).ptr<float>();
verify_data = (float *)new_verify_top.getMat(ACCESS_READ).ptr<float>();
}
else
{
data = (float *)top.getMat(ACCESS_READ).ptr<float>();
verify_data = (float *)verifyTop.getMat(ACCESS_READ).ptr<float>();
}
for (int32_t n = 0; n < num_; ++n) { for (int32_t n = 0; n < num_; ++n) {
for (int32_t g = 0; g < group_; ++g) { for (int32_t g = 0; g < group_; ++g) {
@ -1148,9 +1220,19 @@ bool OCL4DNNConvSpatial<float>::verifyResult(const UMat &bottom,
for (int h = 0; h < output_h_ && !verificationFail; h++) for (int h = 0; h < output_h_ && !verificationFail; h++)
for (int w = 0; w < output_w_; w++) { for (int w = 0; w < output_w_; w++) {
size_t offset = output_image_offset + out_ch * output_w_ * output_h_ + h * output_w_ + w; size_t offset = output_image_offset + out_ch * output_w_ * output_h_ + h * output_w_ + w;
if (fabs(data[offset] - verify_data[offset]) > 0.1 * fabs(verify_data[offset]) &&
!(fabs(verify_data[offset]) < 1.e-3 && float error_factor = fabs(data[offset] - verify_data[offset]);
fabs(data[offset] - verify_data[offset]) < 1.e-4)) if (use_half_ && error_factor > 0.1 * fabs(verify_data[offset]) &&
error_factor > 0.04 && !(fabs(verify_data[offset]) < 1.e-3 && error_factor < 1.e-4))
{
dbgPrint(printf("test verification failed @ image %d group %d"
"out_ch %d h %d w %d got %G expected %G\n",
n, g, out_ch, h, w, data[offset], verify_data[offset]));
verificationFail = 1;
goto out;
}
else if (!use_half_ && error_factor > 0.1 * fabs(verify_data[offset]) &&
!(fabs(verify_data[offset]) < 1.e-3 && error_factor < 1.e-4))
{ {
dbgPrint(printf("test verification failed @ image %d group %d" dbgPrint(printf("test verification failed @ image %d group %d"
"out_ch %d h %d w %d got %G expected %G\n", "out_ch %d h %d w %d got %G expected %G\n",
@ -1719,15 +1801,16 @@ void OCL4DNNConvSpatial<Dtype>::prepareKernel(const UMat &bottom, UMat &top,
if (loadTunedConfig()) // check external storage if (loadTunedConfig()) // check external storage
return; return;
UMat benchData(1, numImages * top_dim_, CV_32FC1); UMat benchData(1, numImages * top_dim_, (use_half_) ? CV_16SC1 : CV_32FC1);
calculateBenchmark(bottom, benchData, (use_half_) ? weights_half : weight, bias, numImages);
if (force_auto_tuning_) if (force_auto_tuning_)
{ {
calculateBenchmark(bottom, benchData, weight, bias, numImages);
setupConvolution(bottom, top, weight, bias, numImages, benchData); setupConvolution(bottom, top, weight, bias, numImages, benchData);
} }
else else
{ {
calculateBenchmark(bottom, benchData, weight, bias, numImages);
useFirstAvailable(bottom, top, weight, bias, numImages, benchData); useFirstAvailable(bottom, top, weight, bias, numImages, benchData);
} }
cacheTunedConfig(); cacheTunedConfig();

@ -56,6 +56,7 @@ OCL4DNNInnerProduct<Dtype>::OCL4DNNInnerProduct(OCL4DNNInnerProductConfig config
K_ = config.K; K_ = config.K;
phase_test_ = config.phase_test; phase_test_ = config.phase_test;
image_copied_ = false; image_copied_ = false;
use_half_ = config.use_half;
} }
template<typename Dtype> template<typename Dtype>
@ -89,13 +90,24 @@ bool OCL4DNNInnerProduct<Dtype>::Forward(const UMat& bottom,
if (M_ <= max_image_size && if (M_ <= max_image_size &&
N_ <= max_image_size && N_ <= max_image_size &&
K_ <= max_image_size && K_ <= max_image_size &&
cv::traits::Depth<Dtype>::value == CV_32F &&
ocl::Device::getDefault().intelSubgroupsSupport()) ocl::Device::getDefault().intelSubgroupsSupport())
{ {
ret = ocl4dnnGEMMCommon<Dtype>(transpose_ ? CblasNoTrans : CblasTrans, ret = ocl4dnnGEMMCommon<Dtype>(transpose_ ? CblasNoTrans : CblasTrans,
M_, N_, K_, bottom, weight, UMat(), top, M_, N_, K_, bottom, weight, UMat(), top,
max_image_size); max_image_size);
} }
if (use_half_ && bias_term_)
{
UMat biasOneMat = UMat::ones(M_, 1, CV_32F);
UMat newbias, tmpTop;
convertFp16(bias, newbias);
convertFp16(top, tmpTop);
cv::gemm(biasOneMat, newbias, 1, tmpTop, 1, tmpTop, 0);
convertFp16(tmpTop, top);
}
return ret; return ret;
} }
} }

@ -61,6 +61,7 @@ OCL4DNNLRN<Dtype>::OCL4DNNLRN(OCL4DNNLRNConfig config)
channels_ = config.channels; channels_ = config.channels;
height_ = config.height; height_ = config.height;
width_ = config.width; width_ = config.width;
use_half_ = config.use_half;
} }
template<typename Dtype> template<typename Dtype>
@ -97,8 +98,10 @@ bool OCL4DNNLRN<Dtype>::crossChannelForward(const UMat& bottom, UMat& top)
int32_t n_threads = num_ * height_ * width_; int32_t n_threads = num_ * height_ * width_;
size_t global_work_size_[1] = {(size_t)n_threads}; size_t global_work_size_[1] = {(size_t)n_threads};
String opts = clOptionSupport("-cl-no-subgroup-ifp") ? " -cl-no-subgroup-ifp " : ""; String opts = clOptionSupport("-cl-no-subgroup-ifp") ? " -cl-no-subgroup-ifp " : "";
opts += format("-D Dtype=%s", (use_half_) ? "half" : "float");
ocl::Kernel oclk_lrn_fill; ocl::Kernel oclk_lrn_fill;
if (!oclk_lrn_fill.create(CL_KERNEL_SELECT("lrn_full_no_scale"), ocl::dnn::ocl4dnn_lrn_oclsrc, opts)) String kname = format("lrn_full_no_scale_%s", (use_half_) ? "half" : "float");
if (!oclk_lrn_fill.create(kname.c_str(), ocl::dnn::ocl4dnn_lrn_oclsrc, opts))
return false; return false;
oclk_lrn_fill.set(argIdx++, n_threads); oclk_lrn_fill.set(argIdx++, n_threads);

@ -56,6 +56,7 @@ OCL4DNNPool<Dtype>::OCL4DNNPool(OCL4DNNPoolConfig config)
channels_ = config.channels; channels_ = config.channels;
pool_method_ = config.pool_method; pool_method_ = config.pool_method;
avePoolPaddedArea = config.avePoolPaddedArea; avePoolPaddedArea = config.avePoolPaddedArea;
use_half = config.use_half;
for (int i = 0; i < spatial_dims; ++i) for (int i = 0; i < spatial_dims; ++i)
{ {
@ -105,12 +106,15 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
case LIBDNN_POOLING_METHOD_MAX: case LIBDNN_POOLING_METHOD_MAX:
{ {
bool haveMask = !top_mask.empty(); bool haveMask = !top_mask.empty();
String kname = haveMask ? "max_pool_forward_mask" : "max_pool_forward";
kname += (use_half) ? "_half" : "_float";
ocl::Kernel oclk_max_pool_forward( ocl::Kernel oclk_max_pool_forward(
haveMask ? CL_KERNEL_SELECT("max_pool_forward_mask") : CL_KERNEL_SELECT("max_pool_forward"), kname.c_str(),
ocl::dnn::ocl4dnn_pooling_oclsrc, ocl::dnn::ocl4dnn_pooling_oclsrc,
format("-D KERNEL_MAX_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d" format(" -D Dtype=%s -D KERNEL_MAX_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
" -D STRIDE_W=%d -D STRIDE_H=%d" " -D STRIDE_W=%d -D STRIDE_H=%d"
" -D PAD_W=%d -D PAD_H=%d%s", " -D PAD_W=%d -D PAD_H=%d%s",
(use_half) ? "half" : "float",
kernel_w_, kernel_h_, kernel_w_, kernel_h_,
stride_w_, stride_h_, stride_w_, stride_h_,
pad_w_, pad_h_, pad_w_, pad_h_,
@ -139,11 +143,14 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
{ {
CV_Assert(top_mask.empty()); CV_Assert(top_mask.empty());
ocl::Kernel oclk_ave_pool_forward(CL_KERNEL_SELECT("ave_pool_forward"), String kname = format("ave_pool_forward_%s", (use_half) ? "half" : "float");
ocl::Kernel oclk_ave_pool_forward(
kname.c_str(),
ocl::dnn::ocl4dnn_pooling_oclsrc, ocl::dnn::ocl4dnn_pooling_oclsrc,
format("-D KERNEL_AVE_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d" format(" -D Dtype=%s -D KERNEL_AVE_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
" -D STRIDE_W=%d -D STRIDE_H=%d" " -D STRIDE_W=%d -D STRIDE_H=%d"
" -D PAD_W=%d -D PAD_H=%d%s", " -D PAD_W=%d -D PAD_H=%d%s",
(use_half) ? "half" : "float",
kernel_w_, kernel_h_, kernel_w_, kernel_h_,
stride_w_, stride_h_, stride_w_, stride_h_,
pad_w_, pad_h_, pad_w_, pad_h_,
@ -171,7 +178,9 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
{ {
CV_Assert(top_mask.empty()); CV_Assert(top_mask.empty());
ocl::Kernel oclk_sto_pool_forward(CL_KERNEL_SELECT("sto_pool_forward_test"), String kname = format("sto_pool_forward_test_%s", (use_half) ? "half" : "float");
ocl::Kernel oclk_sto_pool_forward(
kname.c_str(),
ocl::dnn::ocl4dnn_pooling_oclsrc, ocl::dnn::ocl4dnn_pooling_oclsrc,
format("-D KERNEL_STO_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d" format("-D KERNEL_STO_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
" -D STRIDE_W=%d -D STRIDE_H=%d", " -D STRIDE_W=%d -D STRIDE_H=%d",

@ -52,6 +52,7 @@ OCL4DNNSoftmax<Dtype>::OCL4DNNSoftmax(OCL4DNNSoftmaxConfig config)
softmax_axis_ = config.axis; softmax_axis_ = config.axis;
channels_ = config.channels; channels_ = config.channels;
log_softmax_ = config.logsoftmax; log_softmax_ = config.logsoftmax;
use_half_ = config.use_half;
inner_num_ = 1; inner_num_ = 1;
outer_num_ = 1; outer_num_ = 1;
@ -91,10 +92,13 @@ bool OCL4DNNSoftmax<Dtype>::Forward(const UMat& bottom, UMat& top)
if (log_softmax_) opts += " -DLOG_SOFTMAX "; if (log_softmax_) opts += " -DLOG_SOFTMAX ";
if (use_slm_) if (use_slm_)
kname = CL_KERNEL_SELECT("softmax_forward_slm"); kname = "softmax_forward_slm";
else else
kname = CL_KERNEL_SELECT("softmax_forward"); kname = "softmax_forward";
kname += format("%s", (use_half_) ? "_half" : "_float");
opts += format(" -D Dtype=%s -D DTYPE_MAX=%s", (use_half_) ? "half" : "float",
(use_half_) ? "HALF_MAX" : "FLT_MAX");
if (!oclk_softmax_forward_kernel.create(kname.c_str(), ocl::dnn::softmax_loss_oclsrc, opts)) if (!oclk_softmax_forward_kernel.create(kname.c_str(), ocl::dnn::softmax_loss_oclsrc, opts))
return false; return false;

@ -40,9 +40,17 @@
// //
//M*/ //M*/
#define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type)
#define KERNEL_ARG_DTYPE float
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__kernel void ReLUForward(const int count, __global const T* in, __global T* out __kernel void ReLUForward(const int count, __global const T* in, __global T* out
#ifndef RELU_NO_SLOPE #ifndef RELU_NO_SLOPE
, T negative_slope , KERNEL_ARG_DTYPE negative_slope
#endif #endif
) { ) {
int index = get_global_id(0); int index = get_global_id(0);
@ -55,18 +63,19 @@ __kernel void ReLUForward(const int count, __global const T* in, __global T* out
} }
__kernel void ReLU6Forward(const int count, __global const T* in, __global T* out, __kernel void ReLU6Forward(const int count, __global const T* in, __global T* out,
const T minValue, const T maxValue) const KERNEL_ARG_DTYPE minValue, const KERNEL_ARG_DTYPE maxValue)
{ {
int index = get_global_id(0); int index = get_global_id(0);
if(index < count) if(index < count)
{ {
T x = in[index]; T x = in[index];
out[index] = clamp(x, minValue, maxValue); out[index] = clamp(x, convert_T(minValue), convert_T(maxValue));
} }
} }
__kernel void PReLUForward(const int count, const int channels, const int plane_size, __kernel void PReLUForward(const int count, const int channels, const int plane_size,
__global const T* in, __global T* out, __global const T* slope_data) __global const T* in, __global T* out,
__global const KERNEL_ARG_DTYPE* slope_data)
{ {
int index = get_global_id(0); int index = get_global_id(0);
int c = (index / plane_size) % channels; int c = (index / plane_size) % channels;
@ -99,8 +108,22 @@ __kernel void AbsValForward(const int n, __global const T* in, __global T* out)
out[index] = fabs(in[index]); out[index] = fabs(in[index]);
} }
__kernel void PowForward(const int n, __global const T* in, __global T* out, const T power, const T scale, const T shift) { __kernel void PowForward(const int n, __global const T* in, __global T* out,
const KERNEL_ARG_DTYPE power,
const KERNEL_ARG_DTYPE scale,
const KERNEL_ARG_DTYPE shift)
{
int index = get_global_id(0); int index = get_global_id(0);
if (index < n) if (index < n)
out[index] = pow(shift + scale * in[index], power); out[index] = pow(shift + scale * in[index], power);
} }
// ELU activation, one work-item per element: non-negative inputs are copied
// through unchanged, negative inputs are mapped to exp(x) - 1.
// Work-items whose global id is not below n do nothing.
__kernel void ELUForward(const int n, __global const T* in, __global T* out)
{
    const int index = get_global_id(0);
    if (index >= n)
        return;

    const T src = in[index];
    if (src >= 0.f)
        out[index] = src;
    else
        out[index] = exp(src) - 1;
}

@ -40,24 +40,27 @@
// //
//M*/ //M*/
#define Dtype float #if defined(cl_khr_fp16)
#define Dtype4 float4 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define Dtype8 float8 #endif
#if NUM == 8 #if NUM == 8
#define load(src, index) vload8(0, src + index) #define load(src, index) vload8(0, src + index)
#define store(vec, dst, index) vstore8(vec, 0, dst + index) #define store(vec, dst, index) vstore8(vec, 0, dst + index)
#define vec_type Dtype8 #define float_type float8
#define convert_f convert_float8
#define BATCH_NORM batch_norm8 #define BATCH_NORM batch_norm8
#elif NUM == 4 #elif NUM == 4
#define load(src, index) vload4(0, src + index) #define load(src, index) vload4(0, src + index)
#define store(vec, dst, index) vstore4(vec, 0, dst + index) #define store(vec, dst, index) vstore4(vec, 0, dst + index)
#define vec_type Dtype4 #define float_type float4
#define convert_f convert_float4
#define BATCH_NORM batch_norm4 #define BATCH_NORM batch_norm4
#elif NUM == 1 #elif NUM == 1
#define load(src, index) src[index] #define load(src, index) src[index]
#define store(vec, dst, index) dst[index] = vec #define store(vec, dst, index) dst[index] = vec
#define vec_type Dtype #define float_type float
#define convert_f convert_float
#define BATCH_NORM batch_norm1 #define BATCH_NORM batch_norm1
#endif #endif
@ -65,8 +68,8 @@ __kernel void BATCH_NORM(__global const Dtype* src,
const int rows, const int rows,
const int cols, const int cols,
const int channels, const int channels,
__global const Dtype* weight, __global const float* weight,
__global const Dtype* bias, __global const float* bias,
__global Dtype* dst) __global Dtype* dst)
{ {
int x = get_global_id(0); int x = get_global_id(0);
@ -76,9 +79,9 @@ __kernel void BATCH_NORM(__global const Dtype* src,
if (x >= rows || y >= cols) if (x >= rows || y >= cols)
return; return;
Dtype w = weight[x % channels]; float w = weight[x % channels];
Dtype b = bias[x % channels]; float b = bias[x % channels];
vec_type src_vec = load(src, index); float_type src_vec = convert_f(load(src, index));
vec_type dst_vec = src_vec * w + (vec_type)b; float_type dst_vec = src_vec * w + (float_type)b;
store(dst_vec, dst, index); store(convert_T(dst_vec), dst, index);
} }

@ -39,22 +39,29 @@
// //
//M*/ //M*/
__kernel void concat(const int nthreads, #if defined(cl_khr_fp16)
__global const Dtype* in_data, #pragma OPENCL EXTENSION cl_khr_fp16 : enable
const int num_concats, #endif
const int concat_size,
const int top_concat_axis,
const int bottom_concat_axis,
const int offset_concat_axis,
__global Dtype* out_data) {
for (int index = get_global_id(0); index < nthreads; #define CONCAT(A,B) A##_##B
index += get_global_size(0)) { #define TEMPLATE(name,type) CONCAT(name,type)
const int total_concat_size = concat_size * bottom_concat_axis;
const int concat_num = index / total_concat_size; __kernel void TEMPLATE(concat, Dtype)(const int nthreads,
const int concat_index = index % total_concat_size; __global const Dtype* in_data,
const int top_index = concat_index const int num_concats,
+ (concat_num * top_concat_axis + offset_concat_axis) * concat_size; const int concat_size,
out_data[top_index] = in_data[index]; const int top_concat_axis,
} const int bottom_concat_axis,
const int offset_concat_axis,
__global Dtype* out_data)
{
for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
{
const int total_concat_size = concat_size * bottom_concat_axis;
const int concat_num = index / total_concat_size;
const int concat_index = index % total_concat_size;
const int top_index = concat_index +
(concat_num * top_concat_axis + offset_concat_axis) * concat_size;
out_data[top_index] = in_data[index];
}
} }

@ -40,27 +40,29 @@
// //
//M*/ //M*/
#if APPLY_BIAS #if defined(cl_khr_fp16)
#define BIAS_KERNEL_ARG __global Dtype * biases_base, #pragma OPENCL EXTENSION cl_khr_fp16 : enable
#else
#define BIAS_KERNEL_ARG
#endif #endif
#define KERNEL_ARG_DTYPE float
#define TYPE_FLOAT 1
#define TYPE_HALF 2
#if defined(FUSED_CONV_RELU) #if defined(FUSED_CONV_RELU)
#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (Dtype)(negative_slope))) #define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (negative_slope)))
#define FUSED_ARG Dtype negative_slope, #define FUSED_ARG KERNEL_ARG_DTYPE negative_slope,
#elif defined(FUSED_CONV_PRELU) #elif defined(FUSED_CONV_PRELU)
#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (Dtype)(negative_slope[c]))) #define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (negative_slope[c])))
#define FUSED_ARG __global const Dtype *negative_slope, #define FUSED_ARG __global const KERNEL_ARG_DTYPE* negative_slope,
#elif defined(FUSED_CONV_POWER) #elif defined(FUSED_CONV_POWER)
#define ACTIVATION_RELU_FUNCTION(x, c) pow(x, power) #define ACTIVATION_RELU_FUNCTION(x, c) pow(x, (Dtype)power)
#define FUSED_ARG Dtype power, #define FUSED_ARG KERNEL_ARG_DTYPE power,
#elif defined(FUSED_CONV_TANH) #elif defined(FUSED_CONV_TANH)
#define ACTIVATION_RELU_FUNCTION(x, c) tanh(x) #define ACTIVATION_RELU_FUNCTION(x, c) tanh(x)
#define FUSED_ARG #define FUSED_ARG
#elif defined(FUSED_CONV_RELU6) #elif defined(FUSED_CONV_RELU6)
#define ACTIVATION_RELU_FUNCTION(x, c) (clamp((Dtype)(x), min_value, max_value)) #define ACTIVATION_RELU_FUNCTION(x, c) (clamp((Dtype)(x), (Dtype)min_value, (Dtype)max_value))
#define FUSED_ARG Dtype min_value, Dtype max_value, #define FUSED_ARG KERNEL_ARG_DTYPE min_value, KERNEL_ARG_DTYPE max_value,
#else #else
#define ACTIVATION_RELU_FUNCTION(x, c) (x) #define ACTIVATION_RELU_FUNCTION(x, c) (x)
#define FUSED_ARG #define FUSED_ARG
@ -74,6 +76,11 @@
#define ELTWISE_DATA_ARG #define ELTWISE_DATA_ARG
#endif #endif
#if APPLY_BIAS
#define BIAS_KERNEL_ARG __global Dtype * biases_base,
#else
#define BIAS_KERNEL_ARG
#endif
#define __CAT(x, y) x##y #define __CAT(x, y) x##y
#define CAT(x, y) __CAT(x, y) #define CAT(x, y) __CAT(x, y)
@ -97,6 +104,16 @@
#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT)) #define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT))
#if defined(convolve_simd) || defined(Conv_Interleaved) #if defined(convolve_simd) || defined(Conv_Interleaved)
#if TYPE == TYPE_HALF
// Half-precision build: the integer carrier types are 16-bit (ushort family) and the
// sub-group block reads map to the *_us variants of the Intel intrinsics.
#define INT_TYPE ushort
#define INT_TYPE2 ushort2
#define INT_TYPE4 ushort4
#define INT_TYPE8 ushort8
#define SUB_GROUP_BLOCK_READ2 intel_sub_group_block_read_us2
#define SUB_GROUP_BLOCK_READ4 intel_sub_group_block_read_us4
#define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read_us8
#define SUB_GROUP_BLOCK_READ intel_sub_group_block_read_us
#else
#define INT_TYPE uint #define INT_TYPE uint
#define INT_TYPE2 uint2 #define INT_TYPE2 uint2
#define INT_TYPE4 uint4 #define INT_TYPE4 uint4
@ -106,6 +123,7 @@
#define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read8 #define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read8
#define SUB_GROUP_BLOCK_READ intel_sub_group_block_read #define SUB_GROUP_BLOCK_READ intel_sub_group_block_read
#endif #endif
#endif
#ifdef KERNEL_BASIC #ifdef KERNEL_BASIC
@ -418,6 +436,25 @@ typedef struct float15 { float s0; float s1; float s2; float s3; float s4; float
float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; float se; } float15; float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; float se; } float15;
typedef struct float0 { float s0; } float0; //never used but makes compiler happy. typedef struct float0 { float s0; } float0; //never used but makes compiler happy.
// Plain structs of scalar half members, named halfN by member count (N = 1, 5..7, 9..15),
// plus a single-member half0 placeholder. They parallel the floatN structs declared
// just before them.
// NOTE(review): presumably picked by pasting a size onto the type name - confirm at the use sites.
typedef struct half1 { half s0; } half1;
typedef struct half5 { half s0, s1, s2, s3, s4; } half5;
typedef struct half6 { half s0, s1, s2, s3, s4, s5; } half6;
typedef struct half7 { half s0, s1, s2, s3, s4, s5, s6; } half7;
typedef struct half9 { half s0, s1, s2, s3, s4, s5, s6, s7, s8; } half9;
typedef struct half10 { half s0, s1, s2, s3, s4, s5, s6, s7, s8, s9; } half10;
typedef struct half11 { half s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sa; } half11;
typedef struct half12 { half s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sa, sb; } half12;
typedef struct half13 { half s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sa, sb, sc; } half13;
typedef struct half14 { half s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sa, sb, sc, sd; } half14;
typedef struct half15 { half s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sa, sb, sc, sd, se; } half15;
typedef struct half0 { half s0; } half0; // never used but makes compiler happy.
#define OUT_PITCH_X output_width #define OUT_PITCH_X output_width
#define ROW_PITCH input_width #define ROW_PITCH input_width

@ -40,9 +40,9 @@
// //
//M*/ //M*/
#define Dtype float #if defined(cl_khr_fp16)
#define Dtype4 float4 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define Dtype8 float8 #endif
__kernel void op_sum4(__global const Dtype * A, __kernel void op_sum4(__global const Dtype * A,
__global const Dtype * B, __global const Dtype * B,
@ -73,20 +73,20 @@ __kernel void op_sum4(__global const Dtype * A,
a2 = vload4(i, src0_read + 2 * A_col_size); a2 = vload4(i, src0_read + 2 * A_col_size);
a3 = vload4(i, src0_read + 3 * A_col_size); a3 = vload4(i, src0_read + 3 * A_col_size);
dot0 = a0 * coeff1 + b0 * coeff2; dot0 = a0 * (Dtype4)coeff1 + b0 * (Dtype4)coeff2;
dot1 = a1 * coeff1 + b1 * coeff2; dot1 = a1 * (Dtype4)coeff1 + b1 * (Dtype4)coeff2;
dot2 = a2 * coeff1 + b2 * coeff2; dot2 = a2 * (Dtype4)coeff1 + b2 * (Dtype4)coeff2;
dot3 = a3 * coeff1 + b3 * coeff2; dot3 = a3 * (Dtype4)coeff1 + b3 * (Dtype4)coeff2;
#else #else
a0 = vload4(i, dst0_read); a0 = vload4(i, dst0_read);
a1 = vload4(i, dst0_read + A_col_size); a1 = vload4(i, dst0_read + A_col_size);
a2 = vload4(i, dst0_read + 2 * A_col_size); a2 = vload4(i, dst0_read + 2 * A_col_size);
a3 = vload4(i, dst0_read + 3 * A_col_size); a3 = vload4(i, dst0_read + 3 * A_col_size);
dot0 = a0 + b0 * coeff2; dot0 = a0 + b0 * (Dtype4)coeff2;
dot1 = a1 + b1 * coeff2; dot1 = a1 + b1 * (Dtype4)coeff2;
dot2 = a2 + b2 * coeff2; dot2 = a2 + b2 * (Dtype4)coeff2;
dot3 = a3 + b3 * coeff2; dot3 = a3 + b3 * (Dtype4)coeff2;
#endif #endif
vstore4(dot0, i, dst0_read); vstore4(dot0, i, dst0_read);
vstore4(dot1, i, dst0_read + A_col_size); vstore4(dot1, i, dst0_read + A_col_size);

File diff suppressed because it is too large Load Diff

@ -39,24 +39,42 @@
// //
//M*/ //M*/
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#define CONCAT(A,B) A##_##B #define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type) #define TEMPLATE(name,type) CONCAT(name,type)
// Types used for parameters, offset computations and so on #define KERNEL_ARG_DTYPE float
#define int_tp int #define TYPE_FLOAT 1
#define uint_tp unsigned int #define TYPE_HALF 2
#if TYPE == TYPE_HALF
// Half-precision build: Dtype, its vector widths (2/4/8/16) and the matching
// as_Dtype* reinterpret helpers all resolve to the half family.
#define Dtype half
#define Dtype2 half2
#define Dtype4 half4
#define Dtype8 half8
#define Dtype16 half16
#define as_Dtype as_half
#define as_Dtype2 as_half2
#define as_Dtype4 as_half4
#define as_Dtype8 as_half8
#define as_Dtype16 as_half16
#else
#define Dtype float #define Dtype float
#define Dtype2 float2 #define Dtype2 float2
#define Dtype4 float4 #define Dtype4 float4
#define Dtype8 float8 #define Dtype8 float8
#define Dtype16 float16
#define as_Dtype as_float #define as_Dtype as_float
#define as_Dtype2 as_float2 #define as_Dtype2 as_float2
#define as_Dtype4 as_float4 #define as_Dtype4 as_float4
#define as_Dtype8 as_float8 #define as_Dtype8 as_float8
#define as_Dtype16 as_float16
#define KERNEL_ARG_DTYPE float #endif
#if defined(cl_intel_subgroups) #if defined(cl_intel_subgroups)
#pragma OPENCL EXTENSION cl_intel_subgroups : enable #pragma OPENCL EXTENSION cl_intel_subgroups : enable
@ -67,6 +85,15 @@
// common block to calculate (alpha * AxB + beta * C) and output to destination image. // common block to calculate (alpha * AxB + beta * C) and output to destination image.
#if TYPE == TYPE_HALF
// Half-precision GEMM helpers: block reads use the ushort (_us8) sub-group intrinsic,
// the SHUFFLE_TYPE* helpers reinterpret their argument as ushort vectors, image reads go
// through read_imageh with a `sampler` that must be in scope at the expansion site,
// an element is sizeof(ushort) bytes, and both the sub-group size and the N tile are 16.
#define SUBGROUP_BLOCK_READ8( __image, __coord ) intel_sub_group_block_read_us8( __image, __coord )
#define SHUFFLE_TYPE2(val) as_ushort2(val)
#define SHUFFLE_TYPE8(val) as_ushort8(val)
#define READ_IMAGE(__image, __coord) read_imageh(__image, sampler, __coord)
#define SIZE_OF_ELEMENT sizeof(ushort)
#define SIMD_SIZE_GEMM 16
#define TILE_N 16
#else
#define SUBGROUP_BLOCK_READ8( __image, __coord ) intel_sub_group_block_read8( __image, __coord ) #define SUBGROUP_BLOCK_READ8( __image, __coord ) intel_sub_group_block_read8( __image, __coord )
#define SHUFFLE_TYPE2(val) val #define SHUFFLE_TYPE2(val) val
#define SHUFFLE_TYPE8(val) val #define SHUFFLE_TYPE8(val) val
@ -74,11 +101,17 @@
#define SIZE_OF_ELEMENT sizeof(uint) #define SIZE_OF_ELEMENT sizeof(uint)
#define SIMD_SIZE_GEMM 8 #define SIMD_SIZE_GEMM 8
#define TILE_N 8 #define TILE_N 8
#endif
//#define USE_IMAGE_C //#define USE_IMAGE_C
#ifdef USE_IMAGE_C #ifdef USE_IMAGE_C
#if TYPE == TYPE_HALF
// Half-precision access to C when it is an image: read/write eight rows through the
// ushort sub-group block intrinsics, reinterpreting the bits to/from Dtype8.
#define BLOCKC_READ8( _C, _coordC ) as_Dtype8( intel_sub_group_block_read_us8( _C, _coordC ) )
#define BLOCKC_WRITE8( _C, _coordC, _val ) intel_sub_group_block_write_us8( _C, _coordC, as_ushort8( _val ) )
#else
#define BLOCKC_READ8( _C, _coordC ) as_Dtype8( intel_sub_group_block_read8( _C, _coordC ) ) #define BLOCKC_READ8( _C, _coordC ) as_Dtype8( intel_sub_group_block_read8( _C, _coordC ) )
#define BLOCKC_WRITE8( _C, _coordC, _val ) intel_sub_group_block_write8( _C, _coordC, as_uint8( _val ) ) #define BLOCKC_WRITE8( _C, _coordC, _val ) intel_sub_group_block_write8( _C, _coordC, as_uint8( _val ) )
#endif
#define MATC_PARAMETER __read_only image2d_t C, __write_only image2d_t dst #define MATC_PARAMETER __read_only image2d_t C, __write_only image2d_t dst
#define GEMM_OUTPUT(ALPHA1, BETA_NOT0) GEMM_OUTPUT_EXT(ALPHA1, BETA_NOT0, C, dst, sizeof(uint)) #define GEMM_OUTPUT(ALPHA1, BETA_NOT0) GEMM_OUTPUT_EXT(ALPHA1, BETA_NOT0, C, dst, sizeof(uint))
#else #else
@ -139,10 +172,10 @@
blockC03 += blockAxB03; \ blockC03 += blockAxB03; \
} \ } \
} else { \ } else { \
blockC00 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \ blockC00 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \
blockC01 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \ blockC01 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \
blockC02 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \ blockC02 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \
blockC03 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); \ blockC03 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC ); \
if (!ALPHA1) { \ if (!ALPHA1) { \
blockC00 = mad(blockAxB00, (Dtype8)alpha, blockC00); \ blockC00 = mad(blockAxB00, (Dtype8)alpha, blockC00); \
blockC01 = mad(blockAxB01, (Dtype8)alpha, blockC01); \ blockC01 = mad(blockAxB01, (Dtype8)alpha, blockC01); \
@ -172,6 +205,43 @@
intel_sub_group_shuffle( _block.s7, _col ) ); intel_sub_group_shuffle( _block.s7, _col ) );
// A's column block multiply B 's row block. // A's column block multiply B 's row block.
#if TYPE == TYPE_HALF
// Half-precision MULTIPLY_BLOCKS_8x8(_result, _blockA, _blockB00, _blockB01):
// accumulates sixteen mad() terms into _result. Term i (0..15) multiplies
// TRANSPOSE_BLOCK_8(_blockA, i) by one component of B broadcast to Dtype8:
// _blockB00.s0..s7 for i = 0..7 and _blockB01.s0..s7 for i = 8..15.
// The whole body is wrapped in braces, so the acol* temporaries stay local to one expansion.
#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB00, _blockB01 ) \
{ \
const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \
const Dtype8 acol1 = TRANSPOSE_BLOCK_8( _blockA, 1 ); \
const Dtype8 acol2 = TRANSPOSE_BLOCK_8( _blockA, 2 ); \
const Dtype8 acol3 = TRANSPOSE_BLOCK_8( _blockA, 3 ); \
const Dtype8 acol4 = TRANSPOSE_BLOCK_8( _blockA, 4 ); \
const Dtype8 acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 ); \
const Dtype8 acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 ); \
const Dtype8 acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 ); \
const Dtype8 acol8 = TRANSPOSE_BLOCK_8( _blockA, 8 ); \
const Dtype8 acol9 = TRANSPOSE_BLOCK_8( _blockA, 9 ); \
const Dtype8 acola = TRANSPOSE_BLOCK_8( _blockA, 10 ); \
const Dtype8 acolb = TRANSPOSE_BLOCK_8( _blockA, 11 ); \
const Dtype8 acolc = TRANSPOSE_BLOCK_8( _blockA, 12 ); \
const Dtype8 acold = TRANSPOSE_BLOCK_8( _blockA, 13 ); \
const Dtype8 acole = TRANSPOSE_BLOCK_8( _blockA, 14 ); \
const Dtype8 acolf = TRANSPOSE_BLOCK_8( _blockA, 15 ); \
_result = mad( (Dtype8)(_blockB00.s0), acol0, _result ); \
_result = mad( (Dtype8)(_blockB00.s1), acol1, _result ); \
_result = mad( (Dtype8)(_blockB00.s2), acol2, _result ); \
_result = mad( (Dtype8)(_blockB00.s3), acol3, _result ); \
_result = mad( (Dtype8)(_blockB00.s4), acol4, _result ); \
_result = mad( (Dtype8)(_blockB00.s5), acol5, _result ); \
_result = mad( (Dtype8)(_blockB00.s6), acol6, _result ); \
_result = mad( (Dtype8)(_blockB00.s7), acol7, _result ); \
_result = mad( (Dtype8)(_blockB01.s0), acol8, _result ); \
_result = mad( (Dtype8)(_blockB01.s1), acol9, _result ); \
_result = mad( (Dtype8)(_blockB01.s2), acola, _result ); \
_result = mad( (Dtype8)(_blockB01.s3), acolb, _result ); \
_result = mad( (Dtype8)(_blockB01.s4), acolc, _result ); \
_result = mad( (Dtype8)(_blockB01.s5), acold, _result ); \
_result = mad( (Dtype8)(_blockB01.s6), acole, _result ); \
_result = mad( (Dtype8)(_blockB01.s7), acolf, _result ); \
}
#else
#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB ) \ #define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB ) \
{ \ { \
const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \ const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \
@ -191,7 +261,50 @@
_result = mad( (Dtype8)(_blockB.s6), acol6, _result ); \ _result = mad( (Dtype8)(_blockB.s6), acol6, _result ); \
_result = mad( (Dtype8)(_blockB.s7), acol7, _result ); \ _result = mad( (Dtype8)(_blockB.s7), acol7, _result ); \
} }
#endif
#if TYPE == TYPE_HALF
// Half-precision GEMM_NN(ALPHA1, BETA_NOT0): emits the kernel
// gemm_32_1_NN_<ALPHA1>_<BETA_NOT0> suffixed with the expansion of Dtype (via TEMPLATE).
//   A, B            - read-only images fetched with sub-group block reads.
//   MATC_PARAMETER  - the C/destination arguments, defined by the surrounding file.
//   alpha_in/beta_in - passed as KERNEL_ARG_DTYPE and narrowed to Dtype here.
//   width0          - loop bound compared against coordB.y.
//   isFirstColBlock - not used in this macro text; consumed, if at all, by GEMM_OUTPUT.
// Each loop iteration reads two consecutive row blocks of B (blockB00, then blockB01),
// four 8-row blocks of A, and advances A by TILE_K * SIZE_OF_ELEMENT * 2 in x.
// The second B read must happen at coordB *after* it has been advanced by TILE_K:
// MULTIPLY_BLOCKS_8x8 pairs blockB01 with columns 8..15 of the A block, so it has to
// hold the rows that follow blockB00 rather than a second copy of the same rows.
#define GEMM_NN(ALPHA1, BETA_NOT0) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
__kernel void TEMPLATE(gemm_32_1_NN_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \
__read_only image2d_t A, \
__read_only image2d_t B, \
MATC_PARAMETER, \
KERNEL_ARG_DTYPE alpha_in, \
KERNEL_ARG_DTYPE beta_in, \
int width0, \
int isFirstColBlock) \
{ \
const Dtype alpha = (Dtype)alpha_in; \
const Dtype beta = (Dtype)beta_in; \
const int group_x = get_group_id(0); \
const int group_y = get_group_id(1); \
Dtype8 blockAxB00 = 0; \
Dtype8 blockAxB01 = 0; \
Dtype8 blockAxB02 = 0; \
Dtype8 blockAxB03 = 0; \
int2 coordA = (int2)( 0, group_y * TILE_M ); \
int2 coordB = (int2)( ( group_x * TILE_N ) * SIZE_OF_ELEMENT, 0 ); \
do \
{ \
int2 coordBTemp = coordB; \
Dtype8 blockB00 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) ); coordB.y += TILE_K; \
Dtype8 blockB01 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordB ) ); coordB.y += TILE_K; \
int2 coordATemp = coordA; \
Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.x += TILE_K * SIZE_OF_ELEMENT * 2; \
MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, blockB01 ); \
MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00, blockB01 ); \
MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00, blockB01 ); \
MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00, blockB01 ); \
} \
while( coordB.y < width0 ); \
GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
}
#else
#define GEMM_NN(ALPHA1, BETA_NOT0) \ #define GEMM_NN(ALPHA1, BETA_NOT0) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \ __attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \ __attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
@ -231,6 +344,7 @@ __kernel void TEMPLATE(gemm_32_1_NN_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \
while( coordB.y < width0 ); \ while( coordB.y < width0 ); \
GEMM_OUTPUT(ALPHA1, BETA_NOT0); \ GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
} }
#endif
GEMM_NN(1, 0) // ALPHA == 1, BETA == 0 GEMM_NN(1, 0) // ALPHA == 1, BETA == 0
GEMM_NN(1, 1) // ALPHA == 1, BETA != 0 GEMM_NN(1, 1) // ALPHA == 1, BETA != 0
@ -264,6 +378,45 @@ GEMM_NN(0, 1) // ALPHA != 1, BETA != 0
_result = mad( (Dtype8)(_blockB.s7), TRANSPOSE_BLOCK_8(_blockA.s7, _col), _result ); \ _result = mad( (Dtype8)(_blockB.s7), TRANSPOSE_BLOCK_8(_blockA.s7, _col), _result ); \
} }
#if TYPE == TYPE_HALF
// Half-precision GEMM_TN(ALPHA1, BETA_NOT0): emits the kernel
// gemm_32_1_TN_<ALPHA1>_<BETA_NOT0> suffixed with the expansion of Dtype (via TEMPLATE).
// alpha_in/beta_in are passed as KERNEL_ARG_DTYPE and narrowed to Dtype.
// Per loop iteration: one block read of B (coordB.y advances by TILE_K) and two block
// reads of A that are 16 * SIZE_OF_ELEMENT apart in x (coordA.y advances by TILE_K).
// Each A block feeds two accumulators through the 4-argument MULTIPLY_BLOCKS_8x8 with a
// last argument of 0 and 8.
// NOTE(review): that 4-argument form is the (_result, _blockA, _blockB, _col) variant
// defined before this macro - confirm it is the definition in effect here.
#define GEMM_TN(ALPHA1, BETA_NOT0) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
__kernel void TEMPLATE(gemm_32_1_TN_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \
__read_only image2d_t A, \
__read_only image2d_t B, \
MATC_PARAMETER, \
KERNEL_ARG_DTYPE alpha_in, \
KERNEL_ARG_DTYPE beta_in, \
int width0, \
int isFirstColBlock) \
{ \
const Dtype alpha = (Dtype)alpha_in; \
const Dtype beta = (Dtype)beta_in; \
const int group_x = get_group_id(0);\
const int group_y = get_group_id(1);\
Dtype8 blockAxB00 = 0;\
Dtype8 blockAxB01 = 0;\
Dtype8 blockAxB02 = 0;\
Dtype8 blockAxB03 = 0;\
int2 coordA = (int2)( group_y * TILE_M * SIZE_OF_ELEMENT, 0 );\
int2 coordB = (int2)( ( group_x * TILE_N ) * SIZE_OF_ELEMENT, 0 );\
do\
{\
int2 coordBTemp = coordB;\
Dtype8 blockB00 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) ); coordB.y += TILE_K;\
int2 coordATemp = coordA;\
Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 16 * SIZE_OF_ELEMENT;\
Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.y += TILE_K;\
MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, 0); \
MULTIPLY_BLOCKS_8x8( blockAxB01, blockA00, blockB00, 8); \
MULTIPLY_BLOCKS_8x8( blockAxB02, blockA01, blockB00, 0); \
MULTIPLY_BLOCKS_8x8( blockAxB03, blockA01, blockB00, 8); \
} \
while( coordB.y < width0 ); \
GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
}
#else
#define GEMM_TN(ALPHA1, BETA_NOT0) \ #define GEMM_TN(ALPHA1, BETA_NOT0) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \ __attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \ __attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
@ -303,6 +456,7 @@ __kernel void TEMPLATE(gemm_32_1_TN_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \
while( coordB.y < width0 ); \ while( coordB.y < width0 ); \
GEMM_OUTPUT(ALPHA1, BETA_NOT0); \ GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
} }
#endif
GEMM_TN(1, 0) // ALPHA == 1, BETA == 0 GEMM_TN(1, 0) // ALPHA == 1, BETA == 0
GEMM_TN(1, 1) // ALPHA == 1, BETA != 0 GEMM_TN(1, 1) // ALPHA == 1, BETA != 0
@ -324,6 +478,7 @@ GEMM_TN(0, 1) // ALPHA != 1, BETA != 0
intel_sub_group_shuffle( _block.s6, _col), \ intel_sub_group_shuffle( _block.s6, _col), \
intel_sub_group_shuffle( _block.s7, _col) ) intel_sub_group_shuffle( _block.s7, _col) )
#if TYPE == TYPE_HALF
#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB ) \ #define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB ) \
{ \ { \
const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \ const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \
@ -334,6 +489,14 @@ GEMM_TN(0, 1) // ALPHA != 1, BETA != 0
const Dtype8 acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 ); \ const Dtype8 acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 ); \
const Dtype8 acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 ); \ const Dtype8 acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 ); \
const Dtype8 acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 ); \ const Dtype8 acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 ); \
const Dtype8 acol8 = TRANSPOSE_BLOCK_8( _blockA, 8 ); \
const Dtype8 acol9 = TRANSPOSE_BLOCK_8( _blockA, 9 ); \
const Dtype8 acola = TRANSPOSE_BLOCK_8( _blockA, 10 ); \
const Dtype8 acolb = TRANSPOSE_BLOCK_8( _blockA, 11 ); \
const Dtype8 acolc = TRANSPOSE_BLOCK_8( _blockA, 12 ); \
const Dtype8 acold = TRANSPOSE_BLOCK_8( _blockA, 13 ); \
const Dtype8 acole = TRANSPOSE_BLOCK_8( _blockA, 14 ); \
const Dtype8 acolf = TRANSPOSE_BLOCK_8( _blockA, 15 ); \
_result = mad( (Dtype8)_blockB.s0, acol0, _result ); \ _result = mad( (Dtype8)_blockB.s0, acol0, _result ); \
_result = mad( (Dtype8)_blockB.s1, acol1, _result ); \ _result = mad( (Dtype8)_blockB.s1, acol1, _result ); \
_result = mad( (Dtype8)_blockB.s2, acol2, _result ); \ _result = mad( (Dtype8)_blockB.s2, acol2, _result ); \
@ -342,8 +505,80 @@ GEMM_TN(0, 1) // ALPHA != 1, BETA != 0
_result = mad( (Dtype8)_blockB.s5, acol5, _result ); \ _result = mad( (Dtype8)_blockB.s5, acol5, _result ); \
_result = mad( (Dtype8)_blockB.s6, acol6, _result ); \ _result = mad( (Dtype8)_blockB.s6, acol6, _result ); \
_result = mad( (Dtype8)_blockB.s7, acol7, _result ); \ _result = mad( (Dtype8)_blockB.s7, acol7, _result ); \
_result = mad( (Dtype8)_blockB.s8, acol8, _result ); \
_result = mad( (Dtype8)_blockB.s9, acol9, _result ); \
_result = mad( (Dtype8)_blockB.sa, acola, _result ); \
_result = mad( (Dtype8)_blockB.sb, acolb, _result ); \
_result = mad( (Dtype8)_blockB.sc, acolc, _result ); \
_result = mad( (Dtype8)_blockB.sd, acold, _result ); \
_result = mad( (Dtype8)_blockB.se, acole, _result ); \
_result = mad( (Dtype8)_blockB.sf, acolf, _result ); \
} }
#else
#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB ) \
{ \
const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \
const Dtype8 acol1 = TRANSPOSE_BLOCK_8( _blockA, 1 ); \
const Dtype8 acol2 = TRANSPOSE_BLOCK_8( _blockA, 2 ); \
const Dtype8 acol3 = TRANSPOSE_BLOCK_8( _blockA, 3 ); \
const Dtype8 acol4 = TRANSPOSE_BLOCK_8( _blockA, 4 ); \
const Dtype8 acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 ); \
const Dtype8 acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 ); \
const Dtype8 acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 ); \
_result = mad( (Dtype8)_blockB.s0, acol0, _result ); \
_result = mad( (Dtype8)_blockB.s1, acol1, _result ); \
_result = mad( (Dtype8)_blockB.s2, acol2, _result ); \
_result = mad( (Dtype8)_blockB.s3, acol3, _result ); \
_result = mad( (Dtype8)_blockB.s4, acol4, _result ); \
_result = mad( (Dtype8)_blockB.s5, acol5, _result ); \
_result = mad( (Dtype8)_blockB.s6, acol6, _result ); \
_result = mad( (Dtype8)_blockB.s7, acol7, _result ); \
}
#endif
#if TYPE == TYPE_HALF
// Half-precision GEMM_NT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE): emits the kernel
// gemm_32_1_NT_<VECSCALAR>_<ALPHA1>_<BETA_NOT0> suffixed with the expansion of Dtype.
// How B is passed (MATB_PARAMETER) and fetched (BLOCKB_READ8) is decided by whichever
// definitions of those macros are active when GEMM_NT is instantiated.
// alpha_in/beta_in are passed as KERNEL_ARG_DTYPE and narrowed to Dtype.
// Per loop iteration: BLOCKB_READ8 fills a Dtype16 block of B (and advances coordB.x),
// four 8-row blocks of A are read, and coordA.x advances by TILE_K * SIZE_OF_ELEMENT * 2.
// The loop runs while coordB.x < padded_k / VECSIZE; `k` and `isFirstColBlock` are not
// referenced directly in this macro text.
#define GEMM_NT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
__kernel void TEMPLATE(gemm_32_1_NT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \
__read_only image2d_t A, \
MATB_PARAMETER, \
MATC_PARAMETER, \
KERNEL_ARG_DTYPE alpha_in, \
KERNEL_ARG_DTYPE beta_in, \
int padded_k, \
int k, \
int isFirstColBlock) \
{ \
const Dtype alpha = (Dtype)alpha_in; \
const Dtype beta = (Dtype)beta_in; \
const int group_x = get_group_id(0); \
const int group_y = get_group_id(1); \
Dtype8 blockAxB00 = 0; \
Dtype8 blockAxB01 = 0; \
Dtype8 blockAxB02 = 0; \
Dtype8 blockAxB03 = 0; \
int2 coordA = (int2)( 0, group_y * TILE_M ); \
int2 coordB = (int2)( 0, ( group_x * TILE_N )); \
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \
do \
{ \
Dtype16 blockB00; \
BLOCKB_READ8(blockB00, B, coordB); \
int2 coordATemp = coordA; \
Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.x += TILE_K * SIZE_OF_ELEMENT * 2; \
MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00 ); \
MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00 ); \
MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00 ); \
MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00 ); \
} \
while( coordB.x < padded_k / VECSIZE ); \
GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
}
#else
#define GEMM_NT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \ #define GEMM_NT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \ __attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \ __attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
@ -385,12 +620,23 @@ __kernel void TEMPLATE(gemm_32_1_NT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0,Dt
while( coordB.x < padded_k / VECSIZE ); \ while( coordB.x < padded_k / VECSIZE ); \
GEMM_OUTPUT(ALPHA1, BETA_NOT0); \ GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
} }
#endif
#if TYPE == TYPE_HALF
// Half-precision BLOCKB_READ8 for the image-backed, 4-wide variant of B:
// starting from _coordB with y offset by the work-item's local id, reads four
// horizontally adjacent pixels with READ_IMAGE and packs them into the sixteen
// components of _blockb (s0123, s4567, s89ab, scdef), then advances _coordB.x by 4.
// Declares _coordBTemp in the expanding scope, so it can be expanded once per scope.
#define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \
_coordBTemp.y += get_local_id(0); \
_blockb.s0123 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s4567 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s89ab = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.scdef = READ_IMAGE(_B, _coordBTemp); _coordB.x += 4;
#else
#define BLOCKB_READ8(_blockb, _B, _coordB) \ #define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \ int2 _coordBTemp = _coordB; \
_coordBTemp.y += get_local_id(0); \ _coordBTemp.y += get_local_id(0); \
_blockb.s0123 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ _blockb.s0123 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s4567 = READ_IMAGE(_B, _coordBTemp); _coordB.x += 2; _blockb.s4567 = READ_IMAGE(_B, _coordBTemp); _coordB.x += 2;
#endif
#define MATB_PARAMETER __read_only image2d_t B #define MATB_PARAMETER __read_only image2d_t B
@ -401,12 +647,21 @@ GEMM_NT(0, 1, VEC4, 4) // ALPHA != 1, BETA != 0
#undef BLOCKB_READ8 #undef BLOCKB_READ8
#undef MATB_PARAMETER #undef MATB_PARAMETER
#if TYPE == TYPE_HALF
// Half-precision BLOCKB_READ8 for the buffer-backed variant of B:
// computes the address _B + y * ldb + x + offB (y offset by the work-item's local id),
// views it as a float pointer, loads eight floats with vload8 and reinterprets those
// bits as sixteen Dtype values, then advances _coordB.x by TILE_K * 2.
// `ldb` and `offB` must be in scope at the expansion site.
#define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \
_coordBTemp.y += get_local_id(0); \
const __global float *B_read = (__global float *)(_B + (_coordBTemp.y * ldb) + _coordBTemp.x + offB); \
_blockb = as_Dtype16(as_ushort16(vload8(0, B_read))); \
_coordB.x += TILE_K * 2;
#else
#define BLOCKB_READ8(_blockb, _B, _coordB) \ #define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \ int2 _coordBTemp = _coordB; \
_coordBTemp.y += get_local_id(0); \ _coordBTemp.y += get_local_id(0); \
const __global Dtype *B_read = (__global Dtype *)(_B + (_coordBTemp.y * ldb) + _coordBTemp.x + offB); \ const __global Dtype *B_read = (__global Dtype *)(_B + (_coordBTemp.y * ldb) + _coordBTemp.x + offB); \
_blockb = vload8(0, B_read); \ _blockb = vload8(0, B_read); \
_coordB.x += TILE_K; _coordB.x += TILE_K;
#endif
#define MATB_PARAMETER __global Dtype *B, int offB, int ldb #define MATB_PARAMETER __global Dtype *B, int offB, int ldb
@ -417,6 +672,45 @@ GEMM_NT(0, 1, BUFFER, 1) // ALPHA != 1, BETA != 0
#undef BLOCKB_READ8 #undef BLOCKB_READ8
#undef MATB_PARAMETER #undef MATB_PARAMETER
#if TYPE == TYPE_HALF
// Half-precision BLOCKB_READ8 for the image-backed, one-value-per-pixel variant of B:
// starting from _coordB with y offset by the work-item's local id, performs sixteen
// READ_IMAGE calls on horizontally adjacent pixels and keeps only component s0 of each,
// filling _blockb.s0 .. _blockb.sf in order, then advances _coordB.x by 16.
// Declares _coordBTemp and temp in the expanding scope.
#define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \
_coordBTemp.y += get_local_id(0); \
Dtype4 temp; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s0 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s1 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s2 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s3 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s4 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s5 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s6 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s7 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s8 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s9 = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.sa = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.sb = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.sc = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.sd = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.se = temp.s0; \
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.sf = temp.s0; \
_coordB.x += 16;
#else
#define BLOCKB_READ8(_blockb, _B, _coordB) \ #define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \ int2 _coordBTemp = _coordB; \
_coordBTemp.y += get_local_id(0); \ _coordBTemp.y += get_local_id(0); \
@ -438,6 +732,7 @@ GEMM_NT(0, 1, BUFFER, 1) // ALPHA != 1, BETA != 0
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
_blockb.s7 = temp.s0; \ _blockb.s7 = temp.s0; \
_coordB.x += 8; _coordB.x += 8;
#endif
#define MATB_PARAMETER __read_only image2d_t B #define MATB_PARAMETER __read_only image2d_t B
@ -483,6 +778,47 @@ GEMM_NT(0, 1, SCALAR, 1) // ALPHA != 1, BETA != 0
_result = mad( (Dtype8)_blockB.s7, acol7, _result ); \ _result = mad( (Dtype8)_blockB.s7, acol7, _result ); \
} }
#if TYPE == TYPE_HALF
/*
 * GEMM_TT (TYPE_HALF build).
 *
 * Expands to one kernel whose name is built by TEMPLATE from
 * gemm_32_1_TT_<VECSCALAR>_<ALPHA1>_<BETA_NOT0> and Dtype. The kernel requires
 * a work-group of (SIMD_SIZE_GEMM, 1, 1) and an Intel sub-group size of
 * SIMD_SIZE_GEMM.
 *
 * Macro parameters:
 *   ALPHA1, BETA_NOT0 - pasted into the kernel name and forwarded to GEMM_OUTPUT.
 *   VECSCALAR         - pasted into the kernel name only.
 *   VECSIZE           - divisor of padded_k in the loop bound.
 *
 * Kernel body as visible here:
 *   - alpha_in/beta_in arrive as KERNEL_ARG_DTYPE and are cast to Dtype.
 *   - coordA starts at (group_y * TILE_M * SIZE_OF_ELEMENT, 0),
 *     coordB at (0, group_x * TILE_N).
 *   - Each do/while iteration reads one B block through BLOCKB_READ8 (which
 *     also advances coordB.x), reads two A blocks through SUBGROUP_BLOCK_READ8
 *     that are 16 * SIZE_OF_ELEMENT apart in x, advances coordA.y by TILE_K,
 *     and updates the four Dtype8 accumulators blockAxB00..03 with
 *     MULTIPLY_BLOCKS_8x8 (A block 0/1 combined with offsets 0 and 8).
 *   - The loop repeats while coordB.x < padded_k / VECSIZE, then GEMM_OUTPUT
 *     is expanded.
 *
 * `sampler`, `alpha`, `beta`, `k`, `isFirstColBlock` and the accumulators are
 * not referenced by name in this macro text; presumably the helper macros
 * (READ_IMAGE / BLOCKB_READ8 / GEMM_OUTPUT, defined elsewhere) pick them up
 * from the enclosing scope -- confirm at their definitions.
 */
#define GEMM_TT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
__kernel void TEMPLATE(gemm_32_1_TT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \
__read_only image2d_t A, \
MATB_PARAMETER, \
MATC_PARAMETER, \
KERNEL_ARG_DTYPE alpha_in, \
KERNEL_ARG_DTYPE beta_in, \
int padded_k, \
int k, \
int isFirstColBlock) \
{ \
const Dtype alpha = (Dtype)alpha_in; \
const Dtype beta = (Dtype)beta_in; \
const int group_x = get_group_id(0); \
const int group_y = get_group_id(1); \
Dtype8 blockAxB00 = 0; \
Dtype8 blockAxB01 = 0; \
Dtype8 blockAxB02 = 0; \
Dtype8 blockAxB03 = 0; \
int2 coordA = (int2)( group_y * TILE_M * SIZE_OF_ELEMENT, 0 ); \
int2 coordB = (int2)( 0, ( group_x * TILE_N )); \
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \
do \
{ \
Dtype8 blockB00; \
BLOCKB_READ8(blockB00, B, coordB); \
int2 coordATemp = coordA; \
Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 16 * SIZE_OF_ELEMENT;\
Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.y += TILE_K;\
MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, 0); \
MULTIPLY_BLOCKS_8x8( blockAxB01, blockA00, blockB00, 8); \
MULTIPLY_BLOCKS_8x8( blockAxB02, blockA01, blockB00, 0); \
MULTIPLY_BLOCKS_8x8( blockAxB03, blockA01, blockB00, 8); \
} \
while( coordB.x < padded_k / VECSIZE ); \
GEMM_OUTPUT(ALPHA1, BETA_NOT0);\
}
#else
#define GEMM_TT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \ #define GEMM_TT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \ __attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \ __attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
@ -524,6 +860,7 @@ __kernel void TEMPLATE(gemm_32_1_TT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0, D
while( coordB.x < padded_k / VECSIZE ); \ while( coordB.x < padded_k / VECSIZE ); \
GEMM_OUTPUT(ALPHA1, BETA_NOT0);\ GEMM_OUTPUT(ALPHA1, BETA_NOT0);\
} }
#endif
#define BLOCKB_READ8(_blockb, _B, _coordB) \ #define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \ int2 _coordBTemp = _coordB; \
@ -540,12 +877,21 @@ GEMM_TT(0, 1, VEC4, 4) // ALPHA != 1, BETA != 0
#undef BLOCKB_READ8 #undef BLOCKB_READ8
#undef MATB_PARAMETER #undef MATB_PARAMETER
#if TYPE == TYPE_HALF
/*
 * BLOCKB_READ8 (TYPE_HALF build, buffer-backed B).
 *
 * Computes the address _B + (_coordB.y + get_local_id(0)) * k + _coordB.x + offB,
 * reinterprets it as a pointer to float and loads four floats (16 bytes) with
 * vload4. Those bits are reinterpreted, not converted, as ushort8 and then as
 * Dtype8 -- i.e. eight 16-bit elements fetched in one load.
 * Afterwards the caller's _coordB.x is advanced by TILE_K.
 *
 * Notes:
 *  - `k` and `offB` are taken from the expanding scope; the row stride used is
 *    `k`, not the `ldb` argument that the buffer MATB_PARAMETER declares.
 *  - The float-pointer cast presumably relies on the computed address being
 *    suitably aligned for float access -- TODO confirm against the callers.
 *  - Declares `_coordBTemp` and `B_read` in the enclosing scope (no braces),
 *    so it can be expanded at most once per scope.
 */
#define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \
_coordBTemp.y += get_local_id(0); \
const __global float *B_read = (__global float *)(_B + (_coordBTemp.y * k) + _coordBTemp.x + offB); \
_blockb = as_Dtype8(as_ushort8(vload4(0, B_read))); \
_coordB.x += TILE_K;
#else
#define BLOCKB_READ8(_blockb, _B, _coordB) \ #define BLOCKB_READ8(_blockb, _B, _coordB) \
int2 _coordBTemp = _coordB; \ int2 _coordBTemp = _coordB; \
_coordBTemp.y += get_local_id(0); \ _coordBTemp.y += get_local_id(0); \
const __global Dtype *B_read = (__global Dtype *)(_B + (_coordBTemp.y * k) + _coordBTemp.x + offB); \ const __global Dtype *B_read = (__global Dtype *)(_B + (_coordBTemp.y * k) + _coordBTemp.x + offB); \
_blockb = vload8(0, B_read); \ _blockb = vload8(0, B_read); \
_coordB.x += TILE_K; _coordB.x += TILE_K;
#endif
#define MATB_PARAMETER __global Dtype *B, int offB, int ldb #define MATB_PARAMETER __global Dtype *B, int offB, int ldb
@ -598,7 +944,7 @@ GEMM_TT(0, 1, SCALAR, 1) // ALPHA != 1, BETA != 0
#undef READ_IMAGE #undef READ_IMAGE
#undef SIZE_OF_ELEMENT #undef SIZE_OF_ELEMENT
__kernel void TEMPLATE(gemm_buffer_copy_image_transpose,Dtype)( __kernel void TEMPLATE(gemm_buffer_copy_image_transpose, Dtype)(
__global Dtype* A, __global Dtype* A,
__write_only image2d_t ImA, __write_only image2d_t ImA,
int offA, int offA,
@ -611,10 +957,14 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_transpose,Dtype)(
int2 coord_dst = (int2)(gidx, gidy); int2 coord_dst = (int2)(gidx, gidy);
__global Dtype* A_off = A + offA; __global Dtype* A_off = A + offA;
Dtype srcA = A_off[gidy * ldA + gidx]; Dtype srcA = A_off[gidy * ldA + gidx];
#if TYPE == TYPE_HALF
write_imageh(ImA, coord_dst, (Dtype4)srcA);
#else
write_imagef(ImA, coord_dst, (Dtype4)srcA); write_imagef(ImA, coord_dst, (Dtype4)srcA);
#endif
} }
__kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose,Dtype)( __kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose, Dtype)(
__global Dtype* A, __global Dtype* A,
__write_only image2d_t ImA, __write_only image2d_t ImA,
int offA, int offA,
@ -625,6 +975,14 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose,Dtype)(
const int gidx = get_global_id(0); const int gidx = get_global_id(0);
const int gidy = get_global_id(1); const int gidy = get_global_id(1);
int2 coord_dst = (int2)(gidx, gidy); int2 coord_dst = (int2)(gidx, gidy);
#if TYPE == TYPE_HALF
if (gidx >= width || gidy >= height) {
write_imageh(ImA, coord_dst, 0);
return;
}
__global Dtype* A_off = A + offA;
write_imageh(ImA, coord_dst, A_off[gidy * ldA + gidx]);
#else
if (gidx >= width || gidy >= height) { if (gidx >= width || gidy >= height) {
write_imageui(ImA, coord_dst, (uint4)0); write_imageui(ImA, coord_dst, (uint4)0);
return; return;
@ -632,4 +990,5 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose,Dtype)(
__global Dtype* A_off = A + offA; __global Dtype* A_off = A + offA;
uint4 srcA = convert_uint4(as_uchar4(A_off[gidy * ldA + gidx])); uint4 srcA = convert_uint4(as_uchar4(A_off[gidy * ldA + gidx]));
write_imageui(ImA, coord_dst, srcA); write_imageui(ImA, coord_dst, srcA);
#endif
} }

@ -40,16 +40,20 @@
// //
//M*/ //M*/
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#define CONCAT(A,B) A##_##B #define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type) #define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float #define KERNEL_ARG_DTYPE float
/*
 * axpy kernel (shown here as a side-by-side diff: old text, then new text, on each line).
 *
 * Each work-item starts at get_global_id(0) and steps by get_global_size(0)
 * while index < n, updating
 *     y[offy + index] = alpha * x[offx + index] + y[offy + index].
 * offx / offy are element offsets added to every index into x / y.
 * In the right-hand (new) column alpha is passed as KERNEL_ARG_DTYPE and is
 * converted with convert_Dtype before the multiply; the left-hand column
 * takes alpha as Dtype directly.
 */
__kernel void TEMPLATE(axpy,Dtype)(const int n, const Dtype alpha, __global const Dtype* x, __kernel void TEMPLATE(axpy,Dtype)(const int n, const KERNEL_ARG_DTYPE alpha, __global const Dtype* x,
const int offx, __global Dtype* y, const int offx, __global Dtype* y,
const int offy) { const int offy) {
for (int index = get_global_id(0); index < n; index += get_global_size(0)) { for (int index = get_global_id(0); index < n; index += get_global_size(0)) {
Dtype src = x[offx + index]; Dtype src = x[offx + index];
Dtype dst = y[offy + index]; Dtype dst = y[offy + index];
y[offy + index] = alpha * src + dst; y[offy + index] = convert_Dtype(alpha) * src + dst;
} }
} }

@ -39,41 +39,45 @@
// //
//M*/ //M*/
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#define CONCAT(A,B) A##_##B #define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type) #define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float #define KERNEL_ARG_DTYPE float
__kernel void TEMPLATE(matvec_mul4,Dtype)( __kernel void TEMPLATE(matvec_mul4,Dtype)(
__global const float * A, __global const Dtype * A,
int offA, int offA,
unsigned int A_col_size, unsigned int A_col_size,
unsigned int trail_item, unsigned int trail_item,
__global const float * v, __global const Dtype * v,
int offv, int offv,
float alpha, KERNEL_ARG_DTYPE alpha,
float beta, KERNEL_ARG_DTYPE beta,
__global float4 * result, __global Dtype4* result,
int offr, int offr,
__local float4 * work) __local Dtype4* work)
{ {
unsigned int row_gid = get_group_id(0); unsigned int row_gid = get_group_id(0);
unsigned int lid = get_local_id(0); unsigned int lid = get_local_id(0);
const __global float *src0_read = A + row_gid * 4 * A_col_size + offA; const __global Dtype *src0_read = A + row_gid * 4 * A_col_size + offA;
const __global float *src1_read = v + offv; const __global Dtype *src1_read = v + offv;
result = (__global float4*)((__global float*)result + offr); result = (__global Dtype4*)((__global Dtype*)result + offr);
float4 dot0 = (float4)(0.f); Dtype4 dot0 = (Dtype4)(0.f);
float4 dot1 = (float4)(0.f); Dtype4 dot1 = (Dtype4)(0.f);
float4 dot2 = (float4)(0.f); Dtype4 dot2 = (Dtype4)(0.f);
float4 dot3 = (float4)(0.f); Dtype4 dot3 = (Dtype4)(0.f);
unsigned int i = lid; unsigned int i = lid;
while( i < A_col_size / 4) { while( i < A_col_size / 4) {
const float4 a0 = vload4(i, src0_read); const Dtype4 a0 = vload4(i, src0_read);
const float4 a1 = vload4(i, src0_read + A_col_size); const Dtype4 a1 = vload4(i, src0_read + A_col_size);
const float4 a2 = vload4(i, src0_read + 2 * A_col_size); const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size);
const float4 a3 = vload4(i, src0_read + 3 * A_col_size); const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size);
const float4 b0 = vload4(i, src1_read); const Dtype4 b0 = vload4(i, src1_read);
dot0 += a0 * b0; dot0 += a0 * b0;
dot1 += a1 * b0; dot1 += a1 * b0;
@ -92,15 +96,15 @@ __kernel void TEMPLATE(matvec_mul4,Dtype)(
{ {
if(trail_item != 0) if(trail_item != 0)
{ {
const __global float *src0_trail = src0_read + i * 4; const __global Dtype *src0_trail = src0_read + i * 4;
const __global float *src1_trail = src1_read + i * 4; const __global Dtype *src1_trail = src1_read + i * 4;
for(unsigned int i = 0; i < trail_item; ++i) { for(unsigned int i = 0; i < trail_item; ++i) {
const float at0 = src0_trail[i]; const Dtype at0 = src0_trail[i];
const float at1 = src0_trail[i + A_col_size]; const Dtype at1 = src0_trail[i + A_col_size];
const float at2 = src0_trail[i + 2 * A_col_size]; const Dtype at2 = src0_trail[i + 2 * A_col_size];
const float at3 = src0_trail[i + 3 * A_col_size]; const Dtype at3 = src0_trail[i + 3 * A_col_size];
const float bt = src1_trail[i]; const Dtype bt = src1_trail[i];
work[lid].s0 += at0 * bt; work[lid].s0 += at0 * bt;
work[lid].s1 += at1 * bt; work[lid].s1 += at1 * bt;
@ -118,40 +122,40 @@ __kernel void TEMPLATE(matvec_mul4,Dtype)(
} }
if(lid == 0) { if(lid == 0) {
if(beta == (Dtype)0) if(beta == (Dtype)0)
result[row_gid] = alpha * work[0]; result[row_gid] = convert_Dtype(alpha) * work[0];
else else
result[row_gid] = alpha * work[0] + beta * result[row_gid]; result[row_gid] = convert_Dtype(alpha) * work[0] + convert_Dtype(beta) * result[row_gid];
} }
} }
/* This kernel used for the trailing rows when row_of_A %4 !=0 */ /* This kernel used for the trailing rows when row_of_A %4 !=0 */
__kernel void TEMPLATE(matvec_mul1,Dtype)( __kernel void TEMPLATE(matvec_mul1,Dtype)(
__global const float * A, __global const Dtype * A,
int offA, int offA,
unsigned int A_col_size, unsigned int A_col_size,
unsigned int row_offset, unsigned int row_offset,
unsigned int trail_item, unsigned int trail_item,
__global const float * v, __global const Dtype * v,
int offv, int offv,
float alpha, KERNEL_ARG_DTYPE alpha,
float beta, KERNEL_ARG_DTYPE beta,
__global float * result, __global Dtype * result,
int offr, int offr,
__local float * work) __local Dtype * work)
{ {
unsigned int row_gid = get_group_id(0); unsigned int row_gid = get_group_id(0);
unsigned int lid = get_local_id(0); unsigned int lid = get_local_id(0);
const __global float *src0_read = A + (row_offset + row_gid) * A_col_size + offA; const __global Dtype *src0_read = A + (row_offset + row_gid) * A_col_size + offA;
const __global float *src1_read = v + + offv; const __global Dtype *src1_read = v + + offv;
result = result + offr; result = result + offr;
float4 dot0 = (float4)(0.f); Dtype4 dot0 = (Dtype4)(0.f);
unsigned int i = lid; unsigned int i = lid;
while( i < A_col_size / 4) while( i < A_col_size / 4)
{ {
const float4 a0 = vload4(i, src0_read); const Dtype4 a0 = vload4(i, src0_read);
const float4 b0 = vload4(i, src1_read); const Dtype4 b0 = vload4(i, src1_read);
dot0 += a0 * b0; dot0 += a0 * b0;
i += get_local_size(0); i += get_local_size(0);
@ -163,11 +167,11 @@ __kernel void TEMPLATE(matvec_mul1,Dtype)(
{ {
if(trail_item != 0) if(trail_item != 0)
{ {
const __global float *src0_trail = src0_read + i * 4; const __global Dtype *src0_trail = src0_read + i * 4;
const __global float *src1_trail = src1_read + i * 4; const __global Dtype *src1_trail = src1_read + i * 4;
for(unsigned int i = 0; i < trail_item; ++i) { for(unsigned int i = 0; i < trail_item; ++i) {
const float at0 = src0_trail[i]; const Dtype at0 = src0_trail[i];
const float bt = src1_trail[i]; const Dtype bt = src1_trail[i];
work[lid] += at0 * bt; work[lid] += at0 * bt;
} }
@ -182,10 +186,10 @@ __kernel void TEMPLATE(matvec_mul1,Dtype)(
if(lid == 0) { if(lid == 0) {
if(beta == (Dtype)0) { if(beta == (Dtype)0) {
result[row_gid+row_offset] = alpha * work[0]; result[row_gid+row_offset] = convert_Dtype(alpha) * work[0];
} else { } else {
result[row_gid+row_offset] *= beta; result[row_gid+row_offset] *= convert_Dtype(beta);
result[row_gid+row_offset] += alpha * work[0]; result[row_gid+row_offset] += convert_Dtype(alpha) * work[0];
} }
} }
} }

@ -40,7 +40,11 @@
// //
//M*/ //M*/
#define Dtype float #if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#define Dtype float
#define Dtype4 float4 #define Dtype4 float4
#define Dtype8 float8 #define Dtype8 float8
@ -135,17 +139,17 @@ __kernel void MVN(__global const Dtype* src,
store(dst_vec, dst, index); store(dst_vec, dst, index);
} }
__kernel void MEAN_FUSE(__global const Dtype * A, __kernel void MEAN_FUSE(__global const T * A,
unsigned int A_col_size, unsigned int A_col_size,
float alpha, float alpha,
__global Dtype4 * result, __global T4 * mean,
__global Dtype * B, __global Dtype * tmp,
__local Dtype4 * work) __local Dtype4 * work)
{ {
unsigned int row_gid = get_group_id(0); unsigned int row_gid = get_group_id(0);
unsigned int lid = get_local_id(0); unsigned int lid = get_local_id(0);
const __global Dtype *src0_read = A + row_gid * 4 * A_col_size; const __global T *src0_read = A + row_gid * 4 * A_col_size;
__global Dtype *dst0_read = B + row_gid * 4 * A_col_size; __global Dtype *dst0_read = tmp + row_gid * 4 * A_col_size;
Dtype4 dot0, dot1, dot2, dot3; Dtype4 dot0, dot1, dot2, dot3;
dot0 = dot1 = dot2 = dot3 = (Dtype4)(0.f); dot0 = dot1 = dot2 = dot3 = (Dtype4)(0.f);
@ -153,15 +157,15 @@ __kernel void MEAN_FUSE(__global const Dtype * A,
const Dtype4 b0 = (Dtype4)1.f; const Dtype4 b0 = (Dtype4)1.f;
while( i < A_col_size / 4) while( i < A_col_size / 4)
{ {
const Dtype4 a0 = vload4(i, src0_read); const T4 a0 = vload4(i, src0_read);
const Dtype4 a1 = vload4(i, src0_read + A_col_size); const T4 a1 = vload4(i, src0_read + A_col_size);
const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size); const T4 a2 = vload4(i, src0_read + 2 * A_col_size);
const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size); const T4 a3 = vload4(i, src0_read + 3 * A_col_size);
dot0 += a0; dot0 += convert_float4(a0);
dot1 += a1; dot1 += convert_float4(a1);
dot2 += a2; dot2 += convert_float4(a2);
dot3 += a3; dot3 += convert_float4(a3);
i += get_local_size(0); i += get_local_size(0);
} }
@ -181,22 +185,22 @@ __kernel void MEAN_FUSE(__global const Dtype * A,
if(lid == 0) if(lid == 0)
{ {
result[row_gid] = alpha * work[0]; mean[row_gid] = convert_T(alpha * work[0]);
} }
Dtype4 sum = work[0] * alpha; Dtype4 sum = work[0] * alpha;
i = lid; i = lid;
while( i < A_col_size / 4) while( i < A_col_size / 4)
{ {
const Dtype4 a0 = vload4(i, src0_read); const T4 a0 = vload4(i, src0_read);
const Dtype4 a1 = vload4(i, src0_read + A_col_size); const T4 a1 = vload4(i, src0_read + A_col_size);
const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size); const T4 a2 = vload4(i, src0_read + 2 * A_col_size);
const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size); const T4 a3 = vload4(i, src0_read + 3 * A_col_size);
dot0 = native_powr(a0 - (Dtype4)sum.x, 2); dot0 = native_powr(convert_float4(a0) - (Dtype4)sum.x, 2);
dot1 = native_powr(a1 - (Dtype4)sum.y, 2); dot1 = native_powr(convert_float4(a1) - (Dtype4)sum.y, 2);
dot2 = native_powr(a2 - (Dtype4)sum.z, 2); dot2 = native_powr(convert_float4(a2) - (Dtype4)sum.z, 2);
dot3 = native_powr(a3 - (Dtype4)sum.w, 2); dot3 = native_powr(convert_float4(a3) - (Dtype4)sum.w, 2);
vstore4(dot0, i, dst0_read); vstore4(dot0, i, dst0_read);
vstore4(dot1, i, dst0_read + A_col_size); vstore4(dot1, i, dst0_read + A_col_size);
@ -208,22 +212,22 @@ __kernel void MEAN_FUSE(__global const Dtype * A,
} }
__kernel void MVN_FUSE(__global const Dtype * tmp, __kernel void MVN_FUSE(__global const Dtype * tmp,
__global const Dtype * A, __global const T * A,
__global const Dtype4 * mean, __global const T4 * mean,
unsigned int A_col_size, unsigned int A_col_size,
const float alpha_val, const float alpha_val,
const float eps, const float eps,
const float relu_slope, const float relu_slope,
__global const Dtype4 * bnorm_weight, __global const Dtype4 * bnorm_weight,
__global const Dtype4 * bnorm_bias, __global const Dtype4 * bnorm_bias,
__global Dtype * B, __global T * B,
__local Dtype4 * work) __local Dtype4 * work)
{ {
unsigned int row_gid = get_group_id(0); unsigned int row_gid = get_group_id(0);
unsigned int lid = get_local_id(0); unsigned int lid = get_local_id(0);
const __global Dtype *src0_read = tmp + row_gid * 4 * A_col_size; const __global Dtype *src0_read = tmp + row_gid * 4 * A_col_size;
const __global Dtype *src1_read = A + row_gid * 4 * A_col_size; const __global T *src1_read = A + row_gid * 4 * A_col_size;
__global Dtype *dst0_read = B + row_gid * 4 * A_col_size; __global T *dst0_read = B + row_gid * 4 * A_col_size;
Dtype4 dot0, dot1, dot2, dot3; Dtype4 dot0, dot1, dot2, dot3;
dot0 = dot1 = dot2 = dot3 = (Dtype4)(0.f); dot0 = dot1 = dot2 = dot3 = (Dtype4)(0.f);
@ -257,7 +261,7 @@ __kernel void MVN_FUSE(__global const Dtype * tmp,
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
Dtype4 mean_val = mean[row_gid]; Dtype4 mean_val = convert_float4(mean[row_gid]);
Dtype4 dev_val = sqrt(work[0] * alpha_val) + (Dtype4)eps; Dtype4 dev_val = sqrt(work[0] * alpha_val) + (Dtype4)eps;
Dtype4 alpha = (Dtype4)1.f / dev_val; Dtype4 alpha = (Dtype4)1.f / dev_val;
@ -271,15 +275,15 @@ __kernel void MVN_FUSE(__global const Dtype * tmp,
i = lid; i = lid;
while( i < A_col_size / 4) while( i < A_col_size / 4)
{ {
const Dtype4 a0 = vload4(i, src1_read); const T4 a0 = vload4(i, src1_read);
const Dtype4 a1 = vload4(i, src1_read + A_col_size); const T4 a1 = vload4(i, src1_read + A_col_size);
const Dtype4 a2 = vload4(i, src1_read + 2 * A_col_size); const T4 a2 = vload4(i, src1_read + 2 * A_col_size);
const Dtype4 a3 = vload4(i, src1_read + 3 * A_col_size); const T4 a3 = vload4(i, src1_read + 3 * A_col_size);
dot0 = (a0 - (Dtype4)mean_val.x) * alpha.x; dot0 = (convert_float4(a0) - (Dtype4)mean_val.x) * alpha.x;
dot1 = (a1 - (Dtype4)mean_val.y) * alpha.y; dot1 = (convert_float4(a1) - (Dtype4)mean_val.y) * alpha.y;
dot2 = (a2 - (Dtype4)mean_val.z) * alpha.z; dot2 = (convert_float4(a2) - (Dtype4)mean_val.z) * alpha.z;
dot3 = (a3 - (Dtype4)mean_val.w) * alpha.w; dot3 = (convert_float4(a3) - (Dtype4)mean_val.w) * alpha.w;
dot0 = dot0 * w.x + (Dtype4)b.x; dot0 = dot0 * w.x + (Dtype4)b.x;
dot1 = dot1 * w.y + (Dtype4)b.y; dot1 = dot1 * w.y + (Dtype4)b.y;
@ -300,10 +304,10 @@ __kernel void MVN_FUSE(__global const Dtype * tmp,
dot3 = select(new3, dot3, dot3 > (Dtype4)0.f); dot3 = select(new3, dot3, dot3 > (Dtype4)0.f);
#endif #endif
vstore4(dot0, i, dst0_read); vstore4(convert_T(dot0), i, dst0_read);
vstore4(dot1, i, dst0_read + A_col_size); vstore4(convert_T(dot1), i, dst0_read + A_col_size);
vstore4(dot2, i, dst0_read + 2 * A_col_size); vstore4(convert_T(dot2), i, dst0_read + 2 * A_col_size);
vstore4(dot3, i, dst0_read + 3 * A_col_size); vstore4(convert_T(dot3), i, dst0_read + 3 * A_col_size);
i += get_local_size(0); i += get_local_size(0);
} }

@ -42,14 +42,18 @@
#define CONCAT(A,B) A##_##B #define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type) #define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float #define KERNEL_ARG_DTYPE float
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global const Dtype* in, __kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global const Dtype* in,
const int num, const int channels, const int num, const int channels,
const int height, const int width, const int size, const int height, const int width, const int size,
const Dtype alpha_over_size, const Dtype k, const KERNEL_ARG_DTYPE alpha_over_size, const KERNEL_ARG_DTYPE k,
__global Dtype* const out, __global Dtype* const out,
const Dtype negative_beta) { const KERNEL_ARG_DTYPE negative_beta) {
for (int index = get_global_id(0); index < nthreads; for (int index = get_global_id(0); index < nthreads;
index += get_global_size(0)) { index += get_global_size(0)) {
// find out the local offset // find out the local offset
@ -60,11 +64,11 @@ __kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global con
const int step = height * width; const int step = height * width;
__global const Dtype* in_off = in + offset; __global const Dtype* in_off = in + offset;
__global Dtype* out_off = out + offset; __global Dtype* out_off = out + offset;
Dtype scale_val; KERNEL_ARG_DTYPE scale_val;
int head = 0; int head = 0;
const int pre_pad = (size - 1) / 2; const int pre_pad = (size - 1) / 2;
const int post_pad = size - pre_pad - 1; const int post_pad = size - pre_pad - 1;
Dtype accum_scale = 0; KERNEL_ARG_DTYPE accum_scale = 0;
// fill the scale at [n, :, h, w] // fill the scale at [n, :, h, w]
// accumulate values // accumulate values
while (head < post_pad && head < channels) { while (head < post_pad && head < channels) {
@ -79,7 +83,7 @@ __kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global con
* in_off[(head - size) * step]; * in_off[(head - size) * step];
} }
scale_val = k + accum_scale * alpha_over_size; scale_val = k + accum_scale * alpha_over_size;
out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((float)scale_val, (float)negative_beta); out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((Dtype)scale_val, (Dtype)negative_beta);
++head; ++head;
} }
// subtract only // subtract only
@ -89,7 +93,7 @@ __kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global con
* in_off[(head - size) * step]; * in_off[(head - size) * step];
} }
scale_val = k + accum_scale * alpha_over_size; scale_val = k + accum_scale * alpha_over_size;
out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((float)scale_val, (float)negative_beta); out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((Dtype)scale_val, (Dtype)negative_beta);
++head; ++head;
} }
} }

@ -42,7 +42,10 @@
#define CONCAT(A,B) A##_##B #define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type) #define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#if defined KERNEL_MAX_POOL #if defined KERNEL_MAX_POOL

@ -40,7 +40,9 @@
// //
//M*/ //M*/
#define Dtype float #if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__kernel void permute(const int nthreads, __kernel void permute(const int nthreads,
__global Dtype* bottom_data, __global Dtype* bottom_data,

@ -39,17 +39,18 @@
// //
//M*/ //M*/
#define Dtype float #if defined(cl_khr_fp16)
#define Dtype4 float4 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__kernel void prior_box(const int nthreads, __kernel void prior_box(const int nthreads,
const Dtype stepX, const float stepX,
const Dtype stepY, const float stepY,
__global const Dtype* _offsetsX, __global const float* _offsetsX,
__global const Dtype* _offsetsY, __global const float* _offsetsY,
const int offsetsX_size, const int offsetsX_size,
__global const Dtype* _widths, __global const float* _widths,
__global const Dtype* _heights, __global const float* _heights,
const int widths_size, const int widths_size,
__global Dtype* dst, __global Dtype* dst,
const int _layerHeight, const int _layerHeight,
@ -65,7 +66,7 @@ __kernel void prior_box(const int nthreads,
outputPtr = dst + index * 4 * offsetsX_size * widths_size; outputPtr = dst + index * 4 * offsetsX_size * widths_size;
Dtype _boxWidth, _boxHeight; float _boxWidth, _boxHeight;
Dtype4 vec; Dtype4 vec;
for (int i = 0; i < widths_size; ++i) for (int i = 0; i < widths_size; ++i)
{ {
@ -73,8 +74,8 @@ __kernel void prior_box(const int nthreads,
_boxHeight = _heights[i]; _boxHeight = _heights[i];
for (int j = 0; j < offsetsX_size; ++j) for (int j = 0; j < offsetsX_size; ++j)
{ {
float center_x = (w + _offsetsX[j]) * stepX; Dtype center_x = (w + _offsetsX[j]) * (Dtype)stepX;
float center_y = (h + _offsetsY[j]) * stepY; Dtype center_y = (h + _offsetsY[j]) * (Dtype)stepY;
vec.x = (center_x - _boxWidth * 0.5f) / imgWidth; // xmin vec.x = (center_x - _boxWidth * 0.5f) / imgWidth; // xmin
vec.y = (center_y - _boxHeight * 0.5f) / imgHeight; // ymin vec.y = (center_y - _boxHeight * 0.5f) / imgHeight; // ymin
@ -91,7 +92,7 @@ __kernel void prior_box(const int nthreads,
__kernel void set_variance(const int nthreads, __kernel void set_variance(const int nthreads,
const int offset, const int offset,
const int variance_size, const int variance_size,
__global const Dtype* variance, __global const float* variance,
__global Dtype* dst) __global Dtype* dst)
{ {
for (int index = get_global_id(0); index < nthreads; index += get_global_size(0)) for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
@ -101,7 +102,7 @@ __kernel void set_variance(const int nthreads,
if (variance_size == 1) if (variance_size == 1)
var_vec = (Dtype4)(variance[0]); var_vec = (Dtype4)(variance[0]);
else else
var_vec = vload4(0, variance); var_vec = convert_T(vload4(0, variance));
vstore4(var_vec, 0, dst + offset + index * 4); vstore4(var_vec, 0, dst + offset + index * 4);
} }

@ -39,6 +39,10 @@
// //
//M*/ //M*/
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__kernel void reorg(const int count, __kernel void reorg(const int count,
__global const Dtype* src, __global const Dtype* src,
const int channels, const int channels,

@ -40,9 +40,9 @@
// //
//M*/ //M*/
#define Dtype float #if defined(cl_khr_fp16)
#define Dtype4 float4 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define Dtype8 float8 #endif
__kernel void slice(__global const Dtype* src, __kernel void slice(__global const Dtype* src,
const int src_plane_size, const int src_plane_size,

@ -24,6 +24,10 @@
* POSSIBILITY OF SUCH DAMAGE. * POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/ **************************************************************************************/
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__kernel void kernel_channel_max(const int num, const int channels, __kernel void kernel_channel_max(const int num, const int channels,
const int spatial_dim, __global const T* data, __global T* out) { const int spatial_dim, __global const T* data, __global T* out) {
int index = get_global_id(0); int index = get_global_id(0);
@ -40,12 +44,12 @@ __kernel void kernel_channel_max(const int num, const int channels,
/*
 * kernel_channel_subtract (shown here as a side-by-side diff: old text, then new text, on each line).
 *
 * One work-item per element: index = get_global_id(0), guarded by index < count.
 * The element's batch index is n = index / channels / spatial_dim and its
 * spatial position is s = index % spatial_dim; the per-position maximum is
 * read from channel_max[n * spatial_dim + s].
 *
 * Left-hand (old) column: subtracts that maximum from data[index] in place.
 * Right-hand (new) column: takes an extra read-only `src` buffer and writes
 * data[index] = exp(src[index] - maximum), so the exponentiation happens in
 * this kernel. `num` is not referenced in the body.
 */
__kernel void kernel_channel_subtract(const int count, __kernel void kernel_channel_subtract(const int count,
const int num, const int channels, const int num, const int channels,
const int spatial_dim, __global const T* channel_max, __global T* data) { const int spatial_dim, __global const T* channel_max, __global const T* src, __global T* data) {
int index = get_global_id(0); int index = get_global_id(0);
if(index < count) { if(index < count) {
int n = index / channels / spatial_dim; int n = index / channels / spatial_dim;
int s = index % spatial_dim; int s = index % spatial_dim;
data[index] -= channel_max[n * spatial_dim + s]; data[index] = exp(src[index] - channel_max[n * spatial_dim + s]);
} }
} }

@ -42,12 +42,15 @@
#define CONCAT(A,B) A##_##B #define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type) #define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float
#if defined(cl_intel_subgroups) #if defined(cl_intel_subgroups)
#pragma OPENCL EXTENSION cl_intel_subgroups : enable #pragma OPENCL EXTENSION cl_intel_subgroups : enable
#endif #endif
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int channels, __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int channels,
const int spatial_dim, const int spatial_dim,
__global Dtype* scale, __global Dtype* scale,
@ -60,12 +63,12 @@ __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int chann
int n = get_global_id(1); int n = get_global_id(1);
for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index += for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=
get_global_size(0), ++s) { get_global_size(0), ++s) {
float maxval = -FLT_MAX; Dtype maxval = -DTYPE_MAX;
for (int c = get_global_id(0); c < channels; c += get_global_size(0)) { for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
Dtype tmp = data[(n * channels + c) * spatial_dim + s]; Dtype tmp = data[(n * channels + c) * spatial_dim + s];
maxval = max((Dtype)tmp, (Dtype)maxval); maxval = max((Dtype)tmp, (Dtype)maxval);
} }
maxval = sub_group_reduce_max(maxval * 100000); maxval = sub_group_reduce_max(maxval);
//if (get_sub_group_local_id() == 0) //if (get_sub_group_local_id() == 0)
group_tmp[get_sub_group_id() * spatial_dim + s] = maxval; group_tmp[get_sub_group_id() * spatial_dim + s] = maxval;
} }
@ -77,7 +80,7 @@ __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int chann
int s = index / get_max_sub_group_size(); int s = index / get_max_sub_group_size();
Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]); Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
//if (get_sub_group_local_id() == 0) //if (get_sub_group_local_id() == 0)
scale_tmp[s] = maxval / 100000; scale_tmp[s] = maxval;
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@ -95,7 +98,7 @@ __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int chann
for (int c = get_global_id(0); c < channels; c += get_global_size(0)) { for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
sum += out_tmp[c * spatial_dim + s]; sum += out_tmp[c * spatial_dim + s];
} }
sum = sub_group_reduce_add(sum * 100000); sum = sub_group_reduce_add(sum);
group_tmp[get_sub_group_id() * spatial_dim + s] = sum; group_tmp[get_sub_group_id() * spatial_dim + s] = sum;
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@ -105,7 +108,7 @@ __kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int chann
int s = index / get_max_sub_group_size(); int s = index / get_max_sub_group_size();
Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]); Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
//if (get_sub_group_local_id() == 0) //if (get_sub_group_local_id() == 0)
scale_tmp[s] = sum / 100000; scale_tmp[s] = sum;
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@ -130,12 +133,12 @@ __kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels,
__global Dtype *group_tmp = scale + spatial_dim * num + n * get_max_sub_group_size() * spatial_dim; __global Dtype *group_tmp = scale + spatial_dim * num + n * get_max_sub_group_size() * spatial_dim;
for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index += for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=
get_global_size(0), ++s) { get_global_size(0), ++s) {
float maxval = -FLT_MAX; Dtype maxval = -DTYPE_MAX;
for (int c = get_global_id(0); c < channels; c += get_global_size(0)) { for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
Dtype tmp = data[(n * channels + c) * spatial_dim + s]; Dtype tmp = data[(n * channels + c) * spatial_dim + s];
maxval = max((Dtype)tmp, (Dtype)maxval); maxval = max((Dtype)tmp, (Dtype)maxval);
} }
maxval = sub_group_reduce_max(maxval * 100000); maxval = sub_group_reduce_max(maxval);
//if (get_sub_group_local_id() == 0) //if (get_sub_group_local_id() == 0)
group_tmp[get_sub_group_id() * spatial_dim + s] = maxval; group_tmp[get_sub_group_id() * spatial_dim + s] = maxval;
} }
@ -146,7 +149,7 @@ __kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels,
int s = index / get_max_sub_group_size(); int s = index / get_max_sub_group_size();
Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]); Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
//if (get_sub_group_local_id() == 0) //if (get_sub_group_local_id() == 0)
scale[n * spatial_dim + s] = maxval / 100000; scale[n * spatial_dim + s] = maxval;
} }
barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_GLOBAL_MEM_FENCE);
@ -164,7 +167,7 @@ __kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels,
for (int c = get_global_id(0); c < channels; c += get_global_size(0)) { for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
sum += out[n * channels * spatial_dim + c * spatial_dim + s]; sum += out[n * channels * spatial_dim + c * spatial_dim + s];
} }
sum = sub_group_reduce_add(sum * 100000); sum = sub_group_reduce_add(sum);
group_tmp[get_sub_group_id() * spatial_dim + s] = sum; group_tmp[get_sub_group_id() * spatial_dim + s] = sum;
} }
barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_GLOBAL_MEM_FENCE);
@ -174,7 +177,7 @@ __kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels,
int s = index / get_max_sub_group_size(); int s = index / get_max_sub_group_size();
Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]); Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
//if (get_sub_group_local_id() == 0) //if (get_sub_group_local_id() == 0)
scale[n * spatial_dim + s] = sum / 100000; scale[n * spatial_dim + s] = sum;
} }
barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_GLOBAL_MEM_FENCE);

@ -64,6 +64,7 @@
namespace cv { namespace dnn { namespace cv { namespace dnn {
CV__DNN_EXPERIMENTAL_NS_BEGIN CV__DNN_EXPERIMENTAL_NS_BEGIN
#define IS_DNN_OPENCL_TARGET(id) (id == DNN_TARGET_OPENCL || id == DNN_TARGET_OPENCL_FP16)
Mutex& getInitializationMutex(); Mutex& getInitializationMutex();
void initializeLayerFactory(); void initializeLayerFactory();
CV__DNN_EXPERIMENTAL_NS_END CV__DNN_EXPERIMENTAL_NS_END

@ -147,7 +147,9 @@ TEST_P(DNNTestNetwork, Inception_5h)
TEST_P(DNNTestNetwork, ENet) TEST_P(DNNTestNetwork, ENet)
{ {
if (backend == DNN_BACKEND_INFERENCE_ENGINE) throw SkipTestException(""); if ((backend == DNN_BACKEND_INFERENCE_ENGINE) ||
(backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16))
throw SkipTestException("");
processNet("dnn/Enet-model-best.net", "", Size(512, 512), "l367_Deconvolution", processNet("dnn/Enet-model-best.net", "", Size(512, 512), "l367_Deconvolution",
target == DNN_TARGET_OPENCL ? "dnn/halide_scheduler_opencl_enet.yml" : target == DNN_TARGET_OPENCL ? "dnn/halide_scheduler_opencl_enet.yml" :
"dnn/halide_scheduler_enet.yml", "dnn/halide_scheduler_enet.yml",
@ -161,9 +163,11 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe)
throw SkipTestException(""); throw SkipTestException("");
Mat sample = imread(findDataFile("dnn/street.png", false)); Mat sample = imread(findDataFile("dnn/street.png", false));
Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false); Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
float l1 = (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ? 0.0007 : 0.0;
float lInf = (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ? 0.011 : 0.0;
processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt", processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt",
inp, "detection_out"); inp, "detection_out", "", l1, lInf);
} }
TEST_P(DNNTestNetwork, MobileNet_SSD_TensorFlow) TEST_P(DNNTestNetwork, MobileNet_SSD_TensorFlow)
@ -173,15 +177,17 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_TensorFlow)
throw SkipTestException(""); throw SkipTestException("");
Mat sample = imread(findDataFile("dnn/street.png", false)); Mat sample = imread(findDataFile("dnn/street.png", false));
Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false); Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
float l1 = (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ? 0.008 : 0.0;
float lInf = (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ? 0.06 : 0.0;
processNet("dnn/ssd_mobilenet_v1_coco.pb", "dnn/ssd_mobilenet_v1_coco.pbtxt", processNet("dnn/ssd_mobilenet_v1_coco.pb", "dnn/ssd_mobilenet_v1_coco.pbtxt",
inp, "detection_out"); inp, "detection_out", "", l1, lInf);
} }
TEST_P(DNNTestNetwork, SSD_VGG16) TEST_P(DNNTestNetwork, SSD_VGG16)
{ {
if (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL || if ((backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ||
backend == DNN_BACKEND_HALIDE && target == DNN_TARGET_CPU || (backend == DNN_BACKEND_HALIDE && target == DNN_TARGET_CPU) ||
backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU) (backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU))
throw SkipTestException(""); throw SkipTestException("");
processNet("dnn/VGG_ILSVRC2016_SSD_300x300_iter_440000.caffemodel", processNet("dnn/VGG_ILSVRC2016_SSD_300x300_iter_440000.caffemodel",
"dnn/ssd_vgg16.prototxt", Size(300, 300), "detection_out"); "dnn/ssd_vgg16.prototxt", Size(300, 300), "detection_out");
@ -236,14 +242,17 @@ TEST_P(DNNTestNetwork, Inception_v2_SSD_TensorFlow)
throw SkipTestException(""); throw SkipTestException("");
Mat sample = imread(findDataFile("dnn/street.png", false)); Mat sample = imread(findDataFile("dnn/street.png", false));
Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false); Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
float l1 = (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ? 0.008 : 0.0;
float lInf = (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ? 0.07 : 0.0;
processNet("dnn/ssd_inception_v2_coco_2017_11_17.pb", "dnn/ssd_inception_v2_coco_2017_11_17.pbtxt", processNet("dnn/ssd_inception_v2_coco_2017_11_17.pb", "dnn/ssd_inception_v2_coco_2017_11_17.pbtxt",
inp, "detection_out"); inp, "detection_out", "", l1, lInf);
} }
TEST_P(DNNTestNetwork, DenseNet_121) TEST_P(DNNTestNetwork, DenseNet_121)
{ {
if (backend == DNN_BACKEND_HALIDE || if ((backend == DNN_BACKEND_HALIDE) ||
backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16) (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL_FP16) ||
(backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16))
throw SkipTestException(""); throw SkipTestException("");
processNet("dnn/DenseNet_121.caffemodel", "dnn/DenseNet_121.prototxt", Size(224, 224), "", "caffe"); processNet("dnn/DenseNet_121.caffemodel", "dnn/DenseNet_121.prototxt", Size(224, 224), "", "caffe");
} }
@ -258,7 +267,8 @@ const tuple<DNNBackend, DNNTarget> testCases[] = {
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL), tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL),
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16), tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16),
#endif #endif
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL) tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL),
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL_FP16)
}; };
INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, testing::ValuesIn(testCases)); INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, testing::ValuesIn(testCases));

@ -104,7 +104,11 @@ TEST_P(Reproducibility_AlexNet, Accuracy)
ASSERT_FALSE(net.empty()); ASSERT_FALSE(net.empty());
} }
net.setPreferableTarget(get<1>(GetParam())); int targetId = get<1>(GetParam());
const float l1 = 1e-5;
const float lInf = (targetId == DNN_TARGET_OPENCL_FP16) ? 3e-3 : 1e-4;
net.setPreferableTarget(targetId);
Mat sample = imread(_tf("grace_hopper_227.png")); Mat sample = imread(_tf("grace_hopper_227.png"));
ASSERT_TRUE(!sample.empty()); ASSERT_TRUE(!sample.empty());
@ -112,10 +116,11 @@ TEST_P(Reproducibility_AlexNet, Accuracy)
net.setInput(blobFromImage(sample, 1.0f, Size(227, 227), Scalar(), false), "data"); net.setInput(blobFromImage(sample, 1.0f, Size(227, 227), Scalar(), false), "data");
Mat out = net.forward("prob"); Mat out = net.forward("prob");
Mat ref = blobFromNPY(_tf("caffe_alexnet_prob.npy")); Mat ref = blobFromNPY(_tf("caffe_alexnet_prob.npy"));
normAssert(ref, out); normAssert(ref, out, "", l1, lInf);
} }
INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_AlexNet, Combine(testing::Bool(), availableDnnTargets())); INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_AlexNet, Combine(testing::Bool(),
Values(DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16)));
#if !defined(_WIN32) || defined(_WIN64) #if !defined(_WIN32) || defined(_WIN64)
TEST(Reproducibility_FCN, Accuracy) TEST(Reproducibility_FCN, Accuracy)
@ -176,8 +181,11 @@ TEST_P(Reproducibility_MobileNet_SSD, Accuracy)
const string proto = findDataFile("dnn/MobileNetSSD_deploy.prototxt", false); const string proto = findDataFile("dnn/MobileNetSSD_deploy.prototxt", false);
const string model = findDataFile("dnn/MobileNetSSD_deploy.caffemodel", false); const string model = findDataFile("dnn/MobileNetSSD_deploy.caffemodel", false);
Net net = readNetFromCaffe(proto, model); Net net = readNetFromCaffe(proto, model);
int targetId = GetParam();
const float l1 = (targetId == DNN_TARGET_OPENCL_FP16) ? 1.5e-4 : 1e-5;
const float lInf = (targetId == DNN_TARGET_OPENCL_FP16) ? 4e-4 : 1e-4;
net.setPreferableTarget(GetParam()); net.setPreferableTarget(targetId);
Mat sample = imread(_tf("street.png")); Mat sample = imread(_tf("street.png"));
@ -185,8 +193,10 @@ TEST_P(Reproducibility_MobileNet_SSD, Accuracy)
net.setInput(inp); net.setInput(inp);
Mat out = net.forward(); Mat out = net.forward();
const float scores_diff = (targetId == DNN_TARGET_OPENCL_FP16) ? 4e-4 : 1e-5;
const float boxes_iou_diff = (targetId == DNN_TARGET_OPENCL_FP16) ? 5e-3 : 1e-4;
Mat ref = blobFromNPY(_tf("mobilenet_ssd_caffe_out.npy")); Mat ref = blobFromNPY(_tf("mobilenet_ssd_caffe_out.npy"));
normAssertDetections(ref, out); normAssertDetections(ref, out, "", 0.0, scores_diff, boxes_iou_diff);
// Check that detections aren't preserved. // Check that detections aren't preserved.
inp.setTo(0.0f); inp.setTo(0.0f);
@ -212,10 +222,12 @@ TEST_P(Reproducibility_MobileNet_SSD, Accuracy)
// a single sample in batch. The first numbers of detection vectors are batch id. // a single sample in batch. The first numbers of detection vectors are batch id.
outBatch = outBatch.reshape(1, outBatch.total() / 7); outBatch = outBatch.reshape(1, outBatch.total() / 7);
EXPECT_EQ(outBatch.rows, 2 * numDetections); EXPECT_EQ(outBatch.rows, 2 * numDetections);
normAssert(outBatch.rowRange(0, numDetections), ref); normAssert(outBatch.rowRange(0, numDetections), ref, "", l1, lInf);
normAssert(outBatch.rowRange(numDetections, 2 * numDetections).colRange(1, 7), ref.colRange(1, 7)); normAssert(outBatch.rowRange(numDetections, 2 * numDetections).colRange(1, 7), ref.colRange(1, 7),
"", l1, lInf);
} }
INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_MobileNet_SSD, availableDnnTargets()); INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_MobileNet_SSD,
Values(DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16));
typedef testing::TestWithParam<DNNTarget> Reproducibility_ResNet50; typedef testing::TestWithParam<DNNTarget> Reproducibility_ResNet50;
TEST_P(Reproducibility_ResNet50, Accuracy) TEST_P(Reproducibility_ResNet50, Accuracy)
@ -226,6 +238,9 @@ TEST_P(Reproducibility_ResNet50, Accuracy)
int targetId = GetParam(); int targetId = GetParam();
net.setPreferableTarget(targetId); net.setPreferableTarget(targetId);
float l1 = (targetId == DNN_TARGET_OPENCL_FP16) ? 3e-5 : 1e-5;
float lInf = (targetId == DNN_TARGET_OPENCL_FP16) ? 6e-3 : 1e-4;
Mat input = blobFromImage(imread(_tf("googlenet_0.png")), 1.0f, Size(224,224), Scalar(), false); Mat input = blobFromImage(imread(_tf("googlenet_0.png")), 1.0f, Size(224,224), Scalar(), false);
ASSERT_TRUE(!input.empty()); ASSERT_TRUE(!input.empty());
@ -233,20 +248,21 @@ TEST_P(Reproducibility_ResNet50, Accuracy)
Mat out = net.forward(); Mat out = net.forward();
Mat ref = blobFromNPY(_tf("resnet50_prob.npy")); Mat ref = blobFromNPY(_tf("resnet50_prob.npy"));
normAssert(ref, out); normAssert(ref, out, "", l1, lInf);
if (targetId == DNN_TARGET_OPENCL) if (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16)
{ {
UMat out_umat; UMat out_umat;
net.forward(out_umat); net.forward(out_umat);
normAssert(ref, out_umat, "out_umat"); normAssert(ref, out_umat, "out_umat", l1, lInf);
std::vector<UMat> out_umats; std::vector<UMat> out_umats;
net.forward(out_umats); net.forward(out_umats);
normAssert(ref, out_umats[0], "out_umat_vector"); normAssert(ref, out_umats[0], "out_umat_vector", l1, lInf);
} }
} }
INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_ResNet50, availableDnnTargets()); INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_ResNet50,
Values(DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16));
typedef testing::TestWithParam<DNNTarget> Reproducibility_SqueezeNet_v1_1; typedef testing::TestWithParam<DNNTarget> Reproducibility_SqueezeNet_v1_1;
TEST_P(Reproducibility_SqueezeNet_v1_1, Accuracy) TEST_P(Reproducibility_SqueezeNet_v1_1, Accuracy)

@ -295,24 +295,30 @@ TEST_P(Test_TensorFlow_nets, opencv_face_detector_uint8)
INSTANTIATE_TEST_CASE_P(/**/, Test_TensorFlow_nets, availableDnnTargets()); INSTANTIATE_TEST_CASE_P(/**/, Test_TensorFlow_nets, availableDnnTargets());
TEST(Test_TensorFlow, defun) typedef testing::TestWithParam<DNNTarget> Test_TensorFlow_fp16;
TEST_P(Test_TensorFlow_fp16, tests)
{ {
runTensorFlowNet("defun_dropout"); int targetId = GetParam();
const float l1 = 7e-4;
const float lInf = 1e-2;
runTensorFlowNet("fp16_single_conv", targetId, false, l1, lInf);
runTensorFlowNet("fp16_deconvolution", targetId, false, l1, lInf);
runTensorFlowNet("fp16_max_pool_odd_same", targetId, false, l1, lInf);
runTensorFlowNet("fp16_padding_valid", targetId, false, l1, lInf);
runTensorFlowNet("fp16_eltwise_add_mul", targetId, false, l1, lInf);
runTensorFlowNet("fp16_max_pool_odd_valid", targetId, false, l1, lInf);
runTensorFlowNet("fp16_pad_and_concat", targetId, false, l1, lInf);
runTensorFlowNet("fp16_max_pool_even", targetId, false, l1, lInf);
runTensorFlowNet("fp16_padding_same", targetId, false, l1, lInf);
} }
TEST(Test_TensorFlow, fp16) INSTANTIATE_TEST_CASE_P(/**/, Test_TensorFlow_fp16,
Values(DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16));
TEST(Test_TensorFlow, defun)
{ {
const float l1 = 1e-3; runTensorFlowNet("defun_dropout");
const float lInf = 1e-2;
runTensorFlowNet("fp16_single_conv", DNN_TARGET_CPU, false, l1, lInf);
runTensorFlowNet("fp16_deconvolution", DNN_TARGET_CPU, false, l1, lInf);
runTensorFlowNet("fp16_max_pool_odd_same", DNN_TARGET_CPU, false, l1, lInf);
runTensorFlowNet("fp16_padding_valid", DNN_TARGET_CPU, false, l1, lInf);
runTensorFlowNet("fp16_eltwise_add_mul", DNN_TARGET_CPU, false, l1, lInf);
runTensorFlowNet("fp16_max_pool_odd_valid", DNN_TARGET_CPU, false, l1, lInf);
runTensorFlowNet("fp16_pad_and_concat", DNN_TARGET_CPU, false, l1, lInf);
runTensorFlowNet("fp16_max_pool_even", DNN_TARGET_CPU, false, l1, lInf);
runTensorFlowNet("fp16_padding_same", DNN_TARGET_CPU, false, l1, lInf);
} }
TEST(Test_TensorFlow, quantized) TEST(Test_TensorFlow, quantized)

Loading…
Cancel
Save