From 452fa3011ca7eff4d99d7ffaf738ca7012023fbb Mon Sep 17 00:00:00 2001
From: Alexander Alekhin
Date: Tue, 10 Jul 2018 15:00:42 +0300
Subject: [PATCH 1/3] dnn(test): drop CV_ENUM for DNNBackend / DNNTarget

---
 modules/dnn/test/test_backends.cpp       | 18 +++----
 modules/dnn/test/test_caffe_importer.cpp | 10 ++--
 modules/dnn/test/test_googlenet.cpp      |  2 +-
 modules/dnn/test/test_halide_layers.cpp  | 48 ++++++++++----------
 modules/dnn/test/test_precomp.hpp        | 61 ++++++++++++++++++-------
 modules/dnn/test/test_tf_importer.cpp    |  2 +-
 modules/dnn/test/test_torch_importer.cpp |  4 +-
 7 files changed, 85 insertions(+), 60 deletions(-)

diff --git a/modules/dnn/test/test_backends.cpp b/modules/dnn/test/test_backends.cpp
index ad7eb0917e..49e1a2a983 100644
--- a/modules/dnn/test/test_backends.cpp
+++ b/modules/dnn/test/test_backends.cpp
@@ -278,19 +278,19 @@ TEST_P(DNNTestNetwork, FastNeuralStyle_eccv16)
     processNet("dnn/fast_neural_style_eccv16_starry_night.t7", "", inp, "", "", l1, lInf);
 }
 
-const tuple<DNNBackend, DNNTarget> testCases[] = {
+const tuple<Backend, Target> testCases[] = {
 #ifdef HAVE_HALIDE
-    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_HALIDE, DNN_TARGET_CPU),
-    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_HALIDE, DNN_TARGET_OPENCL),
+    tuple<Backend, Target>(DNN_BACKEND_HALIDE, DNN_TARGET_CPU),
+    tuple<Backend, Target>(DNN_BACKEND_HALIDE, DNN_TARGET_OPENCL),
 #endif
 #ifdef HAVE_INF_ENGINE
-    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_CPU),
-    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL),
-    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16),
-    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_MYRIAD),
+    tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_CPU),
+    tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL),
+    tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16),
+    tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_MYRIAD),
 #endif
-    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL),
-    tuple<DNNBackend, DNNTarget>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL_FP16)
+    tuple<Backend, Target>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL),
+    tuple<Backend, Target>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL_FP16)
 };
 
 INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, testing::ValuesIn(testCases));
diff --git a/modules/dnn/test/test_caffe_importer.cpp b/modules/dnn/test/test_caffe_importer.cpp
index c99e4e63f7..5365b2a435 100644
--- a/modules/dnn/test/test_caffe_importer.cpp
+++ b/modules/dnn/test/test_caffe_importer.cpp
@@ -82,7 +82,7 @@ TEST(Test_Caffe, read_googlenet)
     ASSERT_FALSE(net.empty());
 }
 
-typedef testing::TestWithParam<tuple<bool, DNNTarget> > Reproducibility_AlexNet;
+typedef testing::TestWithParam<tuple<bool, Target> > Reproducibility_AlexNet;
 TEST_P(Reproducibility_AlexNet, Accuracy)
 {
     bool readFromMemory = get<0>(GetParam());
@@ -179,7 +179,7 @@ TEST(Reproducibility_SSD, Accuracy)
     normAssertDetections(ref, out);
 }
 
-typedef testing::TestWithParam<DNNTarget> Reproducibility_MobileNet_SSD;
+typedef testing::TestWithParam<Target> Reproducibility_MobileNet_SSD;
 TEST_P(Reproducibility_MobileNet_SSD, Accuracy)
 {
     const string proto = findDataFile("dnn/MobileNetSSD_deploy.prototxt", false);
@@ -234,7 +234,7 @@ TEST_P(Reproducibility_MobileNet_SSD, Accuracy)
 INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_MobileNet_SSD,
                         Values(DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16));
 
-typedef testing::TestWithParam<DNNTarget> Reproducibility_ResNet50;
+typedef testing::TestWithParam<Target> Reproducibility_ResNet50;
 TEST_P(Reproducibility_ResNet50, Accuracy)
 {
     Net net = readNetFromCaffe(findDataFile("dnn/ResNet-50-deploy.prototxt", false),
@@ -270,7 +270,7 @@ TEST_P(Reproducibility_ResNet50, Accuracy)
 INSTANTIATE_TEST_CASE_P(/**/, Reproducibility_ResNet50,
                         Values(DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16));
 
-typedef testing::TestWithParam<DNNTarget> Reproducibility_SqueezeNet_v1_1;
+typedef testing::TestWithParam<Target> Reproducibility_SqueezeNet_v1_1;
 TEST_P(Reproducibility_SqueezeNet_v1_1, Accuracy)
 {
     Net net = readNetFromCaffe(findDataFile("dnn/squeezenet_v1.1.prototxt", false),
@@ -413,7 +413,7 @@ TEST(Test_Caffe, multiple_inputs)
     normAssert(out, first_image + second_image);
 }
 
-typedef testing::TestWithParam<tuple<std::string, DNNTarget> > opencv_face_detector;
+typedef testing::TestWithParam<tuple<std::string, Target> > opencv_face_detector;
 TEST_P(opencv_face_detector, Accuracy)
 {
     std::string proto = findDataFile("dnn/opencv_face_detector.prototxt", false);
diff --git a/modules/dnn/test/test_googlenet.cpp b/modules/dnn/test/test_googlenet.cpp
index a2ea731acc..37064c35c4 100644
--- a/modules/dnn/test/test_googlenet.cpp
+++ b/modules/dnn/test/test_googlenet.cpp
@@ -52,7 +52,7 @@ static std::string _tf(TString filename)
     return (getOpenCVExtraDir() + "/dnn/") + filename;
 }
 
-typedef testing::TestWithParam<DNNTarget> Reproducibility_GoogLeNet;
+typedef testing::TestWithParam<Target> Reproducibility_GoogLeNet;
 TEST_P(Reproducibility_GoogLeNet, Batching)
 {
     Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt", false),
diff --git a/modules/dnn/test/test_halide_layers.cpp b/modules/dnn/test/test_halide_layers.cpp
index b21ae85800..c61f7e378a 100644
--- a/modules/dnn/test/test_halide_layers.cpp
+++ b/modules/dnn/test/test_halide_layers.cpp
@@ -41,21 +41,21 @@ static void test(LayerParams& params, Mat& input, int backendId, int targetId)
     test(input, net, backendId, targetId);
 }
 
-static testing::internal::ParamGenerator<tuple<DNNBackend, DNNTarget> > dnnBackendsAndTargetsWithHalide()
+static testing::internal::ParamGenerator<tuple<Backend, Target> > dnnBackendsAndTargetsWithHalide()
 {
-    static const tuple<DNNBackend, DNNTarget> testCases[] = {
+    static const tuple<Backend, Target> testCases[] = {
 #ifdef HAVE_HALIDE
-        tuple<DNNBackend, DNNTarget>(DNN_BACKEND_HALIDE, DNN_TARGET_CPU),
-        tuple<DNNBackend, DNNTarget>(DNN_BACKEND_HALIDE, DNN_TARGET_OPENCL),
+        tuple<Backend, Target>(DNN_BACKEND_HALIDE, DNN_TARGET_CPU),
+        tuple<Backend, Target>(DNN_BACKEND_HALIDE, DNN_TARGET_OPENCL),
 #endif
 #ifdef HAVE_INF_ENGINE
-        tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_CPU),
-        tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL),
-        tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16),
-        tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_MYRIAD),
+        tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_CPU),
+        tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL),
+        tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16),
+        tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_MYRIAD),
 #endif
-        tuple<DNNBackend, DNNTarget>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL),
-        tuple<DNNBackend, DNNTarget>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL_FP16)
+        tuple<Backend, Target>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL),
+        tuple<Backend, Target>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL_FP16)
     };
     return testing::ValuesIn(testCases);
 }
@@ -89,7 +89,7 @@ TEST_P(Test_Halide_layers, Padding)
 ////////////////////////////////////////////////////////////////////////////////
 // Convolution
 ////////////////////////////////////////////////////////////////////////////////
-typedef TestWithParam<tuple<Vec3i, Size, Size, Size, Size, Size, bool, tuple<DNNBackend, DNNTarget> > > Convolution;
+typedef TestWithParam<tuple<Vec3i, Size, Size, Size, Size, Size, bool, tuple<Backend, Target> > > Convolution;
 TEST_P(Convolution, Accuracy)
 {
     int inChannels = get<0>(GetParam())[0];
@@ -154,7 +154,7 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, Convolution, Combine(
 ////////////////////////////////////////////////////////////////////////////////
 // Deconvolution
 ////////////////////////////////////////////////////////////////////////////////
-typedef TestWithParam<tuple<Vec3i, Size, Size, Size, Size, Size, bool, tuple<DNNBackend, DNNTarget> > > Deconvolution;
+typedef TestWithParam<tuple<Vec3i, Size, Size, Size, Size, Size, bool, tuple<Backend, Target> > > Deconvolution;
 TEST_P(Deconvolution, Accuracy)
 {
     int inChannels = get<0>(GetParam())[0];
@@ -220,7 +220,7 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, Deconvolution, Combine(
 ////////////////////////////////////////////////////////////////////////////////
 // LRN
 ////////////////////////////////////////////////////////////////////////////////
-typedef TestWithParam<tuple<Vec3i, int, float, float, float, std::string, tuple<DNNBackend, DNNTarget> > > LRN;
+typedef TestWithParam<tuple<Vec3i, int, float, float, float, std::string, tuple<Backend, Target> > > LRN;
 TEST_P(LRN, Accuracy)
 {
     int inChannels = get<0>(GetParam())[0];
@@ -265,7 +265,7 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, LRN, Combine(
 ////////////////////////////////////////////////////////////////////////////////
 // Average pooling
 ////////////////////////////////////////////////////////////////////////////////
-typedef TestWithParam<tuple<int, Size, Size, Size, tuple<DNNBackend, DNNTarget> > > AvePooling;
+typedef TestWithParam<tuple<int, Size, Size, Size, tuple<Backend, Target> > > AvePooling;
 TEST_P(AvePooling, Accuracy)
 {
     int inChannels = get<0>(GetParam());
@@ -305,7 +305,7 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, AvePooling, Combine(
 ////////////////////////////////////////////////////////////////////////////////
 // Maximum pooling
 ////////////////////////////////////////////////////////////////////////////////
-typedef TestWithParam<tuple<int, Size, Size, Size, Size, tuple<DNNBackend, DNNTarget> > > MaxPooling;
+typedef TestWithParam<tuple<int, Size, Size, Size, Size, tuple<Backend, Target> > > MaxPooling;
 TEST_P(MaxPooling, Accuracy)
 {
     int inChannels = get<0>(GetParam());
@@ -344,7 +344,7 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, MaxPooling, Combine(
 ////////////////////////////////////////////////////////////////////////////////
 // Fully-connected
 ////////////////////////////////////////////////////////////////////////////////
-typedef TestWithParam<tuple<int, Size, int, bool, tuple<DNNBackend, DNNTarget> > > FullyConnected;
+typedef TestWithParam<tuple<int, Size, int, bool, tuple<Backend, Target> > > FullyConnected;
 TEST_P(FullyConnected, Accuracy)
 {
     int inChannels = get<0>(GetParam());
@@ -387,7 +387,7 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, FullyConnected, Combine(
 ////////////////////////////////////////////////////////////////////////////////
 // SoftMax
 ////////////////////////////////////////////////////////////////////////////////
-typedef TestWithParam<tuple<int, tuple<DNNBackend, DNNTarget> > > SoftMax;
+typedef TestWithParam<tuple<int, tuple<Backend, Target> > > SoftMax;
 TEST_P(SoftMax, Accuracy)
 {
     int inChannels = get<0>(GetParam());
@@ -476,7 +476,7 @@ void testInPlaceActivation(LayerParams& lp, int backendId, int targetId)
     test(input, net, backendId, targetId);
 }
 
-typedef TestWithParam<tuple<bool, bool, float, tuple<DNNBackend, DNNTarget> > > BatchNorm;
+typedef TestWithParam<tuple<bool, bool, float, tuple<Backend, Target> > > BatchNorm;
 TEST_P(BatchNorm, Accuracy)
 {
     bool hasWeights = get<0>(GetParam());
@@ -511,7 +511,7 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, BatchNorm, Combine(
     dnnBackendsAndTargetsWithHalide()
 ));
 
-typedef TestWithParam<tuple<float, tuple<DNNBackend, DNNTarget> > > ReLU;
+typedef TestWithParam<tuple<float, tuple<Backend, Target> > > ReLU;
 TEST_P(ReLU, Accuracy)
 {
     float negativeSlope = get<0>(GetParam());
@@ -530,7 +530,7 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, ReLU, Combine(
     dnnBackendsAndTargetsWithHalide()
 ));
 
-typedef TestWithParam<tuple<std::string, tuple<DNNBackend, DNNTarget> > > NoParamActivation;
+typedef TestWithParam<tuple<std::string, tuple<Backend, Target> > > NoParamActivation;
 TEST_P(NoParamActivation, Accuracy)
 {
     int backendId = get<0>(get<1>(GetParam()));
@@ -546,7 +546,7 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, NoParamActivation, Combine(
     dnnBackendsAndTargetsWithHalide()
 ));
 
-typedef TestWithParam<tuple<Vec3f, tuple<DNNBackend, DNNTarget> > > Power;
+typedef TestWithParam<tuple<Vec3f, tuple<Backend, Target> > > Power;
 TEST_P(Power, Accuracy)
 {
     float power = get<0>(GetParam())[0];
@@ -582,7 +582,7 @@ TEST_P(Test_Halide_layers, ChannelsPReLU)
     testInPlaceActivation(lp, backend, target);
 }
 
-typedef TestWithParam<tuple<bool, tuple<DNNBackend, DNNTarget> > > Scale;
+typedef TestWithParam<tuple<bool, tuple<Backend, Target> > > Scale;
 TEST_P(Scale, Accuracy)
 {
     bool hasBias = get<0>(GetParam());
@@ -616,7 +616,7 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, Scale, Combine(
 // `--- conv ----^ ^ ^
 // `---- ... ------' '
 // `-----------------'
-typedef TestWithParam<tuple<Vec3i, Vec3i, tuple<DNNBackend, DNNTarget> > > Concat;
+typedef TestWithParam<tuple<Vec3i, Vec3i, tuple<Backend, Target> > > Concat;
 TEST_P(Concat, Accuracy)
 {
     Vec3i inSize = get<0>(GetParam());
@@ -682,7 +682,7 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, Concat, Combine(
 // `--- conv ----^ ^ ^
 // `---- ... ------' '
 // `-----------------'
-typedef TestWithParam<tuple<Vec3i, std::string, int, bool, tuple<DNNBackend, DNNTarget> > > Eltwise;
+typedef TestWithParam<tuple<Vec3i, std::string, int, bool, tuple<Backend, Target> > > Eltwise;
 TEST_P(Eltwise, Accuracy)
 {
     Vec3i inSize = get<0>(GetParam());
diff --git a/modules/dnn/test/test_precomp.hpp b/modules/dnn/test/test_precomp.hpp
index 16f1e5c846..a0a46ab271 100644
--- a/modules/dnn/test/test_precomp.hpp
+++ b/modules/dnn/test/test_precomp.hpp
@@ -49,15 +49,41 @@
 #include "opencv2/dnn.hpp"
 #include "test_common.hpp"
 
-namespace opencv_test { namespace {
-using namespace cv::dnn;
+namespace cv {
+namespace dnn {
+CV__DNN_EXPERIMENTAL_NS_BEGIN
+
+static inline void PrintTo(const cv::dnn::Backend& v, std::ostream* os)
+{
+    switch (v) {
+    case DNN_BACKEND_DEFAULT: *os << "DNN_BACKEND_DEFAULT"; return;
+    case DNN_BACKEND_HALIDE: *os << "DNN_BACKEND_HALIDE"; return;
+    case DNN_BACKEND_INFERENCE_ENGINE: *os << "DNN_BACKEND_INFERENCE_ENGINE"; return;
+    case DNN_BACKEND_OPENCV: *os << "DNN_BACKEND_OPENCV"; return;
+    } // don't use "default:" to emit compiler warnings
+    *os << "DNN_BACKEND_UNKNOWN(" << v << ")";
+}
+
+static inline void PrintTo(const cv::dnn::Target& v, std::ostream* os)
+{
+    switch (v) {
+    case DNN_TARGET_CPU: *os << "DNN_TARGET_CPU"; return;
+    case DNN_TARGET_OPENCL: *os << "DNN_TARGET_OPENCL"; return;
+    case DNN_TARGET_OPENCL_FP16: *os << "DNN_TARGET_OPENCL_FP16"; return;
+    case DNN_TARGET_MYRIAD: *os << "DNN_TARGET_MYRIAD"; return;
+    } // don't use "default:" to emit compiler warnings
+    *os << "DNN_TARGET_UNKNOWN(" << v << ")";
+}
 
-CV_ENUM(DNNBackend, DNN_BACKEND_DEFAULT, DNN_BACKEND_HALIDE, DNN_BACKEND_INFERENCE_ENGINE, DNN_BACKEND_OPENCV)
-CV_ENUM(DNNTarget, DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16, DNN_TARGET_MYRIAD)
+CV__DNN_EXPERIMENTAL_NS_END
+}} // namespace
 
-static testing::internal::ParamGenerator<DNNTarget> availableDnnTargets()
+namespace opencv_test {
+using namespace cv::dnn;
+
+static testing::internal::ParamGenerator<Target> availableDnnTargets()
 {
-    static std::vector<DNNTarget> targets;
+    static std::vector<Target> targets;
     if (targets.empty())
     {
         targets.push_back(DNN_TARGET_CPU);
@@ -69,23 +95,23 @@ static testing::internal::ParamGenerator<DNNTarget> availableDnnTargets()
     return testing::ValuesIn(targets);
 }
 
-static testing::internal::ParamGenerator<tuple<DNNBackend, DNNTarget> > dnnBackendsAndTargets()
+static testing::internal::ParamGenerator<tuple<Backend, Target> > dnnBackendsAndTargets()
 {
-    static const tuple<DNNBackend, DNNTarget> testCases[] = {
+    static const tuple<Backend, Target> testCases[] = {
 #ifdef HAVE_INF_ENGINE
-        tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_CPU),
-        tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL),
-        tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16),
-        tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_MYRIAD),
+        tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_CPU),
+        tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL),
+        tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16),
+        tuple<Backend, Target>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_MYRIAD),
 #endif
-        tuple<DNNBackend, DNNTarget>(DNN_BACKEND_OPENCV, DNN_TARGET_CPU),
-        tuple<DNNBackend, DNNTarget>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL),
-        tuple<DNNBackend, DNNTarget>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL_FP16)
+        tuple<Backend, Target>(DNN_BACKEND_OPENCV, DNN_TARGET_CPU),
+        tuple<Backend, Target>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL),
+        tuple<Backend, Target>(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL_FP16)
     };
     return testing::ValuesIn(testCases);
 }
 
-class DNNTestLayer : public TestWithParam<tuple<DNNBackend, DNNTarget> >
+class DNNTestLayer : public TestWithParam<tuple<Backend, Target> >
 {
 public:
     dnn::Backend backend;
@@ -156,6 +182,5 @@ protected:
     }
 };
 
-}}
-
+} // namespace
 #endif
diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp
index 8f3822cc96..66b9d4f642 100644
--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@@ -243,7 +243,7 @@ TEST_P(Test_TensorFlow_layers, l2_normalize_3d)
     runTensorFlowNet("l2_normalize_3d");
 }
 
-typedef testing::TestWithParam<DNNTarget> Test_TensorFlow_nets;
+typedef testing::TestWithParam<Target> Test_TensorFlow_nets;
 
 TEST_P(Test_TensorFlow_nets, MobileNet_SSD)
 {
diff --git a/modules/dnn/test/test_torch_importer.cpp b/modules/dnn/test/test_torch_importer.cpp
index c1abdc930d..37966a1f93 100644
--- a/modules/dnn/test/test_torch_importer.cpp
+++ b/modules/dnn/test/test_torch_importer.cpp
@@ -100,7 +100,7 @@ static void runTorchNet(String prefix, int targetId = DNN_TARGET_CPU, String out
     }
 }
 
-typedef testing::TestWithParam<DNNTarget> Test_Torch_layers;
+typedef testing::TestWithParam<Target> Test_Torch_layers;
 
 TEST_P(Test_Torch_layers, run_convolution)
 {
@@ -208,7 +208,7 @@ TEST_P(Test_Torch_layers, net_non_spatial)
 
 INSTANTIATE_TEST_CASE_P(/**/, Test_Torch_layers, availableDnnTargets());
 
-typedef testing::TestWithParam<DNNTarget> Test_Torch_nets;
+typedef testing::TestWithParam<Target> Test_Torch_nets;
 
 TEST_P(Test_Torch_nets, OpenFace_accuracy)
 {

From d7bd662c95084f4d559feb7fa7ee75bdf0b02cc4 Mon Sep 17 00:00:00 2001
From: catree
Date: Tue, 10 Jul 2018 15:35:46 +0200
Subject: [PATCH 2/3] Add a note in the documentation about Mat::ones and
 Mat::eye

With multi-channel types (e.g. CV_8UC3), only the first channel is
initialized.

---
 modules/core/include/opencv2/core/mat.hpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp
index 7be97c803e..2d1d8e08f4 100644
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -1324,7 +1324,7 @@ public:
     /** @brief Returns a zero array of the specified size and type.
 
    The method returns a Matlab-style zero array initializer. It can be used to quickly form a constant
-    array as a function parameter, part of a matrix expression, or as a matrix initializer. :
+    array as a function parameter, part of a matrix expression, or as a matrix initializer:
     @code
         Mat A;
         A = Mat::zeros(3, 3, CV_32F);
@@ -1360,6 +1360,8 @@ public:
     The above operation does not form a 100x100 matrix of 1's and then multiply it by 3. Instead, it
     just remembers the scale factor (3 in this case) and use it when actually invoking the matrix
     initializer.
+    @note In the case of multi-channel types, only the first channel will be initialized with 1's;
+    the others will be set to 0's.
     @param rows Number of rows.
     @param cols Number of columns.
     @param type Created matrix type.
@@ -1387,6 +1389,8 @@ public:
         // make a 4x4 diagonal matrix with 0.1's on the diagonal.
         Mat A = Mat::eye(4, 4, CV_32F)*0.1;
     @endcode
+    @note In the case of multi-channel types, the identity matrix will be initialized only for the
+    first channel; the others will be set to 0's.
     @param rows Number of rows.
     @param cols Number of columns.
     @param type Created matrix type.
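[Editor's aside on PATCH 2/3 -- a minimal usage sketch, not part of the patch
series: it demonstrates the behaviour the new @note documents. Mat::ones with a
multi-channel type fills only the first channel with 1's, because the
initializer expands to Scalar(1), whose remaining components default to 0.]

    #include <opencv2/core.hpp>
    #include <iostream>

    int main()
    {
        // Each "1" lands in channel 0 only; channels 1 and 2 stay 0.
        cv::Mat m = cv::Mat::ones(2, 2, CV_8UC3);
        std::cout << m << std::endl;
        // Prints (each pixel is (1, 0, 0), not (1, 1, 1)):
        // [  1,   0,   0,   1,   0,   0;
        //    1,   0,   0,   1,   0,   0]
        return 0;
    }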
From 4c5a86828a81cc3445c102004591157f28f331b3 Mon Sep 17 00:00:00 2001
From: Li Peng
Date: Tue, 10 Jul 2018 12:43:03 +0800
Subject: [PATCH 3/3] Fix gemmlike convolution input reading

use vload3 for half3 or float3 input vector reading, also check
read position to see if it exceeds input width

Signed-off-by: Li Peng

---
 modules/dnn/src/opencl/conv_layer_spatial.cl | 169 +++++++++++++------
 1 file changed, 115 insertions(+), 54 deletions(-)

diff --git a/modules/dnn/src/opencl/conv_layer_spatial.cl b/modules/dnn/src/opencl/conv_layer_spatial.cl
index dc7b047fe5..2cc161d3ff 100644
--- a/modules/dnn/src/opencl/conv_layer_spatial.cl
+++ b/modules/dnn/src/opencl/conv_layer_spatial.cl
@@ -467,7 +467,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
     int saved_y = curr_y;
 #endif
     const __global Dtype *src0_read = src0
-        + aligned_input_size * global_z            // batch offset
+        + aligned_input_size * global_z                            // batch offset
         + (curr_y - INPUT_PAD_H) * ROW_PITCH       // y offset
         + (curr_x - INPUT_PAD_W);                  // x offset
 
@@ -502,15 +502,23 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
         const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
 #if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
+#if KERNEL_WIDTH == 3
+        Dtype_t blockA00 = vload3(0, src0_read);
+        Dtype* pblockA00 = (Dtype*)(&blockA00);
+#else
         Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ];
         Dtype* pblockA00 = (Dtype*)(&blockA00);
+#endif
 #else
         Dtype_t blockA00;
         Dtype* pblockA00 = (Dtype*)(&blockA00);
         int pos = 0;
         LOOP(KERNEL_WIDTH, pos,
         {
-            if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
+            if (curr_y >= INPUT_PAD_H &&
+                curr_y < input_height + INPUT_PAD_H &&
+                curr_x + pos * DILATION_X >= INPUT_PAD_W &&
+                curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
                 pblockA00[pos] = src0_read[pos * DILATION_X];
             else
                 pblockA00[pos] = 0;
 
@@ -564,17 +572,18 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
         //while( ++patch_row < 1 ); //debug
         while( ++patch_row < KERNEL_HEIGHT );
 
-        src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y); // reset to start of next slice of patch
+        // reset to start of next slice of patch
+        src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y);
     }
     //while ( ++patch_depth < 1 );  //debug
     while ( ++patch_depth < INPUT_DEPTH );
 
     // Dst resembles a cube of width x height x (output channel * batches).  Each tile writes:
     // (SIMD * TILE_M) x 1 x TILE_N.  Partial writes most likely generated if padding used.
- int out_offset = global_z * out_pitch_z // batch offset - + ( group_x * TILE_N ) * out_pitch_y // channel offset + int out_offset = global_z * out_pitch_z // batch offset + + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset - + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset + + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset __global Dtype *out = dst + out_offset; #if APPLY_BIAS @@ -621,7 +630,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) int saved_y = curr_y; #endif const __global Dtype *src0_read = src0 - + aligned_input_size * global_z // batch offset + + aligned_input_size * global_z // batch offset + (curr_y - INPUT_PAD_H) * ROW_PITCH // y offset + (curr_x - INPUT_PAD_W); // x offset @@ -653,7 +662,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) int pos = 0; LOOP(KERNEL_WIDTH, pos, { - if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W) + if (curr_y >= INPUT_PAD_H && + curr_y < input_height + INPUT_PAD_H && + curr_x + pos * DILATION_X >= INPUT_PAD_W && + curr_x + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA00[pos] = src0_read[pos * DILATION_X]; else pblockA00[pos] = 0; @@ -730,17 +742,18 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) //while( ++patch_row < 1 ); //debug while( ++patch_row < KERNEL_HEIGHT ); - src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch + // reset to start of next slice of patch + src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); } //while ( ++patch_depth < 1 ); //debug while ( ++patch_depth < INPUT_DEPTH ); // Dst resembles a cube of width x height x (output channel * batches). Each tile writes: // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. - int out_offset = global_z * out_pitch_z // batch offset - + ( group_x * TILE_N ) * out_pitch_y // channel offset + int out_offset = global_z * out_pitch_z // batch offset + + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset - + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset + + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset __global Dtype *out = dst + out_offset; #if APPLY_BIAS Dtype bias[4]; @@ -849,11 +862,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) int saved_y1 = curr_y1; #endif const __global Dtype *src0_read0 = src0 - + aligned_input_size * global_z // batch offset + + aligned_input_size * global_z // batch offset + (curr_y0 - INPUT_PAD_H) * ROW_PITCH // y offset + curr_x0 - INPUT_PAD_W; // x offset const __global Dtype *src0_read1 = src0 - + aligned_input_size * global_z // batch offset + + aligned_input_size * global_z // batch offset + (curr_y1 - INPUT_PAD_H) * ROW_PITCH // y offset + curr_x1 - INPUT_PAD_W; // x offset @@ -883,17 +896,38 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) // ... 
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; #if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0 - Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH; - Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH; + #if KERNEL_WIDTH == 3 + Dtype_t blockA00 = vload3(0, src0_read0); src0_read0 += ROW_PITCH; + Dtype_t blockA01 = vload3(0, src0_read1); src0_read1 += ROW_PITCH; Dtype* pblockA00 = (Dtype*)(&blockA00); Dtype* pblockA01 = (Dtype*)(&blockA01); + #else + Dtype_t blockA00 = { (Dtype)0.f }; + Dtype_t blockA01 = { (Dtype)0.f }; + Dtype* pblockA00 = (Dtype*)(&blockA00); + Dtype* pblockA01 = (Dtype*)(&blockA01); + int pos = 0; + LOOP(KERNEL_WIDTH, pos, + { + if (curr_x0 + pos < input_width) + pblockA00[pos] = src0_read0[pos]; + + if (curr_x1 + pos < input_width) + pblockA01[pos] = src0_read1[pos]; + }) + src0_read0 += ROW_PITCH; + src0_read1 += ROW_PITCH; + #endif #else Dtype_t blockA00; Dtype* pblockA00 = (Dtype*)(&blockA00); int pos = 0; LOOP(KERNEL_WIDTH, pos, { - if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W) + if (curr_y0 >= INPUT_PAD_H && + curr_y0 < input_height + INPUT_PAD_H && + curr_x0 + pos * DILATION_X >= INPUT_PAD_W && + curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA00[pos] = src0_read0[pos * DILATION_X]; else pblockA00[pos] = 0; @@ -904,7 +938,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) pos = 0; LOOP(KERNEL_WIDTH, pos, { - if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W) + if (curr_y1 >= INPUT_PAD_H && + curr_y1 < input_height + INPUT_PAD_H && + curr_x1 + pos * DILATION_X >= INPUT_PAD_W && + curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA01[pos] = src0_read1[pos * DILATION_X]; else pblockA01[pos] = 0; @@ -972,7 +1009,8 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) curr_y0 = saved_y0; curr_y1 = saved_y1; #endif - src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch + // reset to start of next slice of patch + src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); } //while ( ++patch_depth < 1 ); //debug @@ -980,14 +1018,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) // Dst resembles a cube of width x height x (output channel * batches). Each tile writes: // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. 
- int out0_offset = global_z * out_pitch_z // batch offset - + ( group_x * TILE_N ) * out_pitch_y // channel offset + int out0_offset = global_z * out_pitch_z // batch offset + + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset - + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset - int out1_offset = global_z * out_pitch_z // batch offset - + ( group_x * TILE_N ) * out_pitch_y // channel offset + + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset + int out1_offset = global_z * out_pitch_z // batch offset + + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset - + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset + + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset #if APPLY_BIAS Dtype bias[4]; @@ -1049,11 +1087,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) int saved_y1 = curr_y1; #endif const __global Dtype *src0_read0 = src0 - + aligned_input_size * global_z // batch offset + + aligned_input_size * global_z // batch offset + (curr_y0 - INPUT_PAD_H) * ROW_PITCH // y offset + curr_x0 - INPUT_PAD_W; // x offset const __global Dtype *src0_read1 = src0 - + aligned_input_size * global_z // batch offset + + aligned_input_size * global_z // batch offset + (curr_y1 - INPUT_PAD_H) * ROW_PITCH // y offset + curr_x1 - INPUT_PAD_W; // x offset @@ -1084,7 +1122,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) int pos = 0; LOOP(KERNEL_WIDTH, pos, { - if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W) + if (curr_y0 >= INPUT_PAD_H && + curr_y0 < input_height + INPUT_PAD_H && + curr_x0 + pos * DILATION_X >= INPUT_PAD_W && + curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA00[pos] = src0_read0[pos * DILATION_X]; else pblockA00[pos] = 0; @@ -1095,7 +1136,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) pos = 0; LOOP(KERNEL_WIDTH, pos, { - if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W) + if (curr_y1 >= INPUT_PAD_H && + curr_y1 < input_height + INPUT_PAD_H && + curr_x1 + pos * DILATION_X >= INPUT_PAD_W && + curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA01[pos] = src0_read1[pos * DILATION_X]; else pblockA01[pos] = 0; @@ -1185,7 +1229,8 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) curr_y0 = saved_y0; curr_y1 = saved_y1; #endif - src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch + // reset to start of next slice of patch + src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); } //while ( ++patch_depth < 1 ); //debug @@ -1193,14 +1238,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) // Dst resembles a cube of width x height x (output channel * batches). Each tile writes: // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. 
- int out0_offset = global_z * out_pitch_z // batch offset - + ( group_x * TILE_N ) * out_pitch_y // channel offset + int out0_offset = global_z * out_pitch_z // batch offset + + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset - + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset - int out1_offset = global_z * out_pitch_z // batch offset - + ( group_x * TILE_N ) * out_pitch_y // channel offset + + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset + int out1_offset = global_z * out_pitch_z // batch offset + + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset - + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset + + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset __global Dtype *out1 = dst + out1_offset; #if APPLY_BIAS @@ -1352,9 +1397,9 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) int saved_y = curr_y; #endif const __global Dtype *src0_read = src0 - + aligned_input_size * global_z // batch offset + + aligned_input_size * global_z // batch offset + (curr_y - INPUT_PAD_H) * ROW_PITCH // y offset - + curr_x - INPUT_PAD_W; // x offset + + curr_x - INPUT_PAD_W; // x offset const __global Dtype *src0_read_orig = src0_read; // Src1 (filter) is directly used as btile. @@ -1409,15 +1454,23 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; #if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0 + #if KERNEL_WIDTH == 3 + Dtype_t blockA00 = vload3(0, src0_read); + Dtype* pblockA00 = (Dtype*)(&blockA00); + #else Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ]; Dtype* pblockA00 = (Dtype*)(&blockA00); + #endif #else Dtype_t blockA00; Dtype* pblockA00 = (Dtype*)(&blockA00); int pos = 0; LOOP(KERNEL_WIDTH, pos, { - if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W) + if (curr_y >= INPUT_PAD_H && + curr_y < input_height + INPUT_PAD_H && + curr_x + pos * DILATION_X >= INPUT_PAD_W && + curr_x + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA00[pos] = src0_read[pos * DILATION_X]; else pblockA00[pos] = 0; @@ -1463,17 +1516,18 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) //while( ++patch_row < 1 ); //debug while( ++patch_row < KERNEL_HEIGHT ); - src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch + // reset to start of next slice of patch + src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); } //while ( ++patch_depth < 1 ); //debug while ( ++patch_depth < INPUT_DEPTH ); // Dst resembles a cube of width x height x (output channel * batches). Each tile writes: // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. 
- int out_offset = global_z * out_pitch_z // batch offset - + ( group_x * TILE_N ) * out_pitch_y // channel offset + int out_offset = global_z * out_pitch_z // batch offset + + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset - + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset + + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset __global Dtype *out = dst + out_offset; #if APPLY_BIAS @@ -1556,11 +1610,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) int saved_y1 = curr_y1; #endif const __global Dtype *src0_read0 = src0 - + aligned_input_size * global_z // batch offset + + aligned_input_size * global_z // batch offset + (curr_y0 - INPUT_PAD_H) * ROW_PITCH // y offset + curr_x0 - INPUT_PAD_W; // x offset const __global Dtype *src0_read1 = src0 - + aligned_input_size * global_z // batch offset + + aligned_input_size * global_z // batch offset + (curr_y1 - INPUT_PAD_H) * ROW_PITCH // y offset + curr_x1 - INPUT_PAD_W; // x offset @@ -1600,7 +1654,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) int pos = 0; LOOP(KERNEL_WIDTH, pos, { - if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W) + if (curr_y0 >= INPUT_PAD_H && + curr_y0 < input_height + INPUT_PAD_H && + curr_x0 + pos * DILATION_X >= INPUT_PAD_W && + curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA00[pos] = src0_read0[pos * DILATION_X]; else pblockA00[pos] = 0; @@ -1611,7 +1668,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) pos = 0; LOOP(KERNEL_WIDTH, pos, { - if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W) + if (curr_y1 >= INPUT_PAD_H && + curr_y1 < input_height + INPUT_PAD_H && + curr_x1 + pos * DILATION_X >= INPUT_PAD_W && + curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA01[pos] = src0_read1[pos * DILATION_X]; else pblockA01[pos] = 0; @@ -1667,7 +1727,8 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) curr_y0 = saved_y0; curr_y1 = saved_y1; #endif - src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y); // reset to start of next slice of patch + // reset to start of next slice of patch + src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y); src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y); } //while ( ++patch_depth < 1 ); //debug @@ -1675,14 +1736,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) // Dst resembles a cube of width x height x (output channel * batches). Each tile writes: // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. 
- int out0_offset = global_z * out_pitch_z // batch offset - + ( group_x * TILE_N ) * out_pitch_y // channel offset + int out0_offset = global_z * out_pitch_z // batch offset + + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset - + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset - int out1_offset = global_z * out_pitch_z // batch offset - + ( group_x * TILE_N ) * out_pitch_y // channel offset + + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset + int out1_offset = global_z * out_pitch_z // batch offset + + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset - + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset + + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset #if APPLY_BIAS Dtype bias[2];
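[Editor's aside on PATCH 3/3 -- a minimal, self-contained OpenCL sketch, not
part of the patch: it illustrates why the patch switches to vload3() with an
explicit width check. A 3-component vector in OpenCL C occupies the same
storage as a 4-component one, so casting a scalar pointer to float3* and
dereferencing it can read one element past the end of a row (and may violate
alignment); vload3() reads exactly three scalars. The kernel name and buffer
layout below are illustrative assumptions.]

    // Sums three horizontally adjacent inputs per work-item, as a stand-in
    // for a KERNEL_WIDTH == 3 convolution row read.
    __kernel void row3_sum(__global const float* src, __global float* dst, int width)
    {
        int x = get_global_id(0);
        if (x + 3 > width) return;      // check read position against input width
        // vload3 reads exactly src[x], src[x+1], src[x+2] -- no 4th-element
        // over-read, unlike *((__global const float3*)(src + x)).
        float3 v = vload3(0, src + x);
        dst[x] = v.x + v.y + v.z;
    }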