Merge remote-tracking branch 'upstream/3.4' into merge-3.4

pull/17177/head
Alexander Alekhin 5 years ago
commit c722625f28
  1. 3rdparty/protobuf/src/google/protobuf/text_format.cc (72)
  2. 3rdparty/protobuf/src/google/protobuf/text_format.h (17)
  3. cmake/OpenCVFindProtobuf.cmake (35)
  4. modules/dnn/CMakeLists.txt (13)
  5. modules/dnn/src/caffe/caffe_io.cpp (15)
  6. modules/dnn/src/tensorflow/tf_graph_simplifier.cpp (60)
  7. modules/dnn/src/tensorflow/tf_importer.cpp (12)
  8. modules/dnn/test/test_tf_importer.cpp (16)
  9. modules/objdetect/src/hog.cpp (55)
  10. modules/objdetect/src/opencl/objdetect_hog.cl (145)
  11. samples/dnn/face_detector/train.prototxt (4)

@ -225,7 +225,9 @@ class TextFormat::Parser::ParserImpl {
bool allow_unknown_enum,
bool allow_field_number,
bool allow_relaxed_whitespace,
bool allow_partial)
bool allow_partial,
int recursion_limit // backported from 3.8.0
)
: error_collector_(error_collector),
finder_(finder),
parse_info_tree_(parse_info_tree),
@ -238,7 +240,9 @@ class TextFormat::Parser::ParserImpl {
allow_unknown_enum_(allow_unknown_enum),
allow_field_number_(allow_field_number),
allow_partial_(allow_partial),
had_errors_(false) {
had_errors_(false),
recursion_limit_(recursion_limit) // backported from 3.8.0
{
// For backwards-compatibility with proto1, we need to allow the 'f' suffix
// for floats.
tokenizer_.set_allow_f_after_float(true);
@ -490,9 +494,9 @@ class TextFormat::Parser::ParserImpl {
if (TryConsume(":") && !LookingAt("{") && !LookingAt("<")) {
UnknownFieldSet* unknown_field = unknown_fields->AddGroup(unknown_fields->field_count());
unknown_field->AddLengthDelimited(0, field_name); // Add a field's name.
return SkipFieldValue(unknown_field);
return SkipFieldValue(unknown_field, recursion_limit_);
} else {
return SkipFieldMessage(unknown_fields);
return SkipFieldMessage(unknown_fields, recursion_limit_);
}
}
@ -575,7 +579,14 @@ label_skip_parsing:
}
// Skips the next field including the field's name and value.
bool SkipField(UnknownFieldSet* unknown_fields) {
bool SkipField(UnknownFieldSet* unknown_fields, int recursion_limit) {
// OpenCV specific
if (--recursion_limit < 0) {
ReportError("Message is too deep (SkipField)");
return false;
}
string field_name;
if (TryConsume("[")) {
// Extension name.
@ -594,9 +605,9 @@ label_skip_parsing:
if (TryConsume(":") && !LookingAt("{") && !LookingAt("<")) {
UnknownFieldSet* unknown_field = unknown_fields->AddGroup(unknown_fields->field_count());
unknown_field->AddLengthDelimited(0, field_name); // Add a field's name.
DO(SkipFieldValue(unknown_field));
DO(SkipFieldValue(unknown_field, recursion_limit));
} else {
DO(SkipFieldMessage(unknown_fields));
DO(SkipFieldMessage(unknown_fields, recursion_limit));
}
// For historical reasons, fields may optionally be separated by commas or
// semicolons.
@ -608,6 +619,12 @@ label_skip_parsing:
const Reflection* reflection,
const FieldDescriptor* field) {
// backported from 3.8.0
if (--recursion_limit_ < 0) {
ReportError("Message is too deep");
return false;
}
// If the parse information tree is not NULL, create a nested one
// for the nested message.
ParseInfoTree* parent = parse_info_tree_;
@ -624,6 +641,9 @@ label_skip_parsing:
delimiter));
}
// backported from 3.8.0
++recursion_limit_;
// Reset the parse information tree.
parse_info_tree_ = parent;
return true;
@ -631,11 +651,17 @@ label_skip_parsing:
// Skips the whole body of a message including the beginning delimiter and
// the ending delimiter.
bool SkipFieldMessage(UnknownFieldSet* unknown_fields) {
bool SkipFieldMessage(UnknownFieldSet* unknown_fields, int recursion_limit) {
// OpenCV specific
if (--recursion_limit < 0) {
ReportError("Message is too deep (SkipFieldMessage)");
return false;
}
string delimiter;
DO(ConsumeMessageDelimiter(&delimiter));
while (!LookingAt(">") && !LookingAt("}")) {
DO(SkipField(unknown_fields));
DO(SkipField(unknown_fields, recursion_limit));
}
DO(Consume(delimiter));
return true;
@ -775,7 +801,14 @@ label_skip_parsing:
return true;
}
bool SkipFieldValue(UnknownFieldSet* unknown_field) {
bool SkipFieldValue(UnknownFieldSet* unknown_field, int recursion_limit) {
// OpenCV specific
if (--recursion_limit < 0) {
ReportError("Message is too deep (SkipFieldValue)");
return false;
}
if (LookingAtType(io::Tokenizer::TYPE_STRING)) {
while (LookingAtType(io::Tokenizer::TYPE_STRING)) {
tokenizer_.Next();
@ -785,9 +818,9 @@ label_skip_parsing:
if (TryConsume("[")) {
while (true) {
if (!LookingAt("{") && !LookingAt("<")) {
DO(SkipFieldValue(unknown_field));
DO(SkipFieldValue(unknown_field, recursion_limit));
} else {
DO(SkipFieldMessage(unknown_field));
DO(SkipFieldMessage(unknown_field, recursion_limit));
}
if (TryConsume("]")) {
break;
@ -1156,6 +1189,7 @@ label_skip_parsing:
const bool allow_field_number_;
const bool allow_partial_;
bool had_errors_;
int recursion_limit_; // backported from 3.8.0
};
#undef DO
@ -1306,17 +1340,19 @@ class TextFormat::Printer::TextGenerator
TextFormat::Finder::~Finder() {
}
TextFormat::Parser::Parser(bool allow_unknown_field)
TextFormat::Parser::Parser()
: error_collector_(NULL),
finder_(NULL),
parse_info_tree_(NULL),
allow_partial_(false),
allow_case_insensitive_field_(false),
allow_unknown_field_(allow_unknown_field),
allow_unknown_field_(false),
allow_unknown_enum_(false),
allow_field_number_(false),
allow_relaxed_whitespace_(false),
allow_singular_overwrites_(false) {
allow_singular_overwrites_(false),
recursion_limit_(std::numeric_limits<int>::max())
{
}
TextFormat::Parser::~Parser() {}
@ -1335,7 +1371,7 @@ bool TextFormat::Parser::Parse(io::ZeroCopyInputStream* input,
overwrites_policy,
allow_case_insensitive_field_, allow_unknown_field_,
allow_unknown_enum_, allow_field_number_,
allow_relaxed_whitespace_, allow_partial_);
allow_relaxed_whitespace_, allow_partial_, recursion_limit_);
return MergeUsingImpl(input, output, &parser);
}
@ -1353,7 +1389,7 @@ bool TextFormat::Parser::Merge(io::ZeroCopyInputStream* input,
ParserImpl::ALLOW_SINGULAR_OVERWRITES,
allow_case_insensitive_field_, allow_unknown_field_,
allow_unknown_enum_, allow_field_number_,
allow_relaxed_whitespace_, allow_partial_);
allow_relaxed_whitespace_, allow_partial_, recursion_limit_);
return MergeUsingImpl(input, output, &parser);
}
@ -1388,7 +1424,7 @@ bool TextFormat::Parser::ParseFieldValueFromString(
ParserImpl::ALLOW_SINGULAR_OVERWRITES,
allow_case_insensitive_field_, allow_unknown_field_,
allow_unknown_enum_, allow_field_number_,
allow_relaxed_whitespace_, allow_partial_);
allow_relaxed_whitespace_, allow_partial_, recursion_limit_);
return parser.ParseField(field, output);
}

@ -457,7 +457,7 @@ class LIBPROTOBUF_EXPORT TextFormat {
// For more control over parsing, use this class.
class LIBPROTOBUF_EXPORT Parser {
public:
Parser(bool allow_unknown_field = false);
Parser();
~Parser();
// Like TextFormat::Parse().
@ -508,10 +508,24 @@ class LIBPROTOBUF_EXPORT TextFormat {
Message* output);
// backported from 3.8.0
// When an unknown field is encountered, parsing will fail if this option is
// set to false (the default). If true, unknown fields will be ignored and
// a warning message will be generated.
// Be aware that setting this option to true may hide some errors (e.g. a
// misspelled field name). Avoid using this option if possible.
void AllowUnknownField(bool allow) { allow_unknown_field_ = allow; }
void AllowFieldNumber(bool allow) {
allow_field_number_ = allow;
}
// backported from 3.8.0
// Sets the maximum recursion depth the parser can use. This is effectively
// the maximum allowed nesting of proto messages.
void SetRecursionLimit(int limit) { recursion_limit_ = limit; }
private:
// Forward declaration of an internal class used to parse text
// representations (see text_format.cc for implementation).
@ -533,6 +547,7 @@ class LIBPROTOBUF_EXPORT TextFormat {
bool allow_field_number_;
bool allow_relaxed_whitespace_;
bool allow_singular_overwrites_;
int recursion_limit_; // backported from 3.8.0
};
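
The backport above bounds the depth of protobuf's recursive-descent text parser so that a maliciously nested .prototxt fails with an error instead of overflowing the stack. A minimal standalone sketch of the same guard pattern (hypothetical skipNestedBraces helper, not the protobuf code):

    #include <iostream>
    #include <string>

    // Mirrors the guard added to SkipField/SkipFieldMessage/SkipFieldValue:
    // every recursive call spends one unit of the remaining budget and bails
    // out cleanly once it is exhausted.
    static bool skipNestedBraces(const std::string& text, size_t& pos, int recursion_limit)
    {
        if (--recursion_limit < 0)
        {
            std::cerr << "Message is too deep" << std::endl;  // cf. ReportError(...)
            return false;
        }
        while (pos < text.size() && text[pos] != '}')
        {
            if (text[pos] == '{')
            {
                ++pos;  // consume '{'
                if (!skipNestedBraces(text, pos, recursion_limit))
                    return false;
            }
            ++pos;      // consume '}' or an ordinary character
        }
        return true;
    }

    int main()
    {
        std::string shallow = "{a{b}c}";
        std::string deep(2000, '{');
        size_t p = 0;
        std::cout << skipNestedBraces(shallow, p, 1000) << std::endl;  // 1: within the limit
        p = 0;
        std::cout << skipNestedBraces(deep, p, 1000) << std::endl;     // 0: rejected early
        return 0;
    }

The default limit stays at std::numeric_limits<int>::max(), so existing callers are unaffected until they opt in via SetRecursionLimit(), as the dnn loaders below do.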

@ -6,9 +6,15 @@ if(NOT WITH_PROTOBUF)
return()
endif()
ocv_option(BUILD_PROTOBUF "Force to build libprotobuf from sources" ON)
ocv_option(BUILD_PROTOBUF "Force to build libprotobuf runtime from sources" ON)
ocv_option(PROTOBUF_UPDATE_FILES "Force rebuilding .proto files (protoc should be available)" OFF)
# BUILD_PROTOBUF=OFF: Custom manual protobuf configuration (see find_package(Protobuf) for details):
# - Protobuf_INCLUDE_DIR
# - Protobuf_LIBRARY
# - Protobuf_PROTOC_EXECUTABLE
function(get_protobuf_version version include)
file(STRINGS "${include}/google/protobuf/stubs/common.h" ver REGEX "#define GOOGLE_PROTOBUF_VERSION [0-9]+")
string(REGEX MATCHALL "[0-9]+" ver ${ver})
@ -19,7 +25,9 @@ function(get_protobuf_version version include)
endfunction()
if(BUILD_PROTOBUF)
ocv_assert(NOT PROTOBUF_UPDATE_FILES)
add_subdirectory("${OpenCV_SOURCE_DIR}/3rdparty/protobuf")
set(Protobuf_LIBRARIES "libprotobuf")
set(HAVE_PROTOBUF TRUE)
else()
unset(Protobuf_VERSION CACHE)
@ -44,10 +52,7 @@ else()
if(Protobuf_FOUND)
if(TARGET protobuf::libprotobuf)
add_library(libprotobuf INTERFACE IMPORTED)
set_target_properties(libprotobuf PROPERTIES
INTERFACE_LINK_LIBRARIES protobuf::libprotobuf
)
set(Protobuf_LIBRARIES "protobuf::libprotobuf")
else()
add_library(libprotobuf UNKNOWN IMPORTED)
set_target_properties(libprotobuf PROPERTIES
@ -56,21 +61,31 @@ else()
INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${Protobuf_INCLUDE_DIR}"
)
get_protobuf_version(Protobuf_VERSION "${Protobuf_INCLUDE_DIR}")
set(Protobuf_LIBRARIES "libprotobuf")
endif()
set(HAVE_PROTOBUF TRUE)
endif()
endif()
if(HAVE_PROTOBUF AND PROTOBUF_UPDATE_FILES AND NOT COMMAND PROTOBUF_GENERATE_CPP)
find_package(Protobuf QUIET)
if(NOT COMMAND PROTOBUF_GENERATE_CPP)
message(FATAL_ERROR "PROTOBUF_GENERATE_CPP command is not available")
endif()
message(FATAL_ERROR "Can't configure protobuf dependency (BUILD_PROTOBUF=${BUILD_PROTOBUF} PROTOBUF_UPDATE_FILES=${PROTOBUF_UPDATE_FILES})")
endif()
if(HAVE_PROTOBUF)
list(APPEND CUSTOM_STATUS protobuf)
if(NOT BUILD_PROTOBUF)
if(TARGET "${Protobuf_LIBRARIES}")
get_target_property(__location "${Protobuf_LIBRARIES}" IMPORTED_LOCATION_RELEASE)
if(NOT __location)
get_target_property(__location "${Protobuf_LIBRARIES}" IMPORTED_LOCATION)
endif()
elseif(Protobuf_LIBRARY)
set(__location "${Protobuf_LIBRARY}")
else()
set(__location "${Protobuf_LIBRARIES}")
endif()
endif()
list(APPEND CUSTOM_STATUS_protobuf " Protobuf:"
BUILD_PROTOBUF THEN "build (${Protobuf_VERSION})"
ELSE "${Protobuf_LIBRARY} (${Protobuf_VERSION})")
ELSE "${__location} (${Protobuf_VERSION})")
endif()

@ -72,6 +72,9 @@ ocv_warnings_disable(CMAKE_CXX_FLAGS
-Winvalid-offsetof # Apple Clang (attr_value.pb.cc)
)
set(include_dirs "")
set(libs "")
if(PROTOBUF_UPDATE_FILES)
file(GLOB proto_files "${CMAKE_CURRENT_LIST_DIR}/src/tensorflow/*.proto" "${CMAKE_CURRENT_LIST_DIR}/src/caffe/opencv-caffe.proto" "${CMAKE_CURRENT_LIST_DIR}/src/onnx/opencv-onnx.proto")
set(PROTOBUF_GENERATE_CPP_APPEND_PATH ON) # required for tensorflow
@ -82,11 +85,15 @@ else()
set(fw_inc "${CMAKE_CURRENT_LIST_DIR}/misc/caffe" "${CMAKE_CURRENT_LIST_DIR}/misc/tensorflow" "${CMAKE_CURRENT_LIST_DIR}/misc/onnx")
endif()
set(include_dirs ${fw_inc})
set(sources_options "")
list(APPEND include_dirs ${fw_inc})
list(APPEND libs ${Protobuf_LIBRARIES})
if(NOT BUILD_PROTOBUF)
list(APPEND include_dirs ${Protobuf_INCLUDE_DIRS})
endif()
set(libs libprotobuf ${LAPACK_LIBRARIES})
set(sources_options "")
list(APPEND libs ${LAPACK_LIBRARIES})
if(OPENCV_DNN_OPENCL AND HAVE_OPENCL)
list(APPEND include_dirs ${OPENCL_INCLUDE_DIRS})
else()

@ -1120,11 +1120,12 @@ bool ReadProtoFromTextFile(const char* filename, Message* proto) {
std::ifstream fs(filename, std::ifstream::in);
CHECK(fs.is_open()) << "Can't open \"" << filename << "\"";
IstreamInputStream input(&fs);
google::protobuf::TextFormat::Parser parser;
#ifndef OPENCV_DNN_EXTERNAL_PROTOBUF
return google::protobuf::TextFormat::Parser(true).Parse(&input, proto);
#else
return google::protobuf::TextFormat::Parser().Parse(&input, proto);
parser.AllowUnknownField(true);
parser.SetRecursionLimit(1000);
#endif
return parser.Parse(&input, proto);
}
bool ReadProtoFromBinaryFile(const char* filename, Message* proto) {
@ -1137,12 +1138,12 @@ bool ReadProtoFromBinaryFile(const char* filename, Message* proto) {
bool ReadProtoFromTextBuffer(const char* data, size_t len, Message* proto) {
ArrayInputStream input(data, len);
google::protobuf::TextFormat::Parser parser;
#ifndef OPENCV_DNN_EXTERNAL_PROTOBUF
return google::protobuf::TextFormat::Parser(true).Parse(&input, proto);
#else
return google::protobuf::TextFormat::Parser().Parse(&input, proto);
parser.AllowUnknownField(true);
parser.SetRecursionLimit(1000);
#endif
return parser.Parse(&input, proto);
}
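
With the backport in place, the Caffe text loaders switch from the old Parser(true) constructor argument to explicit setters, applied only when the bundled protobuf is used (OPENCV_DNN_EXTERNAL_PROTOBUF not defined). A minimal usage sketch of the same pattern; MyProto stands for any protoc-generated message type and is a placeholder, not something defined in this patch:

    #include <google/protobuf/text_format.h>
    #include <google/protobuf/io/zero_copy_stream_impl_lite.h>

    bool parseTextProto(const char* data, size_t len, MyProto* proto)
    {
        google::protobuf::io::ArrayInputStream input(data, (int)len);
        google::protobuf::TextFormat::Parser parser;
    #ifndef OPENCV_DNN_EXTERNAL_PROTOBUF
        // Only the bundled (patched) protobuf has these backported knobs.
        parser.AllowUnknownField(true);  // tolerate fields unknown to opencv-caffe.proto
        parser.SetRecursionLimit(1000);  // fail early on absurdly nested messages
    #endif
        return parser.Parse(&input, proto);
    }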

@ -223,6 +223,26 @@ public:
}
};
class FlattenProdSubgraph : public Subgraph
{
public:
FlattenProdSubgraph()
{
int input = addNodeToMatch("");
int shape = addNodeToMatch("Shape", input);
int stack = addNodeToMatch("Const");
int stack_1 = addNodeToMatch("Const");
int stack_2 = addNodeToMatch("Const");
int strided_slice = addNodeToMatch("StridedSlice", shape, stack, stack_1, stack_2);
int prod = addNodeToMatch("Prod", strided_slice, addNodeToMatch("Const"));
int shape_pack = addNodeToMatch("Const");
int pack = addNodeToMatch("Pack", shape_pack, prod);
addNodeToMatch("Reshape", input, pack);
setFusedNode("Flatten", input);
}
};
// K.layers.Softmax
class SoftMaxKerasSubgraph : public Subgraph
{
@ -629,6 +649,36 @@ public:
}
};
class PReLUSubgraph : public TFSubgraph
{
public:
PReLUSubgraph(bool negativeScales_) : negativeScales(negativeScales_)
{
int input = addNodeToMatch("");
int scales = addNodeToMatch("Const");
int neg = addNodeToMatch("Neg", input);
int relu_neg = addNodeToMatch("Relu", neg);
int finalScales = negativeScales ? addNodeToMatch("Neg", scales) : scales;
int mul = addNodeToMatch("Mul", finalScales, relu_neg);
int relu_pos = addNodeToMatch("Relu", input);
addNodeToMatch("Add", relu_pos, mul);
setFusedNode("PReLU", input, scales);
}
virtual void finalize(tensorflow::GraphDef&, tensorflow::NodeDef* fusedNode,
std::vector<tensorflow::NodeDef*>& inputNodes) CV_OVERRIDE
{
if (!negativeScales)
{
Mat scales = getTensorContent(inputNodes[1]->attr().at("value").tensor(), /*copy*/false);
scales *= -1;
}
}
private:
bool negativeScales;
};
void simplifySubgraphs(tensorflow::GraphDef& net)
{
std::vector<Ptr<Subgraph> > subgraphs;
@ -649,6 +699,16 @@ void simplifySubgraphs(tensorflow::GraphDef& net)
subgraphs.push_back(Ptr<Subgraph>(new SoftMaxSlimV2Subgraph()));
subgraphs.push_back(Ptr<Subgraph>(new ReshapeAsShapeSubgraph()));
subgraphs.push_back(Ptr<Subgraph>(new KerasMVNSubgraph()));
subgraphs.push_back(Ptr<Subgraph>(new PReLUSubgraph(true)));
subgraphs.push_back(Ptr<Subgraph>(new PReLUSubgraph(false)));
subgraphs.push_back(Ptr<Subgraph>(new FlattenProdSubgraph()));
for (int i = 0; i < net.node_size(); ++i)
{
tensorflow::NodeDef* layer = net.mutable_node(i);
if (layer->op() == "AddV2")
layer->set_op("Add");
}
simplifySubgraphs(Ptr<ImportGraphWrapper>(new TFGraphWrapper(net)), subgraphs);
}
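
The two PReLU subgraphs rely on the algebraic identity behind the TF/Keras PReLU expansion matched above: Relu(x) + (-a) * Relu(-x) equals PReLU(x) with slope a, which is why finalize() flips the sign of the stored scales when the graph did not already contain the Neg node. A quick numeric check of that identity (illustration only):

    #include <algorithm>
    #include <cassert>
    #include <cmath>

    int main()
    {
        const float a = 0.25f;  // plays the role of the per-channel scales tensor
        for (float x = -5.f; x <= 5.f; x += 0.5f)
        {
            float prelu   = std::max(x, 0.f) + a * std::min(x, 0.f);   // PReLU definition
            float matched = std::max(x, 0.f) - a * std::max(-x, 0.f);  // fused subgraph
            assert(std::fabs(prelu - matched) < 1e-6f);
        }
        return 0;
    }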

@ -1231,6 +1231,7 @@ void TFImporter::populateNet(Net dstNet)
// Only NHWC <-> NCHW permutations are allowed. OpenCV always
// keeps the NCHW layout this way.
int inpLayout = getDataLayout(layer.input(0), data_layouts);
std::string type = "Identity";
if (inpLayout == DATA_LAYOUT_NHWC)
{
if (permData[0] == 0 && permData[1] == 3 && permData[2] == 1 && permData[3] == 2)
@ -1245,6 +1246,15 @@ void TFImporter::populateNet(Net dstNet)
// in OpenCV: NCHW->NCHW
data_layouts[name] = DATA_LAYOUT_NHWC;
}
else if (permData[0] == 0 && permData[1] == 3 && permData[2] == 2 && permData[3] == 1)
{
// in TensorFlow: NHWC->NCWH
// in OpenCV: NCHW->NCWH
int permData[] = {0, 1, 3, 2};
layerParams.set("order", DictValue::arrayInt<int*>(permData, perm.total()));
data_layouts[name] = DATA_LAYOUT_NCHW; // we keep tracking NCHW because only the channel position matters
type = "Permute";
}
else
CV_Error(Error::StsParseError, "Only NHWC <-> NCHW permutations are allowed.");
}
@ -1265,7 +1275,7 @@ void TFImporter::populateNet(Net dstNet)
else
CV_Error(Error::StsParseError, "Only NHWC <-> NCHW permutations are allowed.");
}
int id = dstNet.addLayer(name, "Identity", layerParams);
int id = dstNet.addLayer(name, type, layerParams);
layer_id[name] = id;
connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
}
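
For the Transpose handling above, the key observation is that OpenCV blobs are already stored NCHW, so a TensorFlow permutation written against NHWC axis numbering has to be re-expressed over NCHW axes; NHWC->NCHW becomes the identity, while the newly supported NHWC->NCWH becomes {0, 1, 3, 2}. A hypothetical helper (not the importer code) showing that axis bookkeeping:

    #include <cassert>
    #include <vector>

    // nhwcToNchw[i] is the position of NHWC axis i inside an NCHW blob.
    static std::vector<int> tfPermToOpenCV(const std::vector<int>& tfPerm)
    {
        const int nhwcToNchw[4] = {0, 2, 3, 1};  // N->0, H->2, W->3, C->1
        std::vector<int> cvPerm(4);
        for (int i = 0; i < 4; ++i)
            cvPerm[i] = nhwcToNchw[tfPerm[i]];
        return cvPerm;
    }

    int main()
    {
        // NHWC->NCHW in TensorFlow is a no-op for NCHW blobs: the Identity branch.
        assert((tfPermToOpenCV({0, 3, 1, 2}) == std::vector<int>{0, 1, 2, 3}));
        // NHWC->NCWH (added here) only swaps H and W: the new Permute branch.
        assert((tfPermToOpenCV({0, 3, 2, 1}) == std::vector<int>{0, 1, 3, 2}));
        return 0;
    }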

@ -1035,11 +1035,25 @@ TEST_P(Test_TensorFlow_layers, resize_bilinear)
runTensorFlowNet("resize_bilinear_factor");
}
TEST_P(Test_TensorFlow_layers, tf2_keras)
TEST_P(Test_TensorFlow_layers, tf2_dense)
{
runTensorFlowNet("tf2_dense");
}
TEST_P(Test_TensorFlow_layers, tf2_prelu)
{
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
runTensorFlowNet("tf2_prelu");
}
TEST_P(Test_TensorFlow_layers, tf2_permute_nhwc_ncwh)
{
runTensorFlowNet("tf2_permute_nhwc_ncwh");
}
TEST_P(Test_TensorFlow_layers, squeeze)
{
#if defined(INF_ENGINE_RELEASE)

@ -1218,15 +1218,6 @@ static bool ocl_compute_hists(int nbins, int block_stride_x, int block_stride_y,
UMat grad, UMat qangle, UMat gauss_w_lut, UMat block_hists, size_t block_hist_size)
{
ocl::Kernel k("compute_hists_lut_kernel", ocl::objdetect::objdetect_hog_oclsrc);
if(k.empty())
return false;
bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU;
cv::String opts;
if(is_cpu)
opts = "-D CPU ";
else
opts = cv::format("-D WAVE_SIZE=%zu", k.preferedWorkGroupSizeMultiple());
k.create("compute_hists_lut_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
if(k.empty())
return false;
@ -1287,19 +1278,10 @@ static bool ocl_normalize_hists(int nbins, int block_stride_x, int block_stride_
size_t localThreads[3] = { 1, 1, 1 };
int idx = 0;
bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU;
cv::String opts;
ocl::Kernel k;
if ( nbins == 9 )
{
k.create("normalize_hists_36_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
if(k.empty())
return false;
if(is_cpu)
opts = "-D CPU ";
else
opts = cv::format("-D WAVE_SIZE=%zu", k.preferedWorkGroupSizeMultiple());
k.create("normalize_hists_36_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
if(k.empty())
return false;
@ -1311,14 +1293,7 @@ static bool ocl_normalize_hists(int nbins, int block_stride_x, int block_stride_
}
else
{
k.create("normalize_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, "-D WAVE_SIZE=32");
if(k.empty())
return false;
if(is_cpu)
opts = "-D CPU ";
else
opts = cv::format("-D WAVE_SIZE=%zu", k.preferedWorkGroupSizeMultiple());
k.create("normalize_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
k.create("normalize_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
if(k.empty())
return false;
@ -1736,7 +1711,6 @@ static bool ocl_classify_hists(int win_height, int win_width, int block_stride_y
float free_coef, float threshold, UMat& labels, Size descr_size, int block_hist_size)
{
int nthreads;
bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU;
cv::String opts;
ocl::Kernel k;
@ -1745,14 +1719,7 @@ static bool ocl_classify_hists(int win_height, int win_width, int block_stride_y
{
case 180:
nthreads = 180;
k.create("classify_hists_180_kernel", ocl::objdetect::objdetect_hog_oclsrc, "-D WAVE_SIZE=32");
if(k.empty())
return false;
if(is_cpu)
opts = "-D CPU ";
else
opts = cv::format("-D WAVE_SIZE=%zu", k.preferedWorkGroupSizeMultiple());
k.create("classify_hists_180_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
k.create("classify_hists_180_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
if(k.empty())
return false;
idx = k.set(idx, descr_size.width);
@ -1761,14 +1728,7 @@ static bool ocl_classify_hists(int win_height, int win_width, int block_stride_y
case 252:
nthreads = 256;
k.create("classify_hists_252_kernel", ocl::objdetect::objdetect_hog_oclsrc, "-D WAVE_SIZE=32");
if(k.empty())
return false;
if(is_cpu)
opts = "-D CPU ";
else
opts = cv::format("-D WAVE_SIZE=%zu", k.preferedWorkGroupSizeMultiple());
k.create("classify_hists_252_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
k.create("classify_hists_252_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
if(k.empty())
return false;
idx = k.set(idx, descr_size.width);
@ -1777,14 +1737,7 @@ static bool ocl_classify_hists(int win_height, int win_width, int block_stride_y
default:
nthreads = 256;
k.create("classify_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, "-D WAVE_SIZE=32");
if(k.empty())
return false;
if(is_cpu)
opts = "-D CPU ";
else
opts = cv::format("-D WAVE_SIZE=%zu", k.preferedWorkGroupSizeMultiple());
k.create("classify_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
k.create("classify_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
if(k.empty())
return false;
idx = k.set(idx, descr_size.area());
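
The hog.cpp changes drop the two-pass kernel build (create once, query preferedWorkGroupSizeMultiple(), then re-create with -D WAVE_SIZE=... or -D CPU) because the rewritten objdetect_hog.cl below no longer branches on those defines. A minimal sketch of the simplified single-pass pattern with the public cv::ocl API; "my_kernel" and its source are placeholders, not the HOG kernels:

    #include <opencv2/core/ocl.hpp>
    #include <iostream>

    int main()
    {
        if (!cv::ocl::haveOpenCL())
            return 0;  // nothing to demonstrate without an OpenCL device
        const char* kernelSrc =
            "__kernel void my_kernel(__global float* buf) { buf[get_global_id(0)] *= 2.0f; }";
        cv::ocl::ProgramSource src(kernelSrc);

        cv::ocl::Kernel k("my_kernel", src, "");  // no device-specific build options
        if (k.empty())
        {
            std::cerr << "kernel build failed" << std::endl;
            return 1;
        }
        std::cout << "preferred work-group size multiple: "
                  << k.preferedWorkGroupSizeMultiple() << std::endl;
        return 0;
    }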

@ -134,9 +134,7 @@ __kernel void compute_hists_lut_kernel(
barrier(CLK_LOCAL_MEM_FENCE);
if (cell_thread_x < 3)
hist_[0] += hist_[3];
#ifdef CPU
barrier(CLK_LOCAL_MEM_FENCE);
#endif
if (cell_thread_x == 0)
final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] =
hist_[0] + hist_[1] + hist_[2];
@ -218,7 +216,6 @@ inline float reduce_smem(volatile __local float* smem, int size)
barrier(CLK_LOCAL_MEM_FENCE); }
if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64];
barrier(CLK_LOCAL_MEM_FENCE); }
#ifdef CPU
if (size >= 64) { if (tid < 32) smem[tid] = sum = sum + smem[tid + 32];
barrier(CLK_LOCAL_MEM_FENCE); }
if (size >= 32) { if (tid < 16) smem[tid] = sum = sum + smem[tid + 16];
@ -231,21 +228,6 @@ inline float reduce_smem(volatile __local float* smem, int size)
barrier(CLK_LOCAL_MEM_FENCE); }
if (size >= 2) { if (tid < 1) smem[tid] = sum = sum + smem[tid + 1];
barrier(CLK_LOCAL_MEM_FENCE); }
#else
if (tid < 32)
{
if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];
#if WAVE_SIZE < 32
} barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16) {
#endif
if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];
if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];
if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];
}
#endif
return sum;
}
@ -284,6 +266,10 @@ __kernel void normalize_hists_kernel(
hist[0] = elem * scale;
}
#define reduce_with_sync(target, sharedMemory, localMemory, tid, offset) \
if (tid < target) sharedMemory[tid] = localMemory = localMemory + sharedMemory[tid + offset]; \
barrier(CLK_LOCAL_MEM_FENCE);
//---------------------------------------------------------------------
// Linear SVM based classification
// 48x96 window, 9 bins and default parameters
@ -316,43 +302,16 @@ __kernel void classify_hists_180_kernel(
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 90) products[tid] = product = product + products[tid + 90];
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 45) products[tid] = product = product + products[tid + 45];
barrier(CLK_LOCAL_MEM_FENCE);
volatile __local float* smem = products;
#ifdef CPU
if (tid < 13) smem[tid] = product = product + smem[tid + 32];
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16) smem[tid] = product = product + smem[tid + 16];
barrier(CLK_LOCAL_MEM_FENCE);
if(tid<8) smem[tid] = product = product + smem[tid + 8];
barrier(CLK_LOCAL_MEM_FENCE);
if(tid<4) smem[tid] = product = product + smem[tid + 4];
barrier(CLK_LOCAL_MEM_FENCE);
if(tid<2) smem[tid] = product = product + smem[tid + 2];
barrier(CLK_LOCAL_MEM_FENCE);
#else
if (tid < 13)
{
smem[tid] = product = product + smem[tid + 32];
}
#if WAVE_SIZE < 32
barrier(CLK_LOCAL_MEM_FENCE);
#endif
if (tid < 16)
{
smem[tid] = product = product + smem[tid + 16];
smem[tid] = product = product + smem[tid + 8];
smem[tid] = product = product + smem[tid + 4];
smem[tid] = product = product + smem[tid + 2];
}
#endif
reduce_with_sync(90, products, product, tid, 90);
reduce_with_sync(45, products, product, tid, 45);
reduce_with_sync(13, products, product, tid, 32); // 13 is not a typo
reduce_with_sync(16, products, product, tid, 16);
reduce_with_sync(8, products, product, tid, 8);
reduce_with_sync(4, products, product, tid, 4);
reduce_with_sync(2, products, product, tid, 2);
if (tid == 0){
product = product + smem[tid + 1];
product = product + products[tid + 1];
labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
}
}
@ -389,40 +348,16 @@ __kernel void classify_hists_252_kernel(
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 128) products[tid] = product = product + products[tid + 128];
barrier(CLK_LOCAL_MEM_FENCE);
reduce_with_sync(128, products, product, tid, 128);
reduce_with_sync(64, products, product, tid, 64);
reduce_with_sync(32, products, product, tid, 32);
reduce_with_sync(16, products, product, tid, 16);
reduce_with_sync(8, products, product, tid, 8);
reduce_with_sync(4, products, product, tid, 4);
reduce_with_sync(2, products, product, tid, 2);
if (tid < 64) products[tid] = product = product + products[tid + 64];
barrier(CLK_LOCAL_MEM_FENCE);
volatile __local float* smem = products;
#ifdef CPU
if(tid<32) smem[tid] = product = product + smem[tid + 32];
barrier(CLK_LOCAL_MEM_FENCE);
if(tid<16) smem[tid] = product = product + smem[tid + 16];
barrier(CLK_LOCAL_MEM_FENCE);
if(tid<8) smem[tid] = product = product + smem[tid + 8];
barrier(CLK_LOCAL_MEM_FENCE);
if(tid<4) smem[tid] = product = product + smem[tid + 4];
barrier(CLK_LOCAL_MEM_FENCE);
if(tid<2) smem[tid] = product = product + smem[tid + 2];
barrier(CLK_LOCAL_MEM_FENCE);
#else
if (tid < 32)
{
smem[tid] = product = product + smem[tid + 32];
#if WAVE_SIZE < 32
} barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16) {
#endif
smem[tid] = product = product + smem[tid + 16];
smem[tid] = product = product + smem[tid + 8];
smem[tid] = product = product + smem[tid + 4];
smem[tid] = product = product + smem[tid + 2];
}
#endif
if (tid == 0){
product = product + smem[tid + 1];
product = product + products[tid + 1];
labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
}
}
@ -459,40 +394,16 @@ __kernel void classify_hists_kernel(
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 128) products[tid] = product = product + products[tid + 128];
barrier(CLK_LOCAL_MEM_FENCE);
reduce_with_sync(128, products, product, tid, 128);
reduce_with_sync(64, products, product, tid, 64);
reduce_with_sync(32, products, product, tid, 32);
reduce_with_sync(16, products, product, tid, 16);
reduce_with_sync(8, products, product, tid, 8);
reduce_with_sync(4, products, product, tid, 4);
reduce_with_sync(2, products, product, tid, 2);
if (tid < 64) products[tid] = product = product + products[tid + 64];
barrier(CLK_LOCAL_MEM_FENCE);
volatile __local float* smem = products;
#ifdef CPU
if(tid<32) smem[tid] = product = product + smem[tid + 32];
barrier(CLK_LOCAL_MEM_FENCE);
if(tid<16) smem[tid] = product = product + smem[tid + 16];
barrier(CLK_LOCAL_MEM_FENCE);
if(tid<8) smem[tid] = product = product + smem[tid + 8];
barrier(CLK_LOCAL_MEM_FENCE);
if(tid<4) smem[tid] = product = product + smem[tid + 4];
barrier(CLK_LOCAL_MEM_FENCE);
if(tid<2) smem[tid] = product = product + smem[tid + 2];
barrier(CLK_LOCAL_MEM_FENCE);
#else
if (tid < 32)
{
smem[tid] = product = product + smem[tid + 32];
#if WAVE_SIZE < 32
} barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16) {
#endif
smem[tid] = product = product + smem[tid + 16];
smem[tid] = product = product + smem[tid + 8];
smem[tid] = product = product + smem[tid + 4];
smem[tid] = product = product + smem[tid + 2];
}
#endif
if (tid == 0){
smem[tid] = product = product + smem[tid + 1];
products[tid] = product = product + products[tid + 1];
labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
}
}
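
The reduce_with_sync macro above replaces both the #ifdef CPU and the WAVE_SIZE-dependent reduction variants with one barrier-synchronised halving schedule: at each step the first 'target' work-items fold in the element 'offset' slots away, until products[0] holds the full dot product. A serial C++ sketch of the same schedule, for intuition only (the real kernel runs one step per work-item in lock-step):

    #include <cassert>
    #include <vector>

    // One reduce_with_sync step, emulated serially; the barrier is implicit
    // because the "work-items" run one after another here.
    static void reduceStep(std::vector<float>& smem, int target, int offset)
    {
        for (int tid = 0; tid < target; ++tid)
            smem[tid] += smem[tid + offset];
    }

    int main()
    {
        std::vector<float> products(256, 1.0f);  // 256 partial products, as in the 252 kernel
        reduceStep(products, 128, 128);
        reduceStep(products, 64, 64);
        reduceStep(products, 32, 32);
        reduceStep(products, 16, 16);
        reduceStep(products, 8, 8);
        reduceStep(products, 4, 4);
        reduceStep(products, 2, 2);
        products[0] += products[1];              // final add done by tid == 0
        assert(products[0] == 256.0f);
        return 0;
    }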

@ -1020,7 +1020,7 @@ layer {
}
convolution_param {
num_output: 128
pad: 1
pad: 0
kernel_size: 3
stride: 1
weight_filler {
@ -1600,7 +1600,7 @@ layer {
}
convolution_param {
num_output: 16
pad: 0
pad: 1
kernel_size: 3
stride: 1
weight_filler {

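The train.prototxt tweak swaps the padding of two 3x3 convolutions (pad 1 -> 0 for the 128-output layer, pad 0 -> 1 for the 16-output layer). With kernel_size 3 and stride 1 the spatial output size is (in + 2*pad - kernel)/stride + 1, so pad 1 preserves the input resolution while pad 0 shrinks it by 2. A quick check of that arithmetic (38 is just an arbitrary input size):

    #include <cassert>

    // Caffe convolution output size for one spatial dimension (floor division).
    static int convOutSize(int in, int kernel, int pad, int stride)
    {
        return (in + 2 * pad - kernel) / stride + 1;
    }

    int main()
    {
        assert(convOutSize(38, 3, /*pad=*/1, /*stride=*/1) == 38);  // pad 1 keeps the size
        assert(convOutSize(38, 3, /*pad=*/0, /*stride=*/1) == 36);  // pad 0 loses a 1-pixel border
        return 0;
    }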