From d9910542026a55a289e41955d7cbc2bcf0e29060 Mon Sep 17 00:00:00 2001 From: Aleksandr Rybnikov Date: Wed, 30 Aug 2017 12:50:17 +0300 Subject: [PATCH] Added files for face detector sample --- samples/dnn/CMakeLists.txt | 21 + samples/dnn/face_detector/.gitignore | 1 + samples/dnn/face_detector/deploy.prototxt | 1789 ++++++++++++++++ .../how_to_train_face_detector.txt | 79 + samples/dnn/face_detector/solver.prototxt | 28 + samples/dnn/face_detector/test.prototxt | 1830 ++++++++++++++++ samples/dnn/face_detector/train.prototxt | 1898 +++++++++++++++++ samples/dnn/resnet_ssd_face_python.py | 59 + 8 files changed, 5705 insertions(+) create mode 100644 samples/dnn/face_detector/.gitignore create mode 100644 samples/dnn/face_detector/deploy.prototxt create mode 100644 samples/dnn/face_detector/how_to_train_face_detector.txt create mode 100644 samples/dnn/face_detector/solver.prototxt create mode 100644 samples/dnn/face_detector/test.prototxt create mode 100644 samples/dnn/face_detector/train.prototxt create mode 100644 samples/dnn/resnet_ssd_face_python.py diff --git a/samples/dnn/CMakeLists.txt b/samples/dnn/CMakeLists.txt index 83ac3cd3dc..1e7f8fe830 100644 --- a/samples/dnn/CMakeLists.txt +++ b/samples/dnn/CMakeLists.txt @@ -8,6 +8,27 @@ ocv_check_dependencies(${OPENCV_DNN_SAMPLES_REQUIRED_DEPS}) if(BUILD_EXAMPLES AND OCV_DEPENDENCIES_FOUND) project(dnn_samples) + # Model branch name: dnn_samples_face_detector_20170830 + set(DNN_FACE_DETECTOR_MODEL_COMMIT "b2bfc75f6aea5b1f834ff0f0b865a7c18ff1459f") + set(DNN_FACE_DETECTOR_MODEL_HASH "afbb6037fd180e8d2acb3b58ca737b9e") + + set(DNN_FACE_DETECTOR_MODEL_NAME "res10_300x300_ssd_iter_140000.caffemodel") + + set(DNN_FACE_DETECTOR_MODEL_DOWNLOAD_DIR "${CMAKE_CURRENT_LIST_DIR}/face_detector") + + if(COMMAND ocv_download) + ocv_download(FILENAME ${DNN_FACE_DETECTOR_MODEL_NAME} + HASH ${DNN_FACE_DETECTOR_MODEL_HASH} + URL + "$ENV{OPENCV_DNN_MODELS_URL}" + "${OPENCV_DNN_MODELS_URL}" + 
"https://raw.githubusercontent.com/opencv/opencv_3rdparty/${DNN_FACE_DETECTOR_MODEL_COMMIT}/" + DESTINATION_DIR ${DNN_FACE_DETECTOR_MODEL_DOWNLOAD_DIR} + ID DNN_FACE_DETECTOR + RELATIVE_URL + STATUS res) + endif() + ocv_include_directories("${OpenCV_SOURCE_DIR}/include") ocv_include_modules_recurse(${OPENCV_DNN_SAMPLES_REQUIRED_DEPS}) diff --git a/samples/dnn/face_detector/.gitignore b/samples/dnn/face_detector/.gitignore new file mode 100644 index 0000000000..6fa25e16eb --- /dev/null +++ b/samples/dnn/face_detector/.gitignore @@ -0,0 +1 @@ +res10_300x300_ssd_iter_140000.caffemodel diff --git a/samples/dnn/face_detector/deploy.prototxt b/samples/dnn/face_detector/deploy.prototxt new file mode 100644 index 0000000000..f3eba00d3c --- /dev/null +++ b/samples/dnn/face_detector/deploy.prototxt @@ -0,0 +1,1789 @@ +input: "data" +input_shape { + dim: 1 + dim: 3 + dim: 300 + dim: 300 +} + +layer { + name: "data_bn" + type: "BatchNorm" + bottom: "data" + top: "data_bn" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "data_scale" + type: "Scale" + bottom: "data_bn" + top: "data_bn" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "conv1_h" + type: "Convolution" + bottom: "data_bn" + top: "conv1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 32 + pad: 3 + kernel_size: 7 + stride: 2 + weight_filler { + type: "msra" + variance_norm: FAN_OUT + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv1_bn_h" + type: "BatchNorm" + bottom: "conv1_h" + top: "conv1_h" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "conv1_scale_h" + type: "Scale" + bottom: "conv1_h" + top: "conv1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + 
decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "conv1_relu" + type: "ReLU" + bottom: "conv1_h" + top: "conv1_h" +} +layer { + name: "conv1_pool" + type: "Pooling" + bottom: "conv1_h" + top: "conv1_pool" + pooling_param { + kernel_size: 3 + stride: 2 + } +} +layer { + name: "layer_64_1_conv1_h" + type: "Convolution" + bottom: "conv1_pool" + top: "layer_64_1_conv1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 32 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_64_1_bn2_h" + type: "BatchNorm" + bottom: "layer_64_1_conv1_h" + top: "layer_64_1_conv1_h" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "layer_64_1_scale2_h" + type: "Scale" + bottom: "layer_64_1_conv1_h" + top: "layer_64_1_conv1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "layer_64_1_relu2" + type: "ReLU" + bottom: "layer_64_1_conv1_h" + top: "layer_64_1_conv1_h" +} +layer { + name: "layer_64_1_conv2_h" + type: "Convolution" + bottom: "layer_64_1_conv1_h" + top: "layer_64_1_conv2_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 32 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_64_1_sum" + type: "Eltwise" + bottom: "layer_64_1_conv2_h" + bottom: "conv1_pool" + top: "layer_64_1_sum" +} +layer { + name: "layer_128_1_bn1_h" + type: "BatchNorm" + bottom: "layer_64_1_sum" + top: "layer_128_1_bn1_h" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "layer_128_1_scale1_h" + type: "Scale" + bottom: "layer_128_1_bn1_h" + top: 
"layer_128_1_bn1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "layer_128_1_relu1" + type: "ReLU" + bottom: "layer_128_1_bn1_h" + top: "layer_128_1_bn1_h" +} +layer { + name: "layer_128_1_conv1_h" + type: "Convolution" + bottom: "layer_128_1_bn1_h" + top: "layer_128_1_conv1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 128 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_128_1_bn2" + type: "BatchNorm" + bottom: "layer_128_1_conv1_h" + top: "layer_128_1_conv1_h" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "layer_128_1_scale2" + type: "Scale" + bottom: "layer_128_1_conv1_h" + top: "layer_128_1_conv1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "layer_128_1_relu2" + type: "ReLU" + bottom: "layer_128_1_conv1_h" + top: "layer_128_1_conv1_h" +} +layer { + name: "layer_128_1_conv2" + type: "Convolution" + bottom: "layer_128_1_conv1_h" + top: "layer_128_1_conv2" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 128 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_128_1_conv_expand_h" + type: "Convolution" + bottom: "layer_128_1_bn1_h" + top: "layer_128_1_conv_expand_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 128 + bias_term: false + pad: 0 + kernel_size: 1 + stride: 2 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_128_1_sum" + type: "Eltwise" + bottom: 
"layer_128_1_conv2" + bottom: "layer_128_1_conv_expand_h" + top: "layer_128_1_sum" +} +layer { + name: "layer_256_1_bn1" + type: "BatchNorm" + bottom: "layer_128_1_sum" + top: "layer_256_1_bn1" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "layer_256_1_scale1" + type: "Scale" + bottom: "layer_256_1_bn1" + top: "layer_256_1_bn1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "layer_256_1_relu1" + type: "ReLU" + bottom: "layer_256_1_bn1" + top: "layer_256_1_bn1" +} +layer { + name: "layer_256_1_conv1" + type: "Convolution" + bottom: "layer_256_1_bn1" + top: "layer_256_1_conv1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 256 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_256_1_bn2" + type: "BatchNorm" + bottom: "layer_256_1_conv1" + top: "layer_256_1_conv1" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "layer_256_1_scale2" + type: "Scale" + bottom: "layer_256_1_conv1" + top: "layer_256_1_conv1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "layer_256_1_relu2" + type: "ReLU" + bottom: "layer_256_1_conv1" + top: "layer_256_1_conv1" +} +layer { + name: "layer_256_1_conv2" + type: "Convolution" + bottom: "layer_256_1_conv1" + top: "layer_256_1_conv2" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 256 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_256_1_conv_expand" + type: "Convolution" + bottom: "layer_256_1_bn1" + 
top: "layer_256_1_conv_expand" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 256 + bias_term: false + pad: 0 + kernel_size: 1 + stride: 2 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_256_1_sum" + type: "Eltwise" + bottom: "layer_256_1_conv2" + bottom: "layer_256_1_conv_expand" + top: "layer_256_1_sum" +} +layer { + name: "layer_512_1_bn1" + type: "BatchNorm" + bottom: "layer_256_1_sum" + top: "layer_512_1_bn1" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "layer_512_1_scale1" + type: "Scale" + bottom: "layer_512_1_bn1" + top: "layer_512_1_bn1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "layer_512_1_relu1" + type: "ReLU" + bottom: "layer_512_1_bn1" + top: "layer_512_1_bn1" +} +layer { + name: "layer_512_1_conv1_h" + type: "Convolution" + bottom: "layer_512_1_bn1" + top: "layer_512_1_conv1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 128 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 # 2 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_512_1_bn2_h" + type: "BatchNorm" + bottom: "layer_512_1_conv1_h" + top: "layer_512_1_conv1_h" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "layer_512_1_scale2_h" + type: "Scale" + bottom: "layer_512_1_conv1_h" + top: "layer_512_1_conv1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "layer_512_1_relu2" + type: "ReLU" + bottom: "layer_512_1_conv1_h" + top: "layer_512_1_conv1_h" +} +layer { + name: "layer_512_1_conv2_h" + type: "Convolution" + bottom: "layer_512_1_conv1_h" + 
top: "layer_512_1_conv2_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 256 + bias_term: false + pad: 2 # 1 + kernel_size: 3 + stride: 1 + dilation: 2 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_512_1_conv_expand_h" + type: "Convolution" + bottom: "layer_512_1_bn1" + top: "layer_512_1_conv_expand_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 256 + bias_term: false + pad: 0 + kernel_size: 1 + stride: 1 # 2 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_512_1_sum" + type: "Eltwise" + bottom: "layer_512_1_conv2_h" + bottom: "layer_512_1_conv_expand_h" + top: "layer_512_1_sum" +} +layer { + name: "last_bn_h" + type: "BatchNorm" + bottom: "layer_512_1_sum" + top: "layer_512_1_sum" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "last_scale_h" + type: "Scale" + bottom: "layer_512_1_sum" + top: "layer_512_1_sum" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "last_relu" + type: "ReLU" + bottom: "layer_512_1_sum" + top: "fc7" +} + +layer { + name: "conv6_1_h" + type: "Convolution" + bottom: "fc7" + top: "conv6_1_h" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv6_1_relu" + type: "ReLU" + bottom: "conv6_1_h" + top: "conv6_1_h" +} +layer { + name: "conv6_2_h" + type: "Convolution" + bottom: "conv6_1_h" + top: "conv6_2_h" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + 
kernel_size: 3 + stride: 2 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv6_2_relu" + type: "ReLU" + bottom: "conv6_2_h" + top: "conv6_2_h" +} +layer { + name: "conv7_1_h" + type: "Convolution" + bottom: "conv6_2_h" + top: "conv7_1_h" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv7_1_relu" + type: "ReLU" + bottom: "conv7_1_h" + top: "conv7_1_h" +} +layer { + name: "conv7_2_h" + type: "Convolution" + bottom: "conv7_1_h" + top: "conv7_2_h" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv7_2_relu" + type: "ReLU" + bottom: "conv7_2_h" + top: "conv7_2_h" +} +layer { + name: "conv8_1_h" + type: "Convolution" + bottom: "conv7_2_h" + top: "conv8_1_h" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv8_1_relu" + type: "ReLU" + bottom: "conv8_1_h" + top: "conv8_1_h" +} +layer { + name: "conv8_2_h" + type: "Convolution" + bottom: "conv8_1_h" + top: "conv8_2_h" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv8_2_relu" + type: "ReLU" + bottom: "conv8_2_h" + top: "conv8_2_h" +} +layer { + 
name: "conv9_1_h" + type: "Convolution" + bottom: "conv8_2_h" + top: "conv9_1_h" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv9_1_relu" + type: "ReLU" + bottom: "conv9_1_h" + top: "conv9_1_h" +} +layer { + name: "conv9_2_h" + type: "Convolution" + bottom: "conv9_1_h" + top: "conv9_2_h" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv9_2_relu" + type: "ReLU" + bottom: "conv9_2_h" + top: "conv9_2_h" +} +layer { + name: "conv4_3_norm" + type: "Normalize" + bottom: "layer_256_1_bn1" + top: "conv4_3_norm" + norm_param { + across_spatial: false + scale_filler { + type: "constant" + value: 20 + } + channel_shared: false + } +} +layer { + name: "conv4_3_norm_mbox_loc" + type: "Convolution" + bottom: "conv4_3_norm" + top: "conv4_3_norm_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 16 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv4_3_norm_mbox_loc_perm" + type: "Permute" + bottom: "conv4_3_norm_mbox_loc" + top: "conv4_3_norm_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv4_3_norm_mbox_loc_flat" + type: "Flatten" + bottom: "conv4_3_norm_mbox_loc_perm" + top: "conv4_3_norm_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv4_3_norm_mbox_conf" + type: "Convolution" + bottom: "conv4_3_norm" + top: "conv4_3_norm_mbox_conf" + param { + lr_mult: 1 + 
decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 8 # 84 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv4_3_norm_mbox_conf_perm" + type: "Permute" + bottom: "conv4_3_norm_mbox_conf" + top: "conv4_3_norm_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv4_3_norm_mbox_conf_flat" + type: "Flatten" + bottom: "conv4_3_norm_mbox_conf_perm" + top: "conv4_3_norm_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv4_3_norm_mbox_priorbox" + type: "PriorBox" + bottom: "conv4_3_norm" + bottom: "data" + top: "conv4_3_norm_mbox_priorbox" + prior_box_param { + min_size: 30.0 + max_size: 60.0 + aspect_ratio: 2 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 8 + offset: 0.5 + } +} +layer { + name: "fc7_mbox_loc" + type: "Convolution" + bottom: "fc7" + top: "fc7_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 24 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "fc7_mbox_loc_perm" + type: "Permute" + bottom: "fc7_mbox_loc" + top: "fc7_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "fc7_mbox_loc_flat" + type: "Flatten" + bottom: "fc7_mbox_loc_perm" + top: "fc7_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "fc7_mbox_conf" + type: "Convolution" + bottom: "fc7" + top: "fc7_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 12 # 126 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { 
+ name: "fc7_mbox_conf_perm" + type: "Permute" + bottom: "fc7_mbox_conf" + top: "fc7_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "fc7_mbox_conf_flat" + type: "Flatten" + bottom: "fc7_mbox_conf_perm" + top: "fc7_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "fc7_mbox_priorbox" + type: "PriorBox" + bottom: "fc7" + bottom: "data" + top: "fc7_mbox_priorbox" + prior_box_param { + min_size: 60.0 + max_size: 111.0 + aspect_ratio: 2 + aspect_ratio: 3 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 16 + offset: 0.5 + } +} +layer { + name: "conv6_2_mbox_loc" + type: "Convolution" + bottom: "conv6_2_h" + top: "conv6_2_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 24 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv6_2_mbox_loc_perm" + type: "Permute" + bottom: "conv6_2_mbox_loc" + top: "conv6_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv6_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv6_2_mbox_loc_perm" + top: "conv6_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv6_2_mbox_conf" + type: "Convolution" + bottom: "conv6_2_h" + top: "conv6_2_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 12 # 126 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv6_2_mbox_conf_perm" + type: "Permute" + bottom: "conv6_2_mbox_conf" + top: "conv6_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv6_2_mbox_conf_flat" + type: "Flatten" + bottom: 
"conv6_2_mbox_conf_perm" + top: "conv6_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv6_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv6_2_h" + bottom: "data" + top: "conv6_2_mbox_priorbox" + prior_box_param { + min_size: 111.0 + max_size: 162.0 + aspect_ratio: 2 + aspect_ratio: 3 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 32 + offset: 0.5 + } +} +layer { + name: "conv7_2_mbox_loc" + type: "Convolution" + bottom: "conv7_2_h" + top: "conv7_2_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 24 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv7_2_mbox_loc_perm" + type: "Permute" + bottom: "conv7_2_mbox_loc" + top: "conv7_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv7_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv7_2_mbox_loc_perm" + top: "conv7_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv7_2_mbox_conf" + type: "Convolution" + bottom: "conv7_2_h" + top: "conv7_2_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 12 # 126 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv7_2_mbox_conf_perm" + type: "Permute" + bottom: "conv7_2_mbox_conf" + top: "conv7_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv7_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv7_2_mbox_conf_perm" + top: "conv7_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv7_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv7_2_h" + bottom: "data" + top: 
"conv7_2_mbox_priorbox" + prior_box_param { + min_size: 162.0 + max_size: 213.0 + aspect_ratio: 2 + aspect_ratio: 3 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 64 + offset: 0.5 + } +} +layer { + name: "conv8_2_mbox_loc" + type: "Convolution" + bottom: "conv8_2_h" + top: "conv8_2_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 16 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv8_2_mbox_loc_perm" + type: "Permute" + bottom: "conv8_2_mbox_loc" + top: "conv8_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv8_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv8_2_mbox_loc_perm" + top: "conv8_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv8_2_mbox_conf" + type: "Convolution" + bottom: "conv8_2_h" + top: "conv8_2_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 8 # 84 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv8_2_mbox_conf_perm" + type: "Permute" + bottom: "conv8_2_mbox_conf" + top: "conv8_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv8_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv8_2_mbox_conf_perm" + top: "conv8_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv8_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv8_2_h" + bottom: "data" + top: "conv8_2_mbox_priorbox" + prior_box_param { + min_size: 213.0 + max_size: 264.0 + aspect_ratio: 2 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 100 + offset: 0.5 + } 
+} +layer { + name: "conv9_2_mbox_loc" + type: "Convolution" + bottom: "conv9_2_h" + top: "conv9_2_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 16 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv9_2_mbox_loc_perm" + type: "Permute" + bottom: "conv9_2_mbox_loc" + top: "conv9_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv9_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv9_2_mbox_loc_perm" + top: "conv9_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv9_2_mbox_conf" + type: "Convolution" + bottom: "conv9_2_h" + top: "conv9_2_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 8 # 84 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv9_2_mbox_conf_perm" + type: "Permute" + bottom: "conv9_2_mbox_conf" + top: "conv9_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv9_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv9_2_mbox_conf_perm" + top: "conv9_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv9_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv9_2_h" + bottom: "data" + top: "conv9_2_mbox_priorbox" + prior_box_param { + min_size: 264.0 + max_size: 315.0 + aspect_ratio: 2 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 300 + offset: 0.5 + } +} +layer { + name: "mbox_loc" + type: "Concat" + bottom: "conv4_3_norm_mbox_loc_flat" + bottom: "fc7_mbox_loc_flat" + bottom: "conv6_2_mbox_loc_flat" + bottom: "conv7_2_mbox_loc_flat" + bottom: "conv8_2_mbox_loc_flat" + bottom: 
"conv9_2_mbox_loc_flat" + top: "mbox_loc" + concat_param { + axis: 1 + } +} +layer { + name: "mbox_conf" + type: "Concat" + bottom: "conv4_3_norm_mbox_conf_flat" + bottom: "fc7_mbox_conf_flat" + bottom: "conv6_2_mbox_conf_flat" + bottom: "conv7_2_mbox_conf_flat" + bottom: "conv8_2_mbox_conf_flat" + bottom: "conv9_2_mbox_conf_flat" + top: "mbox_conf" + concat_param { + axis: 1 + } +} +layer { + name: "mbox_priorbox" + type: "Concat" + bottom: "conv4_3_norm_mbox_priorbox" + bottom: "fc7_mbox_priorbox" + bottom: "conv6_2_mbox_priorbox" + bottom: "conv7_2_mbox_priorbox" + bottom: "conv8_2_mbox_priorbox" + bottom: "conv9_2_mbox_priorbox" + top: "mbox_priorbox" + concat_param { + axis: 2 + } +} + +layer { + name: "mbox_conf_reshape" + type: "Reshape" + bottom: "mbox_conf" + top: "mbox_conf_reshape" + reshape_param { + shape { + dim: 0 + dim: -1 + dim: 2 + } + } +} +layer { + name: "mbox_conf_softmax" + type: "Softmax" + bottom: "mbox_conf_reshape" + top: "mbox_conf_softmax" + softmax_param { + axis: 2 + } +} +layer { + name: "mbox_conf_flatten" + type: "Flatten" + bottom: "mbox_conf_softmax" + top: "mbox_conf_flatten" + flatten_param { + axis: 1 + } +} + +layer { + name: "detection_out" + type: "DetectionOutput" + bottom: "mbox_loc" + bottom: "mbox_conf_flatten" + bottom: "mbox_priorbox" + top: "detection_out" + include { + phase: TEST + } + detection_output_param { + num_classes: 2 + share_location: true + background_label_id: 0 + nms_param { + nms_threshold: 0.45 + top_k: 400 + } + code_type: CENTER_SIZE + keep_top_k: 200 + confidence_threshold: 0.01 + } +} \ No newline at end of file diff --git a/samples/dnn/face_detector/how_to_train_face_detector.txt b/samples/dnn/face_detector/how_to_train_face_detector.txt new file mode 100644 index 0000000000..78789d7ed7 --- /dev/null +++ b/samples/dnn/face_detector/how_to_train_face_detector.txt @@ -0,0 +1,79 @@ +This is a brief description of training process which has been used to get res10_300x300_ssd_iter_140000.caffemodel. 
+The model was created with the SSD framework using a ResNet-10-like architecture as a backbone. The channel count in the ResNet-10 convolution layers was significantly reduced (2x or 4x fewer channels). +The model was trained in the Caffe framework on some huge and available online dataset. + +1. Prepare training tools +You need to use the "ssd" branch from this repository https://github.com/weiliu89/caffe/tree/ssd . Check out this branch and build it (see instructions in the repo's README) + +2. Prepare training data. +The data preparation pipeline can be represented as: + +(a)Download original face detection dataset -> (b)Convert annotation to the PASCAL VOC format -> (c)Create LMDB database with images + annotations for training + +a) Find some datasets with face bounding box annotations. For some reasons I can't provide links here, but you can easily find them on your own. Also study the data. It may contain small or low quality faces which can spoil the training process. Often there are special flags about object quality in the annotation. Remove such faces from the annotation (smaller than 16 along at least one side, or blurred, or highly-occluded, or something else). + +b) The downloaded dataset will have some format of annotation. It may be one single file for all images, or a separate file for each image, or something else. But to train SSD in Caffe you need to convert the annotation to the PASCAL VOC format. +A PASCAL VOC annotation consists of an .xml file for each image. In this xml file all face bounding boxes should be listed as: +<annotation> + <size> + <width>300</width> + <height>300</height> + </size> + <object> + <name>face</name> + <difficult>0</difficult> + <bndbox> + <xmin>100</xmin> + <ymin>100</ymin> + <xmax>200</xmax> + <ymax>200</ymax> + </bndbox> + </object> + <object> + <name>face</name> + <difficult>0</difficult> + <bndbox> + <xmin>0</xmin> + <ymin>0</ymin> + <xmax>100</xmax> + <ymax>100</ymax> + </bndbox> + </object> +</annotation> + +So, convert your dataset's annotation to the format above. +Also, you should create a labelmap.prototxt file with the following content: +item { + name: "none_of_the_above" + label: 0 + display_name: "background" +} +item { + name: "face" + label: 1 + display_name: "face" +} + +You need this file to establish the correspondence between the name of a class and the numeric label of that class. 
+ +For the next step we also need a file where all our image-annotation file name pairs are listed. This file should contain lines similar to: +images_val/0.jpg annotations_val/0.jpg.xml + +c) To create LMDB you need to use the create_data.sh tool from the caffe/data/VOC0712 directory of Caffe's source code. +This script calls create_annoset.py inside, so check what you need to pass as the script's arguments + +You need to prepare 2 LMDB databases: one for training images, one for validation images. + +3. Train your detector +For training you need to have 3 files: train.prototxt, test.prototxt and solver.prototxt. You can find these files in the same directory as this readme. +Also you need to edit train.prototxt and test.prototxt to replace the paths to the LMDB databases with the actual databases you've created in step 2. + +Now everything is ready to launch the training process. +Execute the following lines in a terminal: +mkdir -p snapshot +mkdir -p log +/path_for_caffe_build_dir/tools/caffe train -solver="solver.prototxt" -gpu 0 2>&1 | tee -a log/log.log + +And wait. It will take about 8 hours to finish the process. +After that you can use your .caffemodel from the snapshot/ subdirectory in the resnet_ssd_face_python.py sample. 
\ No newline at end of file diff --git a/samples/dnn/face_detector/solver.prototxt b/samples/dnn/face_detector/solver.prototxt new file mode 100644 index 0000000000..04bbc2ab78 --- /dev/null +++ b/samples/dnn/face_detector/solver.prototxt @@ -0,0 +1,28 @@ +train_net: "train.prototxt" +test_net: "test.prototxt" + +test_iter: 2312 +test_interval: 5000 +test_initialization: true + +base_lr: 0.01 +display: 10 +lr_policy: "multistep" +max_iter: 140000 +stepvalue: 80000 +stepvalue: 120000 +gamma: 0.1 +momentum: 0.9 +weight_decay: 0.0005 +average_loss: 500 +iter_size: 1 +type: "SGD" + +solver_mode: GPU +random_seed: 0 +debug_info: false +snapshot: 1000 +snapshot_prefix: "snapshot/res10_300x300_ssd" + +eval_type: "detection" +ap_version: "11point" \ No newline at end of file diff --git a/samples/dnn/face_detector/test.prototxt b/samples/dnn/face_detector/test.prototxt new file mode 100644 index 0000000000..fba593d87d --- /dev/null +++ b/samples/dnn/face_detector/test.prototxt @@ -0,0 +1,1830 @@ +layer { + name: "data" + type: "AnnotatedData" + top: "data" + top: "label" + include { + phase: TEST + } + transform_param { + mean_value: 104 + mean_value: 117 + mean_value: 123 + resize_param { + prob: 1 + resize_mode: WARP + height: 300 + width: 300 + interp_mode: LINEAR + } + emit_constraint { + emit_type: CENTER + } + } + data_param { + source: "val_lmdb/" + batch_size: 1 + backend: LMDB + } + annotated_data_param { + label_map_file: "labelmap.prototxt" + } +} + +layer { + name: "data_bn" + type: "BatchNorm" + bottom: "data" + top: "data_bn" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "data_scale" + type: "Scale" + bottom: "data_bn" + top: "data_bn" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "conv1_h" + type: "Convolution" + bottom: "data_bn" + top: "conv1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + 
param { + lr_mult: 2.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 32 + pad: 3 + kernel_size: 7 + stride: 2 + weight_filler { + type: "msra" + variance_norm: FAN_OUT + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv1_bn_h" + type: "BatchNorm" + bottom: "conv1_h" + top: "conv1_h" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "conv1_scale_h" + type: "Scale" + bottom: "conv1_h" + top: "conv1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "conv1_relu" + type: "ReLU" + bottom: "conv1_h" + top: "conv1_h" +} +layer { + name: "conv1_pool" + type: "Pooling" + bottom: "conv1_h" + top: "conv1_pool" + pooling_param { + kernel_size: 3 + stride: 2 + } +} +layer { + name: "layer_64_1_conv1_h" + type: "Convolution" + bottom: "conv1_pool" + top: "layer_64_1_conv1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 32 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_64_1_bn2_h" + type: "BatchNorm" + bottom: "layer_64_1_conv1_h" + top: "layer_64_1_conv1_h" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "layer_64_1_scale2_h" + type: "Scale" + bottom: "layer_64_1_conv1_h" + top: "layer_64_1_conv1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "layer_64_1_relu2" + type: "ReLU" + bottom: "layer_64_1_conv1_h" + top: "layer_64_1_conv1_h" +} +layer { + name: "layer_64_1_conv2_h" + type: "Convolution" + bottom: "layer_64_1_conv1_h" + top: "layer_64_1_conv2_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 32 + 
bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_64_1_sum" + type: "Eltwise" + bottom: "layer_64_1_conv2_h" + bottom: "conv1_pool" + top: "layer_64_1_sum" +} +layer { + name: "layer_128_1_bn1_h" + type: "BatchNorm" + bottom: "layer_64_1_sum" + top: "layer_128_1_bn1_h" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "layer_128_1_scale1_h" + type: "Scale" + bottom: "layer_128_1_bn1_h" + top: "layer_128_1_bn1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "layer_128_1_relu1" + type: "ReLU" + bottom: "layer_128_1_bn1_h" + top: "layer_128_1_bn1_h" +} +layer { + name: "layer_128_1_conv1_h" + type: "Convolution" + bottom: "layer_128_1_bn1_h" + top: "layer_128_1_conv1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 128 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_128_1_bn2" + type: "BatchNorm" + bottom: "layer_128_1_conv1_h" + top: "layer_128_1_conv1_h" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "layer_128_1_scale2" + type: "Scale" + bottom: "layer_128_1_conv1_h" + top: "layer_128_1_conv1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "layer_128_1_relu2" + type: "ReLU" + bottom: "layer_128_1_conv1_h" + top: "layer_128_1_conv1_h" +} +layer { + name: "layer_128_1_conv2" + type: "Convolution" + bottom: "layer_128_1_conv1_h" + top: "layer_128_1_conv2" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 128 + bias_term: 
false + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_128_1_conv_expand_h" + type: "Convolution" + bottom: "layer_128_1_bn1_h" + top: "layer_128_1_conv_expand_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 128 + bias_term: false + pad: 0 + kernel_size: 1 + stride: 2 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_128_1_sum" + type: "Eltwise" + bottom: "layer_128_1_conv2" + bottom: "layer_128_1_conv_expand_h" + top: "layer_128_1_sum" +} +layer { + name: "layer_256_1_bn1" + type: "BatchNorm" + bottom: "layer_128_1_sum" + top: "layer_256_1_bn1" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "layer_256_1_scale1" + type: "Scale" + bottom: "layer_256_1_bn1" + top: "layer_256_1_bn1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "layer_256_1_relu1" + type: "ReLU" + bottom: "layer_256_1_bn1" + top: "layer_256_1_bn1" +} +layer { + name: "layer_256_1_conv1" + type: "Convolution" + bottom: "layer_256_1_bn1" + top: "layer_256_1_conv1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 256 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_256_1_bn2" + type: "BatchNorm" + bottom: "layer_256_1_conv1" + top: "layer_256_1_conv1" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "layer_256_1_scale2" + type: "Scale" + bottom: "layer_256_1_conv1" + top: "layer_256_1_conv1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: 
true + } +} +layer { + name: "layer_256_1_relu2" + type: "ReLU" + bottom: "layer_256_1_conv1" + top: "layer_256_1_conv1" +} +layer { + name: "layer_256_1_conv2" + type: "Convolution" + bottom: "layer_256_1_conv1" + top: "layer_256_1_conv2" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 256 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_256_1_conv_expand" + type: "Convolution" + bottom: "layer_256_1_bn1" + top: "layer_256_1_conv_expand" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 256 + bias_term: false + pad: 0 + kernel_size: 1 + stride: 2 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_256_1_sum" + type: "Eltwise" + bottom: "layer_256_1_conv2" + bottom: "layer_256_1_conv_expand" + top: "layer_256_1_sum" +} +layer { + name: "layer_512_1_bn1" + type: "BatchNorm" + bottom: "layer_256_1_sum" + top: "layer_512_1_bn1" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "layer_512_1_scale1" + type: "Scale" + bottom: "layer_512_1_bn1" + top: "layer_512_1_bn1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "layer_512_1_relu1" + type: "ReLU" + bottom: "layer_512_1_bn1" + top: "layer_512_1_bn1" +} +layer { + name: "layer_512_1_conv1_h" + type: "Convolution" + bottom: "layer_512_1_bn1" + top: "layer_512_1_conv1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 128 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 # 2 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_512_1_bn2_h" + type: "BatchNorm" + bottom: "layer_512_1_conv1_h" + top: 
"layer_512_1_conv1_h" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "layer_512_1_scale2_h" + type: "Scale" + bottom: "layer_512_1_conv1_h" + top: "layer_512_1_conv1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "layer_512_1_relu2" + type: "ReLU" + bottom: "layer_512_1_conv1_h" + top: "layer_512_1_conv1_h" +} +layer { + name: "layer_512_1_conv2_h" + type: "Convolution" + bottom: "layer_512_1_conv1_h" + top: "layer_512_1_conv2_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 256 + bias_term: false + pad: 2 # 1 + kernel_size: 3 + stride: 1 + dilation: 2 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_512_1_conv_expand_h" + type: "Convolution" + bottom: "layer_512_1_bn1" + top: "layer_512_1_conv_expand_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 256 + bias_term: false + pad: 0 + kernel_size: 1 + stride: 1 # 2 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_512_1_sum" + type: "Eltwise" + bottom: "layer_512_1_conv2_h" + bottom: "layer_512_1_conv_expand_h" + top: "layer_512_1_sum" +} +layer { + name: "last_bn_h" + type: "BatchNorm" + bottom: "layer_512_1_sum" + top: "layer_512_1_sum" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "last_scale_h" + type: "Scale" + bottom: "layer_512_1_sum" + top: "layer_512_1_sum" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "last_relu" + type: "ReLU" + bottom: "layer_512_1_sum" + top: "fc7" +} + +layer { + name: "conv6_1_h" + type: "Convolution" + bottom: "fc7" + top: "conv6_1_h" + param { 
+ lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv6_1_relu" + type: "ReLU" + bottom: "conv6_1_h" + top: "conv6_1_h" +} +layer { + name: "conv6_2_h" + type: "Convolution" + bottom: "conv6_1_h" + top: "conv6_2_h" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv6_2_relu" + type: "ReLU" + bottom: "conv6_2_h" + top: "conv6_2_h" +} +layer { + name: "conv7_1_h" + type: "Convolution" + bottom: "conv6_2_h" + top: "conv7_1_h" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv7_1_relu" + type: "ReLU" + bottom: "conv7_1_h" + top: "conv7_1_h" +} +layer { + name: "conv7_2_h" + type: "Convolution" + bottom: "conv7_1_h" + top: "conv7_2_h" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv7_2_relu" + type: "ReLU" + bottom: "conv7_2_h" + top: "conv7_2_h" +} +layer { + name: "conv8_1_h" + type: "Convolution" + bottom: "conv7_2_h" + top: "conv8_1_h" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + 
value: 0 + } + } +} +layer { + name: "conv8_1_relu" + type: "ReLU" + bottom: "conv8_1_h" + top: "conv8_1_h" +} +layer { + name: "conv8_2_h" + type: "Convolution" + bottom: "conv8_1_h" + top: "conv8_2_h" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv8_2_relu" + type: "ReLU" + bottom: "conv8_2_h" + top: "conv8_2_h" +} +layer { + name: "conv9_1_h" + type: "Convolution" + bottom: "conv8_2_h" + top: "conv9_1_h" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv9_1_relu" + type: "ReLU" + bottom: "conv9_1_h" + top: "conv9_1_h" +} +layer { + name: "conv9_2_h" + type: "Convolution" + bottom: "conv9_1_h" + top: "conv9_2_h" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv9_2_relu" + type: "ReLU" + bottom: "conv9_2_h" + top: "conv9_2_h" +} +layer { + name: "conv4_3_norm" + type: "Normalize" + bottom: "layer_256_1_bn1" + top: "conv4_3_norm" + norm_param { + across_spatial: false + scale_filler { + type: "constant" + value: 20 + } + channel_shared: false + } +} +layer { + name: "conv4_3_norm_mbox_loc" + type: "Convolution" + bottom: "conv4_3_norm" + top: "conv4_3_norm_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 16 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { 
+ type: "constant" + value: 0 + } + } +} +layer { + name: "conv4_3_norm_mbox_loc_perm" + type: "Permute" + bottom: "conv4_3_norm_mbox_loc" + top: "conv4_3_norm_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv4_3_norm_mbox_loc_flat" + type: "Flatten" + bottom: "conv4_3_norm_mbox_loc_perm" + top: "conv4_3_norm_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv4_3_norm_mbox_conf" + type: "Convolution" + bottom: "conv4_3_norm" + top: "conv4_3_norm_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 8 # 84 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv4_3_norm_mbox_conf_perm" + type: "Permute" + bottom: "conv4_3_norm_mbox_conf" + top: "conv4_3_norm_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv4_3_norm_mbox_conf_flat" + type: "Flatten" + bottom: "conv4_3_norm_mbox_conf_perm" + top: "conv4_3_norm_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv4_3_norm_mbox_priorbox" + type: "PriorBox" + bottom: "conv4_3_norm" + bottom: "data" + top: "conv4_3_norm_mbox_priorbox" + prior_box_param { + min_size: 30.0 + max_size: 60.0 + aspect_ratio: 2 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 8 + offset: 0.5 + } +} +layer { + name: "fc7_mbox_loc" + type: "Convolution" + bottom: "fc7" + top: "fc7_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 24 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "fc7_mbox_loc_perm" + type: "Permute" + bottom: "fc7_mbox_loc" + top: "fc7_mbox_loc_perm" + permute_param { + 
order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "fc7_mbox_loc_flat" + type: "Flatten" + bottom: "fc7_mbox_loc_perm" + top: "fc7_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "fc7_mbox_conf" + type: "Convolution" + bottom: "fc7" + top: "fc7_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 12 # 126 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "fc7_mbox_conf_perm" + type: "Permute" + bottom: "fc7_mbox_conf" + top: "fc7_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "fc7_mbox_conf_flat" + type: "Flatten" + bottom: "fc7_mbox_conf_perm" + top: "fc7_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "fc7_mbox_priorbox" + type: "PriorBox" + bottom: "fc7" + bottom: "data" + top: "fc7_mbox_priorbox" + prior_box_param { + min_size: 60.0 + max_size: 111.0 + aspect_ratio: 2 + aspect_ratio: 3 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 16 + offset: 0.5 + } +} +layer { + name: "conv6_2_mbox_loc" + type: "Convolution" + bottom: "conv6_2_h" + top: "conv6_2_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 24 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv6_2_mbox_loc_perm" + type: "Permute" + bottom: "conv6_2_mbox_loc" + top: "conv6_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv6_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv6_2_mbox_loc_perm" + top: "conv6_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv6_2_mbox_conf" + type: "Convolution" + bottom: 
"conv6_2_h" + top: "conv6_2_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 12 # 126 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv6_2_mbox_conf_perm" + type: "Permute" + bottom: "conv6_2_mbox_conf" + top: "conv6_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv6_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv6_2_mbox_conf_perm" + top: "conv6_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv6_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv6_2_h" + bottom: "data" + top: "conv6_2_mbox_priorbox" + prior_box_param { + min_size: 111.0 + max_size: 162.0 + aspect_ratio: 2 + aspect_ratio: 3 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 32 + offset: 0.5 + } +} +layer { + name: "conv7_2_mbox_loc" + type: "Convolution" + bottom: "conv7_2_h" + top: "conv7_2_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 24 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv7_2_mbox_loc_perm" + type: "Permute" + bottom: "conv7_2_mbox_loc" + top: "conv7_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv7_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv7_2_mbox_loc_perm" + top: "conv7_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv7_2_mbox_conf" + type: "Convolution" + bottom: "conv7_2_h" + top: "conv7_2_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 12 # 126 + pad: 1 + kernel_size: 3 + stride: 1 + 
weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv7_2_mbox_conf_perm" + type: "Permute" + bottom: "conv7_2_mbox_conf" + top: "conv7_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv7_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv7_2_mbox_conf_perm" + top: "conv7_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv7_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv7_2_h" + bottom: "data" + top: "conv7_2_mbox_priorbox" + prior_box_param { + min_size: 162.0 + max_size: 213.0 + aspect_ratio: 2 + aspect_ratio: 3 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 64 + offset: 0.5 + } +} +layer { + name: "conv8_2_mbox_loc" + type: "Convolution" + bottom: "conv8_2_h" + top: "conv8_2_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 16 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv8_2_mbox_loc_perm" + type: "Permute" + bottom: "conv8_2_mbox_loc" + top: "conv8_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv8_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv8_2_mbox_loc_perm" + top: "conv8_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv8_2_mbox_conf" + type: "Convolution" + bottom: "conv8_2_h" + top: "conv8_2_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 8 # 84 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv8_2_mbox_conf_perm" + type: "Permute" + bottom: "conv8_2_mbox_conf" + top: "conv8_2_mbox_conf_perm" + 
permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv8_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv8_2_mbox_conf_perm" + top: "conv8_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv8_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv8_2_h" + bottom: "data" + top: "conv8_2_mbox_priorbox" + prior_box_param { + min_size: 213.0 + max_size: 264.0 + aspect_ratio: 2 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 100 + offset: 0.5 + } +} +layer { + name: "conv9_2_mbox_loc" + type: "Convolution" + bottom: "conv9_2_h" + top: "conv9_2_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 16 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv9_2_mbox_loc_perm" + type: "Permute" + bottom: "conv9_2_mbox_loc" + top: "conv9_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv9_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv9_2_mbox_loc_perm" + top: "conv9_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv9_2_mbox_conf" + type: "Convolution" + bottom: "conv9_2_h" + top: "conv9_2_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 8 # 84 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv9_2_mbox_conf_perm" + type: "Permute" + bottom: "conv9_2_mbox_conf" + top: "conv9_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv9_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv9_2_mbox_conf_perm" + top: "conv9_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer 
{ + name: "conv9_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv9_2_h" + bottom: "data" + top: "conv9_2_mbox_priorbox" + prior_box_param { + min_size: 264.0 + max_size: 315.0 + aspect_ratio: 2 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 300 + offset: 0.5 + } +} +layer { + name: "mbox_loc" + type: "Concat" + bottom: "conv4_3_norm_mbox_loc_flat" + bottom: "fc7_mbox_loc_flat" + bottom: "conv6_2_mbox_loc_flat" + bottom: "conv7_2_mbox_loc_flat" + bottom: "conv8_2_mbox_loc_flat" + bottom: "conv9_2_mbox_loc_flat" + top: "mbox_loc" + concat_param { + axis: 1 + } +} +layer { + name: "mbox_conf" + type: "Concat" + bottom: "conv4_3_norm_mbox_conf_flat" + bottom: "fc7_mbox_conf_flat" + bottom: "conv6_2_mbox_conf_flat" + bottom: "conv7_2_mbox_conf_flat" + bottom: "conv8_2_mbox_conf_flat" + bottom: "conv9_2_mbox_conf_flat" + top: "mbox_conf" + concat_param { + axis: 1 + } +} +layer { + name: "mbox_priorbox" + type: "Concat" + bottom: "conv4_3_norm_mbox_priorbox" + bottom: "fc7_mbox_priorbox" + bottom: "conv6_2_mbox_priorbox" + bottom: "conv7_2_mbox_priorbox" + bottom: "conv8_2_mbox_priorbox" + bottom: "conv9_2_mbox_priorbox" + top: "mbox_priorbox" + concat_param { + axis: 2 + } +} + +layer { + name: "mbox_conf_reshape" + type: "Reshape" + bottom: "mbox_conf" + top: "mbox_conf_reshape" + reshape_param { + shape { + dim: 0 + dim: -1 + dim: 2 + } + } +} +layer { + name: "mbox_conf_softmax" + type: "Softmax" + bottom: "mbox_conf_reshape" + top: "mbox_conf_softmax" + softmax_param { + axis: 2 + } +} +layer { + name: "mbox_conf_flatten" + type: "Flatten" + bottom: "mbox_conf_softmax" + top: "mbox_conf_flatten" + flatten_param { + axis: 1 + } +} + +layer { + name: "detection_out" + type: "DetectionOutput" + bottom: "mbox_loc" + bottom: "mbox_conf_flatten" + bottom: "mbox_priorbox" + top: "detection_out" + include { + phase: TEST + } + detection_output_param { + num_classes: 2 + share_location: true + background_label_id: 0 + 
nms_param { + nms_threshold: 0.45 + top_k: 400 + } + code_type: CENTER_SIZE + keep_top_k: 200 + confidence_threshold: 0.01 + } +} +layer { + name: "detection_eval" + type: "DetectionEvaluate" + bottom: "detection_out" + bottom: "label" + top: "detection_eval" + include { + phase: TEST + } + detection_evaluate_param { + num_classes: 2 + background_label_id: 0 + overlap_threshold: 0.5 + evaluate_difficult_gt: false + } +} \ No newline at end of file diff --git a/samples/dnn/face_detector/train.prototxt b/samples/dnn/face_detector/train.prototxt new file mode 100644 index 0000000000..78bc509ff7 --- /dev/null +++ b/samples/dnn/face_detector/train.prototxt @@ -0,0 +1,1898 @@ +layer { + name: "data" + type: "AnnotatedData" + top: "data" + top: "label" + include { + phase: TRAIN + } + transform_param { + mirror: true + mean_value: 104 + mean_value: 117 + mean_value: 123 + resize_param { + prob: 1 + resize_mode: WARP + height: 300 + width: 300 + interp_mode: LINEAR + interp_mode: AREA + interp_mode: NEAREST + interp_mode: CUBIC + interp_mode: LANCZOS4 + } + emit_constraint { + emit_type: CENTER + } + distort_param { + brightness_prob: 0.5 + brightness_delta: 32 + contrast_prob: 0.5 + contrast_lower: 0.5 + contrast_upper: 1.5 + hue_prob: 0.5 + hue_delta: 18 + saturation_prob: 0.5 + saturation_lower: 0.5 + saturation_upper: 1.5 + random_order_prob: 0.0 + } + expand_param { + prob: 0.5 + max_expand_ratio: 4.0 + } + } + data_param { + source: "train_lmdb/" + batch_size: 16 + backend: LMDB + } + annotated_data_param { + batch_sampler { + max_sample: 1 + max_trials: 1 + } + batch_sampler { + sampler { + min_scale: 0.3 + max_scale: 1.0 + min_aspect_ratio: 0.5 + max_aspect_ratio: 2.0 + } + sample_constraint { + min_jaccard_overlap: 0.1 + } + max_sample: 1 + max_trials: 50 + } + batch_sampler { + sampler { + min_scale: 0.3 + max_scale: 1.0 + min_aspect_ratio: 0.5 + max_aspect_ratio: 2.0 + } + sample_constraint { + min_jaccard_overlap: 0.3 + } + max_sample: 1 + max_trials: 50 + } + 
batch_sampler { + sampler { + min_scale: 0.3 + max_scale: 1.0 + min_aspect_ratio: 0.5 + max_aspect_ratio: 2.0 + } + sample_constraint { + min_jaccard_overlap: 0.5 + } + max_sample: 1 + max_trials: 50 + } + batch_sampler { + sampler { + min_scale: 0.3 + max_scale: 1.0 + min_aspect_ratio: 0.5 + max_aspect_ratio: 2.0 + } + sample_constraint { + min_jaccard_overlap: 0.7 + } + max_sample: 1 + max_trials: 50 + } + batch_sampler { + sampler { + min_scale: 0.3 + max_scale: 1.0 + min_aspect_ratio: 0.5 + max_aspect_ratio: 2.0 + } + sample_constraint { + min_jaccard_overlap: 0.9 + } + max_sample: 1 + max_trials: 50 + } + batch_sampler { + sampler { + min_scale: 0.3 + max_scale: 1.0 + min_aspect_ratio: 0.5 + max_aspect_ratio: 2.0 + } + sample_constraint { + max_jaccard_overlap: 1.0 + } + max_sample: 1 + max_trials: 50 + } + } +} + +layer { + name: "data_bn" + type: "BatchNorm" + bottom: "data" + top: "data_bn" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "data_scale" + type: "Scale" + bottom: "data_bn" + top: "data_bn" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "conv1_h" + type: "Convolution" + bottom: "data_bn" + top: "conv1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 32 + pad: 3 + kernel_size: 7 + stride: 2 + weight_filler { + type: "msra" + variance_norm: FAN_OUT + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "conv1_bn_h" + type: "BatchNorm" + bottom: "conv1_h" + top: "conv1_h" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "conv1_scale_h" + type: "Scale" + bottom: "conv1_h" + top: "conv1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} 
+layer { + name: "conv1_relu" + type: "ReLU" + bottom: "conv1_h" + top: "conv1_h" +} +layer { + name: "conv1_pool" + type: "Pooling" + bottom: "conv1_h" + top: "conv1_pool" + pooling_param { + kernel_size: 3 + stride: 2 + } +} +layer { + name: "layer_64_1_conv1_h" + type: "Convolution" + bottom: "conv1_pool" + top: "layer_64_1_conv1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 32 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_64_1_bn2_h" + type: "BatchNorm" + bottom: "layer_64_1_conv1_h" + top: "layer_64_1_conv1_h" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "layer_64_1_scale2_h" + type: "Scale" + bottom: "layer_64_1_conv1_h" + top: "layer_64_1_conv1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "layer_64_1_relu2" + type: "ReLU" + bottom: "layer_64_1_conv1_h" + top: "layer_64_1_conv1_h" +} +layer { + name: "layer_64_1_conv2_h" + type: "Convolution" + bottom: "layer_64_1_conv1_h" + top: "layer_64_1_conv2_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 32 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_64_1_sum" + type: "Eltwise" + bottom: "layer_64_1_conv2_h" + bottom: "conv1_pool" + top: "layer_64_1_sum" +} +layer { + name: "layer_128_1_bn1_h" + type: "BatchNorm" + bottom: "layer_64_1_sum" + top: "layer_128_1_bn1_h" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "layer_128_1_scale1_h" + type: "Scale" + bottom: "layer_128_1_bn1_h" + top: "layer_128_1_bn1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + 
param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "layer_128_1_relu1" + type: "ReLU" + bottom: "layer_128_1_bn1_h" + top: "layer_128_1_bn1_h" +} +layer { + name: "layer_128_1_conv1_h" + type: "Convolution" + bottom: "layer_128_1_bn1_h" + top: "layer_128_1_conv1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 128 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_128_1_bn2" + type: "BatchNorm" + bottom: "layer_128_1_conv1_h" + top: "layer_128_1_conv1_h" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "layer_128_1_scale2" + type: "Scale" + bottom: "layer_128_1_conv1_h" + top: "layer_128_1_conv1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "layer_128_1_relu2" + type: "ReLU" + bottom: "layer_128_1_conv1_h" + top: "layer_128_1_conv1_h" +} +layer { + name: "layer_128_1_conv2" + type: "Convolution" + bottom: "layer_128_1_conv1_h" + top: "layer_128_1_conv2" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 128 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_128_1_conv_expand_h" + type: "Convolution" + bottom: "layer_128_1_bn1_h" + top: "layer_128_1_conv_expand_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 128 + bias_term: false + pad: 0 + kernel_size: 1 + stride: 2 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_128_1_sum" + type: "Eltwise" + bottom: "layer_128_1_conv2" + bottom: "layer_128_1_conv_expand_h" + top: 
"layer_128_1_sum" +} +layer { + name: "layer_256_1_bn1" + type: "BatchNorm" + bottom: "layer_128_1_sum" + top: "layer_256_1_bn1" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "layer_256_1_scale1" + type: "Scale" + bottom: "layer_256_1_bn1" + top: "layer_256_1_bn1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "layer_256_1_relu1" + type: "ReLU" + bottom: "layer_256_1_bn1" + top: "layer_256_1_bn1" +} +layer { + name: "layer_256_1_conv1" + type: "Convolution" + bottom: "layer_256_1_bn1" + top: "layer_256_1_conv1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 256 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_256_1_bn2" + type: "BatchNorm" + bottom: "layer_256_1_conv1" + top: "layer_256_1_conv1" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "layer_256_1_scale2" + type: "Scale" + bottom: "layer_256_1_conv1" + top: "layer_256_1_conv1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "layer_256_1_relu2" + type: "ReLU" + bottom: "layer_256_1_conv1" + top: "layer_256_1_conv1" +} +layer { + name: "layer_256_1_conv2" + type: "Convolution" + bottom: "layer_256_1_conv1" + top: "layer_256_1_conv2" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 256 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_256_1_conv_expand" + type: "Convolution" + bottom: "layer_256_1_bn1" + top: "layer_256_1_conv_expand" + param { + lr_mult: 1.0 + 
decay_mult: 1.0 + } + convolution_param { + num_output: 256 + bias_term: false + pad: 0 + kernel_size: 1 + stride: 2 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_256_1_sum" + type: "Eltwise" + bottom: "layer_256_1_conv2" + bottom: "layer_256_1_conv_expand" + top: "layer_256_1_sum" +} +layer { + name: "layer_512_1_bn1" + type: "BatchNorm" + bottom: "layer_256_1_sum" + top: "layer_512_1_bn1" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "layer_512_1_scale1" + type: "Scale" + bottom: "layer_512_1_bn1" + top: "layer_512_1_bn1" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "layer_512_1_relu1" + type: "ReLU" + bottom: "layer_512_1_bn1" + top: "layer_512_1_bn1" +} +layer { + name: "layer_512_1_conv1_h" + type: "Convolution" + bottom: "layer_512_1_bn1" + top: "layer_512_1_conv1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 128 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 # 2 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_512_1_bn2_h" + type: "BatchNorm" + bottom: "layer_512_1_conv1_h" + top: "layer_512_1_conv1_h" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "layer_512_1_scale2_h" + type: "Scale" + bottom: "layer_512_1_conv1_h" + top: "layer_512_1_conv1_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "layer_512_1_relu2" + type: "ReLU" + bottom: "layer_512_1_conv1_h" + top: "layer_512_1_conv1_h" +} +layer { + name: "layer_512_1_conv2_h" + type: "Convolution" + bottom: "layer_512_1_conv1_h" + top: "layer_512_1_conv2_h" + param { + lr_mult: 1.0 + 
decay_mult: 1.0 + } + convolution_param { + num_output: 256 + bias_term: false + pad: 2 # 1 + kernel_size: 3 + stride: 1 + dilation: 2 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_512_1_conv_expand_h" + type: "Convolution" + bottom: "layer_512_1_bn1" + top: "layer_512_1_conv_expand_h" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + convolution_param { + num_output: 256 + bias_term: false + pad: 0 + kernel_size: 1 + stride: 1 # 2 + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0.0 + } + } +} +layer { + name: "layer_512_1_sum" + type: "Eltwise" + bottom: "layer_512_1_conv2_h" + bottom: "layer_512_1_conv_expand_h" + top: "layer_512_1_sum" +} +layer { + name: "last_bn_h" + type: "BatchNorm" + bottom: "layer_512_1_sum" + top: "layer_512_1_sum" + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } + param { + lr_mult: 0.0 + } +} +layer { + name: "last_scale_h" + type: "Scale" + bottom: "layer_512_1_sum" + top: "layer_512_1_sum" + param { + lr_mult: 1.0 + decay_mult: 1.0 + } + param { + lr_mult: 2.0 + decay_mult: 1.0 + } + scale_param { + bias_term: true + } +} +layer { + name: "last_relu" + type: "ReLU" + bottom: "layer_512_1_sum" + top: "fc7" +} + +layer { + name: "conv6_1_h" + type: "Convolution" + bottom: "fc7" + top: "conv6_1_h" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv6_1_relu" + type: "ReLU" + bottom: "conv6_1_h" + top: "conv6_1_h" +} +layer { + name: "conv6_2_h" + type: "Convolution" + bottom: "conv6_1_h" + top: "conv6_2_h" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: 
"xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv6_2_relu" + type: "ReLU" + bottom: "conv6_2_h" + top: "conv6_2_h" +} +layer { + name: "conv7_1_h" + type: "Convolution" + bottom: "conv6_2_h" + top: "conv7_1_h" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv7_1_relu" + type: "ReLU" + bottom: "conv7_1_h" + top: "conv7_1_h" +} +layer { + name: "conv7_2_h" + type: "Convolution" + bottom: "conv7_1_h" + top: "conv7_2_h" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv7_2_relu" + type: "ReLU" + bottom: "conv7_2_h" + top: "conv7_2_h" +} +layer { + name: "conv8_1_h" + type: "Convolution" + bottom: "conv7_2_h" + top: "conv8_1_h" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv8_1_relu" + type: "ReLU" + bottom: "conv8_1_h" + top: "conv8_1_h" +} +layer { + name: "conv8_2_h" + type: "Convolution" + bottom: "conv8_1_h" + top: "conv8_2_h" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv8_2_relu" + type: "ReLU" + bottom: "conv8_2_h" + top: "conv8_2_h" +} +layer { + name: "conv9_1_h" + type: "Convolution" + bottom: 
"conv8_2_h" + top: "conv9_1_h" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv9_1_relu" + type: "ReLU" + bottom: "conv9_1_h" + top: "conv9_1_h" +} +layer { + name: "conv9_2_h" + type: "Convolution" + bottom: "conv9_1_h" + top: "conv9_2_h" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv9_2_relu" + type: "ReLU" + bottom: "conv9_2_h" + top: "conv9_2_h" +} +layer { + name: "conv4_3_norm" + type: "Normalize" + bottom: "layer_256_1_bn1" + top: "conv4_3_norm" + norm_param { + across_spatial: false + scale_filler { + type: "constant" + value: 20 + } + channel_shared: false + } +} +layer { + name: "conv4_3_norm_mbox_loc" + type: "Convolution" + bottom: "conv4_3_norm" + top: "conv4_3_norm_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 16 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv4_3_norm_mbox_loc_perm" + type: "Permute" + bottom: "conv4_3_norm_mbox_loc" + top: "conv4_3_norm_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv4_3_norm_mbox_loc_flat" + type: "Flatten" + bottom: "conv4_3_norm_mbox_loc_perm" + top: "conv4_3_norm_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv4_3_norm_mbox_conf" + type: "Convolution" + bottom: "conv4_3_norm" + top: "conv4_3_norm_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + 
decay_mult: 0 + } + convolution_param { + num_output: 8 # 84 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv4_3_norm_mbox_conf_perm" + type: "Permute" + bottom: "conv4_3_norm_mbox_conf" + top: "conv4_3_norm_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv4_3_norm_mbox_conf_flat" + type: "Flatten" + bottom: "conv4_3_norm_mbox_conf_perm" + top: "conv4_3_norm_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv4_3_norm_mbox_priorbox" + type: "PriorBox" + bottom: "conv4_3_norm" + bottom: "data" + top: "conv4_3_norm_mbox_priorbox" + prior_box_param { + min_size: 30.0 + max_size: 60.0 + aspect_ratio: 2 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 8 + offset: 0.5 + } +} +layer { + name: "fc7_mbox_loc" + type: "Convolution" + bottom: "fc7" + top: "fc7_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 24 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "fc7_mbox_loc_perm" + type: "Permute" + bottom: "fc7_mbox_loc" + top: "fc7_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "fc7_mbox_loc_flat" + type: "Flatten" + bottom: "fc7_mbox_loc_perm" + top: "fc7_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "fc7_mbox_conf" + type: "Convolution" + bottom: "fc7" + top: "fc7_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 12 # 126 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "fc7_mbox_conf_perm" + type: 
"Permute" + bottom: "fc7_mbox_conf" + top: "fc7_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "fc7_mbox_conf_flat" + type: "Flatten" + bottom: "fc7_mbox_conf_perm" + top: "fc7_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "fc7_mbox_priorbox" + type: "PriorBox" + bottom: "fc7" + bottom: "data" + top: "fc7_mbox_priorbox" + prior_box_param { + min_size: 60.0 + max_size: 111.0 + aspect_ratio: 2 + aspect_ratio: 3 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 16 + offset: 0.5 + } +} +layer { + name: "conv6_2_mbox_loc" + type: "Convolution" + bottom: "conv6_2_h" + top: "conv6_2_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 24 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv6_2_mbox_loc_perm" + type: "Permute" + bottom: "conv6_2_mbox_loc" + top: "conv6_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv6_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv6_2_mbox_loc_perm" + top: "conv6_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv6_2_mbox_conf" + type: "Convolution" + bottom: "conv6_2_h" + top: "conv6_2_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 12 # 126 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv6_2_mbox_conf_perm" + type: "Permute" + bottom: "conv6_2_mbox_conf" + top: "conv6_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv6_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv6_2_mbox_conf_perm" + top: 
"conv6_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv6_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv6_2_h" + bottom: "data" + top: "conv6_2_mbox_priorbox" + prior_box_param { + min_size: 111.0 + max_size: 162.0 + aspect_ratio: 2 + aspect_ratio: 3 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 32 + offset: 0.5 + } +} +layer { + name: "conv7_2_mbox_loc" + type: "Convolution" + bottom: "conv7_2_h" + top: "conv7_2_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 24 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv7_2_mbox_loc_perm" + type: "Permute" + bottom: "conv7_2_mbox_loc" + top: "conv7_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv7_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv7_2_mbox_loc_perm" + top: "conv7_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv7_2_mbox_conf" + type: "Convolution" + bottom: "conv7_2_h" + top: "conv7_2_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 12 # 126 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv7_2_mbox_conf_perm" + type: "Permute" + bottom: "conv7_2_mbox_conf" + top: "conv7_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv7_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv7_2_mbox_conf_perm" + top: "conv7_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv7_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv7_2_h" + bottom: "data" + top: "conv7_2_mbox_priorbox" + prior_box_param { + 
min_size: 162.0 + max_size: 213.0 + aspect_ratio: 2 + aspect_ratio: 3 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 64 + offset: 0.5 + } +} +layer { + name: "conv8_2_mbox_loc" + type: "Convolution" + bottom: "conv8_2_h" + top: "conv8_2_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 16 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv8_2_mbox_loc_perm" + type: "Permute" + bottom: "conv8_2_mbox_loc" + top: "conv8_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv8_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv8_2_mbox_loc_perm" + top: "conv8_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv8_2_mbox_conf" + type: "Convolution" + bottom: "conv8_2_h" + top: "conv8_2_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 8 # 84 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv8_2_mbox_conf_perm" + type: "Permute" + bottom: "conv8_2_mbox_conf" + top: "conv8_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv8_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv8_2_mbox_conf_perm" + top: "conv8_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv8_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv8_2_h" + bottom: "data" + top: "conv8_2_mbox_priorbox" + prior_box_param { + min_size: 213.0 + max_size: 264.0 + aspect_ratio: 2 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 100 + offset: 0.5 + } +} +layer { + name: "conv9_2_mbox_loc" + 
type: "Convolution" + bottom: "conv9_2_h" + top: "conv9_2_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 16 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv9_2_mbox_loc_perm" + type: "Permute" + bottom: "conv9_2_mbox_loc" + top: "conv9_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv9_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv9_2_mbox_loc_perm" + top: "conv9_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv9_2_mbox_conf" + type: "Convolution" + bottom: "conv9_2_h" + top: "conv9_2_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 8 # 84 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv9_2_mbox_conf_perm" + type: "Permute" + bottom: "conv9_2_mbox_conf" + top: "conv9_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv9_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv9_2_mbox_conf_perm" + top: "conv9_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv9_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv9_2_h" + bottom: "data" + top: "conv9_2_mbox_priorbox" + prior_box_param { + min_size: 264.0 + max_size: 315.0 + aspect_ratio: 2 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 300 + offset: 0.5 + } +} +layer { + name: "mbox_loc" + type: "Concat" + bottom: "conv4_3_norm_mbox_loc_flat" + bottom: "fc7_mbox_loc_flat" + bottom: "conv6_2_mbox_loc_flat" + bottom: "conv7_2_mbox_loc_flat" + bottom: "conv8_2_mbox_loc_flat" + bottom: "conv9_2_mbox_loc_flat" + top: "mbox_loc" + 
concat_param { + axis: 1 + } +} +layer { + name: "mbox_conf" + type: "Concat" + bottom: "conv4_3_norm_mbox_conf_flat" + bottom: "fc7_mbox_conf_flat" + bottom: "conv6_2_mbox_conf_flat" + bottom: "conv7_2_mbox_conf_flat" + bottom: "conv8_2_mbox_conf_flat" + bottom: "conv9_2_mbox_conf_flat" + top: "mbox_conf" + concat_param { + axis: 1 + } +} +layer { + name: "mbox_priorbox" + type: "Concat" + bottom: "conv4_3_norm_mbox_priorbox" + bottom: "fc7_mbox_priorbox" + bottom: "conv6_2_mbox_priorbox" + bottom: "conv7_2_mbox_priorbox" + bottom: "conv8_2_mbox_priorbox" + bottom: "conv9_2_mbox_priorbox" + top: "mbox_priorbox" + concat_param { + axis: 2 + } +} + +layer { + name: "mbox_loss" + type: "MultiBoxLoss" + bottom: "mbox_loc" + bottom: "mbox_conf" + bottom: "mbox_priorbox" + bottom: "label" + top: "mbox_loss" + include { + phase: TRAIN + } + propagate_down: true + propagate_down: true + propagate_down: false + propagate_down: false + loss_param { + normalization: VALID + } + multibox_loss_param { + loc_loss_type: SMOOTH_L1 + conf_loss_type: SOFTMAX + loc_weight: 1.0 + num_classes: 2 # 21 + share_location: true + match_type: PER_PREDICTION + overlap_threshold: 0.5 + use_prior_for_matching: true + background_label_id: 0 + use_difficult_gt: true + neg_pos_ratio: 3.0 + neg_overlap: 0.5 + code_type: CENTER_SIZE + ignore_cross_boundary_bbox: false + mining_type: MAX_NEGATIVE + } +} \ No newline at end of file
diff --git a/samples/dnn/resnet_ssd_face_python.py b/samples/dnn/resnet_ssd_face_python.py
new file mode 100644
index 0000000000..d8a102a3cd
--- /dev/null
+++ b/samples/dnn/resnet_ssd_face_python.py
@@ -0,0 +1,59 @@
+import numpy as np
+import argparse
+import os
+import sys
+# Import OpenCV only inside the guard so a missing module produces the hint below.
+try:
+    import cv2 as cv
+except ImportError:
+    raise ImportError('Can\'t find OpenCV Python module. If you\'ve built it from sources without installation, '
+                      'configure environment variable PYTHONPATH to "opencv_build_dir/lib" directory (with "python3" subdirectory if required)')
+
+from cv2 import dnn
+
+# Network input size and the minimum confidence for a detection to be drawn.
+inWidth = 300
+inHeight = 300
+confThreshold = 0.5
+
+# Model files, given relative to the current working directory.
+prototxt = 'face_detector/deploy.prototxt'
+caffemodel = 'face_detector/res10_300x300_ssd_iter_140000.caffemodel'
+
+if __name__ == '__main__':
+    net = dnn.readNetFromCaffe(prototxt, caffemodel)
+    cap = cv.VideoCapture(0)
+    while True:
+        ret, frame = cap.read()
+        if not ret:  # no frame grabbed (camera missing or stream ended)
+            break
+        rows, cols = frame.shape[:2]
+
+        net.setInput(dnn.blobFromImage(cv.resize(frame, (inWidth, inHeight)),
+                                       1.0, (inWidth, inHeight), (104., 177., 123.)))
+        detections = net.forward()
+
+        perf_stats = net.getPerfProfile()
+        print('Inference time, ms: %.2f' % (perf_stats[0] / cv.getTickFrequency() * 1000))
+
+        # Columns 2..6 of a detection: confidence, then box corners as fractions of the frame size.
+        for i in range(detections.shape[2]):
+            confidence = detections[0, 0, i, 2]
+            if confidence > confThreshold:
+                xLeftBottom = int(detections[0, 0, i, 3] * cols)
+                yLeftBottom = int(detections[0, 0, i, 4] * rows)
+                xRightTop = int(detections[0, 0, i, 5] * cols)
+                yRightTop = int(detections[0, 0, i, 6] * rows)
+                cv.rectangle(frame, (xLeftBottom, yLeftBottom), (xRightTop, yRightTop), (0, 255, 0))
+                label = "face: %.4f" % confidence
+                labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1)
+                # White filled box behind the text keeps the label readable on any background.
+                cv.rectangle(frame, (xLeftBottom, yLeftBottom - labelSize[1]),
+                             (xLeftBottom + labelSize[0], yLeftBottom + baseLine), (255, 255, 255), cv.FILLED)
+                cv.putText(frame, label, (xLeftBottom, yLeftBottom), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))
+
+        cv.imshow("detections", frame)
+        if cv.waitKey(1) != -1:
+            break
+    cap.release()  # free the camera device
+    cv.destroyAllWindows()