diff --git a/samples/dnn/human_parsing.py b/samples/dnn/human_parsing.py
index 467a19a3b1..43c495200a 100644
--- a/samples/dnn/human_parsing.py
+++ b/samples/dnn/human_parsing.py
@@ -3,8 +3,8 @@
 import numpy as np
 import argparse

-backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_HALIDE,
-            cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE)
+backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019,
+            cv.dnn.DNN_BACKEND_INFERENCE_ENGINE_NGRAPH, cv.dnn.DNN_BACKEND_OPENCV)
 targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD)

 parser = argparse.ArgumentParser(description='Use this script to run human parsing using JPPNet',
@@ -36,26 +36,27 @@ parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU,
 # 2. Create input
 #     image = cv2.imread(path/to/image)
 #     image_rev = np.flip(image, axis=1)
-#     image_h, image_w = image.shape[:2]
 #     input = np.stack([image, image_rev], axis=0)
 #
-# 3. Hardcode image_h and image_w shapes to determine output shapes
-#     - parsing_out1 = tf.reduce_mean(tf.stack([tf.image.resize_images(parsing_out1_100, [image_h, image_w]),
-#                                               tf.image.resize_images(parsing_out1_075, [image_h, image_w]),
-#                                               tf.image.resize_images(parsing_out1_125, [image_h, image_w])]), axis=0)
-#       Do similarly with parsing_out2, parsing_out3
-# 4. Remove postprocessing
-#     - parsing_ = sess.run(raw_output, feed_dict={'input:0': input})
+# 3. Hardcode image_h and image_w shapes to determine output shapes.
+#    We use default INPUT_SIZE = (384, 384) from evaluate_parsing_JPPNet-s2.py.
+#     - parsing_out1 = tf.reduce_mean(tf.stack([tf.image.resize_images(parsing_out1_100, INPUT_SIZE),
+#                                               tf.image.resize_images(parsing_out1_075, INPUT_SIZE),
+#                                               tf.image.resize_images(parsing_out1_125, INPUT_SIZE)]), axis=0)
+#       Do similarly with parsing_out2, parsing_out3
+# 4. Remove postprocessing. Last net operation:
+#     raw_output = tf.reduce_mean(tf.stack([parsing_out1, parsing_out2, parsing_out3]), axis=0)
+#    Change:
+#     parsing_ = sess.run(raw_output, feed_dict={'input:0': input})
 #
 # 5. To save model after sess.run(...) add:
-#    - input_graph_def = tf.get_default_graph().as_graph_def()
-#    - output_node = "Mean_3"
-#    - output_graph_def = tf.graph_util.convert_variables_to_constants(sess, input_graph_def, output_node)
-#    -
-#    - output_graph = "LIP_JPPNet.pb"
-#    - with tf.gfile.GFile(output_graph, "wb") as f:
-#    -     f.write(output_graph_def.SerializeToString())
-
+#    input_graph_def = tf.get_default_graph().as_graph_def()
+#    output_node = "Mean_3"
+#    output_graph_def = tf.graph_util.convert_variables_to_constants(sess, input_graph_def, output_node)
+#
+#    output_graph = "LIP_JPPNet.pb"
+#    with tf.gfile.GFile(output_graph, "wb") as f:
+#        f.write(output_graph_def.SerializeToString())


 def preprocess(image_path):
@@ -73,6 +74,8 @@ def run_net(input, model_path, backend, target):
     """
     Read network and infer model
     :param model_path: path to JPPNet model
+    :param backend: computation backend
+    :param target: computation device
     """
     net = cv.dnn.readNet(model_path)
     net.setPreferableBackend(backend)
@@ -82,10 +85,11 @@ def run_net(input, model_path, backend, target):
     return out


-def postprocess(out):
+def postprocess(out, input_shape):
     """
     Create a grayscale human segmentation
     :param out: network output
+    :param input_shape: input image width and height
     """
     # LIP classes
     # 0 Background
@@ -111,6 +115,10 @@
     head_output, tail_output = np.split(out, indices_or_sections=[1], axis=0)
     head_output = head_output.squeeze(0)
     tail_output = tail_output.squeeze(0)
+
+    head_output = np.stack([cv.resize(img, dsize=input_shape) for img in head_output[:, ...]])
+    tail_output = np.stack([cv.resize(img, dsize=input_shape) for img in tail_output[:, ...]])
+
     tail_list = np.split(tail_output, indices_or_sections=list(range(1, 20)), axis=0)
     tail_list = [arr.squeeze(0) for arr in tail_list]
     tail_list_rev = [tail_list[i] for i in range(14)]
@@ -149,8 +157,9 @@ def parse_human(image_path, model_path, backend, target):
     :param target: name of computation target
     """
     input = preprocess(image_path)
+    input_h, input_w = input.shape[2:]
     output = run_net(input, model_path, backend, target)
-    grayscale_out = postprocess(output)
+    grayscale_out = postprocess(output, (input_w, input_h))
     segmentation = decode_labels(grayscale_out)
     return segmentation
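Note on the first hunk: DNN_BACKEND_HALIDE is dropped and the single DNN_BACKEND_INFERENCE_ENGINE constant is replaced by the two Inference Engine API variants, NN Builder 2019 and nGraph. run_net applies whichever backend/target pair the user selects; a minimal sketch of that call sequence (the model path is illustrative):

    import cv2 as cv

    net = cv.dnn.readNet("LIP_JPPNet.pb")  # illustrative path to the frozen model
    net.setPreferableBackend(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
    net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU)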
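For convenience, here is the freezing recipe from the updated header comments gathered into a single TF 1.x sketch. sess is assumed to be the session inside the patched evaluate_parsing_JPPNet-s2.py; note that tf.graph_util.convert_variables_to_constants expects a list of output node names, so the sketch wraps "Mean_3" in one:

    import tensorflow as tf  # TF 1.x API, as used by LIP_JPPNet

    def freeze_jppnet(sess, output_node="Mean_3", output_graph="LIP_JPPNet.pb"):
        # Fold trained variables into constants so cv.dnn.readNet can load the graph
        input_graph_def = tf.get_default_graph().as_graph_def()
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess, input_graph_def, [output_node])  # list of output node names
        with tf.gfile.GFile(output_graph, "wb") as f:
            f.write(output_graph_def.SerializeToString())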
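On the new (input_w, input_h) tuple in parse_human: input.shape[2:] on the NCHW blob yields (height, width), while cv.resize takes dsize in (width, height) order, so the swap before calling postprocess is deliberate. A self-contained check with illustrative sizes:

    import numpy as np
    import cv2 as cv

    input_h, input_w = 384, 256                             # as from input.shape[2:]
    score_map = np.random.rand(96, 64).astype(np.float32)   # one low-res class map

    resized = cv.resize(score_map, dsize=(input_w, input_h))  # dsize = (width, height)
    assert resized.shape == (input_h, input_w)                # NumPy order: (height, width)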