Merge pull request #12243 from dkurt:dnn_tf_mask_rcnn
* Support Mask-RCNN from TensorFlow * Fix a sample
pull/12308/head
parent
4f360f8b1a
commit
472b71ecef
9 changed files with 600 additions and 153 deletions
@ -0,0 +1,143 @@ |
||||
import cv2 as cv |
||||
import argparse |
||||
import numpy as np |
||||
|
||||
# Command-line interface of the Mask-RCNN sample.
parser = argparse.ArgumentParser(
    description='Use this script to run Mask-RCNN object detection and semantic '
                'segmentation network from TensorFlow Object Detection API.')
parser.add_argument('--input', help='Path to input image or video file. Skip this argument to capture frames from a camera.')
parser.add_argument('--model', required=True, help='Path to a .pb file with weights.')
# The text graph extension is .pbtxt (the help text used to say ".pxtxt").
parser.add_argument('--config', required=True, help='Path to a .pbtxt file contains network configuration.')
parser.add_argument('--classes', help='Optional path to a text file with names of classes.')
parser.add_argument('--colors', help='Optional path to a text file with colors for an every class. '
                    'An every color is represented with three values from 0 to 255 in BGR channels order.')
# Size of the network input blob; every frame is resized to it.
parser.add_argument('--width', type=int, default=800,
                    help='Preprocess input image by resizing to a specific width.')
parser.add_argument('--height', type=int, default=800,
                    help='Preprocess input image by resizing to a specific height.')
# Detections with a score that is not above this value are skipped.
parser.add_argument('--thr', type=float, default=0.5, help='Confidence threshold')
args = parser.parse_args()
||||
|
||||
# Fixed seed: the class colors generated later are the same on every run.
np.random.seed(324)

# Optional class names, one name per line of the file.
classes = None
if args.classes:
    with open(args.classes, 'rt') as namesFile:
        namesText = namesFile.read()
    classes = namesText.rstrip('\n').split('\n')

# Optional class colors, one line of space-separated channel values per class.
colors = None
if args.colors:
    with open(args.colors, 'rt') as colorsFile:
        colorLines = colorsFile.read().rstrip('\n').split('\n')
    colors = []
    for colorLine in colorLines:
        colors.append(np.array(colorLine.split(' '), np.uint8))
||||
|
||||
# Image of the legend; it is built once, on the first call of showLegend.
legend = None


def showLegend(classes):
    """Show a 'Legend' window with one colored, labelled block per class.

    Does nothing when classes is None or when the legend has already been
    built. Reads the module-level colors list and stores the rendered image
    in the module-level legend variable.

    Args:
        classes: List of class names or None.

    Raises:
        AssertionError: If the number of class names differs from the
            number of colors.
    """
    global legend
    if classes is None or legend is not None:
        return

    blockHeight = 30
    assert(len(classes) == len(colors))

    legend = np.zeros((blockHeight * len(colors), 200, 3), np.uint8)
    for i, className in enumerate(classes):
        # A view of the legend rows that belong to the i-th class.
        block = legend[i * blockHeight:(i + 1) * blockHeight]
        block[:, :] = colors[i]
        # Floor division: cv.putText needs integer coordinates and '/' gives
        # a float on Python 3.
        cv.putText(block, className, (0, blockHeight // 2), cv.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255))

    cv.namedWindow('Legend', cv.WINDOW_NORMAL)
    cv.imshow('Legend', legend)
||||
|
||||
|
||||
def drawBox(frame, classId, conf, left, top, right, bottom):
    """Draw a detection rectangle and its text label on frame, in place.

    The label is the confidence with two decimals; when the module-level
    classes list is set, the class name is put in front of it.

    Args:
        frame: Image to draw on.
        classId: Index of the detected class.
        conf: Confidence of the detection.
        left, top, right, bottom: Box corners in pixels.
    """
    font = cv.FONT_HERSHEY_SIMPLEX
    fontScale = 0.5

    # Draw a bounding box.
    cv.rectangle(frame, (left, top), (right, bottom), (0, 255, 0))

    # Compose the label text.
    label = '%.2f' % conf
    if classes:
        assert(classId < len(classes))
        label = '%s: %s' % (classes[classId], label)

    (textWidth, textHeight), baseLine = cv.getTextSize(label, font, fontScale, 1)
    # Keep the label inside the image when the box touches the upper border.
    top = max(top, textHeight)
    cv.rectangle(frame, (left, top - textHeight), (left + textWidth, top + baseLine), (255, 255, 255), cv.FILLED)
    cv.putText(frame, label, (left, top), font, fontScale, (0, 0, 0))
||||
|
||||
|
||||
# Load a network from the weights file and the text graph.
net = cv.dnn.readNet(args.model, args.config)
net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)

winName = 'Mask-RCNN in OpenCV'
cv.namedWindow(winName, cv.WINDOW_NORMAL)

# Frames come from the given file, or from camera 0 when no input is given.
cap = cv.VideoCapture(args.input if args.input else 0)
legend = None


def clampCoord(value, size):
    """Limit a pixel coordinate to the range [0, size - 1]."""
    return max(0, min(value, size - 1))


# Process frames until a key is pressed.
while cv.waitKey(1) < 0:
    hasFrame, frame = cap.read()
    if not hasFrame:
        # No more frames: wait for a key press before leaving.
        cv.waitKey()
        break

    frameH, frameW = frame.shape[:2]

    # Create a 4D blob from a frame and run the model.
    blob = cv.dnn.blobFromImage(frame, size=(args.width, args.height), swapRB=True, crop=False)
    net.setInput(blob)
    boxes, masks = net.forward(['detection_out_final', 'detection_masks'])

    numClasses = masks.shape[1]
    numDetections = boxes.shape[2]

    if not colors:
        # Generate colors: every color is the mean of the previous one and a
        # random one, starting from black.
        previousColor = np.array([0, 0, 0], np.uint8)
        colors = []
        for _ in range(numClasses):
            previousColor = (previousColor + np.random.randint(0, 256, [3], np.uint8)) / 2
            colors.append(previousColor)

    # Draw segmentation; boxes are drawn afterwards, on top of the masks.
    boxesToDraw = []
    for detectionId in range(numDetections):
        detection = boxes[0, 0, detectionId]
        detectionMasks = masks[detectionId]
        score = detection[2]
        if not score > args.thr:
            continue

        classId = int(detection[1])
        # Box corners are multiplied by the frame size and kept inside the frame.
        left = clampCoord(int(frameW * detection[3]), frameW)
        top = clampCoord(int(frameH * detection[4]), frameH)
        right = clampCoord(int(frameW * detection[5]), frameW)
        bottom = clampCoord(int(frameH * detection[6]), frameH)

        boxesToDraw.append([frame, classId, score, left, top, right, bottom])

        # Resize the mask of the detected class to the box and binarize it.
        classMask = cv.resize(detectionMasks[classId], (right - left + 1, bottom - top + 1))
        binaryMask = classMask > 0.5

        # Blend the class color into the masked pixels of the box region.
        region = frame[top:bottom + 1, left:right + 1]
        region[binaryMask] = (0.7 * colors[classId] + 0.3 * region[binaryMask]).astype(np.uint8)

    for boxArgs in boxesToDraw:
        drawBox(*boxArgs)

    # Put efficiency information.
    t, _ = net.getPerfProfile()
    label = 'Inference time: %.2f ms' % (t * 1000.0 / cv.getTickFrequency())
    cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))

    showLegend(classes)

    cv.imshow(winName, frame)
@ -0,0 +1,230 @@ |
||||
import argparse |
||||
import numpy as np |
||||
import tensorflow as tf |
||||
|
||||
from tensorflow.core.framework.node_def_pb2 import NodeDef |
||||
from tensorflow.tools.graph_transforms import TransformGraph |
||||
from google.protobuf import text_format |
||||
|
||||
from tf_text_graph_common import * |
||||
|
||||
# Command-line interface of the text graph generator.
parser = argparse.ArgumentParser(description='Run this script to get a text graph of '
                                 'Mask-RCNN model from TensorFlow Object Detection API. '
                                 'Then pass it with .pb file to cv::dnn::readNetFromTensorflow function.')
parser.add_argument('--input', required=True, help='Path to frozen TensorFlow graph.')
parser.add_argument('--output', required=True, help='Path to output text graph.')
parser.add_argument('--num_classes', default=90, type=int, help='Number of trained classes.')
parser.add_argument('--scales', default=[0.25, 0.5, 1.0, 2.0], type=float, nargs='+',
                    help='Hyper-parameter of grid_anchor_generator from a config file.')
parser.add_argument('--aspect_ratios', default=[0.5, 1.0, 2.0], type=float, nargs='+',
                    help='Hyper-parameter of grid_anchor_generator from a config file.')
# A single number: the script formats it with '%f' and squares it, so it must
# not be collected into a list (nargs='+' made every user-supplied value fail).
parser.add_argument('--features_stride', default=16, type=float,
                    help='Hyper-parameter from a config file.')
args = parser.parse_args()
||||
|
||||
# Name prefixes of the nodes that stay in the graph.
scopesToKeep = (
    'FirstStageFeatureExtractor',
    'Conv',
    'FirstStageBoxPredictor/BoxEncodingPredictor',
    'FirstStageBoxPredictor/ClassPredictor',
    'CropAndResize',
    'MaxPool2D',
    'SecondStageFeatureExtractor',
    'SecondStageBoxPredictor',
    'Preprocessor/sub',
    'Preprocessor/mul',
    'image_tensor',
)

# Name prefixes that are removed even though they match a kept prefix.
scopesToIgnore = (
    'FirstStageFeatureExtractor/Assert',
    'FirstStageFeatureExtractor/Shape',
    'FirstStageFeatureExtractor/strided_slice',
    'FirstStageFeatureExtractor/GreaterEqual',
    'FirstStageFeatureExtractor/LogicalAnd',
)

# Read the graph.
graph_def = tf.GraphDef()
with tf.gfile.FastGFile(args.input, 'rb') as graphFile:
    graph_def.ParseFromString(graphFile.read())

# Helper from tf_text_graph_common; presumably drops Identity nodes - see that module.
removeIdentity(graph_def)
||||
|
||||
def to_remove(name, op):
    """Tell whether the node called name has to be removed from the graph.

    A node is kept only if its name starts with one of scopesToKeep and with
    none of scopesToIgnore. The op argument is accepted but not used.
    """
    isKept = name.startswith(scopesToKeep) and not name.startswith(scopesToIgnore)
    return not isKept
||||
|
||||
# Prune the graph with the to_remove predicate (helper from tf_text_graph_common).
removeUnusedNodesAndAttrs(to_remove, graph_def)

# Connect input node to the first layer
inputNode = graph_def.node[0]
assert(inputNode.op == 'Placeholder')
graph_def.node[1].input.insert(0, inputNode.name)

# Temporarily remove top nodes: pop nodes from the end of the graph up to and
# including the second CropAndResize node met on the way.
topNodes = []
numCropAndResize = 0
while numCropAndResize < 2:
    node = graph_def.node.pop()
    topNodes.append(node)
    if node.op == 'CropAndResize':
        numCropAndResize += 1
||||
|
||||
# Class scores of the first stage: reshape, softmax, flatten.
addReshape('FirstStageBoxPredictor/ClassPredictor/BiasAdd',
           'FirstStageBoxPredictor/ClassPredictor/reshape_1', [0, -1, 2], graph_def)
addSoftMax('FirstStageBoxPredictor/ClassPredictor/reshape_1',
           'FirstStageBoxPredictor/ClassPredictor/softmax', graph_def)  # Compare with Reshape_4
addFlatten('FirstStageBoxPredictor/ClassPredictor/softmax',
           'FirstStageBoxPredictor/ClassPredictor/softmax/flatten', graph_def)

# Compare with FirstStageBoxPredictor/BoxEncodingPredictor/BiasAdd
addFlatten('FirstStageBoxPredictor/BoxEncodingPredictor/BiasAdd',
           'FirstStageBoxPredictor/BoxEncodingPredictor/flatten', graph_def)

# PriorBox node that takes the box predictions and the input image.
proposals = NodeDef()
proposals.name = 'proposals'  # Compare with ClipToWindow/Gather/Gather (NOTE: normalized)
proposals.op = 'PriorBox'
proposals.input.extend(['FirstStageBoxPredictor/BoxEncodingPredictor/BiasAdd',
                        graph_def.node[0].name])  # image_tensor

proposalAttrs = (
    ("flip", 'b: false'),
    ("clip", 'b: true'),
    ("step", 'f: %f' % args.features_stride),
    ("offset", 'f: 0.0'),
    ("variance", tensorMsg([0.1, 0.1, 0.2, 0.2])),
)
for attrName, attrText in proposalAttrs:
    text_format.Merge(attrText, proposals.attr[attrName])

# One width/height pair for every (aspect ratio, scale) combination.
widths = []
heights = []
baseSize = args.features_stride**2
for ratio in args.aspect_ratios:
    ratioRoot = np.sqrt(ratio)
    for scale in args.scales:
        heights.append(baseSize * scale / ratioRoot)
        widths.append(baseSize * scale * ratioRoot)

text_format.Merge(tensorMsg(widths), proposals.attr["width"])
text_format.Merge(tensorMsg(heights), proposals.attr["height"])

graph_def.node.extend([proposals])

# First-stage DetectionOutput node. Compare with Reshape_5
detectionOut = NodeDef()
detectionOut.name = 'detection_out'
detectionOut.op = 'DetectionOutput'
detectionOut.input.extend(['FirstStageBoxPredictor/BoxEncodingPredictor/flatten',
                           'FirstStageBoxPredictor/ClassPredictor/softmax/flatten',
                           'proposals'])

detectionAttrs = (
    ('num_classes', 'i: 2'),
    ('share_location', 'b: true'),
    ('background_label_id', 'i: 0'),
    ('nms_threshold', 'f: 0.7'),
    ('top_k', 'i: 6000'),
    ('code_type', 's: "CENTER_SIZE"'),
    ('keep_top_k', 'i: 100'),
    ('clip', 'b: true'),
)
for attrName, attrText in detectionAttrs:
    text_format.Merge(attrText, detectionOut.attr[attrName])

graph_def.node.extend([detectionOut])
||||
|
||||
# Put back the detached nodes that come before the second CropAndResize node.
# That node and all the nodes after it stay in topNodes for now.
while topNodes:
    node = topNodes[-1]
    if node.op == 'CropAndResize':
        if numCropAndResize == 1:
            break
        numCropAndResize -= 1
    graph_def.node.extend([node])
    topNodes.pop()

# Class scores of the second stage: softmax, slice, reshape.
addSoftMax('SecondStageBoxPredictor/Reshape_1', 'SecondStageBoxPredictor/Reshape_1/softmax', graph_def)
addSlice('SecondStageBoxPredictor/Reshape_1/softmax',
         'SecondStageBoxPredictor/Reshape_1/slice',
         [0, 0, 1], [-1, -1, -1], graph_def)
addReshape('SecondStageBoxPredictor/Reshape_1/slice',
           'SecondStageBoxPredictor/Reshape_1/Reshape', [1, -1], graph_def)

# Replace Flatten subgraph onto a single node.
flattenShapeNodes = ('SecondStageBoxPredictor/Flatten/flatten/Shape',
                     'SecondStageBoxPredictor/Flatten/flatten/strided_slice',
                     'SecondStageBoxPredictor/Flatten/flatten/Reshape/shape')
nodes = graph_def.node
# Go from the last node to the first one so that a deletion does not shift
# the indices that are still to be visited.
for idx in range(len(nodes) - 1, -1, -1):
    if nodes[idx].op == 'CropAndResize':
        nodes[idx].input.insert(1, 'detection_out')

    if nodes[idx].name == 'SecondStageBoxPredictor/Reshape':
        addConstNode('SecondStageBoxPredictor/Reshape/shape2', [1, -1, 4], graph_def)

        # Replace the last input with the new constant.
        nodes[idx].input.pop()
        nodes[idx].input.append('SecondStageBoxPredictor/Reshape/shape2')
    elif nodes[idx].name in flattenShapeNodes:
        del nodes[idx]

locationPredictors = ('FirstStageBoxPredictor/BoxEncodingPredictor/Conv2D',
                      'SecondStageBoxPredictor/BoxEncodingPredictor/MatMul')
for node in graph_def.node:
    if node.name == 'SecondStageBoxPredictor/Flatten/flatten/Reshape':
        # Turn the Reshape node into a Flatten node and drop its last input.
        node.op = 'Flatten'
        node.input.pop()
    elif node.name in locationPredictors:
        text_format.Merge('b: true', node.attr["loc_pred_transposed"])
||||
|
||||
################################################################################
### Postprocessing
################################################################################
addSlice('detection_out', 'detection_out/slice', [0, 0, 0, 3], [-1, -1, -1, 4], graph_def)

# Constant node with the variances.
variance = NodeDef()
variance.name = 'proposals/variance'
variance.op = 'Const'
text_format.Merge(tensorMsg([0.1, 0.1, 0.2, 0.2]), variance.attr["value"])
graph_def.node.extend([variance])

# Mul node with the second-stage box predictions and the variances as inputs.
varianceEncoder = NodeDef()
varianceEncoder.name = 'variance_encoded'
varianceEncoder.op = 'Mul'
varianceEncoder.input.extend(['SecondStageBoxPredictor/Reshape', variance.name])
text_format.Merge('i: 2', varianceEncoder.attr["axis"])
graph_def.node.extend([varianceEncoder])

addReshape('detection_out/slice', 'detection_out/slice/reshape', [1, 1, -1], graph_def)
addFlatten('variance_encoded', 'variance_encoded/flatten', graph_def)

# Final DetectionOutput node.
detectionOut = NodeDef()
detectionOut.name = 'detection_out_final'
detectionOut.op = 'DetectionOutput'
detectionOut.input.extend(['variance_encoded/flatten',
                           'SecondStageBoxPredictor/Reshape_1/Reshape',
                           'detection_out/slice/reshape'])

finalAttrs = (
    ('num_classes', 'i: %d' % args.num_classes),
    ('share_location', 'b: false'),
    ('background_label_id', 'i: %d' % (args.num_classes + 1)),
    ('nms_threshold', 'f: 0.6'),
    ('code_type', 's: "CENTER_SIZE"'),
    ('keep_top_k', 'i: 100'),
    ('clip', 'b: true'),
    ('variance_encoded_in_target', 'b: true'),
    ('confidence_threshold', 'f: 0.3'),
    ('group_by_classes', 'b: false'),
)
for attrName, attrText in finalAttrs:
    text_format.Merge(attrText, detectionOut.attr[attrName])
graph_def.node.extend([detectionOut])

# Put back the rest of the detached nodes in their original order.
graph_def.node.extend(topNodes[::-1])

# Give the final detections to the last CropAndResize node of the graph.
nodes = graph_def.node
for idx in range(len(nodes) - 1, -1, -1):
    if nodes[idx].op == 'CropAndResize':
        nodes[idx].input.insert(1, 'detection_out_final')
        break

# The last node becomes the 'detection_masks' Sigmoid output without its last input.
lastNode = graph_def.node[-1]
lastNode.name = 'detection_masks'
lastNode.op = 'Sigmoid'
lastNode.input.pop()

tf.train.write_graph(graph_def, "", args.output, as_text=True)
Loading…
Reference in new issue