From d70053aba528ec5ddf8d6906109c7cc9310654a5 Mon Sep 17 00:00:00 2001
From: Maxim Pashchenkov <maxim.pashchenkov@intel.com>
Date: Thu, 1 Jul 2021 13:27:28 +0300
Subject: [PATCH] Merge pull request #20144 from mpashchenkov:mp/python-ge

G-API: Python. Gaze Estimation sample.

* GE pep8

* Added function description, wrapped copy

* Applying review comments

* One more change

* Added gin

* Rstrt bb
---
 .../misc/python/samples/gaze_estimation.py   | 467 ++++++++++++++++++
 1 file changed, 467 insertions(+)
 create mode 100644 modules/gapi/misc/python/samples/gaze_estimation.py

diff --git a/modules/gapi/misc/python/samples/gaze_estimation.py b/modules/gapi/misc/python/samples/gaze_estimation.py
new file mode 100644
index 0000000000..db190f67bb
--- /dev/null
+++ b/modules/gapi/misc/python/samples/gaze_estimation.py
@@ -0,0 +1,467 @@
+import argparse
+import time
+import numpy as np
+import cv2 as cv
+
+# ------------------------Service operations------------------------
+def weight_path(model_path):
+    """ Get the path to the weights file from the path to the IR
+
+    Params:
+        model_path: string containing the path to the model's .xml file
+
+    Return:
+        Path to the .bin weights file
+    """
+    assert model_path.endswith('.xml'), "Wrong topology path was provided"
+    return model_path[:-3] + 'bin'
+
+
+def build_argparser():
+    """ Build the command-line argument parser
+
+    Return:
+        Configured argparse.ArgumentParser instance
+    """
+    parser = argparse.ArgumentParser(description='This is an OpenCV-based version of the Gaze Estimation example')
+
+    parser.add_argument('--input',
+                        help='Path to the input video file')
+    parser.add_argument('--out',
+                        help='Path to the output video file')
+    parser.add_argument('--facem',
+                        default='face-detection-retail-0005.xml',
+                        help='Path to OpenVINO face detection model (.xml)')
+    parser.add_argument('--faced',
+                        default='CPU',
+                        help='Target device for the face detection ' +
+                             '(e.g. CPU, GPU, VPU, ...)')
+    parser.add_argument('--headm',
+                        default='head-pose-estimation-adas-0001.xml',
+                        help='Path to OpenVINO head pose estimation model (.xml)')
+    parser.add_argument('--headd',
+                        default='CPU',
+                        help='Target device for the head pose estimation inference ' +
+                             '(e.g. CPU, GPU, VPU, ...)')
+    parser.add_argument('--landm',
+                        default='facial-landmarks-35-adas-0002.xml',
+                        help='Path to OpenVINO landmarks detector model (.xml)')
+    parser.add_argument('--landd',
+                        default='CPU',
+                        help='Target device for the landmarks detector (e.g. CPU, GPU, VPU, ...)')
+    parser.add_argument('--gazem',
+                        default='gaze-estimation-adas-0002.xml',
+                        help='Path to OpenVINO gaze vector estimation model (.xml)')
+    parser.add_argument('--gazed',
+                        default='CPU',
+                        help='Target device for the gaze vector estimation inference ' +
+                             '(e.g. CPU, GPU, VPU, ...)')
+    parser.add_argument('--eyem',
+                        default='open-closed-eye-0001.xml',
+                        help='Path to OpenVINO open-closed eye model (.xml)')
+    parser.add_argument('--eyed',
+                        default='CPU',
+                        help='Target device for the eyes state inference (e.g. CPU, GPU, VPU, ...)')
+    return parser
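+
+# Example invocation (hypothetical paths: the models are the Open Model Zoo
+# IRs named in the defaults above and must be downloaded separately, e.g.
+# with omz_downloader; the video file name is a placeholder):
+#
+#   python gaze_estimation.py --input video.mp4 --facem face-detection-retail-0005.xml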
+
+
+# ------------------------Support functions for custom kernels------------------------
+def intersection(surface, rect):
+    """ Clip a ROI to the image bounds
+
+    Params:
+        surface: image bounds as a rect (top-left x and y, width, height)
+        rect: region of interest in the same rect representation
+
+    Return:
+        The ROI clipped to the surface bounds (all zeros if they do not overlap)
+    """
+    l_x = max(surface[0], rect[0])
+    l_y = max(surface[1], rect[1])
+    width = min(surface[0] + surface[2], rect[0] + rect[2]) - l_x
+    height = min(surface[1] + surface[3], rect[1] + rect[3]) - l_y
+    if width < 0 or height < 0:
+        return (0, 0, 0, 0)
+    return (l_x, l_y, width, height)
+
+
+def process_landmarks(r_x, r_y, r_w, r_h, landmarks):
+    """ Convert normalized landmarks from the facial-landmarks network
+        into image-space points for one face
+
+    Params:
+        r_x: x coordinate of the top-left corner of the face ROI
+        r_y: y coordinate of the top-left corner of the face ROI
+        r_w: width of the face ROI
+        r_h: height of the face ROI
+        landmarks: inference result of the facial-landmarks network
+
+    Return:
+        Array of landmark points for one face
+    """
+    lmrks = landmarks[0]
+    raw_x = lmrks[::2] * r_w + r_x
+    raw_y = lmrks[1::2] * r_h + r_y
+    return np.array([[int(x), int(y)] for x, y in zip(raw_x, raw_y)])
+
+
+def eye_box(p_1, p_2, scale=1.8):
+    """ Get the bounding box of an eye
+
+    Params:
+        p_1: point at one corner of the eye
+        p_2: point at the other corner of the eye
+        scale: multiplier for the box size
+
+    Return:
+        Bounding box of the eye and its midpoint
+    """
+    size = np.linalg.norm(p_1 - p_2)
+    midpoint = (p_1 + p_2) / 2
+    width = scale * size
+    height = width
+    p_x = midpoint[0] - (width / 2)
+    p_y = midpoint[1] - (height / 2)
+    return (int(p_x), int(p_y), int(width), int(height)), list(map(int, midpoint))
+
+
+# ------------------------Custom graph operations------------------------
+@cv.gapi.op('custom.GProcessPoses',
+            in_types=[cv.GArray.GMat, cv.GArray.GMat, cv.GArray.GMat],
+            out_types=[cv.GArray.GMat])
+class GProcessPoses:
+    @staticmethod
+    def outMeta(arr_desc0, arr_desc1, arr_desc2):
+        return cv.empty_array_desc()
+
+
+@cv.gapi.op('custom.GParseEyes',
+            in_types=[cv.GArray.GMat, cv.GArray.Rect, cv.GOpaque.Size],
+            out_types=[cv.GArray.Rect, cv.GArray.Rect, cv.GArray.Point, cv.GArray.Point])
+class GParseEyes:
+    @staticmethod
+    def outMeta(arr_desc0, arr_desc1, arr_desc2):
+        return cv.empty_array_desc(), cv.empty_array_desc(), \
+               cv.empty_array_desc(), cv.empty_array_desc()
+
+
+@cv.gapi.op('custom.GGetStates',
+            in_types=[cv.GArray.GMat, cv.GArray.GMat],
+            out_types=[cv.GArray.Int, cv.GArray.Int])
+class GGetStates:
+    @staticmethod
+    def outMeta(arr_desc0, arr_desc1):
+        return cv.empty_array_desc(), cv.empty_array_desc()
+
+
+# ------------------------Custom kernels------------------------
+@cv.gapi.kernel(GProcessPoses)
+class GProcessPosesImpl:
+    """ Custom kernel. Combines head pose angles into per-head pose vectors
+    """
+    @staticmethod
+    def run(in_ys, in_ps, in_rs):
+        """ Custom kernel executable code
+
+        Params:
+            in_ys: yaw angles, one per head
+            in_ps: pitch angles, one per head
+            in_rs: roll angles, one per head
+
+        Return:
+            Array of head pose vectors (yaw, pitch, roll)
+        """
+        out_poses = []
+        size = len(in_ys)
+        for i in range(size):
+            out_poses.append(np.array([in_ys[i][0], in_ps[i][0], in_rs[i][0]]).T)
+        return out_poses
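+
+
+# The next kernel indexes landmark points 0-1 for one eye box and 2-3 for the
+# other, which matches the facial-landmarks-35-adas-0002 output layout as
+# described in Open Model Zoo (the first four points are the eye corners);
+# other landmark models may order their points differently.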
+@cv.gapi.kernel(GParseEyes)
+class GParseEyesImpl:
+    """ Custom kernel. Extracts eye regions and landmark points
+    """
+    @staticmethod
+    def run(in_landm_per_face, in_face_rcs, frame_size):
+        """ Custom kernel executable code
+
+        Params:
+            in_landm_per_face: inference results of the facial-landmarks network, one per face
+            in_face_rcs: bounding boxes, one per face
+            frame_size: size of the input image
+
+        Return:
+            Arrays of ROIs for the left and right eyes, array of eye midpoints
+            and array of landmark points
+        """
+        left_eyes = []
+        right_eyes = []
+        midpoints = []
+        lmarks = []
+        num_faces = len(in_landm_per_face)
+        surface = (0, 0, *frame_size)
+        for i in range(num_faces):
+            rect = in_face_rcs[i]
+            points = process_landmarks(*rect, in_landm_per_face[i])
+            for p in points:
+                lmarks.append(p)
+            size = len(in_landm_per_face[i][0]) // 2
+
+            rect, midpoint_l = eye_box(lmarks[0 + i * size], lmarks[1 + i * size])
+            left_eyes.append(intersection(surface, rect))
+            rect, midpoint_r = eye_box(lmarks[2 + i * size], lmarks[3 + i * size])
+            right_eyes.append(intersection(surface, rect))
+            midpoints += [midpoint_l, midpoint_r]
+        return left_eyes, right_eyes, midpoints, lmarks
+
+
+@cv.gapi.kernel(GGetStates)
+class GGetStatesImpl:
+    """ Custom kernel. Gets the state of each eye: 1 (open) or 0 (closed)
+    """
+    @staticmethod
+    def run(eyesl, eyesr):
+        """ Custom kernel executable code
+
+        Params:
+            eyesl: inference results of the open-closed-eye network for the left eyes
+            eyesr: inference results of the open-closed-eye network for the right eyes
+
+        Return:
+            States of the left eyes and states of the right eyes
+        """
+        size = len(eyesl)
+        out_l_st = []
+        out_r_st = []
+        for i in range(size):
+            for st in eyesl[i]:
+                out_l_st += [1 if st[0] < st[1] else 0]
+            for st in eyesr[i]:
+                out_r_st += [1 if st[0] < st[1] else 0]
+        return out_l_st, out_r_st
+
+
+if __name__ == '__main__':
+    ARGUMENTS = build_argparser().parse_args()
+
+    # ------------------------Demo's graph------------------------
+    g_in = cv.GMat()
+
+    # Detect faces
+    face_inputs = cv.GInferInputs()
+    face_inputs.setInput('data', g_in)
+    face_outputs = cv.gapi.infer('face-detection', face_inputs)
+    faces = face_outputs.at('detection_out')
+
+    # Parse faces
+    sz = cv.gapi.streaming.size(g_in)
+    faces_rc = cv.gapi.parseSSD(faces, sz, 0.5, False, False)
+
+    # Detect poses
+    head_inputs = cv.GInferInputs()
+    head_inputs.setInput('data', g_in)
+    head_outputs = cv.gapi.infer('head-pose', faces_rc, head_inputs)
+    angles_y = head_outputs.at('angle_y_fc')
+    angles_p = head_outputs.at('angle_p_fc')
+    angles_r = head_outputs.at('angle_r_fc')
+
+    # Parse poses
+    heads_pos = GProcessPoses.on(angles_y, angles_p, angles_r)
+
+    # Detect landmarks
+    landmark_inputs = cv.GInferInputs()
+    landmark_inputs.setInput('data', g_in)
+    landmark_outputs = cv.gapi.infer('facial-landmarks', faces_rc,
+                                     landmark_inputs)
+    landmark = landmark_outputs.at('align_fc3')
+
+    # Parse landmarks
+    left_eyes, right_eyes, mids, lmarks = GParseEyes.on(landmark, faces_rc, sz)
+
+    # Detect eyes
+    eyes_inputs = cv.GInferInputs()
+    eyes_inputs.setInput('input.1', g_in)
+    eyesl_outputs = cv.gapi.infer('open-closed-eye', left_eyes, eyes_inputs)
+    eyesr_outputs = cv.gapi.infer('open-closed-eye', right_eyes, eyes_inputs)
+    eyesl = eyesl_outputs.at('19')
+    eyesr = eyesr_outputs.at('19')
+
+    # Process eyes states
+    l_eye_st, r_eye_st = GGetStates.on(eyesl, eyesr)
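+
+    # Note on the two inference flavors used in this graph (a hedged summary
+    # drawn from the calls themselves): cv.gapi.infer with an ROI array, as
+    # above, runs the network once per ROI cropped from g_in; cv.gapi.infer2,
+    # used next, takes a GArray per network input - Rect arrays are cropped
+    # from the frame, while GMat arrays (heads_pos) are passed through
+    # element by element.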
+
+    # Gaze estimation
+    gaze_inputs = cv.GInferListInputs()
+    gaze_inputs.setInput('left_eye_image', left_eyes)
+    gaze_inputs.setInput('right_eye_image', right_eyes)
+    gaze_inputs.setInput('head_pose_angles', heads_pos)
+    gaze_outputs = cv.gapi.infer2('gaze-estimation', g_in, gaze_inputs)
+    gaze_vectors = gaze_outputs.at('gaze_vector')
+
+    out = cv.gapi.copy(g_in)
+    # ------------------------End of graph------------------------
+
+    comp = cv.GComputation(cv.GIn(g_in), cv.GOut(out,
+                                                 faces_rc,
+                                                 left_eyes,
+                                                 right_eyes,
+                                                 gaze_vectors,
+                                                 angles_y,
+                                                 angles_p,
+                                                 angles_r,
+                                                 l_eye_st,
+                                                 r_eye_st,
+                                                 mids,
+                                                 lmarks))
+
+    # Networks
+    face_net = cv.gapi.ie.params('face-detection', ARGUMENTS.facem,
+                                 weight_path(ARGUMENTS.facem), ARGUMENTS.faced)
+    head_pose_net = cv.gapi.ie.params('head-pose', ARGUMENTS.headm,
+                                      weight_path(ARGUMENTS.headm), ARGUMENTS.headd)
+    landmarks_net = cv.gapi.ie.params('facial-landmarks', ARGUMENTS.landm,
+                                      weight_path(ARGUMENTS.landm), ARGUMENTS.landd)
+    gaze_net = cv.gapi.ie.params('gaze-estimation', ARGUMENTS.gazem,
+                                 weight_path(ARGUMENTS.gazem), ARGUMENTS.gazed)
+    eye_net = cv.gapi.ie.params('open-closed-eye', ARGUMENTS.eyem,
+                                weight_path(ARGUMENTS.eyem), ARGUMENTS.eyed)
+
+    nets = cv.gapi.networks(face_net, head_pose_net, landmarks_net, gaze_net, eye_net)
+
+    # Kernels pack
+    kernels = cv.gapi.kernels(GParseEyesImpl, GProcessPosesImpl, GGetStatesImpl)
+
+    # ------------------------Execution part------------------------
+    ccomp = comp.compileStreaming(args=cv.gapi.compile_args(kernels, nets))
+    source = cv.gapi.wip.make_capture_src(ARGUMENTS.input)
+    ccomp.setSource(cv.gin(source))
+    ccomp.start()
+
+    frames = 0
+    fps = 0
+    print('Processing')
+    START_TIME = time.time()
+
+    while True:
+        start_time_cycle = time.time()
+        has_frame, (oimg,
+                    outr,
+                    l_eyes,
+                    r_eyes,
+                    outg,
+                    out_y,
+                    out_p,
+                    out_r,
+                    out_st_l,
+                    out_st_r,
+                    out_mids,
+                    outl) = ccomp.pull()
+
+        if not has_frame:
+            break
+
+        # Draw
+        GREEN = (0, 255, 0)
+        RED = (0, 0, 255)
+        WHITE = (255, 255, 255)
+        BLUE = (255, 0, 0)
+        PINK = (255, 0, 255)
+        YELLOW = (0, 255, 255)
+
+        M_PI_180 = np.pi / 180
+        M_PI_2 = np.pi / 2
+        M_PI = np.pi
+
+        FACES_SIZE = len(outr)
+
+        for i, out_rect in enumerate(outr):
+            # Face box
+            cv.rectangle(oimg, out_rect, WHITE, 1)
+            rx, ry, rwidth, rheight = out_rect
+
+            # Landmarks
+            lm_radius = int(0.01 * rwidth + 1)
+            lmsize = len(outl) // FACES_SIZE
+            for j in range(lmsize):
+                cv.circle(oimg, outl[j + i * lmsize], lm_radius, YELLOW, -1)
+
+            # Headposes
+            yaw = out_y[i]
+            pitch = out_p[i]
+            roll = out_r[i]
+            sin_y = np.sin(yaw[:] * M_PI_180)
+            sin_p = np.sin(pitch[:] * M_PI_180)
+            sin_r = np.sin(roll[:] * M_PI_180)
+
+            cos_y = np.cos(yaw[:] * M_PI_180)
+            cos_p = np.cos(pitch[:] * M_PI_180)
+            cos_r = np.cos(roll[:] * M_PI_180)
+
+            axis_length = 0.4 * rwidth
+            x_center = int(rx + rwidth / 2)
+            y_center = int(ry + rheight / 2)
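+
+            # The endpoint offsets in the three lines below appear to be
+            # entries of the rotation matrix composed from (yaw, pitch, roll),
+            # scaled by axis_length and projected onto the image plane: the
+            # face-local right (red), up (green) and forward (pink) axes.
+            # This is a reading of the trigonometry, not a documented formula.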
+            # center to right
+            cv.line(oimg, [x_center, y_center],
+                    [int(x_center + axis_length * (cos_r * cos_y + sin_y * sin_p * sin_r)),
+                     int(y_center + axis_length * cos_p * sin_r)],
+                    RED, 2)
+
+            # center to top
+            cv.line(oimg, [x_center, y_center],
+                    [int(x_center + axis_length * (cos_r * sin_y * sin_p + cos_y * sin_r)),
+                     int(y_center - axis_length * cos_p * cos_r)],
+                    GREEN, 2)
+
+            # center to forward
+            cv.line(oimg, [x_center, y_center],
+                    [int(x_center + axis_length * sin_y * cos_p),
+                     int(y_center + axis_length * sin_p)],
+                    PINK, 2)
+
+            scale_box = 0.002 * rwidth
+            cv.putText(oimg, "head pose: (y=%0.0f, p=%0.0f, r=%0.0f)" %
+                       (np.round(yaw), np.round(pitch), np.round(roll)),
+                       [int(rx), int(ry + rheight + 5 * rwidth / 100)],
+                       cv.FONT_HERSHEY_PLAIN, scale_box * 2, WHITE, 1)
+
+            # Eyes boxes
+            color_l = GREEN if out_st_l[i] else RED
+            cv.rectangle(oimg, l_eyes[i], color_l, 1)
+            color_r = GREEN if out_st_r[i] else RED
+            cv.rectangle(oimg, r_eyes[i], color_r, 1)
+
+            # Gaze vectors
+            norm_gazes = np.linalg.norm(outg[i][0])
+            gaze_vector = outg[i][0] / norm_gazes
+
+            arrow_length = 0.4 * rwidth
+            gaze_arrow = [arrow_length * gaze_vector[0], -arrow_length * gaze_vector[1]]
+            left_arrow = [int(a + b) for a, b in zip(out_mids[0 + i * 2], gaze_arrow)]
+            right_arrow = [int(a + b) for a, b in zip(out_mids[1 + i * 2], gaze_arrow)]
+            if out_st_l[i]:
+                cv.arrowedLine(oimg, out_mids[0 + i * 2], left_arrow, BLUE, 2)
+            if out_st_r[i]:
+                cv.arrowedLine(oimg, out_mids[1 + i * 2], right_arrow, BLUE, 2)
+
+            v0, v1, v2 = outg[i][0]
+
+            gaze_angles = [180 / M_PI * (M_PI_2 + np.arctan2(v2, v0)),
+                           180 / M_PI * (M_PI_2 - np.arccos(v1 / norm_gazes))]
+            cv.putText(oimg, "gaze angles: (h=%0.0f, v=%0.0f)" %
+                       (np.round(gaze_angles[0]), np.round(gaze_angles[1])),
+                       [int(rx), int(ry + rheight + 12 * rwidth / 100)],
+                       cv.FONT_HERSHEY_PLAIN, scale_box * 2, WHITE, 1)
+
+        # Add FPS value to frame
+        cv.putText(oimg, "FPS: %0i" % (fps), [20, 40],
+                   cv.FONT_HERSHEY_PLAIN, 2, RED, 2)
+
+        # Show result
+        cv.imshow('Gaze Estimation', oimg)
+        cv.waitKey(1)  # give HighGUI a chance to actually render the frame
+
+        fps = int(1. / (time.time() - start_time_cycle))
+        frames += 1
+    EXECUTION_TIME = time.time() - START_TIME
+    print('Execution successful')
+    print('Mean FPS is ', int(frames / EXECUTION_TIME))
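+
+    # Note: '--out' is parsed but not used above. A minimal sketch of honoring
+    # it (hypothetical: assumes an MJPG-capable backend and a fixed 25 FPS,
+    # with writer initialized to None before the loop) would create the writer
+    # lazily from the first pulled frame and write each drawn frame:
+    #
+    #     if ARGUMENTS.out and writer is None:
+    #         h, w = oimg.shape[:2]
+    #         writer = cv.VideoWriter(ARGUMENTS.out,
+    #                                 cv.VideoWriter_fourcc(*'MJPG'), 25, (w, h))
+    #     if writer is not None:
+    #         writer.write(oimg)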