#include #include #include #include #include #include #include #include #include #include // CommandLineParser const std::string about = "This is an OpenCV-based version of Gaze Estimation example"; const std::string keys = "{ h help | | Print this help message }" "{ input | | Path to the input video file }" "{ facem | face-detection-retail-0005.xml | Path to OpenVINO face detection model (.xml) }" "{ faced | CPU | Target device for the face detection (e.g. CPU, GPU, VPU, ...) }" "{ landm | facial-landmarks-35-adas-0002.xml | Path to OpenVINO landmarks detector model (.xml) }" "{ landd | CPU | Target device for the landmarks detector (e.g. CPU, GPU, VPU, ...) }" "{ headm | head-pose-estimation-adas-0001.xml | Path to OpenVINO head pose estimation model (.xml) }" "{ headd | CPU | Target device for the head pose estimation inference (e.g. CPU, GPU, VPU, ...) }" "{ gazem | gaze-estimation-adas-0002.xml | Path to OpenVINO gaze vector estimaiton model (.xml) }" "{ gazed | CPU | Target device for the gaze vector estimation inference (e.g. CPU, GPU, VPU, ...) }" ; namespace { std::string weights_path(const std::string &model_path) { const auto EXT_LEN = 4u; const auto sz = model_path.size(); CV_Assert(sz > EXT_LEN); auto ext = model_path.substr(sz - EXT_LEN); auto lower = [](unsigned char c) { return static_cast(std::tolower(c)); }; std::transform(ext.begin(), ext.end(), ext.begin(), lower); CV_Assert(ext == ".xml"); return model_path.substr(0u, sz - EXT_LEN) + ".bin"; } } // anonymous namespace namespace custom { namespace { using GMat3 = std::tuple; using GMats = cv::GArray; using GRects = cv::GArray; using GSize = cv::GOpaque; G_API_NET(Faces, , "face-detector" ); G_API_NET(Landmarks, , "facial-landmarks"); G_API_NET(HeadPose, < GMat3(cv::GMat)>, "head-pose"); G_API_NET(Gaze, , "gaze-vector"); G_API_OP(Size, , "custom.gapi.size") { static cv::GOpaqueDesc outMeta(const cv::GMatDesc &) { return cv::empty_gopaque_desc(); } }; G_API_OP(ParseSSD, , "custom.gaze_estimation.parseSSD") { static cv::GArrayDesc outMeta( const cv::GMatDesc & , const cv::GOpaqueDesc & , bool) { return cv::empty_array_desc(); } }; // Left/Right eye per every face G_API_OP(ParseEyes, (GMats, GRects, GSize)>, "custom.gaze_estimation.parseEyes") { static std::tuple outMeta( const cv::GArrayDesc & , const cv::GArrayDesc & , const cv::GOpaqueDesc &) { return std::make_tuple(cv::empty_array_desc(), cv::empty_array_desc()); } }; // Combine three scalars into a 1x3 vector (per every face) G_API_OP(ProcessPoses, , "custom.gaze_estimation.processPoses") { static cv::GArrayDesc outMeta( const cv::GArrayDesc & , const cv::GArrayDesc & , const cv::GArrayDesc &) { return cv::empty_array_desc(); } }; void adjustBoundingBox(cv::Rect& boundingBox) { auto w = boundingBox.width; auto h = boundingBox.height; boundingBox.x -= static_cast(0.067 * w); boundingBox.y -= static_cast(0.028 * h); boundingBox.width += static_cast(0.15 * w); boundingBox.height += static_cast(0.13 * h); if (boundingBox.width < boundingBox.height) { auto dx = (boundingBox.height - boundingBox.width); boundingBox.x -= dx / 2; boundingBox.width += dx; } else { auto dy = (boundingBox.width - boundingBox.height); boundingBox.y -= dy / 2; boundingBox.height += dy; } } void gazeVectorToGazeAngles(const cv::Point3f& gazeVector, cv::Point2f& gazeAngles) { auto r = cv::norm(gazeVector); double v0 = static_cast(gazeVector.x); double v1 = static_cast(gazeVector.y); double v2 = static_cast(gazeVector.z); gazeAngles.x = static_cast(180.0 / M_PI * (M_PI_2 + std::atan2(v2, v0))); gazeAngles.y = static_cast(180.0 / M_PI * (M_PI_2 - std::acos(v1 / r))); } GAPI_OCV_KERNEL(OCVSize, Size) { static void run(const cv::Mat &in, cv::Size &out) { out = in.size(); } }; GAPI_OCV_KERNEL(OCVParseSSD, ParseSSD) { static void run(const cv::Mat &in_ssd_result, const cv::Size &upscale, const bool filter_out_of_bounds, std::vector &out_objects) { const auto &in_ssd_dims = in_ssd_result.size; CV_Assert(in_ssd_dims.dims() == 4u); const int MAX_PROPOSALS = in_ssd_dims[2]; const int OBJECT_SIZE = in_ssd_dims[3]; CV_Assert(OBJECT_SIZE == 7); // fixed SSD object size const cv::Rect surface({0,0}, upscale); out_objects.clear(); const float *data = in_ssd_result.ptr(); for (int i = 0; i < MAX_PROPOSALS; i++) { const float image_id = data[i * OBJECT_SIZE + 0]; const float label = data[i * OBJECT_SIZE + 1]; const float confidence = data[i * OBJECT_SIZE + 2]; const float rc_left = data[i * OBJECT_SIZE + 3]; const float rc_top = data[i * OBJECT_SIZE + 4]; const float rc_right = data[i * OBJECT_SIZE + 5]; const float rc_bottom = data[i * OBJECT_SIZE + 6]; (void) label; if (image_id < 0.f) { break; // marks end-of-detections } if (confidence < 0.5f) { continue; // skip objects with low confidence } cv::Rect rc; // map relative coordinates to the original image scale rc.x = static_cast(rc_left * upscale.width); rc.y = static_cast(rc_top * upscale.height); rc.width = static_cast(rc_right * upscale.width) - rc.x; rc.height = static_cast(rc_bottom * upscale.height) - rc.y; adjustBoundingBox(rc); // TODO: new option? const auto clipped_rc = rc & surface; // TODO: new option? if (filter_out_of_bounds) { if (clipped_rc.area() != rc.area()) { continue; } } out_objects.emplace_back(clipped_rc); } } }; cv::Rect eyeBox(const cv::Rect &face_rc, float p1_x, float p1_y, float p2_x, float p2_y, float scale = 1.8f) { const auto &up = face_rc.size(); const cv::Point p1 = { static_cast(p1_x*up.width), static_cast(p1_y*up.height) }; const cv::Point p2 = { static_cast(p2_x*up.width), static_cast(p2_y*up.height) }; cv::Rect result; const auto size = static_cast(cv::norm(p1 - p2)); const auto midpoint = (p1 + p2) / 2; result.width = static_cast(scale * size); result.height = result.width; result.x = face_rc.x + midpoint.x - (result.width / 2); result.y = face_rc.y + midpoint.y - (result.height / 2); // Shift result to the original frame's absolute coordinates return result; } GAPI_OCV_KERNEL(OCVParseEyes, ParseEyes) { static void run(const std::vector &in_landmarks_per_face, const std::vector &in_face_rcs, const cv::Size &frame_size, std::vector &out_left_eyes, std::vector &out_right_eyes) { const size_t numFaces = in_landmarks_per_face.size(); const cv::Rect surface(cv::Point(0,0), frame_size); GAPI_Assert(numFaces == in_face_rcs.size()); out_left_eyes.clear(); out_right_eyes.clear(); out_left_eyes.reserve(numFaces); out_right_eyes.reserve(numFaces); for (std::size_t i = 0u; i < numFaces; i++) { const auto &lm = in_landmarks_per_face[i]; const auto &rc = in_face_rcs[i]; // Left eye is defined by points 0/1 (x2), // Right eye is defined by points 2/3 (x2) const float *data = lm.ptr(); out_left_eyes .push_back(surface & eyeBox(rc, data[0], data[1], data[2], data[3])); out_right_eyes.push_back(surface & eyeBox(rc, data[4], data[5], data[6], data[7])); } } }; GAPI_OCV_KERNEL(OCVProcessPoses, ProcessPoses) { static void run(const std::vector &in_ys, const std::vector &in_ps, const std::vector &in_rs, std::vector &out_poses) { const std::size_t sz = in_ys.size(); GAPI_Assert(sz == in_ps.size() && sz == in_rs.size()); out_poses.clear(); for (std::size_t idx = 0u; idx < sz; idx++) { cv::Mat pose(1, 3, CV_32FC1); float *ptr = pose.ptr(); ptr[0] = in_ys[idx].ptr()[0]; ptr[1] = in_ps[idx].ptr()[0]; ptr[2] = in_rs[idx].ptr()[0]; out_poses.push_back(std::move(pose)); } } }; } // anonymous namespace } // namespace custom namespace vis { namespace { cv::Point2f midp(const cv::Rect &rc) { return (rc.tl() + rc.br()) / 2; }; void bbox(cv::Mat &m, const cv::Rect &rc) { cv::rectangle(m, rc, cv::Scalar{0,255,0}, 2, cv::LINE_8, 0); }; void pose(cv::Mat &m, const cv::Mat &p, const cv::Rect &face_rc) { const auto *posePtr = p.ptr(); const auto yaw = static_cast(posePtr[0]); const auto pitch = static_cast(posePtr[1]); const auto roll = static_cast(posePtr[2]); const auto sinY = std::sin(yaw * M_PI / 180.0); const auto sinP = std::sin(pitch * M_PI / 180.0); const auto sinR = std::sin(roll * M_PI / 180.0); const auto cosY = std::cos(yaw * M_PI / 180.0); const auto cosP = std::cos(pitch * M_PI / 180.0); const auto cosR = std::cos(roll * M_PI / 180.0); const auto axisLength = 0.4 * face_rc.width; const auto xCenter = face_rc.x + face_rc.width / 2; const auto yCenter = face_rc.y + face_rc.height / 2; const auto center = cv::Point{xCenter, yCenter}; const auto axisln = cv::Point2d{axisLength, axisLength}; const auto ctr = cv::Matx(cosR*cosY, sinY*sinP*sinR, 0.f, cosP*sinR); const auto ctt = cv::Matx(cosR*sinY*sinP, cosY*sinR, 0.f, -cosP*cosR); const auto ctf = cv::Matx(sinY*cosP, 0.f, 0.f, sinP); // center to right cv::line(m, center, center + static_cast(ctr*axisln), cv::Scalar(0, 0, 255), 2); // center to top cv::line(m, center, center + static_cast(ctt*axisln), cv::Scalar(0, 255, 0), 2); // center to forward cv::line(m, center, center + static_cast(ctf*axisln), cv::Scalar(255, 0, 255), 2); } void vvec(cv::Mat &m, const cv::Mat &v, const cv::Rect &face_rc, const cv::Rect &left_rc, const cv::Rect &right_rc) { const auto scale = 0.002 * face_rc.width; cv::Point3f gazeVector; const auto *gazePtr = v.ptr(); gazeVector.x = gazePtr[0]; gazeVector.y = gazePtr[1]; gazeVector.z = gazePtr[2]; gazeVector = gazeVector / cv::norm(gazeVector); const double arrowLength = 0.4 * face_rc.width; const auto left_mid = midp(left_rc); const auto right_mid = midp(right_rc); cv::Point2f gazeArrow; gazeArrow.x = gazeVector.x; gazeArrow.y = -gazeVector.y; gazeArrow *= arrowLength; cv::arrowedLine(m, left_mid, left_mid + gazeArrow, cv::Scalar(255, 0, 0), 2); cv::arrowedLine(m, right_mid, right_mid + gazeArrow, cv::Scalar(255, 0, 0), 2); cv::Point2f gazeAngles; custom::gazeVectorToGazeAngles(gazeVector, gazeAngles); cv::putText(m, cv::format("gaze angles: (h=%0.0f, v=%0.0f)", static_cast(std::round(gazeAngles.x)), static_cast(std::round(gazeAngles.y))), cv::Point(static_cast(face_rc.tl().x), static_cast(face_rc.br().y + 12. * face_rc.width / 100.)), cv::FONT_HERSHEY_PLAIN, scale * 2, cv::Scalar::all(255), 1); }; } // anonymous namespace } // namespace vis int main(int argc, char *argv[]) { cv::CommandLineParser cmd(argc, argv, keys); cmd.about(about); if (cmd.has("help")) { cmd.printMessage(); return 0; } cv::GMat in; cv::GMat faces = cv::gapi::infer(in); cv::GOpaque sz = custom::Size::on(in); // FIXME cv::GArray faces_rc = custom::ParseSSD::on(faces, sz, true); cv::GArray angles_y, angles_p, angles_r; std::tie(angles_y, angles_p, angles_r) = cv::gapi::infer(faces_rc, in); cv::GArray heads_pos = custom::ProcessPoses::on(angles_y, angles_p, angles_r); cv::GArray landmarks = cv::gapi::infer(faces_rc, in); cv::GArray left_eyes, right_eyes; std::tie(left_eyes, right_eyes) = custom::ParseEyes::on(landmarks, faces_rc, sz); cv::GArray gaze_vectors = cv::gapi::infer2( in , left_eyes , right_eyes , heads_pos); cv::GComputation graph(cv::GIn(in), cv::GOut( cv::gapi::copy(in) , faces_rc , left_eyes , right_eyes , heads_pos , gaze_vectors)); const auto input_file_name = cmd.get("input"); const auto face_model_path = cmd.get("facem"); const auto head_model_path = cmd.get("headm"); const auto lmrk_model_path = cmd.get("landm"); const auto gaze_model_path = cmd.get("gazem"); auto face_net = cv::gapi::ie::Params { face_model_path, // path to topology IR weights_path(face_model_path), // path to weights cmd.get("faced"), /// device specifier }; auto head_net = cv::gapi::ie::Params { head_model_path, // path to topology IR weights_path(head_model_path), // path to weights cmd.get("headd"), // device specifier }.cfgOutputLayers({"angle_y_fc", "angle_p_fc", "angle_r_fc"}); auto landmarks_net = cv::gapi::ie::Params { lmrk_model_path, // path to topology IR weights_path(lmrk_model_path), // path to weights cmd.get("landd"), // device specifier }; auto gaze_net = cv::gapi::ie::Params { gaze_model_path, // path to topology IR weights_path(gaze_model_path), // path to weights cmd.get("gazed"), // device specifier }.cfgInputLayers({"left_eye_image", "right_eye_image", "head_pose_angles"}); auto kernels = cv::gapi::kernels< custom::OCVSize , custom::OCVParseSSD , custom::OCVParseEyes , custom::OCVProcessPoses>(); auto networks = cv::gapi::networks(face_net, head_net, landmarks_net, gaze_net); auto pipeline = graph.compileStreaming(cv::compile_args(networks, kernels)); cv::TickMeter tm; cv::Mat image; std::vector out_faces, out_right_eyes, out_left_eyes; std::vector out_poses; std::vector out_gazes; std::size_t frames = 0u; std::cout << "Reading " << input_file_name << std::endl; pipeline.setSource(cv::gapi::wip::make_src(input_file_name)); pipeline.start(); tm.start(); while (pipeline.pull(cv::gout( image , out_faces , out_left_eyes , out_right_eyes , out_poses , out_gazes))) { frames++; // Visualize results on the frame for (auto &&rc : out_faces) vis::bbox(image, rc); for (auto &&rc : out_left_eyes) vis::bbox(image, rc); for (auto &&rc : out_right_eyes) vis::bbox(image, rc); for (std::size_t i = 0u; i < out_faces.size(); i++) { vis::pose(image, out_poses[i], out_faces[i]); vis::vvec(image, out_gazes[i], out_faces[i], out_left_eyes[i], out_right_eyes[i]); } tm.stop(); const auto fps_str = std::to_string(frames / tm.getTimeSec()) + " FPS"; cv::putText(image, fps_str, {0,32}, cv::FONT_HERSHEY_SIMPLEX, 1.0, {0,255,0}, 2); cv::imshow("Out", image); cv::waitKey(1); tm.start(); } tm.stop(); std::cout << "Processed " << frames << " frames" << " (" << frames / tm.getTimeSec() << " FPS)" << std::endl; return 0; }