Merge pull request #1199 from sukhad-app:face_alignment
Face alignment (#1199)

* This commit adds one-millisecond face alignment to OpenCV. Face alignment is a computer vision technique for identifying the geometric structure of human faces in digital images: given the location and size of a face, it automatically determines the shape of face components such as the eyes and nose. The following applications are added:
  1) An application to train a face landmark detector.
  2) An application to detect face landmarks using a trained model.
  3) An application to swap faces using face landmark detection.
  4) An application to detect landmarks in a video.
  The code is merged with the global facemark API. Doxygen documentation is added for the new class, along with tutorials for the added samples and visualisations depicting error rate and training time.

* face: drop duplicated file
  -face_alignmentImpl.hpp
  +face_alignmentimpl.hpp

* face: minor refactoring
  - replace license headers
  - fix usage of "precomp.hpp"
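For reviewers, a minimal usage sketch condensed from the new samples in this PR; the cascade, model and image paths below are placeholders, not files shipped here.

#include "opencv2/face.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/objdetect.hpp"
#include <vector>

using namespace cv;
using namespace cv::face;

// Wrapper matching the FN_FaceDetector signature used by the samples below.
static bool myDetector(InputArray image, OutputArray faces, void* cascade)
{
    Mat gray;
    cvtColor(image, gray, COLOR_BGR2GRAY);   // sketch assumes a BGR input image
    equalizeHist(gray, gray);
    std::vector<Rect> faces_;
    ((CascadeClassifier*)cascade)->detectMultiScale(gray, faces_, 1.4, 2, CASCADE_SCALE_IMAGE, Size(30, 30));
    Mat(faces_).copyTo(faces);
    return true;
}

int main()
{
    CascadeClassifier face_cascade("haarcascade_frontalface_alt2.xml"); // placeholder path
    Ptr<FacemarkKazemi> facemark = FacemarkKazemi::create(FacemarkKazemi::Params());
    facemark->setFaceDetector(myDetector, &face_cascade);
    facemark->loadModel("face_landmark_model.dat");                     // placeholder path

    Mat img = imread("image.jpg");                                      // placeholder path
    std::vector<Rect> faces;
    facemark->getFaces(img, faces);
    std::vector<std::vector<Point2f> > shapes;
    if (facemark->fit(img, faces, shapes))
        for (size_t i = 0; i < shapes.size(); i++)
            for (size_t k = 0; k < shapes[i].size(); k++)
                circle(img, shapes[i][k], 3, Scalar(0, 0, 255), FILLED);
    return 0;
}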
@@ -1,3 +1,27 @@
set(the_description "Face recognition etc")
ocv_define_module(face opencv_core opencv_imgproc opencv_objdetect WRAP python)
ocv_define_module(face opencv_core
    opencv_imgproc
    opencv_objdetect
    opencv_tracking # estimateRigidTransform()
    opencv_photo # samples
    WRAP python
)
# NOTE: objdetect module is needed for one of the samples

set(__commit_hash "8afa57abc8229d611c4937165d20e2a2d9fc5a12")
set(__file_hash "7505c44ca4eb54b4ab1e4777cb96ac05")
ocv_download(
    FILENAME face_landmark_model.dat
    HASH ${__file_hash}
    URL
        "${OPENCV_FACE_ALIGNMENT_URL}"
        "$ENV{OPENCV_FACE_ALIGNMENT_URL}"
        "https://raw.githubusercontent.com/opencv/opencv_3rdparty/${__commit_hash}/"
    DESTINATION_DIR "${CMAKE_BINARY_DIR}/${OPENCV_TEST_DATA_INSTALL_PATH}/cv/face/"
    ID "data"
    RELATIVE_URL
    STATUS res
)
if(NOT res)
    message(WARNING "Face: Can't get model file for face alignment.")
endif()
@@ -0,0 +1,71 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef __OPENCV_FACE_ALIGNMENT_HPP__
#define __OPENCV_FACE_ALIGNMENT_HPP__

#include "facemark.hpp"

namespace cv{
namespace face{
class CV_EXPORTS_W FacemarkKazemi : public Algorithm
{
public:
    struct CV_EXPORTS Params
    {
        /**
        * \brief Constructor
        */
        Params();
        /// cascade_depth This stores the depth of the cascade used for training.
        unsigned long cascade_depth;
        /// tree_depth This stores the maximum height of the regression trees built.
        unsigned long tree_depth;
        /// num_trees_per_cascade_level This stores the number of trees fit per cascade level.
        unsigned long num_trees_per_cascade_level;
        /// learning_rate stores the learning rate in gradient boosting, also referred to as shrinkage.
        float learning_rate;
        /// oversampling_amount stores the number of initialisations used to create training samples.
        unsigned long oversampling_amount;
        /// num_test_coordinates stores the number of test coordinates.
        unsigned long num_test_coordinates;
        /// lambda stores a value used to calculate the probability of closeness of two coordinates.
        float lambda;
        /// num_test_splits stores the number of random test splits generated.
        unsigned long num_test_splits;
        /// configfile stores the name of the file containing the values of the training parameters.
        String configfile;
    };
    static Ptr<FacemarkKazemi> create(const FacemarkKazemi::Params &parameters = FacemarkKazemi::Params());
    virtual ~FacemarkKazemi();

    /// @brief training the facemark model, inputs are the file names of the image list and landmark annotations
    virtual void training(String imageList, String groundTruth)=0;
    /** @brief This function is used to train the model using gradient boosting to get a cascade of regressors
    * which can then be used to predict the shape.
    * @param images A vector of type cv::Mat which stores the images which are used as training samples.
    * @param landmarks A vector of vectors of type cv::Point2f which stores the landmarks detected in a particular image.
    * @param scale A size of type cv::Size to which all images and landmarks have to be scaled.
    * @param configfile A variable of type std::string which stores the name of the file storing parameters for training the model.
    * @param modelFilename A variable of type std::string which stores the name of the trained model file that has to be saved.
    * @returns A boolean value. The function returns true if the model is trained properly, or false if it is not trained.
    */
    virtual bool training(std::vector<Mat>& images, std::vector< std::vector<Point2f> >& landmarks, std::string configfile, Size scale, std::string modelFilename = "face_landmarks.dat")=0;
    /** @brief This function is used to load the trained model.
    * @param filename A variable of type cv::String which stores the name of the file in which the trained model is stored.
    */
    virtual void loadModel(String filename)=0;
    /** @brief This function retrieves a centered and scaled face shape, according to the bounding rectangle.
    * @param image A variable of type cv::InputArray which stores the image whose landmarks have to be found.
    * @param faces A variable of type cv::InputArray which stores the bounding boxes of the faces found in a given image.
    * @param landmarks A variable of type cv::InputOutputArray which stores the landmarks of all the faces found in the image.
    */
    virtual bool fit( InputArray image, InputArray faces, InputOutputArray landmarks )=0;//!< from many ROIs
    /// set the custom face detector
    virtual bool setFaceDetector(bool(*f)(InputArray , OutputArray, void*), void* userData)=0;
    /// get faces using the custom detector
    virtual bool getFaces(InputArray image, OutputArray faces)=0;
};

}} // namespace
#endif
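A short sketch of filling these Params programmatically instead of through a config file; the values simply echo the defaults used elsewhere in this PR, and the config path is a placeholder.

#include "opencv2/face.hpp"

int main()
{
    cv::face::FacemarkKazemi::Params params;
    params.cascade_depth = 15;                // number of cascade levels of regressors
    params.tree_depth = 4;                    // depth of each regression tree
    params.num_trees_per_cascade_level = 500;
    params.learning_rate = 0.1f;              // shrinkage used in gradient boosting
    params.oversampling_amount = 20;          // initialisations per training sample
    params.num_test_coordinates = 400;
    params.lambda = 0.1f;                     // controls preference for nearby test pixels
    params.num_test_splits = 20;
    params.configfile = "config.xml";         // placeholder path

    cv::Ptr<cv::face::FacemarkKazemi> facemark = cv::face::FacemarkKazemi::create(params);
    return 0;
}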
@@ -0,0 +1,93 @@
#include "opencv2/face.hpp"
#include "opencv2/videoio.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/objdetect.hpp"
#include "opencv2/imgproc.hpp"
#include <iostream>
#include <vector>
#include <string>
using namespace std;
using namespace cv;
using namespace cv::face;

static bool myDetector(InputArray image, OutputArray faces, CascadeClassifier *face_cascade)
{
    Mat gray;

    if (image.channels() > 1)
        cvtColor(image, gray, COLOR_BGR2GRAY);
    else
        gray = image.getMat().clone();

    equalizeHist(gray, gray);

    std::vector<Rect> faces_;
    face_cascade->detectMultiScale(gray, faces_, 1.4, 2, CASCADE_SCALE_IMAGE, Size(30, 30));
    Mat(faces_).copyTo(faces);
    return true;
}

int main(int argc,char** argv){
    //Give the path to the directory containing all the files containing data
    CommandLineParser parser(argc, argv,
        "{ help h usage ? | | give the following arguments in following format }"
        "{ model_filename f | | (required) path to binary file storing the trained model which is to be loaded [example - /data/file.dat]}"
        "{ image i | | (required) path to image in which face landmarks have to be detected.[example - /data/image.jpg] }"
        "{ face_cascade c | | Path to the face cascade xml file which you want to use as a detector}"
    );
    // Read in the input arguments
    if (parser.has("help")){
        parser.printMessage();
        cerr << "TIP: Use absolute paths to avoid any problems with the software!" << endl;
        return 0;
    }
    string filename(parser.get<string>("model_filename"));
    if (filename.empty()){
        parser.printMessage();
        cerr << "The name of the model file to be loaded for detecting landmarks is not found" << endl;
        return -1;
    }
    string image(parser.get<string>("image"));
    if (image.empty()){
        parser.printMessage();
        cerr << "The name of the image file in which landmarks have to be detected is not found" << endl;
        return -1;
    }
    string cascade_name(parser.get<string>("face_cascade"));
    if (cascade_name.empty()){
        parser.printMessage();
        cerr << "The name of the cascade classifier to be loaded to detect faces is not found" << endl;
        return -1;
    }

    Mat img = imread(image);

    //pass the face cascade xml file which you want to pass as a detector
    CascadeClassifier face_cascade;
    face_cascade.load(cascade_name);
    FacemarkKazemi::Params params;
    Ptr<FacemarkKazemi> facemark = FacemarkKazemi::create(params);
    facemark->setFaceDetector((FN_FaceDetector)myDetector, &face_cascade);
    facemark->loadModel(filename);
    cout<<"Loaded model"<<endl;
    vector<Rect> faces;
    resize(img,img,Size(460,460));
    facemark->getFaces(img,faces);
    vector< vector<Point2f> > shapes;
    if(facemark->fit(img,faces,shapes))
    {
        for( size_t i = 0; i < faces.size(); i++ )
        {
            cv::rectangle(img,faces[i],Scalar( 255, 0, 0 ));
        }
        for(unsigned long i=0;i<faces.size();i++){
            for(unsigned long k=0;k<shapes[i].size();k++)
                cv::circle(img,shapes[i][k],5,cv::Scalar(0,0,255),FILLED);
        }
        namedWindow("Detected_shape");
        imshow("Detected_shape",img);
        waitKey(0);
    }
    return 0;
}
@@ -0,0 +1,110 @@
#include "opencv2/face.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/videoio.hpp"
#include "opencv2/objdetect.hpp"
#include <iostream>
#include <vector>
#include <string>

using namespace std;
using namespace cv;
using namespace cv::face;

static bool myDetector(InputArray image, OutputArray faces, CascadeClassifier *face_cascade)
{
    Mat gray;

    if (image.channels() > 1)
        cvtColor(image, gray, COLOR_BGR2GRAY);
    else
        gray = image.getMat().clone();

    equalizeHist(gray, gray);

    std::vector<Rect> faces_;
    face_cascade->detectMultiScale(gray, faces_, 1.4, 2, CASCADE_SCALE_IMAGE, Size(30, 30));
    Mat(faces_).copyTo(faces);
    return true;
}

int main(int argc,char** argv){
    //Give the path to the directory containing all the files containing data
    CommandLineParser parser(argc, argv,
        "{ help h usage ? | | give the following arguments in following format }"
        "{ model_filename f | | (required) path to binary file storing the trained model which is to be loaded [example - /data/file.dat]}"
        "{ video v | | (required) path to video in which face landmarks have to be detected.[example - /data/video.avi] }"
        "{ face_cascade c | | Path to the face cascade xml file which you want to use as a detector}"
    );
    // Read in the input arguments
    if (parser.has("help")){
        parser.printMessage();
        cerr << "TIP: Use absolute paths to avoid any problems with the software!" << endl;
        return 0;
    }
    string filename(parser.get<string>("model_filename"));
    if (filename.empty()){
        parser.printMessage();
        cerr << "The name of the model file to be loaded for detecting landmarks is not found" << endl;
        return -1;
    }
    string video(parser.get<string>("video"));
    if (video.empty()){
        parser.printMessage();
        cerr << "The name of the video file in which landmarks have to be detected is not found" << endl;
        return -1;
    }
    string cascade_name(parser.get<string>("face_cascade"));
    if (cascade_name.empty()){
        parser.printMessage();
        cerr << "The name of the cascade classifier to be loaded to detect faces is not found" << endl;
        return -1;
    }
    VideoCapture cap(video);
    if(!cap.isOpened()){
        cerr<<"Video cannot be loaded. Give correct path"<<endl;
        return -1;
    }
    //pass the face cascade xml file which you want to pass as a detector
    CascadeClassifier face_cascade;
    face_cascade.load(cascade_name);
    FacemarkKazemi::Params params;
    Ptr<FacemarkKazemi> facemark = FacemarkKazemi::create(params);
    facemark->setFaceDetector((FN_FaceDetector)myDetector, &face_cascade);
    facemark->loadModel(filename);
    cout<<"Loaded model"<<endl;
    //vector to store the faces detected in the image
    vector<Rect> faces;
    vector< vector<Point2f> > shapes;
    Mat img;
    while(1){
        faces.clear();
        shapes.clear();
        cap>>img;
        //Detect faces in the current image
        resize(img,img,Size(600,600));
        facemark->getFaces(img,faces);
        if(faces.size()==0){
            cout<<"No faces found in this frame"<<endl;
        }
        else{
            for( size_t i = 0; i < faces.size(); i++ )
            {
                cv::rectangle(img,faces[i],Scalar( 255, 0, 0 ));
            }
            //vector to store the landmarks of all the faces in the image
            if(facemark->fit(img,faces,shapes))
            {
                for(unsigned long i=0;i<faces.size();i++){
                    for(unsigned long k=0;k<shapes[i].size();k++)
                        cv::circle(img,shapes[i][k],3,cv::Scalar(0,0,255),FILLED);
                }
            }
        }
        namedWindow("Detected_shape");
        imshow("Detected_shape",img);
        if(waitKey(1) >= 0) break;
    }
    return 0;
}
@@ -0,0 +1,20 @@
<?xml version="1.0"?>
<!-- cascade_depth stores the depth of the cascade of regressors used for training.
tree_depth stores the depth of the trees created as weak learners during gradient boosting.
num_trees_per_cascade_level stores the number of trees required per cascade level.
learning_rate stores the learning rate for gradient boosting. This is required to prevent overfitting using shrinkage.
oversampling_amount stores the oversampling amount for the samples.
num_test_coordinates stores the number of test coordinates to be generated as samples to decide for making the split.
lambda stores the value used for calculating the probability which helps to select closer pixels for making the split.
num_test_splits stores the number of test splits to be generated before making the best split.
-->
<opencv_storage>
<cascade_depth>15</cascade_depth>
<tree_depth>4</tree_depth>
<num_trees_per_cascade_level>500</num_trees_per_cascade_level>
<learning_rate>1.0000000149011612e-01</learning_rate>
<oversampling_amount>20</oversampling_amount>
<num_test_coordinates>400</num_test_coordinates>
<lambda>1.0000000149011612e-01</lambda>
<num_test_splits>20</num_test_splits>
</opencv_storage>
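For completeness, a small sketch of reading these parameters back with cv::FileStorage, mirroring the writer sample further down in this PR; the file name is a placeholder and error handling is kept minimal.

#include "opencv2/core.hpp"
#include <iostream>

int main()
{
    // Placeholder path; point this at the XML shown above.
    cv::FileStorage fs("sample_config_file.xml", cv::FileStorage::READ);
    if (!fs.isOpened())
    {
        std::cerr << "Failed to open config file" << std::endl;
        return -1;
    }
    int cascade_depth, tree_depth, num_trees_per_cascade_level;
    int oversampling_amount, num_test_coordinates, num_test_splits;
    float learning_rate, lambda;
    fs["cascade_depth"] >> cascade_depth;
    fs["tree_depth"] >> tree_depth;
    fs["num_trees_per_cascade_level"] >> num_trees_per_cascade_level;
    fs["learning_rate"] >> learning_rate;
    fs["oversampling_amount"] >> oversampling_amount;
    fs["num_test_coordinates"] >> num_test_coordinates;
    fs["lambda"] >> lambda;
    fs["num_test_splits"] >> num_test_splits;
    fs.release();
    std::cout << "cascade_depth = " << cascade_depth << std::endl;
    return 0;
}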
@@ -0,0 +1,202 @@
#include "opencv2/face.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/objdetect.hpp"
#include "opencv2/photo.hpp" // seamlessClone()
#include <iostream>
using namespace cv;
using namespace cv::face;
using namespace std;

static bool myDetector(InputArray image, OutputArray faces, CascadeClassifier *face_cascade)
{
    Mat gray;

    if (image.channels() > 1)
        cvtColor(image, gray, COLOR_BGR2GRAY);
    else
        gray = image.getMat().clone();

    equalizeHist(gray, gray);

    std::vector<Rect> faces_;
    face_cascade->detectMultiScale(gray, faces_, 1.4, 2, CASCADE_SCALE_IMAGE, Size(30, 30));
    Mat(faces_).copyTo(faces);
    return true;
}

void divideIntoTriangles(Rect rect, vector<Point2f> &points, vector< vector<int> > &delaunayTri);
void warpTriangle(Mat &img1, Mat &img2, vector<Point2f> &triangle1, vector<Point2f> &triangle2);

//Divide the face into triangles for warping
void divideIntoTriangles(Rect rect, vector<Point2f> &points, vector< vector<int> > &Tri){

    // Create an instance of Subdiv2D
    Subdiv2D subdiv(rect);
    // Insert points into subdiv
    for( vector<Point2f>::iterator it = points.begin(); it != points.end(); it++)
        subdiv.insert(*it);
    vector<Vec6f> triangleList;
    subdiv.getTriangleList(triangleList);
    vector<Point2f> pt(3);
    vector<int> ind(3);
    for( size_t i = 0; i < triangleList.size(); i++ )
    {
        Vec6f triangle = triangleList[i];
        pt[0] = Point2f(triangle[0], triangle[1]);
        pt[1] = Point2f(triangle[2], triangle[3]);
        pt[2] = Point2f(triangle[4], triangle[5]);
        if ( rect.contains(pt[0]) && rect.contains(pt[1]) && rect.contains(pt[2])){
            for(int j = 0; j < 3; j++)
                for(size_t k = 0; k < points.size(); k++)
                    if(abs(pt[j].x - points[k].x) < 1.0 && abs(pt[j].y - points[k].y) < 1)
                        ind[j] =(int) k;
            Tri.push_back(ind);
        }
    }
}
void warpTriangle(Mat &img1, Mat &img2, vector<Point2f> &triangle1, vector<Point2f> &triangle2)
{
    Rect rectangle1 = boundingRect(triangle1);
    Rect rectangle2 = boundingRect(triangle2);
    // Offset points by left top corner of the respective rectangles
    vector<Point2f> triangle1Rect, triangle2Rect;
    vector<Point> triangle2RectInt;
    for(int i = 0; i < 3; i++)
    {
        triangle1Rect.push_back( Point2f( triangle1[i].x - rectangle1.x, triangle1[i].y - rectangle1.y) );
        triangle2Rect.push_back( Point2f( triangle2[i].x - rectangle2.x, triangle2[i].y - rectangle2.y) );
        triangle2RectInt.push_back( Point((int)(triangle2[i].x - rectangle2.x),(int) (triangle2[i].y - rectangle2.y))); // for fillConvexPoly
    }
    // Get mask by filling triangle
    Mat mask = Mat::zeros(rectangle2.height, rectangle2.width, CV_32FC3);
    fillConvexPoly(mask, triangle2RectInt, Scalar(1.0, 1.0, 1.0), 16, 0);
    // Apply warpImage to small rectangular patches
    Mat img1Rect;
    img1(rectangle1).copyTo(img1Rect);
    Mat img2Rect = Mat::zeros(rectangle2.height, rectangle2.width, img1Rect.type());
    Mat warp_mat = getAffineTransform(triangle1Rect, triangle2Rect);
    warpAffine( img1Rect, img2Rect, warp_mat, img2Rect.size(), INTER_LINEAR, BORDER_REFLECT_101);
    multiply(img2Rect,mask, img2Rect);
    multiply(img2(rectangle2), Scalar(1.0,1.0,1.0) - mask, img2(rectangle2));
    img2(rectangle2) = img2(rectangle2) + img2Rect;
}
int main( int argc, char** argv)
{
    //Give the path to the directory containing all the files containing data
    CommandLineParser parser(argc, argv,
        "{ help h usage ? | | give the following arguments in following format }"
        "{ image1 i1 | | (required) path to the first image file in which you want to apply swapping }"
        "{ image2 i2 | | (required) path to the second image file in which you want to apply face swapping }"
        "{ model m | | (required) path to the file containing model to be loaded for face landmark detection}"
        "{ face_cascade f | | Path to the face cascade xml file which you want to use as a detector}"
    );
    // Read in the input arguments
    if (parser.has("help")){
        parser.printMessage();
        cerr << "TIP: Use absolute paths to avoid any problems with the software!" << endl;
        return 0;
    }
    Mat img1=imread(parser.get<string>("image1"));
    Mat img2=imread(parser.get<string>("image2"));
    if (img1.empty()||img2.empty()){
        if(img1.empty()){
            parser.printMessage();
            cerr << parser.get<string>("image1")<<" not found" << endl;
            return -1;
        }
        if (img2.empty()){
            parser.printMessage();
            cerr << parser.get<string>("image2")<<" not found" << endl;
            return -1;
        }
    }
    string modelfile_name(parser.get<string>("model"));
    if (modelfile_name.empty()){
        parser.printMessage();
        cerr << "Model file name not found." << endl;
        return -1;
    }
    string cascade_name(parser.get<string>("face_cascade"));
    if (cascade_name.empty()){
        parser.printMessage();
        cerr << "The name of the cascade classifier to be loaded to detect faces is not found" << endl;
        return -1;
    }
    //create a pointer to call the base class
    //pass the face cascade xml file which you want to pass as a detector
    CascadeClassifier face_cascade;
    face_cascade.load(cascade_name);
    FacemarkKazemi::Params params;
    Ptr<FacemarkKazemi> facemark = FacemarkKazemi::create(params);
    facemark->setFaceDetector((FN_FaceDetector)myDetector, &face_cascade);
    facemark->loadModel(modelfile_name);
    cout<<"Loaded model"<<endl;
    //vector to store the faces detected in the image
    vector<Rect> faces1,faces2;
    vector< vector<Point2f> > shape1,shape2;
    //Detect faces in the current image
    float ratio1 = (float)img1.cols/(float)img1.rows;
    float ratio2 = (float)img2.cols/(float)img2.rows;
    resize(img1,img1,Size((int)(640*ratio1),(int)(640*ratio1)));
    resize(img2,img2,Size((int)(640*ratio2),(int)(640*ratio2)));
    Mat img1Warped = img2.clone();
    facemark->getFaces(img1,faces1);
    facemark->getFaces(img2,faces2);
    //Initialise the shape of the faces
    facemark->fit(img1,faces1,shape1);
    facemark->fit(img2,faces2,shape2);
    unsigned long numswaps = (unsigned long)min((unsigned long)shape1.size(),(unsigned long)shape2.size());
    for(unsigned long z=0;z<numswaps;z++){
        vector<Point2f> points1 = shape1[z];
        vector<Point2f> points2 = shape2[z];
        img1.convertTo(img1, CV_32F);
        img1Warped.convertTo(img1Warped, CV_32F);
        // Find convex hull
        vector<Point2f> boundary_image1;
        vector<Point2f> boundary_image2;
        vector<int> index;
        convexHull(Mat(points2),index, false, false);
        for(size_t i = 0; i < index.size(); i++)
        {
            boundary_image1.push_back(points1[index[i]]);
            boundary_image2.push_back(points2[index[i]]);
        }
        // Triangulation for points on the convex hull
        vector< vector<int> > triangles;
        Rect rect(0, 0, img1Warped.cols, img1Warped.rows);
        divideIntoTriangles(rect, boundary_image2, triangles);
        // Apply affine transformation to Delaunay triangles
        for(size_t i = 0; i < triangles.size(); i++)
        {
            vector<Point2f> triangle1, triangle2;
            // Get points for img1, img2 corresponding to the triangles
            for(int j = 0; j < 3; j++)
            {
                triangle1.push_back(boundary_image1[triangles[i][j]]);
                triangle2.push_back(boundary_image2[triangles[i][j]]);
            }
            warpTriangle(img1, img1Warped, triangle1, triangle2);
        }
        // Calculate mask
        vector<Point> hull;
        for(size_t i = 0; i < boundary_image2.size(); i++)
        {
            Point pt((int)boundary_image2[i].x,(int)boundary_image2[i].y);
            hull.push_back(pt);
        }
        Mat mask = Mat::zeros(img2.rows, img2.cols, img2.depth());
        fillConvexPoly(mask,&hull[0],(int)hull.size(), Scalar(255,255,255));
        // Clone seamlessly.
        Rect r = boundingRect(boundary_image2);
        Point center = (r.tl() + r.br()) / 2;
        Mat output;
        img1Warped.convertTo(img1Warped, CV_8UC3);
        seamlessClone(img1Warped,img2, mask, center, output, NORMAL_CLONE);
        imshow("Face_Swapped", output);
        waitKey(0);
        destroyAllWindows();
    }
    return 0;
}
@@ -0,0 +1,117 @@
#include "opencv2/face.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/objdetect.hpp"
#include "opencv2/imgproc.hpp"
#include <iostream>
#include <vector>
#include <string>

using namespace std;
using namespace cv;
using namespace cv::face;

static bool myDetector(InputArray image, OutputArray faces, CascadeClassifier *face_cascade)
{
    Mat gray;

    if (image.channels() > 1)
        cvtColor(image, gray, COLOR_BGR2GRAY);
    else
        gray = image.getMat().clone();

    equalizeHist(gray, gray);

    std::vector<Rect> faces_;
    face_cascade->detectMultiScale(gray, faces_, 1.4, 2, CASCADE_SCALE_IMAGE, Size(30, 30));
    Mat(faces_).copyTo(faces);
    return true;
}

int main(int argc,char** argv){
    //Give the path to the directory containing all the files containing data
    CommandLineParser parser(argc, argv,
        "{ help h usage ? | | give the following arguments in following format }"
        "{ annotations a |. | (required) path to annotations txt file [example - /data/annotations.txt] }"
        "{ config c | | (required) path to configuration xml file containing parameters for training.[example - /data/config.xml] }"
        "{ model m | | (required) path to file in which the trained model has to be saved [example - /data/model.dat] }"
        "{ width w | 460 | The width which you want all images to get to scale the annotations. large images are slow to process [default = 460] }"
        "{ height h | 460 | The height which you want all images to get to scale the annotations. large images are slow to process [default = 460] }"
        "{ face_cascade f | | Path to the face cascade xml file which you want to use as a detector}"
    );
    //Read in the input arguments
    if (parser.has("help")){
        parser.printMessage();
        cerr << "TIP: Use absolute paths to avoid any problems with the software!" << endl;
        return 0;
    }
    string directory(parser.get<string>("annotations"));
    //default initialisation
    Size scale(460,460);
    scale = Size(parser.get<int>("width"),parser.get<int>("height"));
    if (directory.empty()){
        parser.printMessage();
        cerr << "The name of the directory from which annotations have to be found is empty" << endl;
        return -1;
    }
    string configfile_name(parser.get<string>("config"));
    if (configfile_name.empty()){
        parser.printMessage();
        cerr << "No configuration file name found which contains the parameters for training" << endl;
        return -1;
    }
    string modelfile_name(parser.get<string>("model"));
    if (modelfile_name.empty()){
        parser.printMessage();
        cerr << "No name for the model_file found in which the trained model has to be saved" << endl;
        return -1;
    }
    string cascade_name(parser.get<string>("face_cascade"));
    if (cascade_name.empty()){
        parser.printMessage();
        cerr << "The name of the cascade classifier to be loaded to detect faces is not found" << endl;
        return -1;
    }
    //create a vector to store names of files in which annotations
    //and image names are found
    /*The format of the file containing annotations should be of following format
        /data/abc/abc.jpg
        123.45,345.65
        321.67,543.89
      The above format is similar to the HELEN dataset which is used for training the model
    */
    vector<String> filenames;
    //reading the files from the given directory
    glob(directory + "*.txt",filenames);
    //create a pointer to call the base class
    //pass the face cascade xml file which you want to pass as a detector
    CascadeClassifier face_cascade;
    face_cascade.load(cascade_name);
    FacemarkKazemi::Params params;
    params.configfile = configfile_name;
    Ptr<FacemarkKazemi> facemark = FacemarkKazemi::create(params);
    facemark->setFaceDetector((FN_FaceDetector)myDetector, &face_cascade);
    //create a vector to store image names
    vector<String> imagenames;
    //create object to get landmarks
    vector< vector<Point2f> > trainlandmarks,Trainlandmarks;
    //gets landmarks and corresponding image names in both the vectors
    //vector to store images
    vector<Mat> trainimages;
    loadTrainingData(filenames,trainlandmarks,imagenames);
    for(unsigned long i=0;i<300;i++){
        string imgname = imagenames[i].substr(0, imagenames[i].size()-1);
        string img = directory + string(imgname) + ".jpg";
        Mat src = imread(img);
        if(src.empty()){
            cerr<<string("Image "+img+" not found.")<<endl;
            continue;
        }
        trainimages.push_back(src);
        Trainlandmarks.push_back(trainlandmarks[i]);
    }
    cout<<"Got data"<<endl;
    facemark->training(trainimages,Trainlandmarks,configfile_name,scale,modelfile_name);
    cout<<"Training complete"<<endl;
    return 0;
}
@@ -0,0 +1,134 @@
/*----------------------------------------------
* the user should provide the list of training images
* accompanied by their corresponding landmark locations in separate files.
* example of contents for images.txt:
* ../trainset/image_0001.png
* ../trainset/image_0002.png
* example of contents for annotation.txt:
* ../trainset/image_0001.pts
* ../trainset/image_0002.pts
* where image_xxxx.pts contains the position of each face landmark.
* example of the contents:
* version: 1
* n_points: 68
* {
* 115.167660 220.807529
* 116.164839 245.721357
* 120.208690 270.389841
* ...
* }
* an example of the dataset is available at https://ibug.doc.ic.ac.uk/resources/facial-point-annotations/
*--------------------------------------------------*/
#include "opencv2/face.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/objdetect.hpp"
#include <iostream>
#include <vector>
#include <string>

using namespace std;
using namespace cv;
using namespace cv::face;

static bool myDetector(InputArray image, OutputArray faces, CascadeClassifier *face_cascade)
{
    Mat gray;

    if (image.channels() > 1)
        cvtColor(image, gray, COLOR_BGR2GRAY);
    else
        gray = image.getMat().clone();

    equalizeHist(gray, gray);

    std::vector<Rect> faces_;
    face_cascade->detectMultiScale(gray, faces_, 1.4, 2, CASCADE_SCALE_IMAGE, Size(30, 30));
    Mat(faces_).copyTo(faces);
    return true;
}

int main(int argc,char** argv){
    //Give the path to the directory containing all the files containing data
    CommandLineParser parser(argc, argv,
        "{ help h usage ? | | give the following arguments in following format }"
        "{ images i | | (required) path to images txt file [example - /data/images.txt] }"
        "{ annotations a |. | (required) path to annotations txt file [example - /data/annotations.txt] }"
        "{ config c | | (required) path to configuration xml file containing parameters for training.[example - /data/config.xml] }"
        "{ model m | | (required) path to file in which the trained model has to be saved [example - /data/model.dat] }"
        "{ width w | 460 | The width which you want all images to get to scale the annotations. large images are slow to process [default = 460] }"
        "{ height h | 460 | The height which you want all images to get to scale the annotations. large images are slow to process [default = 460] }"
        "{ face_cascade f | | Path to the face cascade xml file which you want to use as a detector}"
    );
    // Read in the input arguments
    if (parser.has("help")){
        parser.printMessage();
        cerr << "TIP: Use absolute paths to avoid any problems with the software!" << endl;
        return 0;
    }
    string annotations(parser.get<string>("annotations"));
    string imagesList(parser.get<string>("images"));
    //default initialisation
    Size scale(460,460);
    scale = Size(parser.get<int>("width"),parser.get<int>("height"));
    if (annotations.empty()){
        parser.printMessage();
        cerr << "Name for annotations file not found. Aborting...." << endl;
        return -1;
    }
    if (imagesList.empty()){
        parser.printMessage();
        cerr << "Name for file containing image list not found. Aborting....." << endl;
        return -1;
    }
    string configfile_name(parser.get<string>("config"));
    if (configfile_name.empty()){
        parser.printMessage();
        cerr << "No configuration file name found which contains the parameters for training" << endl;
        return -1;
    }
    string modelfile_name(parser.get<string>("model"));
    if (modelfile_name.empty()){
        parser.printMessage();
        cerr << "No name for the model_file found in which the trained model has to be saved" << endl;
        return -1;
    }
    string cascade_name(parser.get<string>("face_cascade"));
    if (cascade_name.empty()){
        parser.printMessage();
        cerr << "The name of the cascade classifier to be loaded to detect faces is not found" << endl;
        return -1;
    }
    //create a pointer to call the base class
    //pass the face cascade xml file which you want to pass as a detector
    CascadeClassifier face_cascade;
    face_cascade.load(cascade_name);
    FacemarkKazemi::Params params;
    params.configfile = configfile_name;
    Ptr<FacemarkKazemi> facemark = FacemarkKazemi::create(params);
    facemark->setFaceDetector((FN_FaceDetector)myDetector, &face_cascade);

    std::vector<String> images;
    std::vector<std::vector<Point2f> > facePoints;
    loadTrainingData(imagesList, annotations, images, facePoints, 0.0);
    //gets landmarks and corresponding image names in both the vectors
    vector<Mat> Trainimages;
    std::vector<std::vector<Point2f> > Trainlandmarks;
    //vector to store images
    Mat src;
    for(unsigned long i=0;i<images.size();i++){
        src = imread(images[i]);
        if(src.empty()){
            cout<<images[i]<<endl;
            cerr<<string("Image not found. Skipping...")<<endl;
            continue;
        }
        Trainimages.push_back(src);
        Trainlandmarks.push_back(facePoints[i]);
    }
    cout<<"Got data"<<endl;
    facemark->training(Trainimages,Trainlandmarks,configfile_name,scale,modelfile_name);
    cout<<"Training complete"<<endl;
    return 0;
}
@@ -0,0 +1,63 @@
#include "opencv2/core.hpp"
#include <iostream>
#include <string>

using namespace cv;
using namespace std;
int main(int argc,const char ** argv){
    CommandLineParser parser(argc, argv,
        "{ help h usage ? | | give the following arguments in following format }"
        "{ filename f |. | (required) path to file which you want to create as config file [example - /data/config.xml] }"
        "{ cascade_depth cd | 10 | (required) This stores the depth of cascade of regressors used for training.}"
        "{ tree_depth td | 4 | (required) This stores the depth of trees created as weak learners during gradient boosting.}"
        "{ num_trees_per_cascade_level| 500 | (required) This stores number of trees required per cascade level.}"
        "{ learning_rate | 0.1 | (required) This stores the learning rate for gradient boosting.}"
        "{ oversampling_amount | 20 | (required) This stores the oversampling amount for the samples.}"
        "{ num_test_coordinates | 400 | (required) This stores number of test coordinates required for making the split.}"
        "{ lambda | 0.1 | (required) This stores the value used for calculating the probability.}"
        "{ num_test_splits | 20 | (required) This stores the number of test splits to be generated before making the best split.}"
    );
    // Read in the input arguments
    if (parser.has("help")){
        parser.printMessage();
        cerr << "TIP: Use absolute paths to avoid any problems with the software!" << endl;
        return 0;
    }
    //These variables have been initialised as defined in the research paper "One millisecond face alignment" CVPR 2014
    int cascade_depth = 15;
    int tree_depth = 4;
    int num_trees_per_cascade_level = 500;
    float learning_rate = float(0.1);
    int oversampling_amount = 20;
    int num_test_coordinates = 400;
    float lambda = float(0.1);
    int num_test_splits = 20;

    cascade_depth = parser.get<int>("cascade_depth");
    tree_depth = parser.get<int>("tree_depth");
    num_trees_per_cascade_level = parser.get<int>("num_trees_per_cascade_level");
    learning_rate = parser.get<float>("learning_rate");
    oversampling_amount = parser.get<int>("oversampling_amount");
    num_test_coordinates = parser.get<int>("num_test_coordinates");
    lambda = parser.get<float>("lambda");
    num_test_splits = parser.get<int>("num_test_splits");
    string filename(parser.get<string>("filename"));
    FileStorage fs(filename, FileStorage::WRITE);
    if (!fs.isOpened())
    {
        cerr << "Failed to open " << filename << endl;
        parser.printMessage();
        return -1;
    }
    fs << "cascade_depth" << cascade_depth;
    fs << "tree_depth" << tree_depth;
    fs << "num_trees_per_cascade_level" << num_trees_per_cascade_level;
    fs << "learning_rate" << learning_rate;
    fs << "oversampling_amount" << oversampling_amount;
    fs << "num_test_coordinates" << num_test_coordinates;
    fs << "lambda" << lambda;
    fs << "num_test_splits" << num_test_splits;
    fs.release();
    cout << "Write Done." << endl;
    return 0;
}
@@ -0,0 +1,190 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include "precomp.hpp"
#include "face_alignmentimpl.hpp"
#include <vector>

using namespace std;

namespace cv{
namespace face{

FacemarkKazemi::~FacemarkKazemi(){}
FacemarkKazemiImpl:: ~FacemarkKazemiImpl(){}
unsigned long FacemarkKazemiImpl::left(unsigned long index){
    return 2*index+1;
}
unsigned long FacemarkKazemiImpl::right(unsigned long index){
    return 2*index+2;
}
bool FacemarkKazemiImpl::setFaceDetector(FN_FaceDetector f, void* userData){
    faceDetector = f;
    faceDetectorData = userData;
    //printf("face detector is configured\n");
    return true;
}
bool FacemarkKazemiImpl::getFaces(InputArray image, OutputArray faces)
{
    CV_Assert(faceDetector);
    return faceDetector(image, faces, faceDetectorData);
}
FacemarkKazemiImpl::FacemarkKazemiImpl(const FacemarkKazemi::Params& parameters) :
    faceDetector(NULL),
    faceDetectorData(NULL)
{
    minmeanx=8000.0;
    maxmeanx=0.0;
    minmeany=8000.0;
    maxmeany=0.0;
    isModelLoaded =false;
    params = parameters;
}
FacemarkKazemi::Params::Params(){
    //These variables are used for training data
    //These are initialised as described in the research paper
    //referenced above
    cascade_depth = 15;
    tree_depth = 5;
    num_trees_per_cascade_level = 500;
    learning_rate = float(0.1);
    oversampling_amount = 20;
    num_test_coordinates = 500;
    lambda = float(0.1);
    num_test_splits = 20;
}
bool FacemarkKazemiImpl::convertToActual(Rect r,Mat &warp){
    Point2f srcTri[3],dstTri[3];
    srcTri[0]=Point2f(0,0);
    srcTri[1]=Point2f(1,0);
    srcTri[2]=Point2f(0,1);
    dstTri[0]=Point2f((float)r.x,(float)r.y);
    dstTri[1]=Point2f((float)r.x+r.width,(float)r.y);
    dstTri[2]=Point2f((float)r.x,(float)r.y+(float)1.3*r.height);
    warp=getAffineTransform(srcTri,dstTri);
    return true;
}
bool FacemarkKazemiImpl::convertToUnit(Rect r,Mat &warp){
    Point2f srcTri[3],dstTri[3];
    dstTri[0]=Point2f(0,0);
    dstTri[1]=Point2f(1,0);
    dstTri[2]=Point2f(0,1);
    srcTri[0]=Point2f((float)r.x,(float)r.y);
    srcTri[1]=Point2f((float)r.x+r.width,(float)r.y);
    srcTri[2]=Point2f((float)r.x,(float)r.y+(float)1.3*r.height);
    warp=getAffineTransform(srcTri,dstTri);
    return true;
}
bool FacemarkKazemiImpl::setMeanExtreme(){
    if(meanshape.empty()){
        String error_message = "Model not loaded properly. No mean shape found. Aborting...";
        CV_ErrorNoReturn(Error::StsBadArg, error_message);
        return false;
    }
    for(size_t i=0;i<meanshape.size();i++){
        if(meanshape[i].x>maxmeanx)
            maxmeanx = meanshape[i].x;
        if(meanshape[i].x<minmeanx)
            minmeanx = meanshape[i].x;
        if(meanshape[i].y>maxmeany)
            maxmeany = meanshape[i].y;
        if(meanshape[i].y<minmeany)
            minmeany = meanshape[i].y;
    }
    return true;
}
bool FacemarkKazemiImpl::calcMeanShape (vector< vector<Point2f> >& trainlandmarks,vector<Mat>& trainimages,std::vector<Rect>& faces){
    //clear the loaded meanshape
    if(trainimages.empty()||trainlandmarks.size()!=trainimages.size()) {
        // throw error if no data (or simply return -1?)
        CV_ErrorNoReturn(Error::StsBadArg, "Number of images is not equal to corresponding landmarks. Aborting...");
    }
    meanshape.clear();
    vector<Mat> finalimages;
    vector< vector<Point2f> > finallandmarks;
    //array to store mean of x coordinates
    float xmean[200] = {0.0};
    //array to store mean of y coordinates
    float ymean[200] = {0.0};
    size_t k=0;
    //loop to calculate mean
    Mat warp_mat,src,C,D;
    vector<Rect> facesp;
    Rect face;
    for(size_t i = 0;i < trainimages.size();i++){
        src = trainimages[i].clone();
        //get bounding rectangle of image for reference
        //function from facemark class
        facesp.clear();
        if(!getFaces(src,facesp)){
            continue;
        }
        if(facesp.size()>1||facesp.empty())
            continue;
        face = facesp[0];
        convertToUnit(face,warp_mat);
        //loop to bring points to a common reference and adding
        for(k=0;k<trainlandmarks[i].size();k++){
            Point2f pt=trainlandmarks[i][k];
            C = (Mat_<double>(3,1) << pt.x, pt.y, 1);
            D = warp_mat*C;
            pt.x = float(D.at<double>(0,0));
            pt.y = float(D.at<double>(1,0));
            trainlandmarks[i][k] = pt;
            xmean[k] = xmean[k]+pt.x;
            ymean[k] = ymean[k]+pt.y;
        }
        finalimages.push_back(trainimages[i]);
        finallandmarks.push_back(trainlandmarks[i]);
        faces.push_back(face);
    }
    //dividing by size to get mean and initialize meanshape
    for(size_t i=0;i<k;i++){
        xmean[i]=xmean[i]/finalimages.size();
        ymean[i]=ymean[i]/finalimages.size();
        if(xmean[i]>maxmeanx)
            maxmeanx = xmean[i];
        if(xmean[i]<minmeanx)
            minmeanx = xmean[i];
        if(ymean[i]>maxmeany)
            maxmeany = ymean[i];
        if(ymean[i]<minmeany)
            minmeany = ymean[i];
        meanshape.push_back(Point2f(xmean[i],ymean[i]));
    }
    trainimages.clear();
    trainlandmarks.clear();
    trainimages = finalimages;
    trainlandmarks = finallandmarks;
    finalimages.clear();
    finallandmarks.clear();
    return true;
}
bool FacemarkKazemiImpl::scaleData( vector< vector<Point2f> > & trainlandmarks,
                                    vector<Mat> & trainimages ,Size s)
{
    if(trainimages.empty()||trainimages.size()!=trainlandmarks.size()){
        // throw error if no data (or simply return -1?)
        CV_ErrorNoReturn(Error::StsBadArg, "The data is not loaded properly by train function. Aborting...");
    }
    float scalex,scaley;
    //scale all images and their landmarks according to input size
    for(size_t i=0;i< trainimages.size();i++){
        //calculating scale for x and y axis
        scalex=float(s.width)/float(trainimages[i].cols);
        scaley=float(s.height)/float(trainimages[i].rows);
        resize(trainimages[i],trainimages[i],s);
        for (vector<Point2f>::iterator it = trainlandmarks[i].begin(); it != trainlandmarks[i].end(); it++) {
            Point2f pt = (*it);
            pt.x = pt.x*scalex;
            pt.y = pt.y*scaley;
            (*it) = pt;
        }
    }
    return true;
}
Ptr<FacemarkKazemi> FacemarkKazemi::create(const FacemarkKazemi::Params &parameters){
    return Ptr<FacemarkKazemiImpl>(new FacemarkKazemiImpl(parameters));
}
}//face
}//cv
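convertToUnit() and convertToActual() above build affine warps between a face's bounding rectangle and the unit square; the sketch below pushes a single landmark through such a warp and back, the same pattern calcMeanShape() uses. The helper name applyWarp and the numbers are ours, purely for illustration.

#include "opencv2/imgproc.hpp"

// Hypothetical helper: apply a 2x3 affine warp (as returned by getAffineTransform)
// to one landmark, exactly the way calcMeanShape() transforms points above.
static cv::Point2f applyWarp(const cv::Mat& warp, cv::Point2f pt)
{
    cv::Mat C = (cv::Mat_<double>(3, 1) << pt.x, pt.y, 1);
    cv::Mat D = warp * C;
    return cv::Point2f((float)D.at<double>(0, 0), (float)D.at<double>(1, 0));
}

int main()
{
    // A face rectangle and one landmark inside it (made-up numbers).
    cv::Rect face(100, 120, 80, 80);
    cv::Point2f landmark(140.f, 160.f);

    // Map the rectangle to the unit square, mirroring convertToUnit()/convertToActual().
    cv::Point2f actual[3] = { cv::Point2f((float)face.x, (float)face.y),
                              cv::Point2f((float)face.x + face.width, (float)face.y),
                              cv::Point2f((float)face.x, (float)face.y + 1.3f * face.height) };
    cv::Point2f unit[3]   = { cv::Point2f(0, 0), cv::Point2f(1, 0), cv::Point2f(0, 1) };
    cv::Mat toUnit = cv::getAffineTransform(actual, unit);
    cv::Mat toActual = cv::getAffineTransform(unit, actual);

    cv::Point2f unitPt = applyWarp(toUnit, landmark);   // normalised coordinates
    cv::Point2f backPt = applyWarp(toActual, unitPt);   // ~= original landmark
    (void)backPt;
    return 0;
}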
@@ -0,0 +1,174 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef __OPENCV_FACE_ALIGNMENTIMPL_HPP__
#define __OPENCV_FACE_ALIGNMENTIMPL_HPP__
#include "opencv2/face.hpp"
#include <string>
#include <sstream>
#include <vector>
#include <fstream>
#include <queue>
#include <algorithm>
#include <ctime>

using namespace std;
namespace cv{
namespace face{
/**@brief structure determining split in regression tree
*/
struct splitr{
    //! index1 Index of the first coordinate among the test coordinates for deciding the split.
    uint64_t index1;
    //! index2 Index of the second coordinate among the test coordinates for deciding the split.
    uint64_t index2;
    //! thresh threshold for deciding the split.
    float thresh;
};
/** @brief represents a node of the regression tree*/
struct node_info{
    //First pixel coordinate of split
    long index1;
    //Second pixel coordinate of split
    long index2;
    long depth;
    long node_no;
};
/** @brief regression tree structure. Each leaf node is a vector storing the residual shape.
* The tree is represented as a vector of nodes.
*/
struct tree_node{
    splitr split;
    std::vector<Point2f> leaf;
};
struct regtree{
    std::vector<tree_node> nodes;
};
/** @brief Represents a training sample
* It contains the current shape and the difference between the actual shape
* and the current shape. It also stores the image whose shape is being
* detected.
*/
struct training_sample{
    //! shapeResiduals vector which stores the residual shape remaining to be corrected.
    std::vector<Point2f> shapeResiduals;
    //! current_shape vector containing the current estimate of the shape
    std::vector<Point2f> current_shape;
    //! actual_shape vector containing the actual shape of the face, i.e. the ground truth.
    std::vector<Point2f> actual_shape;
    //! image A Mat object which stores the image.
    Mat image;
    //! pixel_intensities vector containing pixel intensities of the coordinates chosen for testing
    std::vector<int> pixel_intensities;
    //! pixel_coordinates vector containing pixel coordinates used for testing
    std::vector<Point2f> pixel_coordinates;
    //! bound Rectangle enclosing the face found in the image for training
    Rect bound;
};
class FacemarkKazemiImpl : public FacemarkKazemi{

public:
    FacemarkKazemiImpl(const FacemarkKazemi::Params& parameters);
    void loadModel(String fs);
    bool setFaceDetector(FN_FaceDetector f, void* userdata);
    bool getFaces(InputArray image, OutputArray faces);
    bool fit(InputArray image, InputArray faces, InputOutputArray landmarks );
    void training(String imageList, String groundTruth);
    bool training(vector<Mat>& images, vector< vector<Point2f> >& landmarks,string filename,Size scale,string modelFilename);
    // Destructor for the class.
    virtual ~FacemarkKazemiImpl();

protected:
    FacemarkKazemi::Params params;
    float minmeanx;
    float maxmeanx;
    float minmeany;
    float maxmeany;
    bool isModelLoaded;
    /* meanshape This is a vector which stores the mean shape of all the images used in training*/
    std::vector<Point2f> meanshape;
    std::vector< std::vector<regtree> > loaded_forests;
    std::vector< std::vector<Point2f> > loaded_pixel_coordinates;
    FN_FaceDetector faceDetector;
    void* faceDetectorData;
    bool findNearestLandmarks(std::vector< std::vector<int> >& nearest);
    /*Extract left node of the current node in the regression tree*/
    unsigned long left(unsigned long index);
    // Extract the right node of the current node in the regression tree
    unsigned long right(unsigned long index);
    // This function randomly generates test splits to get the best split.
    splitr getTestSplits(std::vector<Point2f> pixel_coordinates,int seed);
    // This function writes a split node to the binary file storing the trained model
    void writeSplit(std::ofstream& os,const splitr split);
    // This function writes a leaf node to the binary file storing the trained model
    void writeLeaf(std::ofstream& os, const std::vector<Point2f> &leaf);
    // This function writes a tree to the binary file containing the model
    void writeTree(std::ofstream &f,regtree tree);
    // This function saves the pixel coordinates to a binary file
    void writePixels(std::ofstream& f,int index);
    // This function saves the model to a binary file
    bool saveModel(String filename);
    // This function reads pixel coordinates from the model file
    void readPixels(std::ifstream& is,uint64_t index);
    //This function reads the split node of the tree from the binary file
    void readSplit(std::ifstream& is, splitr &vec);
    //This function reads a leaf node of the tree.
    void readLeaf(std::ifstream& is, std::vector<Point2f> &leaf);
    /* This function generates pixel intensities of the randomly generated test coordinates used to decide the split.
    */
    bool getPixelIntensities(Mat img,std::vector<Point2f> pixel_coordinates_,std::vector<int>& pixel_intensities_,Rect face);
    //This function initialises the training parameters.
    bool setTrainingParameters(String filename);
    //This function finds a warp matrix that warps the pixels from the normalised space to the actual space
    bool convertToActual(Rect r,Mat &warp);
    //This function finds a warp matrix that warps the pixels from the actual space to the normalised space
    bool convertToUnit(Rect r,Mat &warp);
    /** @brief This function calculates the mean shape while training.
    * This function is only called when new training data is supplied by the train function.
    *@param trainlandmarks A vector of type cv::Point2f which stores the landmarks of corresponding images.
    *@param trainimages A vector of type cv::Mat which stores the images which serve as training data.
    *@param faces A vector of type cv::Rect which stores the bounding rectangle of each training image.
    *@returns A boolean value. It returns true if the mean shape is found successfully, else it returns false.
    */
    bool calcMeanShape(std::vector< std::vector<Point2f> > & trainlandmarks,std::vector<Mat>& trainimages,std::vector<Rect>& faces);
    /** @brief This function scales the annotations to a common size which is considered the same for all images.
    * @param trainlandmarks A vector of type cv::Point2f which stores the landmarks of the corresponding training images.
    * @param trainimages A vector of type cv::Mat which stores the images which are to be scaled.
    * @param s A variable of type cv::Size which stores the common size to which all the images are scaled.
    * @returns A boolean value. It returns true when data is scaled properly, else it returns false.
    */
    bool scaleData(std::vector< std::vector<Point2f> >& trainlandmarks,
                    std::vector<Mat>& trainimages , Size s=Size(460,460) );
    // This function gets the landmarks in the meanshape nearest to the pixel coordinates.
    unsigned long getNearestLandmark (Point2f pixels );
    // This function gets the relative position of the test pixel coordinates relative to the current shape.
    bool getRelativePixels(std::vector<Point2f> sample,std::vector<Point2f>& pixel_coordinates , std::vector<int> nearest_landmark = std::vector<int>());
    // This function partitions samples according to the split
    unsigned long divideSamples (splitr split,std::vector<training_sample>& samples,unsigned long start,unsigned long end);
    // This function fits a regression tree according to the shape residuals calculated to give weak learners for the GBT algorithm.
    bool buildRegtree(regtree &tree,std::vector<training_sample>& samples,std::vector<Point2f> pixel_coordinates);
    // This function greedily decides the best split among the test splits generated.
    bool getBestSplit(std::vector<Point2f> pixel_coordinates, std::vector<training_sample>& samples,unsigned long start ,
                    unsigned long end,splitr& split,std::vector< std::vector<Point2f> >& sum,long node_no);
    // This function randomly generates test coordinates for each level of the cascade.
    void getTestCoordinates ();
    // This function implements gradient boosting by fitting regression trees
    std::vector<regtree> gradientBoosting(std::vector<training_sample>& samples,std::vector<Point2f> pixel_coordinates);
    // This function creates training samples by randomly assigning a current shape from the set of shapes available.
    void createLeafNode(regtree& tree,long node_no,std::vector<Point2f> assign);
    // This function creates a split node in the regression tree.
    void createSplitNode(regtree& tree, splitr split,long node_no);
    // This function prepares the training samples
    bool createTrainingSamples(std::vector<training_sample> &samples,std::vector<Mat> images,std::vector< std::vector<Point2f> > landmarks,
                    std::vector<Rect> rectangle);
    //This function generates a split
    bool generateSplit(std::queue<node_info>& curr,std::vector<Point2f> pixel_coordinates, std::vector<training_sample>& samples,
                    splitr &split , std::vector< std::vector<Point2f> >& sum);
    bool setMeanExtreme();
    //friend class getRelShape;
    friend class getRelPixels;
};
}//face
}//cv

#endif
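As an orientation aid for the reader implementation that follows, a small sketch of the length-prefixed framing loadModel() expects: a tag string such as "cascade_depth" followed by a uint64 value. This only covers the header fields visible in the code below; the model path is a placeholder.

#include <cstdint>
#include <fstream>
#include <iostream>
#include <string>

// Reads one length-prefixed tag string followed by a uint64 value, the framing
// convention used by loadModel() below ("cascade_depth", "pixel_coordinates", ...).
static bool readTagged(std::ifstream& f, std::string& tag, uint64_t& value)
{
    uint64_t len = 0;
    if (!f.read((char*)&len, sizeof(len)) || len == 0)
        return false;
    std::string s((size_t)len, '\0');
    if (!f.read(&s[0], (std::streamsize)len))
        return false;
    tag = s;
    return !f.read((char*)&value, sizeof(value)).fail();
}

int main()
{
    // Placeholder path to a model produced by saveModel().
    std::ifstream f("face_landmarks.dat", std::ios::binary);
    std::string tag;
    uint64_t value = 0;
    if (f.is_open() && readTagged(f, tag, value))
        std::cout << tag << " = " << value << std::endl;   // expected first field: cascade_depth
    return 0;
}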
@ -0,0 +1,244 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include "precomp.hpp" |
||||
#include "face_alignmentimpl.hpp" |
||||
#include <fstream> |
||||
#include <ctime> |
||||
|
||||
using namespace std; |
||||
namespace cv{ |
||||
namespace face{ |
||||
bool FacemarkKazemiImpl :: findNearestLandmarks( vector< vector<int> >& nearest){ |
||||
if(meanshape.empty()||loaded_pixel_coordinates.empty()){ |
||||
String error_message = "Model not loaded properly.Aborting..."; |
||||
CV_Error(Error::StsBadArg, error_message); |
||||
return false; |
||||
} |
||||
nearest.resize(loaded_pixel_coordinates.size()); |
||||
for(unsigned long i=0 ; i< loaded_pixel_coordinates.size(); i++){ |
||||
for(unsigned long j = 0;j<loaded_pixel_coordinates[i].size();j++){ |
||||
nearest[i].push_back(getNearestLandmark(loaded_pixel_coordinates[i][j])); |
||||
} |
||||
} |
||||
return true; |
||||
} |
||||
void FacemarkKazemiImpl :: readSplit(ifstream& is, splitr &vec) |
||||
{ |
||||
is.read((char*)&vec, sizeof(splitr)); |
||||
} |
||||
void FacemarkKazemiImpl :: readLeaf(ifstream& is, vector<Point2f> &leaf) |
||||
{ |
||||
uint64_t size; |
||||
is.read((char*)&size, sizeof(size)); |
||||
leaf.resize((size_t)size); |
||||
is.read((char*)&leaf[0], leaf.size() * sizeof(Point2f)); |
||||
} |
||||
void FacemarkKazemiImpl :: readPixels(ifstream& is,uint64_t index) |
||||
{ |
||||
is.read((char*)&loaded_pixel_coordinates[(unsigned long)index][0], loaded_pixel_coordinates[(unsigned long)index].size() * sizeof(Point2f)); |
||||
} |
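// loadModel() expects the binary layout written by saveModel(): a sequence of length-prefixed tag
// strings, each followed by its payload:
//   "cascade_depth"     -> uint64 number of cascade levels
//   "pixel_coordinates" -> uint64 number of test pixels, then one Point2f array per cascade level
//   "mean_shape"        -> uint64 number of landmarks, then the mean shape as a Point2f array
//   "num_trees"         -> uint64 trees per cascade level, then for every tree a "num_nodes" tag and,
//                          per node, either a raw splitr ("split") or a Point2f leaf array ("leaf")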
||||
void FacemarkKazemiImpl :: loadModel(String filename){ |
||||
if(filename.empty()){ |
||||
String error_message = "No filename found.Aborting...."; |
||||
CV_Error(Error::StsBadArg, error_message); |
||||
return ; |
||||
} |
||||
ifstream f(filename.c_str(),ios::binary); |
||||
if(!f.is_open()){ |
||||
String error_message = "No file with given name found.Aborting...."; |
||||
CV_Error(Error::StsBadArg, error_message); |
||||
return ; |
||||
} |
||||
uint64_t len; |
||||
f.read((char*)&len, sizeof(len)); |
||||
char* temp = new char[(size_t)len+1]; |
||||
f.read(temp, len); |
||||
temp[len] = '\0'; |
||||
string s(temp); |
||||
delete [] temp; |
||||
if(s.compare("cascade_depth")!=0){ |
||||
String error_message = "Data not saved properly.Aborting....."; |
||||
CV_Error(Error::StsBadArg, error_message); |
||||
return ; |
||||
} |
||||
uint64_t cascade_size; |
||||
f.read((char*)&cascade_size,sizeof(cascade_size)); |
||||
loaded_forests.resize((unsigned long)cascade_size); |
||||
f.read((char*)&len, sizeof(len)); |
||||
temp = new char[(unsigned long)len+1]; |
||||
f.read(temp, len); |
||||
temp[len] = '\0'; |
||||
s = string(temp); |
||||
delete [] temp; |
||||
if(s.compare("pixel_coordinates")!=0){ |
||||
String error_message = "Data not saved properly.Aborting....."; |
||||
CV_Error(Error::StsBadArg, error_message); |
||||
return ; |
||||
} |
||||
loaded_pixel_coordinates.resize((unsigned long)cascade_size); |
||||
uint64_t num_pixels; |
||||
f.read((char*)&num_pixels,sizeof(num_pixels)); |
||||
for(unsigned long i=0 ; i < cascade_size ; i++){ |
||||
loaded_pixel_coordinates[i].resize((unsigned long)num_pixels); |
||||
readPixels(f,i); |
||||
} |
||||
f.read((char*)&len, sizeof(len)); |
||||
temp = new char[(unsigned long)len+1]; |
||||
f.read(temp, len); |
||||
temp[len] = '\0'; |
||||
s = string(temp); |
||||
delete [] temp; |
||||
if(s.compare("mean_shape")!=0){ |
||||
String error_message = "Data not saved properly.Aborting....."; |
||||
CV_Error(Error::StsBadArg, error_message); |
||||
return ; |
||||
} |
||||
uint64_t mean_shape_size; |
||||
f.read((char*)&mean_shape_size,sizeof(mean_shape_size)); |
||||
meanshape.resize((unsigned long)mean_shape_size); |
||||
f.read((char*)&meanshape[0], meanshape.size() * sizeof(Point2f)); |
||||
if(!setMeanExtreme())
CV_Error(Error::StsInternal, "Model not loaded properly: invalid mean shape.");
||||
f.read((char*)&len, sizeof(len)); |
||||
temp = new char[(unsigned long)len+1]; |
||||
f.read(temp, len); |
||||
temp[len] = '\0'; |
||||
s = string(temp); |
||||
delete [] temp; |
||||
if(s.compare("num_trees")!=0){ |
||||
String error_message = "Data not saved properly.Aborting....."; |
||||
CV_Error(Error::StsBadArg, error_message); |
||||
return ; |
||||
} |
||||
uint64_t num_trees; |
||||
f.read((char*)&num_trees,sizeof(num_trees)); |
||||
for(unsigned long i=0;i<cascade_size;i++){ |
||||
for(unsigned long j=0;j<num_trees;j++){ |
||||
regtree tree; |
||||
f.read((char*)&len, sizeof(len)); |
||||
char* temp2 = new char[(unsigned long)len+1]; |
||||
f.read(temp2, len); |
||||
temp2[len] = '\0'; |
||||
s =string(temp2); |
||||
delete [] temp2; |
||||
if(s.compare("num_nodes")!=0){ |
||||
String error_message = "Data not saved properly.Aborting....."; |
||||
CV_Error(Error::StsBadArg, error_message); |
||||
return ; |
||||
} |
||||
uint64_t num_nodes; |
||||
f.read((char*)&num_nodes,sizeof(num_nodes)); |
||||
tree.nodes.resize((unsigned long)num_nodes+1); |
||||
for(unsigned long k=0; k < num_nodes ; k++){ |
||||
f.read((char*)&len, sizeof(len)); |
||||
char* temp3 = new char[(unsigned long)len+1]; |
||||
f.read(temp3, len); |
||||
temp3[len] = '\0'; |
||||
s =string(temp3); |
||||
delete [] temp3; |
||||
tree_node node; |
||||
if(s.compare("split")==0){ |
||||
splitr split; |
||||
readSplit(f,split); |
||||
node.split = split; |
||||
node.leaf.clear(); |
||||
} |
||||
else if(s.compare("leaf")==0){ |
||||
vector<Point2f> leaf; |
||||
readLeaf(f,leaf); |
||||
node.leaf = leaf; |
||||
} |
||||
else{ |
||||
String error_message = "Data not saved properly.Aborting....."; |
||||
CV_Error(Error::StsBadArg, error_message); |
||||
return ; |
||||
} |
||||
tree.nodes[k]=node; |
||||
} |
||||
loaded_forests[i].push_back(tree); |
||||
} |
||||
} |
||||
f.close(); |
||||
isModelLoaded = true; |
||||
} |
||||
bool FacemarkKazemiImpl::fit(InputArray img, InputArray roi, InputOutputArray landmarks){ |
||||
if(!isModelLoaded){ |
||||
String error_message = "No model loaded. Aborting...."; |
||||
CV_Error(Error::StsBadArg, error_message); |
||||
return false; |
||||
} |
||||
Mat image = img.getMat(); |
||||
std::vector<Rect> & faces = *(std::vector<Rect>*)roi.getObj(); |
||||
std::vector<std::vector<Point2f> > & shapes = *(std::vector<std::vector<Point2f> >*) landmarks.getObj(); |
||||
shapes.resize(faces.size()); |
||||
|
||||
if(image.empty()){ |
||||
String error_message = "No image found.Aborting.."; |
||||
CV_Error(Error::StsBadArg, error_message); |
||||
return false; |
||||
} |
||||
if(faces.empty()){ |
||||
String error_message = "No faces found.Aborting.."; |
||||
CV_Error(Error::StsBadArg, error_message); |
||||
return false; |
||||
} |
||||
if(meanshape.empty()||loaded_forests.empty()||loaded_pixel_coordinates.empty()){
String error_message = "Model not loaded properly. Aborting...";
CV_Error(Error::StsBadArg, error_message);
return false;
}
||||
vector< vector<int> > nearest_landmarks; |
||||
findNearestLandmarks(nearest_landmarks); |
||||
tree_node curr_node; |
||||
vector<Point2f> pixel_relative; |
||||
vector<int> pixel_intensity; |
||||
Mat warp_mat; |
||||
for(size_t e=0;e<faces.size();e++){ |
||||
shapes[e]=meanshape; |
||||
convertToActual(faces[e],warp_mat); |
||||
for(size_t i=0;i<loaded_forests.size();i++){ |
||||
pixel_intensity.clear(); |
||||
pixel_relative = loaded_pixel_coordinates[i]; |
||||
getRelativePixels(shapes[e],pixel_relative,nearest_landmarks[i]); |
||||
getPixelIntensities(image,pixel_relative,pixel_intensity,faces[e]); |
||||
for(size_t j=0;j<loaded_forests[i].size();j++){ |
||||
regtree tree = loaded_forests[i][j]; |
||||
curr_node = tree.nodes[0]; |
||||
unsigned long curr_node_index = 0; |
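// tree.nodes stores an implicit complete binary tree: left(i) and right(i) map to 2*i+1 and 2*i+2
// (matching how generateSplit numbers child nodes). Walk down, comparing the two test pixel
// intensities of each split node, until a node with a non-empty leaf vector is reached.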
||||
while(curr_node.leaf.size()==0) |
||||
{ |
||||
if ((float)pixel_intensity[(unsigned long)curr_node.split.index1] - (float)pixel_intensity[(unsigned long)curr_node.split.index2] > curr_node.split.thresh) |
||||
{ |
||||
curr_node_index=left(curr_node_index); |
||||
} else |
||||
curr_node_index=right(curr_node_index); |
||||
curr_node = tree.nodes[curr_node_index]; |
||||
} |
||||
for(size_t p=0;p<curr_node.leaf.size();p++){ |
||||
shapes[e][p]=shapes[e][p] + curr_node.leaf[p]; |
||||
} |
||||
} |
||||
} |
||||
for(unsigned long j=0;j<shapes[e].size();j++){ |
||||
Mat C = (Mat_<double>(3,1) << shapes[e][j].x, shapes[e][j].y, 1); |
||||
Mat D = warp_mat*C; |
||||
shapes[e][j].x=float(D.at<double>(0,0)); |
||||
shapes[e][j].y=float(D.at<double>(1,0)); |
||||
} |
||||
} |
||||
return true; |
||||
} |
||||
}//cv
|
||||
}//face
|
@ -0,0 +1,313 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include "precomp.hpp" |
||||
#include "face_alignmentimpl.hpp" |
||||
|
||||
using namespace std; |
||||
|
||||
namespace cv{ |
||||
namespace face{ |
||||
//Threading helper classes
|
||||
class doSum : public ParallelLoopBody |
||||
{ |
||||
public: |
||||
doSum(vector<training_sample>* samples_,vector<Point2f>* sum_) : |
||||
samples(samples_), |
||||
sum(sum_) |
||||
{ |
||||
} |
||||
virtual void operator()( const Range& range) const |
||||
{ |
||||
for (int j = range.start; j < range.end; ++j){ |
||||
for(unsigned long k=0;k<(*samples)[j].shapeResiduals.size();k++){ |
||||
(*sum)[k]=(*sum)[k]+(*samples)[j].shapeResiduals[k]; |
||||
} |
||||
} |
||||
} |
||||
private: |
||||
vector<training_sample>* samples; |
||||
vector<Point2f>* sum; |
||||
}; |
||||
class modifySamples : public ParallelLoopBody |
||||
{ |
||||
public: |
||||
modifySamples(vector<training_sample>* samples_,vector<Point2f>* temp_) : |
||||
samples(samples_), |
||||
temp(temp_) |
||||
{ |
||||
} |
||||
virtual void operator()( const Range& range) const |
||||
{ |
||||
for (int j = range.start; j < range.end; ++j){ |
||||
for(unsigned long k=0;k<(*samples)[j].shapeResiduals.size();k++){ |
||||
(*samples)[j].shapeResiduals[k]=(*samples)[j].shapeResiduals[k]-(*temp)[k]; |
||||
(*samples)[j].current_shape[k]=(*samples)[j].actual_shape[k]-(*samples)[j].shapeResiduals[k]; |
||||
} |
||||
} |
||||
} |
||||
private: |
||||
vector<training_sample>* samples; |
||||
vector<Point2f>* temp; |
||||
}; |
||||
class splitSamples : public ParallelLoopBody |
||||
{ |
||||
public: |
||||
splitSamples(vector<training_sample>* samples_,vector< vector<Point2f> >* leftsumresiduals_,vector<unsigned long>* left_count_,unsigned long* num_test_splits_,vector<splitr>* feats_) : |
||||
samples(samples_), |
||||
leftsumresiduals(leftsumresiduals_), |
||||
left_count(left_count_), |
||||
num_test_splits(num_test_splits_), |
||||
feats(feats_) |
||||
{ |
||||
} |
||||
virtual void operator()( const Range& range) const |
||||
{ |
||||
for (int i = range.start; i < range.end; ++i){ |
||||
for(unsigned long j=0;j<*(num_test_splits);j++){ |
||||
if ((float)(*samples)[i].pixel_intensities[(unsigned long)(*feats)[j].index1] - (float)(*samples)[i].pixel_intensities[(unsigned long)(*feats)[j].index2] > (*feats)[j].thresh){
// count this sample as going to the left child of test split j
(*left_count)[j]++;
for(unsigned long k=0;k<(*samples)[i].shapeResiduals.size();k++){
(*leftsumresiduals)[j][k]=(*leftsumresiduals)[j][k]+(*samples)[i].shapeResiduals[k];
}
}
||||
} |
||||
} |
||||
} |
||||
private: |
||||
vector<training_sample>* samples; |
||||
vector< vector<Point2f> >* leftsumresiduals; |
||||
vector<unsigned long>* left_count; |
||||
unsigned long* num_test_splits; |
||||
vector<splitr>* feats; |
||||
}; |
||||
splitr FacemarkKazemiImpl::getTestSplits(vector<Point2f> pixel_coordinates,int seed) |
||||
{ |
||||
splitr feat; |
||||
//generates splits whose probability is above a particular threshold.
|
||||
//P(u,v)=e^(-distance/lambda) as described in the research paper
|
||||
//cited above. This helps to select closer pixels hence make efficient
|
||||
//splits.
|
||||
double probability; |
||||
double check; |
||||
RNG rng(seed); |
||||
do |
||||
{ |
||||
//select random pixel coordinate
|
||||
feat.index1 = rng.uniform(0,params.num_test_coordinates); |
||||
//select another random coordinate
|
||||
feat.index2 = rng.uniform(0,params.num_test_coordinates); |
||||
Point2f pt = pixel_coordinates[(unsigned long)feat.index1]-pixel_coordinates[(unsigned long)feat.index2]; |
||||
double distance = sqrt((pt.x*pt.x)+(pt.y*pt.y)); |
||||
//calculate the probability
|
||||
probability = exp(-distance/params.lambda); |
||||
check = rng.uniform(double(0),double(1)); |
||||
} |
||||
while(check>probability||feat.index1==feat.index2); |
||||
feat.thresh =(float)(((rng.uniform(double(0),double(1)))*256 - 128)/2.0); |
||||
return feat; |
||||
} |
||||
bool FacemarkKazemiImpl:: getBestSplit(vector<Point2f> pixel_coordinates, vector<training_sample>& samples,unsigned long start , |
||||
unsigned long end,splitr& split,vector< vector<Point2f> >& sum,long node_no) |
||||
{ |
||||
if(samples[0].shapeResiduals.size()!=samples[0].current_shape.size()){ |
||||
String error_message = "Error while generating split.Residuals are not complete.Aborting...."; |
||||
CV_ErrorNoReturn(Error::StsBadArg, error_message); |
||||
return false; |
||||
} |
||||
//This vector stores the matrices where each matrix represents
|
||||
//sum of the residuals of shapes of samples which go to the left
|
||||
//child after split
|
||||
vector< vector<Point2f> > leftsumresiduals; |
||||
leftsumresiduals.resize(params.num_test_splits); |
||||
vector<splitr> feats; |
||||
//generate random splits and selects the best split amongst them.
|
||||
for (unsigned long i = 0; i < params.num_test_splits; ++i){ |
||||
feats.push_back(getTestSplits(pixel_coordinates,i+(int)time(0))); |
||||
leftsumresiduals[i].resize(samples[0].shapeResiduals.size()); |
||||
} |
||||
vector<unsigned long> left_count; |
||||
left_count.resize(params.num_test_splits); |
||||
parallel_for_(Range(start,end),splitSamples(&samples,&leftsumresiduals,&left_count,¶ms.num_test_splits,&feats)); |
||||
//Selecting the best split
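// Each candidate split is scored as n_left*||mean left residual|| + n_right*||mean right residual||,
// where the mean residual is the accumulated residual vector divided by the number of samples sent
// to that child; the split with the largest score is kept.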
|
||||
double best_score =-1; |
||||
unsigned long best_feat = 0; |
||||
double score = -1; |
||||
vector<Point2f> right_sum; |
||||
right_sum.resize(sum[node_no].size()); |
||||
vector<Point2f> left_sum; |
||||
left_sum.resize(sum[node_no].size()); |
||||
unsigned long right_cnt; |
||||
for(unsigned long i=0;i<leftsumresiduals.size();i++){ |
||||
right_cnt = (end-start+1)-left_count[i]; |
||||
for(unsigned long k=0;k<leftsumresiduals[i].size();k++){ |
||||
if (right_cnt!=0){ |
||||
right_sum[k].x=(sum[node_no][k].x-leftsumresiduals[i][k].x)/right_cnt; |
||||
right_sum[k].y=(sum[node_no][k].y-leftsumresiduals[i][k].y)/right_cnt; |
||||
} |
||||
else |
||||
right_sum[k]=Point2f(0,0); |
||||
if(left_count[i]!=0){ |
||||
left_sum[k].x=leftsumresiduals[i][k].x/left_count[i]; |
||||
left_sum[k].y=leftsumresiduals[i][k].y/left_count[i]; |
||||
} |
||||
else |
||||
left_sum[k]=Point2f(0,0); |
||||
} |
||||
Point2f pt1(0,0); |
||||
Point2f pt2(0,0); |
||||
for(unsigned long k=0;k<left_sum.size();k++){ |
||||
pt1.x = pt1.x + (float)(left_sum[k].x*left_sum[k].x); |
||||
pt2.x = pt2.x + (float)(right_sum[k].x*right_sum[k].x); |
||||
pt1.y = pt1.y + (float)(left_sum[k].y*left_sum[k].y); |
||||
pt2.y = pt2.y + (float)(right_sum[k].y*right_sum[k].y); |
||||
} |
||||
score = (double)sqrt(pt1.x+pt1.y)*(double)left_count[i] + (double)sqrt(pt2.x+pt2.y)*(double)right_cnt; |
||||
if(score > best_score){ |
||||
best_score = score; |
||||
best_feat = i; |
||||
} |
||||
} |
||||
sum[2*node_no+1] = leftsumresiduals[best_feat]; |
||||
sum[2*node_no+2].resize(sum[node_no].size()); |
||||
for(unsigned long k=0;k<sum[node_no].size();k++){ |
||||
sum[2*node_no+2][k].x = sum[node_no][k].x-sum[2*node_no+1][k].x; |
||||
sum[2*node_no+2][k].y = sum[node_no][k].y-sum[2*node_no+1][k].y; |
||||
} |
||||
split = feats[best_feat]; |
||||
return true; |
||||
} |
||||
void FacemarkKazemiImpl::createSplitNode(regtree& tree, splitr split,long node_no){ |
||||
tree_node node; |
||||
node.split = split; |
||||
node.leaf.clear(); |
||||
tree.nodes[node_no]=node; |
||||
} |
||||
void FacemarkKazemiImpl::createLeafNode(regtree& tree,long node_no,vector<Point2f> assign){ |
||||
tree_node node; |
||||
node.split.index1 = (uint64_t)(-1); |
||||
node.split.index2 = (uint64_t)(-1); |
||||
node.leaf = assign; |
||||
tree.nodes[node_no] = node; |
||||
} |
||||
bool FacemarkKazemiImpl :: generateSplit(queue<node_info>& curr,vector<Point2f> pixel_coordinates, vector<training_sample>& samples, |
||||
splitr &split , vector< vector<Point2f> >& sum){ |
||||
|
||||
long start = curr.front().index1; |
||||
long end = curr.front().index2; |
||||
long _depth = curr.front().depth; |
||||
long node_no =curr.front().node_no; |
||||
curr.pop(); |
||||
if(start == end) |
||||
return false; |
||||
getBestSplit(pixel_coordinates,samples,start,end,split,sum,node_no); |
||||
long mid = divideSamples(split, samples, start, end); |
||||
//cout<<mid<<endl;
|
||||
if(mid==start||mid==end+1) |
||||
return false; |
||||
node_info _left,_right; |
||||
_left.index1 = start; |
||||
_left.index2 = mid-1; |
||||
_left.depth = _depth +1; |
||||
_left.node_no = 2*node_no+1; |
||||
_right.index1 = mid; |
||||
_right.index2 = end; |
||||
_right.depth = _depth +1; |
||||
_right.node_no = 2*node_no+2; |
||||
curr.push(_left); |
||||
curr.push(_right); |
||||
return true; |
||||
} |
||||
bool FacemarkKazemiImpl :: buildRegtree(regtree& tree,vector<training_sample>& samples,vector<Point2f> pixel_coordinates){ |
||||
if(samples.size()==0){ |
||||
String error_message = "Error while building regression tree.Empty samples. Aborting...."; |
||||
CV_ErrorNoReturn(Error::StsBadArg, error_message); |
||||
return false; |
||||
} |
||||
if(pixel_coordinates.size()==0){ |
||||
String error_message = "Error while building regression tree.No pixel coordinates. Aborting...."; |
||||
CV_ErrorNoReturn(Error::StsBadArg, error_message); |
||||
return false; |
||||
} |
||||
queue<node_info> curr; |
||||
node_info parent; |
||||
vector< vector<Point2f> > sum; |
||||
const long numNodes =(long)pow(2,params.tree_depth); |
||||
const long numSplitNodes = numNodes/2 - 1; |
||||
sum.resize(numNodes+1); |
||||
sum[0].resize(samples[0].shapeResiduals.size()); |
||||
parallel_for_(cv::Range(0,(int)samples.size()), doSum(&(samples),&(sum[0]))); |
||||
parent.index1=0; |
||||
parent.index2=(long)samples.size()-1; |
||||
parent.node_no=0; |
||||
parent.depth=0; |
||||
curr.push(parent); |
||||
tree.nodes.resize(numNodes+1); |
||||
//Total number of split nodes
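// Build the tree breadth-first: nodes with index <= numSplitNodes get a split node when a valid
// split can be generated; all remaining nodes become leaves storing the mean residual of their
// samples scaled by the learning rate, which is also applied to the samples' current shapes.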
|
||||
while(!curr.empty()){ |
||||
pair<long,long> range= make_pair(curr.front().index1,curr.front().index2); |
||||
long node_no = curr.front().node_no; |
||||
splitr split; |
||||
//generate a split
|
||||
if(node_no<=numSplitNodes){ |
||||
if(generateSplit(curr,pixel_coordinates,samples,split,sum)){ |
||||
createSplitNode(tree,split,node_no); |
||||
} |
||||
//create leaf
|
||||
else{ |
||||
long count = range.second-range.first +1; |
||||
vector<Point2f> temp; |
||||
temp.resize(samples[range.first].shapeResiduals.size()); |
||||
parallel_for_(Range(range.first, range.second), doSum(&(samples),&(temp))); |
||||
for(unsigned long k=0;k<temp.size();k++){ |
||||
temp[k].x=(temp[k].x/count)*params.learning_rate; |
||||
temp[k].y=(temp[k].y/count)*params.learning_rate; |
||||
} |
||||
// Modify current shape according to the weak learners.
|
||||
parallel_for_(Range(range.first,range.second), modifySamples(&(samples),&(temp))); |
||||
createLeafNode(tree,node_no,temp); |
||||
} |
||||
} |
||||
else |
||||
{ |
||||
unsigned long count = range.second-range.first +1; |
||||
vector<Point2f> temp; |
||||
temp.resize(samples[range.first].shapeResiduals.size()); |
||||
parallel_for_(Range(range.first, range.second), doSum(&(samples),&(temp))); |
||||
for(unsigned long k=0;k<temp.size();k++){ |
||||
temp[k].x=(temp[k].x/count)*params.learning_rate; |
||||
temp[k].y=(temp[k].y/count)*params.learning_rate; |
||||
} |
||||
// Modify current shape according to the weak learners.
|
||||
parallel_for_(Range(range.first,range.second), modifySamples(&(samples),&(temp))); |
||||
createLeafNode(tree,node_no,temp); |
||||
curr.pop(); |
||||
} |
||||
} |
||||
return true; |
||||
} |
||||
unsigned long FacemarkKazemiImpl::divideSamples (splitr split,vector<training_sample>& samples,unsigned long start,unsigned long end) |
||||
{ |
||||
if(samples.size()==0){ |
||||
String error_message = "Error while dividing samples. Sample array empty. Aborting...."; |
||||
CV_ErrorNoReturn(Error::StsBadArg, error_message); |
||||
return 0; |
||||
} |
||||
unsigned long i = start; |
||||
training_sample temp; |
||||
//partition samples according to the split
|
||||
for (unsigned long j = start; j < end; ++j) |
||||
{ |
||||
if ((float)samples[j].pixel_intensities[(unsigned long)split.index1] - (float)samples[j].pixel_intensities[(unsigned long)split.index2] > split.thresh) |
||||
{ |
||||
temp=samples[i]; |
||||
samples[i]=samples[j]; |
||||
samples[j]=temp; |
||||
++i; |
||||
} |
||||
} |
||||
return i; |
||||
} |
||||
}//cv
|
||||
}//face
|
@ -0,0 +1,348 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include "precomp.hpp" |
||||
#include "face_alignmentimpl.hpp" |
||||
#include "opencv2/video/tracking.hpp" |
||||
#include <climits> |
||||
|
||||
using namespace std; |
||||
namespace cv{ |
||||
namespace face{ |
||||
// Threading helper classes
|
||||
class getDiffShape : public ParallelLoopBody |
||||
{ |
||||
public: |
||||
getDiffShape(vector<training_sample>* samples_) : |
||||
samples(samples_) |
||||
{ |
||||
} |
||||
virtual void operator()( const cv::Range& range) const |
||||
{ |
||||
for(size_t j = (size_t)range.start; j < (size_t)range.end; ++j){ |
||||
(*samples)[j].shapeResiduals.resize((*samples)[j].current_shape.size()); |
||||
for(unsigned long k=0;k<(*samples)[j].current_shape.size();k++) |
||||
(*samples)[j].shapeResiduals[k]=(*samples)[j].actual_shape[k]-(*samples)[j].current_shape[k]; |
||||
} |
||||
} |
||||
private: |
||||
vector<training_sample>* samples; |
||||
}; |
||||
class getRelPixels : public ParallelLoopBody |
||||
{ |
||||
public: |
||||
getRelPixels(vector<training_sample>* samples_,FacemarkKazemiImpl& object_) : |
||||
samples(samples_), |
||||
object(object_) |
||||
{ |
||||
} |
||||
virtual void operator()( const cv::Range& range) const |
||||
{ |
||||
for (size_t j = (size_t)range.start; j < (size_t)range.end; ++j){ |
||||
object.getRelativePixels(((*samples)[j]).current_shape,((*samples)[j]).pixel_coordinates); |
||||
} |
||||
} |
||||
private: |
||||
vector<training_sample>* samples; |
||||
FacemarkKazemiImpl& object; |
||||
}; |
||||
//This function initialises the training parameters.
|
||||
bool FacemarkKazemiImpl::setTrainingParameters(String filename){ |
||||
cout << "Reading Training Parameters " << endl; |
||||
FileStorage fs; |
||||
fs.open(filename, FileStorage::READ); |
||||
if (!fs.isOpened()) |
||||
{ String error_message = "Error while opening configuration file.Aborting.."; |
||||
CV_ErrorNoReturn(Error::StsBadArg, error_message); |
||||
return false; |
||||
} |
||||
int cascade_depth_; |
||||
int tree_depth_; |
||||
int num_trees_per_cascade_level_; |
||||
float learning_rate_; |
||||
int oversampling_amount_; |
||||
int num_test_coordinates_; |
||||
float lambda_; |
||||
int num_test_splits_; |
||||
fs["cascade_depth"]>> cascade_depth_; |
||||
fs["tree_depth"]>> tree_depth_; |
||||
fs["num_trees_per_cascade_level"] >> num_trees_per_cascade_level_; |
||||
fs["learning_rate"] >> learning_rate_; |
||||
fs["oversampling_amount"] >> oversampling_amount_; |
||||
fs["num_test_coordinates"] >> num_test_coordinates_; |
||||
fs["lambda"] >> lambda_; |
||||
fs["num_test_splits"] >> num_test_splits_; |
||||
params.cascade_depth = (unsigned long)cascade_depth_; |
||||
params.tree_depth = (unsigned long) tree_depth_; |
||||
params.num_trees_per_cascade_level = (unsigned long) num_trees_per_cascade_level_; |
||||
params.learning_rate = (float) learning_rate_; |
||||
params.oversampling_amount = (unsigned long) oversampling_amount_; |
||||
params.num_test_coordinates = (unsigned long) num_test_coordinates_; |
||||
params.lambda = (float) lambda_; |
||||
params.num_test_splits = (unsigned long) num_test_splits_; |
||||
fs.release(); |
||||
cout<<"Parameters loaded"<<endl; |
||||
return true; |
||||
} |
||||
void FacemarkKazemiImpl::getTestCoordinates () |
||||
{ |
||||
for(unsigned long i = 0; i < params.cascade_depth; ++i){ |
||||
vector<Point2f> temp; |
||||
RNG rng = theRNG(); |
||||
for(unsigned long j = 0; j < params.num_test_coordinates; ++j) |
||||
{ |
||||
Point2f pt; |
||||
pt.x = (float)rng.uniform(minmeanx,maxmeanx); |
||||
pt.y = (float)rng.uniform(minmeany,maxmeany); |
||||
temp.push_back(pt); |
||||
} |
||||
loaded_pixel_coordinates.push_back(temp); |
||||
} |
||||
} |
||||
unsigned long FacemarkKazemiImpl:: getNearestLandmark(Point2f pixel) |
||||
{ |
||||
if(meanshape.empty()) { |
||||
// throw error if no data (or simply return -1?)
|
||||
String error_message = "The data is not loaded properly by train function. Aborting..."; |
||||
CV_ErrorNoReturn(Error::StsBadArg, error_message); |
||||
return false; |
||||
} |
||||
float dist=float(INT_MAX); |
||||
unsigned long index =0; |
||||
for(unsigned long i=0;i<meanshape.size();i++){ |
||||
Point2f pt = meanshape[i]-pixel; |
||||
if(sqrt(pt.x*pt.x+pt.y*pt.y)<dist){ |
||||
dist=sqrt(pt.x*pt.x+pt.y*pt.y); |
||||
index = i; |
||||
} |
||||
} |
||||
return index; |
||||
} |
||||
bool FacemarkKazemiImpl :: getRelativePixels(vector<Point2f> sample,vector<Point2f>& pixel_coordinates,std::vector<int> nearest){ |
||||
if(sample.size()!=meanshape.size()){ |
||||
String error_message = "Error while finding relative shape. Aborting...."; |
||||
CV_ErrorNoReturn(Error::StsBadArg, error_message); |
||||
return false; |
||||
} |
||||
Mat transform_mat; |
||||
transform_mat = estimateRigidTransform(meanshape,sample,false); |
||||
unsigned long index; |
||||
for (unsigned long i = 0;i<pixel_coordinates.size();i++) { |
||||
// use the precomputed nearest-landmark index when provided (as in fit()); otherwise compute it
if(!nearest.empty())
index = (unsigned long)nearest[i];
else
index = getNearestLandmark(pixel_coordinates[i]);
||||
pixel_coordinates[i] = pixel_coordinates[i] - meanshape[index]; |
||||
Mat C = (Mat_<double>(3,1) << pixel_coordinates[i].x, pixel_coordinates[i].y, 0); |
||||
if(!transform_mat.empty()){ |
||||
Mat D =transform_mat*C; |
||||
pixel_coordinates[i].x = float((D.at<double>(0,0))); |
||||
pixel_coordinates[i].y = float((D.at<double>(1,0))); |
||||
} |
||||
pixel_coordinates[i] = pixel_coordinates[i] + sample[index]; |
||||
} |
||||
return true; |
||||
} |
||||
bool FacemarkKazemiImpl::getPixelIntensities(Mat img,vector<Point2f> pixel_coordinates,vector<int>& pixel_intensities,Rect face){ |
||||
if(pixel_coordinates.size()==0){ |
||||
String error_message = "No pixel coordinates found. Aborting....."; |
||||
CV_ErrorNoReturn(Error::StsBadArg, error_message); |
||||
return false; |
||||
} |
||||
Mat transform_mat; |
||||
convertToActual(face,transform_mat); |
||||
Mat dst = img.clone(); |
||||
Mat C,D; |
||||
for(size_t j=0;j<pixel_coordinates.size();j++){ |
||||
C = (Mat_<double>(3,1) << pixel_coordinates[j].x, pixel_coordinates[j].y, 1); |
||||
D = transform_mat*C; |
||||
pixel_coordinates[j].x = float(D.at<double>(0,0)); |
||||
pixel_coordinates[j].y = float(D.at<double>(1,0)); |
||||
} |
||||
int val; |
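// Sample the warped test pixels from the image, using the mean of the three colour channels as the
// intensity; pixels falling outside the image get intensity 0.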
||||
for(unsigned long j=0;j<pixel_coordinates.size();j++){ |
||||
if(pixel_coordinates[j].x>0&&pixel_coordinates[j].x<img.cols&&pixel_coordinates[j].y>0&&pixel_coordinates[j].y<img.rows){ |
||||
Vec3b val1 = img.at<Vec3b>((int)pixel_coordinates[j].y,(int)pixel_coordinates[j].x); |
||||
val = (int)(val1[0]+val1[1]+val1[2])/3; |
||||
} |
||||
else |
||||
val = 0; |
||||
pixel_intensities.push_back(val); |
||||
} |
||||
return true; |
||||
} |
||||
vector<regtree> FacemarkKazemiImpl::gradientBoosting(vector<training_sample>& samples,vector<Point2f> pixel_coordinates){ |
||||
vector<regtree> forest; |
||||
vector<Point2f> meanresidual; |
||||
meanresidual.resize(samples[0].shapeResiduals.size()); |
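// Subtract the mean shape residual over all samples so that the regression trees below are fit to
// zero-centred residuals.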
||||
for(unsigned long i=0;i<samples.size();i++){ |
||||
for(unsigned long j=0;j<samples[i].shapeResiduals.size();j++){ |
||||
meanresidual[j]=meanresidual[j]+samples[i].shapeResiduals[j]; |
||||
} |
||||
} |
||||
for(unsigned long i=0;i<meanresidual.size();i++){ |
||||
meanresidual[i].x=(meanresidual[i].x)/samples.size(); |
||||
meanresidual[i].y=(meanresidual[i].y)/samples.size(); |
||||
} |
||||
for(unsigned long i=0;i<samples.size();i++){ |
||||
for(unsigned long j=0;j<samples[i].shapeResiduals.size();j++) |
||||
samples[i].shapeResiduals[j]=samples[i].shapeResiduals[j]-meanresidual[j]; |
||||
} |
||||
for(unsigned long i=0;i<params.num_trees_per_cascade_level;i++){ |
||||
regtree tree; |
||||
buildRegtree(tree,samples,pixel_coordinates); |
||||
forest.push_back(tree); |
||||
} |
||||
return forest; |
||||
} |
||||
bool FacemarkKazemiImpl::createTrainingSamples(vector<training_sample> &samples,vector<Mat> images,vector< vector<Point2f> > landmarks,vector<Rect> rectangle){ |
||||
unsigned long in=0; |
||||
samples.resize(params.oversampling_amount*images.size()); |
||||
for(unsigned long i=0;i<images.size();i++){ |
||||
for(unsigned long j=0;j<params.oversampling_amount;j++){ |
||||
samples[in].image=images[i]; |
||||
samples[in].actual_shape = landmarks[i]; |
||||
samples[in].bound = rectangle[i]; |
||||
unsigned long rindex=i; |
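// Oversampling: alternate between initialising the current shape with the mean shape and with the
// landmarks of a randomly chosen training image.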
||||
if(in%2==0) |
||||
samples[in].current_shape = meanshape; |
||||
else{ |
||||
RNG rng(in); |
||||
rindex =(unsigned long)rng.uniform(0,(int)landmarks.size()-1); |
||||
samples[in].current_shape = landmarks[rindex]; |
||||
} |
||||
in++; |
||||
} |
||||
} |
||||
parallel_for_(Range(0,(int)samples.size()),getDiffShape(&samples)); |
||||
return true; |
||||
} |
||||
void FacemarkKazemiImpl :: writeLeaf(ofstream& os, const vector<Point2f> &leaf) |
||||
{ |
||||
uint64_t size = leaf.size(); |
||||
os.write((char*)&size, sizeof(size)); |
||||
os.write((char*)&leaf[0], leaf.size() * sizeof(Point2f)); |
||||
} |
||||
void FacemarkKazemiImpl :: writeSplit(ofstream& os, splitr split) |
||||
{ |
||||
os.write((char*)&split, sizeof(split)); |
||||
} |
||||
void FacemarkKazemiImpl :: writeTree(ofstream &f,regtree tree) |
||||
{ |
||||
string s("num_nodes"); |
||||
uint64_t len = s.size(); |
||||
f.write((char*)&len, sizeof(len)); |
||||
f.write(s.c_str(), len); |
||||
uint64_t num_nodes = tree.nodes.size(); |
||||
f.write((char*)&num_nodes,sizeof(num_nodes)); |
||||
for(size_t i=0;i<tree.nodes.size();i++){ |
||||
if(tree.nodes[i].leaf.empty()){ |
||||
s = string("split"); |
||||
len = s.size(); |
||||
f.write((char*)&len, sizeof(len)); |
||||
f.write(s.c_str(), len); |
||||
writeSplit(f,tree.nodes[i].split); |
||||
} |
||||
else{ |
||||
s = string("leaf"); |
||||
len = s.size(); |
||||
f.write((char*)&len, sizeof(len)); |
||||
f.write(s.c_str(), len); |
||||
writeLeaf(f,tree.nodes[i].leaf); |
||||
} |
||||
} |
||||
} |
||||
void FacemarkKazemiImpl :: writePixels(ofstream& f,int index){ |
||||
f.write((char*)&loaded_pixel_coordinates[index][0], loaded_pixel_coordinates[index].size() * sizeof(Point2f)); |
||||
} |
||||
bool FacemarkKazemiImpl :: saveModel(String filename){ |
||||
ofstream f(filename.c_str(),ios::binary); |
||||
if(!f.is_open()){ |
||||
String error_message = "Error while opening file to write model. Aborting...."; |
||||
CV_ErrorNoReturn(Error::StsBadArg, error_message); |
||||
return false; |
||||
} |
||||
if(loaded_forests.size()!=loaded_pixel_coordinates.size()){ |
||||
String error_message = "Incorrect training data. Aborting...."; |
||||
CV_ErrorNoReturn(Error::StsBadArg, error_message); |
||||
return false; |
||||
} |
||||
string s("cascade_depth"); |
||||
uint64_t len = s.size(); |
||||
f.write((char*)&len, sizeof(len)); |
||||
f.write(s.c_str(), len); |
||||
uint64_t cascade_size = loaded_forests.size(); |
||||
f.write((char*)&cascade_size,sizeof(cascade_size)); |
||||
s = string("pixel_coordinates"); |
||||
len = s.size(); |
||||
f.write((char*)&len, sizeof(len)); |
||||
f.write(s.c_str(), len); |
||||
uint64_t num_pixels = loaded_pixel_coordinates[0].size(); |
||||
f.write((char*)&num_pixels,sizeof(num_pixels)); |
||||
for(unsigned long i=0;i< loaded_pixel_coordinates.size();i++){ |
||||
writePixels(f,i); |
||||
} |
||||
s = string("mean_shape"); |
||||
uint64_t len1 = s.size(); |
||||
f.write((char*)&len1, sizeof(len1)); |
||||
f.write(s.c_str(), len1); |
||||
uint64_t mean_shape_size = meanshape.size(); |
||||
f.write((char*)&mean_shape_size,sizeof(mean_shape_size)); |
||||
f.write((char*)&meanshape[0], meanshape.size() * sizeof(Point2f)); |
||||
s = string("num_trees"); |
||||
len = s.size(); |
||||
f.write((char*)&len, sizeof(len)); |
||||
f.write(s.c_str(), len); |
||||
uint64_t num_trees = loaded_forests[0].size(); |
||||
f.write((char*)&num_trees,sizeof(num_trees)); |
||||
for(unsigned long i=0 ; i<loaded_forests.size() ; i++){ |
||||
for(unsigned long j=0 ; j<loaded_forests[i].size() ; j++){ |
||||
writeTree(f,loaded_forests[i][j]); |
||||
} |
||||
} |
||||
return true; |
||||
} |
||||
void FacemarkKazemiImpl::training(String imageList, String groundTruth){ |
||||
imageList.clear(); |
||||
groundTruth.clear(); |
||||
String error_message = "Less arguments than required"; |
||||
CV_ErrorNoReturn(Error::StsBadArg, error_message); |
||||
return ; |
||||
} |
||||
bool FacemarkKazemiImpl::training(vector<Mat>& images, vector< vector<Point2f> >& landmarks,string filename,Size scale,string modelFilename){ |
||||
if(!setTrainingParameters(filename)){ |
||||
String error_message = "Error while loading training parameters"; |
||||
CV_ErrorNoReturn(Error::StsBadArg, error_message); |
||||
return false; |
||||
} |
||||
vector<Rect> rectangles; |
||||
scaleData(landmarks,images,scale); |
||||
calcMeanShape(landmarks,images,rectangles); |
||||
if(images.size()!=landmarks.size()){ |
||||
// throw error if no data (or simply return -1?)
|
||||
String error_message = "The data is not loaded properly. Aborting training function...."; |
||||
CV_ErrorNoReturn(Error::StsBadArg, error_message); |
||||
return false; |
||||
} |
||||
vector<training_sample> samples; |
||||
getTestCoordinates(); |
||||
createTrainingSamples(samples,images,landmarks,rectangles); |
||||
images.clear(); |
||||
landmarks.clear(); |
||||
rectangles.clear(); |
||||
for(unsigned long i=0;i< params.cascade_depth;i++){ |
||||
cout<<"Training regressor "<<i<<"..."<<endl; |
||||
for (std::vector<training_sample>::iterator it = samples.begin(); it != samples.end(); it++) { |
||||
(*it).pixel_coordinates = loaded_pixel_coordinates[i]; |
||||
} |
||||
parallel_for_(Range(0,(int)samples.size()),getRelPixels(&samples,*this)); |
||||
for (std::vector<training_sample>::iterator it = samples.begin(); it != samples.end(); it++) { |
||||
getPixelIntensities((*it).image,(*it).pixel_coordinates,(*it).pixel_intensities,(*it).bound); |
||||
} |
||||
loaded_forests.push_back(gradientBoosting(samples,loaded_pixel_coordinates[i])); |
||||
} |
||||
saveModel(modelFilename); |
||||
return true; |
||||
} |
||||
}//cv
|
||||
}//face
|
@ -0,0 +1,97 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include "test_precomp.hpp" |
||||
#include "opencv2/imgcodecs.hpp" |
||||
#include "opencv2/face.hpp" |
||||
#include "opencv2/objdetect.hpp" |
||||
#include <vector> |
||||
#include <string> |
||||
using namespace std; |
||||
using namespace cv; |
||||
using namespace cv::face; |
||||
|
||||
static bool myDetector( InputArray image, OutputArray ROIs, CascadeClassifier* face_cascade) |
||||
{ |
||||
Mat gray; |
||||
std::vector<Rect> faces; |
||||
if(image.channels()>1){ |
||||
cvtColor(image.getMat(),gray,COLOR_BGR2GRAY); |
||||
} |
||||
else{ |
||||
gray = image.getMat().clone(); |
||||
} |
||||
equalizeHist( gray, gray ); |
||||
face_cascade->detectMultiScale( gray, faces, 1.1, 3, 0, Size(30, 30) ); |
||||
Mat(faces).copyTo(ROIs); |
||||
return true; |
||||
} |
||||
|
||||
TEST(CV_Face_FacemarkKazemi, can_create_default) { |
||||
string cascade_name = cvtest::findDataFile("face/lbpcascade_frontalface_improved.xml", true); |
||||
string configfile_name = cvtest::findDataFile("face/config.xml", true); |
||||
CascadeClassifier face_cascade; |
||||
EXPECT_TRUE(face_cascade.load(cascade_name)); |
||||
FacemarkKazemi::Params params; |
||||
params.configfile = configfile_name; |
||||
Ptr<FacemarkKazemi> facemark; |
||||
EXPECT_NO_THROW(facemark = FacemarkKazemi::create(params)); |
||||
EXPECT_TRUE(facemark->setFaceDetector((cv::face::FN_FaceDetector)myDetector, &face_cascade)); |
||||
EXPECT_FALSE(facemark.empty()); |
||||
} |
||||
|
||||
TEST(CV_Face_FacemarkKazemi, can_loadTrainingData) { |
||||
string filename = cvtest::findDataFile("face/lbpcascade_frontalface_improved.xml", true); |
||||
string configfile_name = cvtest::findDataFile("face/config.xml", true); |
||||
CascadeClassifier face_cascade; |
||||
EXPECT_TRUE(face_cascade.load(filename)); |
||||
FacemarkKazemi::Params params; |
||||
params.configfile = configfile_name; |
||||
Ptr<FacemarkKazemi> facemark; |
||||
EXPECT_NO_THROW(facemark = FacemarkKazemi::create(params)); |
||||
EXPECT_TRUE(facemark->setFaceDetector((cv::face::FN_FaceDetector)myDetector, &face_cascade)); |
||||
vector<String> filenames; |
||||
filename = cvtest::findDataFile("face/1.txt", true); |
||||
filenames.push_back(filename); |
||||
filename = cvtest::findDataFile("face/2.txt", true); |
||||
filenames.push_back(filename); |
||||
vector<String> imagenames; |
||||
vector< vector<Point2f> > trainlandmarks,Trainlandmarks; |
||||
vector<Rect> rectangles; |
||||
//Test getData function
|
||||
EXPECT_NO_THROW(loadTrainingData(filenames,trainlandmarks,imagenames)); |
||||
vector<Mat> trainimages; |
||||
for(unsigned long i=0;i<imagenames.size();i++){ |
||||
string img = cvtest::findDataFile(imagenames[i], true); |
||||
Mat src = imread(img); |
||||
EXPECT_TRUE(!src.empty()); |
||||
trainimages.push_back(src); |
||||
Trainlandmarks.push_back(trainlandmarks[i]); |
||||
} |
||||
string modelfilename = "face_landmark_model.dat"; |
||||
Size scale = Size(460,460); |
||||
EXPECT_TRUE(facemark->training(trainimages,Trainlandmarks,configfile_name,scale,modelfilename)); |
||||
} |
||||
TEST(CV_Face_FacemarkKazemi, can_detect_landmarks) { |
||||
string cascade_name = cvtest::findDataFile("face/lbpcascade_frontalface_improved.xml", true); |
||||
CascadeClassifier face_cascade; |
||||
face_cascade.load(cascade_name); |
||||
FacemarkKazemi::Params params; |
||||
Ptr<FacemarkKazemi> facemark; |
||||
EXPECT_NO_THROW(facemark = FacemarkKazemi::create(params)); |
||||
EXPECT_TRUE(facemark->setFaceDetector((cv::face::FN_FaceDetector)myDetector, &face_cascade)); |
||||
string imgname = cvtest::findDataFile("face/detect.jpg"); |
||||
string modelfilename = cvtest::findDataFile("face/face_landmark_model.dat",true); |
||||
Mat img = imread(imgname); |
||||
EXPECT_TRUE(!img.empty()); |
||||
EXPECT_FALSE(facemark.empty()); |
||||
EXPECT_NO_THROW(facemark->loadModel(modelfilename)); |
||||
vector<Rect> faces; |
||||
//Detect faces in the current image
|
||||
EXPECT_TRUE(facemark->getFaces(img,faces)); |
||||
//vector to store the landmarks of all the faces in the image
|
||||
vector< vector<Point2f> > shapes; |
||||
EXPECT_NO_THROW(facemark->fit(img,faces,shapes)); |
||||
shapes.clear(); |
||||
} |
@ -0,0 +1,97 @@ |
||||
Face landmark detection in an image {#tutorial_face_landmark_detection_in_an_image} |
||||
=================================== |
||||
|
||||
 |
||||
|
||||
This application lets you detect landmarks of detected faces in an image. You can detect landmarks of all the faces found in an image |
||||
and use them further in various applications like face swapping, face averaging etc. |
||||
This functionality is now available in OpenCV. |
||||
|
||||
``` |
||||
// Command to be typed for running the sample |
||||
./sampleDetectLandmarks -file=trained_model.dat -face_cascade=lbpcascadefrontalface.xml -image=/path_to_image/image.jpg |
||||
``` |
||||
### Description of command parameters
||||
|
||||
> * **model_filename** f : (REQUIRED) A path to binary file storing the trained model which is to be loaded [example - /data/file.dat] |
||||
> * **image** i : (REQUIRED) A path to image in which face landmarks have to be detected.[example - /data/image.jpg] |
||||
> * **face_cascade** c : (REQUIRED) A path to the face cascade xml file which you want to use as a face detector. |
||||
|
||||
Understanding code |
||||
------------------ |
||||
|
||||
 |
||||
|
||||
This tutorial will explain the sample code for face landmark detection. Jumping directly to the code : |
||||
|
||||
``` c++ |
||||
CascadeClassifier face_cascade; |
||||
bool myDetector( InputArray image, OutputArray ROIs ); |
||||
|
||||
bool myDetector( InputArray image, OutputArray ROIs ){ |
||||
Mat gray; |
||||
std::vector<Rect> faces; |
||||
if(image.channels()>1){ |
||||
cvtColor(image.getMat(),gray,CV_BGR2GRAY); |
||||
} |
||||
else{ |
||||
gray = image.getMat().clone(); |
||||
} |
||||
equalizeHist( gray, gray ); |
||||
face_cascade.detectMultiScale( gray, faces, 1.1, 3,0, Size(30, 30) ); |
||||
Mat(faces).copyTo(ROIs); |
||||
return true; |
||||
} |
||||
``` |
||||
|
||||
The facemark API lets the user supply their own face detector to be used in training. The above code creates a sample face detector, which is then passed as a function pointer to the facemark API.
||||
|
||||
``` c++ |
||||
Mat img = imread(image); |
||||
face_cascade.load(cascade_name); |
||||
FacemarkKazemi::Params params; |
||||
params.configfile = configfile_name; |
||||
Ptr<Facemark> facemark = FacemarkKazemi::create(params); |
||||
facemark->setFaceDetector(myDetector); |
||||
|
||||
``` |
||||
The above code creates a pointer to the face landmark detection class. The face detector created earlier is passed
to this facemark object as a function pointer so that it can detect faces during landmark detection. The above code also loads the image
in which landmarks have to be detected.
||||
|
||||
``` c++ |
||||
facemark->loadModel(filename); |
||||
cout<<"Loaded model"<<endl; |
||||
vector<Rect> faces; |
||||
resize(img,img,Size(460,460)); |
||||
facemark->getFaces(img,faces); |
||||
vector< vector<Point2f> > shapes; |
||||
|
||||
```
||||
The above code loads a trained model for face landmark detection and creates a vector to store the detected faces. It then resizes the image to a smaller size, since smaller images are faster to process, and creates a vector of vectors to store the shapes of the detected faces.
||||
|
||||
``` c++ |
||||
if(facemark->fit(img,faces,shapes)) |
||||
{ |
||||
for( size_t i = 0; i < faces.size(); i++ ) |
||||
{ |
||||
cv::rectangle(img,faces[i],Scalar( 255, 0, 0 )); |
||||
} |
||||
for(unsigned long i=0;i<faces.size();i++){ |
||||
for(unsigned long k=0;k<shapes[i].size();k++) |
||||
cv::circle(img,shapes[i][k],5,cv::Scalar(0,0,255),FILLED); |
||||
} |
||||
namedWindow("Detected_shape"); |
||||
imshow("Detected_shape",img); |
||||
waitKey(0); |
||||
} |
||||
``` |
||||
|
||||
The above code then calls the fit function to get the shapes of all detected faces in the image,
draws rectangles around the detected faces and marks the detected landmarks.
||||
|
||||
### Detection Results |
||||
|
||||
 |
||||
|
||||
 |
@ -0,0 +1,177 @@ |
||||
 |
||||
|
||||
Training face landmark detector {#tutorial_face_training_face_landmark_detector}
||||
============================== |
||||
|
||||
This application helps you train your own face landmark detector. You can train the detector by just providing the path to a
directory containing the images and to files containing their corresponding face landmarks. As this landmark detector was originally trained on
the [HELEN dataset](http://www.ifp.illinois.edu/~vuongle2/helen/), the training follows the format of the data provided in the HELEN dataset.
||||
|
||||
The dataset consists of .txt files whose first line contains the image name, followed by the annotations.
The file containing the annotations should be of the following format:
||||
> /directory/images/abc.jpg |
||||
> 123.45,345.65 |
||||
> 321.67,543.89 |
||||
> .... , .... |
||||
> .... , .... |
||||
The above format is similar to the HELEN dataset, which was used for training the model.
||||
|
||||
``` |
||||
// Command to be typed for running the sample |
||||
./sample_train_landmark_detector -annotations=/home/sukhad/Downloads/code/trainset/ -config=config.xml -face_cascade=lbpcascadefrontalface.xml -model=trained_model.dat -width=460 -height=460 |
||||
``` |
||||
|
||||
### Description of command parameters |
||||
|
||||
> * **annotations** a : (REQUIRED) Path to the annotations txt file [example - /data/annotations.txt]
> * **config** c : (REQUIRED) Path to the configuration xml file containing parameters for training. [example - /data/config.xml]
> * **model** m : (REQUIRED) Path to the file in which the trained model will be saved. [example - /data/model.dat]
> * **width** w : (OPTIONAL) The width to which all images are resized (annotations are scaled accordingly); large images are slow to process. [default = 460]
> * **height** h : (OPTIONAL) The height to which all images are resized (annotations are scaled accordingly); large images are slow to process. [default = 460]
> * **face_cascade** f : (REQUIRED) Path to the face cascade xml file which you want to use as a face detector.
||||
|
||||
### Description of training parameters |
||||
|
||||
|
||||
The configuration file described above contains the parameters required for training.
||||
|
||||
**The description of parameters is as follows :** |
||||
|
||||
1. **Cascade depth :** This stores the depth of cascade of regressors used for training. |
||||
2. **Tree depth :** This stores the depth of trees created as weak learners during gradient boosting. |
||||
3. **Number of trees per cascade level :** This stores number of trees required per cascade level. |
||||
4. **Learning rate :** This stores the learning rate for gradient boosting. It helps prevent overfitting through shrinkage.
||||
5. **Oversampling amount :** This stores the oversampling amount for the samples. |
||||
6. **Number of test coordinates :** This stores number of test coordinates to be generated as samples to decide for making the split. |
||||
7. **Lambda :** This stores the value used for calculating the probability that favours selecting closer pixel pairs for making the split.
||||
8. **Number of test splits :** This stores the number of test splits to be generated before making the best split. |
||||
|
||||
|
||||
For a more detailed description of the training parameters, refer to the [research paper](https://pdfs.semanticscholar.org/d78b/6a5b0dcaa81b1faea5fb0000045a62513567.pdf).
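For reference, the configuration file is an OpenCV FileStorage xml file whose keys match the parameter names above. A minimal sketch that writes such a file is shown below; the values are only illustrative placeholders, not tuned recommendations.

``` c++
#include "opencv2/core.hpp"
using namespace cv;

int main()
{
    // Write a config.xml that the training application can read with FileStorage.
    FileStorage fs("config.xml", FileStorage::WRITE);
    fs << "cascade_depth" << 15;
    fs << "tree_depth" << 5;
    fs << "num_trees_per_cascade_level" << 500;
    fs << "learning_rate" << 0.1f;
    fs << "oversampling_amount" << 20;
    fs << "num_test_coordinates" << 400;
    fs << "lambda" << 0.1f;
    fs << "num_test_splits" << 20;
    fs.release();
    return 0;
}
```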
||||
|
||||
### Understanding code |
||||
|
||||
|
||||
 |
||||
|
||||
|
||||
Jumping directly to the code : |
||||
|
||||
``` c++ |
||||
CascadeClassifier face_cascade; |
||||
bool myDetector( InputArray image, OutputArray ROIs ); |
||||
|
||||
bool myDetector( InputArray image, OutputArray ROIs ){ |
||||
Mat gray; |
||||
std::vector<Rect> faces; |
||||
if(image.channels()>1){ |
||||
cvtColor(image.getMat(),gray,CV_BGR2GRAY); |
||||
} |
||||
else{ |
||||
gray = image.getMat().clone(); |
||||
} |
||||
equalizeHist( gray, gray ); |
||||
face_cascade.detectMultiScale( gray, faces, 1.1, 3,0, Size(30, 30) ); |
||||
Mat(faces).copyTo(ROIs); |
||||
return true; |
||||
} |
||||
``` |
||||
The facemark API lets the user supply their own face detector to be used in training. The above code creates a sample face detector, which is then passed as a function pointer to the facemark API.
||||
|
||||
``` c++ |
||||
vector<String> filenames; |
||||
glob(directory,filenames); |
||||
``` |
||||
The above code creates a vector filenames for storing the names of the .txt files
and fills it with the names of the files in the directory.
||||
|
||||
``` c++ |
||||
Mat img = imread(image); |
||||
face_cascade.load(cascade_name); |
||||
FacemarkKazemi::Params params; |
||||
params.configfile = configfile_name; |
||||
Ptr<Facemark> facemark = FacemarkKazemi::create(params); |
||||
facemark->setFaceDetector(myDetector); |
||||
|
||||
``` |
||||
The above code creates a pointer to the face landmark detection class. The face detector created earlier is passed
to this facemark object as a function pointer so that it can detect faces while training the model.
||||
|
||||
``` c++ |
||||
vector<String> imagenames; |
||||
vector< vector<Point2f> > trainlandmarks,Trainlandmarks; |
||||
vector<Mat> trainimages; |
||||
loadTrainingData(filenames,trainlandmarks,imagenames); |
||||
for(unsigned long i=0;i<300;i++){ |
||||
string imgname = imagenames[i].substr(0, imagenames[i].size()-1); |
||||
string img = directory + string(imgname) + ".jpg"; |
||||
Mat src = imread(img); |
||||
if(src.empty()){ |
||||
cerr<<string("Image "+img+" not found\n.")<<endl; |
||||
continue; |
||||
} |
||||
trainimages.push_back(src); |
||||
Trainlandmarks.push_back(trainlandmarks[i]); |
||||
} |
||||
``` |
||||
The above code creates std::vectors to store the images and their corresponding landmarks.
It then calls the function loadTrainingData to load the landmarks and the image names into their respective vectors and reads the images.
||||
|
||||
If the dataset you downloaded is of the following format : |
||||
``` |
||||
version: 1 |
||||
n_points: 68 |
||||
{ |
||||
115.167660 220.807529 |
||||
116.164839 245.721357 |
||||
120.208690 270.389841 |
||||
... |
||||
} |
||||
```

This is an example of the dataset available at https://ibug.doc.ic.ac.uk/resources/facial-point-annotations/
||||
|
||||
Then skip the above code for loading training data and use the following code. This sample is provided as sampleTrainLandmarkDetector2.cpp |
||||
in the face module in opencv contrib. |
||||
|
||||
``` c++ |
||||
std::vector<String> images; |
||||
std::vector<std::vector<Point2f> > facePoints; |
||||
loadTrainingData(imagesList, annotations, images, facePoints, 0.0); |
||||
``` |
||||
|
||||
In the above code, imagesList and annotations are files of the following format:
||||
``` |
||||
example of contents for images.txt: |
||||
../trainset/image_0001.png |
||||
../trainset/image_0002.png |
||||
example of contents for annotation.txt: |
||||
../trainset/image_0001.pts |
||||
../trainset/image_0002.pts |
||||
``` |
||||
|
||||
These files list the images and their corresponding annotation files.
||||
|
||||
The above code scales the images and landmarks, since training on smaller images takes less time.
After scaling the data it calculates the mean shape of the data, which is used as the initial shape while training.
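As a rough sketch of what this scaling step does internally (this is not the library code; img and landmarks are assumed to be a cv::Mat and a std::vector<cv::Point2f> of one training sample):

``` c++
// Resize the image to the training size and scale its landmarks by the same factors.
Size scale(460, 460);
float sx = (float)scale.width  / img.cols;
float sy = (float)scale.height / img.rows;
resize(img, img, scale);
for (size_t k = 0; k < landmarks.size(); k++) {
    landmarks[k].x *= sx;
    landmarks[k].y *= sy;
}
```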
||||
|
||||
Finally call the following function to perform training : |
||||
|
||||
``` c++ |
||||
facemark->training(Trainimages,Trainlandmarks,configfile_name,scale,modelfile_name); |
||||
``` |
||||
In the above function, scale is passed to resize all images and their corresponding landmarks, since large
images take more time to process.
This call to the training function trains the model and stores the trained model in the file with the given
filename. Once training starts successfully you will see something like this:
||||
 |
||||
|
||||
|
||||
**The error rate on training images varies with the number of images used for training as follows:**
||||
|
||||
 |
||||
|
||||
**The error rate on test images varies with the number of images used for training as follows:**
||||
|
||||
 |
@ -0,0 +1,110 @@ |
||||
Face landmark detection in a video {#tutorial_face_landmark_detection_in_video}
||||
=================================== |
||||
|
||||
This application lets you detect landmarks of detected faces in a video. It first detects faces in the current video frame
and then finds their facial landmarks. You just have to pass the video as input.
||||
``` |
||||
// Command to be typed for running the sample |
||||
./sampleDetectLandmarks -file=trained_model.dat -face_cascade=lbpcascadefrontalface.xml -video=/path_to_video/video.avi |
||||
``` |
||||
Description of command parameters |
||||
--------------------------------- |
||||
|
||||
> * **model_filename** f : (REQUIRED) A path to binary file storing the trained model which is to be loaded [example - /data/file.dat] |
||||
> * **video** v : (REQUIRED) A path to video in which face landmarks have to be detected.[example - /data/video.avi] |
||||
> * **face_cascade** c : (REQUIRED) A path to the face cascade xml file which you want to use as a face detector. |
||||
|
||||
### Understanding code |
||||
|
||||
This tutorial will explain the sample code for face landmark detection. Jumping directly to the code : |
||||
|
||||
``` c++ |
||||
CascadeClassifier face_cascade; |
||||
bool myDetector( InputArray image, OutputArray ROIs ); |
||||
|
||||
bool myDetector( InputArray image, OutputArray ROIs ){ |
||||
Mat gray; |
||||
std::vector<Rect> faces; |
||||
if(image.channels()>1){ |
||||
cvtColor(image.getMat(),gray,CV_BGR2GRAY); |
||||
} |
||||
else{ |
||||
gray = image.getMat().clone(); |
||||
} |
||||
equalizeHist( gray, gray ); |
||||
face_cascade.detectMultiScale( gray, faces, 1.1, 3,0, Size(30, 30) ); |
||||
Mat(faces).copyTo(ROIs); |
||||
return true; |
||||
} |
||||
``` |
||||
The facemark API lets the user supply their own face detector to be used in face landmark detection. The above code creates a sample face detector, which is then passed as a function pointer to the facemark API.
||||
|
||||
``` c++ |
||||
VideoCapture cap(video); |
||||
if(!cap.isOpened()){ |
||||
cerr<<"Video cannot be loaded. Give correct path"<<endl; |
||||
return -1; |
||||
} |
||||
``` |
||||
|
||||
The above code creates a video capture object and then loads the video.
If the video cannot be opened the user is notified and the program exits; otherwise the code proceeds.
||||
|
||||
``` c++ |
||||
Mat img = imread(image); |
||||
face_cascade.load(cascade_name); |
||||
FacemarkKazemi::Params params; |
||||
params.configfile = configfile_name; |
||||
Ptr<Facemark> facemark = FacemarkKazemi::create(params); |
||||
facemark->setFaceDetector(myDetector); |
||||
|
||||
``` |
||||
The above code creates a pointer to the face landmark detection class. The face detector created earlier is passed
to this facemark object as a function pointer so that it can detect faces.
||||
``` c++ |
||||
vector<Rect> faces; |
||||
vector< vector<Point2f> > shapes; |
||||
Mat img; |
||||
``` |
||||
The above code creates a vector to store the detected faces and a vector of vectors to store the shapes of each
face detected in the current frame.
||||
|
||||
``` c++ |
||||
while(1){ |
||||
faces.clear(); |
||||
shapes.clear(); |
||||
cap>>img; |
||||
resize(img,img,Size(600,600)); |
||||
facemark->getFaces(img,faces); |
||||
if(faces.size()==0){ |
||||
cout<<"No faces found in this frame"<<endl; |
||||
} |
||||
else{ |
||||
for( size_t i = 0; i < faces.size(); i++ ) |
||||
{ |
||||
cv::rectangle(img,faces[i],Scalar( 255, 0, 0 )); |
||||
} |
||||
if(facemark->fit(img,faces,shapes)) |
||||
{ |
||||
for(unsigned long i=0;i<faces.size();i++){ |
||||
for(unsigned long k=0;k<shapes[i].size();k++) |
||||
cv::circle(img,shapes[i][k],3,cv::Scalar(0,0,255),FILLED); |
||||
} |
||||
} |
||||
} |
||||
namedWindow("Detected_shape"); |
||||
imshow("Detected_shape",img); |
||||
if(waitKey(1) >= 0) break; |
||||
} |
||||
``` |
||||
|
||||
The above code then reads each frame, detects the faces in it and the landmarks corresponding to each detected face.
It then displays the annotated frame.

After running the above code you will get results similar to the following.
||||
|
||||
Sample video: |
||||
|
||||
@htmlonly |
||||
<iframe width="560" height="315" src="https://www.youtube.com/embed/ZtaV07T90D8" frameborder="0" allowfullscreen></iframe> |
||||
@endhtmlonly |
@ -0,0 +1,147 @@ |
||||
Face swapping using face landmark detection{#tutorial_face_swapping_face_landmark_detection} |
||||
=========================================== |
||||
|
||||
This application lets you swap a face in one image with a face in another image. The application first detects faces in both images and finds their landmarks. Then it swaps the face in the first image with the face in the second image. You just have to give the paths to the two images and run the application to swap the faces.
||||
``` |
||||
// Command to be typed for running the sample |
||||
./sample_face_swapping -file=trained_model.dat -face_cascade=lbpcascadefrontalface.xml -image1=/path_to_image/image1.jpg -image2=/path_to_image/image2.jpg |
||||
``` |
||||
### Description of command parameters |
||||
|
||||
> * **image1** i1 (REQUIRED) Path to the first image file in which you want to apply swapping. |
||||
> * **image2** i2 (REQUIRED) Path to the second image file in which you want to apply face swapping. |
||||
> * **model** m (REQUIRED) Path to the file containing model to be loaded for face landmark detection. |
||||
> * **face_cascade** f (REQUIRED) Path to the face cascade xml file which you want to use as a face detector. |
||||
|
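These options can be parsed with cv::CommandLineParser. The following is only an illustrative sketch; the keys and defaults are assumptions and may not match the actual sample exactly:

``` c++
// Illustrative sketch only: keys and defaults are assumptions, not the sample's exact code.
CommandLineParser parser(argc, argv,
    "{ image1 i1      |       | path to the first input image }"
    "{ image2 i2      |       | path to the second input image }"
    "{ model m        |       | path to the trained landmark model }"
    "{ face_cascade f |       | path to the face cascade xml file }");
String image1_path  = parser.get<String>("image1");
String image2_path  = parser.get<String>("image2");
String model_path   = parser.get<String>("model");
String cascade_name = parser.get<String>("face_cascade");
if(image1_path.empty() || image2_path.empty() || model_path.empty() || cascade_name.empty()){
    parser.printMessage();
    return -1;
}
```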
||||
### Understanding the code |
||||
|
||||
This tutorial explains the sample code for face swapping using OpenCV. Jumping directly to the code:
||||
|
||||
``` c++ |
||||
CascadeClassifier face_cascade; |
||||
bool myDetector( InputArray image, OutputArray ROIs ); |
||||
|
||||
bool myDetector( InputArray image, OutputArray ROIs ){ |
||||
Mat gray; |
||||
std::vector<Rect> faces; |
||||
if(image.channels()>1){ |
||||
cvtColor(image.getMat(),gray,CV_BGR2GRAY); |
||||
} |
||||
else{ |
||||
gray = image.getMat().clone(); |
||||
} |
||||
equalizeHist( gray, gray ); |
||||
face_cascade.detectMultiScale( gray, faces, 1.1, 3,0, Size(30, 30) ); |
||||
Mat(faces).copyTo(ROIs); |
||||
return true; |
||||
} |
||||
``` |
||||
The facemark API lets the user plug in their own face detector for face landmark detection. The above code creates a sample face detector. This function is later passed to the facemark API as a function pointer.
||||
|
||||
|
||||
``` c++ |
||||
Mat img = imread(image); |
||||
face_cascade.load(cascade_name); |
||||
FacemarkKazemi::Params params; |
||||
params.configfile = configfile_name; |
||||
Ptr<Facemark> facemark = FacemarkKazemi::create(params); |
||||
facemark->setFaceDetector(myDetector); |
||||
``` |
||||
The above code creates a pointer to the face landmark detection class. The face detector created above is passed
to this facemark pointer as a function pointer, so that it can be used to detect faces.
||||
``` c++ |
||||
vector<Rect> faces1,faces2; |
||||
vector< vector<Point2f> > shape1,shape2; |
||||
float ratio1 = (float)img1.cols/(float)img1.rows; |
||||
float ratio2 = (float)img2.cols/(float)img2.rows; |
||||
resize(img1,img1,Size(640*ratio1,640*ratio1)); |
||||
resize(img2,img2,Size(640*ratio2,640*ratio2)); |
||||
Mat img1Warped = img2.clone(); |
||||
facemark->getFaces(img1,faces1); |
||||
facemark->getFaces(img2,faces2); |
||||
facemark->fit(img1,faces1,shape1); |
||||
facemark->fit(img2,faces2,shape2); |
||||
|
||||
``` |
||||
|
||||
The above code creates vectors to store the detected faces and vectors of vectors to store the shapes for each
face detected in both images. It then detects the landmarks of each face found in both images. The images are
resized, preserving their aspect ratios, because smaller images are easier to process.
||||
|
||||
|
||||
``` c++ |
||||
vector<Point2f> boundary_image1; |
||||
vector<Point2f> boundary_image2; |
||||
vector<int> index; |
||||
convexHull(Mat(points2),index, false, false); |
||||
for(size_t i = 0; i < index.size(); i++) |
||||
{ |
||||
boundary_image1.push_back(points1[index[i]]); |
||||
boundary_image2.push_back(points2[index[i]]); |
||||
} |
||||
``` |
||||
|
||||
The above code then computes the convex hull to obtain the boundary points of the face that has to be swapped.
||||
|
||||
``` c++ |
||||
vector< vector<int> > triangles; |
||||
Rect rect(0, 0, img1Warped.cols, img1Warped.rows); |
||||
divideIntoTriangles(rect, boundary_image2, triangles); |
||||
for(size_t i = 0; i < triangles.size(); i++) |
||||
{ |
||||
vector<Point2f> triangle1, triangle2; |
||||
for(int j = 0; j < 3; j++) |
||||
{ |
||||
triangle1.push_back(boundary_image1[triangles[i][j]]); |
||||
triangle2.push_back(boundary_image2[triangles[i][j]]); |
||||
} |
||||
warpTriangle(img1, img1Warped, triangle1, triangle2); |
||||
} |
||||
``` |
||||
|
||||
To swap the faces we need to warp one face over the other, which requires an affine transform.
The OpenCV function that estimates an affine transform needs three pairs of points to compute
the affine matrix, and we only want to warp the face itself rather than the surrounding regions.
Hence we divide the face into triangles, so that each triangle can easily be warped onto the other image.

The function divideIntoTriangles divides the detected face region into triangles.
The function warpTriangle then warps each triangle of one image onto the other image to swap the faces, as sketched below.
||||
|
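For reference, here is a minimal sketch of how such a triangle warp could be implemented. This is an assumption about the helper's internals; the actual warpTriangle in the sample may differ. As in the rest of the sample, `using namespace cv;` and `using namespace std;` are assumed.

``` c++
// Hypothetical sketch of a triangle warp (the sample's helper may differ):
// estimate the affine transform mapping triangle1 onto triangle2 and copy the
// warped pixels into the destination image through a triangular mask.
void warpTriangleSketch(const Mat& src, Mat& dst,
                        const vector<Point2f>& triangle1,
                        const vector<Point2f>& triangle2)
{
    // Affine transform defined by the three pairs of corresponding vertices
    Mat warp_mat = getAffineTransform(triangle1, triangle2);
    Mat warped;
    warpAffine(src, warped, warp_mat, dst.size(), INTER_LINEAR, BORDER_REFLECT_101);
    // Keep only the pixels that fall inside the destination triangle
    vector<Point> tri;
    for(size_t i = 0; i < triangle2.size(); i++)
        tri.push_back(Point((int)triangle2[i].x, (int)triangle2[i].y));
    Mat mask = Mat::zeros(dst.size(), CV_8UC1);
    fillConvexPoly(mask, tri, Scalar(255));
    warped.copyTo(dst, mask);
}
```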
||||
``` c++ |
||||
vector<Point> hull; |
||||
for(size_t i = 0; i < boundary_image2.size(); i++) |
||||
{ |
||||
Point pt((int)boundary_image2[i].x,(int)boundary_image2[i].y); |
||||
hull.push_back(pt); |
||||
} |
||||
Mat mask = Mat::zeros(img2.rows, img2.cols, img2.depth()); |
||||
fillConvexPoly(mask,&hull[0],(int)hull.size(), Scalar(255,255,255)); |
||||
Rect r = boundingRect(boundary_image2); |
||||
Point center = (r.tl() + r.br()) / 2; |
||||
Mat output; |
||||
img1Warped.convertTo(img1Warped, CV_8UC3); |
||||
seamlessClone(img1Warped,img2, mask, center, output, NORMAL_CLONE); |
||||
imshow("Face_Swapped", output); |
||||
``` |
||||
|
||||
Even after warping, the results can look unnatural. To improve them, we apply seamless cloning,
which blends the warped face smoothly into the target image.
||||
|
||||
### Results |
||||
|
||||
Consider two images to be used for face swapping as follows : |
||||
|
||||
First image |
||||
----------- |
||||
|
||||
 |
||||
|
||||
Second image |
||||
------------ |
||||
|
||||
 |
||||
|
||||
Results after swapping |
||||
---------------------- |
||||
|
||||
 |
@ -0,0 +1,699 @@ |
||||
Face Recognition with OpenCV {#tutorial_face_main} |
||||
============================ |
||||
|
||||
[TOC] |
||||
|
||||
Introduction {#tutorial_face_intro} |
||||
============ |
||||
|
||||
[OpenCV (Open Source Computer Vision)](http://opencv.org) is a popular computer vision library |
||||
started by [Intel](http://www.intel.com) in 1999. The cross-platform library sets its focus on |
||||
real-time image processing and includes patent-free implementations of the latest computer vision |
||||
algorithms. In 2008 [Willow Garage](http://www.willowgarage.com) took over support and OpenCV 2.3.1 |
||||
now comes with a programming interface to C, C++, [Python](http://www.python.org) and |
||||
[Android](http://www.android.com). OpenCV is released under a BSD license so it is used in academic |
||||
projects and commercial products alike. |
||||
|
||||
OpenCV 2.4 now comes with the very new FaceRecognizer class for face recognition, so you can start |
||||
experimenting with face recognition right away. This document is the guide I've wished for, when I |
||||
was working myself into face recognition. It shows you how to perform face recognition with |
||||
FaceRecognizer in OpenCV (with full source code listings) and gives you an introduction into the |
||||
algorithms behind. I'll also show how to create the visualizations you can find in many |
||||
publications, because a lot of people asked for them.
||||
|
||||
The currently available algorithms are: |
||||
|
||||
- Eigenfaces (see EigenFaceRecognizer::create) |
||||
- Fisherfaces (see FisherFaceRecognizer::create) |
||||
- Local Binary Patterns Histograms (see LBPHFaceRecognizer::create) |
||||
|
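For a quick orientation, here is a minimal sketch of the common FaceRecognizer workflow. Loading of `images`, `labels` and `testSample` is omitted here and shown in the full listings later in this document; `using namespace cv::face;` is assumed:

@code{.cpp}
// Minimal sketch of the FaceRecognizer workflow (data loading omitted):
// 'images' holds grayscale face images of equal size, 'labels' the subject ids.
Ptr<EigenFaceRecognizer> model = EigenFaceRecognizer::create();
model->train(images, labels);
int predictedLabel = model->predict(testSample);
@endcode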
||||
You don't need to copy and paste the source code examples from this page, because they are available |
||||
in the src folder coming with this documentation. If you have built OpenCV with the samples turned |
||||
on, chances are good you have them compiled already! Although it might be interesting for very |
||||
advanced users, I've decided to leave the implementation details out as I am afraid they confuse new |
||||
users. |
||||
|
||||
All code in this document is released under the [BSD |
||||
license](http://www.opensource.org/licenses/bsd-license), so feel free to use it for your projects. |
||||
|
||||
Face Recognition {#tutorial_face_facerec} |
||||
---------------- |
||||
|
||||
Face recognition is an easy task for humans. Experiments in @cite Tu06 have shown, that even one to |
||||
three day old babies are able to distinguish between known faces. So how hard could it be for a |
||||
computer? It turns out we know little about human recognition to date. Are inner features (eyes, |
||||
nose, mouth) or outer features (head shape, hairline) used for a successful face recognition? How do |
||||
we analyze an image and how does the brain encode it? It was shown by [David |
||||
Hubel](http://en.wikipedia.org/wiki/David_H._Hubel) and [Torsten |
||||
Wiesel](http://en.wikipedia.org/wiki/Torsten_Wiesel), that our brain has specialized nerve cells |
||||
responding to specific local features of a scene, such as lines, edges, angles or movement. Since we |
||||
don't see the world as scattered pieces, our visual cortex must somehow combine the different |
||||
sources of information into useful patterns. Automatic face recognition is all about extracting |
||||
those meaningful features from an image, putting them into a useful representation and performing |
||||
some kind of classification on them. |
||||
|
||||
Face recognition based on the geometric features of a face is probably the most intuitive approach |
||||
to face recognition. One of the first automated face recognition systems was described in |
||||
@cite Kanade73 : marker points (position of eyes, ears, nose, ...) were used to build a feature vector |
||||
(distance between the points, angle between them, ...). The recognition was performed by calculating |
||||
the euclidean distance between feature vectors of a probe and reference image. Such a method is |
||||
robust against changes in illumination by its nature, but has a huge drawback: the accurate |
||||
registration of the marker points is complicated, even with state of the art algorithms. Some of the |
||||
latest work on geometric face recognition was carried out in @cite Bru92 . A 22-dimensional feature |
||||
vector was used and experiments on large datasets have shown, that geometrical features alone may not |
||||
carry enough information for face recognition. |
||||
|
||||
The Eigenfaces method described in @cite TP91 took a holistic approach to face recognition: A facial |
||||
image is a point from a high-dimensional image space and a lower-dimensional representation is |
||||
found, where classification becomes easy. The lower-dimensional subspace is found with Principal |
||||
Component Analysis, which identifies the axes with maximum variance. While this kind of |
||||
transformation is optimal from a reconstruction standpoint, it doesn't take any class labels into |
||||
account. Imagine a situation where the variance is generated from external sources, let it be light. |
||||
The axes with maximum variance do not necessarily contain any discriminative information at all, |
||||
hence a classification becomes impossible. So a class-specific projection with a Linear Discriminant |
||||
Analysis was applied to face recognition in @cite BHK97 . The basic idea is to minimize the variance |
||||
within a class, while maximizing the variance between the classes at the same time. |
||||
|
||||
Recently various methods for a local feature extraction emerged. To avoid the high-dimensionality of |
||||
the input data only local regions of an image are described, the extracted features are (hopefully) |
||||
more robust against partial occlusion, illumination and small sample size. Algorithms used for a local
feature extraction are Gabor Wavelets (@cite Wiskott97), the Discrete Cosine Transform (@cite Messer06) and
||||
Local Binary Patterns (@cite AHP04). It's still an open research question what's the best way to |
||||
preserve spatial information when applying a local feature extraction, because spatial information |
||||
is potentially useful information. |
||||
|
||||
Face Database {#tutorial_face_facedb} |
||||
------------- |
||||
|
||||
Let's get some data to experiment with first. I don't want to do a toy example here. We are doing |
||||
face recognition, so you'll need some face images! You can either create your own dataset or start |
||||
with one of the available face databases, |
||||
[<http://face-rec.org/databases/>](http://face-rec.org/databases) gives you an up-to-date overview. |
||||
Three interesting databases are (parts of the description are quoted from |
||||
[<http://face-rec.org>](http://face-rec.org)): |
||||
|
||||
- [AT&T Facedatabase](http://www.cl.cam.ac.uk/research/dtg/attarchive/facedatabase.html) The AT&T |
||||
Facedatabase, sometimes also referred to as *ORL Database of Faces*, contains ten different |
||||
images of each of 40 distinct subjects. For some subjects, the images were taken at different |
||||
times, varying the lighting, facial expressions (open / closed eyes, smiling / not smiling) and |
||||
facial details (glasses / no glasses). All the images were taken against a dark homogeneous |
||||
background with the subjects in an upright, frontal position (with tolerance for some side |
||||
movement). |
||||
- [Yale Facedatabase A](http://vision.ucsd.edu/content/yale-face-database), also known as |
||||
Yalefaces. The AT&T Facedatabase is good for initial tests, but it's a fairly easy database. The |
||||
Eigenfaces method already has a 97% recognition rate on it, so you won't see any great |
||||
improvements with other algorithms. The Yale Facedatabase A (also known as Yalefaces) is a more |
||||
appropriate dataset for initial experiments, because the recognition problem is harder. The |
||||
database consists of 15 people (14 male, 1 female) each with 11 grayscale images sized |
||||
\f$320 \times 243\f$ pixel. There are changes in the light conditions (center light, left light, |
||||
right light), facial expressions (happy, normal, sad, sleepy, surprised, wink) and glasses |
||||
(glasses, no-glasses). |
||||
|
||||
The original images are not cropped and aligned. Please look into the @ref face_appendix for a |
||||
Python script, that does the job for you. |
||||
|
||||
- [Extended Yale Facedatabase B](http://vision.ucsd.edu/~leekc/ExtYaleDatabase/ExtYaleB.html) The |
||||
Extended Yale Facedatabase B contains 2414 images of 38 different people in its cropped version. |
||||
The focus of this database is set on extracting features that are robust to illumination, the |
||||
images have almost no variation in emotion/occlusion/... . I personally think, that this dataset |
||||
is too large for the experiments I perform in this document. You had better use the [AT&T
Facedatabase](http://www.cl.cam.ac.uk/research/dtg/attarchive/facedatabase.html) for initial
||||
testing. A first version of the Yale Facedatabase B was used in @cite BHK97 to see how the |
||||
Eigenfaces and Fisherfaces method perform under heavy illumination changes. @cite Lee05 used the |
||||
same setup to take 16128 images of 28 people. The Extended Yale Facedatabase B is the merge of |
||||
the two databases, which is now known as Extended Yalefacedatabase B. |
||||
|
||||
### Preparing the data {#tutorial_face_prepare} |
||||
|
||||
Once we have acquired some data, we'll need to read it in our program. In the demo applications I |
||||
have decided to read the images from a very simple CSV file. Why? Because it's the simplest |
||||
platform-independent approach I can think of. However, if you know a simpler solution please ping me |
||||
about it. Basically all the CSV file needs to contain are lines composed of a filename followed by a |
||||
; followed by the label (as *integer number*), making up a line like this: |
||||
|
||||
@code{.csv} |
||||
/path/to/image.ext;0 |
||||
@endcode |
||||
|
||||
Let's dissect the line. /path/to/image.ext is the path to an image, probably something like this if |
||||
you are in Windows: C:/faces/person0/image0.jpg. Then there is the separator ; and finally we assign |
||||
the label 0 to the image. Think of the label as the subject (the person) this image belongs to, so |
||||
same subjects (persons) should have the same label. |
||||
|
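A minimal sketch of how such a file can be read into image and label vectors might look like this (the demo applications ship their own read_csv helper, which may differ in details; `<fstream>` and `<sstream>` are required):

@code{.cpp}
// Minimal CSV reader sketch: each line is "<path>;<label>".
static void read_csv(const string& filename, vector<Mat>& images,
                     vector<int>& labels, char separator = ';') {
    std::ifstream file(filename.c_str(), std::ifstream::in);
    if (!file) {
        CV_Error(Error::StsBadArg, "No valid input file was given.");
    }
    string line, path, classlabel;
    while (getline(file, line)) {
        std::stringstream liness(line);
        getline(liness, path, separator);
        getline(liness, classlabel);
        if (!path.empty() && !classlabel.empty()) {
            images.push_back(imread(path, IMREAD_GRAYSCALE));
            labels.push_back(atoi(classlabel.c_str()));
        }
    }
}
@endcode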
||||
Download the AT&T Facedatabase from AT&T Facedatabase and the corresponding CSV file from at.txt, |
||||
which looks like this (file is without ... of course): |
||||
|
||||
@code{.csv} |
||||
./at/s1/1.pgm;0 |
||||
./at/s1/2.pgm;0 |
||||
... |
||||
./at/s2/1.pgm;1 |
||||
./at/s2/2.pgm;1 |
||||
... |
||||
./at/s40/1.pgm;39 |
||||
./at/s40/2.pgm;39 |
||||
@endcode |
||||
|
||||
Imagine I have extracted the files to D:/data/at and have downloaded the CSV file to D:/data/at.txt. |
||||
Then you would simply need to Search & Replace ./ with D:/data/. You can do that in an editor of |
||||
your choice, every sufficiently advanced editor can do this. Once you have a CSV file with valid |
||||
filenames and labels, you can run any of the demos by passing the path to the CSV file as parameter: |
||||
|
||||
@code{.sh} |
||||
facerec_demo.exe D:/data/at.txt |
||||
@endcode |
||||
|
||||
Please, see @ref tutorial_face_appendix_csv for details on creating CSV file. |
||||
|
||||
Eigenfaces {#tutorial_face_eigenfaces} |
||||
---------- |
||||
|
||||
The problem with the image representation we are given is its high dimensionality. Two-dimensional |
||||
\f$p \times q\f$ grayscale images span a \f$m = pq\f$-dimensional vector space, so an image with |
||||
\f$100 \times 100\f$ pixels lies in a \f$10,000\f$-dimensional image space already. The question is: Are all |
||||
dimensions equally useful for us? We can only make a decision if there's any variance in data, so |
||||
what we are looking for are the components that account for most of the information. The Principal |
||||
Component Analysis (PCA) was independently proposed by [Karl |
||||
Pearson](http://en.wikipedia.org/wiki/Karl_Pearson) (1901) and [Harold |
||||
Hotelling](http://en.wikipedia.org/wiki/Harold_Hotelling) (1933) to turn a set of possibly |
||||
correlated variables into a smaller set of uncorrelated variables. The idea is, that a |
||||
high-dimensional dataset is often described by correlated variables and therefore only a few |
||||
meaningful dimensions account for most of the information. The PCA method finds the directions with |
||||
the greatest variance in the data, called principal components. |
||||
|
||||
### Algorithmic Description of Eigenfaces method {#tutorial_face_eigenfaces_algo} |
||||
|
||||
Let \f$X = \{ x_{1}, x_{2}, \ldots, x_{n} \}\f$ be a random vector with observations \f$x_i \in R^{d}\f$. |
||||
|
||||
1. Compute the mean \f$\mu\f$ |
||||
|
||||
\f[\mu = \frac{1}{n} \sum_{i=1}^{n} x_{i}\f] |
||||
|
||||
2. Compute the Covariance Matrix \f$S\f$
||||
|
||||
\f[S = \frac{1}{n} \sum_{i=1}^{n} (x_{i} - \mu) (x_{i} - \mu)^{T}\f]
||||
|
||||
3. Compute the eigenvalues \f$\lambda_{i}\f$ and eigenvectors \f$v_{i}\f$ of \f$S\f$ |
||||
|
||||
\f[S v_{i} = \lambda_{i} v_{i}, i=1,2,\ldots,n\f] |
||||
|
||||
4. Order the eigenvectors descending by their eigenvalue. The \f$k\f$ principal components are the |
||||
eigenvectors corresponding to the \f$k\f$ largest eigenvalues. |
||||
|
||||
The \f$k\f$ principal components of the observed vector \f$x\f$ are then given by: |
||||
|
||||
\f[y = W^{T} (x - \mu)\f] |
||||
|
||||
where \f$W = (v_{1}, v_{2}, \ldots, v_{k})\f$. |
||||
|
||||
The reconstruction from the PCA basis is given by: |
||||
|
||||
\f[x = W y + \mu\f] |
||||
|
||||
where \f$W = (v_{1}, v_{2}, \ldots, v_{k})\f$. |
||||
|
||||
The Eigenfaces method then performs face recognition by: |
||||
|
||||
- Projecting all training samples into the PCA subspace. |
||||
- Projecting the query image into the PCA subspace. |
||||
- Finding the nearest neighbor between the projected training images and the projected query |
||||
image. |
||||
|
||||
Still there's one problem left to solve. Imagine we are given \f$400\f$ images sized \f$100 \times 100\f$ |
||||
pixel. The Principal Component Analysis solves the covariance matrix \f$S = X X^{T}\f$, where |
||||
\f${size}(X) = 10000 \times 400\f$ in our example. You would end up with a \f$10000 \times 10000\f$ matrix, |
||||
roughly \f$0.8 GB\f$. Solving this problem isn't feasible, so we'll need to apply a trick. From your |
||||
linear algebra lessons you know that a \f$M \times N\f$ matrix with \f$M > N\f$ can only have \f$N - 1\f$ |
||||
non-zero eigenvalues. So it's possible to take the eigenvalue decomposition \f$S = X^{T} X\f$ of size |
||||
\f$N \times N\f$ instead: |
||||
|
||||
\f[X^{T} X v_{i} = \lambda_{i} v_{i}\f]
||||
|
||||
and get the original eigenvectors of \f$S = X X^{T}\f$ with a left multiplication of the data matrix: |
||||
|
||||
\f[X X^{T} (X v_{i}) = \lambda_{i} (X v_{i})\f] |
||||
|
||||
The resulting eigenvectors are orthogonal, to get orthonormal eigenvectors they need to be |
||||
normalized to unit length. I don't want to turn this into a publication, so please look into |
||||
@cite Duda01 for the derivation and proof of the equations. |
||||
|
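To make the projection and reconstruction formulas concrete, here is a small illustrative sketch using cv::PCA. This is not the FaceRecognizer implementation, just a demonstration of the math above:

@code{.cpp}
// 'data' holds one flattened training image per row (CV_32F).
PCA pca(data, Mat(), PCA::DATA_AS_ROW, 50);   // keep the first 50 principal components
Mat x = data.row(0);                          // some probe image, already flattened
Mat y = pca.project(x);                       // y = W^T (x - mu)
Mat x_hat = pca.backProject(y);               // x ~ W y + mu
double reconstruction_error = norm(x, x_hat, NORM_L2);
@endcode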
||||
### Eigenfaces in OpenCV {#tutorial_face_eigenfaces_use} |
||||
|
||||
For the first source code example, I'll go through it with you. I am first giving you the whole |
||||
source code listing, and after this we'll look at the most important lines in detail. Please note: |
||||
every source code listing is commented in detail, so you should have no problems following it. |
||||
|
||||
The source code for this demo application is also available in the src folder coming with this |
||||
documentation: |
||||
|
||||
@include face/samples/facerec_eigenfaces.cpp |
||||
|
||||
I've used the jet colormap, so you can see how the grayscale values are distributed within the |
||||
specific Eigenfaces. You can see, that the Eigenfaces do not only encode facial features, but also |
||||
the illumination in the images (see the left light in Eigenface \#4, right light in Eigenfaces \#5): |
||||
|
||||
 |
||||
|
||||
We've already seen, that we can reconstruct a face from its lower dimensional approximation. So |
||||
let's see how many Eigenfaces are needed for a good reconstruction. I'll do a subplot with |
||||
\f$10,30,\ldots,310\f$ Eigenfaces: |
||||
|
||||
@code{.cpp} |
||||
// Display or save the image reconstruction at some predefined steps: |
||||
for(int num_components = 10; num_components < 300; num_components+=15) { |
||||
// slice the eigenvectors from the model |
||||
Mat evs = Mat(W, Range::all(), Range(0, num_components)); |
||||
Mat projection = LDA::subspaceProject(evs, mean, images[0].reshape(1,1)); |
||||
Mat reconstruction = LDA::subspaceReconstruct(evs, mean, projection); |
||||
// Normalize the result: |
||||
reconstruction = norm_0_255(reconstruction.reshape(1, images[0].rows)); |
||||
// Display or save: |
||||
if(argc == 2) { |
||||
imshow(format("eigenface_reconstruction_%d", num_components), reconstruction); |
||||
} else { |
||||
imwrite(format("%s/eigenface_reconstruction_%d.png", output_folder.c_str(), num_components), reconstruction); |
||||
} |
||||
} |
||||
@endcode |
||||
|
||||
10 Eigenvectors are obviously not sufficient for a good image reconstruction, 50 Eigenvectors may |
||||
already be sufficient to encode important facial features. You'll get a good reconstruction with |
||||
approximately 300 Eigenvectors for the AT&T Facedatabase. There are rules of thumb for how many
||||
Eigenfaces you should choose for a successful face recognition, but it heavily depends on the input |
||||
data. @cite Zhao03 is the perfect point to start researching for this: |
||||
|
||||
 |
||||
|
||||
Fisherfaces {#tutorial_face_fisherfaces} |
||||
----------- |
||||
|
||||
The Principal Component Analysis (PCA), which is the core of the Eigenfaces method, finds a linear |
||||
combination of features that maximizes the total variance in data. While this is clearly a powerful |
||||
way to represent data, it doesn't consider any classes and so a lot of discriminative information |
||||
*may* be lost when throwing components away. Imagine a situation where the variance in your data is |
||||
generated by an external source, let it be the light. The components identified by a PCA do not |
||||
necessarily contain any discriminative information at all, so the projected samples are smeared |
||||
together and a classification becomes impossible (see |
||||
[<http://www.bytefish.de/wiki/pca_lda_with_gnu_octave>](http://www.bytefish.de/wiki/pca_lda_with_gnu_octave) |
||||
for an example). |
||||
|
||||
The Linear Discriminant Analysis performs a class-specific dimensionality reduction and was invented |
||||
by the great statistician [Sir R. A. Fisher](http://en.wikipedia.org/wiki/Ronald_Fisher). He |
||||
successfully used it for classifying flowers in his 1936 paper *The use of multiple measurements in |
||||
taxonomic problems* @cite Fisher36 . In order to find the combination of features that separates best |
||||
between classes the Linear Discriminant Analysis maximizes the ratio of between-classes to |
||||
within-classes scatter, instead of maximizing the overall scatter. The idea is simple: same classes |
||||
should cluster tightly together, while different classes are as far away as possible from each other |
||||
in the lower-dimensional representation. This was also recognized by |
||||
[Belhumeur](http://www.cs.columbia.edu/~belhumeur/), [Hespanha](http://www.ece.ucsb.edu/~hespanha/) |
||||
and [Kriegman](http://cseweb.ucsd.edu/~kriegman/) and so they applied a Discriminant Analysis to |
||||
face recognition in @cite BHK97 . |
||||
|
||||
### Algorithmic Description of Fisherfaces method {#tutorial_face_fisherfaces_algo} |
||||
|
||||
Let \f$X\f$ be a random vector with samples drawn from \f$c\f$ classes: |
||||
|
||||
\f[\begin{align*} |
||||
X & = & \{X_1,X_2,\ldots,X_c\} \\ |
||||
X_i & = & \{x_1, x_2, \ldots, x_n\} |
||||
\end{align*}\f] |
||||
|
||||
The scatter matrices \f$S_{B}\f$ and \f$S_{W}\f$ are calculated as:
||||
|
||||
\f[\begin{align*} |
||||
S_{B} & = & \sum_{i=1}^{c} N_{i} (\mu_i - \mu)(\mu_i - \mu)^{T} \\ |
||||
S_{W} & = & \sum_{i=1}^{c} \sum_{x_{j} \in X_{i}} (x_j - \mu_i)(x_j - \mu_i)^{T} |
||||
\end{align*}\f] |
||||
|
||||
where \f$\mu\f$ is the total mean:
||||
|
||||
\f[\mu = \frac{1}{N} \sum_{i=1}^{N} x_i\f] |
||||
|
||||
And \f$\mu_i\f$ is the mean of class \f$i \in \{1,\ldots,c\}\f$: |
||||
|
||||
\f[\mu_i = \frac{1}{|X_i|} \sum_{x_j \in X_i} x_j\f] |
||||
|
||||
Fisher's classic algorithm now looks for a projection \f$W\f$, that maximizes the class separability |
||||
criterion: |
||||
|
||||
\f[W_{opt} = \operatorname{arg\,max}_{W} \frac{|W^T S_B W|}{|W^T S_W W|}\f] |
||||
|
||||
Following @cite BHK97, a solution for this optimization problem is given by solving the General |
||||
Eigenvalue Problem: |
||||
|
||||
\f[\begin{align*} |
||||
S_{B} v_{i} & = & \lambda_{i} S_w v_{i} \nonumber \\ |
||||
S_{W}^{-1} S_{B} v_{i} & = & \lambda_{i} v_{i} |
||||
\end{align*}\f] |
||||
|
||||
There's one problem left to solve: The rank of \f$S_{W}\f$ is at most \f$(N-c)\f$, with \f$N\f$ samples and \f$c\f$ |
||||
classes. In pattern recognition problems the number of samples \f$N\f$ is almost always smaller than the
||||
dimension of the input data (the number of pixels), so the scatter matrix \f$S_{W}\f$ becomes singular |
||||
(see @cite RJ91). In @cite BHK97 this was solved by performing a Principal Component Analysis on the |
||||
data and projecting the samples into the \f$(N-c)\f$-dimensional space. A Linear Discriminant Analysis |
||||
was then performed on the reduced data, because \f$S_{W}\f$ isn't singular anymore. |
||||
|
||||
The optimization problem can then be rewritten as: |
||||
|
||||
\f[\begin{align*} |
||||
W_{pca} & = & \operatorname{arg\,max}_{W} |W^T S_T W| \\ |
||||
W_{fld} & = & \operatorname{arg\,max}_{W} \frac{|W^T W_{pca}^T S_{B} W_{pca} W|}{|W^T W_{pca}^T S_{W} W_{pca} W|} |
||||
\end{align*}\f] |
||||
|
||||
The transformation matrix \f$W\f$, that projects a sample into the \f$(c-1)\f$-dimensional space is then |
||||
given by: |
||||
|
||||
\f[W = W_{fld}^{T} W_{pca}^{T}\f] |
||||
|
||||
### Fisherfaces in OpenCV {#tutorial_face_fisherfaces_use} |
||||
|
||||
The source code for this demo application is also available in the src folder coming with this |
||||
documentation: |
||||
|
||||
@include face/samples/facerec_fisherfaces.cpp |
||||
|
||||
For this example I am going to use the Yale Facedatabase A, just because the plots are nicer. Each |
||||
Fisherface has the same length as an original image, thus it can be displayed as an image. The demo |
||||
shows (or saves) the first, at most 16 Fisherfaces: |
||||
|
||||
 |
||||
|
||||
The Fisherfaces method learns a class-specific transformation matrix, so they do not capture
||||
illumination as obviously as the Eigenfaces method. The Discriminant Analysis instead finds the |
||||
facial features to discriminate between the persons. It's important to mention, that the performance |
||||
of the Fisherfaces heavily depends on the input data as well. Practically said: if you learn the |
||||
Fisherfaces for well-illuminated pictures only and then try to recognize faces in badly illuminated
scenes, the method is likely to find the wrong components (just because those features may not be
predominant in badly illuminated images). This is somewhat logical, since the method had no chance to
||||
learn the illumination. |
||||
|
||||
The Fisherfaces allow a reconstruction of the projected image, just like the Eigenfaces did. But |
||||
since we only identified the features to distinguish between subjects, you can't expect a nice |
||||
reconstruction of the original image. For the Fisherfaces method we'll project the sample image onto |
||||
each of the Fisherfaces instead. So you'll have a nice visualization, which feature each of the |
||||
Fisherfaces describes: |
||||
|
||||
@code{.cpp} |
||||
// Display or save the image reconstruction at some predefined steps: |
||||
for(int num_component = 0; num_component < min(16, W.cols); num_component++) { |
||||
// Slice the Fisherface from the model: |
||||
Mat ev = W.col(num_component); |
||||
Mat projection = LDA::subspaceProject(ev, mean, images[0].reshape(1,1)); |
||||
Mat reconstruction = LDA::subspaceReconstruct(ev, mean, projection); |
||||
// Normalize the result: |
||||
reconstruction = norm_0_255(reconstruction.reshape(1, images[0].rows)); |
||||
// Display or save: |
||||
if(argc == 2) { |
||||
imshow(format("fisherface_reconstruction_%d", num_component), reconstruction); |
||||
} else { |
||||
imwrite(format("%s/fisherface_reconstruction_%d.png", output_folder.c_str(), num_component), reconstruction); |
||||
} |
||||
} |
||||
@endcode |
||||
|
||||
The differences may be subtle for the human eyes, but you should be able to see some differences: |
||||
|
||||
 |
||||
|
||||
Local Binary Patterns Histograms {#tutorial_face_lbph} |
||||
-------------------------------- |
||||
|
||||
Eigenfaces and Fisherfaces take a somewhat holistic approach to face recognition. You treat your |
||||
data as a vector somewhere in a high-dimensional image space. We all know high-dimensionality is |
||||
bad, so a lower-dimensional subspace is identified, where (probably) useful information is |
||||
preserved. The Eigenfaces approach maximizes the total scatter, which can lead to problems if the |
||||
variance is generated by an external source, because components with a maximum variance over all |
||||
classes aren't necessarily useful for classification (see |
||||
[<http://www.bytefish.de/wiki/pca_lda_with_gnu_octave>](http://www.bytefish.de/wiki/pca_lda_with_gnu_octave)). |
||||
So to preserve some discriminative information we applied a Linear Discriminant Analysis and |
||||
optimized as described in the Fisherfaces method. The Fisherfaces method worked great... at least |
||||
for the constrained scenario we've assumed in our model. |
||||
|
||||
Now real life isn't perfect. You simply can't guarantee perfect light settings in your images or 10 |
||||
different images of a person. So what if there's only one image for each person? Our covariance |
||||
estimates for the subspace *may* be horribly wrong, so will the recognition. Remember the Eigenfaces |
||||
method had a 96% recognition rate on the AT&T Facedatabase? How many images do we actually need to |
||||
get such useful estimates? Here are the Rank-1 recognition rates of the Eigenfaces and Fisherfaces |
||||
method on the AT&T Facedatabase, which is a fairly easy image database: |
||||
|
||||
 |
||||
|
||||
So in order to get good recognition rates you'll need at least 8(+-1) images for each person and the |
||||
Fisherfaces method doesn't really help here. The above experiment is a 10-fold cross validated |
||||
result carried out with the facerec framework at: |
||||
[<https://github.com/bytefish/facerec>](https://github.com/bytefish/facerec). This is not a |
||||
publication, so I won't back these figures with a deep mathematical analysis. Please have a look |
||||
into @cite KM01 for a detailed analysis of both methods, when it comes to small training datasets. |
||||
|
||||
So some research concentrated on extracting local features from images. The idea is to not look at |
||||
the whole image as a high-dimensional vector, but describe only local features of an object. The |
||||
features you extract this way will have a low-dimensionality implicitly. A fine idea! But you'll |
||||
soon observe the image representation we are given doesn't only suffer from illumination variations. |
||||
Think of things like scale, translation or rotation in images - your local description has to be at |
||||
least a bit robust against those things. Just like SIFT, the Local Binary Patterns methodology has |
||||
its roots in 2D texture analysis. The basic idea of Local Binary Patterns is to summarize the local |
||||
structure in an image by comparing each pixel with its neighborhood. Take a pixel as center and |
||||
threshold its neighbors against. If the intensity of the center pixel is greater-equal its neighbor, |
||||
then denote it with 1 and 0 if not. You'll end up with a binary number for each pixel, just like |
||||
11001111. So with 8 surrounding pixels you'll end up with 2\^8 possible combinations, called *Local |
||||
Binary Patterns* or sometimes referred to as *LBP codes*. The first LBP operator described in |
||||
literature actually used a fixed 3 x 3 neighborhood just like this: |
||||
|
||||
 |
||||
|
||||
### Algorithmic Description of LBPH method {#tutorial_face_lbph_algo} |
||||
|
||||
A more formal description of the LBP operator can be given as: |
||||
|
||||
\f[LBP(x_c, y_c) = \sum_{p=0}^{P-1} 2^p s(i_p - i_c)\f] |
||||
|
||||
with \f$(x_c, y_c)\f$ as the central pixel with intensity \f$i_c\f$, and \f$i_p\f$ being the intensity of the
neighbor pixel. \f$s\f$ is the sign function defined as:
||||
|
||||
\f[\begin{equation} |
||||
s(x) = |
||||
\begin{cases} |
||||
1 & \text{if \(x \geq 0\)}\\ |
||||
0 & \text{else} |
||||
\end{cases} |
||||
\end{equation}\f] |
||||
|
||||
This description enables you to capture very fine grained details in images. In fact the authors |
||||
were able to compete with state of the art results for texture classification. Soon after the |
||||
operator was published it was noted, that a fixed neighborhood fails to encode details differing in |
||||
scale. So the operator was extended to use a variable neighborhood in @cite AHP04 . The idea is to |
||||
align an arbitrary number of neighbors on a circle with a variable radius, which makes it possible to capture
||||
the following neighborhoods: |
||||
|
||||
 |
||||
|
||||
For a given Point \f$(x_c,y_c)\f$ the position of the neighbor \f$(x_p,y_p), p \in P\f$ can be calculated |
||||
by: |
||||
|
||||
\f[\begin{align*} |
||||
x_{p} & = & x_c + R \cos({\frac{2\pi p}{P}})\\ |
||||
y_{p} & = & y_c - R \sin({\frac{2\pi p}{P}}) |
||||
\end{align*}\f] |
||||
|
||||
Where \f$R\f$ is the radius of the circle and \f$P\f$ is the number of sample points. |
||||
|
||||
The operator is an extension to the original LBP codes, so it's sometimes called *Extended LBP* |
||||
(also referred to as *Circular LBP*). If a point's coordinate on the circle doesn't correspond to
image coordinates, the point gets interpolated. Computer science has a bunch of clever
||||
interpolation schemes, the OpenCV implementation does a bilinear interpolation: |
||||
|
||||
\f[\begin{align*} |
||||
f(x,y) \approx \begin{bmatrix} |
||||
1-x & x \end{bmatrix} \begin{bmatrix} |
||||
f(0,0) & f(0,1) \\ |
||||
f(1,0) & f(1,1) \end{bmatrix} \begin{bmatrix} |
||||
1-y \\ |
||||
y \end{bmatrix}. |
||||
\end{align*}\f] |
||||
|
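The following small sketch shows how one circular neighbor could be sampled with the coordinate and bilinear interpolation formulas above. It is illustrative only; the actual OpenCV implementation differs in structure:

@code{.cpp}
// Sketch: intensity of the p-th neighbor on a circle of radius R around (x_c, y_c),
// sampled with bilinear interpolation (illustrative only).
float circular_neighbor(const Mat& gray, int x_c, int y_c, int p, int P, float R) {
    float x = x_c + R * cos(2.0f * (float)CV_PI * p / P);
    float y = y_c - R * sin(2.0f * (float)CV_PI * p / P);
    int fx = (int)floor(x), fy = (int)floor(y);
    float tx = x - fx, ty = y - fy;
    // bilinear weights
    float w1 = (1 - tx) * (1 - ty), w2 = tx * (1 - ty);
    float w3 = (1 - tx) * ty,       w4 = tx * ty;
    return w1 * gray.at<uchar>(fy,     fx    ) + w2 * gray.at<uchar>(fy,     fx + 1)
         + w3 * gray.at<uchar>(fy + 1, fx    ) + w4 * gray.at<uchar>(fy + 1, fx + 1);
}
@endcode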
||||
By definition the LBP operator is robust against monotonic gray scale transformations. We can easily |
||||
verify this by looking at the LBP image of an artificially modified image (so you see what an LBP |
||||
image looks like!): |
||||
|
||||
 |
||||
|
||||
So what's left to do is how to incorporate the spatial information in the face recognition model. |
||||
The representation proposed by Ahonen et al. @cite AHP04 is to divide the LBP image into \f$m\f$ local
||||
regions and extract a histogram from each. The spatially enhanced feature vector is then obtained by |
||||
concatenating the local histograms (**not merging them**). These histograms are called *Local Binary |
||||
Patterns Histograms*. |
||||
|
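To illustrate the operator, here is a small sketch of the basic, fixed 3 x 3 LBP operator. The LBPH implementation in OpenCV uses the extended, circular variant, so take this (and the hypothetical helper name) only as an illustration:

@code{.cpp}
// Basic 3x3 LBP sketch: threshold the 8 neighbors against the center pixel
// and pack the comparison results into one byte per pixel.
Mat lbp_basic(const Mat& gray) {
    Mat lbp = Mat::zeros(gray.size(), CV_8UC1);
    for(int y = 1; y < gray.rows - 1; y++) {
        for(int x = 1; x < gray.cols - 1; x++) {
            uchar c = gray.at<uchar>(y, x);
            uchar code = 0;
            code |= (gray.at<uchar>(y-1, x-1) >= c) << 7;
            code |= (gray.at<uchar>(y-1, x  ) >= c) << 6;
            code |= (gray.at<uchar>(y-1, x+1) >= c) << 5;
            code |= (gray.at<uchar>(y  , x+1) >= c) << 4;
            code |= (gray.at<uchar>(y+1, x+1) >= c) << 3;
            code |= (gray.at<uchar>(y+1, x  ) >= c) << 2;
            code |= (gray.at<uchar>(y+1, x-1) >= c) << 1;
            code |= (gray.at<uchar>(y  , x-1) >= c) << 0;
            lbp.at<uchar>(y, x) = code;
        }
    }
    return lbp;
}
@endcode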
||||
### Local Binary Patterns Histograms in OpenCV {#tutorial_face_lbph_use} |
||||
|
||||
The source code for this demo application is also available in the src folder coming with this |
||||
documentation: |
||||
|
||||
@include face/samples/facerec_lbph.cpp |
||||
|
||||
Conclusion {#tutorial_face_conclusion} |
||||
---------- |
||||
|
||||
You've learned how to use the new FaceRecognizer in real applications. After reading the document |
||||
you also know how the algorithms work, so now it's time for you to experiment with the available |
||||
algorithms. Use them, improve them and let the OpenCV community participate! |
||||
|
||||
Credits {#tutorial_face_credits} |
||||
------- |
||||
|
||||
This document wouldn't be possible without the kind permission to use the face images of the *AT&T |
||||
Database of Faces* and the *Yale Facedatabase A/B*. |
||||
|
||||
### The Database of Faces {#tutorial_face_credits_db} |
||||
|
||||
__Important: when using these images, please give credit to "AT&T Laboratories, Cambridge."__ |
||||
|
||||
The Database of Faces, formerly *The ORL Database of Faces*, contains a set of face images taken |
||||
between April 1992 and April 1994. The database was used in the context of a face recognition |
||||
project carried out in collaboration with the Speech, Vision and Robotics Group of the Cambridge |
||||
University Engineering Department. |
||||
|
||||
There are ten different images of each of 40 distinct subjects. For some subjects, the images were |
||||
taken at different times, varying the lighting, facial expressions (open / closed eyes, smiling / |
||||
not smiling) and facial details (glasses / no glasses). All the images were taken against a dark |
||||
homogeneous background with the subjects in an upright, frontal position (with tolerance for some |
||||
side movement). |
||||
|
||||
The files are in PGM format. The size of each image is 92x112 pixels, with 256 grey levels per |
||||
pixel. The images are organised in 40 directories (one for each subject), which have names of the |
||||
form sX, where X indicates the subject number (between 1 and 40). In each of these directories, |
||||
there are ten different images of that subject, which have names of the form Y.pgm, where Y is the |
||||
image number for that subject (between 1 and 10). |
||||
|
||||
A copy of the database can be retrieved from: |
||||
[<http://www.cl.cam.ac.uk/research/dtg/attarchive/pub/data/att_faces.zip>](http://www.cl.cam.ac.uk/research/dtg/attarchive/pub/data/att_faces.zip). |
||||
|
||||
### Yale Facedatabase A {#tutorial_face_credits_yalea} |
||||
|
||||
*With the permission of the authors I am allowed to show a small number of images (say subject 1 and |
||||
all the variations) and all images such as Fisherfaces and Eigenfaces from either Yale Facedatabase |
||||
A or the Yale Facedatabase B.* |
||||
|
||||
The Yale Face Database A (size 6.4MB) contains 165 grayscale images in GIF format of 15 individuals. |
||||
There are 11 images per subject, one per different facial expression or configuration: center-light, |
||||
w/glasses, happy, left-light, w/no glasses, normal, right-light, sad, sleepy, surprised, and wink. |
||||
(Source: |
||||
[<http://cvc.yale.edu/projects/yalefaces/yalefaces.html>](http://cvc.yale.edu/projects/yalefaces/yalefaces.html)) |
||||
|
||||
### Yale Facedatabase B {#tutorial_face_credits_yaleb} |
||||
|
||||
*With the permission of the authors I am allowed to show a small number of images (say subject 1 and |
||||
all the variations) and all images such as Fisherfaces and Eigenfaces from either Yale Facedatabase |
||||
A or the Yale Facedatabase B.* |
||||
|
||||
The extended Yale Face Database B contains 16128 images of 28 human subjects under 9 poses and 64 |
||||
illumination conditions. The data format of this database is the same as the Yale Face Database B. |
||||
Please refer to the homepage of the Yale Face Database B (or one copy of this page) for more |
||||
detailed information of the data format. |
||||
|
||||
You are free to use the extended Yale Face Database B for research purposes. All publications which |
||||
use this database should acknowledge the use of "the Extended Yale Face Database B" and reference
||||
Athinodoros Georghiades, Peter Belhumeur, and David Kriegman's paper, "From Few to Many: |
||||
Illumination Cone Models for Face Recognition under Variable Lighting and Pose", PAMI, 2001, |
||||
[[bibtex]](http://vision.ucsd.edu/~leekc/ExtYaleDatabase/athosref.html). |
||||
|
||||
The extended database as opposed to the original Yale Face Database B with 10 subjects was first |
||||
reported by Kuang-Chih Lee, Jeffrey Ho, and David Kriegman in "Acquiring Linear Subspaces for Face |
||||
Recognition under Variable Lighting, PAMI, May, 2005 |
||||
[[pdf]](http://vision.ucsd.edu/~leekc/papers/9pltsIEEE.pdf)." All test image data used in the |
||||
experiments are manually aligned, cropped, and then re-sized to 168x192 images. If you publish your |
||||
experimental results with the cropped images, please reference the PAMI2005 paper as well. (Source: |
||||
[<http://vision.ucsd.edu/~leekc/ExtYaleDatabase/ExtYaleB.html>](http://vision.ucsd.edu/~leekc/ExtYaleDatabase/ExtYaleB.html)) |
||||
|
||||
Appendix {#face_appendix} |
||||
-------- |
||||
|
||||
### Creating the CSV File {#tutorial_face_appendix_csv} |
||||
|
||||
You don't really want to create the CSV file by hand. I have prepared you a little Python script |
||||
`create_csv.py` (you find it at `src/create_csv.py` coming with this tutorial) that automatically |
||||
creates the CSV file for you. If you have your images in a hierarchy like this
||||
(`/basepath/<subject>/<image.ext>`): |
||||
|
||||
@code{.sh} |
||||
philipp@mango:~/facerec/data/at$ tree |
||||
. |
||||
|-- s1 |
||||
| |-- 1.pgm |
||||
| |-- ... |
||||
| |-- 10.pgm |
||||
|-- s2 |
||||
| |-- 1.pgm |
||||
| |-- ... |
||||
| |-- 10.pgm |
||||
... |
||||
|-- s40 |
||||
| |-- 1.pgm |
||||
| |-- ... |
||||
| |-- 10.pgm |
||||
@endcode |
||||
|
||||
Then simply call `create_csv.py at`, with 'at' being the base path to the folder, and you can save the
output:
||||
|
||||
@code{.sh} |
||||
philipp@mango:~/facerec/data$ python create_csv.py at |
||||
at/s13/2.pgm;0 |
||||
at/s13/7.pgm;0 |
||||
at/s13/6.pgm;0 |
||||
at/s13/9.pgm;0 |
||||
at/s13/5.pgm;0 |
||||
at/s13/3.pgm;0 |
||||
at/s13/4.pgm;0 |
||||
at/s13/10.pgm;0 |
||||
at/s13/8.pgm;0 |
||||
at/s13/1.pgm;0 |
||||
at/s17/2.pgm;1 |
||||
at/s17/7.pgm;1 |
||||
at/s17/6.pgm;1 |
||||
at/s17/9.pgm;1 |
||||
at/s17/5.pgm;1 |
||||
at/s17/3.pgm;1 |
||||
[...] |
||||
@endcode |
||||
|
||||
Here is the script, if you can't find it: |
||||
|
||||
@verbinclude face/samples/etc/create_csv.py |
||||
|
||||
### Aligning Face Images {#tutorial_face_appendix_align} |
||||
|
||||
An accurate alignment of your image data is especially important in tasks like emotion detection, |
||||
where you need as much detail as possible. Believe me... You don't want to do this by hand. So I've
||||
prepared you a tiny Python script. The code is really easy to use. To scale, rotate and crop the |
||||
face image you just need to call *CropFace(image, eye_left, eye_right, offset_pct, dest_sz)*, |
||||
where: |
||||
|
||||
- *eye_left* is the position of the left eye |
||||
- *eye_right* is the position of the right eye |
||||
- *offset_pct* is the percent of the image you want to keep next to the eyes (horizontal, |
||||
vertical direction) |
||||
- *dest_sz* is the size of the output image |
||||
|
||||
If you are using the same *offset_pct* and *dest_sz* for your images, they are all aligned at the |
||||
eyes. |
||||
|
||||
@verbinclude face/samples/etc/crop_face.py |
||||
|
||||
Imagine we are given [this photo of Arnold |
||||
Schwarzenegger](http://en.wikipedia.org/wiki/File:Arnold_Schwarzenegger_edit%28ws%29.jpg), which is |
||||
under a Public Domain license. The (x,y)-position of the eyes is approximately *(252,364)* for the |
||||
left and *(420,366)* for the right eye. Now you only need to define the horizontal offset, vertical |
||||
offset and the size your scaled, rotated & cropped face should have. |
||||
|
||||
Here are some examples: |
||||
|
||||
Configuration | Cropped, Scaled, Rotated Face |
||||
--------------------------------|------------------------------------------------------------------ |
||||
0.1 (10%), 0.1 (10%), (200,200) |  |
||||
0.2 (20%), 0.2 (20%), (200,200) |  |
||||
0.3 (30%), 0.3 (30%), (200,200) |  |
||||
0.2 (20%), 0.2 (20%), (70,70) |  |
||||
|
||||
### CSV for the AT&T Facedatabase {#tutorial_face_appendix_attcsv} |
||||
|
||||
@verbinclude face/samples/etc/at.txt |
@ -1,699 +1,44 @@ |
||||
Face Recognition with OpenCV {#tutorial_face_main} |
||||
============================ |
||||
Tutorials for face module {#tutorial_table_of_content_face} |
||||
========================= |
||||
|
||||
[TOC] |
||||
- @subpage tutorial_face_main |
||||
|
||||
Introduction {#tutorial_face_intro} |
||||
============ |
||||
Face Recognition using OpenCV |
||||
|
||||
[OpenCV (Open Source Computer Vision)](http://opencv.org) is a popular computer vision library |
||||
started by [Intel](http://www.intel.com) in 1999. The cross-platform library sets its focus on |
||||
real-time image processing and includes patent-free implementations of the latest computer vision |
||||
algorithms. In 2008 [Willow Garage](http://www.willowgarage.com) took over support and OpenCV 2.3.1 |
||||
now comes with a programming interface to C, C++, [Python](http://www.python.org) and |
||||
[Android](http://www.android.com). OpenCV is released under a BSD license so it is used in academic |
||||
projects and commercial products alike. |
||||
- @subpage tutorial_face_landmark_detection_in_an_image |
||||
|
||||
OpenCV 2.4 now comes with the very new FaceRecognizer class for face recognition, so you can start |
||||
experimenting with face recognition right away. This document is the guide I've wished for, when I |
||||
was working myself into face recognition. It shows you how to perform face recognition with |
||||
FaceRecognizer in OpenCV (with full source code listings) and gives you an introduction into the |
||||
algorithms behind. I'll also show how to create the visualizations you can find in many |
||||
publications, because a lot of people asked for. |
||||
*Compatibility:* \> OpenCV 3.3 |
||||
|
||||
The currently available algorithms are: |
||||
*Author:* Sukhad Anand |
||||
|
||||
- Eigenfaces (see EigenFaceRecognizer::create) |
||||
- Fisherfaces (see FisherFaceRecognizer::create) |
||||
- Local Binary Patterns Histograms (see LBPHFaceRecognizer::create) |
||||
*Mentor:* Steven Puttemans |
||||
|
||||
You don't need to copy and paste the source code examples from this page, because they are available |
||||
in the src folder coming with this documentation. If you have built OpenCV with the samples turned |
||||
on, chances are good you have them compiled already! Although it might be interesting for very |
||||
advanced users, I've decided to leave the implementation details out as I am afraid they confuse new |
||||
users. |
||||
Face landmark detection in an image using an ensemble of regression trees
||||
|
||||
All code in this document is released under the [BSD |
||||
license](http://www.opensource.org/licenses/bsd-license), so feel free to use it for your projects. |
||||
- @subpage tutorial_face_training_face_landmark_detector |
||||
|
||||
Face Recognition {#tutorial_face_facerec} |
||||
---------------- |
||||
*Compatibility:* \> OpenCV 3.3 |
||||
|
||||
Face recognition is an easy task for humans. Experiments in @cite Tu06 have shown, that even one to |
||||
three day old babies are able to distinguish between known faces. So how hard could it be for a |
||||
computer? It turns out we know little about human recognition to date. Are inner features (eyes, |
||||
nose, mouth) or outer features (head shape, hairline) used for a successful face recognition? How do |
||||
we analyze an image and how does the brain encode it? It was shown by [David |
||||
Hubel](http://en.wikipedia.org/wiki/David_H._Hubel) and [Torsten |
||||
Wiesel](http://en.wikipedia.org/wiki/Torsten_Wiesel), that our brain has specialized nerve cells |
||||
responding to specific local features of a scene, such as lines, edges, angles or movement. Since we |
||||
don't see the world as scattered pieces, our visual cortex must somehow combine the different |
||||
sources of information into useful patterns. Automatic face recognition is all about extracting |
||||
those meaningful features from an image, putting them into a useful representation and performing |
||||
some kind of classification on them. |
||||
*Author:* Sukhad Anand |
||||
|
||||
Face recognition based on the geometric features of a face is probably the most intuitive approach |
||||
to face recognition. One of the first automated face recognition systems was described in |
||||
@cite Kanade73 : marker points (position of eyes, ears, nose, ...) were used to build a feature vector |
||||
(distance between the points, angle between them, ...). The recognition was performed by calculating |
||||
the euclidean distance between feature vectors of a probe and reference image. Such a method is |
||||
robust against changes in illumination by its nature, but has a huge drawback: the accurate |
||||
registration of the marker points is complicated, even with state of the art algorithms. Some of the |
||||
latest work on geometric face recognition was carried out in @cite Bru92 . A 22-dimensional feature |
||||
vector was used and experiments on large datasets have shown, that geometrical features alone may not |
||||
carry enough information for face recognition. |
||||
*Mentor:* Steven Puttemans |
||||
|
||||
The Eigenfaces method described in @cite TP91 took a holistic approach to face recognition: A facial |
||||
image is a point from a high-dimensional image space and a lower-dimensional representation is |
||||
found, where classification becomes easy. The lower-dimensional subspace is found with Principal |
||||
Component Analysis, which identifies the axes with maximum variance. While this kind of |
||||
transformation is optimal from a reconstruction standpoint, it doesn't take any class labels into |
||||
account. Imagine a situation where the variance is generated from external sources, let it be light. |
||||
The axes with maximum variance do not necessarily contain any discriminative information at all, |
||||
hence a classification becomes impossible. So a class-specific projection with a Linear Discriminant |
||||
Analysis was applied to face recognition in @cite BHK97 . The basic idea is to minimize the variance |
||||
within a class, while maximizing the variance between the classes at the same time. |
||||
Training a face landmark detector using an ensemble of regression trees |
||||
|
||||
Recently various methods for a local feature extraction emerged. To avoid the high-dimensionality of |
||||
the input data only local regions of an image are described, the extracted features are (hopefully) |
||||
more robust against partial occlusion, illumation and small sample size. Algorithms used for a local |
||||
feature extraction are Gabor Wavelets (@cite Wiskott97), Discrete Cosinus Transform (@cite Messer06) and |
||||
Local Binary Patterns (@cite AHP04). It's still an open research question what's the best way to |
||||
preserve spatial information when applying a local feature extraction, because spatial information |
||||
is potentially useful information. |
||||
- @subpage tutorial_face_landmark_detection_in_video |
||||
|
||||
Face Database {#tutorial_face_facedb} |
||||
------------- |
||||
*Compatibility:* \> OpenCV 3.3 |
||||
|
||||
Let's get some data to experiment with first. I don't want to do a toy example here. We are doing |
||||
face recognition, so you'll need some face images! You can either create your own dataset or start |
||||
with one of the available face databases, |
||||
[<http://face-rec.org/databases/>](http://face-rec.org/databases) gives you an up-to-date overview. |
||||
Three interesting databases are (parts of the description are quoted from |
||||
[<http://face-rec.org>](http://face-rec.org)): |
||||
*Author:* Sukhad Anand |
||||
|
||||
- [AT&T Facedatabase](http://www.cl.cam.ac.uk/research/dtg/attarchive/facedatabase.html) The AT&T |
||||
Facedatabase, sometimes also referred to as *ORL Database of Faces*, contains ten different |
||||
images of each of 40 distinct subjects. For some subjects, the images were taken at different |
||||
times, varying the lighting, facial expressions (open / closed eyes, smiling / not smiling) and |
||||
facial details (glasses / no glasses). All the images were taken against a dark homogeneous |
||||
background with the subjects in an upright, frontal position (with tolerance for some side |
||||
movement). |
||||
- [Yale Facedatabase A](http://vision.ucsd.edu/content/yale-face-database), also known as |
||||
Yalefaces. The AT&T Facedatabase is good for initial tests, but it's a fairly easy database. The |
||||
Eigenfaces method already has a 97% recognition rate on it, so you won't see any great |
||||
improvements with other algorithms. The Yale Facedatabase A (also known as Yalefaces) is a more |
||||
appropriate dataset for initial experiments, because the recognition problem is harder. The |
||||
database consists of 15 people (14 male, 1 female) each with 11 grayscale images sized |
||||
\f$320 \times 243\f$ pixel. There are changes in the light conditions (center light, left light, |
||||
right light), facial expressions (happy, normal, sad, sleepy, surprised, wink) and glasses |
||||
(glasses, no-glasses). |
||||
||||
|
||||
The original images are not cropped and aligned. Please look into the @ref face_appendix for a
Python script that does the job for you.
||||
||||
|
||||
- [Extended Yale Facedatabase B](http://vision.ucsd.edu/~leekc/ExtYaleDatabase/ExtYaleB.html) The |
||||
Extended Yale Facedatabase B contains 2414 images of 38 different people in its cropped version. |
||||
The focus of this database is set on extracting features that are robust to illumination, the |
||||
images have almost no variation in emotion/occlusion/... . I personally think that this dataset
is too large for the experiments I perform in this document. You'd better use the [AT&T
Facedatabase](http://www.cl.cam.ac.uk/research/dtg/attarchive/facedatabase.html) for initial
testing. A first version of the Yale Facedatabase B was used in @cite BHK97 to see how the
Eigenfaces and Fisherfaces method perform under heavy illumination changes. @cite Lee05 used the
same setup to take 16128 images of 28 people. The Extended Yale Facedatabase B is the merge of
the two databases.
||||
||||
|
||||
### Preparing the data {#tutorial_face_prepare} |
||||
||||
|
||||
Once we have acquired some data, we'll need to read it into our program. In the demo applications I
have decided to read the images from a very simple CSV file. Why? Because it's the simplest
platform-independent approach I can think of. However, if you know a simpler solution, please ping me
about it. Basically all the CSV file needs to contain are lines composed of a filename, followed by a
`;`, followed by the label (as an *integer number*), making up a line like this:
||||
||||
|
||||
@code{.csv} |
||||
/path/to/image.ext;0 |
||||
@endcode |
||||
|
||||
Let's dissect the line. /path/to/image.ext is the path to an image, probably something like this if |
||||
you are in Windows: C:/faces/person0/image0.jpg. Then there is the separator ; and finally we assign |
||||
the label 0 to the image. Think of the label as the subject (the person) this image belongs to, so |
||||
same subjects (persons) should have the same label. |
||||
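To make the format concrete, here is a minimal sketch of a reader for such a file; the function name
`read_csv` is only illustrative, the bundled samples ship their own (very similar) reader:

@code{.cpp}
#include <cstdlib>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include "opencv2/core.hpp"
#include "opencv2/imgcodecs.hpp"

// Reads lines of the form "/path/to/image.ext;label" into images and labels.
static void read_csv(const std::string& filename, std::vector<cv::Mat>& images,
                     std::vector<int>& labels, char separator = ';')
{
    std::ifstream file(filename.c_str());
    CV_Assert(file.is_open());
    std::string line, path, classlabel;
    while (std::getline(file, line)) {
        std::stringstream liness(line);
        std::getline(liness, path, separator);  // everything before the ';'
        std::getline(liness, classlabel);       // the integer label after it
        if (!path.empty() && !classlabel.empty()) {
            images.push_back(cv::imread(path, cv::IMREAD_GRAYSCALE));
            labels.push_back(std::atoi(classlabel.c_str()));
        }
    }
}
@endcode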
|
||||
Download the AT&T Facedatabase from [AT&T Facedatabase](http://www.cl.cam.ac.uk/research/dtg/attarchive/facedatabase.html)
and the corresponding CSV file from @ref tutorial_face_appendix_attcsv, which looks like this
(the real file does not contain the `...`, of course):
||||
|
||||
@code{.csv} |
||||
./at/s1/1.pgm;0 |
||||
./at/s1/2.pgm;0 |
||||
... |
||||
./at/s2/1.pgm;1 |
||||
./at/s2/2.pgm;1 |
||||
... |
||||
./at/s40/1.pgm;39 |
||||
./at/s40/2.pgm;39 |
||||
@endcode |
||||
|
||||
Imagine I have extracted the files to D:/data/at and have downloaded the CSV file to D:/data/at.txt. |
||||
Then you would simply need to Search & Replace ./ with D:/data/. You can do that in an editor of |
||||
your choice, every sufficiently advanced editor can do this. Once you have a CSV file with valid |
||||
filenames and labels, you can run any of the demos by passing the path to the CSV file as parameter: |
||||
|
||||
@code{.sh} |
||||
facerec_demo.exe D:/data/at.txt |
||||
@endcode |
||||
|
||||
Please see @ref tutorial_face_appendix_csv for details on creating a CSV file.
||||
|
||||
Eigenfaces {#tutorial_face_eigenfaces} |
||||
---------- |
||||
|
||||
The problem with the image representation we are given is its high dimensionality. Two-dimensional |
||||
\f$p \times q\f$ grayscale images span a \f$m = pq\f$-dimensional vector space, so an image with |
||||
\f$100 \times 100\f$ pixels lies in a \f$10,000\f$-dimensional image space already. The question is: Are all |
||||
dimensions equally useful for us? We can only make a decision if there's any variance in data, so |
||||
what we are looking for are the components that account for most of the information. The Principal |
||||
Component Analysis (PCA) was independently proposed by [Karl |
||||
Pearson](http://en.wikipedia.org/wiki/Karl_Pearson) (1901) and [Harold |
||||
Hotelling](http://en.wikipedia.org/wiki/Harold_Hotelling) (1933) to turn a set of possibly |
||||
correlated variables into a smaller set of uncorrelated variables. The idea is that a
||||
high-dimensional dataset is often described by correlated variables and therefore only a few |
||||
meaningful dimensions account for most of the information. The PCA method finds the directions with |
||||
the greatest variance in the data, called principal components. |
||||
|
||||
### Algorithmic Description of Eigenfaces method {#tutorial_face_eigenfaces_algo} |
||||
|
||||
Let \f$X = \{ x_{1}, x_{2}, \ldots, x_{n} \}\f$ be a random vector with observations \f$x_i \in R^{d}\f$. |
||||
|
||||
1. Compute the mean \f$\mu\f$ |
||||
|
||||
\f[\mu = \frac{1}{n} \sum_{i=1}^{n} x_{i}\f] |
||||
|
||||
2. Compute the covariance matrix \f$S\f$
||||
|
||||
\f[S = \frac{1}{n} \sum_{i=1}^{n} (x_{i} - \mu) (x_{i} - \mu)^{T}\f]
||||
|
||||
3. Compute the eigenvalues \f$\lambda_{i}\f$ and eigenvectors \f$v_{i}\f$ of \f$S\f$ |
||||
|
||||
\f[S v_{i} = \lambda_{i} v_{i}, i=1,2,\ldots,n\f] |
||||
|
||||
4. Order the eigenvectors descending by their eigenvalue. The \f$k\f$ principal components are the |
||||
eigenvectors corresponding to the \f$k\f$ largest eigenvalues. |
||||
|
||||
The \f$k\f$ principal components of the observed vector \f$x\f$ are then given by: |
||||
|
||||
\f[y = W^{T} (x - \mu)\f] |
||||
|
||||
where \f$W = (v_{1}, v_{2}, \ldots, v_{k})\f$. |
||||
|
||||
The reconstruction from the PCA basis is given by: |
||||
|
||||
\f[x = W y + \mu\f] |
||||
|
||||
where \f$W = (v_{1}, v_{2}, \ldots, v_{k})\f$. |
||||
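Both formulas map directly onto cv::PCA. A minimal sketch, assuming `data` holds one flattened
training image per row and `sample` is a single flattened image of the same type:

@code{.cpp}
#include "opencv2/core.hpp"

cv::Mat project_and_reconstruct(const cv::Mat& data, const cv::Mat& sample, int k)
{
    // Computes the mean mu and the first k principal components (rows of pca.eigenvectors).
    cv::PCA pca(data, cv::Mat(), cv::PCA::DATA_AS_ROW, k);
    cv::Mat y = pca.project(sample);     // y = W^T (x - mu)
    return pca.backProject(y);           // x ~ W y + mu
}
@endcode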
|
||||
The Eigenfaces method then performs face recognition by: |
||||
|
||||
- Projecting all training samples into the PCA subspace. |
||||
- Projecting the query image into the PCA subspace. |
||||
- Finding the nearest neighbor between the projected training images and the projected query |
||||
image. |
||||
|
||||
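These three steps are what the Eigenfaces model of this module wraps up for you. A minimal usage
sketch, assuming the `EigenFaceRecognizer::create()` factory of this module (older releases expose
`createEigenFaceRecognizer()` instead):

@code{.cpp}
#include <vector>
#include "opencv2/face.hpp"

void eigenfaces_sketch(const std::vector<cv::Mat>& images, const std::vector<int>& labels,
                       const cv::Mat& query)
{
    // Projects all training samples into the PCA subspace...
    cv::Ptr<cv::face::EigenFaceRecognizer> model = cv::face::EigenFaceRecognizer::create();
    model->train(images, labels);
    // ...then projects the query image and finds the nearest neighbor.
    int predicted_label = -1;
    double distance = 0.0;
    model->predict(query, predicted_label, distance);
}
@endcode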
Still there's one problem left to solve. Imagine we are given \f$400\f$ images sized \f$100 \times 100\f$ |
||||
pixel. The Principal Component Analysis solves the covariance matrix \f$S = X X^{T}\f$, where |
||||
\f${size}(X) = 10000 \times 400\f$ in our example. You would end up with a \f$10000 \times 10000\f$ matrix, |
||||
roughly \f$0.8 GB\f$. Solving this problem isn't feasible, so we'll need to apply a trick. From your |
||||
linear algebra lessons you know that an \f$M \times N\f$ matrix with \f$M > N\f$ can only have \f$N - 1\f$
||||
non-zero eigenvalues. So it's possible to take the eigenvalue decomposition \f$S = X^{T} X\f$ of size |
||||
\f$N \times N\f$ instead: |
||||
|
||||
\f[X^{T} X v_{i} = \lambda_{i} v_{i}\f]
||||
|
||||
and get the original eigenvectors of \f$S = X X^{T}\f$ with a left multiplication of the data matrix: |
||||
|
||||
\f[X X^{T} (X v_{i}) = \lambda_{i} (X v_{i})\f] |
||||
|
||||
The resulting eigenvectors are orthogonal; to get orthonormal eigenvectors they need to be
||||
normalized to unit length. I don't want to turn this into a publication, so please look into |
||||
@cite Duda01 for the derivation and proof of the equations. |
||||
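A minimal sketch of that trick, assuming `X` holds one observation per column (as in the derivation
above) and is of type `CV_64F`:

@code{.cpp}
#include "opencv2/core.hpp"

cv::Mat eigenvectors_of_S(const cv::Mat& X)   // X: d x N with d >> N, type CV_64F
{
    cv::Mat small = X.t() * X;                // N x N and symmetric
    cv::Mat eigenvalues, v;                   // rows of v are the eigenvectors of X^T X
    cv::eigen(small, eigenvalues, v);
    cv::Mat V = X * v.t();                    // column i is X v_i, an eigenvector of X X^T
    for (int i = 0; i < V.cols; i++)
        cv::normalize(V.col(i), V.col(i));    // rescale to unit length
    return V;                                 // d x N, one eigenvector per column
}
@endcode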
|
||||
### Eigenfaces in OpenCV {#tutorial_face_eigenfaces_use} |
||||
|
||||
For the first source code example, I'll go through it with you. I am first giving you the whole |
||||
source code listing, and after this we'll look at the most important lines in detail. Please note: |
||||
every source code listing is commented in detail, so you should have no problems following it. |
||||
|
||||
The source code for this demo application is also available in the src folder coming with this |
||||
documentation: |
||||
|
||||
@include face/samples/facerec_eigenfaces.cpp |
||||
|
||||
I've used the jet colormap, so you can see how the grayscale values are distributed within the |
||||
specific Eigenfaces. You can see, that the Eigenfaces do not only encode facial features, but also |
||||
the illumination in the images (see the left light in Eigenface \#4, right light in Eigenfaces \#5): |
||||
|
||||
 |
||||
|
||||
We've already seen, that we can reconstruct a face from its lower dimensional approximation. So |
||||
let's see how many Eigenfaces are needed for a good reconstruction. I'll do a subplot with
\f$10, 25, \ldots, 295\f$ Eigenfaces:
||||
|
||||
@code{.cpp} |
||||
// Display or save the image reconstruction at some predefined steps: |
||||
for(int num_components = 10; num_components < 300; num_components+=15) { |
||||
// slice the eigenvectors from the model |
||||
Mat evs = Mat(W, Range::all(), Range(0, num_components)); |
||||
Mat projection = LDA::subspaceProject(evs, mean, images[0].reshape(1,1)); |
||||
Mat reconstruction = LDA::subspaceReconstruct(evs, mean, projection); |
||||
// Normalize the result: |
||||
reconstruction = norm_0_255(reconstruction.reshape(1, images[0].rows)); |
||||
// Display or save: |
||||
if(argc == 2) { |
||||
imshow(format("eigenface_reconstruction_%d", num_components), reconstruction); |
||||
} else { |
||||
imwrite(format("%s/eigenface_reconstruction_%d.png", output_folder.c_str(), num_components), reconstruction); |
||||
} |
||||
} |
||||
@endcode |
||||
|
||||
10 Eigenvectors are obviously not sufficient for a good image reconstruction, 50 Eigenvectors may |
||||
already be sufficient to encode important facial features. You'll get a good reconstruction with |
||||
approximately 300 Eigenvectors for the AT&T Facedatabase. There are rules of thumb for how many
Eigenfaces you should choose for a successful face recognition, but it heavily depends on the input
data. @cite Zhao03 is the perfect point to start researching this:
||||
|
||||
 |
||||
|
||||
Fisherfaces {#tutorial_face_fisherfaces} |
||||
----------- |
||||
|
||||
The Principal Component Analysis (PCA), which is the core of the Eigenfaces method, finds a linear |
||||
combination of features that maximizes the total variance in data. While this is clearly a powerful |
||||
way to represent data, it doesn't consider any classes and so a lot of discriminative information |
||||
*may* be lost when throwing components away. Imagine a situation where the variance in your data is |
||||
generated by an external source, such as the light. The components identified by a PCA do not
||||
necessarily contain any discriminative information at all, so the projected samples are smeared |
||||
together and a classification becomes impossible (see |
||||
[<http://www.bytefish.de/wiki/pca_lda_with_gnu_octave>](http://www.bytefish.de/wiki/pca_lda_with_gnu_octave) |
||||
for an example). |
||||
|
||||
The Linear Discriminant Analysis performs a class-specific dimensionality reduction and was invented |
||||
by the great statistician [Sir R. A. Fisher](http://en.wikipedia.org/wiki/Ronald_Fisher). He |
||||
successfully used it for classifying flowers in his 1936 paper *The use of multiple measurements in |
||||
taxonomic problems* @cite Fisher36 . In order to find the combination of features that separates best |
||||
between classes the Linear Discriminant Analysis maximizes the ratio of between-classes to |
||||
within-classes scatter, instead of maximizing the overall scatter. The idea is simple: same classes |
||||
should cluster tightly together, while different classes are as far away as possible from each other |
||||
in the lower-dimensional representation. This was also recognized by |
||||
[Belhumeur](http://www.cs.columbia.edu/~belhumeur/), [Hespanha](http://www.ece.ucsb.edu/~hespanha/) |
||||
and [Kriegman](http://cseweb.ucsd.edu/~kriegman/) and so they applied a Discriminant Analysis to |
||||
face recognition in @cite BHK97 . |
||||
|
||||
### Algorithmic Description of Fisherfaces method {#tutorial_face_fisherfaces_algo} |
||||
|
||||
Let \f$X\f$ be a random vector with samples drawn from \f$c\f$ classes: |
||||
|
||||
\f[\begin{align*} |
||||
X & = & \{X_1,X_2,\ldots,X_c\} \\ |
||||
X_i & = & \{x_1, x_2, \ldots, x_n\} |
||||
\end{align*}\f] |
||||
|
||||
The scatter matrices \f$S_{B}\f$ and \f$S_{W}\f$ are calculated as:
||||
|
||||
\f[\begin{align*} |
||||
S_{B} & = & \sum_{i=1}^{c} N_{i} (\mu_i - \mu)(\mu_i - \mu)^{T} \\ |
||||
S_{W} & = & \sum_{i=1}^{c} \sum_{x_{j} \in X_{i}} (x_j - \mu_i)(x_j - \mu_i)^{T} |
||||
\end{align*}\f] |
||||
|
||||
where \f$\mu\f$ is the total mean:
||||
|
||||
\f[\mu = \frac{1}{N} \sum_{i=1}^{N} x_i\f] |
||||
|
||||
And \f$\mu_i\f$ is the mean of class \f$i \in \{1,\ldots,c\}\f$: |
||||
|
||||
\f[\mu_i = \frac{1}{|X_i|} \sum_{x_j \in X_i} x_j\f] |
||||
|
||||
Fisher's classic algorithm now looks for a projection \f$W\f$ that maximizes the class separability
||||
criterion: |
||||
|
||||
\f[W_{opt} = \operatorname{arg\,max}_{W} \frac{|W^T S_B W|}{|W^T S_W W|}\f] |
||||
|
||||
Following @cite BHK97, a solution for this optimization problem is given by solving the General |
||||
Eigenvalue Problem: |
||||
|
||||
\f[\begin{align*} |
||||
S_{B} v_{i} & = & \lambda_{i} S_w v_{i} \nonumber \\ |
||||
S_{W}^{-1} S_{B} v_{i} & = & \lambda_{i} v_{i} |
||||
\end{align*}\f] |
||||
|
||||
There's one problem left to solve: The rank of \f$S_{W}\f$ is at most \f$(N-c)\f$, with \f$N\f$ samples and \f$c\f$ |
||||
classes. In pattern recognition problems the number of samples \f$N\f$ is almost always smaller than the
||||
dimension of the input data (the number of pixels), so the scatter matrix \f$S_{W}\f$ becomes singular |
||||
(see @cite RJ91). In @cite BHK97 this was solved by performing a Principal Component Analysis on the |
||||
data and projecting the samples into the \f$(N-c)\f$-dimensional space. A Linear Discriminant Analysis |
||||
was then performed on the reduced data, because \f$S_{W}\f$ isn't singular anymore. |
||||
|
||||
The optimization problem can then be rewritten as: |
||||
|
||||
\f[\begin{align*} |
||||
W_{pca} & = & \operatorname{arg\,max}_{W} |W^T S_T W| \\ |
||||
W_{fld} & = & \operatorname{arg\,max}_{W} \frac{|W^T W_{pca}^T S_{B} W_{pca} W|}{|W^T W_{pca}^T S_{W} W_{pca} W|} |
||||
\end{align*}\f] |
||||
|
||||
The transformation matrix \f$W\f$ that projects a sample into the \f$(c-1)\f$-dimensional space is then
||||
given by: |
||||
|
||||
\f[W = W_{fld}^{T} W_{pca}^{T}\f] |
||||
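A minimal sketch of this two-step projection with cv::PCA and cv::LDA, assuming `data` holds one
flattened sample per row (`CV_64F`) and `labels` the class of each row; this is roughly what the
Fisherfaces model of this module computes internally:

@code{.cpp}
#include "opencv2/core.hpp"

cv::Mat fisherfaces_projection(const cv::Mat& data, const cv::Mat& labels, int c)
{
    int N = data.rows;
    // Step 1: PCA to (N - c) dimensions, so that S_W becomes non-singular.
    cv::PCA pca(data, cv::Mat(), cv::PCA::DATA_AS_ROW, N - c);
    cv::Mat reduced = pca.project(data);
    // Step 2: LDA on the reduced data, keeping at most (c - 1) components.
    cv::LDA lda(reduced, labels, c - 1);
    // Combine both projections; each column of the result is one Fisherface in image space.
    cv::Mat W_pca = pca.eigenvectors.t();     // d x (N - c), eigenvectors as columns
    cv::Mat W_fld = lda.eigenvectors();       // (N - c) x (c - 1)
    W_pca.convertTo(W_pca, W_fld.type());     // LDA always works in double precision
    return W_pca * W_fld;                     // d x (c - 1)
}
@endcode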
|
||||
### Fisherfaces in OpenCV {#tutorial_face_fisherfaces_use} |
||||
|
||||
The source code for this demo application is also available in the src folder coming with this |
||||
documentation: |
||||
|
||||
@include face/samples/facerec_fisherfaces.cpp |
||||
|
||||
For this example I am going to use the Yale Facedatabase A, just because the plots are nicer. Each |
||||
Fisherface has the same length as an original image, thus it can be displayed as an image. The demo |
||||
shows (or saves) the first, at most 16 Fisherfaces: |
||||
|
||||
 |
||||
|
||||
The Fisherfaces method learns a class-specific transformation matrix, so they do not capture
illumination as obviously as the Eigenfaces method. The Discriminant Analysis instead finds the
facial features to discriminate between the persons. It's important to mention that the performance
of the Fisherfaces heavily depends on the input data as well. Practically speaking: if you learn the
Fisherfaces for well-illuminated pictures only and you try to recognize faces in badly illuminated
scenes, the method is likely to find the wrong components (just because those features may not be
predominant in badly illuminated images). This is somewhat logical, since the method had no chance to
learn the illumination.
||||
|
||||
The Fisherfaces allow a reconstruction of the projected image, just like the Eigenfaces did. But |
||||
since we only identified the features to distinguish between subjects, you can't expect a nice |
||||
reconstruction of the original image. For the Fisherfaces method we'll project the sample image onto |
||||
each of the Fisherfaces instead. So you'll have a nice visualization of which feature each of the
Fisherfaces describes:
||||
|
||||
@code{.cpp} |
||||
// Display or save the image reconstruction at some predefined steps: |
||||
for(int num_component = 0; num_component < min(16, W.cols); num_component++) { |
||||
// Slice the Fisherface from the model: |
||||
Mat ev = W.col(num_component); |
||||
Mat projection = LDA::subspaceProject(ev, mean, images[0].reshape(1,1)); |
||||
Mat reconstruction = LDA::subspaceReconstruct(ev, mean, projection); |
||||
// Normalize the result: |
||||
reconstruction = norm_0_255(reconstruction.reshape(1, images[0].rows)); |
||||
// Display or save: |
||||
if(argc == 2) { |
||||
imshow(format("fisherface_reconstruction_%d", num_component), reconstruction); |
||||
} else { |
||||
imwrite(format("%s/fisherface_reconstruction_%d.png", output_folder.c_str(), num_component), reconstruction); |
||||
} |
||||
} |
||||
@endcode |
||||
|
||||
The differences may be subtle for the human eye, but you should be able to see some differences:
||||
|
||||
 |
||||
|
||||
Local Binary Patterns Histograms {#tutorial_face_lbph} |
||||
-------------------------------- |
||||
|
||||
Eigenfaces and Fisherfaces take a somewhat holistic approach to face recognition. You treat your |
||||
data as a vector somewhere in a high-dimensional image space. We all know high-dimensionality is |
||||
bad, so a lower-dimensional subspace is identified, where (probably) useful information is |
||||
preserved. The Eigenfaces approach maximizes the total scatter, which can lead to problems if the |
||||
variance is generated by an external source, because components with a maximum variance over all |
||||
classes aren't necessarily useful for classification (see |
||||
[<http://www.bytefish.de/wiki/pca_lda_with_gnu_octave>](http://www.bytefish.de/wiki/pca_lda_with_gnu_octave)). |
||||
So to preserve some discriminative information we applied a Linear Discriminant Analysis and |
||||
optimized as described in the Fisherfaces method. The Fisherfaces method worked great... at least |
||||
for the constrained scenario we've assumed in our model. |
||||
|
||||
Now real life isn't perfect. You simply can't guarantee perfect light settings in your images or 10 |
||||
different images of a person. So what if there's only one image for each person? Our covariance |
||||
estimates for the subspace *may* be horribly wrong, and so will the recognition. Remember the Eigenfaces
||||
method had a 96% recognition rate on the AT&T Facedatabase? How many images do we actually need to |
||||
get such useful estimates? Here are the Rank-1 recognition rates of the Eigenfaces and Fisherfaces |
||||
method on the AT&T Facedatabase, which is a fairly easy image database: |
||||
|
||||
 |
||||
|
||||
So in order to get good recognition rates you'll need at least 8(+-1) images for each person and the |
||||
Fisherfaces method doesn't really help here. The above experiment is a 10-fold cross validated |
||||
result carried out with the facerec framework at: |
||||
[<https://github.com/bytefish/facerec>](https://github.com/bytefish/facerec). This is not a |
||||
publication, so I won't back these figures with a deep mathematical analysis. Please have a look |
||||
into @cite KM01 for a detailed analysis of both methods, when it comes to small training datasets. |
||||
|
||||
So some research concentrated on extracting local features from images. The idea is to not look at |
||||
the whole image as a high-dimensional vector, but describe only local features of an object. The |
||||
features you extract this way will have a low-dimensionality implicitly. A fine idea! But you'll |
||||
soon observe the image representation we are given doesn't only suffer from illumination variations. |
||||
Think of things like scale, translation or rotation in images - your local description has to be at |
||||
least a bit robust against those things. Just like SIFT, the Local Binary Patterns methodology has |
||||
its roots in 2D texture analysis. The basic idea of Local Binary Patterns is to summarize the local |
||||
structure in an image by comparing each pixel with its neighborhood. Take a pixel as center and
threshold its neighbors against it. If the intensity of the center pixel is greater than or equal to
that of its neighbor, denote it with 1, otherwise with 0. You'll end up with a binary number for each pixel, just like
||||
11001111. So with 8 surrounding pixels you'll end up with 2\^8 possible combinations, called *Local |
||||
Binary Patterns* or sometimes referred to as *LBP codes*. The first LBP operator described in the
literature actually used a fixed 3 x 3 neighborhood just like this:
||||
|
||||
 |
||||
|
||||
### Algorithmic Description of LBPH method {#tutorial_face_lbph_algo} |
||||
|
||||
A more formal description of the LBP operator can be given as: |
||||
|
||||
\f[LBP(x_c, y_c) = \sum_{p=0}^{P-1} 2^p s(i_p - i_c)\f] |
||||
|
||||
with \f$(x_c, y_c)\f$ as the central pixel with intensity \f$i_c\f$, and \f$i_p\f$ being the intensity of the
neighbor pixel. \f$s\f$ is the sign function defined as:
||||
|
||||
\f[\begin{equation} |
||||
s(x) = |
||||
\begin{cases} |
||||
1 & \text{if \(x \geq 0\)}\\ |
||||
0 & \text{else} |
||||
\end{cases} |
||||
\end{equation}\f] |
||||
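As a minimal sketch, the basic operator can be written down in a few lines, assuming an 8-bit
grayscale input (border pixels are simply left at 0):

@code{.cpp}
#include "opencv2/core.hpp"

cv::Mat lbp_3x3(const cv::Mat& src)                // src: CV_8UC1
{
    cv::Mat dst = cv::Mat::zeros(src.size(), CV_8UC1);
    for (int y = 1; y < src.rows - 1; y++) {
        for (int x = 1; x < src.cols - 1; x++) {
            uchar center = src.at<uchar>(y, x);
            int code = 0;
            // Walk the 8 neighbors clockwise, thresholding against the center.
            code |= (src.at<uchar>(y - 1, x - 1) >= center) << 7;
            code |= (src.at<uchar>(y - 1, x    ) >= center) << 6;
            code |= (src.at<uchar>(y - 1, x + 1) >= center) << 5;
            code |= (src.at<uchar>(y,     x + 1) >= center) << 4;
            code |= (src.at<uchar>(y + 1, x + 1) >= center) << 3;
            code |= (src.at<uchar>(y + 1, x    ) >= center) << 2;
            code |= (src.at<uchar>(y + 1, x - 1) >= center) << 1;
            code |= (src.at<uchar>(y,     x - 1) >= center) << 0;
            dst.at<uchar>(y, x) = (uchar)code;
        }
    }
    return dst;
}
@endcode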
|
||||
This description enables you to capture very fine grained details in images. In fact the authors |
||||
were able to compete with state of the art results for texture classification. Soon after the |
||||
operator was published it was noted that a fixed neighborhood fails to encode details differing in
scale. So the operator was extended to use a variable neighborhood in @cite AHP04 . The idea is to
align an arbitrary number of neighbors on a circle with a variable radius, which makes it possible
to capture the following neighborhoods:
||||
|
||||
 |
||||
|
||||
For a given point \f$(x_c,y_c)\f$ the position of the neighbor \f$(x_p,y_p)\f$, \f$p \in \{0, \ldots, P-1\}\f$, can be calculated
||||
by: |
||||
|
||||
\f[\begin{align*} |
||||
x_{p} & = & x_c + R \cos({\frac{2\pi p}{P}})\\ |
||||
y_{p} & = & y_c - R \sin({\frac{2\pi p}{P}}) |
||||
\end{align*}\f] |
||||
|
||||
Where \f$R\f$ is the radius of the circle and \f$P\f$ is the number of sample points. |
||||
|
||||
The operator is an extension of the original LBP codes, so it's sometimes called *Extended LBP*
(also referred to as *Circular LBP*). If a point's coordinate on the circle doesn't correspond to
image coordinates, the point gets interpolated. Computer science has a bunch of clever
interpolation schemes; the OpenCV implementation does a bilinear interpolation:
||||
|
||||
\f[\begin{align*} |
||||
f(x,y) \approx \begin{bmatrix} |
||||
1-x & x \end{bmatrix} \begin{bmatrix} |
||||
f(0,0) & f(0,1) \\ |
||||
f(1,0) & f(1,1) \end{bmatrix} \begin{bmatrix} |
||||
1-y \\ |
||||
y \end{bmatrix}. |
||||
\end{align*}\f] |
||||
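A minimal sketch that puts the neighbor coordinates and the interpolation together; the caller is
assumed to keep \f$(x_c, y_c)\f$ at least \f$\lceil R \rceil + 1\f$ pixels away from the image border:

@code{.cpp}
#include <cmath>
#include "opencv2/core.hpp"

// Value of the p-th of P neighbors on a circle of radius R around (xc, yc).
double sample_neighbor(const cv::Mat& src, int xc, int yc, double R, int P, int p)
{
    double xp = xc + R * std::cos(2.0 * CV_PI * p / P);
    double yp = yc - R * std::sin(2.0 * CV_PI * p / P);
    int fx = (int)std::floor(xp), fy = (int)std::floor(yp);
    double tx = xp - fx, ty = yp - fy;        // fractional parts
    // Bilinear interpolation of the four surrounding pixels.
    return (1 - tx) * (1 - ty) * src.at<uchar>(fy,     fx)
         +      tx  * (1 - ty) * src.at<uchar>(fy,     fx + 1)
         + (1 - tx) *      ty  * src.at<uchar>(fy + 1, fx)
         +      tx  *      ty  * src.at<uchar>(fy + 1, fx + 1);
}
@endcode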
|
||||
By definition the LBP operator is robust against monotonic gray scale transformations. We can easily |
||||
verify this by looking at the LBP image of an artificially modified image (so you see what an LBP |
||||
image looks like!): |
||||
|
||||
 |
||||
|
||||
So what's left to do is to incorporate the spatial information into the face recognition model.
The representation proposed by Ahonen et al. @cite AHP04 is to divide the LBP image into \f$m\f$ local
||||
regions and extract a histogram from each. The spatially enhanced feature vector is then obtained by |
||||
concatenating the local histograms (**not merging them**). These histograms are called *Local Binary |
||||
Patterns Histograms*. |
||||
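In this module the whole chain (circular LBP operator, the \f$m\f$ grid cells and the concatenated
histograms) is wrapped by the LBPH model. A minimal usage sketch, assuming the
`LBPHFaceRecognizer::create()` factory (older releases expose `createLBPHFaceRecognizer()` with the
same parameters):

@code{.cpp}
#include <vector>
#include "opencv2/face.hpp"

void lbph_sketch(const std::vector<cv::Mat>& images, const std::vector<int>& labels,
                 const cv::Mat& query)
{
    // radius/neighbors define the circular LBP operator, grid_x/grid_y the
    // m = 8 * 8 local regions whose histograms get concatenated.
    cv::Ptr<cv::face::LBPHFaceRecognizer> model =
        cv::face::LBPHFaceRecognizer::create(1 /*radius*/, 8 /*neighbors*/,
                                             8 /*grid_x*/, 8 /*grid_y*/);
    model->train(images, labels);
    int predicted_label = -1;
    double distance = 0.0;
    model->predict(query, predicted_label, distance);  // nearest stored histogram
}
@endcode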
|
||||
### Local Binary Patterns Histograms in OpenCV {#tutorial_face_lbph_use} |
||||
|
||||
The source code for this demo application is also available in the src folder coming with this |
||||
documentation: |
||||
|
||||
@include face/samples/facerec_lbph.cpp |
||||
|
||||
Conclusion {#tutorial_face_conclusion} |
||||
---------- |
||||
|
||||
You've learned how to use the new FaceRecognizer in real applications. After reading the document |
||||
you also know how the algorithms work, so now it's time for you to experiment with the available |
||||
algorithms. Use them, improve them and let the OpenCV community participate! |
||||
|
||||
Credits {#tutorial_face_credits} |
||||
------- |
||||
|
||||
This document wouldn't be possible without the kind permission to use the face images of the *AT&T |
||||
Database of Faces* and the *Yale Facedatabase A/B*. |
||||
|
||||
### The Database of Faces {#tutorial_face_credits_db} |
||||
|
||||
__Important: when using these images, please give credit to "AT&T Laboratories, Cambridge."__ |
||||
|
||||
The Database of Faces, formerly *The ORL Database of Faces*, contains a set of face images taken |
||||
between April 1992 and April 1994. The database was used in the context of a face recognition |
||||
project carried out in collaboration with the Speech, Vision and Robotics Group of the Cambridge |
||||
University Engineering Department. |
||||
|
||||
There are ten different images of each of 40 distinct subjects. For some subjects, the images were |
||||
taken at different times, varying the lighting, facial expressions (open / closed eyes, smiling / |
||||
not smiling) and facial details (glasses / no glasses). All the images were taken against a dark |
||||
homogeneous background with the subjects in an upright, frontal position (with tolerance for some |
||||
side movement). |
||||
|
||||
The files are in PGM format. The size of each image is 92x112 pixels, with 256 grey levels per |
||||
pixel. The images are organised in 40 directories (one for each subject), which have names of the |
||||
form sX, where X indicates the subject number (between 1 and 40). In each of these directories, |
||||
there are ten different images of that subject, which have names of the form Y.pgm, where Y is the |
||||
image number for that subject (between 1 and 10). |
||||
|
||||
A copy of the database can be retrieved from: |
||||
[<http://www.cl.cam.ac.uk/research/dtg/attarchive/pub/data/att_faces.zip>](http://www.cl.cam.ac.uk/research/dtg/attarchive/pub/data/att_faces.zip). |
||||
|
||||
### Yale Facedatabase A {#tutorial_face_credits_yalea} |
||||
|
||||
*With the permission of the authors I am allowed to show a small number of images (say subject 1 and |
||||
all the variations) and all images such as Fisherfaces and Eigenfaces from either Yale Facedatabase |
||||
A or the Yale Facedatabase B.* |
||||
|
||||
The Yale Face Database A (size 6.4MB) contains 165 grayscale images in GIF format of 15 individuals. |
||||
There are 11 images per subject, one per different facial expression or configuration: center-light, |
||||
w/glasses, happy, left-light, w/no glasses, normal, right-light, sad, sleepy, surprised, and wink. |
||||
(Source: |
||||
[<http://cvc.yale.edu/projects/yalefaces/yalefaces.html>](http://cvc.yale.edu/projects/yalefaces/yalefaces.html)) |
||||
|
||||
### Yale Facedatabase B {#tutorial_face_credits_yaleb} |
||||
|
||||
*With the permission of the authors I am allowed to show a small number of images (say subject 1 and |
||||
all the variations) and all images such as Fisherfaces and Eigenfaces from either Yale Facedatabase |
||||
A or the Yale Facedatabase B.* |
||||
|
||||
The extended Yale Face Database B contains 16128 images of 28 human subjects under 9 poses and 64 |
||||
illumination conditions. The data format of this database is the same as the Yale Face Database B. |
||||
Please refer to the homepage of the Yale Face Database B (or one copy of this page) for more |
||||
detailed information of the data format. |
||||
|
||||
You are free to use the extended Yale Face Database B for research purposes. All publications which |
||||
use this database should acknowledge the use of "the Extended Yale Face Database B" and reference
||||
Athinodoros Georghiades, Peter Belhumeur, and David Kriegman's paper, "From Few to Many: |
||||
Illumination Cone Models for Face Recognition under Variable Lighting and Pose", PAMI, 2001, |
||||
[[bibtex]](http://vision.ucsd.edu/~leekc/ExtYaleDatabase/athosref.html). |
||||
|
||||
The extended database as opposed to the original Yale Face Database B with 10 subjects was first |
||||
reported by Kuang-Chih Lee, Jeffrey Ho, and David Kriegman in "Acquiring Linear Subspaces for Face |
||||
Recognition under Variable Lighting, PAMI, May, 2005 |
||||
[[pdf]](http://vision.ucsd.edu/~leekc/papers/9pltsIEEE.pdf)." All test image data used in the |
||||
experiments are manually aligned, cropped, and then re-sized to 168x192 images. If you publish your |
||||
experimental results with the cropped images, please reference the PAMI2005 paper as well. (Source: |
||||
[<http://vision.ucsd.edu/~leekc/ExtYaleDatabase/ExtYaleB.html>](http://vision.ucsd.edu/~leekc/ExtYaleDatabase/ExtYaleB.html)) |
||||
|
||||
Appendix {#face_appendix} |
||||
-------- |
||||
|
||||
### Creating the CSV File {#tutorial_face_appendix_csv} |
||||
|
||||
You don't really want to create the CSV file by hand. I have prepared you a little Python script
`create_csv.py` (you find it at `src/create_csv.py` coming with this tutorial) that automatically
creates a CSV file for you. If you have your images in a hierarchy like this
(`/basepath/<subject>/<image.ext>`):
||||
|
||||
@code{.sh} |
||||
philipp@mango:~/facerec/data/at$ tree |
||||
. |
||||
|-- s1 |
||||
| |-- 1.pgm |
||||
| |-- ... |
||||
| |-- 10.pgm |
||||
|-- s2 |
||||
| |-- 1.pgm |
||||
| |-- ... |
||||
| |-- 10.pgm |
||||
... |
||||
|-- s40 |
||||
| |-- 1.pgm |
||||
| |-- ... |
||||
| |-- 10.pgm |
||||
@endcode |
||||
|
||||
Then simply call `create_csv.py at`, with `at` being the basepath to the folder, and you can save the
output:
||||
|
||||
@code{.sh} |
||||
philipp@mango:~/facerec/data$ python create_csv.py at |
||||
at/s13/2.pgm;0 |
||||
at/s13/7.pgm;0 |
||||
at/s13/6.pgm;0 |
||||
at/s13/9.pgm;0 |
||||
at/s13/5.pgm;0 |
||||
at/s13/3.pgm;0 |
||||
at/s13/4.pgm;0 |
||||
at/s13/10.pgm;0 |
||||
at/s13/8.pgm;0 |
||||
at/s13/1.pgm;0 |
||||
at/s17/2.pgm;1 |
||||
at/s17/7.pgm;1 |
||||
at/s17/6.pgm;1 |
||||
at/s17/9.pgm;1 |
||||
at/s17/5.pgm;1 |
||||
at/s17/3.pgm;1 |
||||
[...] |
||||
@endcode |
||||
|
||||
Here is the script, if you can't find it: |
||||
|
||||
@verbinclude face/samples/etc/create_csv.py |
||||
|
||||
### Aligning Face Images {#tutorial_face_appendix_align} |
||||
|
||||
An accurate alignment of your image data is especially important in tasks like emotion detection, |
||||
where you need as much detail as possible. Believe me... You don't want to do this by hand. So I've
||||
prepared you a tiny Python script. The code is really easy to use. To scale, rotate and crop the |
||||
face image you just need to call *CropFace(image, eye_left, eye_right, offset_pct, dest_sz)*, |
||||
where: |
||||
|
||||
- *eye_left* is the position of the left eye |
||||
- *eye_right* is the position of the right eye |
||||
- *offset_pct* is the percent of the image you want to keep next to the eyes (horizontal, |
||||
vertical direction) |
||||
- *dest_sz* is the size of the output image |
||||
|
||||
If you are using the same *offset_pct* and *dest_sz* for your images, they are all aligned at the |
||||
eyes. |
||||
|
||||
@verbinclude face/samples/etc/crop_face.py |
||||
|
||||
Imagine we are given [this photo of Arnold |
||||
Schwarzenegger](http://en.wikipedia.org/wiki/File:Arnold_Schwarzenegger_edit%28ws%29.jpg), which is |
||||
under a Public Domain license. The (x,y)-position of the eyes is approximately *(252,364)* for the |
||||
left and *(420,366)* for the right eye. Now you only need to define the horizontal offset, vertical |
||||
offset and the size your scaled, rotated & cropped face should have. |
||||
|
||||
Here are some examples: |
||||
|
||||
Configuration | Cropped, Scaled, Rotated Face |
||||
--------------------------------|------------------------------------------------------------------ |
||||
0.1 (10%), 0.1 (10%), (200,200) |  |
||||
0.2 (20%), 0.2 (20%), (200,200) |  |
||||
0.3 (30%), 0.3 (30%), (200,200) |  |
||||
0.2 (20%), 0.2 (20%), (70,70) |  |
||||
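If you prefer to stay in C++, the same idea can be sketched with cv::getRotationMatrix2D and
cv::warpAffine. This is only a rough equivalent of the Python script above, not part of the samples:

@code{.cpp}
#include <cmath>
#include "opencv2/imgproc.hpp"

cv::Mat crop_face(const cv::Mat& image, cv::Point2f eye_left, cv::Point2f eye_right,
                  cv::Point2f offset_pct, cv::Size dest_sz)
{
    cv::Point2f d = eye_right - eye_left;
    double angle = std::atan2(d.y, d.x) * 180.0 / CV_PI;   // angle of the eye line
    double eye_dist = std::sqrt(d.x * d.x + d.y * d.y);
    double offset_h = offset_pct.x * dest_sz.width;        // horizontal margin (pixels)
    double offset_v = offset_pct.y * dest_sz.height;       // vertical margin (pixels)
    double scale = eye_dist / (dest_sz.width - 2.0 * offset_h);
    // Rotate around the left eye so that both eyes end up on a horizontal line.
    cv::Mat rot = cv::getRotationMatrix2D(eye_left, angle, 1.0);
    cv::Mat rotated;
    cv::warpAffine(image, rotated, rot, image.size());
    // Crop the region defined by the offsets and bring it to the destination size.
    cv::Rect roi(cvRound(eye_left.x - scale * offset_h),
                 cvRound(eye_left.y - scale * offset_v),
                 cvRound(scale * dest_sz.width),
                 cvRound(scale * dest_sz.height));
    cv::Mat face = rotated(roi & cv::Rect(0, 0, rotated.cols, rotated.rows));
    cv::resize(face, face, dest_sz);
    return face;
}
@endcode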
|
||||
### CSV for the AT&T Facedatabase {#tutorial_face_appendix_attcsv} |
||||
|
||||
@verbinclude face/samples/etc/at.txt |
||||
||||
|