2nd level of parallelization + detector rework

1. Added a second level of parallelization for the NN classifier on OpenCL. 2. Restructured the detector: the filters now run as independent stages (Variance Filter -> Ensemble -> NN) that hand their candidates to the next stage through "buffers".
10 years ago · 5a4184b619
parent 38d5db71fe
commit 5a4184b619
5 changed files with 397 additions and 58 deletions
--- a/modules/tracking/samples/tld_test.cpp
+++ b/modules/tracking/samples/tld_test.cpp
@ -48,8 +48,8 @@
 using namespace std;
 using namespace cv;

-#define NUM_TEST_FRAMES 500
-#define TEST_VIDEO_INDEX 1		//TLD Dataset Video Index from 1-10
+#define NUM_TEST_FRAMES 100
+#define TEST_VIDEO_INDEX 7		//TLD Dataset Video Index from 1-10
 //#define RECORD_VIDEO_FLG

 static Mat image;
--- a/modules/tracking/src/opencl/tldDetector.cl
+++ b/modules/tracking/src/opencl/tldDetector.cl
@ -31,12 +31,11 @@ __kernel void NCC(__global const uchar *patch,
 	int s1 = 0, s2 = 0, n1 = 0, n2 = 0, prod = 0;
 	float sq1 = 0, sq2 = 0, ares = 0;
 	int N = 225;
-	//NCC with positive patch
+	//NCC with positive sample
 	if (posFlg && id < posNum)
 	{
 		for (int i = 0; i < N; i++)
 		{
-			
 			s1 += positiveSamples[id * N + i];
 			s2 += patch[i];
 			n1 += positiveSamples[id * N + i] * positiveSamples[id * N + i];
@ -46,10 +45,10 @@ __kernel void NCC(__global const uchar *patch,
 		sq1 = sqrt(max(0.0, n1 - 1.0 * s1 * s1 / N));
 		sq2 = sqrt(max(0.0, n2 - 1.0 * s2 * s2 / N));
 		ares = (sq2 == 0) ? sq1 / fabs(sq1) : (prod - s1 * s2 / N) / sq1 / sq2;
-		ncc[id] = ares;		
+		ncc[id] = ares;
 	}

-	//NCC with negative patch
+	//NCC with negative sample
 	if (!posFlg && id < negNum)
 	{
 		for (int i = 0; i < N; i++)
@ -67,3 +66,68 @@ __kernel void NCC(__global const uchar *patch,
 		ncc[id+500] = ares;
 	}
 }
+
+// Batched normalized cross-correlation (NCC) between candidate patches and
+// the model samples. Each work-item handles one (patch, model sample) pair.
+//
+// Work-item layout (1-D): ids in [0, 500*patchNum) address the positive
+// model, ids in [500*patchNum, 2*500*patchNum) the negative model. Inside
+// each half, id = patchID * 500 + modelSampleID, which is also the index
+// written in posNcc / negNcc.
+//
+// patches          patchNum patches of 225 bytes each, stored back to back
+// positiveSamples  positive model samples, 225 bytes each
+// negativeSamples  negative model samples, 225 bytes each
+// posNcc, negNcc   outputs, indexed by patchID * 500 + modelSampleID
+// posNum, negNum   number of valid samples in the respective model
+// patchNum         number of patches in `patches`
+__kernel void batchNCC(__global const uchar *patches,
+	__global const uchar *positiveSamples,
+	__global const uchar *negativeSamples,
+	__global float *posNcc,
+	__global float *negNcc,
+	int posNum,
+	int negNum,
+	int patchNum)
+{
+	const int N = 225;            //Pixels per patch / model sample
+	const int MODEL_STRIDE = 500; //Output slots reserved per patch
+
+	int id = get_global_id(0);
+
+	//The host may round the global size up to a multiple of the work-group
+	//size; surplus work-items must not read or write anything.
+	if (id >= 2 * MODEL_STRIDE * patchNum)
+		return;
+
+	const bool posFlg = id < MODEL_STRIDE * patchNum;
+	if (!posFlg)
+	{
+		//Negative index
+		id = id - MODEL_STRIDE * patchNum;
+	}
+
+	const int modelSampleID = id % MODEL_STRIDE;
+	const int patchID = id / MODEL_STRIDE;
+
+	//Slots beyond the number of valid model samples are left untouched
+	if (modelSampleID >= (posFlg ? posNum : negNum))
+		return;
+
+	__global const uchar *sample = (posFlg ? positiveSamples : negativeSamples) + modelSampleID * N;
+	__global const uchar *patch = patches + patchID * N;
+
+	//Accumulate sums, squared sums and the dot product
+	int s1 = 0, s2 = 0, n1 = 0, n2 = 0, prod = 0;
+	for (int i = 0; i < N; i++)
+	{
+		const int a = sample[i];
+		const int b = patch[i];
+		s1 += a;
+		s2 += b;
+		n1 += a * a;
+		n2 += b * b;
+		prod += a * b;
+	}
+
+	float sq1 = sqrt(max(0.0, n1 - 1.0 * s1 * s1 / N));
+	float sq2 = sqrt(max(0.0, n2 - 1.0 * s2 * s2 / N));
+	//s1 * s2 can reach 57375 * 57375, which does not fit in an int, and an
+	//integer division by N would truncate, so the term is formed in floating
+	//point exactly like the two variance terms above.
+	float ares = (sq2 == 0) ? sq1 / fabs(sq1) : (prod - 1.0 * s1 * s2 / N) / sq1 / sq2;
+
+	if (posFlg)
+		posNcc[id] = ares;
+	else
+		negNcc[id] = ares;
+}
--- a/modules/tracking/src/tldDetector.cpp
+++ b/modules/tracking/src/tldDetector.cpp
@ -98,7 +98,7 @@ namespace cv
 			}
 			e2 = getTickCount();
 			t = (e2 - e1) / getTickFrequency()*1000.0;
-			printf("Sr CPU: %f\n", t);
+			//printf("Sr CPU: %f\n", t);
 			if (splus + sminus == 0.0)
 				return 0.0;
 			return splus / (sminus + splus);
@ -109,16 +109,16 @@ namespace cv
 			int64 e1, e2, e3, e4;
 			float t;
 			e1 = getTickCount();
-			double splus = 0.0, sminus = 0.0;
-			
 			e3 = getTickCount();
+			double splus = 0.0, sminus = 0.0;
+

 			UMat devPatch = patch.getUMat(ACCESS_READ, USAGE_ALLOCATE_DEVICE_MEMORY);
 			UMat devPositiveSamples = posExp->getUMat(ACCESS_READ, USAGE_ALLOCATE_DEVICE_MEMORY);
 			UMat devNegativeSamples = negExp->getUMat(ACCESS_READ, USAGE_ALLOCATE_DEVICE_MEMORY);
 			UMat devNCC(1, 2*MAX_EXAMPLES_IN_MODEL, CV_32FC1, ACCESS_RW, USAGE_ALLOCATE_DEVICE_MEMORY);

-			
+
 			ocl::Kernel k;
 			ocl::ProgramSource src = ocl::tracking::tldDetector_oclsrc;
 			String error;
@ -139,7 +139,7 @@ namespace cv
 			//printf("Mem Cpy GPU: %f\n", t);

 			size_t globSize = 1000;
-			size_t localSize = 128;		
+			size_t localSize = 128;
 			e3 = getTickCount();
 			if (!k.run(1, &globSize, &localSize, true))
 				printf("Kernel Run Error!!!");
@ -152,7 +152,7 @@ namespace cv
 			e4 = getTickCount();
 			t = (e4 - e3) / getTickFrequency()*1000.0;
 			//printf("Read Mem GPU: %f\n", t);
-			
+
 			////Compare
 			//Mat_<uchar> modelSample(STANDARD_PATCH_SIZE, STANDARD_PATCH_SIZE);
 			//for (int i = 0; i < 200; i+=17)
@ -167,7 +167,7 @@ namespace cv
 			//	printf("%f\t%f\n", resNCC.at<float>(500+i), NCC(modelSample, patch));
 			//}

-			
+
 			for (int i = 0; i < *posNum; i++)
 				splus = std::max(splus, 0.5 * (resNCC.at<float>(i) + 1.0));

@ -183,6 +183,110 @@ namespace cv
 			return splus / (sminus + splus);
 		}

+		// Computes the relative (Sr) and conservative (Sc) similarities of a whole
+		// batch of patches with one launch of the "batchNCC" OpenCL kernel.
+		//
+		// patches       batch of patches handed to the kernel, which reads patch k
+		//               at byte offset k * 225
+		// resultSr      output, resultSr[k] = Sr of patch k (numOfPatches elements)
+		// resultSc      output, resultSc[k] = Sc of patch k (numOfPatches elements)
+		// numOfPatches  number of patches in the batch
+		//
+		// When the kernel cannot be created or run, the error is printed and every
+		// result is set to 0.0 instead of being derived from unwritten device memory.
+		void TLDDetector::ocl_batchSrSc(const Mat_<uchar>& patches, double *resultSr, double *resultSc, int numOfPatches)
+		{
+			// Distance between the NCC rows of two consecutive patches in the kernel
+			// output; has to stay equal to the constant hard-coded in batchNCC.
+			const int nccStride = 500;
+
+			// Nothing to classify, and a zero-sized launch is not meaningful
+			if (numOfPatches <= 0)
+				return;
+
+			UMat devPatches = patches.getUMat(ACCESS_READ, USAGE_ALLOCATE_DEVICE_MEMORY);
+			UMat devPositiveSamples = posExp->getUMat(ACCESS_READ, USAGE_ALLOCATE_DEVICE_MEMORY);
+			UMat devNegativeSamples = negExp->getUMat(ACCESS_READ, USAGE_ALLOCATE_DEVICE_MEMORY);
+			UMat devPosNCC(MAX_EXAMPLES_IN_MODEL, numOfPatches, CV_32FC1, ACCESS_RW, USAGE_ALLOCATE_DEVICE_MEMORY);
+			UMat devNegNCC(MAX_EXAMPLES_IN_MODEL, numOfPatches, CV_32FC1, ACCESS_RW, USAGE_ALLOCATE_DEVICE_MEMORY);
+
+			ocl::Kernel kernel;
+			ocl::ProgramSource src = ocl::tracking::tldDetector_oclsrc;
+			String error;
+			ocl::Program prog(src, NULL, error);
+			kernel.create("batchNCC", prog);
+
+			bool kernelOk = !kernel.empty();
+			if (!kernelOk)
+			{
+				printf("Kernel create failed!!!\n");
+			}
+			else
+			{
+				// posNum / negNum are pointers to the model counters; the kernel
+				// takes the counts themselves, so they are dereferenced here.
+				kernel.args(
+					ocl::KernelArg::PtrReadOnly(devPatches),
+					ocl::KernelArg::PtrReadOnly(devPositiveSamples),
+					ocl::KernelArg::PtrReadOnly(devNegativeSamples),
+					ocl::KernelArg::PtrWriteOnly(devPosNCC),
+					ocl::KernelArg::PtrWriteOnly(devNegNCC),
+					*posNum,
+					*negNum,
+					numOfPatches);
+
+				// 2 -> Pos&Neg
+				size_t globSize = 2 * (size_t)numOfPatches * MAX_EXAMPLES_IN_MODEL;
+				// NOTE(review): 1024 may exceed the maximum work-group size of some
+				// devices - confirm against the targeted hardware.
+				size_t localSize = 1024;
+				kernelOk = kernel.run(1, &globSize, &localSize, true);
+				if (!kernelOk)
+					printf("Kernel Run Error!!!");
+			}
+
+			if (!kernelOk)
+			{
+				for (int p = 0; p < numOfPatches; p++)
+				{
+					resultSr[p] = 0.0;
+					resultSc[p] = 0.0;
+				}
+				return;
+			}
+
+			Mat posNCC = devPosNCC.getMat(ACCESS_READ);
+			Mat negNCC = devNegNCC.getMat(ACCESS_READ);
+
+			// The median time stamp does not depend on the patch, compute it once
+			const int med = getMedian((*timeStampsPositive));
+
+			//Calculate Srs and Scs
+			for (int p = 0; p < numOfPatches; p++)
+			{
+				double spr = 0.0, smr = 0.0, spc = 0.0;
+				for (int i = 0; i < *posNum; i++)
+				{
+					const double similarity = 0.5 * (posNCC.at<float>(p * nccStride + i) + 1.0);
+					spr = std::max(spr, similarity);
+					// Sc only takes positive examples with a time stamp up to the median
+					if ((int)(*timeStampsPositive)[i] <= med)
+						spc = std::max(spc, similarity);
+				}
+				for (int i = 0; i < *negNum; i++)
+					smr = std::max(smr, 0.5 * (negNCC.at<float>(p * nccStride + i) + 1.0));
+
+				// Sr and Sc share the same negative similarity
+				const double smc = smr;
+
+				if (spr + smr == 0.0)
+					resultSr[p] = 0.0;
+				else
+					resultSr[p] = spr / (smr + spr);
+
+				if (spc + smc == 0.0)
+					resultSc[p] = 0.0;
+				else
+					resultSc[p] = spc / (smc + spc);
+			}
+		}
+
+
 		// Calculate Conservative similarity of the patch (NN-Model)
 		double TLDDetector::Sc(const Mat_<uchar>& patch)
 		{
@ -204,7 +308,7 @@ namespace cv
 			printf("Sc: %f\n", t);
 			if (splus + sminus == 0.0)
 				return 0.0;
-			
+
 			return splus / (sminus + splus);
 			*/

@ -229,7 +333,7 @@ namespace cv
 			}
 			e2 = getTickCount();
 			t = (e2 - e1) / getTickFrequency()*1000.0;
-			printf("Sc: %f\n", t);
+			//printf("Sc: %f\n", t);
 			if (splus + sminus == 0.0)
 				return 0.0;

@ -353,77 +457,247 @@ namespace cv
 		}

 		//Detection - returns most probable new target location (Max Sc)
+
 		bool TLDDetector::detect(const Mat& img, const Mat& imgBlurred, Rect2d& res, std::vector<LabeledPatch>& patches, Size initSize)
 		{
 			patches.clear();
+			// Three-stage cascade; every stage passes its surviving windows to the
+			// next one through a buffer:
+			//   1. variance filter over a sliding-window grid at every scale -> varBuffer
+			//   2. ensemble classifier on the blurred images                 -> ensBuffer
+			//   3. NN classifier: Sr for every window, Sc for the accepted ones
+			// Every window reaching stage 3 is appended to `patches`. `res` receives
+			// the accepted window with the highest Sc; the function returns false
+			// when maxSc is still negative after the last stage.
-
-			Mat resized_img, blurred_img;
 			Mat_<uchar> standardPatch(STANDARD_PATCH_SIZE, STANDARD_PATCH_SIZE);
-			img.copyTo(resized_img);
-			imgBlurred.copyTo(blurred_img);
+			Mat tmp;
 			int dx = initSize.width / 10, dy = initSize.height / 10;
 			Size2d size = img.size();
 			double scale = 1.0;
 			int total = 0, pass = 0;
 			int npos = 0, nneg = 0;
-			double tmp = 0, maxSc = -5.0;
+			double maxSc = -5.0;
 			Rect2d maxScRect;
+			int scaleID;
+			std::vector <Mat> resized_imgs, blurred_imgs;
+			std::vector <Point> varBuffer, ensBuffer;
+			// Scale level of every buffered window, used as index into resized_imgs / blurred_imgs
+			std::vector <int> varScaleIDs, ensScaleIDs;
+			int64 e1, e2;
+			double t;

+			e1 = getTickCount();
 			//Detection part
+			//Generate windows and filter by variance
+			scaleID = 0;
+			resized_imgs.push_back(img);
+			blurred_imgs.push_back(imgBlurred);
 			do
 			{
 				Mat_<double> intImgP, intImgP2;
-				computeIntegralImages(resized_img, intImgP, intImgP2);
-
-				prepareClassifiers((int)blurred_img.step[0]);
-				for (int i = 0, imax = cvFloor((0.0 + resized_img.cols - initSize.width) / dx); i < imax; i++)
+				computeIntegralImages(resized_imgs[scaleID], intImgP, intImgP2);
+				for (int i = 0, imax = cvFloor((0.0 + resized_imgs[scaleID].cols - initSize.width) / dx); i < imax; i++)
 				{
-					for (int j = 0, jmax = cvFloor((0.0 + resized_img.rows - initSize.height) / dy); j < jmax; j++)
+					for (int j = 0, jmax = cvFloor((0.0 + resized_imgs[scaleID].rows - initSize.height) / dy); j < jmax; j++)
 					{
-						LabeledPatch labPatch;
-						total++;
 						if (!patchVariance(intImgP, intImgP2, originalVariancePtr, Point(dx * i, dy * j), initSize))
 							continue;
-						if (ensembleClassifierNum(&blurred_img.at<uchar>(dy * j, dx * i)) <= ENSEMBLE_THRESHOLD)
-							continue;
-						pass++;
+						varBuffer.push_back(Point(dx * i, dy * j));
+						varScaleIDs.push_back(scaleID);
+					}
+				}
+				scaleID++;
+				size.width /= SCALE_STEP;
+				size.height /= SCALE_STEP;
+				scale *= SCALE_STEP;
+				// Store deep copies: pushing `tmp` itself only copies the header, so the
+				// blur below would write into the very buffer kept in resized_imgs and
+				// the "resized" image of this scale would end up blurred as well.
+				resize(img, tmp, size, 0, 0, DOWNSCALE_MODE);
+				resized_imgs.push_back(tmp.clone());
+				GaussianBlur(resized_imgs[scaleID], tmp, GaussBlurKernelSize, 0.0f);
+				blurred_imgs.push_back(tmp.clone());
+			} while (size.width >= initSize.width && size.height >= initSize.height);
+			e2 = getTickCount();
+			t = (e2 - e1) / getTickFrequency()*1000.0;
+			//printf("Variance: %d\t%f\n", varBuffer.size(), t);
+
+			//Ensemble classification
+			e1 = getTickCount();
+			for (int i = 0; i < (int)varBuffer.size(); i++)
+			{
+				prepareClassifiers((int)blurred_imgs[varScaleIDs[i]].step[0]);
+				if (ensembleClassifierNum(&blurred_imgs[varScaleIDs[i]].at<uchar>(varBuffer[i].y, varBuffer[i].x)) <= ENSEMBLE_THRESHOLD)
+					continue;
+				ensBuffer.push_back(varBuffer[i]);
+				ensScaleIDs.push_back(varScaleIDs[i]);
+			}
+			e2 = getTickCount();
+			t = (e2 - e1) / getTickFrequency()*1000.0;
+			//printf("Ensemble: %d\t%f\n", ensBuffer.size(), t);
+
+			//NN classification
+			e1 = getTickCount();
+			for (int i = 0; i < (int)ensBuffer.size(); i++)
+			{
+				LabeledPatch labPatch;
+				// Map the window from its scale level back to original image coordinates
+				double curScale = pow(SCALE_STEP, ensScaleIDs[i]);
+				labPatch.rect = Rect2d(ensBuffer[i].x*curScale, ensBuffer[i].y*curScale, initSize.width * curScale, initSize.height * curScale);
+				resample(resized_imgs[ensScaleIDs[i]], Rect2d(ensBuffer[i], initSize), standardPatch);
+
+				double srValue, scValue;
+				srValue = Sr(standardPatch);
+
+				////To fix: Check the paper, probably this cause wrong learning
+				//
+				labPatch.isObject = srValue > THETA_NN;
+				labPatch.shouldBeIntegrated = abs(srValue - THETA_NN) < 0.1;
+				patches.push_back(labPatch);
+				//
+
+				if (!labPatch.isObject)
+				{
+					nneg++;
+					continue;
+				}
+				else
+				{
+					npos++;
+				}
+				scValue = Sc(standardPatch);
+				if (scValue > maxSc)
+				{
+					maxSc = scValue;
+					maxScRect = labPatch.rect;
+				}
+			}
+			e2 = getTickCount();
+			t = (e2 - e1) / getTickFrequency()*1000.0;
+			//printf("NN: %d\t%f\n", patches.size(), t);

-						labPatch.rect = Rect2d(dx * i * scale, dy * j * scale, initSize.width * scale, initSize.height * scale);
-						resample(resized_img, Rect2d(Point(dx * i, dy * j), initSize), standardPatch);
-						
-						tmp = ocl_Sr(standardPatch);
+			if (maxSc < 0)
+				return false;
+			res = maxScRect;
+			return true;
+		}

-						////To fix: Check the paper, probably this cause wrong learning
-						//
-						labPatch.isObject = tmp > THETA_NN;
-						labPatch.shouldBeIntegrated = abs(tmp - THETA_NN) < 0.1;
-						patches.push_back(labPatch);
-						//
+		bool TLDDetector::ocl_detect(const Mat& img, const Mat& imgBlurred, Rect2d& res, std::vector<LabeledPatch>& patches, Size initSize)
+		{
+			patches.clear();
+			Mat_<uchar> standardPatch(STANDARD_PATCH_SIZE, STANDARD_PATCH_SIZE);
+			Mat tmp;
+			int dx = initSize.width / 10, dy = initSize.height / 10;
+			Size2d size = img.size();
+			double scale = 1.0;
+			int total = 0, pass = 0;
+			int npos = 0, nneg = 0;
+			double maxSc = -5.0;
+			Rect2d maxScRect;
+			int scaleID;
+			std::vector <Mat> resized_imgs, blurred_imgs;
+			std::vector <Point> varBuffer, ensBuffer;
+			std::vector <double> varScaleIDs, ensScaleIDs;
+			int64 e1, e2;
+			double t;

-						if (!labPatch.isObject)
-						{
-							nneg++;
+			e1 = cvGetTickCount();
+			//Detection part
+			//Generate windows and filter by variance
+			scaleID = 0;
+			resized_imgs.push_back(img);
+			blurred_imgs.push_back(imgBlurred);
+			do
+			{
+				Mat_<double> intImgP, intImgP2;
+				computeIntegralImages(resized_imgs[scaleID], intImgP, intImgP2);
+				for (int i = 0, imax = cvFloor((0.0 + resized_imgs[scaleID].cols - initSize.width) / dx); i < imax; i++)
+				{
+					for (int j = 0, jmax = cvFloor((0.0 + resized_imgs[scaleID].rows - initSize.height) / dy); j < jmax; j++)
+					{
+						if (!patchVariance(intImgP, intImgP2, originalVariancePtr, Point(dx * i, dy * j), initSize))
 							continue;
-						}
-						else
-						{
-							npos++;
-						}
-						tmp = ocl_Sc(standardPatch);
-						if (tmp > maxSc)
-						{
-							maxSc = tmp;
-							maxScRect = labPatch.rect;
-						}
+						varBuffer.push_back(Point(dx * i, dy * j));
+						varScaleIDs.push_back(scaleID);
 					}
 				}
-
+				scaleID++;
 				size.width /= SCALE_STEP;
 				size.height /= SCALE_STEP;
 				scale *= SCALE_STEP;
-				resize(img, resized_img, size, 0, 0, DOWNSCALE_MODE);
-				GaussianBlur(resized_img, blurred_img, GaussBlurKernelSize, 0.0f);
+				resize(img, tmp, size, 0, 0, DOWNSCALE_MODE);
+				resized_imgs.push_back(tmp);
+				GaussianBlur(resized_imgs[scaleID], tmp, GaussBlurKernelSize, 0.0f);
+				blurred_imgs.push_back(tmp);
 			} while (size.width >= initSize.width && size.height >= initSize.height);
+			e2 = getTickCount();
+			t = (e2 - e1) / getTickFrequency()*1000.0;
+			//printf("Variance: %d\t%f\n", varBuffer.size(), t);
+
+			//Encsemble classification
+			e1 = cvGetTickCount();
+			for (int i = 0; i < varBuffer.size(); i++)
+			{
+				prepareClassifiers((int)blurred_imgs[varScaleIDs[i]].step[0]);
+				if (ensembleClassifierNum(&blurred_imgs[varScaleIDs[i]].at<uchar>(varBuffer[i].y, varBuffer[i].x)) <= ENSEMBLE_THRESHOLD)
+					continue;
+				ensBuffer.push_back(varBuffer[i]);
+				ensScaleIDs.push_back(varScaleIDs[i]);
+			}
+			e2 = getTickCount();
+			t = (e2 - e1) / getTickFrequency()*1000.0;
+			//printf("Ensemble: %d\t%f\n", ensBuffer.size(), t);
+
+			//NN classification
+			e1 = getTickCount();
+			//Prepare batch of patches
+			int numOfPatches = ensBuffer.size();
+			Mat_<uchar> stdPatches(numOfPatches, 225);
+			double *resultSr = new double[numOfPatches];
+			double *resultSc = new double[numOfPatches];
+
+			uchar *patchesData = stdPatches.data;
+			for (int i = 0; i < ensBuffer.size(); i++)
+			{
+				resample(resized_imgs[ensScaleIDs[i]], Rect2d(ensBuffer[i], initSize), standardPatch);
+				uchar *stdPatchData = standardPatch.data;
+				for (int j = 0; j < 225; j++)
+					patchesData[225*i+j] = stdPatchData[j];
+			}
+			//Calculate Sr and Sc batches
+			ocl_batchSrSc(stdPatches, resultSr, resultSc, numOfPatches);
+
+
+			for (int i = 0; i < ensBuffer.size(); i++)
+			{
+				LabeledPatch labPatch;
+				standardPatch.data = &stdPatches.data[225 * i];
+				double curScale = pow(SCALE_STEP, ensScaleIDs[i]);
+				labPatch.rect = Rect2d(ensBuffer[i].x*curScale, ensBuffer[i].y*curScale, initSize.width * curScale, initSize.height * curScale);
+
+				double srValue, scValue;
+
+				srValue = resultSr[i];
+
+				//srValue = Sr(standardPatch);
+				//printf("%f\t%f\t\n", srValue, resultSr[i]);
+
+				////To fix: Check the paper, probably this cause wrong learning
+				//
+				labPatch.isObject = srValue > THETA_NN;
+				labPatch.shouldBeIntegrated = abs(srValue - THETA_NN) < 0.1;
+				patches.push_back(labPatch);
+				//
+
+				if (!labPatch.isObject)
+				{
+					nneg++;
+					continue;
+				}
+				else
+				{
+					npos++;
+				}
+				scValue = resultSc[i];
+				if (scValue > maxSc)
+				{
+					maxSc = scValue;
+					maxScRect = labPatch.rect;
+				}
+			}
+			e2 = getTickCount();
+			t = (e2 - e1) / getTickFrequency()*1000.0;
+			//printf("NN: %d\t%f\n", patches.size(), t);

 			if (maxSc < 0)
 				return false;
--- a/modules/tracking/src/tldDetector.hpp
+++ b/modules/tracking/src/tldDetector.hpp
@ -77,6 +77,7 @@ namespace cv
 			double ocl_Sr(const Mat_<uchar>& patch);
 			double Sc(const Mat_<uchar>& patch);
 			double ocl_Sc(const Mat_<uchar>& patch);
+			void ocl_batchSrSc(const Mat_<uchar>& patches, double *resultSr, double *resultSc, int numOfPatches);

 			std::vector<TLDEnsembleClassifier> classifiers;
 			Mat *posExp, *negExp;
--- a/modules/tracking/src/tldTracker.cpp
+++ b/modules/tracking/src/tldTracker.cpp
@ -120,7 +120,7 @@ bool TrackerTLDImpl::updateImpl(const Mat& image, Rect2d& boundingBox)
    {
        Rect2d tmpCandid = boundingBox;
        if( ( (i == 0) && !data->failedLastTime && trackerProxy->update(image, tmpCandid) ) ||
-			((i == 1) && (tldModel->detector->detect(imageForDetector, image_blurred, tmpCandid, detectorResults, tldModel->getMinSize()))))
+			((i == 1) && (tldModel->detector->ocl_detect(imageForDetector, image_blurred, tmpCandid, detectorResults, tldModel->getMinSize()))))
        {
            candidates.push_back(tmpCandid);
            if( i == 0 )