From 45e0e5f8e947a9c1b4a995477c43a006ec2df43f Mon Sep 17 00:00:00 2001 From: Pierre-Emmanuel Viel Date: Tue, 17 Dec 2013 12:51:58 +0100 Subject: [PATCH 1/4] Pick centers in KMeans++ with a probability proportional to their distance^2, instead of simple distance, to previous centers --- .../opencv2/flann/hierarchical_clustering_index.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h b/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h index ce2d622450..02fc278448 100644 --- a/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h +++ b/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h @@ -210,8 +210,11 @@ private: assert(index >=0 && index < n); centers[0] = dsindices[index]; + // Using distance^2 gives an even higher probability of picking new centers + // far from previous centers (and this complies to "k-means++: the advantages of careful seeding" article) for (int i = 0; i < n; i++) { closestDistSq[i] = distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols); + closestDistSq[i] *= closestDistSq[i]; currentPot += closestDistSq[i]; } @@ -237,7 +240,10 @@ private: // Compute the new potential double newPot = 0; - for (int i = 0; i < n; i++) newPot += std::min( distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols), closestDistSq[i] ); + for (int i = 0; i < n; i++) { + DistanceType dist = distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols); + newPot += std::min( dist*dist, closestDistSq[i] ); + } // Store the best result if ((bestNewPot < 0)||(newPot < bestNewPot)) { @@ -249,7 +255,10 @@ private: // Add the appropriate center centers[centerCount] = dsindices[bestNewIndex]; currentPot = bestNewPot; - for (int i = 0; i < n; i++) closestDistSq[i] = std::min( distance(dataset[dsindices[i]], dataset[dsindices[bestNewIndex]], dataset.cols), 
closestDistSq[i] ); + for (int i = 0; i < n; i++) { + DistanceType dist = distance(dataset[dsindices[i]], dataset[dsindices[bestNewIndex]], dataset.cols); + closestDistSq[i] = std::min( dist*dist, closestDistSq[i] ); + } } centers_length = centerCount; From 5aeeaa6fce4016fd626f31f56025cf83ff07576a Mon Sep 17 00:00:00 2001 From: Pierre-Emmanuel Viel Date: Tue, 17 Dec 2013 13:04:49 +0100 Subject: [PATCH 2/4] Apply to KMeansIndex KMeanspp the same modification as in HierarchicalClusteringIndex --- modules/flann/include/opencv2/flann/kmeans_index.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/modules/flann/include/opencv2/flann/kmeans_index.h b/modules/flann/include/opencv2/flann/kmeans_index.h index 3fea956a74..3bf12047cd 100644 --- a/modules/flann/include/opencv2/flann/kmeans_index.h +++ b/modules/flann/include/opencv2/flann/kmeans_index.h @@ -211,6 +211,7 @@ public: for (int i = 0; i < n; i++) { closestDistSq[i] = distance_(dataset_[indices[i]], dataset_[indices[index]], dataset_.cols); + closestDistSq[i] *= closestDistSq[i]; currentPot += closestDistSq[i]; } @@ -236,7 +237,10 @@ public: // Compute the new potential double newPot = 0; - for (int i = 0; i < n; i++) newPot += std::min( distance_(dataset_[indices[i]], dataset_[indices[index]], dataset_.cols), closestDistSq[i] ); + for (int i = 0; i < n; i++) { + DistanceType dist = distance_(dataset_[indices[i]], dataset_[indices[index]], dataset_.cols); + newPot += std::min( dist*dist, closestDistSq[i] ); + } // Store the best result if ((bestNewPot < 0)||(newPot < bestNewPot)) { @@ -248,7 +252,10 @@ public: // Add the appropriate center centers[centerCount] = indices[bestNewIndex]; currentPot = bestNewPot; - for (int i = 0; i < n; i++) closestDistSq[i] = std::min( distance_(dataset_[indices[i]], dataset_[indices[bestNewIndex]], dataset_.cols), closestDistSq[i] ); + for (int i = 0; i < n; i++) { + DistanceType dist = distance_(dataset_[indices[i]], dataset_[indices[bestNewIndex]], 
dataset_.cols); + closestDistSq[i] = std::min( dist*dist, closestDistSq[i] ); + } } centers_length = centerCount; From fa749de0dcb27d3b666eda341b43e8c13f66be8e Mon Sep 17 00:00:00 2001 From: Pierre-Emmanuel Viel Date: Tue, 17 Dec 2013 13:26:55 +0100 Subject: [PATCH 3/4] As some processed distances are already ^2, use template to select whether or not we have to ^2 in KMeanspp --- .../include/opencv2/flann/kmeans_index.h | 62 ++++++++++++++++++- 1 file changed, 59 insertions(+), 3 deletions(-) diff --git a/modules/flann/include/opencv2/flann/kmeans_index.h b/modules/flann/include/opencv2/flann/kmeans_index.h index 3bf12047cd..460dc64be9 100644 --- a/modules/flann/include/opencv2/flann/kmeans_index.h +++ b/modules/flann/include/opencv2/flann/kmeans_index.h @@ -53,6 +53,62 @@ namespace cvflann { +template <typename Distance, typename ElementType> +struct squareDistance +{ + typedef typename Distance::ResultType ResultType; + ResultType operator()( ResultType dist ) { return dist*dist; } +}; + + +template <typename ElementType> +struct squareDistance<L2_Simple<ElementType>, ElementType> +{ + typedef typename L2_Simple<ElementType>::ResultType ResultType; + ResultType operator()( ResultType dist ) { return dist; } +}; + +template <typename ElementType> +struct squareDistance<L2<ElementType>, ElementType> +{ + typedef typename L2<ElementType>::ResultType ResultType; + ResultType operator()( ResultType dist ) { return dist; } +}; + + +template <typename ElementType> +struct squareDistance<MinkowskiDistance<ElementType>, ElementType> +{ + typedef typename MinkowskiDistance<ElementType>::ResultType ResultType; + ResultType operator()( ResultType dist ) { return dist; } +}; + +template <typename ElementType> +struct squareDistance<HellingerDistance<ElementType>, ElementType> +{ + typedef typename HellingerDistance<ElementType>::ResultType ResultType; + ResultType operator()( ResultType dist ) { return dist; } +}; + +template <typename ElementType> +struct squareDistance<ChiSquareDistance<ElementType>, ElementType> +{ + typedef typename ChiSquareDistance<ElementType>::ResultType ResultType; + ResultType operator()( ResultType dist ) { return dist; } +}; + + +template <typename Distance> +typename Distance::ResultType ensureSquareDistance( typename Distance::ResultType dist ) +{ + typedef typename Distance::ElementType ElementType; + + 
squareDistance<Distance, ElementType> dummy; + return dummy( dist ); +} + + + struct KMeansIndexParams : public IndexParams { KMeansIndexParams(int branching = 32, int iterations = 11, @@ -211,7 +267,7 @@ public: for (int i = 0; i < n; i++) { closestDistSq[i] = distance_(dataset_[indices[i]], dataset_[indices[index]], dataset_.cols); - closestDistSq[i] *= closestDistSq[i]; + closestDistSq[i] = ensureSquareDistance<Distance>( closestDistSq[i] ); currentPot += closestDistSq[i]; } @@ -239,7 +295,7 @@ public: double newPot = 0; for (int i = 0; i < n; i++) { DistanceType dist = distance_(dataset_[indices[i]], dataset_[indices[index]], dataset_.cols); - newPot += std::min( dist*dist, closestDistSq[i] ); + newPot += std::min( ensureSquareDistance<Distance>(dist), closestDistSq[i] ); } // Store the best result @@ -254,7 +310,7 @@ public: currentPot = bestNewPot; for (int i = 0; i < n; i++) { DistanceType dist = distance_(dataset_[indices[i]], dataset_[indices[bestNewIndex]], dataset_.cols); - closestDistSq[i] = std::min( dist*dist, closestDistSq[i] ); + closestDistSq[i] = std::min( ensureSquareDistance<Distance>(dist), closestDistSq[i] ); } } From 0d19685f9544ddf2668fa899ce74580fd9d1039f Mon Sep 17 00:00:00 2001 From: Pierre-Emmanuel Viel Date: Tue, 17 Dec 2013 13:34:20 +0100 Subject: [PATCH 4/4] Move templates in dist.h in order to share them between KMeansIndex and HierarchicalClusteringIndex classes. --- modules/flann/include/opencv2/flann/dist.h | 60 +++++++++++++++++++ .../flann/hierarchical_clustering_index.h | 6 +- .../include/opencv2/flann/kmeans_index.h | 56 ----------------- 3 files changed, 63 insertions(+), 59 deletions(-) diff --git a/modules/flann/include/opencv2/flann/dist.h b/modules/flann/include/opencv2/flann/dist.h index 80ae2dc916..2afceb8893 100644 --- a/modules/flann/include/opencv2/flann/dist.h +++ b/modules/flann/include/opencv2/flann/dist.h @@ -812,6 +812,66 @@ struct ZeroIterator }; + +/* + * Depending on processed distances, some of them are already squared (e.g. 
L2) + * and some are not (e.g. Hamming). In KMeans++ for instance we want to be sure + * we are working on ^2 distances, thus following templates to ensure that. + */ +template <typename Distance, typename ElementType> +struct squareDistance +{ + typedef typename Distance::ResultType ResultType; + ResultType operator()( ResultType dist ) { return dist*dist; } +}; + + +template <typename ElementType> +struct squareDistance<L2_Simple<ElementType>, ElementType> +{ + typedef typename L2_Simple<ElementType>::ResultType ResultType; + ResultType operator()( ResultType dist ) { return dist; } +}; + +template <typename ElementType> +struct squareDistance<L2<ElementType>, ElementType> +{ + typedef typename L2<ElementType>::ResultType ResultType; + ResultType operator()( ResultType dist ) { return dist; } +}; + + +template <typename ElementType> +struct squareDistance<MinkowskiDistance<ElementType>, ElementType> +{ + typedef typename MinkowskiDistance<ElementType>::ResultType ResultType; + ResultType operator()( ResultType dist ) { return dist; } +}; + +template <typename ElementType> +struct squareDistance<HellingerDistance<ElementType>, ElementType> +{ + typedef typename HellingerDistance<ElementType>::ResultType ResultType; + ResultType operator()( ResultType dist ) { return dist; } +}; + +template <typename ElementType> +struct squareDistance<ChiSquareDistance<ElementType>, ElementType> +{ + typedef typename ChiSquareDistance<ElementType>::ResultType ResultType; + ResultType operator()( ResultType dist ) { return dist; } +}; + + +template <typename Distance> +typename Distance::ResultType ensureSquareDistance( typename Distance::ResultType dist ) +{ + typedef typename Distance::ElementType ElementType; + + squareDistance<Distance, ElementType> dummy; + return dummy( dist ); +} + } #endif //OPENCV_FLANN_DIST_H_ diff --git a/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h b/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h index 02fc278448..3ccfa5534b 100644 --- a/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h +++ b/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h @@ -214,7 +214,7 @@ private: // far from previous centers (and this complies to "k-means++: the advantages of careful seeding" article) for (int i = 0; i < n; i++) { closestDistSq[i] = distance(dataset[dsindices[i]], 
dataset[dsindices[index]], dataset.cols); - closestDistSq[i] *= closestDistSq[i]; + closestDistSq[i] = ensureSquareDistance<Distance>( closestDistSq[i] ); currentPot += closestDistSq[i]; } @@ -242,7 +242,7 @@ private: double newPot = 0; for (int i = 0; i < n; i++) { DistanceType dist = distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols); - newPot += std::min( dist*dist, closestDistSq[i] ); + newPot += std::min( ensureSquareDistance<Distance>(dist), closestDistSq[i] ); } // Store the best result @@ -257,7 +257,7 @@ private: currentPot = bestNewPot; for (int i = 0; i < n; i++) { DistanceType dist = distance(dataset[dsindices[i]], dataset[dsindices[bestNewIndex]], dataset.cols); - closestDistSq[i] = std::min( dist*dist, closestDistSq[i] ); + closestDistSq[i] = std::min( ensureSquareDistance<Distance>(dist), closestDistSq[i] ); } } diff --git a/modules/flann/include/opencv2/flann/kmeans_index.h b/modules/flann/include/opencv2/flann/kmeans_index.h index 460dc64be9..3cbee24404 100644 --- a/modules/flann/include/opencv2/flann/kmeans_index.h +++ b/modules/flann/include/opencv2/flann/kmeans_index.h @@ -53,62 +53,6 @@ namespace cvflann { -template <typename Distance, typename ElementType> -struct squareDistance -{ - typedef typename Distance::ResultType ResultType; - ResultType operator()( ResultType dist ) { return dist*dist; } -}; - - -template <typename ElementType> -struct squareDistance<L2_Simple<ElementType>, ElementType> -{ - typedef typename L2_Simple<ElementType>::ResultType ResultType; - ResultType operator()( ResultType dist ) { return dist; } -}; - -template <typename ElementType> -struct squareDistance<L2<ElementType>, ElementType> -{ - typedef typename L2<ElementType>::ResultType ResultType; - ResultType operator()( ResultType dist ) { return dist; } -}; - - -template <typename ElementType> -struct squareDistance<MinkowskiDistance<ElementType>, ElementType> -{ - typedef typename MinkowskiDistance<ElementType>::ResultType ResultType; - ResultType operator()( ResultType dist ) { return dist; } -}; - -template <typename ElementType> -struct squareDistance<HellingerDistance<ElementType>, ElementType> -{ - typedef typename HellingerDistance<ElementType>::ResultType ResultType; - ResultType operator()( ResultType dist ) { return dist; } -}; - -template <typename ElementType> 
-struct squareDistance<ChiSquareDistance<ElementType>, ElementType> -{ - typedef typename ChiSquareDistance<ElementType>::ResultType ResultType; - ResultType operator()( ResultType dist ) { return dist; } -}; - - -template <typename Distance> -typename Distance::ResultType ensureSquareDistance( typename Distance::ResultType dist ) -{ - typedef typename Distance::ElementType ElementType; - - squareDistance<Distance, ElementType> dummy; - return dummy( dist ); -} - - - struct KMeansIndexParams : public IndexParams { KMeansIndexParams(int branching = 32, int iterations = 11,