\section{Clustering} |
\ifCPy |
\cvCPyFunc{KMeans2} |
Splits set of vectors by a given number of clusters. |
\cvdefC{int cvKMeans2(const CvArr* samples, int nclusters,\par |
CvArr* labels, CvTermCriteria termcrit,\par |
int attempts=1, CvRNG* rng=0, \par |
int flags=0, CvArr* centers=0,\par |
double* compactness=0);} |
\cvdefPy{KMeans2(samples,nclusters,labels,termcrit)-> None} |
\begin{description} |
\cvarg{samples}{Floating-point matrix of input samples, one row per sample} |
\cvarg{nclusters}{Number of clusters to split the set by} |
\cvarg{labels}{Output integer vector storing cluster indices for every sample} |
\cvarg{termcrit}{Specifies maximum number of iterations and/or accuracy (distance the centers can move by between subsequent iterations)} |
\ifC |
\cvarg{attempts}{How many times the algorithm is executed using different initial labelings. The algorithm returns labels that yield the best compactness (see the last function parameter)} |
\cvarg{rng}{Optional external random number generator; can be used to fully control the function behaviour} |
\cvarg{flags}{Can be 0 or \texttt{CV\_KMEANS\_USE\_INITIAL\_LABELS}. The latter |
value means that during the first (and possibly the only) attempt, the |
function uses the user-supplied labels as the initial approximation |
instead of generating random labels. For the second and further attempts, |
the function will use randomly generated labels in any case} |
\cvarg{centers}{The optional output array of the cluster centers} |
\cvarg{compactness}{The optional output parameter, which is computed as
$\sum_i \|\texttt{samples}_i - \texttt{centers}_{\texttt{labels}_i}\|^2$
after every attempt; the best (minimum) value is chosen and the
corresponding labels are returned by the function. Basically, the
user can use only the core of the function, set the number of
attempts to 1, initialize labels each time using a custom algorithm
(\texttt{flags=CV\_KMEANS\_USE\_INITIAL\_LABELS}) and, based on the output compactness
or any other criteria, choose the best clustering.}
\fi |
\end{description} |
The function \texttt{cvKMeans2} implements a k-means algorithm that finds the |
centers of \texttt{nclusters} clusters and groups the input samples |
around the clusters. On output, $\texttt{labels}_i$ contains the cluster index for
the sample stored in the $i$-th row of the \texttt{samples} matrix.
\ifC |
% Example: Clustering random samples of multi-gaussian distribution with k-means |
\begin{lstlisting} |
#include "cxcore.h" |
#include "highgui.h" |
void main( int argc, char** argv ) |
{ |
#define MAX_CLUSTERS 5 |
CvScalar color_tab[MAX_CLUSTERS]; |
IplImage* img = cvCreateImage( cvSize( 500, 500 ), 8, 3 ); |
CvRNG rng = cvRNG(0xffffffff); |
color_tab[0] = CV_RGB(255,0,0); |
color_tab[1] = CV_RGB(0,255,0); |
color_tab[2] = CV_RGB(100,100,255); |
color_tab[3] = CV_RGB(255,0,255); |
color_tab[4] = CV_RGB(255,255,0); |
cvNamedWindow( "clusters", 1 ); |
for(;;) |
{ |
int k, cluster_count = cvRandInt(&rng)%MAX_CLUSTERS + 1; |
int i, sample_count = cvRandInt(&rng)%1000 + 1; |
CvMat* points = cvCreateMat( sample_count, 1, CV_32FC2 ); |
CvMat* clusters = cvCreateMat( sample_count, 1, CV_32SC1 ); |
/* generate random sample from multigaussian distribution */ |
for( k = 0; k < cluster_count; k++ ) |
{ |
CvPoint center; |
CvMat point_chunk; |
center.x = cvRandInt(&rng)%img->width; |
center.y = cvRandInt(&rng)%img->height; |
cvGetRows( points, |
&point_chunk, |
k*sample_count/cluster_count, |
(k == (cluster_count - 1)) ? |
sample_count : |
(k+1)*sample_count/cluster_count ); |
cvRandArr( &rng, &point_chunk, CV_RAND_NORMAL, |
cvScalar(center.x,center.y,0,0), |
cvScalar(img->width/6, img->height/6,0,0) ); |
} |
/* shuffle samples */ |
for( i = 0; i < sample_count/2; i++ ) |
{ |
CvPoint2D32f* pt1 = |
(CvPoint2D32f*)points->data.fl + cvRandInt(&rng)%sample_count; |
CvPoint2D32f* pt2 = |
(CvPoint2D32f*)points->data.fl + cvRandInt(&rng)%sample_count; |
CvPoint2D32f temp; |
CV_SWAP( *pt1, *pt2, temp ); |
} |
cvKMeans2( points, cluster_count, clusters, |
cvTermCriteria( CV_TERMCRIT_EPS+CV_TERMCRIT_ITER, 10, 1.0 )); |
cvZero( img ); |
for( i = 0; i < sample_count; i++ ) |
{ |
CvPoint2D32f pt = ((CvPoint2D32f*)points->data.fl)[i]; |
int cluster_idx = clusters->data.i[i]; |
cvCircle( img, |
cvPointFrom32f(pt), |
2, |
color_tab[cluster_idx], |
} |
cvReleaseMat( &points ); |
cvReleaseMat( &clusters ); |
cvShowImage( "clusters", img ); |
int key = cvWaitKey(0); |
if( key == 27 ) |
break; |
} |
} |
\end{lstlisting} |
\cvCPyFunc{SeqPartition} |
Splits a sequence into equivalency classes. |
\cvdefC{ |
int cvSeqPartition( \par const CvSeq* seq,\par CvMemStorage* storage,\par CvSeq** labels,\par CvCmpFunc is\_equal,\par void* userdata ); |
} |
\begin{description} |
\cvarg{seq}{The sequence to partition} |
\cvarg{storage}{The storage block to store the sequence of equivalency classes. If it is NULL, the function uses \texttt{seq->storage} for output labels} |
\cvarg{labels}{Output parameter. Double pointer to the sequence of 0-based labels of input sequence elements}
\cvarg{is\_equal}{The relation function that should return non-zero if the two particular sequence elements are from the same class, and zero otherwise. The partitioning algorithm uses transitive closure of the relation function as an equivalency criteria}
\cvarg{userdata}{Pointer that is transparently passed to the \texttt{is\_equal} function} |
\end{description} |
\begin{lstlisting} |
typedef int (CV_CDECL* CvCmpFunc)(const void* a, const void* b, void* userdata); |
\end{lstlisting} |
The function \texttt{cvSeqPartition} implements a quadratic algorithm for
splitting a set into one or more equivalency classes. The function
returns the number of equivalency classes.
% Example: Partitioning a 2d point set |
\begin{lstlisting} |
#include "cxcore.h" |
#include "highgui.h" |
#include <stdio.h> |
CvSeq* point_seq = 0; |
IplImage* canvas = 0; |
CvScalar* colors = 0; |
int pos = 10; |
int is_equal( const void* _a, const void* _b, void* userdata ) |
{ |
CvPoint a = *(const CvPoint*)_a; |
CvPoint b = *(const CvPoint*)_b; |
double threshold = *(double*)userdata; |
return (double)((a.x - b.x)*(a.x - b.x) + (a.y - b.y)*(a.y - b.y)) <= |
threshold; |
} |
void on_track( int pos ) |
{ |
CvSeq* labels = 0; |
double threshold = pos*pos; |
int i, class_count = cvSeqPartition( point_seq, |
0, |
&labels, |
is_equal, |
&threshold ); |
printf("%4d classes\n", class_count ); |
cvZero( canvas ); |
for( i = 0; i < labels->total; i++ ) |
{ |
CvPoint pt = *(CvPoint*)cvGetSeqElem( point_seq, i ); |
CvScalar color = colors[*(int*)cvGetSeqElem( labels, i )]; |
cvCircle( canvas, pt, 1, color, -1 ); |
} |
cvShowImage( "points", canvas ); |
} |
int main( int argc, char** argv ) |
{ |
CvMemStorage* storage = cvCreateMemStorage(0); |
point_seq = cvCreateSeq( CV_32SC2, |
sizeof(CvSeq), |
sizeof(CvPoint), |
storage ); |
CvRNG rng = cvRNG(0xffffffff); |
int width = 500, height = 500; |
int i, count = 1000; |
canvas = cvCreateImage( cvSize(width,height), 8, 3 ); |
colors = (CvScalar*)cvAlloc( count*sizeof(colors[0]) ); |
for( i = 0; i < count; i++ ) |
{ |
CvPoint pt; |
int icolor; |
pt.x = cvRandInt( &rng ) % width; |
pt.y = cvRandInt( &rng ) % height; |
cvSeqPush( point_seq, &pt ); |
icolor = cvRandInt( &rng ) | 0x00404040; |
colors[i] = CV_RGB(icolor & 255, |
(icolor >> 8)&255, |
(icolor >> 16)&255); |
} |
cvNamedWindow( "points", 1 ); |
cvCreateTrackbar( "threshold", "points", &pos, 50, on_track ); |
on_track(pos); |
cvWaitKey(0); |
return 0; |
} |
\end{lstlisting} |
\fi |
\fi |
\ifCpp |
\cvCppFunc{kmeans} |
Finds the centers of clusters and groups the input samples around the clusters. |
\cvdefCpp{double kmeans( const Mat\& samples, int clusterCount, Mat\& labels,\par |
TermCriteria termcrit, int attempts,\par |
int flags, Mat* centers );} |
\begin{description} |
\cvarg{samples}{Floating-point matrix of input samples, one row per sample} |
\cvarg{clusterCount}{The number of clusters to split the set by} |
\cvarg{labels}{The input/output integer array that will store the cluster indices for every sample} |
\cvarg{termcrit}{Specifies maximum number of iterations and/or accuracy (distance the centers can move by between subsequent iterations)} |
\cvarg{attempts}{How many times the algorithm is executed using different initial labelings. The algorithm returns the labels that yield the best compactness (the compactness value itself is the function's return value)}
\cvarg{flags}{It can take the following values: |
\begin{description} |
\cvarg{KMEANS\_RANDOM\_CENTERS}{Random initial centers are selected in each attempt} |
\cvarg{KMEANS\_PP\_CENTERS}{Use kmeans++ center initialization by Arthur and Vassilvitskii} |
\cvarg{KMEANS\_USE\_INITIAL\_LABELS}{During the first (and possibly the only) attempt, the
function uses the user-supplied labels instead of computing them from the initial centers. For the second and further attempts, the function will use the random or semi-random centers (use one of the \texttt{KMEANS\_*\_CENTERS} flags to specify the exact method)}
\end{description}} |
\cvarg{centers}{The output matrix of the cluster centers, one row per each cluster center} |
\end{description} |
The function \texttt{kmeans} implements a k-means algorithm that finds the |
centers of \texttt{clusterCount} clusters and groups the input samples |
around the clusters. On output, $\texttt{labels}_i$ contains a 0-based cluster index for |
the sample stored in the $i^{th}$ row of the \texttt{samples} matrix. |
The function returns the compactness measure, which is computed as |
\[ |
\sum_i \|\texttt{samples}_i - \texttt{centers}_{\texttt{labels}_i}\|^2 |
\] |
after every attempt; the best (minimum) value is chosen and the |
corresponding labels and the compactness value are returned by the function. |
Basically, the user can use only the core of the function, set the number of |
attempts to 1, initialize labels each time using some custom algorithm and pass them with the
\par (\texttt{flags}=\texttt{KMEANS\_USE\_INITIAL\_LABELS}) flag, and then choose the best (most-compact) clustering.
\cvCppFunc{partition} |
Splits an element set into equivalency classes. |
\cvdefCpp{template<typename \_Tp, class \_EqPredicate> int\newline |
partition( const vector<\_Tp>\& vec, vector<int>\& labels,\par |
\_EqPredicate predicate=\_EqPredicate());} |
\begin{description} |
\cvarg{vec}{The set of elements stored as a vector} |
\cvarg{labels}{The output vector of labels; will contain as many elements as \texttt{vec}. Each label \texttt{labels[i]} is the 0-based cluster index of \texttt{vec[i]}}
\cvarg{predicate}{The equivalence predicate (i.e. pointer to a boolean function of two arguments or an instance of the class that has the method \texttt{bool operator()(const \_Tp\& a, const \_Tp\& b)}). The predicate returns true when the elements are certainly in the same class, and false if they may or may not be in the same class}
\end{description} |
The generic function \texttt{partition} implements an $O(N^2)$ algorithm for
splitting a set of $N$ elements into one or more equivalency classes, as described in \url{http://en.wikipedia.org/wiki/Disjoint-set_data_structure}. The function
returns the number of equivalency classes.