From 31dc3e9256a4c07d58bf4c03da23d8897009ae93 Mon Sep 17 00:00:00 2001
From: pemmanuelviel
Date: Tue, 1 Sep 2020 22:38:21 +0200
Subject: [PATCH] Merge pull request #18211 from
pemmanuelviel:pev--handle-dna-vectors
* DNA-mode: update miniflann to handle DNA
* DNA-mode: update hierarchical kmeans to handle DNA sequences
---
.../include/opencv2/flann/kmeans_index.h | 222 +++++++++++++++++-
modules/flann/src/miniflann.cpp | 25 +-
2 files changed, 244 insertions(+), 3 deletions(-)
diff --git a/modules/flann/include/opencv2/flann/kmeans_index.h b/modules/flann/include/opencv2/flann/kmeans_index.h
index f96b0f81fc..6f63629d05 100644
--- a/modules/flann/include/opencv2/flann/kmeans_index.h
+++ b/modules/flann/include/opencv2/flann/kmeans_index.h
@@ -50,6 +50,9 @@
#include "logger.h"
#define BITS_PER_CHAR 8
+#define BITS_PER_BASE 2 // for DNA/RNA sequences
+#define BASE_PER_CHAR (BITS_PER_CHAR/BITS_PER_BASE)
+#define HISTOS_PER_BASE (1<(
+ veclen_*sizeof(CentersType)*(HISTOS_PER_BASE*BASE_PER_CHAR));
+
+ unsigned long long variance = 0ull;
+ unsigned int* histograms = new unsigned int[histos_veclen];
+ memset(histograms, 0, sizeof(unsigned int)*histos_veclen);
+
+ for (unsigned int i=0; i( ensureSquareDistance(
+ distance_(dataset_[indices[i]], ZeroIterator(), veclen_)));
+
+ unsigned char* vec = (unsigned char*)dataset_[indices[i]];
+ for (size_t k=0, l=0; k>2) & 0x03)]++;
+ histograms[k + 8 + ((vec[l]>>4) & 0x03)]++;
+ histograms[k +12 + ((vec[l]>>6) & 0x03)]++;
+ }
+ }
+
+ CentersType* mean = new CentersType[veclen_];
+ memoryCounter_ += int(veclen_*sizeof(CentersType));
+ unsigned char* char_mean = (unsigned char*)mean;
+ unsigned int* h = histograms;
+ for (size_t k=0, l=0; k h[k+1] ? h[k+2] > h[k+3] ? h[k] > h[k+2] ? 0x00 : 0x10
+ : h[k] > h[k+3] ? 0x00 : 0x11
+ : h[k+2] > h[k+3] ? h[k+1] > h[k+2] ? 0x01 : 0x10
+ : h[k+1] > h[k+3] ? 0x01 : 0x11)
+ | (h[k+4]>h[k+5] ? h[k+6] > h[k+7] ? h[k+4] > h[k+6] ? 0x00 : 0x1000
+ : h[k+4] > h[k+7] ? 0x00 : 0x1100
+ : h[k+6] > h[k+7] ? h[k+5] > h[k+6] ? 0x0100 : 0x1000
+ : h[k+5] > h[k+7] ? 0x0100 : 0x1100)
+ | (h[k+8]>h[k+9] ? h[k+10]>h[k+11] ? h[k+8] >h[k+10] ? 0x00 : 0x100000
+ : h[k+8] >h[k+11] ? 0x00 : 0x110000
+ : h[k+10]>h[k+11] ? h[k+9] >h[k+10] ? 0x010000 : 0x100000
+ : h[k+9] >h[k+11] ? 0x010000 : 0x110000)
+ | (h[k+12]>h[k+13] ? h[k+14]>h[k+15] ? h[k+12] >h[k+14] ? 0x00 : 0x10000000
+ : h[k+12] >h[k+15] ? 0x00 : 0x11000000
+ : h[k+14]>h[k+15] ? h[k+13] >h[k+14] ? 0x01000000 : 0x10000000
+ : h[k+13] >h[k+15] ? 0x01000000 : 0x11000000);
+ }
+ variance = static_cast(
+ 0.5 + static_cast(variance) / static_cast(indices_length));
+ variance -= static_cast(
+ ensureSquareDistance(
+ distance_(mean, ZeroIterator(), veclen_)));
+
+ DistanceType radius = 0;
+ for (unsigned int i=0; iradius) {
+ radius = tmp;
+ }
+ }
+
+ node->variance = static_cast(variance);
+ node->radius = radius;
+ node->pivot = mean;
+
+ delete[] histograms;
+ }
+
+
template
void computeNodeStatistics(KMeansNodePtr node, int* indices,
unsigned int indices_length,
@@ -858,6 +928,22 @@ private:
computeBitfieldNodeStatistics(node, indices, indices_length);
}
+ void computeNodeStatistics(KMeansNodePtr node, int* indices,
+ unsigned int indices_length,
+ const cvflann::DNAmmingLUT* identifier)
+ {
+ (void)identifier;
+ computeDnaNodeStatistics(node, indices, indices_length);
+ }
+
+ void computeNodeStatistics(KMeansNodePtr node, int* indices,
+ unsigned int indices_length,
+ const cvflann::DNAmming2* identifier)
+ {
+ (void)identifier;
+ computeDnaNodeStatistics(node, indices, indices_length);
+ }
+
void refineClustering(int* indices, int indices_length, int branching, CentersType** centers,
std::vector& radiuses, int* belongs_to, int* count)
@@ -1051,6 +1137,112 @@ private:
}
+ void refineDnaClustering(int* indices, int indices_length, int branching, CentersType** centers,
+ std::vector& radiuses, int* belongs_to, int* count)
+ {
+ for (int i=0; i(
+ veclen_*sizeof(CentersType)*(HISTOS_PER_BASE*BASE_PER_CHAR));
+ cv::AutoBuffer histos_buf(branching*histos_veclen);
+ Matrix histos(histos_buf.data(), branching, histos_veclen);
+
+ bool converged = false;
+ int iteration = 0;
+ while (!converged && iteration>2) & 0x03)]++;
+ h[k + 8 + ((vec[l]>>4) & 0x03)]++;
+ h[k +12 + ((vec[l]>>6) & 0x03)]++;
+ }
+ }
+ for (int i=0; i h[k+1] ? h[k+2] > h[k+3] ? h[k] > h[k+2] ? 0x00 : 0x10
+ : h[k] > h[k+3] ? 0x00 : 0x11
+ : h[k+2] > h[k+3] ? h[k+1] > h[k+2] ? 0x01 : 0x10
+ : h[k+1] > h[k+3] ? 0x01 : 0x11)
+ | (h[k+4]>h[k+5] ? h[k+6] > h[k+7] ? h[k+4] > h[k+6] ? 0x00 : 0x1000
+ : h[k+4] > h[k+7] ? 0x00 : 0x1100
+ : h[k+6] > h[k+7] ? h[k+5] > h[k+6] ? 0x0100 : 0x1000
+ : h[k+5] > h[k+7] ? 0x0100 : 0x1100)
+ | (h[k+8]>h[k+9] ? h[k+10]>h[k+11] ? h[k+8] >h[k+10] ? 0x00 : 0x100000
+ : h[k+8] >h[k+11] ? 0x00 : 0x110000
+ : h[k+10]>h[k+11] ? h[k+9] >h[k+10] ? 0x010000 : 0x100000
+ : h[k+9] >h[k+11] ? 0x010000 : 0x110000)
+ | (h[k+12]>h[k+13] ? h[k+14]>h[k+15] ? h[k+12] >h[k+14] ? 0x00 : 0x10000000
+ : h[k+12] >h[k+15] ? 0x00 : 0x11000000
+ : h[k+14]>h[k+15] ? h[k+13] >h[k+14] ? 0x01000000 : 0x10000000
+ : h[k+13] >h[k+15] ? 0x01000000 : 0x11000000);
+ }
+ }
+
+ std::vector new_centroids(indices_length);
+ std::vector dists(indices_length);
+
+ // reassign points to clusters
+ KMeansDistanceComputer invoker(
+ distance_, dataset_, branching, indices, centers, veclen_, new_centroids, dists);
+ parallel_for_(cv::Range(0, (int)indices_length), invoker);
+
+ for (int i=0; i < indices_length; ++i) {
+ DistanceType dist(dists[i]);
+ int new_centroid(new_centroids[i]);
+ if (dist > radiuses[new_centroid]) {
+ radiuses[new_centroid] = dist;
+ }
+ if (new_centroid != belongs_to[i]) {
+ count[belongs_to[i]]--;
+ count[new_centroid]++;
+ belongs_to[i] = new_centroid;
+ converged = false;
+ }
+ }
+
+ for (int i=0; i& radiuses, int* belongs_to, int* count)
@@ -1150,7 +1342,7 @@ private:
/**
* The methods responsible with doing the recursive hierarchical clustering on
* binary vectors.
- * As some might have heared that KMeans on binary data doesn't make sense,
+ * As some might have heard that KMeans on binary data doesn't make sense,
* it's worth a little explanation why it actually fairly works. As
* with the Hierarchical Clustering algortihm, we seed several centers for the
* current node by picking some of its points. Then in a first pass each point
@@ -1233,6 +1425,34 @@ private:
}
+ void refineAndSplitClustering(
+ KMeansNodePtr node, int* indices, int indices_length, int branching,
+ int level, CentersType** centers, std::vector& radiuses,
+ int* belongs_to, int* count, const cvflann::DNAmmingLUT* identifier)
+ {
+ (void)identifier;
+ refineDnaClustering(
+ indices, indices_length, branching, centers, radiuses, belongs_to, count);
+
+ computeAnyBitfieldSubClustering(node, indices, indices_length, branching,
+ level, centers, radiuses, belongs_to, count);
+ }
+
+
+ void refineAndSplitClustering(
+ KMeansNodePtr node, int* indices, int indices_length, int branching,
+ int level, CentersType** centers, std::vector& radiuses,
+ int* belongs_to, int* count, const cvflann::DNAmming2* identifier)
+ {
+ (void)identifier;
+ refineDnaClustering(
+ indices, indices_length, branching, centers, radiuses, belongs_to, count);
+
+ computeAnyBitfieldSubClustering(node, indices, indices_length, branching,
+ level, centers, radiuses, belongs_to, count);
+ }
+
+
/**
* The method responsible with actually doing the recursive hierarchical
* clustering
diff --git a/modules/flann/src/miniflann.cpp b/modules/flann/src/miniflann.cpp
index 1f698e7871..a1146ec2e7 100644
--- a/modules/flann/src/miniflann.cpp
+++ b/modules/flann/src/miniflann.cpp
@@ -345,6 +345,7 @@ typedef ::cvflann::Hamming HammingDistance;
#else
typedef ::cvflann::HammingLUT HammingDistance;
#endif
+typedef ::cvflann::DNAmming2 DNAmmingDistance;
Index::Index()
{
@@ -397,6 +398,9 @@ void Index::build(InputArray _data, const IndexParams& params, flann_distance_t
buildIndex< ::cvflann::L1 >(index, data, params);
break;
#if MINIFLANN_SUPPORT_EXOTIC_DISTANCE_TYPES
+ case FLANN_DIST_DNAMMING:
+ buildIndex< DNAmmingDistance >(index, data, params);
+ break;
case FLANN_DIST_MAX:
buildIndex< ::cvflann::MaxDistance >(index, data, params);
break;
@@ -452,6 +456,9 @@ void Index::release()
deleteIndex< ::cvflann::L1 >(index);
break;
#if MINIFLANN_SUPPORT_EXOTIC_DISTANCE_TYPES
+ case FLANN_DIST_DNAMMING:
+ deleteIndex< DNAmmingDistance >(index);
+ break;
case FLANN_DIST_MAX:
deleteIndex< ::cvflann::MaxDistance >(index);
break;
@@ -573,7 +580,8 @@ void Index::knnSearch(InputArray _query, OutputArray _indices,
CV_INSTRUMENT_REGION();
Mat query = _query.getMat(), indices, dists;
- int dtype = distType == FLANN_DIST_HAMMING ? CV_32S : CV_32F;
+ int dtype = (distType == FLANN_DIST_HAMMING)
+ || (distType == FLANN_DIST_DNAMMING) ? CV_32S : CV_32F;
createIndicesDists( _indices, _dists, indices, dists, query.rows, knn, knn, dtype );
@@ -589,6 +597,9 @@ void Index::knnSearch(InputArray _query, OutputArray _indices,
runKnnSearch< ::cvflann::L1 >(index, query, indices, dists, knn, params);
break;
#if MINIFLANN_SUPPORT_EXOTIC_DISTANCE_TYPES
+ case FLANN_DIST_DNAMMING:
+ runKnnSearch(index, query, indices, dists, knn, params);
+ break;
case FLANN_DIST_MAX:
runKnnSearch< ::cvflann::MaxDistance >(index, query, indices, dists, knn, params);
break;
@@ -617,7 +628,8 @@ int Index::radiusSearch(InputArray _query, OutputArray _indices,
CV_INSTRUMENT_REGION();
Mat query = _query.getMat(), indices, dists;
- int dtype = distType == FLANN_DIST_HAMMING ? CV_32S : CV_32F;
+ int dtype = (distType == FLANN_DIST_HAMMING)
+ || (distType == FLANN_DIST_DNAMMING) ? CV_32S : CV_32F;
CV_Assert( maxResults > 0 );
createIndicesDists( _indices, _dists, indices, dists, query.rows, maxResults, INT_MAX, dtype );
@@ -634,6 +646,8 @@ int Index::radiusSearch(InputArray _query, OutputArray _indices,
case FLANN_DIST_L1:
return runRadiusSearch< ::cvflann::L1 >(index, query, indices, dists, radius, params);
#if MINIFLANN_SUPPORT_EXOTIC_DISTANCE_TYPES
+ case FLANN_DIST_DNAMMING:
+ return runRadiusSearch< DNAmmingDistance >(index, query, indices, dists, radius, params);
case FLANN_DIST_MAX:
return runRadiusSearch< ::cvflann::MaxDistance >(index, query, indices, dists, radius, params);
case FLANN_DIST_HIST_INTERSECT:
@@ -697,6 +711,9 @@ void Index::save(const String& filename) const
saveIndex< ::cvflann::L1 >(this, index, fout);
break;
#if MINIFLANN_SUPPORT_EXOTIC_DISTANCE_TYPES
+ case FLANN_DIST_DNAMMING:
+ saveIndex< DNAmmingDistance >(this, index, fout);
+ break;
case FLANN_DIST_MAX:
saveIndex< ::cvflann::MaxDistance >(this, index, fout);
break;
@@ -778,6 +795,7 @@ bool Index::load(InputArray _data, const String& filename)
distType = (flann_distance_t)idistType;
if( !((distType == FLANN_DIST_HAMMING && featureType == CV_8U) ||
+ (distType == FLANN_DIST_DNAMMING && featureType == CV_8U) ||
(distType != FLANN_DIST_HAMMING && featureType == CV_32F)) )
{
fprintf(stderr, "Reading FLANN index error: unsupported feature type %d for the index type %d\n", featureType, algo);
@@ -797,6 +815,9 @@ bool Index::load(InputArray _data, const String& filename)
loadIndex< ::cvflann::L1 >(this, index, data, fin);
break;
#if MINIFLANN_SUPPORT_EXOTIC_DISTANCE_TYPES
+ case FLANN_DIST_DNAMMING:
+ loadIndex< DNAmmingDistance >(this, index, data, fin);
+ break;
case FLANN_DIST_MAX:
loadIndex< ::cvflann::MaxDistance >(this, index, data, fin);
break;