mirror of https://github.com/opencv/opencv.git
Fast deep-copy-transpose implemented which attempts to not thrash the cache. Good first step for now, but no transpose at all would be preferrable. Started I/O.
parent
d126263983
commit
346f7d0f3e
6 changed files with 261 additions and 34 deletions
@ -0,0 +1,98 @@ |
||||
#ifndef OPENCV_TRANSPOSE_HPP_ |
||||
#define OPENCV_TRANSPOSE_HPP_ |
||||
|
||||
template <typename InputScalar, typename OutputScalar> |
||||
void transposeBlock(const size_t M, const size_t N, const InputScalar* src, size_t lda, OutputScalar* dst, size_t ldb) { |
||||
InputScalar cache[16]; |
||||
// copy the source into the cache contiguously
|
||||
for (size_t n = 0; n < N; ++n) |
||||
for (size_t m = 0; m < M; ++m) |
||||
cache[m+n*4] = src[m+n*lda]; |
||||
// copy the destination out of the cache contiguously
|
||||
for (size_t m = 0; m < M; ++m) |
||||
for (size_t n = 0; n < N; ++n)
|
||||
dst[n+m*ldb] = cache[m+n*4]; |
||||
} |
||||
|
||||
template <typename InputScalar, typename OutputScalar> |
||||
void transpose4x4(const InputScalar* src, size_t lda, OutputScalar* dst, size_t ldb) { |
||||
InputScalar cache[16]; |
||||
// copy the source into the cache contiguously
|
||||
cache[0] = src[0]; cache[1] = src[1]; cache[2] = src[2]; cache[3] = src[3]; src+=lda; |
||||
cache[4] = src[0]; cache[5] = src[1]; cache[6] = src[2]; cache[7] = src[3]; src+=lda; |
||||
cache[8] = src[0]; cache[9] = src[1]; cache[10] = src[2]; cache[11] = src[3]; src+=lda; |
||||
cache[12] = src[0]; cache[13] = src[1]; cache[14] = src[2]; cache[15] = src[3]; src+=lda; |
||||
// copy the destination out of the contiguously
|
||||
dst[0] = cache[0]; dst[1] = cache[4]; dst[2] = cache[8]; dst[3] = cache[12]; dst+=ldb; |
||||
dst[0] = cache[1]; dst[1] = cache[5]; dst[2] = cache[9]; dst[3] = cache[13]; dst+=ldb; |
||||
dst[0] = cache[2]; dst[1] = cache[6]; dst[2] = cache[10]; dst[3] = cache[14]; dst+=ldb; |
||||
dst[0] = cache[3]; dst[1] = cache[7]; dst[2] = cache[11]; dst[3] = cache[15]; dst+=ldb; |
||||
} |
||||
|
||||
|
||||
/*
|
||||
* Vanilla copy, transpose and cast |
||||
*/ |
||||
template <typename InputScalar, typename OutputScalar> |
||||
void gemt(const char major, const size_t M, const size_t N, const InputScalar* a, size_t lda, OutputScalar* b, size_t ldb) { |
||||
// 1x1 transpose is just copy
|
||||
if (M == 1 && N == 1) { *b = *a; return; } |
||||
|
||||
// get the interior 4x4 blocks, and the extra skirting
|
||||
const size_t Fblock = (major == 'R') ? N/4 : M/4; |
||||
const size_t Frem = (major == 'R') ? N%4 : M%4; |
||||
const size_t Sblock = (major == 'R') ? M/4 : N/4; |
||||
const size_t Srem = (major == 'R') ? M%4 : N%4; |
||||
|
||||
// if less than 4x4, invoke the block transpose immediately
|
||||
if (M < 4 && N < 4) { transposeBlock(Frem, Srem, a, lda, b, ldb); return; } |
||||
|
||||
// transpose 4x4 blocks
|
||||
const InputScalar* aptr = a; |
||||
OutputScalar* bptr = b; |
||||
for (size_t second = 0; second < Sblock; ++second) { |
||||
aptr = a + second*lda; |
||||
bptr = b + second; |
||||
for (size_t first = 0; first < Fblock; ++first) { |
||||
transposeBlock(4, 4, aptr, lda, bptr, ldb); |
||||
//transpose4x4(aptr, lda, bptr, ldb);
|
||||
aptr+=4; |
||||
bptr+=4*ldb; |
||||
} |
||||
// transpose trailing blocks on primary dimension
|
||||
transposeBlock(Frem, 4, aptr, lda, bptr, ldb); |
||||
} |
||||
// transpose trailing blocks on secondary dimension
|
||||
aptr = a + 4*Sblock*lda; |
||||
bptr = b + 4*Sblock; |
||||
for (size_t first = 0; first < Fblock; ++first) { |
||||
transposeBlock(4, Srem, aptr, lda, bptr, ldb); |
||||
aptr+=4; |
||||
bptr+=4*ldb; |
||||
} |
||||
// transpose bottom right-hand corner
|
||||
transposeBlock(Frem, Srem, aptr, lda, bptr, ldb); |
||||
} |
||||
|
||||
#ifdef __SSE2__ |
||||
/*
|
||||
* SSE2 supported fast copy, transpose and cast |
||||
*/ |
||||
#include <emmintrin.h> |
||||
|
||||
template <> |
||||
void transpose4x4<float, float>(const float* src, size_t lda, float* dst, size_t ldb) { |
||||
__m128 row0, row1, row2, row3; |
||||
row0 = _mm_loadu_ps(src); |
||||
row1 = _mm_loadu_ps(src+lda); |
||||
row2 = _mm_loadu_ps(src+2*lda); |
||||
row3 = _mm_loadu_ps(src+3*lda); |
||||
_MM_TRANSPOSE4_PS(row0, row1, row2, row3); |
||||
_mm_storeu_ps(dst, row0); |
||||
_mm_storeu_ps(dst+ldb, row1); |
||||
_mm_storeu_ps(dst+2*ldb, row2); |
||||
_mm_storeu_ps(dst+3*ldb, row3); |
||||
} |
||||
|
||||
#endif |
||||
#endif |
@ -0,0 +1,48 @@ |
||||
#ifndef OPENCV_FILEBUFFER_HPP_ |
||||
#define OPENCV_FILEBUFFER_HPP_ |
||||
|
||||
#include <vector> |
||||
#include <streambuf> |
||||
|
||||
class EndianFileBuffer : public std::streambuf { |
||||
private: |
||||
const int fd_; |
||||
const size_t put_back_; |
||||
std::vector<char> buffer_; |
||||
|
||||
// prevent copy construction
|
||||
EndianFileBuffer(const EndianFileBuffer&); |
||||
EndianFileBuffer& operator=(const EndianFileBuffer&); |
||||
|
||||
public: |
||||
explicit EndianFileBuffer(int fd, size_t buffer_sz, size_t put_back) : |
||||
fd_(fd), put_back_(max(put_back, 1)), buffer_(max(buffer_sz, put_back_) + put_back_) { |
||||
char *end = &buffer_.front() + buffer_.size(); |
||||
setg(end, end, end); |
||||
} |
||||
|
||||
std::streambuf::int_type underflow() { |
||||
if (gptr() < egptr()) // buffer not exhausted
|
||||
return traits_type::to_int_type(*gptr()); |
||||
|
||||
char *base = &buffer_.front(); |
||||
char *start = base; |
||||
|
||||
if (eback() == base) { // true when this isn't the first fill
|
||||
std::memmove(base, egptr() - put_back_, put_back_); |
||||
start += put_back_; |
||||
} |
||||
|
||||
// start is now the start of the buffer
|
||||
// refill from the file
|
||||
read(fd_, start, buffer_.size() - (start - base)); |
||||
if (n == 0) return traits_type::eof(); |
||||
|
||||
// set buffer pointers
|
||||
setg(base, start, start + n); |
||||
return traits_type::to_int_type(*gptr()); |
||||
} |
||||
|
||||
}; |
||||
|
||||
#endif |
@ -0,0 +1,32 @@ |
||||
#include <ctime> |
||||
#include <stringstream> |
||||
|
||||
const char* day[] = { "Sun", "Mon", "Tue", "Wed", "Thurs", "Fri", "Sat" }; |
||||
const char* month[] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" }; |
||||
const char* arch = "${MEX_ARCH}" |
||||
|
||||
std::string formatCurrentTime() { |
||||
ostringstream oss; |
||||
time_t rawtime; |
||||
struct tm* timeinfo; |
||||
int dom, hour, min, sec, year; |
||||
// compute the current time
|
||||
time(&rawtime); |
||||
timeinfo = localtime(&rawtime); |
||||
// extract the components of interest
|
||||
dom = timeinfo->tm_mday; |
||||
hour = timeinfo->tm_hour; |
||||
min = timeinfo->tm_min; |
||||
sec = timeinfo->tm_sec; |
||||
year = timeinfo->year + 1900; |
||||
oss << day[timeinfo->tm_wday] << " " << month[timeinfo->tm_mon]
|
||||
<< " " << dom << " " << hour << ":" << min << ":" << sec << " " << year; |
||||
return oss.str(); |
||||
} |
||||
|
||||
void MatlabIO::whos() { |
||||
std::cout << "-------------------- whos --------------------" << std::endl; |
||||
std::cout << "Filename: " << filename() << std::endl; |
||||
std::cout << "File size: " << filesize() << "MB" << std::endl << std::endl; |
||||
std::cout << "Name size bytes type" << std::endl; |
||||
std::cout << "----------------------------------------------" << std::endl; |
@ -0,0 +1,81 @@ |
||||
#ifndef MATLAB_IO_HPP_ |
||||
#define MATLAB_IO_HPP_ |
||||
|
||||
#include <opencv2/core.hpp> |
||||
#include "map.hpp" |
||||
|
||||
namespace Matlab { |
||||
namespace IO { |
||||
static const int VERSION_5 = 5; |
||||
static const int VERSION_73 = 73; |
||||
} |
||||
|
||||
class Index { |
||||
private: |
||||
//! the name of the field (if associative container)
|
||||
std::string name_; |
||||
//! beginning of the data field in the file
|
||||
size_t begin_; |
||||
//! address after the last data field
|
||||
size_t end_; |
||||
//! Matlab stored-type
|
||||
int stored_type_; |
||||
//! Matlab actual type (sometimes compression is used)
|
||||
int type_; |
||||
//! is the field compressed?
|
||||
bool compressed_; |
||||
//! are the descendents associative (mappings)
|
||||
bool associative_; |
||||
//! the descendents of this node
|
||||
union { |
||||
//! valid if the container is a sequence (list)
|
||||
std::vector<Index> sequence_; |
||||
//! valid if the container is a mapping (associative)
|
||||
Map<std::string, Index> mapping_; |
||||
}; |
||||
|
||||
}; |
||||
|
||||
class MatlabIONode { |
||||
|
||||
}; |
||||
|
||||
class MatlabIO { |
||||
private: |
||||
// member variables
|
||||
static const int HEADER_LENGTH = 116; |
||||
static const int SUBSYS_LENGTH = 8; |
||||
static const int ENDIAN_LENGTH = 2; |
||||
char header_[HEADER_LENGTH+1]; |
||||
char subsys_[SUBSYS_LENGTH+1]; |
||||
char endian_[ENDIAN_LENGTH+1]; |
||||
int version_; |
||||
bool byte_swap_; |
||||
std::string filename_; |
||||
// uses a custom stream buffer for fast memory-mapped access and endian swapping
|
||||
std::fstream stream_; |
||||
//! the main file index. The top-level index must be associative
|
||||
Index index_; |
||||
|
||||
// internal methods
|
||||
void getFileHeader(); |
||||
void setFileHeader(); |
||||
|
||||
void getHeader(); |
||||
void setHeader(); |
||||
|
||||
public: |
||||
// construct/destruct
|
||||
MatlabIO() {} |
||||
~MatlabIO {} |
||||
|
||||
// global read and write routines
|
||||
std::string filename(void); |
||||
bool open(const std::string& filename, const std::string& mode); |
||||
|
||||
// index the contents of the file
|
||||
void index(); |
||||
|
||||
// print all of the top-level variables in the file
|
||||
} |
||||
#endif |
Loading…
Reference in new issue