diff --git a/modules/matlab/CMakeLists.txt b/modules/matlab/CMakeLists.txt
index f0258a10ad..ae35bcdd05 100644
--- a/modules/matlab/CMakeLists.txt
+++ b/modules/matlab/CMakeLists.txt
@@ -118,7 +118,7 @@ if (NOT MEX_WORKS)
                 ${CMAKE_CURRENT_SOURCE_DIR}/test/test_compiler.cpp
         WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/junk
         ERROR_VARIABLE MEX_ERROR
-        #OUTPUT_QUIET
+        OUTPUT_QUIET
     )
 
     if (MEX_ERROR)
diff --git a/modules/matlab/include/mxarray.hpp b/modules/matlab/include/mxarray.hpp
index 00543486a0..f34c2f6d38 100644
--- a/modules/matlab/include/mxarray.hpp
+++ b/modules/matlab/include/mxarray.hpp
@@ -2,6 +2,7 @@
 #define OPENCV_MXARRAY_HPP_
 
 #include "mex.h"
+#include "transpose.hpp"
 #include <vector>
 #include <string>
 #include <opencv2/core.hpp>
@@ -496,39 +497,6 @@ cv::Mat MxArray::toMat<Matlab::InheritType>() const {
 //                            MATRIX TRANSPOSE
 // ----------------------------------------------------------------------------
 
-template <typename InputScalar, typename OutputScalar>
-void gemt(const char major, const size_t M, const size_t N, const InputScalar* a, size_t lda, OutputScalar* b, size_t ldb) {
-  switch (major) {
-  case 'R':
-    for (size_t m = 0; m < M; ++m) {
-      InputScalar const *       arow = a + m*lda;
-      InputScalar const * const aend = arow + N;
-      OutputScalar      *       bcol = b + m;
-      while (arow < aend) {
-        *bcol = *arow;
-        arow++;
-        bcol+=ldb;
-      }
-    }
-    return;
-  case 'C':
-    for (size_t n = 0; n < N; ++n) {
-      InputScalar const *       acol = a + n*lda;
-      InputScalar const * const aend = acol + M;
-      OutputScalar      *       brow = b + n;
-      while (acol < aend) {
-        *brow = *acol;
-        acol++;
-        brow+=ldb;
-      }
-    }
-    return;
-  default: 
-    error(std::string("Unknown ordering given: ").append(std::string(1,major)));
-  }
-}
-
-
 
 template <typename InputScalar, typename OutputScalar>
 void deepCopyAndTranspose(const cv::Mat& in, MxArray& out) {
diff --git a/modules/matlab/include/transpose.hpp b/modules/matlab/include/transpose.hpp
new file mode 100644
index 0000000000..2a820f43f9
--- /dev/null
+++ b/modules/matlab/include/transpose.hpp
@@ -0,0 +1,98 @@
+#ifndef OPENCV_TRANSPOSE_HPP_
+#define OPENCV_TRANSPOSE_HPP_
+
+template <typename InputScalar, typename OutputScalar>
+void transposeBlock(const size_t M, const size_t N, const InputScalar* src, size_t lda, OutputScalar* dst, size_t ldb) {
+  InputScalar cache[16];
+  // copy the source into the cache contiguously
+  for (size_t n = 0; n < N; ++n)
+    for (size_t m = 0; m < M; ++m)
+      cache[m+n*4] = src[m+n*lda];
+  // copy the destination out of the cache contiguously
+  for (size_t m = 0; m < M; ++m)
+    for (size_t n = 0; n < N; ++n) 
+      dst[n+m*ldb] = cache[m+n*4];
+}
+
+template <typename InputScalar, typename OutputScalar>
+void transpose4x4(const InputScalar* src, size_t lda, OutputScalar* dst, size_t ldb) {
+  InputScalar cache[16];
+  // copy the source into the cache contiguously
+  cache[0] = src[0];  cache[1] = src[1];  cache[2] = src[2];  cache[3] = src[3];  src+=lda;
+  cache[4] = src[0];  cache[5] = src[1];  cache[6] = src[2];  cache[7] = src[3];  src+=lda;
+  cache[8] = src[0];  cache[9] = src[1];  cache[10] = src[2]; cache[11] = src[3]; src+=lda;
+  cache[12] = src[0]; cache[13] = src[1]; cache[14] = src[2]; cache[15] = src[3]; src+=lda;
+  // copy the destination out of the contiguously
+  dst[0] = cache[0];  dst[1] = cache[4];  dst[2] = cache[8];   dst[3] = cache[12]; dst+=ldb;
+  dst[0] = cache[1];  dst[1] = cache[5];  dst[2] = cache[9];   dst[3] = cache[13]; dst+=ldb;
+  dst[0] = cache[2];  dst[1] = cache[6];  dst[2] = cache[10];  dst[3] = cache[14]; dst+=ldb;
+  dst[0] = cache[3];  dst[1] = cache[7];  dst[2] = cache[11];  dst[3] = cache[15]; dst+=ldb;
+}
+
+
+/*
+ * Vanilla copy, transpose and cast
+ */
+template <typename InputScalar, typename OutputScalar>
+void gemt(const char major, const size_t M, const size_t N, const InputScalar* a, size_t lda, OutputScalar* b, size_t ldb) {
+  // 1x1 transpose is just copy
+  if (M == 1 && N == 1) { *b = *a; return; }
+
+  // get the interior 4x4 blocks, and the extra skirting
+  const size_t Fblock = (major == 'R') ? N/4 : M/4;
+  const size_t Frem   = (major == 'R') ? N%4 : M%4;
+  const size_t Sblock = (major == 'R') ? M/4 : N/4;
+  const size_t Srem   = (major == 'R') ? M%4 : N%4;
+
+  // if less than 4x4, invoke the block transpose immediately
+  if (M < 4 && N < 4) { transposeBlock(Frem, Srem, a, lda, b, ldb); return; }
+
+  // transpose 4x4 blocks
+  const InputScalar* aptr = a;
+  OutputScalar* bptr = b;
+  for (size_t second = 0; second < Sblock; ++second) {
+    aptr = a + second*lda;
+    bptr = b + second;
+    for (size_t first = 0; first < Fblock; ++first) {
+      transposeBlock(4, 4, aptr, lda, bptr, ldb);
+      //transpose4x4(aptr, lda, bptr, ldb);
+      aptr+=4;
+      bptr+=4*ldb;
+    }
+    // transpose trailing blocks on primary dimension
+    transposeBlock(Frem, 4, aptr, lda, bptr, ldb);
+  }
+  // transpose trailing blocks on secondary dimension
+  aptr = a + 4*Sblock*lda;
+  bptr = b + 4*Sblock;
+  for (size_t first = 0; first < Fblock; ++first) {
+    transposeBlock(4, Srem, aptr, lda, bptr, ldb);
+    aptr+=4;
+    bptr+=4*ldb;
+  }
+  // transpose bottom right-hand corner
+  transposeBlock(Frem, Srem, aptr, lda, bptr, ldb);
+}
+
+#ifdef __SSE2__
+/*
+ * SSE2 supported fast copy, transpose and cast
+ */
+#include <emmintrin.h>
+
+template <>
+void transpose4x4<float, float>(const float* src, size_t lda, float* dst, size_t ldb) {
+  __m128 row0, row1, row2, row3;
+  row0 = _mm_loadu_ps(src);
+  row1 = _mm_loadu_ps(src+lda);
+  row2 = _mm_loadu_ps(src+2*lda);
+  row3 = _mm_loadu_ps(src+3*lda);
+  _MM_TRANSPOSE4_PS(row0, row1, row2, row3);
+  _mm_storeu_ps(dst, row0);
+  _mm_storeu_ps(dst+ldb, row1);
+  _mm_storeu_ps(dst+2*ldb, row2);
+  _mm_storeu_ps(dst+3*ldb, row3);
+}
+
+#endif
+#endif
diff --git a/modules/matlab/io/FileBuffer.hpp b/modules/matlab/io/FileBuffer.hpp
new file mode 100644
index 0000000000..61acd48bdd
--- /dev/null
+++ b/modules/matlab/io/FileBuffer.hpp
@@ -0,0 +1,48 @@
+#ifndef OPENCV_FILEBUFFER_HPP_
+#define OPENCV_FILEBUFFER_HPP_
+
+#include <vector>
+#include <streambuf>
+
+class EndianFileBuffer : public std::streambuf {
+private:
+  const int fd_;
+  const size_t put_back_;
+  std::vector<char> buffer_;
+
+  // prevent copy construction
+  EndianFileBuffer(const EndianFileBuffer&);
+  EndianFileBuffer& operator=(const EndianFileBuffer&);
+
+public:
+  explicit EndianFileBuffer(int fd, size_t buffer_sz, size_t put_back) :
+      fd_(fd), put_back_(max(put_back, 1)), buffer_(max(buffer_sz, put_back_) + put_back_) {
+    char *end = &buffer_.front() + buffer_.size();
+    setg(end, end, end);
+  }
+
+  std::streambuf::int_type underflow() {
+    if (gptr() < egptr()) // buffer not exhausted
+      return traits_type::to_int_type(*gptr());
+
+    char *base  = &buffer_.front();
+    char *start = base;
+
+    if (eback() == base) { // true when this isn't the first fill
+      std::memmove(base, egptr() - put_back_, put_back_);
+      start += put_back_;
+    }
+
+    // start is now the start of the buffer
+    // refill from the file
+    read(fd_, start, buffer_.size() - (start - base));
+    if (n == 0) return traits_type::eof();
+
+    // set buffer pointers
+    setg(base, start, start + n);
+    return traits_type::to_int_type(*gptr());
+  }
+
+};
+
+#endif
diff --git a/modules/matlab/io/MatlabIO.cpp b/modules/matlab/io/MatlabIO.cpp
new file mode 100644
index 0000000000..3f61d940e3
--- /dev/null
+++ b/modules/matlab/io/MatlabIO.cpp
@@ -0,0 +1,32 @@
+#include <ctime>
+#include <stringstream>
+
+const char* day[]   = { "Sun", "Mon", "Tue", "Wed", "Thurs", "Fri", "Sat" };
+const char* month[] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" };
+const char* arch    = "${MEX_ARCH}"
+
+std::string formatCurrentTime() {
+  ostringstream oss;
+  time_t rawtime;
+  struct tm* timeinfo;
+  int dom, hour, min, sec, year;
+  // compute the current time
+  time(&rawtime);
+  timeinfo = localtime(&rawtime);
+  // extract the components of interest
+  dom  = timeinfo->tm_mday;
+  hour = timeinfo->tm_hour;
+  min  = timeinfo->tm_min;
+  sec  = timeinfo->tm_sec;
+  year = timeinfo->year + 1900;
+  oss << day[timeinfo->tm_wday] << " " << month[timeinfo->tm_mon] 
+      << " " << dom << " " << hour << ":" << min << ":" << sec << " " << year;
+  return oss.str();
+}
+
+void MatlabIO::whos() {
+  std::cout << "-------------------- whos --------------------" << std::endl;
+  std::cout << "Filename: " << filename() << std::endl;
+  std::cout << "File size: " << filesize() << "MB" << std::endl << std::endl;
+  std::cout << "Name        size         bytes         type" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
diff --git a/modules/matlab/io/MatlabIO.hpp b/modules/matlab/io/MatlabIO.hpp
new file mode 100644
index 0000000000..9723bfc86a
--- /dev/null
+++ b/modules/matlab/io/MatlabIO.hpp
@@ -0,0 +1,81 @@
+#ifndef MATLAB_IO_HPP_
+#define MATLAB_IO_HPP_
+
+#include <opencv2/core.hpp>
+#include "map.hpp"
+
+namespace Matlab {
+  namespace IO {
+    static const int VERSION_5  = 5;
+    static const int VERSION_73 = 73;
+  }
+
+class Index {
+private:
+  //! the name of the field (if associative container)
+  std::string name_;
+  //! beginning of the data field in the file
+  size_t begin_;
+  //! address after the last data field
+  size_t end_;
+  //! Matlab stored-type
+  int stored_type_;
+  //! Matlab actual type (sometimes compression is used)
+  int type_;
+  //! is the field compressed?
+  bool compressed_;
+  //! are the descendents associative (mappings)
+  bool associative_;
+  //! the descendents of this node
+  union {
+    //! valid if the container is a sequence (list)
+    std::vector<Index> sequence_;
+    //! valid if the container is a mapping (associative)
+    Map<std::string, Index> mapping_;
+  };
+
+};
+
+class MatlabIONode {
+
+};
+
+class MatlabIO {
+private:
+  // member variables
+  static const int HEADER_LENGTH = 116;
+  static const int SUBSYS_LENGTH = 8;
+  static const int ENDIAN_LENGTH = 2;
+  char header_[HEADER_LENGTH+1];
+  char subsys_[SUBSYS_LENGTH+1];
+  char endian_[ENDIAN_LENGTH+1];
+  int version_;
+  bool byte_swap_;
+  std::string filename_;
+  // uses a custom stream buffer for fast memory-mapped access and endian swapping
+  std::fstream stream_;
+  //! the main file index. The top-level index must be associative
+  Index index_;
+
+  // internal methods
+  void getFileHeader();
+  void setFileHeader();
+
+  void getHeader();
+  void setHeader();
+
+public:
+  // construct/destruct
+  MatlabIO() {}
+  ~MatlabIO {}
+
+  // global read and write routines
+  std::string filename(void);
+  bool open(const std::string& filename, const std::string& mode);
+
+  // index the contents of the file
+  void index();
+
+  // print all of the top-level variables in the file
+}
+#endif