diff --git a/.gitignore b/.gitignore
index 228365d9eb..039d2400a6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,7 +9,6 @@ Thumbs.db
 tags
 tegra/
 bin/
-CMakeFiles/
 *.sdf
 *.opensdf
 *.obj
@@ -17,3 +16,9 @@ CMakeFiles/
 *.depend
 *.rule
 *.tmp
+*/debug
+*/CMakeFiles
+CMakeCache.txt
+*.suo
+*.log
+*.tlog
\ No newline at end of file
diff --git a/3rdparty/ffmpeg/build_win32.txt b/3rdparty/ffmpeg/build_win32.txt
new file mode 100644
index 0000000000..e98b285208
--- /dev/null
+++ b/3rdparty/ffmpeg/build_win32.txt
@@ -0,0 +1,42 @@
+The build script still needs to be fixed.
+Right now it assumes that 32-bit MinGW is in the system path and
+64-bit MinGW is installed to c:\Apps\MinGW64.
+
+It is important that gcc is used, not g++!
+Otherwise the produced DLL will likely depend on libgcc_s_dw2-1.dll or a similar DLL,
+whereas we want to build the DLLs with minimal dependencies: Win32 libraries + msvcrt.dll.
+
+ffopencv.c is really a C++ source, hence -x c++ is used.
+
+How to update opencv_ffmpeg.dll and opencv_ffmpeg_64.dll when a new version of FFMPEG is released?
+
+1. Install 32-bit MinGW + MSYS from
+   http://sourceforge.net/projects/mingw/files/Automated%20MinGW%20Installer/mingw-get-inst/
+   Let's assume it's installed in C:\MSYS32.
+2. Install 64-bit MinGW from http://mingw-w64.sourceforge.net/
+   Let's assume it's installed in C:\MSYS64.
+3. Copy C:\MSYS32\msys to C:\MSYS64\msys. Edit C:\MSYS64\msys\etc\fstab, change C:\MSYS32 to C:\MSYS64.
+
+   Now you have working MSYS32 and MSYS64 environments.
+   Launch, one by one, C:\MSYS32\msys\msys.bat and C:\MSYS64\msys\msys.bat to create your home directories.
+
+4. Download ffmpeg-x.y.z.tar.gz (where x.y.z denotes the actual ffmpeg version).
+   Copy it to C:\MSYS{32|64}\msys\home\<loginname> directory.
+
+5. To build 32-bit ffmpeg libraries, run C:\MSYS32\msys\msys.bat and type the following commands:
+
+   5.1. tar -xzf ffmpeg-x.y.z.tar.gz
+   5.2. mkdir build
+   5.3. cd build
+   5.4. ../ffmpeg-x.y.z/configure --enable-w32threads
+   5.5. make
+   5.6. make install
+   5.7. cd /local/lib
+   5.8. strip -g *.a
+
+6. Then repeat the same for the 64-bit case. The output libs (libavcodec.a etc.) need to be renamed to libavcodec64.a etc.
+
+7. Then, copy all those libs to <opencv>\3rdparty\lib\ and copy the headers to <opencv>\3rdparty\include\ffmpeg_.
+
+8. Then, go to <opencv>\3rdparty\ffmpeg, edit make.bat
+   (change paths to the actual paths to your msys32 and msys64 distributions) and then run make.bat
diff --git a/3rdparty/ffmpeg/ffmpeg_version.cmake b/3rdparty/ffmpeg/ffmpeg_version.cmake
index a3c78b2fc1..48fba2b913 100644
--- a/3rdparty/ffmpeg/ffmpeg_version.cmake
+++ b/3rdparty/ffmpeg/ffmpeg_version.cmake
@@ -3,9 +3,11 @@ set(HAVE_FFMPEG_CODEC 1)
 set(HAVE_FFMPEG_FORMAT 1)
 set(HAVE_FFMPEG_UTIL 1)
 set(HAVE_FFMPEG_SWSCALE 1)
+set(HAVE_FFMPEG_RESAMPLE 0)
 set(HAVE_GENTOO_FFMPEG 1)
 
 set(ALIASOF_libavcodec_VERSION 55.18.102)
 set(ALIASOF_libavformat_VERSION 55.12.100)
 set(ALIASOF_libavutil_VERSION 52.38.100)
 set(ALIASOF_libswscale_VERSION 2.3.100)
+set(ALIASOF_libavresample_VERSION 1.0.1)
\ No newline at end of file
diff --git a/3rdparty/ffmpeg/license.txt b/3rdparty/ffmpeg/license.txt
new file mode 100644
index 0000000000..e018837a39
--- /dev/null
+++ b/3rdparty/ffmpeg/license.txt
@@ -0,0 +1,520 @@
+              Copyright (C) 2001 Fabrice Bellard
+
+    FFmpeg is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+
+    FFmpeg is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with FFmpeg; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+==================================================================================
+
+                  GNU LESSER GENERAL PUBLIC LICENSE
+                       Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+ 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+[This is the first released version of the Lesser GPL.  It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.]
+
+                            Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+
+  This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it.  You
+can use it too, but we suggest you first think carefully about whether
+this license or the ordinary General Public License is the better
+strategy to use in any particular case, based on the explanations below.
+
+  When we speak of free software, we are referring to freedom of use,
+not price.  Our General Public Licenses are designed to make sure that
+you have the freedom to distribute copies of free software (and charge
+for this service if you wish); that you receive source code or can get
+it if you want it; that you can change the software and use pieces of
+it in new free programs; and that you are informed that you can do
+these things.
+
+  To protect your rights, we need to make restrictions that forbid
+distributors to deny you these rights or to ask you to surrender these
+rights.  These restrictions translate to certain responsibilities for
+you if you distribute copies of the library or if you modify it.
+
+  For example, if you distribute copies of the library, whether gratis
+or for a fee, you must give the recipients all the rights that we gave
+you.  You must make sure that they, too, receive or can get the source
+code.  If you link other code with the library, you must provide
+complete object files to the recipients, so that they can relink them
+with the library after making changes to the library and recompiling
+it.  And you must show them these terms so they know their rights.
+
+  We protect your rights with a two-step method: (1) we copyright the
+library, and (2) we offer you this license, which gives you legal
+permission to copy, distribute and/or modify the library.
+
+  To protect each distributor, we want to make it very clear that
+there is no warranty for the free library.  Also, if the library is
+modified by someone else and passed on, the recipients should know
+that what they have is not the original version, so that the original
+author's reputation will not be affected by problems that might be
+introduced by others.
+
+  Finally, software patents pose a constant threat to the existence of
+any free program.  We wish to make sure that a company cannot
+effectively restrict the users of a free program by obtaining a
+restrictive license from a patent holder.  Therefore, we insist that
+any patent license obtained for a version of the library must be
+consistent with the full freedom of use specified in this license.
+
+  Most GNU software, including some libraries, is covered by the
+ordinary GNU General Public License.  This license, the GNU Lesser
+General Public License, applies to certain designated libraries, and
+is quite different from the ordinary General Public License.  We use
+this license for certain libraries in order to permit linking those
+libraries into non-free programs.
+
+  When a program is linked with a library, whether statically or using
+a shared library, the combination of the two is legally speaking a
+combined work, a derivative of the original library.  The ordinary
+General Public License therefore permits such linking only if the
+entire combination fits its criteria of freedom.  The Lesser General
+Public License permits more lax criteria for linking other code with
+the library.
+
+  We call this license the "Lesser" General Public License because it
+does Less to protect the user's freedom than the ordinary General
+Public License.  It also provides other free software developers Less
+of an advantage over competing non-free programs.  These disadvantages
+are the reason we use the ordinary General Public License for many
+libraries.  However, the Lesser license provides advantages in certain
+special circumstances.
+
+  For example, on rare occasions, there may be a special need to
+encourage the widest possible use of a certain library, so that it becomes
+a de-facto standard.  To achieve this, non-free programs must be
+allowed to use the library.  A more frequent case is that a free
+library does the same job as widely used non-free libraries.  In this
+case, there is little to gain by limiting the free library to free
+software only, so we use the Lesser General Public License.
+
+  In other cases, permission to use a particular library in non-free
+programs enables a greater number of people to use a large body of
+free software.  For example, permission to use the GNU C Library in
+non-free programs enables many more people to use the whole GNU
+operating system, as well as its variant, the GNU/Linux operating
+system.
+
+  Although the Lesser General Public License is Less protective of the
+users' freedom, it does ensure that the user of a program that is
+linked with the Library has the freedom and the wherewithal to run
+that program using a modified version of the Library.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.  Pay close attention to the difference between a
+"work based on the library" and a "work that uses the library".  The
+former contains code derived from the library, whereas the latter must
+be combined with the library in order to run.
+
+                  GNU LESSER GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+
+  A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+
+  The "Library", below, refers to any such software library or work
+which has been distributed under these terms.  A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language.  (Hereinafter, translation is
+included without limitation in the term "modification".)
+
+  "Source code" for a work means the preferred form of the work for
+making modifications to it.  For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control compilation
+and installation of the library.
+
+  Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it).  Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+
+  1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+
+  You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+
+  2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) The modified work must itself be a software library.
+
+    b) You must cause the files modified to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    c) You must cause the whole of the work to be licensed at no
+    charge to all third parties under the terms of this License.
+
+    d) If a facility in the modified Library refers to a function or a
+    table of data to be supplied by an application program that uses
+    the facility, other than as an argument passed when the facility
+    is invoked, then you must make a good faith effort to ensure that,
+    in the event an application does not supply such function or
+    table, the facility still operates, and performs whatever part of
+    its purpose remains meaningful.
+
+    (For example, a function in a library to compute square roots has
+    a purpose that is entirely well-defined independent of the
+    application.  Therefore, Subsection 2d requires that any
+    application-supplied function or table used by this function must
+    be optional: if the application does not supply it, the square
+    root function must still compute square roots.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library.  To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License.  (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.)  Do not make any other change in
+these notices.
+
+  Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+
+  This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+
+  4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+
+  If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library".  Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+
+  However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library".  The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+
+  When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library.  The
+threshold for this to be true is not precisely defined by law.
+
+  If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work.  (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+
+  Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+
+  6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+
+  You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License.  You must supply a copy of this License.  If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License.  Also, you must do one
+of these things:
+
+    a) Accompany the work with the complete corresponding
+    machine-readable source code for the Library including whatever
+    changes were used in the work (which must be distributed under
+    Sections 1 and 2 above); and, if the work is an executable linked
+    with the Library, with the complete machine-readable "work that
+    uses the Library", as object code and/or source code, so that the
+    user can modify the Library and then relink to produce a modified
+    executable containing the modified Library.  (It is understood
+    that the user who changes the contents of definitions files in the
+    Library will not necessarily be able to recompile the application
+    to use the modified definitions.)
+
+    b) Use a suitable shared library mechanism for linking with the
+    Library.  A suitable mechanism is one that (1) uses at run time a
+    copy of the library already present on the user's computer system,
+    rather than copying library functions into the executable, and (2)
+    will operate properly with a modified version of the library, if
+    the user installs one, as long as the modified version is
+    interface-compatible with the version that the work was made with.
+
+    c) Accompany the work with a written offer, valid for at
+    least three years, to give the same user the materials
+    specified in Subsection 6a, above, for a charge no more
+    than the cost of performing this distribution.
+
+    d) If distribution of the work is made by offering access to copy
+    from a designated place, offer equivalent access to copy the above
+    specified materials from the same place.
+
+    e) Verify that the user has already received a copy of these
+    materials or that you have already sent this user a copy.
+
+  For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it.  However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+
+  It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system.  Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+
+  7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+
+    a) Accompany the combined library with a copy of the same work
+    based on the Library, uncombined with any other library
+    facilities.  This must be distributed under the terms of the
+    Sections above.
+
+    b) Give prominent notice with the combined library of the fact
+    that part of it is a work based on the Library, and explaining
+    where to find the accompanying uncombined form of the same work.
+
+  8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License.  Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License.  However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+
+  9. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Library or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+
+  10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+
+  11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+
+If any portion of this section is held invalid or unenforceable under any
+particular circumstance, the balance of the section is intended to apply,
+and the section as a whole is intended to apply in other circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License may add
+an explicit geographical distribution limitation excluding those countries,
+so that distribution is permitted only in or among countries not thus
+excluded.  In such case, this License incorporates the limitation as if
+written in the body of this License.
+
+  13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation.  If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+
+  14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission.  For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this.  Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
+and reuse of software generally.
+
+                            NO WARRANTY
+
+  15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+                     END OF TERMS AND CONDITIONS
+
+           How to Apply These Terms to Your New Libraries
+
+  If you develop a new library, and you want it to be of the greatest
+possible use to the public, we recommend making it free software that
+everyone can redistribute and change.  You can do so by permitting
+redistribution under these terms (or, alternatively, under the terms of the
+ordinary General Public License).
+
+  To apply these terms, attach the following notices to the library.  It is
+safest to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least the
+"copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the library's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+
+Also add information on how to contact you by electronic and paper mail.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the library, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the
+  library `Frob' (a library for tweaking knobs) written by James Random Hacker.
+
+  <signature of Ty Coon>, 1 April 1990
+  Ty Coon, President of Vice
+
+That's all there is to it!
diff --git a/3rdparty/ffmpeg/readme.txt b/3rdparty/ffmpeg/readme.txt
index e98b285208..84faf7a83b 100644
--- a/3rdparty/ffmpeg/readme.txt
+++ b/3rdparty/ffmpeg/readme.txt
@@ -1,42 +1,32 @@
-The build script is to be fixed.
-Right now it assumes that 32-bit MinGW is in the system path and
-64-bit mingw is installed to c:\Apps\MinGW64.
-
-It is important that gcc is used, not g++!
-Otherwise the produced DLL will likely be dependent on libgcc_s_dw2-1.dll or similar DLL.
-While we want to make the DLLs with minimum dependencies: Win32 libraries + msvcrt.dll.
-
-ffopencv.c is really a C++ source, hence -x c++ is used.
-
-How to update opencv_ffmpeg.dll and opencv_ffmpeg_64.dll when a new version of FFMPEG is release?
-
-1. Install 32-bit MinGW + MSYS from
-   http://sourceforge.net/projects/mingw/files/Automated%20MinGW%20Installer/mingw-get-inst/
-   Let's assume, it's installed in C:\MSYS32.
-2. Install 64-bit MinGW. http://mingw-w64.sourceforge.net/
-   Let's assume, it's installed in C:\MSYS64
-3. Copy C:\MSYS32\msys to C:\MSYS64\msys. Edit C:\MSYS64\msys\etc\fstab, change C:\MSYS32 to C:\MSYS64.
-
-4. Now you have working MSYS32 and MSYS64 environments.
-   Launch, one by one, C:\MSYS32\msys\msys.bat and C:\MSYS64\msys\msys.bat to create your home directories.
-
-4. Download ffmpeg-x.y.z.tar.gz (where x.y.z denotes the actual ffmpeg version).
-   Copy it to C:\MSYS{32|64}\msys\home\<loginname> directory.
-
-5. To build 32-bit ffmpeg libraries, run C:\MSYS32\msys\msys.bat and type the following commands:
-
-   5.1. tar -xzf ffmpeg-x.y.z.tar.gz
-   5.2. mkdir build
-   5.3. cd build
-   5.4. ../ffmpeg-x.y.z/configure --enable-w32threads
-   5.5. make
-   5.6. make install
-   5.7. cd /local/lib
-   5.8. strip -g *.a
-
-6. Then repeat the same for 64-bit case. The output libs: libavcodec.a etc. need to be renamed to libavcodec64.a etc.
-
-7. Then, copy all those libs to <opencv>\3rdparty\lib\, copy the headers to <opencv>\3rdparty\include\ffmpeg_.
-
-8. Then, go to <opencv>\3rdparty\ffmpeg, edit make.bat
-   (change paths to the actual paths to your msys32 and msys64 distributions) and then run make.bat
+* On Linux and other Unix flavors OpenCV uses default or user-built ffmpeg/libav libraries.
+  If a user builds ffmpeg/libav from source and wants OpenCV to stay a BSD library, not GPL/LGPL,
+  he/she should use the --enable-shared configure flag and make sure that no GPL components are
+  enabled (some notable examples are x264 (H264 encoder) and libac3 (Dolby AC3 audio codec)).
+  See https://www.ffmpeg.org/legal.html for details.
+  
+  If you want to play it very safe and do not want to use FFMPEG at all, regardless of whether it's installed on
+  your system or not, configure and build OpenCV using CMake with the WITH_FFMPEG=OFF flag. OpenCV will then use
+  AVFoundation (OSX), GStreamer (Linux) or other available backends supported by the opencv_videoio module.
+  
+  There is also our self-contained motion jpeg codec, which you can use without any worries.
+  It handles CV_FOURCC('M', 'J', 'P', 'G') streams within an AVI container (".avi").
+  
+* On Windows OpenCV uses pre-built ffmpeg binaries, built with proper flags (without GPL components) and
+  wrapped with a simple, stable OpenCV-compatible API.
+  The binaries are opencv_ffmpeg.dll (version for 32-bit Windows) and
+  opencv_ffmpeg_64.dll (version for 64-bit Windows).
+  
+  See build_win32.txt for the build instructions, if you want to rebuild opencv_ffmpeg*.dll from scratch.
+
+  The pre-built opencv_ffmpeg*.dll is:
+  * An LGPL library, not a BSD library.
+  * Loaded at runtime by the opencv_videoio module.
+    If loading succeeds, ffmpeg can be used to decode/encode videos;
+    otherwise, another API is used.
+
+  If LGPL/GPL software cannot be supplied with your OpenCV-based product, simply exclude
+  opencv_ffmpeg*.dll from your distribution; OpenCV will stay fully functional except for the ability to
+  decode/encode videos using FFMPEG (though it may still be able to do that using other APIs,
+  such as Video for Windows, Windows Media Foundation or our self-contained motion jpeg codec).
+  
+  See license.txt for the FFMPEG copyright notice and the licensing terms.
diff --git a/3rdparty/libjasper/jas_cm.c b/3rdparty/libjasper/jas_cm.c
index dc23ead895..16d4a502df 100644
--- a/3rdparty/libjasper/jas_cm.c
+++ b/3rdparty/libjasper/jas_cm.c
@@ -842,7 +842,6 @@ static int jas_cmshapmat_apply(jas_cmpxform_t *pxform, jas_cmreal_t *in,
                 *dst++ = a2;
             }
         } else {
-assert(0);
             while (--cnt >= 0) {
                 a0 = *src++;
                 src++;
diff --git a/3rdparty/libjasper/jas_stream.c b/3rdparty/libjasper/jas_stream.c
index ca1239c7d9..3ba7a837db 100644
--- a/3rdparty/libjasper/jas_stream.c
+++ b/3rdparty/libjasper/jas_stream.c
@@ -345,6 +345,7 @@ jas_stream_t *jas_stream_tmpfile()
 {
     jas_stream_t *stream;
     jas_stream_fileobj_t *obj;
+    char *tmpname;
 
     if (!(stream = jas_stream_create())) {
         return 0;
@@ -365,10 +366,12 @@ jas_stream_t *jas_stream_tmpfile()
 
 #ifdef _WIN32
     /* Choose a file name. */
-    tmpnam(obj->pathname);
+    tmpname = tempnam(NULL, NULL);
+    strcpy(obj->pathname, tmpname);
+    free(tmpname);
 
     /* Open the underlying file. */
-    if ((obj->fd = open(obj->pathname, O_CREAT | O_EXCL | O_RDWR | O_TRUNC | O_BINARY,
+    if ((obj->fd = open(obj->pathname, O_CREAT | O_EXCL | O_RDWR | O_TRUNC | O_BINARY | O_TEMPORARY | _O_SHORT_LIVED,
       JAS_STREAM_PERMS)) < 0) {
         jas_stream_destroy(stream);
         return 0;
diff --git a/3rdparty/libjpeg/CMakeLists.txt b/3rdparty/libjpeg/CMakeLists.txt
index 65a9d1c8aa..d79f00adae 100644
--- a/3rdparty/libjpeg/CMakeLists.txt
+++ b/3rdparty/libjpeg/CMakeLists.txt
@@ -15,6 +15,13 @@ else()
   ocv_list_filterout(lib_srcs jmemnobs.c)
 endif()
 
+if(WINRT)
+    add_definitions(-DNO_GETENV)
+    get_directory_property( DirDefs COMPILE_DEFINITIONS )
+    message(STATUS "Adding NO_GETENV to compiler definitions for WINRT:")
+    message(STATUS "   COMPILE_DEFINITIONS = ${DirDefs}")
+endif()
+
 # ----------------------------------------------------------------------------------
 #         Define the library target:
 # ----------------------------------------------------------------------------------
diff --git a/3rdparty/libtiff/CMakeLists.txt b/3rdparty/libtiff/CMakeLists.txt
index ad8a466188..b7739e0e4e 100644
--- a/3rdparty/libtiff/CMakeLists.txt
+++ b/3rdparty/libtiff/CMakeLists.txt
@@ -17,7 +17,7 @@ check_include_file(string.h HAVE_STRING_H)
 check_include_file(sys/types.h HAVE_SYS_TYPES_H)
 check_include_file(unistd.h HAVE_UNISTD_H)
 
-if(WIN32)
+if(WIN32 AND NOT WINRT)
   set(USE_WIN32_FILEIO 1)
 endif()
 
@@ -79,7 +79,7 @@ set(lib_srcs
     "${CMAKE_CURRENT_BINARY_DIR}/tif_config.h"
     )
 
-if(WIN32)
+if(WIN32 AND NOT WINRT)
   list(APPEND lib_srcs tif_win32.c)
 else()
   list(APPEND lib_srcs tif_unix.c)
diff --git a/3rdparty/readme.txt b/3rdparty/readme.txt
index 64e2563a85..b067009028 100644
--- a/3rdparty/readme.txt
+++ b/3rdparty/readme.txt
@@ -6,41 +6,34 @@ In order to use these versions of libraries instead of system ones on UNIX syste
 should use BUILD_<library_name> CMake flags (for example, BUILD_PNG for the libpng library).
 
 ------------------------------------------------------------------------------------
-libjpeg 8d (8.4)  -   The Independent JPEG Group's JPEG software.
+libjpeg               The Independent JPEG Group's JPEG software.
                       Copyright (C) 1991-2012, Thomas G. Lane, Guido Vollbeding.
                       See IGJ home page http://www.ijg.org
                       for details and links to the source code
 
-                      HAVE_JPEG preprocessor flag must be set to make imgcodecs use libjpeg.
-                      On UNIX systems configure script takes care of it.
+                      WITH_JPEG CMake option must be ON to add libjpeg support to imgcodecs.
 ------------------------------------------------------------------------------------
-libpng 1.5.12     -   Portable Network Graphics library.
+libpng                Portable Network Graphics library.
                       Copyright (c) 2004, 2006-2012 Glenn Randers-Pehrson.
                       See libpng home page http://www.libpng.org
                       for details and links to the source code
 
-                      HAVE_PNG preprocessor flag must be set to make imgcodecs use libpng.
-                      On UNIX systems configure script takes care of it.
+                      WITH_PNG CMake option must be ON to add libpng support to imgcodecs.
 ------------------------------------------------------------------------------------
-libtiff 4.0.2     -   Tag Image File Format (TIFF) Software
+libtiff               Tag Image File Format (TIFF) Software
                       Copyright (c) 1988-1997 Sam Leffler
                       Copyright (c) 1991-1997 Silicon Graphics, Inc.
                       See libtiff home page http://www.remotesensing.org/libtiff/
                       for details and links to the source code
 
-                      HAVE_TIFF preprocessor flag must be set to make imgcodecs use libtiff.
-                      On UNIX systems configure script takes care of it.
-                      In this build support for ZIP (LZ77 compression) is turned on.
+                      WITH_TIFF CMake option must be ON to add libtiff & zlib support to imgcodecs.
 ------------------------------------------------------------------------------------
-zlib 1.2.7        -   General purpose LZ77 compression library
+zlib                  General purpose LZ77 compression library
                       Copyright (C) 1995-2012 Jean-loup Gailly and Mark Adler.
                       See zlib home page http://www.zlib.net
                       for details and links to the source code
-
-                      No preprocessor definition is needed to make imgcodecs use this library -
-                      it is included automatically if either libpng or libtiff are used.
 ------------------------------------------------------------------------------------
-jasper-1.900.1    -   JasPer is a collection of software
+jasper                JasPer is a collection of software
                       (i.e., a library and application programs) for the coding
                       and manipulation of images.  This software can handle image data in a
                       variety of formats.  One such format supported by JasPer is the JPEG-2000
@@ -50,14 +43,9 @@ jasper-1.900.1    -   JasPer is a collection of software
                       Copyright (c) 1999-2000 The University of British Columbia
                       Copyright (c) 2001-2003 Michael David Adams
 
-                      The JasPer license can be found in src/libjasper.
-
-                      OpenCV on Windows uses pre-built libjasper library
-                      (lib/libjasper*). To get the latest source code,
-                      please, visit the project homepage:
-                      http://www.ece.uvic.ca/~mdadams/jasper/
+                      The JasPer license can be found in libjasper.
 ------------------------------------------------------------------------------------
-openexr-1.7.1     -   OpenEXR is a high dynamic-range (HDR) image file format developed
+openexr               OpenEXR is a high dynamic-range (HDR) image file format developed
                       by Industrial Light & Magic for use in computer imaging applications.
 
                       Copyright (c) 2006, Industrial Light & Magic, a division of Lucasfilm
@@ -66,11 +54,17 @@ openexr-1.7.1     -   OpenEXR is a high dynamic-range (HDR) image file format de
 
                       The project homepage: http://www.openexr.com
 ------------------------------------------------------------------------------------
-ffmpeg-0.8.0      -   FFmpeg is a complete, cross-platform solution to record,
+ffmpeg                FFmpeg is a complete, cross-platform solution to record,
                       convert and stream audio and video. It includes libavcodec -
                       the leading audio/video codec library, and also libavformat, libavutils and
-                      other helper libraries that are used by OpenCV (in highgui module) to
+                      other helper libraries that are used by OpenCV (in videoio module) to
                       read and write video files.
 
-                      The project homepage: http://ffmpeg.org/
+                      Copyright (c) 2001 Fabrice Bellard
+
+                      The project homepage: http://ffmpeg.org/.
+                      
+                      * On Linux/OSX we link user-installed ffmpeg (or ffmpeg fork libav).
+                      * On Windows we use pre-built ffmpeg binaries,
+                        see opencv/3rdparty/ffmpeg/readme.txt for details and licensing information
 ------------------------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 746faac20b..a02b71a8bb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -56,6 +56,11 @@ if(POLICY CMP0026)
   cmake_policy(SET CMP0026 OLD)
 endif()
 
+if (POLICY CMP0042)
+  # silence cmake 3.0+ warnings about MACOSX_RPATH
+  cmake_policy(SET CMP0042 OLD)
+endif()
+
 # must go before the project command
 set(CMAKE_CONFIGURATION_TYPES "Debug;Release" CACHE STRING "Configs" FORCE)
 if(DEFINED CMAKE_BUILD_TYPE)
@@ -926,6 +931,7 @@ if(DEFINED WITH_FFMPEG)
   status("      format:"       HAVE_FFMPEG_FORMAT  THEN "YES (ver ${ALIASOF_libavformat_VERSION})" ELSE NO)
   status("      util:"         HAVE_FFMPEG_UTIL    THEN "YES (ver ${ALIASOF_libavutil_VERSION})"   ELSE NO)
   status("      swscale:"      HAVE_FFMPEG_SWSCALE THEN "YES (ver ${ALIASOF_libswscale_VERSION})"  ELSE NO)
+  status("      resample:"     HAVE_FFMPEG_RESAMPLE THEN "YES (ver ${ALIASOF_libavresample_VERSION})"  ELSE NO)
   status("      gentoo-style:" HAVE_GENTOO_FFMPEG  THEN YES                                        ELSE NO)
 endif(DEFINED WITH_FFMPEG)
 
diff --git a/LICENSE b/LICENSE
index 5e32d88b47..ab58eebcad 100644
--- a/LICENSE
+++ b/LICENSE
@@ -7,6 +7,14 @@ copy or use the software.
                For Open Source Computer Vision Library
                        (3-clause BSD License)
 
+Copyright (C) 2000-2015, Intel Corporation, all rights reserved.
+Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+Copyright (C) 2009-2015, NVIDIA Corporation, all rights reserved.
+Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
+Copyright (C) 2015, OpenCV Foundation, all rights reserved.
+Copyright (C) 2015, Itseez Inc., all rights reserved.
+Third party copyrights are property of their respective owners.
+
 Redistribution and use in source and binary forms, with or without modification,
 are permitted provided that the following conditions are met:
 
diff --git a/apps/annotation/CMakeLists.txt b/apps/annotation/CMakeLists.txt
index e14721ac6b..57b133df12 100644
--- a/apps/annotation/CMakeLists.txt
+++ b/apps/annotation/CMakeLists.txt
@@ -9,7 +9,7 @@ project(annotation)
 set(the_target opencv_annotation)
 
 ocv_target_include_directories(${the_target} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" "${OpenCV_SOURCE_DIR}/include/opencv")
-ocv_target_include_modules(${the_target} ${OPENCV_ANNOTATION_DEPS})
+ocv_target_include_modules_recurse(${the_target} ${OPENCV_ANNOTATION_DEPS})
 
 file(GLOB SRCS *.cpp)
 
diff --git a/apps/createsamples/CMakeLists.txt b/apps/createsamples/CMakeLists.txt
index 8acd288ac1..24506231e0 100644
--- a/apps/createsamples/CMakeLists.txt
+++ b/apps/createsamples/CMakeLists.txt
@@ -9,7 +9,7 @@ project(createsamples)
 set(the_target opencv_createsamples)
 
 ocv_target_include_directories(${the_target} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" "${OpenCV_SOURCE_DIR}/include/opencv")
-ocv_target_include_modules(${the_target} ${OPENCV_CREATESAMPLES_DEPS})
+ocv_target_include_modules_recurse(${the_target} ${OPENCV_CREATESAMPLES_DEPS})
 
 file(GLOB SRCS *.cpp)
 file(GLOB HDRS *.h*)
diff --git a/apps/traincascade/CMakeLists.txt b/apps/traincascade/CMakeLists.txt
index 78101c0bc5..b21fb87367 100644
--- a/apps/traincascade/CMakeLists.txt
+++ b/apps/traincascade/CMakeLists.txt
@@ -9,7 +9,7 @@ project(traincascade)
 set(the_target opencv_traincascade)
 
 ocv_target_include_directories(${the_target} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" "${OpenCV_SOURCE_DIR}/include/opencv")
-ocv_target_include_modules(${the_target} ${OPENCV_TRAINCASCADE_DEPS})
+ocv_target_include_modules_recurse(${the_target} ${OPENCV_TRAINCASCADE_DEPS})
 
 file(GLOB SRCS *.cpp)
 file(GLOB HDRS *.h*)
diff --git a/apps/traincascade/boost.cpp b/apps/traincascade/boost.cpp
index 5864022048..c2e7fb7d6a 100644
--- a/apps/traincascade/boost.cpp
+++ b/apps/traincascade/boost.cpp
@@ -437,7 +437,7 @@ CvDTreeNode* CvCascadeBoostTrainData::subsample_data( const CvMat* _subsample_id
             if (is_buf_16u)
             {
                 unsigned short* udst_idx = (unsigned short*)(buf->data.s + root->buf_idx*get_length_subbuf() +
-                    vi*sample_count + data_root->offset);
+                    (size_t)vi*sample_count + data_root->offset);
                 for( int i = 0; i < num_valid; i++ )
                 {
                     idx = src_idx[i];
@@ -450,7 +450,7 @@ CvDTreeNode* CvCascadeBoostTrainData::subsample_data( const CvMat* _subsample_id
             else
             {
                 int* idst_idx = buf->data.i + root->buf_idx*get_length_subbuf() +
-                    vi*sample_count + root->offset;
+                    (size_t)vi*sample_count + root->offset;
                 for( int i = 0; i < num_valid; i++ )
                 {
                     idx = src_idx[i];
@@ -467,14 +467,14 @@ CvDTreeNode* CvCascadeBoostTrainData::subsample_data( const CvMat* _subsample_id
         if (is_buf_16u)
         {
             unsigned short* udst = (unsigned short*)(buf->data.s + root->buf_idx*get_length_subbuf() +
-                (workVarCount-1)*sample_count + root->offset);
+                (size_t)(workVarCount-1)*sample_count + root->offset);
             for( int i = 0; i < count; i++ )
                 udst[i] = (unsigned short)src_lbls[sidx[i]];
         }
         else
         {
             int* idst = buf->data.i + root->buf_idx*get_length_subbuf() +
-                (workVarCount-1)*sample_count + root->offset;
+                (size_t)(workVarCount-1)*sample_count + root->offset;
             for( int i = 0; i < count; i++ )
                 idst[i] = src_lbls[sidx[i]];
         }
@@ -484,14 +484,14 @@ CvDTreeNode* CvCascadeBoostTrainData::subsample_data( const CvMat* _subsample_id
         if (is_buf_16u)
         {
             unsigned short* sample_idx_dst = (unsigned short*)(buf->data.s + root->buf_idx*get_length_subbuf() +
-                workVarCount*sample_count + root->offset);
+                (size_t)workVarCount*sample_count + root->offset);
             for( int i = 0; i < count; i++ )
                 sample_idx_dst[i] = (unsigned short)sample_idx_src[sidx[i]];
         }
         else
         {
             int* sample_idx_dst = buf->data.i + root->buf_idx*get_length_subbuf() +
-                workVarCount*sample_count + root->offset;
+                (size_t)workVarCount*sample_count + root->offset;
             for( int i = 0; i < count; i++ )
                 sample_idx_dst[i] = sample_idx_src[sidx[i]];
         }
@@ -677,9 +677,9 @@ void CvCascadeBoostTrainData::setData( const CvFeatureEvaluator* _featureEvaluat
 
     // set sample labels
     if (is_buf_16u)
-        udst = (unsigned short*)(buf->data.s + work_var_count*sample_count);
+        udst = (unsigned short*)(buf->data.s + (size_t)work_var_count*sample_count);
     else
-        idst = buf->data.i + work_var_count*sample_count;
+        idst = buf->data.i + (size_t)work_var_count*sample_count;
 
     for (int si = 0; si < sample_count; si++)
     {
@@ -747,11 +747,11 @@ void CvCascadeBoostTrainData::get_ord_var_data( CvDTreeNode* n, int vi, float* o
     if ( vi < numPrecalcIdx )
     {
         if( !is_buf_16u )
-            *sortedIndices = buf->data.i + n->buf_idx*get_length_subbuf() + vi*sample_count + n->offset;
+            *sortedIndices = buf->data.i + n->buf_idx*get_length_subbuf() + (size_t)vi*sample_count + n->offset;
         else
         {
             const unsigned short* shortIndices = (const unsigned short*)(buf->data.s + n->buf_idx*get_length_subbuf() +
-                                                    vi*sample_count + n->offset );
+                                                    (size_t)vi*sample_count + n->offset );
             for( int i = 0; i < nodeSampleCount; i++ )
                 sortedIndicesBuf[i] = shortIndices[i];
 
@@ -862,14 +862,14 @@ struct FeatureIdxOnlyPrecalc : ParallelLoopBody
             {
                 valCachePtr[si] = (*featureEvaluator)( fi, si );
                 if ( is_buf_16u )
-                    *(udst + fi*sample_count + si) = (unsigned short)si;
+                    *(udst + (size_t)fi*sample_count + si) = (unsigned short)si;
                 else
-                    *(idst + fi*sample_count + si) = si;
+                    *(idst + (size_t)fi*sample_count + si) = si;
             }
             if ( is_buf_16u )
-                std::sort(udst + fi*sample_count, udst + (fi + 1)*sample_count, LessThanIdx<float, unsigned short>(valCachePtr) );
+                std::sort(udst + (size_t)fi*sample_count, udst + (size_t)(fi + 1)*sample_count, LessThanIdx<float, unsigned short>(valCachePtr) );
             else
-                std::sort(idst + fi*sample_count, idst + (fi + 1)*sample_count, LessThanIdx<float, int>(valCachePtr) );
+                std::sort(idst + (size_t)fi*sample_count, idst + (size_t)(fi + 1)*sample_count, LessThanIdx<float, int>(valCachePtr) );
         }
     }
     const CvFeatureEvaluator* featureEvaluator;
@@ -898,14 +898,14 @@ struct FeatureValAndIdxPrecalc : ParallelLoopBody
             {
                 valCache->at<float>(fi,si) = (*featureEvaluator)( fi, si );
                 if ( is_buf_16u )
-                    *(udst + fi*sample_count + si) = (unsigned short)si;
+                    *(udst + (size_t)fi*sample_count + si) = (unsigned short)si;
                 else
-                    *(idst + fi*sample_count + si) = si;
+                    *(idst + (size_t)fi*sample_count + si) = si;
             }
             if ( is_buf_16u )
-                std::sort(udst + fi*sample_count, udst + (fi + 1)*sample_count, LessThanIdx<float, unsigned short>(valCache->ptr<float>(fi)) );
+                std::sort(udst + (size_t)fi*sample_count, udst + (size_t)(fi + 1)*sample_count, LessThanIdx<float, unsigned short>(valCache->ptr<float>(fi)) );
             else
-                std::sort(idst + fi*sample_count, idst + (fi + 1)*sample_count, LessThanIdx<float, int>(valCache->ptr<float>(fi)) );
+                std::sort(idst + (size_t)fi*sample_count, idst + (size_t)(fi + 1)*sample_count, LessThanIdx<float, int>(valCache->ptr<float>(fi)) );
         }
     }
     const CvFeatureEvaluator* featureEvaluator;
@@ -1228,9 +1228,9 @@ void CvCascadeBoostTree::split_node_data( CvDTreeNode* node )
     if (data->is_buf_16u)
     {
         unsigned short *ldst = (unsigned short *)(buf->data.s + left->buf_idx*length_buf_row +
-            (workVarCount-1)*scount + left->offset);
+            (size_t)(workVarCount-1)*scount + left->offset);
         unsigned short *rdst = (unsigned short *)(buf->data.s + right->buf_idx*length_buf_row +
-            (workVarCount-1)*scount + right->offset);
+            (size_t)(workVarCount-1)*scount + right->offset);
 
         for( int i = 0; i < n; i++ )
         {
@@ -1251,9 +1251,9 @@ void CvCascadeBoostTree::split_node_data( CvDTreeNode* node )
     else
     {
         int *ldst = buf->data.i + left->buf_idx*length_buf_row +
-            (workVarCount-1)*scount + left->offset;
+            (size_t)(workVarCount-1)*scount + left->offset;
         int *rdst = buf->data.i + right->buf_idx*length_buf_row +
-            (workVarCount-1)*scount + right->offset;
+            (size_t)(workVarCount-1)*scount + right->offset;
 
         for( int i = 0; i < n; i++ )
         {
@@ -1281,9 +1281,9 @@ void CvCascadeBoostTree::split_node_data( CvDTreeNode* node )
     if (data->is_buf_16u)
     {
         unsigned short* ldst = (unsigned short*)(buf->data.s + left->buf_idx*length_buf_row +
-            workVarCount*scount + left->offset);
+            (size_t)workVarCount*scount + left->offset);
         unsigned short* rdst = (unsigned short*)(buf->data.s + right->buf_idx*length_buf_row +
-            workVarCount*scount + right->offset);
+            (size_t)workVarCount*scount + right->offset);
         for (int i = 0; i < n; i++)
         {
             unsigned short idx = (unsigned short)tempBuf[i];
@@ -1302,9 +1302,9 @@ void CvCascadeBoostTree::split_node_data( CvDTreeNode* node )
     else
     {
         int* ldst = buf->data.i + left->buf_idx*length_buf_row +
-            workVarCount*scount + left->offset;
+            (size_t)workVarCount*scount + left->offset;
         int* rdst = buf->data.i + right->buf_idx*length_buf_row +
-            workVarCount*scount + right->offset;
+            (size_t)workVarCount*scount + right->offset;
         for (int i = 0; i < n; i++)
         {
             int idx = tempBuf[i];
@@ -1473,7 +1473,7 @@ void CvCascadeBoost::update_weights( CvBoostTree* tree )
         if (data->is_buf_16u)
         {
             unsigned short* labels = (unsigned short*)(buf->data.s + data->data_root->buf_idx*length_buf_row +
-                data->data_root->offset + (data->work_var_count-1)*data->sample_count);
+                data->data_root->offset + (size_t)(data->work_var_count-1)*data->sample_count);
             for( int i = 0; i < n; i++ )
             {
                 // save original categorical responses {0,1}, convert them to {-1,1}
@@ -1491,7 +1491,7 @@ void CvCascadeBoost::update_weights( CvBoostTree* tree )
         else
         {
             int* labels = buf->data.i + data->data_root->buf_idx*length_buf_row +
-                data->data_root->offset + (data->work_var_count-1)*data->sample_count;
+                data->data_root->offset + (size_t)(data->work_var_count-1)*data->sample_count;
 
             for( int i = 0; i < n; i++ )
             {
diff --git a/apps/traincascade/cascadeclassifier.cpp b/apps/traincascade/cascadeclassifier.cpp
index c9b524f5ef..8b3eb57ac5 100644
--- a/apps/traincascade/cascadeclassifier.cpp
+++ b/apps/traincascade/cascadeclassifier.cpp
@@ -135,7 +135,8 @@ bool CvCascadeClassifier::train( const string _cascadeDirName,
                                 const CvCascadeParams& _cascadeParams,
                                 const CvFeatureParams& _featureParams,
                                 const CvCascadeBoostParams& _stageParams,
-                                bool baseFormatSave )
+                                bool baseFormatSave,
+                                double acceptanceRatioBreakValue )
 {
     // Start recording clock ticks for training time output
     const clock_t begin_time = clock();
@@ -185,6 +186,7 @@ bool CvCascadeClassifier::train( const string _cascadeDirName,
     cout << "numStages: " << numStages << endl;
     cout << "precalcValBufSize[Mb] : " << _precalcValBufSize << endl;
     cout << "precalcIdxBufSize[Mb] : " << _precalcIdxBufSize << endl;
+    cout << "acceptanceRatioBreakValue : " << acceptanceRatioBreakValue << endl;
     cascadeParams.printAttrs();
     stageParams->printAttrs();
     featureParams->printAttrs();
@@ -207,13 +209,18 @@ bool CvCascadeClassifier::train( const string _cascadeDirName,
         if ( !updateTrainingSet( tempLeafFARate ) )
         {
             cout << "Train dataset for temp stage can not be filled. "
-                "Branch training terminated." << endl;
+                    "Branch training terminated." << endl;
             break;
         }
         if( tempLeafFARate <= requiredLeafFARate )
         {
             cout << "Required leaf false alarm rate achieved. "
-                 "Branch training terminated." << endl;
+                    "Branch training terminated." << endl;
+            break;
+        }
+        if( (tempLeafFARate <= acceptanceRatioBreakValue) && (acceptanceRatioBreakValue >= 0) ){
+            cout << "The required acceptanceRatio for the model has been reached to avoid overfitting of trainingdata. "
+                    "Branch training terminated." << endl;
             break;
         }
 
diff --git a/apps/traincascade/cascadeclassifier.h b/apps/traincascade/cascadeclassifier.h
index 6d6cb5b3f9..d8e044828b 100644
--- a/apps/traincascade/cascadeclassifier.h
+++ b/apps/traincascade/cascadeclassifier.h
@@ -94,7 +94,8 @@ public:
                 const CvCascadeParams& _cascadeParams,
                 const CvFeatureParams& _featureParams,
                 const CvCascadeBoostParams& _stageParams,
-                bool baseFormatSave = false );
+                bool baseFormatSave = false,
+                double acceptanceRatioBreakValue = -1.0 );
 private:
     int predict( int sampleIdx );
     void save( const std::string cascadeDirName, bool baseFormat = false );
diff --git a/apps/traincascade/old_ml_boost.cpp b/apps/traincascade/old_ml_boost.cpp
index be4cd81f04..fae3d60806 100644
--- a/apps/traincascade/old_ml_boost.cpp
+++ b/apps/traincascade/old_ml_boost.cpp
@@ -1200,7 +1200,7 @@ CvBoost::update_weights( CvBoostTree* tree )
         if (data->is_buf_16u)
         {
             unsigned short* labels = (unsigned short*)(dtree_data_buf->data.s + data->data_root->buf_idx*length_buf_row +
-                data->data_root->offset + (data->work_var_count-1)*data->sample_count);
+                data->data_root->offset + (size_t)(data->work_var_count-1)*data->sample_count);
             for( i = 0; i < n; i++ )
             {
                 // save original categorical responses {0,1}, convert them to {-1,1}
@@ -1218,7 +1218,7 @@ CvBoost::update_weights( CvBoostTree* tree )
         else
         {
             int* labels = dtree_data_buf->data.i + data->data_root->buf_idx*length_buf_row +
-                data->data_root->offset + (data->work_var_count-1)*data->sample_count;
+                data->data_root->offset + (size_t)(data->work_var_count-1)*data->sample_count;
 
             for( i = 0; i < n; i++ )
             {
diff --git a/apps/traincascade/old_ml_inner_functions.cpp b/apps/traincascade/old_ml_inner_functions.cpp
index 10b43f93fe..68e78b1e57 100644
--- a/apps/traincascade/old_ml_inner_functions.cpp
+++ b/apps/traincascade/old_ml_inner_functions.cpp
@@ -82,7 +82,7 @@ void CvStatModel::load( const char* filename, const char* name )
 {
     CvFileStorage* fs = 0;
 
-    CV_FUNCNAME( "CvStatModel::load" );
+    CV_FUNCNAME( "CvAlgorithm::load" );
 
     __BEGIN__;
 
diff --git a/apps/traincascade/old_ml_tree.cpp b/apps/traincascade/old_ml_tree.cpp
index b7e346ccbc..d7c6511cfd 100644
--- a/apps/traincascade/old_ml_tree.cpp
+++ b/apps/traincascade/old_ml_tree.cpp
@@ -424,9 +424,9 @@ void CvDTreeTrainData::set_data( const CvMat* _train_data, int _tflag,
             int* c_map;
 
             if (is_buf_16u)
-                udst = (unsigned short*)(buf->data.s + vi*sample_count);
+                udst = (unsigned short*)(buf->data.s + (size_t)vi*sample_count);
             else
-                idst = buf->data.i + vi*sample_count;
+                idst = buf->data.i + (size_t)vi*sample_count;
 
             // copy data
             for( i = 0; i < sample_count; i++ )
@@ -540,9 +540,9 @@ void CvDTreeTrainData::set_data( const CvMat* _train_data, int _tflag,
         else if( ci < 0 ) // process ordered variable
         {
             if (is_buf_16u)
-                udst = (unsigned short*)(buf->data.s + vi*sample_count);
+                udst = (unsigned short*)(buf->data.s + (size_t)vi*sample_count);
             else
-                idst = buf->data.i + vi*sample_count;
+                idst = buf->data.i + (size_t)vi*sample_count;
 
             for( i = 0; i < sample_count; i++ )
             {
@@ -583,9 +583,9 @@ void CvDTreeTrainData::set_data( const CvMat* _train_data, int _tflag,
 
     // set sample labels
     if (is_buf_16u)
-        udst = (unsigned short*)(buf->data.s + work_var_count*sample_count);
+        udst = (unsigned short*)(buf->data.s + (size_t)work_var_count*sample_count);
     else
-        idst = buf->data.i + work_var_count*sample_count;
+        idst = buf->data.i + (size_t)work_var_count*sample_count;
 
     for (i = 0; i < sample_count; i++)
     {
@@ -602,7 +602,7 @@ void CvDTreeTrainData::set_data( const CvMat* _train_data, int _tflag,
 
         if (is_buf_16u)
         {
-            usdst = (unsigned short*)(buf->data.s + (get_work_var_count()-1)*sample_count);
+            usdst = (unsigned short*)(buf->data.s + (size_t)(get_work_var_count()-1)*sample_count);
             for( i = vi = 0; i < sample_count; i++ )
             {
                 usdst[i] = (unsigned short)vi++;
@@ -619,7 +619,7 @@ void CvDTreeTrainData::set_data( const CvMat* _train_data, int _tflag,
         }
         else
         {
-            idst2 = buf->data.i + (get_work_var_count()-1)*sample_count;
+            idst2 = buf->data.i + (size_t)(get_work_var_count()-1)*sample_count;
             for( i = vi = 0; i < sample_count; i++ )
             {
                 idst2[i] = vi++;
@@ -785,7 +785,7 @@ CvDTreeNode* CvDTreeTrainData::subsample_data( const CvMat* _subsample_idx )
                 if (is_buf_16u)
                 {
                     unsigned short* udst = (unsigned short*)(buf->data.s + root->buf_idx*get_length_subbuf() +
-                        vi*sample_count + root->offset);
+                        (size_t)vi*sample_count + root->offset);
                     for( i = 0; i < count; i++ )
                     {
                         int val = src[sidx[i]];
@@ -796,7 +796,7 @@ CvDTreeNode* CvDTreeTrainData::subsample_data( const CvMat* _subsample_idx )
                 else
                 {
                     int* idst = buf->data.i + root->buf_idx*get_length_subbuf() +
-                        vi*sample_count + root->offset;
+                        (size_t)vi*sample_count + root->offset;
                     for( i = 0; i < count; i++ )
                     {
                         int val = src[sidx[i]];
@@ -822,7 +822,7 @@ CvDTreeNode* CvDTreeTrainData::subsample_data( const CvMat* _subsample_idx )
                 if (is_buf_16u)
                 {
                     unsigned short* udst_idx = (unsigned short*)(buf->data.s + root->buf_idx*get_length_subbuf() +
-                        vi*sample_count + data_root->offset);
+                        (size_t)vi*sample_count + data_root->offset);
                     for( i = 0; i < num_valid; i++ )
                     {
                         idx = src_idx[i];
@@ -846,7 +846,7 @@ CvDTreeNode* CvDTreeTrainData::subsample_data( const CvMat* _subsample_idx )
                 else
                 {
                     int* idst_idx = buf->data.i + root->buf_idx*get_length_subbuf() +
-                        vi*sample_count + root->offset;
+                        (size_t)vi*sample_count + root->offset;
                     for( i = 0; i < num_valid; i++ )
                     {
                         idx = src_idx[i];
@@ -874,14 +874,14 @@ CvDTreeNode* CvDTreeTrainData::subsample_data( const CvMat* _subsample_idx )
         if (is_buf_16u)
         {
             unsigned short* sample_idx_dst = (unsigned short*)(buf->data.s + root->buf_idx*get_length_subbuf() +
-                workVarCount*sample_count + root->offset);
+                (size_t)workVarCount*sample_count + root->offset);
             for (i = 0; i < count; i++)
                 sample_idx_dst[i] = (unsigned short)sample_idx_src[sidx[i]];
         }
         else
         {
             int* sample_idx_dst = buf->data.i + root->buf_idx*get_length_subbuf() +
-                workVarCount*sample_count + root->offset;
+                (size_t)workVarCount*sample_count + root->offset;
             for (i = 0; i < count; i++)
                 sample_idx_dst[i] = sample_idx_src[sidx[i]];
         }
@@ -1192,10 +1192,10 @@ void CvDTreeTrainData::get_ord_var_data( CvDTreeNode* n, int vi, float* ord_valu
 
     if( !is_buf_16u )
         *sorted_indices = buf->data.i + n->buf_idx*get_length_subbuf() +
-        vi*sample_count + n->offset;
+        (size_t)vi*sample_count + n->offset;
     else {
         const unsigned short* short_indices = (const unsigned short*)(buf->data.s + n->buf_idx*get_length_subbuf() +
-            vi*sample_count + n->offset );
+            (size_t)vi*sample_count + n->offset );
         for( int i = 0; i < node_sample_count; i++ )
             sorted_indices_buf[i] = short_indices[i];
         *sorted_indices = sorted_indices_buf;
@@ -1266,10 +1266,10 @@ const int* CvDTreeTrainData::get_cat_var_data( CvDTreeNode* n, int vi, int* cat_
     const int* cat_values = 0;
     if( !is_buf_16u )
         cat_values = buf->data.i + n->buf_idx*get_length_subbuf() +
-            vi*sample_count + n->offset;
+            (size_t)vi*sample_count + n->offset;
     else {
         const unsigned short* short_values = (const unsigned short*)(buf->data.s + n->buf_idx*get_length_subbuf() +
-            vi*sample_count + n->offset);
+            (size_t)vi*sample_count + n->offset);
         for( int i = 0; i < n->sample_count; i++ )
             cat_values_buf[i] = short_values[i];
         cat_values = cat_values_buf;
diff --git a/apps/traincascade/traincascade.cpp b/apps/traincascade/traincascade.cpp
index f77f30dec4..745e3054b5 100644
--- a/apps/traincascade/traincascade.cpp
+++ b/apps/traincascade/traincascade.cpp
@@ -15,6 +15,7 @@ int main( int argc, char* argv[] )
     int precalcValBufSize = 1024,
         precalcIdxBufSize = 1024;
     bool baseFormatSave = false;
+    double acceptanceRatioBreakValue = -1.0;
 
     CvCascadeParams cascadeParams;
     CvCascadeBoostParams stageParams;
@@ -36,6 +37,7 @@ int main( int argc, char* argv[] )
         cout << "  [-precalcIdxBufSize <precalculated_idxs_buffer_size_in_Mb = " << precalcIdxBufSize << ">]" << endl;
         cout << "  [-baseFormatSave]" << endl;
         cout << "  [-numThreads <max_number_of_threads = " << numThreads << ">]" << endl;
+        cout << "  [-acceptanceRatioBreakValue <value> = " << acceptanceRatioBreakValue << ">]" << endl;
         cascadeParams.printDefaults();
         stageParams.printDefaults();
         for( int fi = 0; fi < fc; fi++ )
@@ -86,6 +88,10 @@ int main( int argc, char* argv[] )
         {
           numThreads = atoi(argv[++i]);
         }
+        else if( !strcmp( argv[i], "-acceptanceRatioBreakValue" ) )
+        {
+          acceptanceRatioBreakValue = atof(argv[++i]);
+        }
         else if ( cascadeParams.scanAttr( argv[i], argv[i+1] ) ) { i++; }
         else if ( stageParams.scanAttr( argv[i], argv[i+1] ) ) { i++; }
         else if ( !set )
@@ -112,6 +118,7 @@ int main( int argc, char* argv[] )
                       cascadeParams,
                       *featureParams[cascadeParams.featureType],
                       stageParams,
-                      baseFormatSave );
+                      baseFormatSave,
+                      acceptanceRatioBreakValue );
     return 0;
 }
diff --git a/cmake/FindCUDA.cmake b/cmake/FindCUDA.cmake
index ceaed5e3a3..5efd36c4ed 100644
--- a/cmake/FindCUDA.cmake
+++ b/cmake/FindCUDA.cmake
@@ -619,6 +619,8 @@ if(DEFINED CUDA_TARGET_CPU_ARCH)
   set(_cuda_target_cpu_arch_initial "${CUDA_TARGET_CPU_ARCH}")
 elseif(CUDA_VERSION VERSION_GREATER "5.0" AND CMAKE_CROSSCOMPILING AND CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|ARM)")
   set(_cuda_target_cpu_arch_initial "ARM")
+elseif(CUDA_VERSION VERSION_GREATER "6.5" AND CMAKE_CROSSCOMPILING AND CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|AARCH64)")
+  set(_cuda_target_cpu_arch_initial "AARCH64")
 else()
   set(_cuda_target_cpu_arch_initial "")
 endif()
@@ -643,6 +645,12 @@ elseif(CUDA_VERSION VERSION_GREATER "5.0" AND CMAKE_CROSSCOMPILING AND "${CUDA_T
   elseif(EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-gnueabihf")
     set(_cuda_target_triplet_initial "armv7-linux-gnueabihf")
   endif()
+elseif(CUDA_VERSION VERSION_GREATER "6.5" AND CMAKE_CROSSCOMPILING AND "${CUDA_TARGET_CPU_ARCH}" STREQUAL "AARCH64")
+  if("${CUDA_TARGET_OS_VARIANT}" STREQUAL "Android" AND EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/targets/aarch64-linux-androideabi")
+    set(_cuda_target_triplet_initial "aarch64-linux-androideabi")
+  elseif(EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/targets/aarch64-linux-gnueabihf")
+    set(_cuda_target_triplet_initial "aarch64-linux-gnueabihf")
+  endif()
 endif()
 set(CUDA_TARGET_TRIPLET "${_cuda_target_triplet_initial}" CACHE STRING "Specify the target triplet for which the input files must be compiled.")
 file(GLOB __cuda_available_target_tiplets RELATIVE "${CUDA_TOOLKIT_ROOT_DIR}/targets" "${CUDA_TOOLKIT_ROOT_DIR}/targets/*" )
@@ -1094,8 +1102,10 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
     set(nvcc_flags ${nvcc_flags} -m32)
   endif()
 
-  if(CUDA_TARGET_CPU_ARCH)
-    set(nvcc_flags ${nvcc_flags} "--target-cpu-architecture=${CUDA_TARGET_CPU_ARCH}")
+  if(CUDA_TARGET_CPU_ARCH AND CUDA_VERSION VERSION_LESS "7.0")
+    # CPU architecture is either ARM or X86. Patch AARCH64 to be ARM
+    string(REPLACE "AARCH64" "ARM" CUDA_TARGET_CPU_ARCH_patched ${CUDA_TARGET_CPU_ARCH})
+    set(nvcc_flags ${nvcc_flags} "--target-cpu-architecture=${CUDA_TARGET_CPU_ARCH_patched}")
   endif()
 
   if(CUDA_TARGET_OS_VARIANT AND CUDA_VERSION VERSION_LESS "7.0")
diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake
index 13559b5c8a..6c235ebfbf 100644
--- a/cmake/OpenCVCompilerOptions.cmake
+++ b/cmake/OpenCVCompilerOptions.cmake
@@ -98,6 +98,10 @@ if(CMAKE_COMPILER_IS_GNUCXX)
     add_extra_compiler_option(-pthread)
   endif()
 
+  if(CMAKE_COMPILER_IS_CLANGCXX)
+    add_extra_compiler_option(-Qunused-arguments)
+  endif()
+
   if(OPENCV_WARNINGS_ARE_ERRORS)
     add_extra_compiler_option(-Werror)
   endif()
@@ -127,6 +131,8 @@ if(CMAKE_COMPILER_IS_GNUCXX)
   endif()
   if(ENABLE_SSE2)
     add_extra_compiler_option(-msse2)
+  elseif(X86 OR X86_64)
+    add_extra_compiler_option(-mno-sse2)
   endif()
   if(ENABLE_NEON)
     add_extra_compiler_option("-mfpu=neon")
@@ -139,6 +145,8 @@ if(CMAKE_COMPILER_IS_GNUCXX)
   if(NOT MINGW)
     if(ENABLE_AVX)
       add_extra_compiler_option(-mavx)
+    elseif(X86 OR X86_64)
+      add_extra_compiler_option(-mno-avx)
     endif()
     if(ENABLE_AVX2)
       add_extra_compiler_option(-mavx2)
@@ -152,18 +160,26 @@ if(CMAKE_COMPILER_IS_GNUCXX)
     if(NOT OPENCV_EXTRA_CXX_FLAGS MATCHES "-mavx")
       if(ENABLE_SSE3)
         add_extra_compiler_option(-msse3)
+      elseif(X86 OR X86_64)
+        add_extra_compiler_option(-mno-sse3)
       endif()
 
       if(ENABLE_SSSE3)
         add_extra_compiler_option(-mssse3)
+      elseif(X86 OR X86_64)
+        add_extra_compiler_option(-mno-ssse3)
       endif()
 
       if(ENABLE_SSE41)
         add_extra_compiler_option(-msse4.1)
+      elseif(X86 OR X86_64)
+        add_extra_compiler_option(-mno-sse4.1)
       endif()
 
       if(ENABLE_SSE42)
         add_extra_compiler_option(-msse4.2)
+      elseif(X86 OR X86_64)
+        add_extra_compiler_option(-mno-sse4.2)
       endif()
 
       if(ENABLE_POPCNT)
@@ -265,6 +281,11 @@ if(MSVC)
   endif()
 endif()
 
+if(MSVC12 AND NOT CMAKE_GENERATOR MATCHES "Visual Studio")
+  set(OPENCV_EXTRA_C_FLAGS "${OPENCV_EXTRA_C_FLAGS} /FS")
+  set(OPENCV_EXTRA_CXX_FLAGS "${OPENCV_EXTRA_CXX_FLAGS} /FS")
+endif()
+
 # Extra link libs if the user selects building static libs:
 if(NOT BUILD_SHARED_LIBS AND CMAKE_COMPILER_IS_GNUCXX AND NOT ANDROID)
   # Android does not need these settings because they are already set by toolchain file
diff --git a/cmake/OpenCVConfig.cmake b/cmake/OpenCVConfig.cmake
index dfd7e8f268..09174b02fe 100644
--- a/cmake/OpenCVConfig.cmake
+++ b/cmake/OpenCVConfig.cmake
@@ -47,7 +47,7 @@ endif()
 
 if(NOT DEFINED OpenCV_STATIC)
   # look for global setting
-  if(NOT DEFINED BUILD_SHARED_LIBS OR BUILD_SHARED_LIBS)
+  if(BUILD_SHARED_LIBS)
     set(OpenCV_STATIC OFF)
   else()
     set(OpenCV_STATIC ON)
@@ -89,7 +89,7 @@ elseif(MINGW)
   execute_process(COMMAND ${CMAKE_CXX_COMPILER} -dumpmachine
                   OUTPUT_VARIABLE OPENCV_GCC_TARGET_MACHINE
                   OUTPUT_STRIP_TRAILING_WHITESPACE)
-  if(CMAKE_OPENCV_GCC_TARGET_MACHINE MATCHES "64")
+  if(OPENCV_GCC_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64")
     set(MINGW64 1)
     set(OpenCV_ARCH x64)
   else()
diff --git a/cmake/OpenCVDetectCXXCompiler.cmake b/cmake/OpenCVDetectCXXCompiler.cmake
index 72d939917a..c8484dca3f 100644
--- a/cmake/OpenCVDetectCXXCompiler.cmake
+++ b/cmake/OpenCVDetectCXXCompiler.cmake
@@ -91,9 +91,9 @@ elseif(CMAKE_COMPILER_IS_GNUCXX)
 
   if(WIN32)
     execute_process(COMMAND ${CMAKE_CXX_COMPILER} -dumpmachine
-              OUTPUT_VARIABLE CMAKE_OPENCV_GCC_TARGET_MACHINE
+              OUTPUT_VARIABLE OPENCV_GCC_TARGET_MACHINE
               OUTPUT_STRIP_TRAILING_WHITESPACE)
-    if(CMAKE_OPENCV_GCC_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64")
+    if(OPENCV_GCC_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64")
       set(MINGW64 1)
     endif()
   endif()
@@ -114,7 +114,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)")
 endif()
 
 
-# Similar code is existed in OpenCVConfig.cmake
+# Similar code exists in OpenCVConfig.cmake
 if(NOT DEFINED OpenCV_STATIC)
   # look for global setting
   if(NOT DEFINED BUILD_SHARED_LIBS OR BUILD_SHARED_LIBS)
@@ -147,11 +147,7 @@ if(MSVC)
 elseif(MINGW)
   set(OpenCV_RUNTIME mingw)
 
-  execute_process(COMMAND ${CMAKE_CXX_COMPILER} -dumpmachine
-                  OUTPUT_VARIABLE OPENCV_GCC_TARGET_MACHINE
-                  OUTPUT_STRIP_TRAILING_WHITESPACE)
-  if(CMAKE_OPENCV_GCC_TARGET_MACHINE MATCHES "64")
-    set(MINGW64 1)
+  if(MINGW64)
     set(OpenCV_ARCH x64)
   else()
     set(OpenCV_ARCH x86)
diff --git a/cmake/OpenCVFindLibsVideo.cmake b/cmake/OpenCVFindLibsVideo.cmake
index 1443c62cad..b2d927957f 100644
--- a/cmake/OpenCVFindLibsVideo.cmake
+++ b/cmake/OpenCVFindLibsVideo.cmake
@@ -187,7 +187,7 @@ if(WITH_XIMEA)
 endif(WITH_XIMEA)
 
 # --- FFMPEG ---
-ocv_clear_vars(HAVE_FFMPEG HAVE_FFMPEG_CODEC HAVE_FFMPEG_FORMAT HAVE_FFMPEG_UTIL HAVE_FFMPEG_SWSCALE HAVE_GENTOO_FFMPEG HAVE_FFMPEG_FFMPEG)
+ocv_clear_vars(HAVE_FFMPEG HAVE_FFMPEG_CODEC HAVE_FFMPEG_FORMAT HAVE_FFMPEG_UTIL HAVE_FFMPEG_SWSCALE HAVE_FFMPEG_RESAMPLE HAVE_GENTOO_FFMPEG HAVE_FFMPEG_FFMPEG)
 if(WITH_FFMPEG)
   if(WIN32 AND NOT ARM)
     include("${OpenCV_SOURCE_DIR}/3rdparty/ffmpeg/ffmpeg_version.cmake")
@@ -196,6 +196,7 @@ if(WITH_FFMPEG)
     CHECK_MODULE(libavformat HAVE_FFMPEG_FORMAT)
     CHECK_MODULE(libavutil HAVE_FFMPEG_UTIL)
     CHECK_MODULE(libswscale HAVE_FFMPEG_SWSCALE)
+    CHECK_MODULE(libavresample HAVE_FFMPEG_RESAMPLE)
 
     CHECK_INCLUDE_FILE(libavformat/avformat.h HAVE_GENTOO_FFMPEG)
     CHECK_INCLUDE_FILE(ffmpeg/avformat.h HAVE_FFMPEG_FFMPEG)
@@ -215,42 +216,43 @@ if(WITH_FFMPEG)
         # Do an other trial
         FIND_FILE(BZIP2_LIBRARIES NAMES libbz2.so.1 PATHS /lib)
       endif()
-    endif(HAVE_FFMPEG)
-  endif()
-
-  if(APPLE)
-    find_path(FFMPEG_INCLUDE_DIR "libavformat/avformat.h"
-              PATHS /usr/local /usr /opt
-              PATH_SUFFIXES include
-              DOC "The path to FFMPEG headers")
-    if(FFMPEG_INCLUDE_DIR)
-      set(HAVE_GENTOO_FFMPEG TRUE)
-      set(FFMPEG_LIB_DIR "${FFMPEG_INCLUDE_DIR}/../lib" CACHE PATH "Full path of FFMPEG library directory")
-      if(EXISTS "${FFMPEG_LIB_DIR}/libavcodec.a")
-        set(HAVE_FFMPEG_CODEC 1)
-        set(ALIASOF_libavcodec_VERSION "Unknown")
-        if(EXISTS "${FFMPEG_LIB_DIR}/libavformat.a")
-          set(HAVE_FFMPEG_FORMAT 1)
+    else()
+      find_path(FFMPEG_INCLUDE_DIR "libavformat/avformat.h"
+                PATHS /usr/local /usr /opt
+                PATH_SUFFIXES include
+                DOC "The path to FFMPEG headers")
+      if(FFMPEG_INCLUDE_DIR)
+        set(HAVE_GENTOO_FFMPEG TRUE)
+        set(FFMPEG_LIB_DIR "${FFMPEG_INCLUDE_DIR}/../lib" CACHE PATH "Full path of FFMPEG library directory")
+        find_library(FFMPEG_CODEC_LIB "avcodec" HINTS "${FFMPEG_LIB_DIR}")
+        find_library(FFMPEG_FORMAT_LIB "avformat" HINTS "${FFMPEG_LIB_DIR}")
+        find_library(FFMPEG_UTIL_LIB "avutil" HINTS "${FFMPEG_LIB_DIR}")
+        find_library(FFMPEG_SWSCALE_LIB "swscale" HINTS "${FFMPEG_LIB_DIR}")
+        find_library(FFMPEG_RESAMPLE_LIB "avresample" HINTS "${FFMPEG_LIB_DIR}")
+        if(FFMPEG_CODEC_LIB AND FFMPEG_FORMAT_LIB AND
+           FFMPEG_UTIL_LIB AND FFMPEG_SWSCALE_LIB)
+          set(ALIASOF_libavcodec_VERSION "Unknown")
           set(ALIASOF_libavformat_VERSION "Unknown")
-          if(EXISTS "${FFMPEG_LIB_DIR}/libavutil.a")
-            set(HAVE_FFMPEG_UTIL 1)
-            set(ALIASOF_libavutil_VERSION "Unknown")
-            if(EXISTS "${FFMPEG_LIB_DIR}/libswscale.a")
-              set(HAVE_FFMPEG_SWSCALE 1)
-              set(ALIASOF_libswscale_VERSION "Unknown")
-              set(HAVE_FFMPEG 1)
-            endif()
+          set(ALIASOF_libavutil_VERSION "Unknown")
+          set(ALIASOF_libswscale_VERSION "Unknown")
+          set(HAVE_FFMPEG 1)
+          if(FFMPEG_RESAMPLE_LIB)
+            set(HAVE_FFMPEG_RESAMPLE 1)
+            set(ALIASOF_libavresample_VERSION "Unknown")
           endif()
         endif()
-      endif()
-    endif(FFMPEG_INCLUDE_DIR)
-    if(HAVE_FFMPEG)
-      set(VIDEOIO_LIBRARIES ${VIDEOIO_LIBRARIES} "${FFMPEG_LIB_DIR}/libavcodec.a"
-          "${FFMPEG_LIB_DIR}/libavformat.a" "${FFMPEG_LIB_DIR}/libavutil.a"
-          "${FFMPEG_LIB_DIR}/libswscale.a")
-      ocv_include_directories(${FFMPEG_INCLUDE_DIR})
+      endif(FFMPEG_INCLUDE_DIR)
+      if(HAVE_FFMPEG)
+        set(VIDEOIO_LIBRARIES ${VIDEOIO_LIBRARIES} "${FFMPEG_LIB_DIR}/libavcodec.a"
+            "${FFMPEG_LIB_DIR}/libavformat.a" "${FFMPEG_LIB_DIR}/libavutil.a"
+            "${FFMPEG_LIB_DIR}/libswscale.a")
+        if(HAVE_FFMPEG_RESAMPLE)
+          set(VIDEOIO_LIBRARIES ${VIDEOIO_LIBRARIES} "${FFMPEG_LIB_DIR}/libavresample.a")
+        endif()
+        ocv_include_directories(${FFMPEG_INCLUDE_DIR})
+      endif(HAVE_FFMPEG)
     endif()
-  endif(APPLE)
+  endif()
 endif(WITH_FFMPEG)
 
 # --- VideoInput/DirectShow ---
diff --git a/cmake/OpenCVGenAndroidMK.cmake b/cmake/OpenCVGenAndroidMK.cmake
index 318c802ffe..d6f902b0f0 100644
--- a/cmake/OpenCVGenAndroidMK.cmake
+++ b/cmake/OpenCVGenAndroidMK.cmake
@@ -43,34 +43,17 @@ if(ANDROID)
   endforeach()
 
   # build the list of opencv libs and dependencies for all modules
-  set(OPENCV_MODULES_CONFIGMAKE "")
-  set(OPENCV_EXTRA_COMPONENTS_CONFIGMAKE "")
-  set(OPENCV_3RDPARTY_COMPONENTS_CONFIGMAKE "")
-  foreach(m ${OPENCV_MODULES_PUBLIC})
-    list(INSERT OPENCV_MODULES_CONFIGMAKE 0 ${${m}_MODULE_DEPS_${ocv_optkind}} ${m})
-    if(${m}_EXTRA_DEPS_${ocv_optkind})
-      list(INSERT OPENCV_EXTRA_COMPONENTS_CONFIGMAKE 0 ${${m}_EXTRA_DEPS_${ocv_optkind}})
-    endif()
-  endforeach()
+  ocv_get_all_libs(OPENCV_MODULES_CONFIGMAKE OPENCV_EXTRA_COMPONENTS_CONFIGMAKE OPENCV_3RDPARTY_COMPONENTS_CONFIGMAKE)
 
-  # split 3rdparty libs and modules
-  foreach(mod ${OPENCV_MODULES_CONFIGMAKE})
-    if(NOT mod MATCHES "^opencv_.+$")
-      list(INSERT OPENCV_3RDPARTY_COMPONENTS_CONFIGMAKE 0 ${mod})
-    endif()
-  endforeach()
-  if(OPENCV_3RDPARTY_COMPONENTS_CONFIGMAKE)
-    list(REMOVE_ITEM OPENCV_MODULES_CONFIGMAKE ${OPENCV_3RDPARTY_COMPONENTS_CONFIGMAKE})
-  endif()
+  # list -> string
+  string(REPLACE ";" " " OPENCV_MODULES_CONFIGMAKE "${OPENCV_MODULES_CONFIGMAKE}")
+  string(REPLACE ";" " " OPENCV_EXTRA_COMPONENTS_CONFIGMAKE "${OPENCV_EXTRA_COMPONENTS_CONFIGMAKE}")
+  string(REPLACE ";" " " OPENCV_3RDPARTY_COMPONENTS_CONFIGMAKE "${OPENCV_3RDPARTY_COMPONENTS_CONFIGMAKE}")
 
-  # convert CMake lists to makefile literals
-  foreach(lst OPENCV_MODULES_CONFIGMAKE OPENCV_3RDPARTY_COMPONENTS_CONFIGMAKE OPENCV_EXTRA_COMPONENTS_CONFIGMAKE)
-    ocv_list_unique(${lst})
-    ocv_list_reverse(${lst})
-    string(REPLACE ";" " " ${lst} "${${lst}}")
-  endforeach()
+  # replace 'opencv_<module>' -> '<module>'
   string(REPLACE "opencv_" "" OPENCV_MODULES_CONFIGMAKE "${OPENCV_MODULES_CONFIGMAKE}")
 
+
   # prepare 3rd-party component list without TBB for armeabi and mips platforms. TBB is useless there.
   set(OPENCV_3RDPARTY_COMPONENTS_CONFIGMAKE_NO_TBB ${OPENCV_3RDPARTY_COMPONENTS_CONFIGMAKE})
   foreach(mod ${OPENCV_3RDPARTY_COMPONENTS_CONFIGMAKE_NO_TBB})
diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake
index 955dcc4afd..d1558f4060 100644
--- a/cmake/OpenCVModule.cmake
+++ b/cmake/OpenCVModule.cmake
@@ -176,6 +176,11 @@ macro(ocv_add_module _name)
       endif()
     endif()
 
+    # add HAL as dependency
+    if(NOT "${the_module}" STREQUAL "opencv_hal")
+      ocv_add_dependencies(${the_module} opencv_hal)
+    endif()
+
     # add self to the world dependencies
     if((NOT DEFINED OPENCV_MODULE_IS_PART_OF_WORLD
         AND NOT OPENCV_MODULE_${the_module}_CLASS STREQUAL "BINDINGS"
@@ -517,6 +522,18 @@ macro(ocv_include_modules)
   endforeach()
 endmacro()
 
+# same as ocv_include_modules, but also processes each module's dependencies (OPENCV_MODULE_<module>_DEPS)
+macro(ocv_include_modules_recurse)
+  ocv_include_modules(${ARGN})
+  foreach(d ${ARGN})
+    if(d MATCHES "^opencv_" AND HAVE_${d} AND DEFINED OPENCV_MODULE_${d}_DEPS)
+      foreach (sub ${OPENCV_MODULE_${d}_DEPS})
+        ocv_include_modules(${sub})
+      endforeach()
+    endif()
+  endforeach()
+endmacro()
+
 # setup include paths for the list of passed modules
 macro(ocv_target_include_modules target)
   foreach(d ${ARGN})
diff --git a/cmake/OpenCVUtils.cmake b/cmake/OpenCVUtils.cmake
index 05576b3626..e7d60beb9b 100644
--- a/cmake/OpenCVUtils.cmake
+++ b/cmake/OpenCVUtils.cmake
@@ -415,31 +415,6 @@ function(status text)
 endfunction()
 
 
-# splits cmake libraries list of format "general;item1;debug;item2;release;item3" to two lists
-macro(ocv_split_libs_list lst lstdbg lstopt)
-  set(${lstdbg} "")
-  set(${lstopt} "")
-  set(perv_keyword "")
-  foreach(word ${${lst}})
-    if(word STREQUAL "debug" OR word STREQUAL "optimized")
-      set(perv_keyword ${word})
-    elseif(word STREQUAL "general")
-      set(perv_keyword "")
-    elseif(perv_keyword STREQUAL "debug")
-      list(APPEND ${lstdbg} "${word}")
-      set(perv_keyword "")
-    elseif(perv_keyword STREQUAL "optimized")
-      list(APPEND ${lstopt} "${word}")
-      set(perv_keyword "")
-    else()
-      list(APPEND ${lstdbg} "${word}")
-      list(APPEND ${lstopt} "${word}")
-      set(perv_keyword "")
-    endif()
-  endforeach()
-endmacro()
-
-
 # remove all matching elements from the list
 macro(ocv_list_filterout lst regex)
   foreach(item ${${lst}})
@@ -810,3 +785,35 @@ function(ocv_add_library target)
 
   _ocv_append_target_includes(${target})
 endfunction()
+
+# build the list of opencv libs and dependencies for all modules
+#  _modules - variable to hold list of all modules
+#  _extra - variable to hold list of extra dependencies
+#  _3rdparty - variable to hold list of prebuilt 3rdparty libraries
+macro(ocv_get_all_libs _modules _extra _3rdparty)
+  set(${_modules} "")
+  set(${_extra} "")
+  set(${_3rdparty} "")
+  foreach(m ${OPENCV_MODULES_PUBLIC})
+    get_target_property(deps ${m} INTERFACE_LINK_LIBRARIES)
+    list(INSERT ${_modules} 0 ${deps} ${m})
+    foreach (dep ${deps} ${OPENCV_LINKER_LIBS})
+      if (NOT DEFINED OPENCV_MODULE_${dep}_LOCATION)
+        if (TARGET ${dep})
+          list(INSERT ${_3rdparty} 0 ${dep})
+        else()
+          list(INSERT ${_extra} 0 ${dep})
+        endif()
+      endif()
+    endforeach()
+  endforeach()
+
+  # remove 3rdparty and extra libs from the list of modules
+  list(REMOVE_ITEM ${_modules} ${${_3rdparty}} ${${_extra}})
+
+  # remove duplicates and reverse each list (items were prepended above)
+  foreach(lst ${_modules} ${_3rdparty} ${_extra})
+    ocv_list_unique(${lst})
+    ocv_list_reverse(${lst})
+  endforeach()
+endmacro()
diff --git a/cmake/templates/OpenCVConfig.cmake.in b/cmake/templates/OpenCVConfig.cmake.in
index ee4a186430..a3cbbc08c7 100644
--- a/cmake/templates/OpenCVConfig.cmake.in
+++ b/cmake/templates/OpenCVConfig.cmake.in
@@ -77,6 +77,13 @@ if("@USE_IPPICV@" STREQUAL "TRUE") # value is defined by package builder (use ST
 endif()
 
 if(NOT TARGET opencv_core)
+  # Extract directory name from full path of the file currently being processed.
+  # Note that CMake 2.8.3 introduced CMAKE_CURRENT_LIST_DIR. We reimplement it
+  # for older versions of CMake to support these as well.
+  if(CMAKE_VERSION VERSION_LESS "2.8.3")
+    get_filename_component(CMAKE_CURRENT_LIST_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
+  endif()
+
   include(${CMAKE_CURRENT_LIST_DIR}/OpenCVModules${OpenCV_MODULES_SUFFIX}.cmake)
 endif()
 
@@ -388,6 +395,10 @@ macro(ocv_include_modules)
   include_directories(BEFORE "${OpenCV_INCLUDE_DIRS}")
 endmacro()
 
+macro(ocv_include_modules_recurse)
+  include_directories(BEFORE "${OpenCV_INCLUDE_DIRS}")
+endmacro()
+
 macro(ocv_target_link_libraries)
   target_link_libraries(${ARGN})
 endmacro()
diff --git a/cmake/templates/opencv_abi.xml.in b/cmake/templates/opencv_abi.xml.in
index 6a7a6d8d7e..292d9b491b 100644
--- a/cmake/templates/opencv_abi.xml.in
+++ b/cmake/templates/opencv_abi.xml.in
@@ -21,6 +21,7 @@
 </libs>
 
 <skip_headers>
+    opencv2/hal/intrin*
     opencv2/core/cuda*
     opencv2/core/private*
     opencv/cxeigen.hpp
diff --git a/doc/pattern_tools/gen_pattern.py b/doc/pattern_tools/gen_pattern.py
index 3643b6d3b2..fc1e74bbc3 100755
--- a/doc/pattern_tools/gen_pattern.py
+++ b/doc/pattern_tools/gen_pattern.py
@@ -1,13 +1,19 @@
 #!/usr/bin/env python
 
 """gen_pattern.py
-To run:
--c 10 -r 12 -o out.svg
--T type of pattern, circles, acircles, checkerboard
--s --square_size size of squares in pattern
--u --units mm, inches, px, m
--w  page width in units
--h  page height in units
+Usage example:
+python gen_pattern.py -o out.svg -r 11 -c 8 -T circles -s 20.0 -R 5.0 -u mm -w 216 -h 279
+
+-o, --output - output file (default out.svg)
+-r, --rows - pattern rows (default 11)
+-c, --columns - pattern columns (default 8)
+-T, --type - type of pattern, circles, acircles, checkerboard (default circles)
+-s, --square_size - size of squares in pattern (default 20.0)
+-R, --radius_rate - circles_radius = square_size/radius_rate (default 5.0)
+-u, --units - mm, inches, px, m (default mm)
+-w, --page_width - page width in units (default 216)
+-h, --page_height - page height in units (default 279)
+-H, --help - show help
 """
 
 from svgfig import *
@@ -16,18 +22,20 @@ import sys
 import getopt
 
 class PatternMaker:
-  def __init__(self, cols,rows,output,units,square_size,page_width,page_height):
+  def __init__(self, cols,rows,output,units,square_size,radius_rate,page_width,page_height):
     self.cols = cols
     self.rows = rows
     self.output = output
     self.units = units
     self.square_size = square_size
+    self.radius_rate = radius_rate
     self.width = page_width
     self.height = page_height
     self.g = SVG("g") # the svg group container
+
   def makeCirclesPattern(self):
     spacing = self.square_size
-    r = spacing / 5.0 #radius is a 5th of the spacing TODO parameterize
+    r = spacing / self.radius_rate
     for x in range(1,self.cols+1):
       for y in range(1,self.rows+1):
         dot = SVG("circle", cx=x * spacing, cy=y * spacing, r=r, fill="black")
@@ -35,7 +43,7 @@ class PatternMaker:
 
   def makeACirclesPattern(self):
     spacing = self.square_size
-    r = spacing / 5.0
+    r = spacing / self.radius_rate
     for i in range(0,self.rows):
       for j in range(0,self.cols):
         dot = SVG("circle", cx= ((j*2 + i%2)*spacing) + spacing, cy=self.height - (i * spacing + spacing), r=r, fill="black")
@@ -43,37 +51,23 @@ class PatternMaker:
 
   def makeCheckerboardPattern(self):
     spacing = self.square_size
-    r = spacing / 5.0
     for x in range(1,self.cols+1):
       for y in range(1,self.rows+1):
-        #TODO make a checkerboard pattern
-        dot = SVG("circle", cx=x * spacing, cy=y * spacing, r=r, fill="black")
-        self.g.append(dot)
+        if x%2 == y%2:
+          dot = SVG("rect", x=x * spacing, y=y * spacing, width=spacing, height=spacing, stroke_width="0", fill="black")
+          self.g.append(dot)
+
   def save(self):
     c = canvas(self.g,width="%d%s"%(self.width,self.units),height="%d%s"%(self.height,self.units),viewBox="0 0 %d %d"%(self.width,self.height))
     c.inkview(self.output)
 
-def makePattern(cols,rows,output,p_type,units,square_size,page_width,page_height):
-    width = page_width
-    spacing = square_size
-    height = page_height
-    r = spacing / 5.0
-    g = SVG("g") # the svg group container
-    for x in range(1,cols+1):
-      for y in range(1,rows+1):
-        if "circle" in p_type:
-          dot = SVG("circle", cx=x * spacing, cy=y * spacing, r=r, fill="black")
-        g.append(dot)
-    c = canvas(g,width="%d%s"%(width,units),height="%d%s"%(height,units),viewBox="0 0 %d %d"%(width,height))
-    c.inkview(output)
-
 
 def main():
     # parse command line options, TODO use argparse for better doc
     try:
-        opts, args = getopt.getopt(sys.argv[1:], "ho:c:r:T:u:s:w:h:", ["help","output","columns","rows",
-                                                                      "type","units","square_size","page_width",
-                                                                      "page_height"])
+        opts, args = getopt.getopt(sys.argv[1:], "Ho:c:r:T:u:s:R:w:h:", ["help","output=","columns=","rows=",
+                                                                      "type=","units=","square_size=","radius_rate=",
+                                                                      "page_width=","page_height="])
     except getopt.error, msg:
         print msg
         print "for help use --help"
@@ -84,11 +78,12 @@ def main():
     p_type = "circles"
     units = "mm"
     square_size = 20.0
+    radius_rate = 5.0
     page_width = 216    #8.5 inches
     page_height = 279   #11 inches
     # process options
     for o, a in opts:
-        if o in ("-h", "--help"):
+        if o in ("-H", "--help"):
             print __doc__
             sys.exit(0)
         elif o in ("-r", "--rows"):
@@ -103,11 +98,13 @@ def main():
             units = a
         elif o in ("-s", "--square_size"):
             square_size = float(a)
+        elif o in ("-R", "--radius_rate"):
+            radius_rate = float(a)
         elif o in ("-w", "--page_width"):
             page_width = float(a)
         elif o in ("-h", "--page_height"):
             page_height = float(a)
-    pm = PatternMaker(columns,rows,output,units,square_size,page_width,page_height)
+    pm = PatternMaker(columns,rows,output,units,square_size,radius_rate,page_width,page_height)
     #dict for easy lookup of pattern type
     mp = {"circles":pm.makeCirclesPattern,"acircles":pm.makeACirclesPattern,"checkerboard":pm.makeCheckerboardPattern}
     mp[p_type]()
diff --git a/doc/py_tutorials/py_calib3d/py_depthmap/py_depthmap.markdown b/doc/py_tutorials/py_calib3d/py_depthmap/py_depthmap.markdown
index 5ef3380159..7d9a1258a9 100644
--- a/doc/py_tutorials/py_calib3d/py_depthmap/py_depthmap.markdown
+++ b/doc/py_tutorials/py_calib3d/py_depthmap/py_depthmap.markdown
@@ -44,7 +44,7 @@ from matplotlib import pyplot as plt
 imgL = cv2.imread('tsukuba_l.png',0)
 imgR = cv2.imread('tsukuba_r.png',0)
 
-stereo = cv2.createStereoBM(numDisparities=16, blockSize=15)
+stereo = cv2.StereoBM_create(numDisparities=16, blockSize=15)
 disparity = stereo.compute(imgL,imgR)
 plt.imshow(disparity,'gray')
 plt.show()
diff --git a/doc/tutorials/calib3d/camera_calibration/camera_calibration.markdown b/doc/tutorials/calib3d/camera_calibration/camera_calibration.markdown
index 0b23643964..a7bd1f0597 100644
--- a/doc/tutorials/calib3d/camera_calibration/camera_calibration.markdown
+++ b/doc/tutorials/calib3d/camera_calibration/camera_calibration.markdown
@@ -30,7 +30,7 @@ y_{corrected} = y + [ p_1(r^2+ 2y^2)+ 2p_2xy]\f]
 So we have five distortion parameters which in OpenCV are presented as one row matrix with 5
 columns:
 
-\f[Distortion_{coefficients}=(k_1 \hspace{10pt} k_2 \hspace{10pt} p_1 \hspace{10pt} p_2 \hspace{10pt} k_3)\f]
+\f[distortion\_coefficients=(k_1 \hspace{10pt} k_2 \hspace{10pt} p_1 \hspace{10pt} p_2 \hspace{10pt} k_3)\f]
 
 Now for the unit conversion we use the following formula:
 
@@ -96,83 +96,30 @@ on how to do this you can find in the @ref tutorial_file_input_output_with_xml_y
 Explanation
 -----------
 
--#  **Read the settings.**
-    @code{.cpp}
-    Settings s;
-    const string inputSettingsFile = argc > 1 ? argv[1] : "default.xml";
-    FileStorage fs(inputSettingsFile, FileStorage::READ); // Read the settings
-    if (!fs.isOpened())
-    {
-          cout << "Could not open the configuration file: \"" << inputSettingsFile << "\"" << endl;
-          return -1;
-    }
-    fs["Settings"] >> s;
-    fs.release();                                         // close Settings file
-
-    if (!s.goodInput)
-    {
-          cout << "Invalid input detected. Application stopping. " << endl;
-          return -1;
-    }
-    @endcode
+-#  **Read the settings**
+    @snippet samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp file_read
+
     For this I've used simple OpenCV class input operation. After reading the file I've an
     additional post-processing function that checks validity of the input. Only if all inputs are
     good then *goodInput* variable will be true.
 
--#  **Get next input, if it fails or we have enough of them - calibrate**. After this we have a big
+-#  **Get next input, if it fails or we have enough of them - calibrate**
+
+    After this we have a big
     loop where we do the following operations: get the next image from the image list, camera or
     video file. If this fails or we have enough images then we run the calibration process. In case
     of image we step out of the loop and otherwise the remaining frames will be undistorted (if the
     option is set) via changing from *DETECTION* mode to the *CALIBRATED* one.
-    @code{.cpp}
-    for(int i = 0;;++i)
-    {
-      Mat view;
-      bool blinkOutput = false;
-
-      view = s.nextImage();
-
-      //-----  If no more image, or got enough, then stop calibration and show result -------------
-      if( mode == CAPTURING && imagePoints.size() >= (unsigned)s.nrFrames )
-      {
-            if( runCalibrationAndSave(s, imageSize,  cameraMatrix, distCoeffs, imagePoints))
-                  mode = CALIBRATED;
-            else
-                  mode = DETECTION;
-      }
-      if(view.empty())          // If no more images then run calibration, save and stop loop.
-      {
-                if( imagePoints.size() > 0 )
-                      runCalibrationAndSave(s, imageSize,  cameraMatrix, distCoeffs, imagePoints);
-                break;
-      imageSize = view.size();  // Format input image.
-      if( s.flipVertical )    flip( view, view, 0 );
-      }
-    @endcode
+    @snippet samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp get_input
     For some cameras we may need to flip the input image. Here we do this too.
 
--#  **Find the pattern in the current input**. The formation of the equations I mentioned above aims
+-#  **Find the pattern in the current input**
+
+    The formation of the equations I mentioned above aims
     to finding major patterns in the input: in case of the chessboard this are corners of the
     squares and for the circles, well, the circles themselves. The position of these will form the
     result which will be written into the *pointBuf* vector.
-    @code{.cpp}
-    vector<Point2f> pointBuf;
-
-    bool found;
-    switch( s.calibrationPattern ) // Find feature points on the input format
-    {
-    case Settings::CHESSBOARD:
-      found = findChessboardCorners( view, s.boardSize, pointBuf,
-      CALIB_CB_ADAPTIVE_THRESH | CALIB_CB_FAST_CHECK | CALIB_CB_NORMALIZE_IMAGE);
-      break;
-    case Settings::CIRCLES_GRID:
-      found = findCirclesGrid( view, s.boardSize, pointBuf );
-      break;
-    case Settings::ASYMMETRIC_CIRCLES_GRID:
-      found = findCirclesGrid( view, s.boardSize, pointBuf, CALIB_CB_ASYMMETRIC_GRID );
-      break;
-    }
-    @endcode
+    @snippet samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp find_pattern
     Depending on the type of the input pattern you use either the @ref cv::findChessboardCorners or
     the @ref cv::findCirclesGrid function. For both of them you pass the current image and the size
     of the board and you'll get the positions of the patterns. Furthermore, they return a boolean
@@ -188,109 +135,27 @@ Explanation
     *imagePoints* vector to collect all of the equations into a single container. Finally, for
     visualization feedback purposes we will draw the found points on the input image using @ref
     cv::findChessboardCorners function.
-    @code{.cpp}
-    if ( found)                // If done with success,
-      {
-          // improve the found corners' coordinate accuracy for chessboard
-            if( s.calibrationPattern == Settings::CHESSBOARD)
-            {
-                Mat viewGray;
-                cvtColor(view, viewGray, COLOR_BGR2GRAY);
-                cornerSubPix( viewGray, pointBuf, Size(11,11),
-                  Size(-1,-1), TermCriteria( TermCriteria::EPS+TermCriteria::MAX_ITER, 30, 0.1 ));
-            }
-
-            if( mode == CAPTURING &&  // For camera only take new samples after delay time
-                (!s.inputCapture.isOpened() || clock() - prevTimestamp > s.delay*1e-3*CLOCKS_PER_SEC) )
-            {
-                imagePoints.push_back(pointBuf);
-                prevTimestamp = clock();
-                blinkOutput = s.inputCapture.isOpened();
-            }
-
-            // Draw the corners.
-            drawChessboardCorners( view, s.boardSize, Mat(pointBuf), found );
-      }
-    @endcode
--#  **Show state and result to the user, plus command line control of the application**. This part
-    shows text output on the image.
-    @code{.cpp}
-    //----------------------------- Output Text ------------------------------------------------
-    string msg = (mode == CAPTURING) ? "100/100" :
-              mode == CALIBRATED ? "Calibrated" : "Press 'g' to start";
-    int baseLine = 0;
-    Size textSize = getTextSize(msg, 1, 1, 1, &baseLine);
-    Point textOrigin(view.cols - 2*textSize.width - 10, view.rows - 2*baseLine - 10);
-
-    if( mode == CAPTURING )
-    {
-      if(s.showUndistorsed)
-        msg = format( "%d/%d Undist", (int)imagePoints.size(), s.nrFrames );
-      else
-        msg = format( "%d/%d", (int)imagePoints.size(), s.nrFrames );
-    }
-
-    putText( view, msg, textOrigin, 1, 1, mode == CALIBRATED ?  GREEN : RED);
-
-    if( blinkOutput )
-       bitwise_not(view, view);
-    @endcode
+    @snippet samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp pattern_found
+-#  **Show state and result to the user, plus command line control of the application**
+
+    This part shows text output on the image.
+    @snippet samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp output_text
     If we ran calibration and got camera's matrix with the distortion coefficients we may want to
     correct the image using @ref cv::undistort function:
-    @code{.cpp}
-    //------------------------- Video capture  output  undistorted ------------------------------
-    if( mode == CALIBRATED && s.showUndistorsed )
-    {
-      Mat temp = view.clone();
-      undistort(temp, view, cameraMatrix, distCoeffs);
-    }
-    //------------------------------ Show image and check for input commands -------------------
-    imshow("Image View", view);
-    @endcode
-    Then we wait for an input key and if this is *u* we toggle the distortion removal, if it is *g*
-    we start again the detection process, and finally for the *ESC* key we quit the application:
-    @code{.cpp}
-    char key =  waitKey(s.inputCapture.isOpened() ? 50 : s.delay);
-    if( key  == ESC_KEY )
-          break;
-
-    if( key == 'u' && mode == CALIBRATED )
-       s.showUndistorsed = !s.showUndistorsed;
-
-    if( s.inputCapture.isOpened() && key == 'g' )
-    {
-      mode = CAPTURING;
-      imagePoints.clear();
-    }
-    @endcode
--#  **Show the distortion removal for the images too**. When you work with an image list it is not
+    @snippet samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp output_undistorted
+    Then we show the image and wait for an input key and if this is *u* we toggle the distortion removal,
+    if it is *g* we start again the detection process, and finally for the *ESC* key we quit the application:
+    @snippet samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp await_input
+-#  **Show the distortion removal for the images too**
+
+    When you work with an image list it is not
     possible to remove the distortion inside the loop. Therefore, you must do this after the loop.
     Taking advantage of this now I'll expand the @ref cv::undistort function, which is in fact first
     calls @ref cv::initUndistortRectifyMap to find transformation matrices and then performs
     transformation using @ref cv::remap function. Because, after successful calibration map
     calculation needs to be done only once, by using this expanded form you may speed up your
     application:
-    @code{.cpp}
-    if( s.inputType == Settings::IMAGE_LIST && s.showUndistorsed )
-    {
-      Mat view, rview, map1, map2;
-      initUndistortRectifyMap(cameraMatrix, distCoeffs, Mat(),
-          getOptimalNewCameraMatrix(cameraMatrix, distCoeffs, imageSize, 1, imageSize, 0),
-          imageSize, CV_16SC2, map1, map2);
-
-      for(int i = 0; i < (int)s.imageList.size(); i++ )
-      {
-          view = imread(s.imageList[i], 1);
-          if(view.empty())
-              continue;
-          remap(view, rview, map1, map2, INTER_LINEAR);
-          imshow("Image View", rview);
-          char c = waitKey();
-          if( c  == ESC_KEY || c == 'q' || c == 'Q' )
-              break;
-      }
-    }
-    @endcode
+    @snippet samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp show_results
 
 The calibration and save
 ------------------------
@@ -304,24 +169,7 @@ Therefore in the first function we just split up these two processes. Because we
 of the calibration variables we'll create these variables here and pass on both of them to the
 calibration and saving function. Again, I'll not show the saving part as that has little in common
 with the calibration. Explore the source file in order to find out how and what:
-@code{.cpp}
-bool runCalibrationAndSave(Settings& s, Size imageSize, Mat&  cameraMatrix, Mat& distCoeffs,vector<vector<Point2f> > imagePoints )
-{
- vector<Mat> rvecs, tvecs;
- vector<float> reprojErrs;
- double totalAvgErr = 0;
-
- bool ok = runCalibration(s,imageSize, cameraMatrix, distCoeffs, imagePoints, rvecs, tvecs,
-                          reprojErrs, totalAvgErr);
- cout << (ok ? "Calibration succeeded" : "Calibration failed")
-     << ". avg re projection error = "  << totalAvgErr ;
-
- if( ok )   // save only if the calibration was done with success
-     saveCameraParams( s, imageSize, cameraMatrix, distCoeffs, rvecs ,tvecs, reprojErrs,
-                         imagePoints, totalAvgErr);
- return ok;
-}
-@endcode
+@snippet samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp run_and_save
 We do the calibration with the help of the @ref cv::calibrateCamera function. It has the following
 parameters:
 
@@ -331,29 +179,7 @@ parameters:
     present. Because, we use a single pattern for all the input images we can calculate this just
     once and multiply it for all the other input views. We calculate the corner points with the
     *calcBoardCornerPositions* function as:
-    @code{.cpp}
-    void calcBoardCornerPositions(Size boardSize, float squareSize, vector<Point3f>& corners,
-                      Settings::Pattern patternType /*= Settings::CHESSBOARD*/)
-    {
-    corners.clear();
-
-    switch(patternType)
-    {
-    case Settings::CHESSBOARD:
-    case Settings::CIRCLES_GRID:
-      for( int i = 0; i < boardSize.height; ++i )
-        for( int j = 0; j < boardSize.width; ++j )
-            corners.push_back(Point3f(float( j*squareSize ), float( i*squareSize ), 0));
-      break;
-
-    case Settings::ASYMMETRIC_CIRCLES_GRID:
-      for( int i = 0; i < boardSize.height; i++ )
-         for( int j = 0; j < boardSize.width; j++ )
-            corners.push_back(Point3f(float((2*j + i % 2)*squareSize), float(i*squareSize), 0));
-      break;
-    }
-    }
-    @endcode
+    @snippet samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp board_corners
     And then multiply it as:
     @code{.cpp}
     vector<vector<Point3f> > objectPoints(1);
@@ -365,12 +191,8 @@ parameters:
     circle pattern). We have already collected this from @ref cv::findChessboardCorners or @ref
     cv::findCirclesGrid function. We just need to pass it on.
 -   The size of the image acquired from the camera, video file or the images.
--   The camera matrix. If we used the fixed aspect ratio option we need to set the \f$f_x\f$ to zero:
-    @code{.cpp}
-    cameraMatrix = Mat::eye(3, 3, CV_64F);
-    if( s.flag & CALIB_FIX_ASPECT_RATIO )
-         cameraMatrix.at<double>(0,0) = 1.0;
-    @endcode
+-   The camera matrix. If we used the fixed aspect ratio option we need to set \f$f_x\f$:
+    @snippet samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp fixed_aspect
 -   The distortion coefficient matrix. Initialize with zero.
     @code{.cpp}
     distCoeffs = Mat::zeros(8, 1, CV_64F);
@@ -393,33 +215,7 @@ double rms = calibrateCamera(objectPoints, imagePoints, imageSize, cameraMatrix,
     calculate the absolute norm between what we got with our transformation and the corner/circle
     finding algorithm. To find the average error we calculate the arithmetical mean of the errors
     calculated for all the calibration images.
-    @code{.cpp}
-    double computeReprojectionErrors( const vector<vector<Point3f> >& objectPoints,
-                              const vector<vector<Point2f> >& imagePoints,
-                              const vector<Mat>& rvecs, const vector<Mat>& tvecs,
-                              const Mat& cameraMatrix , const Mat& distCoeffs,
-                              vector<float>& perViewErrors)
-    {
-    vector<Point2f> imagePoints2;
-    int i, totalPoints = 0;
-    double totalErr = 0, err;
-    perViewErrors.resize(objectPoints.size());
-
-    for( i = 0; i < (int)objectPoints.size(); ++i )
-    {
-      projectPoints( Mat(objectPoints[i]), rvecs[i], tvecs[i], cameraMatrix,  // project
-                                           distCoeffs, imagePoints2);
-      err = norm(Mat(imagePoints[i]), Mat(imagePoints2), NORM_L2);              // difference
-
-      int n = (int)objectPoints[i].size();
-      perViewErrors[i] = (float) std::sqrt(err*err/n);                        // save for this view
-      totalErr        += err*err;                                             // sum it up
-      totalPoints     += n;
-    }
-
-    return std::sqrt(totalErr/totalPoints);              // calculate the arithmetical mean
-    }
-    @endcode
+    @snippet samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp compute_errors
 
 Results
 -------
@@ -461,20 +257,20 @@ the input. Here's, how a detected pattern should look:
 In both cases in the specified output XML/YAML file you'll find the camera and distortion
 coefficients matrices:
 @code{.xml}
-<Camera_Matrix type_id="opencv-matrix">
+<camera_matrix type_id="opencv-matrix">
 <rows>3</rows>
 <cols>3</cols>
 <dt>d</dt>
 <data>
  6.5746697944293521e+002 0. 3.1950000000000000e+002 0.
- 6.5746697944293521e+002 2.3950000000000000e+002 0. 0. 1.</data></Camera_Matrix>
-<Distortion_Coefficients type_id="opencv-matrix">
+ 6.5746697944293521e+002 2.3950000000000000e+002 0. 0. 1.</data></camera_matrix>
+<distortion_coefficients type_id="opencv-matrix">
 <rows>5</rows>
 <cols>1</cols>
 <dt>d</dt>
 <data>
  -4.1802327176423804e-001 5.0715244063187526e-001 0. 0.
- -5.7843597214487474e-001</data></Distortion_Coefficients>
+ -5.7843597214487474e-001</data></distortion_coefficients>
 @endcode
 Add these values as constants to your program, call the @ref cv::initUndistortRectifyMap and the
 @ref cv::remap function to remove distortion and enjoy distortion free inputs for cheap and low
diff --git a/doc/tutorials/core/adding_images/adding_images.markdown b/doc/tutorials/core/adding_images/adding_images.markdown
index b6ef7b7cd2..1565e7ed8b 100644
--- a/doc/tutorials/core/adding_images/adding_images.markdown
+++ b/doc/tutorials/core/adding_images/adding_images.markdown
@@ -22,7 +22,7 @@ From our previous tutorial, we know already a bit of *Pixel operators*. An inter
 \f[g(x) = (1 - \alpha)f_{0}(x) + \alpha f_{1}(x)\f]
 
 By varying \f$\alpha\f$ from \f$0 \rightarrow 1\f$ this operator can be used to perform a temporal
-*cross-disolve* between two images or videos, as seen in slide shows and film productions (cool,
+*cross-dissolve* between two images or videos, as seen in slide shows and film productions (cool,
 eh?)
 
 Code
diff --git a/doc/tutorials/core/basic_geometric_drawing/basic_geometric_drawing.markdown b/doc/tutorials/core/basic_geometric_drawing/basic_geometric_drawing.markdown
index f7888590c9..db1f774211 100644
--- a/doc/tutorials/core/basic_geometric_drawing/basic_geometric_drawing.markdown
+++ b/doc/tutorials/core/basic_geometric_drawing/basic_geometric_drawing.markdown
@@ -145,7 +145,7 @@ Explanation
             of size **(w/4.0, w/16.0)**
         -   The ellipse is rotated **angle** degrees
         -   The ellipse extends an arc between **0** and **360** degrees
-        -   The color of the figure will be **Scalar( 255, 255, 0)** which means blue in RGB value.
+        -   The color of the figure will be **Scalar( 255, 0, 0)** which means blue in BGR value.
         -   The ellipse's **thickness** is 2.
     -   *MyFilledCircle*
         @code{.cpp}
diff --git a/doc/tutorials/core/random_generator_and_text/random_generator_and_text.markdown b/doc/tutorials/core/random_generator_and_text/random_generator_and_text.markdown
index fa7dc07ee7..b9d39756b0 100644
--- a/doc/tutorials/core/random_generator_and_text/random_generator_and_text.markdown
+++ b/doc/tutorials/core/random_generator_and_text/random_generator_and_text.markdown
@@ -111,7 +111,7 @@ Explanation
         pt1.y = rng.uniform( y_1, y_2 );
         @endcode
         -   We know that **rng** is a *Random number generator* object. In the code above we are
-            calling **rng.uniform(a,b)**. This generates a radombly uniformed distribution between
+            calling **rng.uniform(a,b)**. This generates a uniformly distributed random value between
             the values **a** and **b** (inclusive in **a**, exclusive in **b**).
         -   From the explanation above, we deduce that the extremes *pt1* and *pt2* will be random
             values, so the lines positions will be quite impredictable, giving a nice visual effect
@@ -133,7 +133,7 @@ Explanation
             are used as the *R*, *G* and *B* parameters for the line color. Hence, the color of the
             lines will be random too!
 
--#  The explanation above applies for the other functions generating circles, ellipses, polygones,
+-#  The explanation above applies for the other functions generating circles, ellipses, polygons,
     etc. The parameters such as *center* and *vertices* are also generated randomly.
 -#  Before finishing, we also should take a look at the functions *Display_Random_Text* and
     *Displaying_Big_End*, since they both have a few interesting features:
diff --git a/doc/tutorials/imgproc/imgtrans/hough_lines/hough_lines.markdown b/doc/tutorials/imgproc/imgtrans/hough_lines/hough_lines.markdown
index cc73fca1e0..9c651a6195 100644
--- a/doc/tutorials/imgproc/imgtrans/hough_lines/hough_lines.markdown
+++ b/doc/tutorials/imgproc/imgtrans/hough_lines/hough_lines.markdown
@@ -55,7 +55,7 @@ Arranging the terms: \f$r = x \cos \theta + y \sin \theta\f$
 -#  We can do the same operation above for all the points in an image. If the curves of two
     different points intersect in the plane \f$\theta\f$ - \f$r\f$, that means that both points belong to a
     same line. For instance, following with the example above and drawing the plot for two more
-    points: \f$x_{1} = 9\f$, \f$y_{1} = 4\f$ and \f$x_{2} = 12\f$, \f$y_{2} = 3\f$, we get:
+    points: \f$x_{1} = 4\f$, \f$y_{1} = 9\f$ and \f$x_{2} = 12\f$, \f$y_{2} = 3\f$, we get:
 
     ![](images/Hough_Lines_Tutorial_Theory_2.jpg)
 
diff --git a/doc/tutorials/introduction/biicode/tutorial_biicode.markdown b/doc/tutorials/introduction/biicode/tutorial_biicode.markdown
index bade3a6601..ce3d267231 100644
--- a/doc/tutorials/introduction/biicode/tutorial_biicode.markdown
+++ b/doc/tutorials/introduction/biicode/tutorial_biicode.markdown
@@ -41,7 +41,7 @@ Windows users also execute:
 $ bii cpp:configure -G "Visual Studio 12"
 @endcode
 
-Now execute ``bii cpp:build`` to build the project. **Note** that this can take a while, until it downloads and builds OpenCV. However, this is downloaded just once in your machine in your "user/.biicode" folder. If the OpenCV installation process fails, you might simply go there, delete OpenCV files inside "user/.biicode" and repeat.
+Now execute ``bii cpp:build`` to build the project. @note This can take a while, until it downloads and builds OpenCV. However, this is downloaded just once on your machine, to your "user/.biicode" folder. If the OpenCV installation process fails, you might simply go there, delete OpenCV files inside "user/.biicode" and repeat.
 
 @code{.bash}
 $ bii cpp:build
@@ -137,7 +137,7 @@ replace with:
         diego/opencv(beta): 0
 @endcode
 
-**Note** that the first time you switch to 3.0-beta, it will also take a while to download and build the 3.0-beta release. From that point you can change back and forth between versions, just modifying your *biicode.conf requirements*.
+@note The first time you switch to 3.0-beta, it will also take a while to download and build the 3.0-beta release. From that point on you can change back and forth between versions just by modifying your *biicode.conf requirements*.
 
 Find the hooks and examples:
 * [OpenCV 2.4.10](http://www.biicode.com/diego/opencv)
diff --git a/doc/tutorials/introduction/linux_gcc_cmake/linux_gcc_cmake.markdown b/doc/tutorials/introduction/linux_gcc_cmake/linux_gcc_cmake.markdown
index 4f4adbed88..fd447307a7 100644
--- a/doc/tutorials/introduction/linux_gcc_cmake/linux_gcc_cmake.markdown
+++ b/doc/tutorials/introduction/linux_gcc_cmake/linux_gcc_cmake.markdown
@@ -53,9 +53,9 @@ Now you have to create your CMakeLists.txt file. It should look like this:
 cmake_minimum_required(VERSION 2.8)
 project( DisplayImage )
 find_package( OpenCV REQUIRED )
-include_directories( \f${OpenCV_INCLUDE_DIRS} )
+include_directories( ${OpenCV_INCLUDE_DIRS} )
 add_executable( DisplayImage DisplayImage.cpp )
-target_link_libraries( DisplayImage \f${OpenCV_LIBS} )
+target_link_libraries( DisplayImage ${OpenCV_LIBS} )
 @endcode
 ### Generate the executable
 
diff --git a/doc/user_guide/ug_traincascade.markdown b/doc/user_guide/ug_traincascade.markdown
index d35ec6f5f6..1bc7ff5f9a 100644
--- a/doc/user_guide/ug_traincascade.markdown
+++ b/doc/user_guide/ug_traincascade.markdown
@@ -256,6 +256,12 @@ Command line arguments of opencv_traincascade application grouped by purposes:
         Maximum number of threads to use during training. Notice that the actual number of used
         threads may be lower, depending on your machine and compilation options.
 
+    -   -acceptanceRatioBreakValue \<break_value\>
+
+        This argument is used to determine how precisely your model should keep learning and when to stop.
+        A good guideline is not to train further than 10e-5, to ensure the model does not overtrain on your training data.
+        By default this value is set to -1 to disable this feature.
+
 -#  Cascade parameters:
 
     -   -stageType \<BOOST(default)\>
diff --git a/modules/calib3d/include/opencv2/calib3d.hpp b/modules/calib3d/include/opencv2/calib3d.hpp
index 7b01a7bbcd..65cf557b4b 100644
--- a/modules/calib3d/include/opencv2/calib3d.hpp
+++ b/modules/calib3d/include/opencv2/calib3d.hpp
@@ -697,19 +697,19 @@ CV_EXPORTS_W bool findCirclesGrid( InputArray image, Size patternSize,
 
 /** @brief Finds the camera intrinsic and extrinsic parameters from several views of a calibration pattern.
 
-@param objectPoints In the new interface it is a vector of vectors of calibration pattern points
-in the calibration pattern coordinate space. The outer vector contains as many elements as the
-number of the pattern views. If the same calibration pattern is shown in each view and it is fully
-visible, all the vectors will be the same. Although, it is possible to use partially occluded
-patterns, or even different patterns in different views. Then, the vectors will be different. The
-points are 3D, but since they are in a pattern coordinate system, then, if the rig is planar, it
-may make sense to put the model to a XY coordinate plane so that Z-coordinate of each input object
-point is 0.
+@param objectPoints In the new interface it is a vector of vectors of calibration pattern points in
+the calibration pattern coordinate space (e.g. std::vector<std::vector<cv::Vec3f>>). The outer
+vector contains as many elements as the number of the pattern views. If the same calibration pattern
+is shown in each view and it is fully visible, all the vectors will be the same. Although, it is
+possible to use partially occluded patterns, or even different patterns in different views. Then,
+the vectors will be different. The points are 3D, but since they are in a pattern coordinate system,
+then, if the rig is planar, it may make sense to put the model to a XY coordinate plane so that
+Z-coordinate of each input object point is 0.
 In the old interface all the vectors of object points from different views are concatenated
 together.
-@param imagePoints In the new interface it is a vector of vectors of the projections of
-calibration pattern points. imagePoints.size() and objectPoints.size() and imagePoints[i].size()
-must be equal to objectPoints[i].size() for each i.
+@param imagePoints In the new interface it is a vector of vectors of the projections of calibration
+pattern points (e.g. std::vector<std::vector<cv::Vec2f>>). imagePoints.size() must be equal to
+objectPoints.size(), and imagePoints[i].size() must be equal to objectPoints[i].size() for each i.
 In the old interface all the vectors of object points from different views are concatenated
 together.
 @param imageSize Size of the image used only to initialize the intrinsic camera matrix.
@@ -719,11 +719,11 @@ and/or CV_CALIB_FIX_ASPECT_RATIO are specified, some or all of fx, fy, cx, cy mu
 initialized before calling the function.
 @param distCoeffs Output vector of distortion coefficients
 \f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6],[s_1, s_2, s_3, s_4]])\f$ of 4, 5, 8 or 12 elements.
-@param rvecs Output vector of rotation vectors (see Rodrigues ) estimated for each pattern view.
-That is, each k-th rotation vector together with the corresponding k-th translation vector (see
-the next output parameter description) brings the calibration pattern from the model coordinate
-space (in which object points are specified) to the world coordinate space, that is, a real
-position of the calibration pattern in the k-th pattern view (k=0.. *M* -1).
+@param rvecs Output vector of rotation vectors (see Rodrigues ) estimated for each pattern view
+(e.g. std::vector<cv::Mat>). That is, each k-th rotation vector together with the corresponding
+k-th translation vector (see the next output parameter description) brings the calibration pattern
+from the model coordinate space (in which object points are specified) to the world coordinate
+space, that is, a real position of the calibration pattern in the k-th pattern view (k=0.. *M* -1).
 @param tvecs Output vector of translation vectors estimated for each pattern view.
 @param flags Different flags that may be zero or a combination of the following values:
 -   **CV_CALIB_USE_INTRINSIC_GUESS** cameraMatrix contains valid initial values of
@@ -1200,7 +1200,7 @@ for the other points. The array is computed only in the RANSAC and LMedS methods
 This function estimates essential matrix based on the five-point algorithm solver in @cite Nister03 .
 @cite SteweniusCFS is also a related. The epipolar geometry is described by the following equation:
 
-\f[[p_2; 1]^T K^T E K [p_1; 1] = 0 \\\f]\f[K =
+\f[[p_2; 1]^T K^{-T} E K^{-1} [p_1; 1] = 0 \\\f]\f[K =
 \begin{bmatrix}
 f & 0 & x_{pp}  \\
 0 & f & y_{pp}  \\
diff --git a/modules/calib3d/src/fundam.cpp b/modules/calib3d/src/fundam.cpp
index a97ed2c709..230182e8c9 100644
--- a/modules/calib3d/src/fundam.cpp
+++ b/modules/calib3d/src/fundam.cpp
@@ -641,7 +641,7 @@ static int run8Point( const Mat& _m1, const Mat& _m2, Mat& _fmatrix )
     W.at<double>(2) = 0.;
 
     // F0 <- U*diag([W(1), W(2), 0])*V'
-    gemm( U, Mat::diag(W), 1., 0, 0., TF, GEMM_1_T );
+    gemm( U, Mat::diag(W), 1., 0, 0., TF, 0 );
     gemm( TF, V, 1., 0, 0., F0, 0/*CV_GEMM_B_T*/ );
 
     // apply the transformation that is inverse
diff --git a/modules/calib3d/src/triangulate.cpp b/modules/calib3d/src/triangulate.cpp
index b0af3dc466..a19f96d10e 100644
--- a/modules/calib3d/src/triangulate.cpp
+++ b/modules/calib3d/src/triangulate.cpp
@@ -63,8 +63,7 @@ cvTriangulatePoints(CvMat* projMatr1, CvMat* projMatr2, CvMat* projPoints1, CvMa
       !CV_IS_MAT(points4D) )
       CV_Error( CV_StsUnsupportedFormat, "Input parameters must be matrices" );
 
-    int numPoints;
-    numPoints = projPoints1->cols;
+    int numPoints = projPoints1->cols;
 
     if( numPoints < 1 )
         CV_Error( CV_StsOutOfRange, "Number of points must be more than zero" );
@@ -82,57 +81,39 @@ cvTriangulatePoints(CvMat* projMatr1, CvMat* projMatr2, CvMat* projPoints1, CvMa
        projMatr2->cols != 4 || projMatr2->rows != 3)
         CV_Error( CV_StsUnmatchedSizes, "Size of projection matrices must be 3x4" );
 
-    CvMat matrA;
-    double matrA_dat[24];
-    matrA = cvMat(6,4,CV_64F,matrA_dat);
+    // preallocate SVD matrices on stack
+    cv::Matx<double, 6, 4> matrA;
+    cv::Matx<double, 6, 4> matrU;
+    cv::Matx<double, 4, 1> matrW;
+    cv::Matx<double, 4, 4> matrV;
 
-    //CvMat matrU;
-    CvMat matrW;
-    CvMat matrV;
-    //double matrU_dat[9*9];
-    double matrW_dat[6*4];
-    double matrV_dat[4*4];
-
-    //matrU = cvMat(6,6,CV_64F,matrU_dat);
-    matrW = cvMat(6,4,CV_64F,matrW_dat);
-    matrV = cvMat(4,4,CV_64F,matrV_dat);
-
-    CvMat* projPoints[2];
-    CvMat* projMatrs[2];
-
-    projPoints[0] = projPoints1;
-    projPoints[1] = projPoints2;
-
-    projMatrs[0] = projMatr1;
-    projMatrs[1] = projMatr2;
+    CvMat* projPoints[2] = {projPoints1, projPoints2};
+    CvMat* projMatrs[2] = {projMatr1, projMatr2};
 
     /* Solve system for each point */
-    int i,j;
-    for( i = 0; i < numPoints; i++ )/* For each point */
+    for( int i = 0; i < numPoints; i++ )/* For each point */
     {
         /* Fill matrix for current point */
-        for( j = 0; j < 2; j++ )/* For each view */
+        for( int j = 0; j < 2; j++ )/* For each view */
         {
             double x,y;
             x = cvmGet(projPoints[j],0,i);
             y = cvmGet(projPoints[j],1,i);
             for( int k = 0; k < 4; k++ )
             {
-                cvmSet(&matrA, j*3+0, k, x * cvmGet(projMatrs[j],2,k) -     cvmGet(projMatrs[j],0,k) );
-                cvmSet(&matrA, j*3+1, k, y * cvmGet(projMatrs[j],2,k) -     cvmGet(projMatrs[j],1,k) );
-                cvmSet(&matrA, j*3+2, k, x * cvmGet(projMatrs[j],1,k) - y * cvmGet(projMatrs[j],0,k) );
+                matrA(j*3+0, k) = x * cvmGet(projMatrs[j],2,k) -     cvmGet(projMatrs[j],0,k);
+                matrA(j*3+1, k) = y * cvmGet(projMatrs[j],2,k) -     cvmGet(projMatrs[j],1,k);
+                matrA(j*3+2, k) = x * cvmGet(projMatrs[j],1,k) - y * cvmGet(projMatrs[j],0,k);
             }
         }
         /* Solve system for current point */
-        {
-            cvSVD(&matrA,&matrW,0,&matrV,CV_SVD_V_T);
+        cv::SVD::compute(matrA, matrW, matrU, matrV);
 
-            /* Copy computed point */
-            cvmSet(points4D,0,i,cvmGet(&matrV,3,0));/* X */
-            cvmSet(points4D,1,i,cvmGet(&matrV,3,1));/* Y */
-            cvmSet(points4D,2,i,cvmGet(&matrV,3,2));/* Z */
-            cvmSet(points4D,3,i,cvmGet(&matrV,3,3));/* W */
-        }
+        /* Copy computed point */
+        cvmSet(points4D,0,i,matrV(3,0));/* X */
+        cvmSet(points4D,1,i,matrV(3,1));/* Y */
+        cvmSet(points4D,2,i,matrV(3,2));/* Z */
+        cvmSet(points4D,3,i,matrV(3,3));/* W */
     }
 
 #if 0
diff --git a/modules/calib3d/test/test_fisheye.cpp b/modules/calib3d/test/test_fisheye.cpp
index 553b81c39b..d4212e94fc 100644
--- a/modules/calib3d/test/test_fisheye.cpp
+++ b/modules/calib3d/test/test_fisheye.cpp
@@ -381,7 +381,7 @@ TEST_F(fisheyeTest, EtimateUncertainties)
     EXPECT_MAT_NEAR(errors.c, cv::Vec2d(0.890439368129246, 0.816096854937896), 1e-10);
     EXPECT_MAT_NEAR(errors.k, cv::Vec4d(0.00516248605191506, 0.0168181467500934, 0.0213118690274604, 0.00916010877545648), 1e-10);
     EXPECT_MAT_NEAR(err_std, cv::Vec2d(0.187475975266883, 0.185678953263995), 1e-10);
-    CV_Assert(abs(rms - 0.263782587133546) < 1e-10);
+    CV_Assert(fabs(rms - 0.263782587133546) < 1e-10);
     CV_Assert(errors.alpha == 0);
 }
 
diff --git a/modules/calib3d/test/test_fundam.cpp b/modules/calib3d/test/test_fundam.cpp
index 7eb12ad24b..5f8d30de40 100644
--- a/modules/calib3d/test/test_fundam.cpp
+++ b/modules/calib3d/test/test_fundam.cpp
@@ -973,26 +973,12 @@ int CV_FundamentalMatTest::prepare_test_case( int test_case_idx )
     return code;
 }
 
-
 void CV_FundamentalMatTest::run_func()
 {
-    //if(!test_cpp)
-    {
-        CvMat _input0 = test_mat[INPUT][0], _input1 = test_mat[INPUT][1];
-        CvMat F = test_mat[TEMP][0], mask = test_mat[TEMP][1];
-        f_result = cvFindFundamentalMat( &_input0, &_input1, &F, method, MAX(sigma*3, 0.01), 0, &mask );
-    }
-    /*else
-    {
-        cv::findFundamentalMat(const Mat& points1, const Mat& points2,
-        vector<uchar>& mask, int method=FM_RANSAC,
-        double param1=3., double param2=0.99 );
-
-        CV_EXPORTS Mat findFundamentalMat( const Mat& points1, const Mat& points2,
-                                          int method=FM_RANSAC,
-                                          double param1=3., double param2=0.99 );
-    }*/
-
+    // cvFindFundamentalMat calls cv::findFundamentalMat
+    CvMat _input0 = test_mat[INPUT][0], _input1 = test_mat[INPUT][1];
+    CvMat F = test_mat[TEMP][0], mask = test_mat[TEMP][1];
+    f_result = cvFindFundamentalMat( &_input0, &_input1, &F, method, MAX(sigma*3, 0.01), 0, &mask );
 }
 
 
@@ -1022,7 +1008,7 @@ void CV_FundamentalMatTest::prepare_to_validation( int test_case_idx )
     F0 *= 1./f0[8];
 
     uchar* status = test_mat[TEMP][1].ptr();
-    double err_level = method <= CV_FM_8POINT ? 1 : get_success_error_level( test_case_idx, OUTPUT, 1 );
+    double err_level = get_success_error_level( test_case_idx, OUTPUT, 1 );
     uchar* mtfm1 = test_mat[REF_OUTPUT][1].ptr();
     uchar* mtfm2 = test_mat[OUTPUT][1].ptr();
     double* f_prop1 = test_mat[REF_OUTPUT][0].ptr<double>();
diff --git a/modules/calib3d/test/test_reproject_image_to_3d.cpp b/modules/calib3d/test/test_reproject_image_to_3d.cpp
index 3e77a290ce..7364d3bf46 100644
--- a/modules/calib3d/test/test_reproject_image_to_3d.cpp
+++ b/modules/calib3d/test/test_reproject_image_to_3d.cpp
@@ -138,7 +138,12 @@ protected:
             {
                 InT d = disp(y, x);
 
-                double from[4] = { x, y, d, 1 };
+                double from[4] = {
+                    static_cast<double>(x),
+                    static_cast<double>(y),
+                    static_cast<double>(d),
+                    1.0,
+                };
                 Mat_<double> res = Q * Mat_<double>(4, 1, from);
                 res /= res(3, 0);
 
diff --git a/modules/calib3d/test/test_solvepnp_ransac.cpp b/modules/calib3d/test/test_solvepnp_ransac.cpp
index c8d8735b8e..76a3966bba 100644
--- a/modules/calib3d/test/test_solvepnp_ransac.cpp
+++ b/modules/calib3d/test/test_solvepnp_ransac.cpp
@@ -183,6 +183,9 @@ protected:
                         method, totalTestsCount - successfulTestsCount, totalTestsCount, maxError, mode);
                     ts->set_failed_test_info(cvtest::TS::FAIL_BAD_ACCURACY);
                 }
+                cout << "mode: " << mode << ", method: " << method << " -> "
+                     << ((double)successfulTestsCount / totalTestsCount) * 100 << "%"
+                     << " (err < " << maxError << ")" << endl;
             }
         }
     }
diff --git a/modules/calib3d/test/test_undistort_badarg.cpp b/modules/calib3d/test/test_undistort_badarg.cpp
index f3f762fa65..cfce8a49c6 100644
--- a/modules/calib3d/test/test_undistort_badarg.cpp
+++ b/modules/calib3d/test/test_undistort_badarg.cpp
@@ -104,7 +104,10 @@ void CV_UndistortPointsBadArgTest::run(int)
     img_size.height = 600;
     double cam[9] = {150.f, 0.f, img_size.width/2.f, 0, 300.f, img_size.height/2.f, 0.f, 0.f, 1.f};
     double dist[4] = {0.01,0.02,0.001,0.0005};
-    double s_points[N_POINTS2] = {img_size.width/4,img_size.height/4};
+    double s_points[N_POINTS2] = {
+        static_cast<double>(img_size.width) / 4.0,
+        static_cast<double>(img_size.height) / 4.0,
+    };
     double d_points[N_POINTS2];
     double p[9] = {155.f, 0.f, img_size.width/2.f+img_size.width/50.f, 0, 310.f, img_size.height/2.f+img_size.height/50.f, 0.f, 0.f, 1.f};
     double r[9] = {1,0,0,0,1,0,0,0,1};
diff --git a/modules/core/include/opencv2/core.hpp b/modules/core/include/opencv2/core.hpp
index 2a51de22e5..e4c61e43ad 100644
--- a/modules/core/include/opencv2/core.hpp
+++ b/modules/core/include/opencv2/core.hpp
@@ -10,8 +10,10 @@
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2000-2015, Intel Corporation, all rights reserved.
 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2015, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -2449,9 +2451,7 @@ matrix. The Singular Value Decomposition is used to solve least-square
 problems, under-determined linear systems, invert matrices, compute
 condition numbers, and so on.
 
-For a faster operation, you can pass flags=SVD::MODIFY_A|... to modify
-the decomposed matrix when it is not necessary to preserve it. If you
-want to compute a condition number of a matrix or an absolute value of
+If you want to compute a condition number of a matrix or an absolute value of
 its determinant, you do not need `u` and `vt`. You can pass
 flags=SVD::NO_UV|... . Another flag SVD::FULL_UV indicates that full-size u
 and vt must be computed, which is not necessary most of the time.
@@ -2462,8 +2462,8 @@ class CV_EXPORTS SVD
 {
 public:
     enum Flags {
-        /** use the algorithm to modify the decomposed matrix; it can save space and speed up
-            processing */
+        /** allow the algorithm to modify the decomposed matrix; it can save space and speed up
+            processing. Currently ignored. */
         MODIFY_A = 1,
         /** indicates that only a vector of singular values `w` is to be processed, while u and vt
             will be set to empty matrices */
@@ -2921,6 +2921,10 @@ public:
     Algorithm();
     virtual ~Algorithm();
 
+    /** @brief Clears the algorithm state
+    */
+    CV_WRAP virtual void clear() {}
+
     /** @brief Stores algorithm parameters in a file storage
     */
     virtual void write(FileStorage& fs) const { (void)fs; }
@@ -2928,6 +2932,75 @@ public:
     /** @brief Reads algorithm parameters from a file storage
     */
     virtual void read(const FileNode& fn) { (void)fn; }
+
+    /** @brief Returns true if the Algorithm is empty (e.g. in the very beginning or after an unsuccessful read).
+     */
+    virtual bool empty() const { return false; }
+
+    /** @brief Reads algorithm from the file node
+
+     This is a static template method of Algorithm. Its usage is as follows (in the case of SVM):
+     @code
+     Ptr<SVM> svm = Algorithm::read<SVM>(fn);
+     @endcode
+     In order to make this method work, the derived class must override Algorithm::read(const
+     FileNode& fn) and also have a static create() method without parameters
+     (or with all parameters optional).
+     */
+    template<typename _Tp> static Ptr<_Tp> read(const FileNode& fn)
+    {
+        Ptr<_Tp> obj = _Tp::create();
+        obj->read(fn);
+        return !obj->empty() ? obj : Ptr<_Tp>();
+    }
+
+    /** @brief Loads algorithm from the file
+
+     @param filename Name of the file to read.
+     @param objname The optional name of the node to read (if empty, the first top-level node will be used)
+
+     This is a static template method of Algorithm. Its usage is as follows (in the case of SVM):
+     @code
+     Ptr<SVM> svm = Algorithm::load<SVM>("my_svm_model.xml");
+     @endcode
+     In order to make this method work, the derived class must override Algorithm::read(const
+     FileNode& fn) and provide a static create() method callable without arguments.
+     */
+    template<typename _Tp> static Ptr<_Tp> load(const String& filename, const String& objname=String())
+    {
+        FileStorage fs(filename, FileStorage::READ);
+        FileNode fn = objname.empty() ? fs.getFirstTopLevelNode() : fs[objname];
+        Ptr<_Tp> obj = _Tp::create();
+        obj->read(fn);
+        return !obj->empty() ? obj : Ptr<_Tp>();
+    }
+
+    /** @brief Loads algorithm from a String
+
+     @param strModel The string variable containing the model you want to load.
+     @param objname The optional name of the node to read (if empty, the first top-level node will be used)
+
+     This is a static template method of Algorithm. Its usage is as follows (in the case of SVM):
+     @code
+     Ptr<SVM> svm = Algorithm::loadFromString<SVM>(myStringModel);
+     @endcode
+     */
+    template<typename _Tp> static Ptr<_Tp> loadFromString(const String& strModel, const String& objname=String())
+    {
+        FileStorage fs(strModel, FileStorage::READ + FileStorage::MEMORY);
+        FileNode fn = objname.empty() ? fs.getFirstTopLevelNode() : fs[objname];
+        Ptr<_Tp> obj = _Tp::create();
+        obj->read(fn);
+        return !obj->empty() ? obj : Ptr<_Tp>();
+    }
+
+    /** Saves the algorithm to a file.
+     In order to make this method work, the derived class must implement Algorithm::write(FileStorage& fs). */
+    CV_WRAP virtual void save(const String& filename) const;
+
+    /** Returns the algorithm string identifier.
+     This string is used as top level xml/yml node tag when the object is saved to a file or string. */
+    CV_WRAP virtual String getDefaultName() const;
 };
 
 struct Param {
diff --git a/modules/core/include/opencv2/core/affine.hpp b/modules/core/include/opencv2/core/affine.hpp
index f8e84b97ad..3b527cd221 100644
--- a/modules/core/include/opencv2/core/affine.hpp
+++ b/modules/core/include/opencv2/core/affine.hpp
@@ -253,7 +253,7 @@ void cv::Affine3<T>::rotation(const Vec3& _rvec)
         double c = std::cos(theta);
         double s = std::sin(theta);
         double c1 = 1. - c;
-        double itheta = theta ? 1./theta : 0.;
+        double itheta = (theta != 0) ? 1./theta : 0.;
 
         rx *= itheta; ry *= itheta; rz *= itheta;
 
diff --git a/modules/core/include/opencv2/core/base.hpp b/modules/core/include/opencv2/core/base.hpp
index 73beb911fa..e4efe0fb9b 100644
--- a/modules/core/include/opencv2/core/base.hpp
+++ b/modules/core/include/opencv2/core/base.hpp
@@ -53,6 +53,7 @@
 
 #include "opencv2/core/cvdef.h"
 #include "opencv2/core/cvstd.hpp"
+#include "opencv2/hal.hpp"
 
 namespace cv
 {
@@ -400,140 +401,30 @@ configurations while CV_DbgAssert is only retained in the Debug configuration.
 #  define CV_DbgAssert(expr)
 #endif
 
-
-/////////////// saturate_cast (used in image & signal processing) ///////////////////
-
-/**
-Template function for accurate conversion from one primitive type to another.
-
-The functions saturate_cast resemble the standard C++ cast operations, such as static_cast\<T\>()
-and others. They perform an efficient and accurate conversion from one primitive type to another
-(see the introduction chapter). saturate in the name means that when the input value v is out of the
-range of the target type, the result is not formed just by taking low bits of the input, but instead
-the value is clipped. For example:
-@code
-    uchar a = saturate_cast<uchar>(-100); // a = 0 (UCHAR_MIN)
-    short b = saturate_cast<short>(33333.33333); // b = 32767 (SHRT_MAX)
-@endcode
-Such clipping is done when the target type is unsigned char , signed char , unsigned short or
-signed short . For 32-bit integers, no clipping is done.
-
-When the parameter is a floating-point value and the target type is an integer (8-, 16- or 32-bit),
-the floating-point value is first rounded to the nearest integer and then clipped if needed (when
-the target type is 8- or 16-bit).
-
-This operation is used in the simplest or most complex image processing functions in OpenCV.
-
-@param v Function parameter.
-@sa add, subtract, multiply, divide, Mat::convertTo
-*/
-template<typename _Tp> static inline _Tp saturate_cast(uchar v)    { return _Tp(v); }
-/** @overload */
-template<typename _Tp> static inline _Tp saturate_cast(schar v)    { return _Tp(v); }
-/** @overload */
-template<typename _Tp> static inline _Tp saturate_cast(ushort v)   { return _Tp(v); }
-/** @overload */
-template<typename _Tp> static inline _Tp saturate_cast(short v)    { return _Tp(v); }
-/** @overload */
-template<typename _Tp> static inline _Tp saturate_cast(unsigned v) { return _Tp(v); }
-/** @overload */
-template<typename _Tp> static inline _Tp saturate_cast(int v)      { return _Tp(v); }
-/** @overload */
-template<typename _Tp> static inline _Tp saturate_cast(float v)    { return _Tp(v); }
-/** @overload */
-template<typename _Tp> static inline _Tp saturate_cast(double v)   { return _Tp(v); }
-/** @overload */
-template<typename _Tp> static inline _Tp saturate_cast(int64 v)    { return _Tp(v); }
-/** @overload */
-template<typename _Tp> static inline _Tp saturate_cast(uint64 v)   { return _Tp(v); }
-
-//! @cond IGNORED
-
-template<> inline uchar saturate_cast<uchar>(schar v)        { return (uchar)std::max((int)v, 0); }
-template<> inline uchar saturate_cast<uchar>(ushort v)       { return (uchar)std::min((unsigned)v, (unsigned)UCHAR_MAX); }
-template<> inline uchar saturate_cast<uchar>(int v)          { return (uchar)((unsigned)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
-template<> inline uchar saturate_cast<uchar>(short v)        { return saturate_cast<uchar>((int)v); }
-template<> inline uchar saturate_cast<uchar>(unsigned v)     { return (uchar)std::min(v, (unsigned)UCHAR_MAX); }
-template<> inline uchar saturate_cast<uchar>(float v)        { int iv = cvRound(v); return saturate_cast<uchar>(iv); }
-template<> inline uchar saturate_cast<uchar>(double v)       { int iv = cvRound(v); return saturate_cast<uchar>(iv); }
-template<> inline uchar saturate_cast<uchar>(int64 v)        { return (uchar)((uint64)v <= (uint64)UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
-template<> inline uchar saturate_cast<uchar>(uint64 v)       { return (uchar)std::min(v, (uint64)UCHAR_MAX); }
-
-template<> inline schar saturate_cast<schar>(uchar v)        { return (schar)std::min((int)v, SCHAR_MAX); }
-template<> inline schar saturate_cast<schar>(ushort v)       { return (schar)std::min((unsigned)v, (unsigned)SCHAR_MAX); }
-template<> inline schar saturate_cast<schar>(int v)          { return (schar)((unsigned)(v-SCHAR_MIN) <= (unsigned)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
-template<> inline schar saturate_cast<schar>(short v)        { return saturate_cast<schar>((int)v); }
-template<> inline schar saturate_cast<schar>(unsigned v)     { return (schar)std::min(v, (unsigned)SCHAR_MAX); }
-template<> inline schar saturate_cast<schar>(float v)        { int iv = cvRound(v); return saturate_cast<schar>(iv); }
-template<> inline schar saturate_cast<schar>(double v)       { int iv = cvRound(v); return saturate_cast<schar>(iv); }
-template<> inline schar saturate_cast<schar>(int64 v)        { return (schar)((uint64)((int64)v-SCHAR_MIN) <= (uint64)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
-template<> inline schar saturate_cast<schar>(uint64 v)       { return (schar)std::min(v, (uint64)SCHAR_MAX); }
-
-template<> inline ushort saturate_cast<ushort>(schar v)      { return (ushort)std::max((int)v, 0); }
-template<> inline ushort saturate_cast<ushort>(short v)      { return (ushort)std::max((int)v, 0); }
-template<> inline ushort saturate_cast<ushort>(int v)        { return (ushort)((unsigned)v <= (unsigned)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
-template<> inline ushort saturate_cast<ushort>(unsigned v)   { return (ushort)std::min(v, (unsigned)USHRT_MAX); }
-template<> inline ushort saturate_cast<ushort>(float v)      { int iv = cvRound(v); return saturate_cast<ushort>(iv); }
-template<> inline ushort saturate_cast<ushort>(double v)     { int iv = cvRound(v); return saturate_cast<ushort>(iv); }
-template<> inline ushort saturate_cast<ushort>(int64 v)      { return (ushort)((uint64)v <= (uint64)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
-template<> inline ushort saturate_cast<ushort>(uint64 v)     { return (ushort)std::min(v, (uint64)USHRT_MAX); }
-
-template<> inline short saturate_cast<short>(ushort v)       { return (short)std::min((int)v, SHRT_MAX); }
-template<> inline short saturate_cast<short>(int v)          { return (short)((unsigned)(v - SHRT_MIN) <= (unsigned)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
-template<> inline short saturate_cast<short>(unsigned v)     { return (short)std::min(v, (unsigned)SHRT_MAX); }
-template<> inline short saturate_cast<short>(float v)        { int iv = cvRound(v); return saturate_cast<short>(iv); }
-template<> inline short saturate_cast<short>(double v)       { int iv = cvRound(v); return saturate_cast<short>(iv); }
-template<> inline short saturate_cast<short>(int64 v)        { return (short)((uint64)((int64)v - SHRT_MIN) <= (uint64)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
-template<> inline short saturate_cast<short>(uint64 v)       { return (short)std::min(v, (uint64)SHRT_MAX); }
-
-template<> inline int saturate_cast<int>(float v)            { return cvRound(v); }
-template<> inline int saturate_cast<int>(double v)           { return cvRound(v); }
-
-// we intentionally do not clip negative numbers, to make -1 become 0xffffffff etc.
-template<> inline unsigned saturate_cast<unsigned>(float v)  { return cvRound(v); }
-template<> inline unsigned saturate_cast<unsigned>(double v) { return cvRound(v); }
-
-//! @endcond
-
-//////////////////////////////// low-level functions ////////////////////////////////
-
-CV_EXPORTS int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n);
-CV_EXPORTS int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n);
-CV_EXPORTS bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n);
-CV_EXPORTS bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n);
-
-CV_EXPORTS int normL1_(const uchar* a, const uchar* b, int n);
-CV_EXPORTS int normHamming(const uchar* a, const uchar* b, int n);
-CV_EXPORTS int normHamming(const uchar* a, const uchar* b, int n, int cellSize);
-CV_EXPORTS float normL1_(const float* a, const float* b, int n);
-CV_EXPORTS float normL2Sqr_(const float* a, const float* b, int n);
-
-CV_EXPORTS void exp(const float* src, float* dst, int n);
-CV_EXPORTS void log(const float* src, float* dst, int n);
-
-CV_EXPORTS void fastAtan2(const float* y, const float* x, float* dst, int n, bool angleInDegrees);
-CV_EXPORTS void magnitude(const float* x, const float* y, float* dst, int n);
-
-/** @brief Computes the cube root of an argument.
-
-The function cubeRoot computes \f$\sqrt[3]{\texttt{val}}\f$. Negative arguments are handled correctly.
-NaN and Inf are not handled. The accuracy approaches the maximum possible accuracy for
-single-precision data.
-@param val A function argument.
+/*
+ * Hamming distance functor - counts the bit differences between two strings - useful for the Brief descriptor
+ * i.e. the bit count of A XOR'ed with B
  */
-CV_EXPORTS_W float cubeRoot(float val);
+struct CV_EXPORTS Hamming
+{
+    enum { normType = NORM_HAMMING };
+    typedef unsigned char ValueType;
+    typedef int ResultType;
 
-/** @brief Calculates the angle of a 2D vector in degrees.
+    /** this will count the bits in a ^ b
+     */
+    ResultType operator()( const unsigned char* a, const unsigned char* b, int size ) const;
+};
 
-The function fastAtan2 calculates the full-range angle of an input 2D vector. The angle is measured
-in degrees and varies from 0 to 360 degrees. The accuracy is about 0.3 degrees.
-@param x x-coordinate of the vector.
-@param y y-coordinate of the vector.
- */
-CV_EXPORTS_W float fastAtan2(float y, float x);
+typedef Hamming HammingLUT;
 
 /////////////////////////////////// inline norms ////////////////////////////////////
 
+template<typename _Tp> inline _Tp cv_abs(_Tp x) { return std::abs(x); }
+inline int cv_abs(uchar x) { return x; }
+inline int cv_abs(schar x) { return std::abs(x); }
+inline int cv_abs(ushort x) { return x; }
+inline int cv_abs(short x) { return std::abs(x); }
 
 template<typename _Tp, typename _AccTp> static inline
 _AccTp normL2Sqr(const _Tp* a, int n)
@@ -563,12 +454,12 @@ _AccTp normL1(const _Tp* a, int n)
 #if CV_ENABLE_UNROLLED
     for(; i <= n - 4; i += 4 )
     {
-        s += (_AccTp)std::abs(a[i]) + (_AccTp)std::abs(a[i+1]) +
-            (_AccTp)std::abs(a[i+2]) + (_AccTp)std::abs(a[i+3]);
+        s += (_AccTp)cv_abs(a[i]) + (_AccTp)cv_abs(a[i+1]) +
+            (_AccTp)cv_abs(a[i+2]) + (_AccTp)cv_abs(a[i+3]);
     }
 #endif
     for( ; i < n; i++ )
-        s += std::abs(a[i]);
+        s += cv_abs(a[i]);
     return s;
 }
 
@@ -577,7 +468,7 @@ _AccTp normInf(const _Tp* a, int n)
 {
     _AccTp s = 0;
     for( int i = 0; i < n; i++ )
-        s = std::max(s, (_AccTp)std::abs(a[i]));
+        s = std::max(s, (_AccTp)cv_abs(a[i]));
     return s;
 }
 
@@ -601,11 +492,10 @@ _AccTp normL2Sqr(const _Tp* a, const _Tp* b, int n)
     return s;
 }
 
-template<> inline
-float normL2Sqr(const float* a, const float* b, int n)
+inline float normL2Sqr(const float* a, const float* b, int n)
 {
     if( n >= 8 )
-        return normL2Sqr_(a, b, n);
+        return hal::normL2Sqr_(a, b, n);
     float s = 0;
     for( int i = 0; i < n; i++ )
     {
@@ -635,11 +525,10 @@ _AccTp normL1(const _Tp* a, const _Tp* b, int n)
     return s;
 }
 
-template<> inline
-float normL1(const float* a, const float* b, int n)
+inline float normL1(const float* a, const float* b, int n)
 {
     if( n >= 8 )
-        return normL1_(a, b, n);
+        return hal::normL1_(a, b, n);
     float s = 0;
     for( int i = 0; i < n; i++ )
     {
@@ -649,10 +538,9 @@ float normL1(const float* a, const float* b, int n)
     return s;
 }
 
-template<> inline
-int normL1(const uchar* a, const uchar* b, int n)
+inline int normL1(const uchar* a, const uchar* b, int n)
 {
-    return normL1_(a, b, n);
+    return hal::normL1_(a, b, n);
 }
 
 template<typename _Tp, typename _AccTp> static inline
@@ -667,6 +555,23 @@ _AccTp normInf(const _Tp* a, const _Tp* b, int n)
     return s;
 }
 
+/** @brief Computes the cube root of an argument.
+
+ The function cubeRoot computes \f$\sqrt[3]{\texttt{val}}\f$. Negative arguments are handled correctly.
+ NaN and Inf are not handled. The accuracy approaches the maximum possible accuracy for
+ single-precision data.
+ @param val A function argument.
+ */
+CV_EXPORTS_W float cubeRoot(float val);
+
+/** @brief Calculates the angle of a 2D vector in degrees.
+
+ The function fastAtan2 calculates the full-range angle of an input 2D vector. The angle is measured
+ in degrees and varies from 0 to 360 degrees. The accuracy is about 0.3 degrees.
+ @param x x-coordinate of the vector.
+ @param y y-coordinate of the vector.
+ */
+CV_EXPORTS_W float fastAtan2(float y, float x);
 
 ////////////////// forward declarations for important OpenCV types //////////////////
 
diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h
index 3498b0918b..1d933b5c30 100644
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@@ -56,23 +56,7 @@
 #undef abs
 #undef Complex
 
-#if defined __ICL
-#  define CV_ICC   __ICL
-#elif defined __ICC
-#  define CV_ICC   __ICC
-#elif defined __ECL
-#  define CV_ICC   __ECL
-#elif defined __ECC
-#  define CV_ICC   __ECC
-#elif defined __INTEL_COMPILER
-#  define CV_ICC   __INTEL_COMPILER
-#endif
-
-#if defined CV_ICC && !defined CV_ENABLE_UNROLLED
-#  define CV_ENABLE_UNROLLED 0
-#else
-#  define CV_ENABLE_UNROLLED 1
-#endif
+#include "opencv2/hal/defs.h"
 
 #ifdef __OPENCV_BUILD
 #  define DISABLE_OPENCV_24_COMPATIBILITY
@@ -86,16 +70,6 @@
 #  define CV_EXPORTS
 #endif
 
-#ifndef CV_INLINE
-#  if defined __cplusplus
-#    define CV_INLINE static inline
-#  elif defined _MSC_VER
-#    define CV_INLINE __inline
-#  else
-#    define CV_INLINE static
-#  endif
-#endif
-
 #ifndef CV_EXTERN_C
 #  ifdef __cplusplus
 #    define CV_EXTERN_C extern "C"
@@ -104,216 +78,6 @@
 #  endif
 #endif
 
-/* CPU features and intrinsics support */
-#define CV_CPU_NONE             0
-#define CV_CPU_MMX              1
-#define CV_CPU_SSE              2
-#define CV_CPU_SSE2             3
-#define CV_CPU_SSE3             4
-#define CV_CPU_SSSE3            5
-#define CV_CPU_SSE4_1           6
-#define CV_CPU_SSE4_2           7
-#define CV_CPU_POPCNT           8
-
-#define CV_CPU_AVX              10
-#define CV_CPU_AVX2             11
-#define CV_CPU_FMA3             12
-
-#define CV_CPU_AVX_512F         13
-#define CV_CPU_AVX_512BW        14
-#define CV_CPU_AVX_512CD        15
-#define CV_CPU_AVX_512DQ        16
-#define CV_CPU_AVX_512ER        17
-#define CV_CPU_AVX_512IFMA512   18
-#define CV_CPU_AVX_512PF        19
-#define CV_CPU_AVX_512VBMI      20
-#define CV_CPU_AVX_512VL        21
-
-#define CV_CPU_NEON   100
-
-// when adding to this list remember to update the enum in core/utility.cpp
-#define CV_HARDWARE_MAX_FEATURE 255
-
-// do not include SSE/AVX/NEON headers for NVCC compiler
-#ifndef __CUDACC__
-
-#if defined __SSE2__ || defined _M_X64  || (defined _M_IX86_FP && _M_IX86_FP >= 2)
-#  include <emmintrin.h>
-#  define CV_MMX 1
-#  define CV_SSE 1
-#  define CV_SSE2 1
-#  if defined __SSE3__ || (defined _MSC_VER && _MSC_VER >= 1500)
-#    include <pmmintrin.h>
-#    define CV_SSE3 1
-#  endif
-#  if defined __SSSE3__  || (defined _MSC_VER && _MSC_VER >= 1500)
-#    include <tmmintrin.h>
-#    define CV_SSSE3 1
-#  endif
-#  if defined __SSE4_1__ || (defined _MSC_VER && _MSC_VER >= 1500)
-#    include <smmintrin.h>
-#    define CV_SSE4_1 1
-#  endif
-#  if defined __SSE4_2__ || (defined _MSC_VER && _MSC_VER >= 1500)
-#    include <nmmintrin.h>
-#    define CV_SSE4_2 1
-#  endif
-#  if defined __POPCNT__ || (defined _MSC_VER && _MSC_VER >= 1500)
-#    ifdef _MSC_VER
-#      include <nmmintrin.h>
-#    else
-#      include <popcntintrin.h>
-#    endif
-#    define CV_POPCNT 1
-#  endif
-#  if defined __AVX__ || (defined _MSC_VER && _MSC_VER >= 1600 && 0)
-// MS Visual Studio 2010 (2012?) has no macro pre-defined to identify the use of /arch:AVX
-// See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32
-#    include <immintrin.h>
-#    define CV_AVX 1
-#    if defined(_XCR_XFEATURE_ENABLED_MASK)
-#      define __xgetbv() _xgetbv(_XCR_XFEATURE_ENABLED_MASK)
-#    else
-#      define __xgetbv() 0
-#    endif
-#  endif
-#  if defined __AVX2__ || (defined _MSC_VER && _MSC_VER >= 1800 && 0)
-#    include <immintrin.h>
-#    define CV_AVX2 1
-#    if defined __FMA__
-#      define CV_FMA3 1
-#    endif
-#  endif
-#endif
-
-#if (defined WIN32 || defined _WIN32) && defined(_M_ARM)
-# include <Intrin.h>
-# include "arm_neon.h"
-# define CV_NEON 1
-# define CPU_HAS_NEON_FEATURE (true)
-#elif defined(__ARM_NEON__) || (defined (__ARM_NEON) && defined(__aarch64__))
-#  include <arm_neon.h>
-#  define CV_NEON 1
-#endif
-
-#if defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__)
-#  define CV_VFP 1
-#endif
-
-#endif // __CUDACC__
-
-#ifndef CV_POPCNT
-#define CV_POPCNT 0
-#endif
-#ifndef CV_MMX
-#  define CV_MMX 0
-#endif
-#ifndef CV_SSE
-#  define CV_SSE 0
-#endif
-#ifndef CV_SSE2
-#  define CV_SSE2 0
-#endif
-#ifndef CV_SSE3
-#  define CV_SSE3 0
-#endif
-#ifndef CV_SSSE3
-#  define CV_SSSE3 0
-#endif
-#ifndef CV_SSE4_1
-#  define CV_SSE4_1 0
-#endif
-#ifndef CV_SSE4_2
-#  define CV_SSE4_2 0
-#endif
-#ifndef CV_AVX
-#  define CV_AVX 0
-#endif
-#ifndef CV_AVX2
-#  define CV_AVX2 0
-#endif
-#ifndef CV_FMA3
-#  define CV_FMA3 0
-#endif
-#ifndef CV_AVX_512F
-#  define CV_AVX_512F 0
-#endif
-#ifndef CV_AVX_512BW
-#  define CV_AVX_512BW 0
-#endif
-#ifndef CV_AVX_512CD
-#  define CV_AVX_512CD 0
-#endif
-#ifndef CV_AVX_512DQ
-#  define CV_AVX_512DQ 0
-#endif
-#ifndef CV_AVX_512ER
-#  define CV_AVX_512ER 0
-#endif
-#ifndef CV_AVX_512IFMA512
-#  define CV_AVX_512IFMA512 0
-#endif
-#ifndef CV_AVX_512PF
-#  define CV_AVX_512PF 0
-#endif
-#ifndef CV_AVX_512VBMI
-#  define CV_AVX_512VBMI 0
-#endif
-#ifndef CV_AVX_512VL
-#  define CV_AVX_512VL 0
-#endif
-
-#ifndef CV_NEON
-#  define CV_NEON 0
-#endif
-
-#ifndef CV_VFP
-#  define CV_VFP 0
-#endif
-
-/* primitive types */
-/*
-  schar  - signed 1 byte integer
-  uchar  - unsigned 1 byte integer
-  short  - signed 2 byte integer
-  ushort - unsigned 2 byte integer
-  int    - signed 4 byte integer
-  uint   - unsigned 4 byte integer
-  int64  - signed 8 byte integer
-  uint64 - unsigned 8 byte integer
-*/
-
-#if !defined _MSC_VER && !defined __BORLANDC__
-#  if defined __cplusplus && __cplusplus >= 201103L
-#    include <cstdint>
-     typedef std::uint32_t uint;
-#  else
-#    include <stdint.h>
-     typedef uint32_t uint;
-#  endif
-#else
-   typedef unsigned uint;
-#endif
-
-typedef signed char schar;
-
-#ifndef __IPL_H__
-   typedef unsigned char uchar;
-   typedef unsigned short ushort;
-#endif
-
-#if defined _MSC_VER || defined __BORLANDC__
-   typedef __int64 int64;
-   typedef unsigned __int64 uint64;
-#  define CV_BIG_INT(n)   n##I64
-#  define CV_BIG_UINT(n)  n##UI64
-#else
-   typedef int64_t int64;
-   typedef uint64_t uint64;
-#  define CV_BIG_INT(n)   n##LL
-#  define CV_BIG_UINT(n)  n##ULL
-#endif
-
 /* special informative macros for wrapper generators */
 #define CV_EXPORTS_W CV_EXPORTS
 #define CV_EXPORTS_W_SIMPLE CV_EXPORTS
@@ -326,11 +90,6 @@ typedef signed char schar;
 #define CV_WRAP
 #define CV_WRAP_AS(synonym)
 
-/* fundamental constants */
-#define CV_PI   3.1415926535897932384626433832795
-#define CV_2PI 6.283185307179586476925286766559
-#define CV_LOG2 0.69314718055994530941723212145818
-
 /****************************************************************************************\
 *                                  Matrix type (Mat)                                     *
 \****************************************************************************************/
@@ -417,19 +176,6 @@ typedef signed char schar;
 #define CV_ELEM_SIZE(type) \
     (CV_MAT_CN(type) << ((((sizeof(size_t)/4+1)*16384|0x3a50) >> CV_MAT_DEPTH(type)*2) & 3))
 
-
-/****************************************************************************************\
-*                                      fast math                                         *
-\****************************************************************************************/
-
-#if defined __BORLANDC__
-#  include <fastmath.h>
-#elif defined __cplusplus
-#  include <cmath>
-#else
-#  include <math.h>
-#endif
-
 #ifndef MIN
 #  define MIN(a,b)  ((a) > (b) ? (b) : (a))
 #endif
@@ -438,164 +184,6 @@ typedef signed char schar;
 #  define MAX(a,b)  ((a) < (b) ? (b) : (a))
 #endif
 
-#ifdef HAVE_TEGRA_OPTIMIZATION
-#  include "tegra_round.hpp"
-#endif
-
-//! @addtogroup core_utils
-//! @{
-
-#if CV_VFP
-// 1. general scheme
-#define ARM_ROUND(_value, _asm_string) \
-    int res; \
-    float temp; \
-    asm(_asm_string : [res] "=r" (res), [temp] "=w" (temp) : [value] "w" (_value)); \
-    return res;
-// 2. version for double
-#ifdef __clang__
-#define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %[value] \n vmov %[res], %[temp]")
-#else
-#define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %P[value] \n vmov %[res], %[temp]")
-#endif
-// 3. version for float
-#define ARM_ROUND_FLT(value) ARM_ROUND(value, "vcvtr.s32.f32 %[temp], %[value]\n vmov %[res], %[temp]")
-#endif // CV_VFP
-
-/** @brief Rounds floating-point number to the nearest integer
-
-@param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
-result is not defined.
- */
-CV_INLINE int cvRound( double value )
-{
-#if ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
-    __m128d t = _mm_set_sd( value );
-    return _mm_cvtsd_si32(t);
-#elif defined _MSC_VER && defined _M_IX86
-    int t;
-    __asm
-    {
-        fld value;
-        fistp t;
-    }
-    return t;
-#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
-    TEGRA_ROUND_DBL(value);
-#elif defined CV_ICC || defined __GNUC__
-# if CV_VFP
-    ARM_ROUND_DBL(value)
-# else
-    return (int)lrint(value);
-# endif
-#else
-    double intpart, fractpart;
-    fractpart = modf(value, &intpart);
-    if ((fabs(fractpart) != 0.5) || ((((int)intpart) % 2) != 0))
-        return (int)(value + (value >= 0 ? 0.5 : -0.5));
-    else
-        return (int)intpart;
-#endif
-}
-
-#ifdef __cplusplus
-
-/** @overload */
-CV_INLINE int cvRound(float value)
-{
-#if defined ANDROID && (defined CV_ICC || defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
-    TEGRA_ROUND_FLT(value);
-#elif CV_VFP && !defined HAVE_TEGRA_OPTIMIZATION
-    ARM_ROUND_FLT(value)
-#else
-    return cvRound((double)value);
-#endif
-}
-
-/** @overload */
-CV_INLINE int cvRound(int value)
-{
-    return value;
-}
-
-#endif // __cplusplus
-
-/** @brief Rounds floating-point number to the nearest integer not larger than the original.
-
-The function computes an integer i such that:
-\f[i \le \texttt{value} < i+1\f]
-@param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
-result is not defined.
- */
-CV_INLINE int cvFloor( double value )
-{
-#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
-    __m128d t = _mm_set_sd( value );
-    int i = _mm_cvtsd_si32(t);
-    return i - _mm_movemask_pd(_mm_cmplt_sd(t, _mm_cvtsi32_sd(t,i)));
-#elif defined __GNUC__
-    int i = (int)value;
-    return i - (i > value);
-#else
-    int i = cvRound(value);
-    float diff = (float)(value - i);
-    return i - (diff < 0);
-#endif
-}
-
-/** @brief Rounds floating-point number to the nearest integer not larger than the original.
-
-The function computes an integer i such that:
-\f[i \le \texttt{value} < i+1\f]
-@param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
-result is not defined.
-*/
-CV_INLINE int cvCeil( double value )
-{
-#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__&& !defined __APPLE__)) && !defined(__CUDACC__)
-    __m128d t = _mm_set_sd( value );
-    int i = _mm_cvtsd_si32(t);
-    return i + _mm_movemask_pd(_mm_cmplt_sd(_mm_cvtsi32_sd(t,i), t));
-#elif defined __GNUC__
-    int i = (int)value;
-    return i + (i < value);
-#else
-    int i = cvRound(value);
-    float diff = (float)(i - value);
-    return i + (diff < 0);
-#endif
-}
-
-/** @brief Determines if the argument is Not A Number.
-
-@param value The input floating-point value
-
-The function returns 1 if the argument is Not A Number (as defined by IEEE754 standard), 0
-otherwise. */
-CV_INLINE int cvIsNaN( double value )
-{
-    union { uint64 u; double f; } ieee754;
-    ieee754.f = value;
-    return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) +
-           ((unsigned)ieee754.u != 0) > 0x7ff00000;
-}
-
-/** @brief Determines if the argument is Infinity.
-
-@param value The input floating-point value
-
-The function returns 1 if the argument is a plus or minus infinity (as defined by IEEE754 standard)
-and 0 otherwise. */
-CV_INLINE int cvIsInf( double value )
-{
-    union { uint64 u; double f; } ieee754;
-    ieee754.f = value;
-    return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) == 0x7ff00000 &&
-           (unsigned)ieee754.u == 0;
-}
-
-//! @} core_utils
-
 /****************************************************************************************\
 *          exchange-add operation for atomic operations on reference counters            *
 \****************************************************************************************/
diff --git a/modules/core/include/opencv2/core/ippasync.hpp b/modules/core/include/opencv2/core/ippasync.hpp
index 2fce5d5b93..4de8611dbe 100644
--- a/modules/core/include/opencv2/core/ippasync.hpp
+++ b/modules/core/include/opencv2/core/ippasync.hpp
@@ -1,3 +1,47 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2015, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
 #ifndef __OPENCV_CORE_IPPASYNC_HPP__
 #define __OPENCV_CORE_IPPASYNC_HPP__
 
diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp
index 8b0d94f6e1..315d498c5a 100644
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -185,39 +185,43 @@ public:
     _InputArray(const UMat& um);
     _InputArray(const std::vector<UMat>& umv);
 
-    virtual Mat getMat(int idx=-1) const;
-    virtual UMat getUMat(int idx=-1) const;
-    virtual void getMatVector(std::vector<Mat>& mv) const;
-    virtual void getUMatVector(std::vector<UMat>& umv) const;
-    virtual cuda::GpuMat getGpuMat() const;
-    virtual ogl::Buffer getOGlBuffer() const;
+    Mat getMat(int idx=-1) const;
+    Mat getMat_(int idx=-1) const;
+    UMat getUMat(int idx=-1) const;
+    void getMatVector(std::vector<Mat>& mv) const;
+    void getUMatVector(std::vector<UMat>& umv) const;
+    cuda::GpuMat getGpuMat() const;
+    ogl::Buffer getOGlBuffer() const;
+
+    int getFlags() const;
     void* getObj() const;
-
-    virtual int kind() const;
-    virtual int dims(int i=-1) const;
-    virtual int cols(int i=-1) const;
-    virtual int rows(int i=-1) const;
-    virtual Size size(int i=-1) const;
-    virtual int sizend(int* sz, int i=-1) const;
-    virtual bool sameSize(const _InputArray& arr) const;
-    virtual size_t total(int i=-1) const;
-    virtual int type(int i=-1) const;
-    virtual int depth(int i=-1) const;
-    virtual int channels(int i=-1) const;
-    virtual bool isContinuous(int i=-1) const;
-    virtual bool isSubmatrix(int i=-1) const;
-    virtual bool empty() const;
-    virtual void copyTo(const _OutputArray& arr) const;
-    virtual void copyTo(const _OutputArray& arr, const _InputArray & mask) const;
-    virtual size_t offset(int i=-1) const;
-    virtual size_t step(int i=-1) const;
+    Size getSz() const;
+
+    int kind() const;
+    int dims(int i=-1) const;
+    int cols(int i=-1) const;
+    int rows(int i=-1) const;
+    Size size(int i=-1) const;
+    int sizend(int* sz, int i=-1) const;
+    bool sameSize(const _InputArray& arr) const;
+    size_t total(int i=-1) const;
+    int type(int i=-1) const;
+    int depth(int i=-1) const;
+    int channels(int i=-1) const;
+    bool isContinuous(int i=-1) const;
+    bool isSubmatrix(int i=-1) const;
+    bool empty() const;
+    void copyTo(const _OutputArray& arr) const;
+    void copyTo(const _OutputArray& arr, const _InputArray & mask) const;
+    size_t offset(int i=-1) const;
+    size_t step(int i=-1) const;
     bool isMat() const;
     bool isUMat() const;
     bool isMatVector() const;
     bool isUMatVector() const;
     bool isMatx() const;
 
-    virtual ~_InputArray();
+    ~_InputArray();
 
 protected:
     int flags;
@@ -303,21 +307,21 @@ public:
     _OutputArray(const UMat& m);
     _OutputArray(const std::vector<UMat>& vec);
 
-    virtual bool fixedSize() const;
-    virtual bool fixedType() const;
-    virtual bool needed() const;
-    virtual Mat& getMatRef(int i=-1) const;
-    virtual UMat& getUMatRef(int i=-1) const;
-    virtual cuda::GpuMat& getGpuMatRef() const;
-    virtual ogl::Buffer& getOGlBufferRef() const;
-    virtual cuda::HostMem& getHostMemRef() const;
-    virtual void create(Size sz, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
-    virtual void create(int rows, int cols, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
-    virtual void create(int dims, const int* size, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
-    virtual void createSameSize(const _InputArray& arr, int mtype) const;
-    virtual void release() const;
-    virtual void clear() const;
-    virtual void setTo(const _InputArray& value, const _InputArray & mask = _InputArray()) const;
+    bool fixedSize() const;
+    bool fixedType() const;
+    bool needed() const;
+    Mat& getMatRef(int i=-1) const;
+    UMat& getUMatRef(int i=-1) const;
+    cuda::GpuMat& getGpuMatRef() const;
+    ogl::Buffer& getOGlBufferRef() const;
+    cuda::HostMem& getHostMemRef() const;
+    void create(Size sz, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
+    void create(int rows, int cols, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
+    void create(int dims, const int* size, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
+    void createSameSize(const _InputArray& arr, int mtype) const;
+    void release() const;
+    void clear() const;
+    void setTo(const _InputArray& value, const _InputArray & mask = _InputArray()) const;
 
     void assign(const UMat& u) const;
     void assign(const Mat& m) const;
diff --git a/modules/core/include/opencv2/core/mat.inl.hpp b/modules/core/include/opencv2/core/mat.inl.hpp
index cb39c15fb4..535baa156d 100644
--- a/modules/core/include/opencv2/core/mat.inl.hpp
+++ b/modules/core/include/opencv2/core/mat.inl.hpp
@@ -7,11 +7,13 @@
 //  copy or use the software.
 //
 //
-//                           License Agreement
+//                          License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -61,6 +63,8 @@ inline void _InputArray::init(int _flags, const void* _obj, Size _sz)
 { flags = _flags; obj = (void*)_obj; sz = _sz; }
 
 inline void* _InputArray::getObj() const { return obj; }
+inline int _InputArray::getFlags() const { return flags; }  // read-only accessor: returns the stored `flags` member unchanged
+inline Size _InputArray::getSz() const { return sz; }  // read-only accessor: returns a copy of the stored `sz` member
 
 inline _InputArray::_InputArray() { init(NONE, 0); }
 inline _InputArray::_InputArray(int _flags, void* _obj) { init(_flags, _obj); }
@@ -110,6 +114,13 @@ inline _InputArray::_InputArray(const cuda::HostMem& cuda_mem)
 
 inline _InputArray::~_InputArray() {}
 
+inline Mat _InputArray::getMat(int i) const  // inline front end: handles the plain-Mat case here, defers everything else to getMat_()
+{
+    if( kind() == MAT && i < 0 )  // fast path: kind is MAT and a negative index was passed
+        return *(const Mat*)obj;  // obj is reinterpreted as a const Mat* and returned by value
+    return getMat_(i);  // every other kind/index combination goes through getMat_()
+}
+
 inline bool _InputArray::isMat() const { return kind() == _InputArray::MAT; }
 inline bool _InputArray::isUMat() const  { return kind() == _InputArray::UMAT; }
 inline bool _InputArray::isMatVector() const { return kind() == _InputArray::STD_VECTOR_MAT; }
diff --git a/modules/core/include/opencv2/core/matx.hpp b/modules/core/include/opencv2/core/matx.hpp
index 6cc5d06251..e9023243e8 100644
--- a/modules/core/include/opencv2/core/matx.hpp
+++ b/modules/core/include/opencv2/core/matx.hpp
@@ -427,7 +427,7 @@ template<typename _Tp, int m> struct Matx_DetOp
     double operator ()(const Matx<_Tp, m, m>& a) const
     {
         Matx<_Tp, m, m> temp = a;
-        double p = LU(temp.val, m*sizeof(_Tp), m, 0, 0, 0);
+        double p = hal::LU(temp.val, m*sizeof(_Tp), m, 0, 0, 0);
         if( p == 0 )
             return p;
         for( int i = 0; i < m; i++ )
diff --git a/modules/core/include/opencv2/core/operations.hpp b/modules/core/include/opencv2/core/operations.hpp
index 067140abb3..2c42e1f3a3 100644
--- a/modules/core/include/opencv2/core/operations.hpp
+++ b/modules/core/include/opencv2/core/operations.hpp
@@ -12,6 +12,8 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -70,9 +72,9 @@ template<typename _Tp, int m> struct Matx_FastInvOp
             b(i, i) = (_Tp)1;
 
         if( method == DECOMP_CHOLESKY )
-            return Cholesky(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m);
+            return hal::Cholesky(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m);
 
-        return LU(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m) != 0;
+        return hal::LU(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m) != 0;
     }
 };
 
diff --git a/modules/core/include/opencv2/core/private.hpp b/modules/core/include/opencv2/core/private.hpp
index 58d78e5848..4f9f487778 100644
--- a/modules/core/include/opencv2/core/private.hpp
+++ b/modules/core/include/opencv2/core/private.hpp
@@ -136,14 +136,6 @@ namespace cv
 /* the alignment of all the allocated buffers */
 #define  CV_MALLOC_ALIGN    16
 
-#ifdef __GNUC__
-#  define CV_DECL_ALIGNED(x) __attribute__ ((aligned (x)))
-#elif defined _MSC_VER
-#  define CV_DECL_ALIGNED(x) __declspec(align(x))
-#else
-#  define CV_DECL_ALIGNED(x)
-#endif
-
 /* IEEE754 constants and macros */
 #define  CV_TOGGLE_FLT(x) ((x)^((int)(x) < 0 ? 0x7fffffff : 0))
 #define  CV_TOGGLE_DBL(x) ((x)^((int64)(x) < 0 ? CV_BIG_INT(0x7fffffffffffffff) : 0))
diff --git a/modules/core/include/opencv2/core/types_c.h b/modules/core/include/opencv2/core/types_c.h
index 16e613053d..cb39587a9a 100644
--- a/modules/core/include/opencv2/core/types_c.h
+++ b/modules/core/include/opencv2/core/types_c.h
@@ -113,22 +113,6 @@ bytes of the header. In C++ interface the role of CvArr is played by InputArray
  */
 typedef void CvArr;
 
-typedef union Cv32suf
-{
-    int i;
-    unsigned u;
-    float f;
-}
-Cv32suf;
-
-typedef union Cv64suf
-{
-    int64 i;
-    uint64 u;
-    double f;
-}
-Cv64suf;
-
 typedef int CVStatus;
 
 /** @see cv::Error::Code */
diff --git a/modules/core/include/opencv2/core/version.hpp b/modules/core/include/opencv2/core/version.hpp
index f21293ad0b..4a46f7930e 100644
--- a/modules/core/include/opencv2/core/version.hpp
+++ b/modules/core/include/opencv2/core/version.hpp
@@ -10,8 +10,10 @@
 //                        Intel License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright( C) 2000, Intel Corporation, all rights reserved.
+// Copyright (C) 2000-2015, Intel Corporation, all rights reserved.
 // Copyright (C) 2011-2013, NVIDIA Corporation, all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
diff --git a/modules/core/src/algorithm.cpp b/modules/core/src/algorithm.cpp
index b10a28988a..b930428719 100644
--- a/modules/core/src/algorithm.cpp
+++ b/modules/core/src/algorithm.cpp
@@ -53,6 +53,20 @@ Algorithm::~Algorithm()
 {
 }
 
+void Algorithm::save(const String& filename) const  // writes this object to `filename` through a FileStorage opened in WRITE mode
+{
+    FileStorage fs(filename, FileStorage::WRITE);
+    fs << getDefaultName() << "{";  // streams the default name followed by "{" (presumably opens a mapping node; confirm against FileStorage docs)
+    fs << "format" << (int)3;  // streams the key "format" with the constant value 3
+    write(fs);  // lets write() emit the object's own content between the "{" and "}" markers
+    fs << "}";  // streams the matching "}" marker
+}
+
+String Algorithm::getDefaultName() const  // supplies the name that save() streams before the opening "{"
+{
+    return String("my_object");  // fixed placeholder name returned by this implementation
+}
+
 }
 
 /* End of file. */
diff --git a/modules/core/src/datastructs.cpp b/modules/core/src/datastructs.cpp
index c0067f8fc4..519d00ee53 100644
--- a/modules/core/src/datastructs.cpp
+++ b/modules/core/src/datastructs.cpp
@@ -651,7 +651,7 @@ icvGrowSeq( CvSeq *seq, int in_front_of )
         /* If there is a free space just after last allocated block
            and it is big enough then enlarge the last block.
            This can happen only if the new block is added to the end of sequence: */
-        if( (unsigned)(ICV_FREE_PTR(storage) - seq->block_max) < CV_STRUCT_ALIGN &&
+        if( (size_t)(ICV_FREE_PTR(storage) - seq->block_max) < CV_STRUCT_ALIGN &&
             storage->free_space >= seq->elem_size && !in_front_of )
         {
             int delta = storage->free_space / elem_size;
diff --git a/modules/core/src/kmeans.cpp b/modules/core/src/kmeans.cpp
index cc86d2972d..fe5a0cf6e9 100644
--- a/modules/core/src/kmeans.cpp
+++ b/modules/core/src/kmeans.cpp
@@ -79,7 +79,7 @@ public:
 
         for ( int i = begin; i<end; i++ )
         {
-            tdist2[i] = std::min(normL2Sqr_(data + step*i, data + stepci, dims), dist[i]);
+            tdist2[i] = std::min(normL2Sqr(data + step*i, data + stepci, dims), dist[i]);
         }
     }
 
@@ -114,7 +114,7 @@ static void generateCentersPP(const Mat& _data, Mat& _out_centers,
 
     for( i = 0; i < N; i++ )
     {
-        dist[i] = normL2Sqr_(data + step*i, data + step*centers[0], dims);
+        dist[i] = normL2Sqr(data + step*i, data + step*centers[0], dims);
         sum0 += dist[i];
     }
 
@@ -189,7 +189,7 @@ public:
             for( int k = 0; k < K; k++ )
             {
                 const float* center = centers.ptr<float>(k);
-                const double dist = normL2Sqr_(sample, center, dims);
+                const double dist = normL2Sqr(sample, center, dims);
 
                 if( min_dist > dist )
                 {
@@ -384,7 +384,7 @@ double cv::kmeans( InputArray _data, int K,
                         if( labels[i] != max_k )
                             continue;
                         sample = data.ptr<float>(i);
-                        double dist = normL2Sqr_(sample, _old_center, dims);
+                        double dist = normL2Sqr(sample, _old_center, dims);
 
                         if( max_dist <= dist )
                         {
diff --git a/modules/core/src/lapack.cpp b/modules/core/src/lapack.cpp
index a766e5f2ed..dea25dd64c 100644
--- a/modules/core/src/lapack.cpp
+++ b/modules/core/src/lapack.cpp
@@ -50,168 +50,6 @@
 namespace cv
 {
 
-/****************************************************************************************\
-*                     LU & Cholesky implementation for small matrices                    *
-\****************************************************************************************/
-
-template<typename _Tp> static inline int
-LUImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n)
-{
-    int i, j, k, p = 1;
-    astep /= sizeof(A[0]);
-    bstep /= sizeof(b[0]);
-
-    for( i = 0; i < m; i++ )
-    {
-        k = i;
-
-        for( j = i+1; j < m; j++ )
-            if( std::abs(A[j*astep + i]) > std::abs(A[k*astep + i]) )
-                k = j;
-
-        if( std::abs(A[k*astep + i]) < std::numeric_limits<_Tp>::epsilon() )
-            return 0;
-
-        if( k != i )
-        {
-            for( j = i; j < m; j++ )
-                std::swap(A[i*astep + j], A[k*astep + j]);
-            if( b )
-                for( j = 0; j < n; j++ )
-                    std::swap(b[i*bstep + j], b[k*bstep + j]);
-            p = -p;
-        }
-
-        _Tp d = -1/A[i*astep + i];
-
-        for( j = i+1; j < m; j++ )
-        {
-            _Tp alpha = A[j*astep + i]*d;
-
-            for( k = i+1; k < m; k++ )
-                A[j*astep + k] += alpha*A[i*astep + k];
-
-            if( b )
-                for( k = 0; k < n; k++ )
-                    b[j*bstep + k] += alpha*b[i*bstep + k];
-        }
-
-        A[i*astep + i] = -d;
-    }
-
-    if( b )
-    {
-        for( i = m-1; i >= 0; i-- )
-            for( j = 0; j < n; j++ )
-            {
-                _Tp s = b[i*bstep + j];
-                for( k = i+1; k < m; k++ )
-                    s -= A[i*astep + k]*b[k*bstep + j];
-                b[i*bstep + j] = s*A[i*astep + i];
-            }
-    }
-
-    return p;
-}
-
-
-int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n)
-{
-    return LUImpl(A, astep, m, b, bstep, n);
-}
-
-
-int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n)
-{
-    return LUImpl(A, astep, m, b, bstep, n);
-}
-
-
-template<typename _Tp> static inline bool
-CholImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n)
-{
-    _Tp* L = A;
-    int i, j, k;
-    double s;
-    astep /= sizeof(A[0]);
-    bstep /= sizeof(b[0]);
-
-    for( i = 0; i < m; i++ )
-    {
-        for( j = 0; j < i; j++ )
-        {
-            s = A[i*astep + j];
-            for( k = 0; k < j; k++ )
-                s -= L[i*astep + k]*L[j*astep + k];
-            L[i*astep + j] = (_Tp)(s*L[j*astep + j]);
-        }
-        s = A[i*astep + i];
-        for( k = 0; k < j; k++ )
-        {
-            double t = L[i*astep + k];
-            s -= t*t;
-        }
-        if( s < std::numeric_limits<_Tp>::epsilon() )
-            return false;
-        L[i*astep + i] = (_Tp)(1./std::sqrt(s));
-    }
-
-    if( !b )
-        return true;
-
-    // LLt x = b
-    // 1: L y = b
-    // 2. Lt x = y
-
-    /*
-     [ L00             ]  y0   b0
-     [ L10 L11         ]  y1 = b1
-     [ L20 L21 L22     ]  y2   b2
-     [ L30 L31 L32 L33 ]  y3   b3
-
-     [ L00 L10 L20 L30 ]  x0   y0
-     [     L11 L21 L31 ]  x1 = y1
-     [         L22 L32 ]  x2   y2
-     [             L33 ]  x3   y3
-    */
-
-    for( i = 0; i < m; i++ )
-    {
-        for( j = 0; j < n; j++ )
-        {
-            s = b[i*bstep + j];
-            for( k = 0; k < i; k++ )
-                s -= L[i*astep + k]*b[k*bstep + j];
-            b[i*bstep + j] = (_Tp)(s*L[i*astep + i]);
-        }
-    }
-
-    for( i = m-1; i >= 0; i-- )
-    {
-        for( j = 0; j < n; j++ )
-        {
-            s = b[i*bstep + j];
-            for( k = m-1; k > i; k-- )
-                s -= L[k*astep + i]*b[k*bstep + j];
-            b[i*bstep + j] = (_Tp)(s*L[i*astep + i]);
-        }
-    }
-
-    return true;
-}
-
-
-bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n)
-{
-    return CholImpl(A, astep, m, b, bstep, n);
-}
-
-bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n)
-{
-    return CholImpl(A, astep, m, b, bstep, n);
-}
-
-
 template<typename _Tp> static inline _Tp hypot(_Tp a, _Tp b)
 {
     a = std::abs(a);
@@ -882,7 +720,7 @@ double cv::determinant( InputArray _mat )
             Mat a(rows, rows, CV_32F, (uchar*)buffer);
             mat.copyTo(a);
 
-            result = LU(a.ptr<float>(), a.step, rows, 0, 0, 0);
+            result = hal::LU(a.ptr<float>(), a.step, rows, 0, 0, 0);
             if( result )
             {
                 for( int i = 0; i < rows; i++ )
@@ -906,7 +744,7 @@ double cv::determinant( InputArray _mat )
             Mat a(rows, rows, CV_64F, (uchar*)buffer);
             mat.copyTo(a);
 
-            result = LU(a.ptr<double>(), a.step, rows, 0, 0, 0);
+            result = hal::LU(a.ptr<double>(), a.step, rows, 0, 0, 0);
             if( result )
             {
                 for( int i = 0; i < rows; i++ )
@@ -1169,13 +1007,13 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
     setIdentity(dst);
 
     if( method == DECOMP_LU && type == CV_32F )
-        result = LU(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n) != 0;
+        result = hal::LU(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n) != 0;
     else if( method == DECOMP_LU && type == CV_64F )
-        result = LU(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n) != 0;
+        result = hal::LU(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n) != 0;
     else if( method == DECOMP_CHOLESKY && type == CV_32F )
-        result = Cholesky(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n);
+        result = hal::Cholesky(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n);
     else
-        result = Cholesky(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n);
+        result = hal::Cholesky(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n);
 
     if( !result )
         dst = Scalar(0);
@@ -1407,16 +1245,16 @@ bool cv::solve( InputArray _src, InputArray _src2arg, OutputArray _dst, int meth
     if( method == DECOMP_LU )
     {
         if( type == CV_32F )
-            result = LU(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb) != 0;
+            result = hal::LU(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb) != 0;
         else
-            result = LU(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb) != 0;
+            result = hal::LU(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb) != 0;
     }
     else if( method == DECOMP_CHOLESKY )
     {
         if( type == CV_32F )
-            result = Cholesky(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb);
+            result = hal::Cholesky(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb);
         else
-            result = Cholesky(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb);
+            result = hal::Cholesky(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb);
     }
     else
     {
diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp
index 446b62731b..e96eaeb41a 100644
--- a/modules/core/src/mathfuncs.cpp
+++ b/modules/core/src/mathfuncs.cpp
@@ -121,107 +121,6 @@ float fastAtan2( float y, float x )
     return a;
 }
 
-static void FastAtan2_32f(const float *Y, const float *X, float *angle, int len, bool angleInDegrees=true )
-{
-    int i = 0;
-    float scale = angleInDegrees ? 1 : (float)(CV_PI/180);
-
-#ifdef HAVE_TEGRA_OPTIMIZATION
-    if (tegra::useTegra() && tegra::FastAtan2_32f(Y, X, angle, len, scale))
-        return;
-#endif
-
-#if CV_SSE2
-    if( USE_SSE2 )
-    {
-        Cv32suf iabsmask; iabsmask.i = 0x7fffffff;
-        __m128 eps = _mm_set1_ps((float)DBL_EPSILON), absmask = _mm_set1_ps(iabsmask.f);
-        __m128 _90 = _mm_set1_ps(90.f), _180 = _mm_set1_ps(180.f), _360 = _mm_set1_ps(360.f);
-        __m128 z = _mm_setzero_ps(), scale4 = _mm_set1_ps(scale);
-        __m128 p1 = _mm_set1_ps(atan2_p1), p3 = _mm_set1_ps(atan2_p3);
-        __m128 p5 = _mm_set1_ps(atan2_p5), p7 = _mm_set1_ps(atan2_p7);
-
-        for( ; i <= len - 4; i += 4 )
-        {
-            __m128 x = _mm_loadu_ps(X + i), y = _mm_loadu_ps(Y + i);
-            __m128 ax = _mm_and_ps(x, absmask), ay = _mm_and_ps(y, absmask);
-            __m128 mask = _mm_cmplt_ps(ax, ay);
-            __m128 tmin = _mm_min_ps(ax, ay), tmax = _mm_max_ps(ax, ay);
-            __m128 c = _mm_div_ps(tmin, _mm_add_ps(tmax, eps));
-            __m128 c2 = _mm_mul_ps(c, c);
-            __m128 a = _mm_mul_ps(c2, p7);
-            a = _mm_mul_ps(_mm_add_ps(a, p5), c2);
-            a = _mm_mul_ps(_mm_add_ps(a, p3), c2);
-            a = _mm_mul_ps(_mm_add_ps(a, p1), c);
-
-            __m128 b = _mm_sub_ps(_90, a);
-            a = _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(a, b), mask));
-
-            b = _mm_sub_ps(_180, a);
-            mask = _mm_cmplt_ps(x, z);
-            a = _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(a, b), mask));
-
-            b = _mm_sub_ps(_360, a);
-            mask = _mm_cmplt_ps(y, z);
-            a = _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(a, b), mask));
-
-            a = _mm_mul_ps(a, scale4);
-            _mm_storeu_ps(angle + i, a);
-        }
-    }
-#elif CV_NEON
-    float32x4_t eps = vdupq_n_f32((float)DBL_EPSILON);
-    float32x4_t _90 = vdupq_n_f32(90.f), _180 = vdupq_n_f32(180.f), _360 = vdupq_n_f32(360.f);
-    float32x4_t z = vdupq_n_f32(0.0f), scale4 = vdupq_n_f32(scale);
-    float32x4_t p1 = vdupq_n_f32(atan2_p1), p3 = vdupq_n_f32(atan2_p3);
-    float32x4_t p5 = vdupq_n_f32(atan2_p5), p7 = vdupq_n_f32(atan2_p7);
-
-    for( ; i <= len - 4; i += 4 )
-    {
-        float32x4_t x = vld1q_f32(X + i), y = vld1q_f32(Y + i);
-        float32x4_t ax = vabsq_f32(x), ay = vabsq_f32(y);
-        float32x4_t tmin = vminq_f32(ax, ay), tmax = vmaxq_f32(ax, ay);
-        float32x4_t c = vmulq_f32(tmin, cv_vrecpq_f32(vaddq_f32(tmax, eps)));
-        float32x4_t c2 = vmulq_f32(c, c);
-        float32x4_t a = vmulq_f32(c2, p7);
-        a = vmulq_f32(vaddq_f32(a, p5), c2);
-        a = vmulq_f32(vaddq_f32(a, p3), c2);
-        a = vmulq_f32(vaddq_f32(a, p1), c);
-
-        a = vbslq_f32(vcgeq_f32(ax, ay), a, vsubq_f32(_90, a));
-        a = vbslq_f32(vcltq_f32(x, z), vsubq_f32(_180, a), a);
-        a = vbslq_f32(vcltq_f32(y, z), vsubq_f32(_360, a), a);
-
-        vst1q_f32(angle + i, vmulq_f32(a, scale4));
-    }
-#endif
-
-    for( ; i < len; i++ )
-    {
-        float x = X[i], y = Y[i];
-        float ax = std::abs(x), ay = std::abs(y);
-        float a, c, c2;
-        if( ax >= ay )
-        {
-            c = ay/(ax + (float)DBL_EPSILON);
-            c2 = c*c;
-            a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;
-        }
-        else
-        {
-            c = ax/(ay + (float)DBL_EPSILON);
-            c2 = c*c;
-            a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;
-        }
-        if( x < 0 )
-            a = 180.f - a;
-        if( y < 0 )
-            a = 360.f - a;
-        angle[i] = (float)(a*scale);
-    }
-}
-
-
 /* ************************************************************************** *\
    Fast cube root by Ken Turkowski
    (http://www.worldserver.com/turk/computergraphics/papers.html)
@@ -263,255 +162,6 @@ float  cubeRoot( float value )
     return v.f;
 }
 
-static void Magnitude_32f(const float* x, const float* y, float* mag, int len)
-{
-#if defined HAVE_IPP && 0
-    CV_IPP_CHECK()
-    {
-        IppStatus status = ippsMagnitude_32f(x, y, mag, len);
-        if (status >= 0)
-        {
-            CV_IMPL_ADD(CV_IMPL_IPP);
-            return;
-        }
-        setIppErrorStatus();
-    }
-#endif
-
-    int i = 0;
-
-#if CV_SSE
-    if( USE_SSE2 )
-    {
-        for( ; i <= len - 8; i += 8 )
-        {
-            __m128 x0 = _mm_loadu_ps(x + i), x1 = _mm_loadu_ps(x + i + 4);
-            __m128 y0 = _mm_loadu_ps(y + i), y1 = _mm_loadu_ps(y + i + 4);
-            x0 = _mm_add_ps(_mm_mul_ps(x0, x0), _mm_mul_ps(y0, y0));
-            x1 = _mm_add_ps(_mm_mul_ps(x1, x1), _mm_mul_ps(y1, y1));
-            x0 = _mm_sqrt_ps(x0); x1 = _mm_sqrt_ps(x1);
-            _mm_storeu_ps(mag + i, x0); _mm_storeu_ps(mag + i + 4, x1);
-        }
-    }
-#elif CV_NEON
-    for( ; i <= len - 4; i += 4 )
-    {
-        float32x4_t v_x = vld1q_f32(x + i), v_y = vld1q_f32(y + i);
-        vst1q_f32(mag + i, cv_vsqrtq_f32(vmlaq_f32(vmulq_f32(v_x, v_x), v_y, v_y)));
-    }
-    for( ; i <= len - 2; i += 2 )
-    {
-        float32x2_t v_x = vld1_f32(x + i), v_y = vld1_f32(y + i);
-        vst1_f32(mag + i, cv_vsqrt_f32(vmla_f32(vmul_f32(v_x, v_x), v_y, v_y)));
-    }
-#endif
-
-    for( ; i < len; i++ )
-    {
-        float x0 = x[i], y0 = y[i];
-        mag[i] = std::sqrt(x0*x0 + y0*y0);
-    }
-}
-
-static void Magnitude_64f(const double* x, const double* y, double* mag, int len)
-{
-#if defined(HAVE_IPP)
-    CV_IPP_CHECK()
-    {
-        IppStatus status = ippsMagnitude_64f(x, y, mag, len);
-        if (status >= 0)
-        {
-            CV_IMPL_ADD(CV_IMPL_IPP);
-            return;
-        }
-        setIppErrorStatus();
-    }
-#endif
-
-    int i = 0;
-
-#if CV_SSE2
-    if( USE_SSE2 )
-    {
-        for( ; i <= len - 4; i += 4 )
-        {
-            __m128d x0 = _mm_loadu_pd(x + i), x1 = _mm_loadu_pd(x + i + 2);
-            __m128d y0 = _mm_loadu_pd(y + i), y1 = _mm_loadu_pd(y + i + 2);
-            x0 = _mm_add_pd(_mm_mul_pd(x0, x0), _mm_mul_pd(y0, y0));
-            x1 = _mm_add_pd(_mm_mul_pd(x1, x1), _mm_mul_pd(y1, y1));
-            x0 = _mm_sqrt_pd(x0); x1 = _mm_sqrt_pd(x1);
-            _mm_storeu_pd(mag + i, x0); _mm_storeu_pd(mag + i + 2, x1);
-        }
-    }
-#endif
-
-    for( ; i < len; i++ )
-    {
-        double x0 = x[i], y0 = y[i];
-        mag[i] = std::sqrt(x0*x0 + y0*y0);
-    }
-}
-
-
-static void InvSqrt_32f(const float* src, float* dst, int len)
-{
-#if defined(HAVE_IPP)
-    CV_IPP_CHECK()
-    {
-        if (ippsInvSqrt_32f_A21(src, dst, len) >= 0)
-        {
-            CV_IMPL_ADD(CV_IMPL_IPP);
-            return;
-        }
-        setIppErrorStatus();
-    }
-#endif
-
-    int i = 0;
-
-#if CV_SSE
-    if( USE_SSE2 )
-    {
-        __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f);
-        if( (((size_t)src|(size_t)dst) & 15) == 0 )
-            for( ; i <= len - 8; i += 8 )
-            {
-                __m128 t0 = _mm_load_ps(src + i), t1 = _mm_load_ps(src + i + 4);
-                __m128 h0 = _mm_mul_ps(t0, _0_5), h1 = _mm_mul_ps(t1, _0_5);
-                t0 = _mm_rsqrt_ps(t0); t1 = _mm_rsqrt_ps(t1);
-                t0 = _mm_mul_ps(t0, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t0,t0),h0)));
-                t1 = _mm_mul_ps(t1, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t1,t1),h1)));
-                _mm_store_ps(dst + i, t0); _mm_store_ps(dst + i + 4, t1);
-            }
-        else
-            for( ; i <= len - 8; i += 8 )
-            {
-                __m128 t0 = _mm_loadu_ps(src + i), t1 = _mm_loadu_ps(src + i + 4);
-                __m128 h0 = _mm_mul_ps(t0, _0_5), h1 = _mm_mul_ps(t1, _0_5);
-                t0 = _mm_rsqrt_ps(t0); t1 = _mm_rsqrt_ps(t1);
-                t0 = _mm_mul_ps(t0, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t0,t0),h0)));
-                t1 = _mm_mul_ps(t1, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t1,t1),h1)));
-                _mm_storeu_ps(dst + i, t0); _mm_storeu_ps(dst + i + 4, t1);
-            }
-    }
-#elif CV_NEON
-    for ( ; i <= len - 8; i += 8)
-    {
-        vst1q_f32(dst + i, cv_vrsqrtq_f32(vld1q_f32(src + i)));
-        vst1q_f32(dst + i + 4, cv_vrsqrtq_f32(vld1q_f32(src + i + 4)));
-    }
-#endif
-
-    for( ; i < len; i++ )
-        dst[i] = 1/std::sqrt(src[i]);
-}
-
-
-static void InvSqrt_64f(const double* src, double* dst, int len)
-{
-    int i = 0;
-
-#if CV_SSE2
-    if (USE_SSE2)
-    {
-        __m128d v_1 = _mm_set1_pd(1.0);
-        for ( ; i <= len - 2; i += 2)
-            _mm_storeu_pd(dst + i, _mm_div_pd(v_1, _mm_sqrt_pd(_mm_loadu_pd(src + i))));
-    }
-#endif
-
-    for( ; i < len; i++ )
-        dst[i] = 1/std::sqrt(src[i]);
-}
-
-
-static void Sqrt_32f(const float* src, float* dst, int len)
-{
-#if defined(HAVE_IPP)
-    CV_IPP_CHECK()
-    {
-        if (ippsSqrt_32f_A21(src, dst, len) >= 0)
-        {
-            CV_IMPL_ADD(CV_IMPL_IPP);
-            return;
-        }
-        setIppErrorStatus();
-    }
-#endif
-    int i = 0;
-
-#if CV_SSE
-    if( USE_SSE2 )
-    {
-        if( (((size_t)src|(size_t)dst) & 15) == 0 )
-            for( ; i <= len - 8; i += 8 )
-            {
-                __m128 t0 = _mm_load_ps(src + i), t1 = _mm_load_ps(src + i + 4);
-                t0 = _mm_sqrt_ps(t0); t1 = _mm_sqrt_ps(t1);
-                _mm_store_ps(dst + i, t0); _mm_store_ps(dst + i + 4, t1);
-            }
-        else
-            for( ; i <= len - 8; i += 8 )
-            {
-                __m128 t0 = _mm_loadu_ps(src + i), t1 = _mm_loadu_ps(src + i + 4);
-                t0 = _mm_sqrt_ps(t0); t1 = _mm_sqrt_ps(t1);
-                _mm_storeu_ps(dst + i, t0); _mm_storeu_ps(dst + i + 4, t1);
-            }
-    }
-#elif CV_NEON
-    for ( ; i <= len - 8; i += 8)
-    {
-        vst1q_f32(dst + i, cv_vsqrtq_f32(vld1q_f32(src + i)));
-        vst1q_f32(dst + i + 4, cv_vsqrtq_f32(vld1q_f32(src + i + 4)));
-    }
-#endif
-
-    for( ; i < len; i++ )
-        dst[i] = std::sqrt(src[i]);
-}
-
-
-static void Sqrt_64f(const double* src, double* dst, int len)
-{
-#if defined(HAVE_IPP)
-    CV_IPP_CHECK()
-    {
-        if (ippsSqrt_64f_A50(src, dst, len) >= 0)
-        {
-            CV_IMPL_ADD(CV_IMPL_IPP);
-            return;
-        }
-        setIppErrorStatus();
-    }
-#endif
-
-    int i = 0;
-
-#if CV_SSE2
-    if( USE_SSE2 )
-    {
-        if( (((size_t)src|(size_t)dst) & 15) == 0 )
-            for( ; i <= len - 4; i += 4 )
-            {
-                __m128d t0 = _mm_load_pd(src + i), t1 = _mm_load_pd(src + i + 2);
-                t0 = _mm_sqrt_pd(t0); t1 = _mm_sqrt_pd(t1);
-                _mm_store_pd(dst + i, t0); _mm_store_pd(dst + i + 2, t1);
-            }
-        else
-            for( ; i <= len - 4; i += 4 )
-            {
-                __m128d t0 = _mm_loadu_pd(src + i), t1 = _mm_loadu_pd(src + i + 2);
-                t0 = _mm_sqrt_pd(t0); t1 = _mm_sqrt_pd(t1);
-                _mm_storeu_pd(dst + i, t0); _mm_storeu_pd(dst + i + 2, t1);
-            }
-    }
-#endif
-
-    for( ; i < len; i++ )
-        dst[i] = std::sqrt(src[i]);
-}
-
-
 /****************************************************************************************\
 *                                  Cartezian -> Polar                                    *
 \****************************************************************************************/
@@ -539,13 +189,13 @@ void magnitude( InputArray src1, InputArray src2, OutputArray dst )
         {
             const float *x = (const float*)ptrs[0], *y = (const float*)ptrs[1];
             float *mag = (float*)ptrs[2];
-            Magnitude_32f( x, y, mag, len );
+            hal::magnitude( x, y, mag, len );
         }
         else
         {
             const double *x = (const double*)ptrs[0], *y = (const double*)ptrs[1];
             double *mag = (double*)ptrs[2];
-            Magnitude_64f( x, y, mag, len );
+            hal::magnitude( x, y, mag, len );
         }
     }
 }
@@ -588,7 +238,7 @@ void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre
             {
                 const float *x = (const float*)ptrs[0], *y = (const float*)ptrs[1];
                 float *angle = (float*)ptrs[2];
-                FastAtan2_32f( y, x, angle, len, angleInDegrees );
+                hal::fastAtan2( y, x, angle, len, angleInDegrees );
             }
             else
             {
@@ -618,7 +268,7 @@ void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre
                     buf[1][k] = (float)y[k];
                 }
 
-                FastAtan2_32f( buf[1], buf[0], buf[0], len, angleInDegrees );
+                hal::fastAtan2( buf[1], buf[0], buf[0], len, angleInDegrees );
                 k = 0;
 
 #if CV_SSE2
@@ -722,15 +372,15 @@ void cartToPolar( InputArray src1, InputArray src2,
             {
                 const float *x = (const float*)ptrs[0], *y = (const float*)ptrs[1];
                 float *mag = (float*)ptrs[2], *angle = (float*)ptrs[3];
-                Magnitude_32f( x, y, mag, len );
-                FastAtan2_32f( y, x, angle, len, angleInDegrees );
+                hal::magnitude( x, y, mag, len );
+                hal::fastAtan2( y, x, angle, len, angleInDegrees );
             }
             else
             {
                 const double *x = (const double*)ptrs[0], *y = (const double*)ptrs[1];
                 double *angle = (double*)ptrs[3];
 
-                Magnitude_64f(x, y, (double*)ptrs[2], len);
+                hal::magnitude(x, y, (double*)ptrs[2], len);
                 k = 0;
 
 #if CV_SSE2
@@ -755,7 +405,7 @@ void cartToPolar( InputArray src1, InputArray src2,
                     buf[1][k] = (float)y[k];
                 }
 
-                FastAtan2_32f( buf[1], buf[0], buf[0], len, angleInDegrees );
+                hal::fastAtan2( buf[1], buf[0], buf[0], len, angleInDegrees );
                 k = 0;
 
 #if CV_SSE2
@@ -1096,482 +746,6 @@ void polarToCart( InputArray src1, InputArray src2,
 *                                          E X P                                         *
 \****************************************************************************************/
 
-typedef union
-{
-    struct {
-#if ( defined( WORDS_BIGENDIAN ) && !defined( OPENCV_UNIVERSAL_BUILD ) ) || defined( __BIG_ENDIAN__ )
-        int hi;
-        int lo;
-#else
-        int lo;
-        int hi;
-#endif
-    } i;
-    double d;
-}
-DBLINT;
-
-#define EXPTAB_SCALE 6
-#define EXPTAB_MASK  ((1 << EXPTAB_SCALE) - 1)
-
-#define EXPPOLY_32F_A0 .9670371139572337719125840413672004409288e-2
-
-static const double expTab[] = {
-    1.0 * EXPPOLY_32F_A0,
-    1.0108892860517004600204097905619 * EXPPOLY_32F_A0,
-    1.0218971486541166782344801347833 * EXPPOLY_32F_A0,
-    1.0330248790212284225001082839705 * EXPPOLY_32F_A0,
-    1.0442737824274138403219664787399 * EXPPOLY_32F_A0,
-    1.0556451783605571588083413251529 * EXPPOLY_32F_A0,
-    1.0671404006768236181695211209928 * EXPPOLY_32F_A0,
-    1.0787607977571197937406800374385 * EXPPOLY_32F_A0,
-    1.0905077326652576592070106557607 * EXPPOLY_32F_A0,
-    1.1023825833078409435564142094256 * EXPPOLY_32F_A0,
-    1.1143867425958925363088129569196 * EXPPOLY_32F_A0,
-    1.126521618608241899794798643787 * EXPPOLY_32F_A0,
-    1.1387886347566916537038302838415 * EXPPOLY_32F_A0,
-    1.151189229952982705817759635202 * EXPPOLY_32F_A0,
-    1.1637248587775775138135735990922 * EXPPOLY_32F_A0,
-    1.1763969916502812762846457284838 * EXPPOLY_32F_A0,
-    1.1892071150027210667174999705605 * EXPPOLY_32F_A0,
-    1.2021567314527031420963969574978 * EXPPOLY_32F_A0,
-    1.2152473599804688781165202513388 * EXPPOLY_32F_A0,
-    1.2284805361068700056940089577928 * EXPPOLY_32F_A0,
-    1.2418578120734840485936774687266 * EXPPOLY_32F_A0,
-    1.2553807570246910895793906574423 * EXPPOLY_32F_A0,
-    1.2690509571917332225544190810323 * EXPPOLY_32F_A0,
-    1.2828700160787782807266697810215 * EXPPOLY_32F_A0,
-    1.2968395546510096659337541177925 * EXPPOLY_32F_A0,
-    1.3109612115247643419229917863308 * EXPPOLY_32F_A0,
-    1.3252366431597412946295370954987 * EXPPOLY_32F_A0,
-    1.3396675240533030053600306697244 * EXPPOLY_32F_A0,
-    1.3542555469368927282980147401407 * EXPPOLY_32F_A0,
-    1.3690024229745906119296011329822 * EXPPOLY_32F_A0,
-    1.3839098819638319548726595272652 * EXPPOLY_32F_A0,
-    1.3989796725383111402095281367152 * EXPPOLY_32F_A0,
-    1.4142135623730950488016887242097 * EXPPOLY_32F_A0,
-    1.4296133383919700112350657782751 * EXPPOLY_32F_A0,
-    1.4451808069770466200370062414717 * EXPPOLY_32F_A0,
-    1.4609177941806469886513028903106 * EXPPOLY_32F_A0,
-    1.476826145939499311386907480374 * EXPPOLY_32F_A0,
-    1.4929077282912648492006435314867 * EXPPOLY_32F_A0,
-    1.5091644275934227397660195510332 * EXPPOLY_32F_A0,
-    1.5255981507445383068512536895169 * EXPPOLY_32F_A0,
-    1.5422108254079408236122918620907 * EXPPOLY_32F_A0,
-    1.5590044002378369670337280894749 * EXPPOLY_32F_A0,
-    1.5759808451078864864552701601819 * EXPPOLY_32F_A0,
-    1.5931421513422668979372486431191 * EXPPOLY_32F_A0,
-    1.6104903319492543081795206673574 * EXPPOLY_32F_A0,
-    1.628027421857347766848218522014 * EXPPOLY_32F_A0,
-    1.6457554781539648445187567247258 * EXPPOLY_32F_A0,
-    1.6636765803267364350463364569764 * EXPPOLY_32F_A0,
-    1.6817928305074290860622509524664 * EXPPOLY_32F_A0,
-    1.7001063537185234695013625734975 * EXPPOLY_32F_A0,
-    1.7186192981224779156293443764563 * EXPPOLY_32F_A0,
-    1.7373338352737062489942020818722 * EXPPOLY_32F_A0,
-    1.7562521603732994831121606193753 * EXPPOLY_32F_A0,
-    1.7753764925265212525505592001993 * EXPPOLY_32F_A0,
-    1.7947090750031071864277032421278 * EXPPOLY_32F_A0,
-    1.8142521755003987562498346003623 * EXPPOLY_32F_A0,
-    1.8340080864093424634870831895883 * EXPPOLY_32F_A0,
-    1.8539791250833855683924530703377 * EXPPOLY_32F_A0,
-    1.8741676341102999013299989499544 * EXPPOLY_32F_A0,
-    1.8945759815869656413402186534269 * EXPPOLY_32F_A0,
-    1.9152065613971472938726112702958 * EXPPOLY_32F_A0,
-    1.9360617934922944505980559045667 * EXPPOLY_32F_A0,
-    1.9571441241754002690183222516269 * EXPPOLY_32F_A0,
-    1.9784560263879509682582499181312 * EXPPOLY_32F_A0,
-};
-
-
-// the code below uses _mm_cast* intrinsics, which are not avialable on VS2005
-#if (defined _MSC_VER && _MSC_VER < 1500) || \
-    (!defined __APPLE__ && defined __GNUC__ && __GNUC__*100 + __GNUC_MINOR__ < 402)
-#undef CV_SSE2
-#define CV_SSE2 0
-#endif
-
-static const double exp_prescale = 1.4426950408889634073599246810019 * (1 << EXPTAB_SCALE);
-static const double exp_postscale = 1./(1 << EXPTAB_SCALE);
-static const double exp_max_val = 3000.*(1 << EXPTAB_SCALE); // log10(DBL_MAX) < 3000
-
-static void Exp_32f( const float *_x, float *y, int n )
-{
-    static const float
-        A4 = (float)(1.000000000000002438532970795181890933776 / EXPPOLY_32F_A0),
-        A3 = (float)(.6931471805521448196800669615864773144641 / EXPPOLY_32F_A0),
-        A2 = (float)(.2402265109513301490103372422686535526573 / EXPPOLY_32F_A0),
-        A1 = (float)(.5550339366753125211915322047004666939128e-1 / EXPPOLY_32F_A0);
-
-#undef EXPPOLY
-#define EXPPOLY(x)  \
-    (((((x) + A1)*(x) + A2)*(x) + A3)*(x) + A4)
-
-    int i = 0;
-    const Cv32suf* x = (const Cv32suf*)_x;
-    Cv32suf buf[4];
-
-#if CV_SSE2
-    if( n >= 8 && USE_SSE2 )
-    {
-        static const __m128d prescale2 = _mm_set1_pd(exp_prescale);
-        static const __m128 postscale4 = _mm_set1_ps((float)exp_postscale);
-        static const __m128 maxval4 = _mm_set1_ps((float)(exp_max_val/exp_prescale));
-        static const __m128 minval4 = _mm_set1_ps((float)(-exp_max_val/exp_prescale));
-
-        static const __m128 mA1 = _mm_set1_ps(A1);
-        static const __m128 mA2 = _mm_set1_ps(A2);
-        static const __m128 mA3 = _mm_set1_ps(A3);
-        static const __m128 mA4 = _mm_set1_ps(A4);
-        bool y_aligned = (size_t)(void*)y % 16 == 0;
-
-        ushort CV_DECL_ALIGNED(16) tab_idx[8];
-
-        for( ; i <= n - 8; i += 8 )
-        {
-            __m128 xf0, xf1;
-            xf0 = _mm_loadu_ps(&x[i].f);
-            xf1 = _mm_loadu_ps(&x[i+4].f);
-            __m128i xi0, xi1, xi2, xi3;
-
-            xf0 = _mm_min_ps(_mm_max_ps(xf0, minval4), maxval4);
-            xf1 = _mm_min_ps(_mm_max_ps(xf1, minval4), maxval4);
-
-            __m128d xd0 = _mm_cvtps_pd(xf0);
-            __m128d xd2 = _mm_cvtps_pd(_mm_movehl_ps(xf0, xf0));
-            __m128d xd1 = _mm_cvtps_pd(xf1);
-            __m128d xd3 = _mm_cvtps_pd(_mm_movehl_ps(xf1, xf1));
-
-            xd0 = _mm_mul_pd(xd0, prescale2);
-            xd2 = _mm_mul_pd(xd2, prescale2);
-            xd1 = _mm_mul_pd(xd1, prescale2);
-            xd3 = _mm_mul_pd(xd3, prescale2);
-
-            xi0 = _mm_cvtpd_epi32(xd0);
-            xi2 = _mm_cvtpd_epi32(xd2);
-
-            xi1 = _mm_cvtpd_epi32(xd1);
-            xi3 = _mm_cvtpd_epi32(xd3);
-
-            xd0 = _mm_sub_pd(xd0, _mm_cvtepi32_pd(xi0));
-            xd2 = _mm_sub_pd(xd2, _mm_cvtepi32_pd(xi2));
-            xd1 = _mm_sub_pd(xd1, _mm_cvtepi32_pd(xi1));
-            xd3 = _mm_sub_pd(xd3, _mm_cvtepi32_pd(xi3));
-
-            xf0 = _mm_movelh_ps(_mm_cvtpd_ps(xd0), _mm_cvtpd_ps(xd2));
-            xf1 = _mm_movelh_ps(_mm_cvtpd_ps(xd1), _mm_cvtpd_ps(xd3));
-
-            xf0 = _mm_mul_ps(xf0, postscale4);
-            xf1 = _mm_mul_ps(xf1, postscale4);
-
-            xi0 = _mm_unpacklo_epi64(xi0, xi2);
-            xi1 = _mm_unpacklo_epi64(xi1, xi3);
-            xi0 = _mm_packs_epi32(xi0, xi1);
-
-            _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi16(EXPTAB_MASK)));
-
-            xi0 = _mm_add_epi16(_mm_srai_epi16(xi0, EXPTAB_SCALE), _mm_set1_epi16(127));
-            xi0 = _mm_max_epi16(xi0, _mm_setzero_si128());
-            xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(255));
-            xi1 = _mm_unpackhi_epi16(xi0, _mm_setzero_si128());
-            xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128());
-
-            __m128d yd0 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[0]), _mm_load_sd(expTab + tab_idx[1]));
-            __m128d yd1 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[2]), _mm_load_sd(expTab + tab_idx[3]));
-            __m128d yd2 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[4]), _mm_load_sd(expTab + tab_idx[5]));
-            __m128d yd3 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[6]), _mm_load_sd(expTab + tab_idx[7]));
-
-            __m128 yf0 = _mm_movelh_ps(_mm_cvtpd_ps(yd0), _mm_cvtpd_ps(yd1));
-            __m128 yf1 = _mm_movelh_ps(_mm_cvtpd_ps(yd2), _mm_cvtpd_ps(yd3));
-
-            yf0 = _mm_mul_ps(yf0, _mm_castsi128_ps(_mm_slli_epi32(xi0, 23)));
-            yf1 = _mm_mul_ps(yf1, _mm_castsi128_ps(_mm_slli_epi32(xi1, 23)));
-
-            __m128 zf0 = _mm_add_ps(xf0, mA1);
-            __m128 zf1 = _mm_add_ps(xf1, mA1);
-
-            zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA2);
-            zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA2);
-
-            zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA3);
-            zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA3);
-
-            zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA4);
-            zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA4);
-
-            zf0 = _mm_mul_ps(zf0, yf0);
-            zf1 = _mm_mul_ps(zf1, yf1);
-
-            if( y_aligned )
-            {
-                _mm_store_ps(y + i, zf0);
-                _mm_store_ps(y + i + 4, zf1);
-            }
-            else
-            {
-                _mm_storeu_ps(y + i, zf0);
-                _mm_storeu_ps(y + i + 4, zf1);
-            }
-        }
-    }
-    else
-#endif
-    for( ; i <= n - 4; i += 4 )
-    {
-        double x0 = x[i].f * exp_prescale;
-        double x1 = x[i + 1].f * exp_prescale;
-        double x2 = x[i + 2].f * exp_prescale;
-        double x3 = x[i + 3].f * exp_prescale;
-        int val0, val1, val2, val3, t;
-
-        if( ((x[i].i >> 23) & 255) > 127 + 10 )
-            x0 = x[i].i < 0 ? -exp_max_val : exp_max_val;
-
-        if( ((x[i+1].i >> 23) & 255) > 127 + 10 )
-            x1 = x[i+1].i < 0 ? -exp_max_val : exp_max_val;
-
-        if( ((x[i+2].i >> 23) & 255) > 127 + 10 )
-            x2 = x[i+2].i < 0 ? -exp_max_val : exp_max_val;
-
-        if( ((x[i+3].i >> 23) & 255) > 127 + 10 )
-            x3 = x[i+3].i < 0 ? -exp_max_val : exp_max_val;
-
-        val0 = cvRound(x0);
-        val1 = cvRound(x1);
-        val2 = cvRound(x2);
-        val3 = cvRound(x3);
-
-        x0 = (x0 - val0)*exp_postscale;
-        x1 = (x1 - val1)*exp_postscale;
-        x2 = (x2 - val2)*exp_postscale;
-        x3 = (x3 - val3)*exp_postscale;
-
-        t = (val0 >> EXPTAB_SCALE) + 127;
-        t = !(t & ~255) ? t : t < 0 ? 0 : 255;
-        buf[0].i = t << 23;
-
-        t = (val1 >> EXPTAB_SCALE) + 127;
-        t = !(t & ~255) ? t : t < 0 ? 0 : 255;
-        buf[1].i = t << 23;
-
-        t = (val2 >> EXPTAB_SCALE) + 127;
-        t = !(t & ~255) ? t : t < 0 ? 0 : 255;
-        buf[2].i = t << 23;
-
-        t = (val3 >> EXPTAB_SCALE) + 127;
-        t = !(t & ~255) ? t : t < 0 ? 0 : 255;
-        buf[3].i = t << 23;
-
-        x0 = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 );
-        x1 = buf[1].f * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 );
-
-        y[i] = (float)x0;
-        y[i + 1] = (float)x1;
-
-        x2 = buf[2].f * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 );
-        x3 = buf[3].f * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 );
-
-        y[i + 2] = (float)x2;
-        y[i + 3] = (float)x3;
-    }
-
-    for( ; i < n; i++ )
-    {
-        double x0 = x[i].f * exp_prescale;
-        int val0, t;
-
-        if( ((x[i].i >> 23) & 255) > 127 + 10 )
-            x0 = x[i].i < 0 ? -exp_max_val : exp_max_val;
-
-        val0 = cvRound(x0);
-        t = (val0 >> EXPTAB_SCALE) + 127;
-        t = !(t & ~255) ? t : t < 0 ? 0 : 255;
-
-        buf[0].i = t << 23;
-        x0 = (x0 - val0)*exp_postscale;
-
-        y[i] = (float)(buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY(x0));
-    }
-}
-
-
-static void Exp_64f( const double *_x, double *y, int n )
-{
-    static const double
-    A5 = .99999999999999999998285227504999 / EXPPOLY_32F_A0,
-    A4 = .69314718055994546743029643825322 / EXPPOLY_32F_A0,
-    A3 = .24022650695886477918181338054308 / EXPPOLY_32F_A0,
-    A2 = .55504108793649567998466049042729e-1 / EXPPOLY_32F_A0,
-    A1 = .96180973140732918010002372686186e-2 / EXPPOLY_32F_A0,
-    A0 = .13369713757180123244806654839424e-2 / EXPPOLY_32F_A0;
-
-#undef EXPPOLY
-#define EXPPOLY(x)  (((((A0*(x) + A1)*(x) + A2)*(x) + A3)*(x) + A4)*(x) + A5)
-
-    int i = 0;
-    Cv64suf buf[4];
-    const Cv64suf* x = (const Cv64suf*)_x;
-
-#if CV_SSE2
-    if( USE_SSE2 )
-    {
-        static const __m128d prescale2 = _mm_set1_pd(exp_prescale);
-        static const __m128d postscale2 = _mm_set1_pd(exp_postscale);
-        static const __m128d maxval2 = _mm_set1_pd(exp_max_val);
-        static const __m128d minval2 = _mm_set1_pd(-exp_max_val);
-
-        static const __m128d mA0 = _mm_set1_pd(A0);
-        static const __m128d mA1 = _mm_set1_pd(A1);
-        static const __m128d mA2 = _mm_set1_pd(A2);
-        static const __m128d mA3 = _mm_set1_pd(A3);
-        static const __m128d mA4 = _mm_set1_pd(A4);
-        static const __m128d mA5 = _mm_set1_pd(A5);
-
-        int CV_DECL_ALIGNED(16) tab_idx[4];
-
-        for( ; i <= n - 4; i += 4 )
-        {
-            __m128d xf0 = _mm_loadu_pd(&x[i].f), xf1 = _mm_loadu_pd(&x[i+2].f);
-            __m128i xi0, xi1;
-            xf0 = _mm_min_pd(_mm_max_pd(xf0, minval2), maxval2);
-            xf1 = _mm_min_pd(_mm_max_pd(xf1, minval2), maxval2);
-            xf0 = _mm_mul_pd(xf0, prescale2);
-            xf1 = _mm_mul_pd(xf1, prescale2);
-
-            xi0 = _mm_cvtpd_epi32(xf0);
-            xi1 = _mm_cvtpd_epi32(xf1);
-            xf0 = _mm_mul_pd(_mm_sub_pd(xf0, _mm_cvtepi32_pd(xi0)), postscale2);
-            xf1 = _mm_mul_pd(_mm_sub_pd(xf1, _mm_cvtepi32_pd(xi1)), postscale2);
-
-            xi0 = _mm_unpacklo_epi64(xi0, xi1);
-            _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi32(EXPTAB_MASK)));
-
-            xi0 = _mm_add_epi32(_mm_srai_epi32(xi0, EXPTAB_SCALE), _mm_set1_epi32(1023));
-            xi0 = _mm_packs_epi32(xi0, xi0);
-            xi0 = _mm_max_epi16(xi0, _mm_setzero_si128());
-            xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(2047));
-            xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128());
-            xi1 = _mm_unpackhi_epi32(xi0, _mm_setzero_si128());
-            xi0 = _mm_unpacklo_epi32(xi0, _mm_setzero_si128());
-
-            __m128d yf0 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[0]), _mm_load_sd(expTab + tab_idx[1]));
-            __m128d yf1 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[2]), _mm_load_sd(expTab + tab_idx[3]));
-            yf0 = _mm_mul_pd(yf0, _mm_castsi128_pd(_mm_slli_epi64(xi0, 52)));
-            yf1 = _mm_mul_pd(yf1, _mm_castsi128_pd(_mm_slli_epi64(xi1, 52)));
-
-            __m128d zf0 = _mm_add_pd(_mm_mul_pd(mA0, xf0), mA1);
-            __m128d zf1 = _mm_add_pd(_mm_mul_pd(mA0, xf1), mA1);
-
-            zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA2);
-            zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA2);
-
-            zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA3);
-            zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA3);
-
-            zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA4);
-            zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA4);
-
-            zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA5);
-            zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA5);
-
-            zf0 = _mm_mul_pd(zf0, yf0);
-            zf1 = _mm_mul_pd(zf1, yf1);
-
-            _mm_storeu_pd(y + i, zf0);
-            _mm_storeu_pd(y + i + 2, zf1);
-        }
-    }
-    else
-#endif
-    for( ; i <= n - 4; i += 4 )
-    {
-        double x0 = x[i].f * exp_prescale;
-        double x1 = x[i + 1].f * exp_prescale;
-        double x2 = x[i + 2].f * exp_prescale;
-        double x3 = x[i + 3].f * exp_prescale;
-
-        double y0, y1, y2, y3;
-        int val0, val1, val2, val3, t;
-
-        t = (int)(x[i].i >> 52);
-        if( (t & 2047) > 1023 + 10 )
-            x0 = t < 0 ? -exp_max_val : exp_max_val;
-
-        t = (int)(x[i+1].i >> 52);
-        if( (t & 2047) > 1023 + 10 )
-            x1 = t < 0 ? -exp_max_val : exp_max_val;
-
-        t = (int)(x[i+2].i >> 52);
-        if( (t & 2047) > 1023 + 10 )
-            x2 = t < 0 ? -exp_max_val : exp_max_val;
-
-        t = (int)(x[i+3].i >> 52);
-        if( (t & 2047) > 1023 + 10 )
-            x3 = t < 0 ? -exp_max_val : exp_max_val;
-
-        val0 = cvRound(x0);
-        val1 = cvRound(x1);
-        val2 = cvRound(x2);
-        val3 = cvRound(x3);
-
-        x0 = (x0 - val0)*exp_postscale;
-        x1 = (x1 - val1)*exp_postscale;
-        x2 = (x2 - val2)*exp_postscale;
-        x3 = (x3 - val3)*exp_postscale;
-
-        t = (val0 >> EXPTAB_SCALE) + 1023;
-        t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
-        buf[0].i = (int64)t << 52;
-
-        t = (val1 >> EXPTAB_SCALE) + 1023;
-        t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
-        buf[1].i = (int64)t << 52;
-
-        t = (val2 >> EXPTAB_SCALE) + 1023;
-        t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
-        buf[2].i = (int64)t << 52;
-
-        t = (val3 >> EXPTAB_SCALE) + 1023;
-        t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
-        buf[3].i = (int64)t << 52;
-
-        y0 = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 );
-        y1 = buf[1].f * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 );
-
-        y[i] = y0;
-        y[i + 1] = y1;
-
-        y2 = buf[2].f * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 );
-        y3 = buf[3].f * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 );
-
-        y[i + 2] = y2;
-        y[i + 3] = y3;
-    }
-
-    for( ; i < n; i++ )
-    {
-        double x0 = x[i].f * exp_prescale;
-        int val0, t;
-
-        t = (int)(x[i].i >> 52);
-        if( (t & 2047) > 1023 + 10 )
-            x0 = t < 0 ? -exp_max_val : exp_max_val;
-
-        val0 = cvRound(x0);
-        t = (val0 >> EXPTAB_SCALE) + 1023;
-        t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
-
-        buf[0].i = (int64)t << 52;
-        x0 = (x0 - val0)*exp_postscale;
-
-        y[i] = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 );
-    }
-}
-
-#undef EXPTAB_SCALE
-#undef EXPTAB_MASK
-#undef EXPPOLY_32F_A0
-
 #ifdef HAVE_IPP
 static void Exp_32f_ipp(const float *x, float *y, int n)
 {
@@ -1584,7 +758,7 @@ static void Exp_32f_ipp(const float *x, float *y, int n)
         }
         setIppErrorStatus();
     }
-    Exp_32f(x, y, n);
+    hal::exp(x, y, n);
 }
 
 static void Exp_64f_ipp(const double *x, double *y, int n)
@@ -1598,11 +772,14 @@ static void Exp_64f_ipp(const double *x, double *y, int n)
         }
         setIppErrorStatus();
     }
-    Exp_64f(x, y, n);
+    hal::exp(x, y, n);
 }
 
 #define Exp_32f Exp_32f_ipp
 #define Exp_64f Exp_64f_ipp
+#else
+#define Exp_32f hal::exp
+#define Exp_64f hal::exp
 #endif
 
 
@@ -1637,613 +814,6 @@ void exp( InputArray _src, OutputArray _dst )
 *                                          L O G                                         *
 \****************************************************************************************/
 
-#define LOGTAB_SCALE    8
-#define LOGTAB_MASK         ((1 << LOGTAB_SCALE) - 1)
-#define LOGTAB_MASK2        ((1 << (20 - LOGTAB_SCALE)) - 1)
-#define LOGTAB_MASK2_32F    ((1 << (23 - LOGTAB_SCALE)) - 1)
-
-static const double CV_DECL_ALIGNED(16) icvLogTab[] = {
-0.0000000000000000000000000000000000000000,    1.000000000000000000000000000000000000000,
-.00389864041565732288852075271279318258166,    .9961089494163424124513618677042801556420,
-.00778214044205494809292034119607706088573,    .9922480620155038759689922480620155038760,
-.01165061721997527263705585198749759001657,    .9884169884169884169884169884169884169884,
-.01550418653596525274396267235488267033361,    .9846153846153846153846153846153846153846,
-.01934296284313093139406447562578250654042,    .9808429118773946360153256704980842911877,
-.02316705928153437593630670221500622574241,    .9770992366412213740458015267175572519084,
-.02697658769820207233514075539915211265906,    .9733840304182509505703422053231939163498,
-.03077165866675368732785500469617545604706,    .9696969696969696969696969696969696969697,
-.03455238150665972812758397481047722976656,    .9660377358490566037735849056603773584906,
-.03831886430213659461285757856785494368522,    .9624060150375939849624060150375939849624,
-.04207121392068705056921373852674150839447,    .9588014981273408239700374531835205992509,
-.04580953603129420126371940114040626212953,    .9552238805970149253731343283582089552239,
-.04953393512227662748292900118940451648088,    .9516728624535315985130111524163568773234,
-.05324451451881227759255210685296333394944,    .9481481481481481481481481481481481481481,
-.05694137640013842427411105973078520037234,    .9446494464944649446494464944649446494465,
-.06062462181643483993820353816772694699466,    .9411764705882352941176470588235294117647,
-.06429435070539725460836422143984236754475,    .9377289377289377289377289377289377289377,
-.06795066190850773679699159401934593915938,    .9343065693430656934306569343065693430657,
-.07159365318700880442825962290953611955044,    .9309090909090909090909090909090909090909,
-.07522342123758751775142172846244648098944,    .9275362318840579710144927536231884057971,
-.07884006170777602129362549021607264876369,    .9241877256317689530685920577617328519856,
-.08244366921107458556772229485432035289706,    .9208633093525179856115107913669064748201,
-.08603433734180314373940490213499288074675,    .9175627240143369175627240143369175627240,
-.08961215868968712416897659522874164395031,    .9142857142857142857142857142857142857143,
-.09317722485418328259854092721070628613231,    .9110320284697508896797153024911032028470,
-.09672962645855109897752299730200320482256,    .9078014184397163120567375886524822695035,
-.10026945316367513738597949668474029749630,    .9045936395759717314487632508833922261484,
-.10379679368164355934833764649738441221420,    .9014084507042253521126760563380281690141,
-.10731173578908805021914218968959175981580,    .8982456140350877192982456140350877192982,
-.11081436634029011301105782649756292812530,    .8951048951048951048951048951048951048951,
-.11430477128005862852422325204315711744130,    .8919860627177700348432055749128919860627,
-.11778303565638344185817487641543266363440,    .8888888888888888888888888888888888888889,
-.12124924363286967987640707633545389398930,    .8858131487889273356401384083044982698962,
-.12470347850095722663787967121606925502420,    .8827586206896551724137931034482758620690,
-.12814582269193003360996385708858724683530,    .8797250859106529209621993127147766323024,
-.13157635778871926146571524895989568904040,    .8767123287671232876712328767123287671233,
-.13499516453750481925766280255629681050780,    .8737201365187713310580204778156996587031,
-.13840232285911913123754857224412262439730,    .8707482993197278911564625850340136054422,
-.14179791186025733629172407290752744302150,    .8677966101694915254237288135593220338983,
-.14518200984449788903951628071808954700830,    .8648648648648648648648648648648648648649,
-.14855469432313711530824207329715136438610,    .8619528619528619528619528619528619528620,
-.15191604202584196858794030049466527998450,    .8590604026845637583892617449664429530201,
-.15526612891112392955683674244937719777230,    .8561872909698996655518394648829431438127,
-.15860503017663857283636730244325008243330,    .8533333333333333333333333333333333333333,
-.16193282026931324346641360989451641216880,    .8504983388704318936877076411960132890365,
-.16524957289530714521497145597095368430010,    .8476821192052980132450331125827814569536,
-.16855536102980664403538924034364754334090,    .8448844884488448844884488448844884488449,
-.17185025692665920060697715143760433420540,    .8421052631578947368421052631578947368421,
-.17513433212784912385018287750426679849630,    .8393442622950819672131147540983606557377,
-.17840765747281828179637841458315961062910,    .8366013071895424836601307189542483660131,
-.18167030310763465639212199675966985523700,    .8338762214983713355048859934853420195440,
-.18492233849401198964024217730184318497780,    .8311688311688311688311688311688311688312,
-.18816383241818296356839823602058459073300,    .8284789644012944983818770226537216828479,
-.19139485299962943898322009772527962923050,    .8258064516129032258064516129032258064516,
-.19461546769967164038916962454095482826240,    .8231511254019292604501607717041800643087,
-.19782574332991986754137769821682013571260,    .8205128205128205128205128205128205128205,
-.20102574606059073203390141770796617493040,    .8178913738019169329073482428115015974441,
-.20421554142869088876999228432396193966280,    .8152866242038216560509554140127388535032,
-.20739519434607056602715147164417430758480,    .8126984126984126984126984126984126984127,
-.21056476910734961416338251183333341032260,    .8101265822784810126582278481012658227848,
-.21372432939771812687723695489694364368910,    .8075709779179810725552050473186119873817,
-.21687393830061435506806333251006435602900,    .8050314465408805031446540880503144654088,
-.22001365830528207823135744547471404075630,    .8025078369905956112852664576802507836991,
-.22314355131420973710199007200571941211830,    .8000000000000000000000000000000000000000,
-.22626367865045338145790765338460914790630,    .7975077881619937694704049844236760124611,
-.22937410106484582006380890106811420992010,    .7950310559006211180124223602484472049689,
-.23247487874309405442296849741978803649550,    .7925696594427244582043343653250773993808,
-.23556607131276688371634975283086532726890,    .7901234567901234567901234567901234567901,
-.23864773785017498464178231643018079921600,    .7876923076923076923076923076923076923077,
-.24171993688714515924331749374687206000090,    .7852760736196319018404907975460122699387,
-.24478272641769091566565919038112042471760,    .7828746177370030581039755351681957186544,
-.24783616390458124145723672882013488560910,    .7804878048780487804878048780487804878049,
-.25088030628580937353433455427875742316250,    .7781155015197568389057750759878419452888,
-.25391520998096339667426946107298135757450,    .7757575757575757575757575757575757575758,
-.25694093089750041913887912414793390780680,    .7734138972809667673716012084592145015106,
-.25995752443692604627401010475296061486000,    .7710843373493975903614457831325301204819,
-.26296504550088134477547896494797896593800,    .7687687687687687687687687687687687687688,
-.26596354849713793599974565040611196309330,    .7664670658682634730538922155688622754491,
-.26895308734550393836570947314612567424780,    .7641791044776119402985074626865671641791,
-.27193371548364175804834985683555714786050,    .7619047619047619047619047619047619047619,
-.27490548587279922676529508862586226314300,    .7596439169139465875370919881305637982196,
-.27786845100345625159121709657483734190480,    .7573964497041420118343195266272189349112,
-.28082266290088775395616949026589281857030,    .7551622418879056047197640117994100294985,
-.28376817313064456316240580235898960381750,    .7529411764705882352941176470588235294118,
-.28670503280395426282112225635501090437180,    .7507331378299120234604105571847507331378,
-.28963329258304265634293983566749375313530,    .7485380116959064327485380116959064327485,
-.29255300268637740579436012922087684273730,    .7463556851311953352769679300291545189504,
-.29546421289383584252163927885703742504130,    .7441860465116279069767441860465116279070,
-.29836697255179722709783618483925238251680,    .7420289855072463768115942028985507246377,
-.30126133057816173455023545102449133992200,    .7398843930635838150289017341040462427746,
-.30414733546729666446850615102448500692850,    .7377521613832853025936599423631123919308,
-.30702503529491181888388950937951449304830,    .7356321839080459770114942528735632183908,
-.30989447772286465854207904158101882785550,    .7335243553008595988538681948424068767908,
-.31275571000389684739317885942000430077330,    .7314285714285714285714285714285714285714,
-.31560877898630329552176476681779604405180,    .7293447293447293447293447293447293447293,
-.31845373111853458869546784626436419785030,    .7272727272727272727272727272727272727273,
-.32129061245373424782201254856772720813750,    .7252124645892351274787535410764872521246,
-.32411946865421192853773391107097268104550,    .7231638418079096045197740112994350282486,
-.32694034499585328257253991068864706903700,    .7211267605633802816901408450704225352113,
-.32975328637246797969240219572384376078850,    .7191011235955056179775280898876404494382,
-.33255833730007655635318997155991382896900,    .7170868347338935574229691876750700280112,
-.33535554192113781191153520921943709254280,    .7150837988826815642458100558659217877095,
-.33814494400871636381467055798566434532400,    .7130919220055710306406685236768802228412,
-.34092658697059319283795275623560883104800,    .7111111111111111111111111111111111111111,
-.34370051385331840121395430287520866841080,    .7091412742382271468144044321329639889197,
-.34646676734620857063262633346312213689100,    .7071823204419889502762430939226519337017,
-.34922538978528827602332285096053965389730,    .7052341597796143250688705234159779614325,
-.35197642315717814209818925519357435405250,    .7032967032967032967032967032967032967033,
-.35471990910292899856770532096561510115850,    .7013698630136986301369863013698630136986,
-.35745588892180374385176833129662554711100,    .6994535519125683060109289617486338797814,
-.36018440357500774995358483465679455548530,    .6975476839237057220708446866485013623978,
-.36290549368936841911903457003063522279280,    .6956521739130434782608695652173913043478,
-.36561919956096466943762379742111079394830,    .6937669376693766937669376693766937669377,
-.36832556115870762614150635272380895912650,    .6918918918918918918918918918918918918919,
-.37102461812787262962487488948681857436900,    .6900269541778975741239892183288409703504,
-.37371640979358405898480555151763837784530,    .6881720430107526881720430107526881720430,
-.37640097516425302659470730759494472295050,    .6863270777479892761394101876675603217158,
-.37907835293496944251145919224654790014030,    .6844919786096256684491978609625668449198,
-.38174858149084833769393299007788300514230,    .6826666666666666666666666666666666666667,
-.38441169891033200034513583887019194662580,    .6808510638297872340425531914893617021277,
-.38706774296844825844488013899535872042180,    .6790450928381962864721485411140583554377,
-.38971675114002518602873692543653305619950,    .6772486772486772486772486772486772486772,
-.39235876060286384303665840889152605086580,    .6754617414248021108179419525065963060686,
-.39499380824086893770896722344332374632350,    .6736842105263157894736842105263157894737,
-.39762193064713846624158577469643205404280,    .6719160104986876640419947506561679790026,
-.40024316412701266276741307592601515352730,    .6701570680628272251308900523560209424084,
-.40285754470108348090917615991202183067800,    .6684073107049608355091383812010443864230,
-.40546510810816432934799991016916465014230,    .6666666666666666666666666666666666666667,
-.40806588980822172674223224930756259709600,    .6649350649350649350649350649350649350649,
-.41065992498526837639616360320360399782650,    .6632124352331606217616580310880829015544,
-.41324724855021932601317757871584035456180,    .6614987080103359173126614987080103359173,
-.41582789514371093497757669865677598863850,    .6597938144329896907216494845360824742268,
-.41840189913888381489925905043492093682300,    .6580976863753213367609254498714652956298,
-.42096929464412963239894338585145305842150,    .6564102564102564102564102564102564102564,
-.42353011550580327293502591601281892508280,    .6547314578005115089514066496163682864450,
-.42608439531090003260516141381231136620050,    .6530612244897959183673469387755102040816,
-.42863216738969872610098832410585600882780,    .6513994910941475826972010178117048346056,
-.43117346481837132143866142541810404509300,    .6497461928934010152284263959390862944162,
-.43370832042155937902094819946796633303180,    .6481012658227848101265822784810126582278,
-.43623676677491801667585491486534010618930,    .6464646464646464646464646464646464646465,
-.43875883620762790027214350629947148263450,    .6448362720403022670025188916876574307305,
-.44127456080487520440058801796112675219780,    .6432160804020100502512562814070351758794,
-.44378397241030093089975139264424797147500,    .6416040100250626566416040100250626566416,
-.44628710262841947420398014401143882423650,    .6400000000000000000000000000000000000000,
-.44878398282700665555822183705458883196130,    .6384039900249376558603491271820448877805,
-.45127464413945855836729492693848442286250,    .6368159203980099502487562189054726368159,
-.45375911746712049854579618113348260521900,    .6352357320099255583126550868486352357320,
-.45623743348158757315857769754074979573500,    .6336633663366336633663366336633663366337,
-.45870962262697662081833982483658473938700,    .6320987654320987654320987654320987654321,
-.46117571512217014895185229761409573256980,    .6305418719211822660098522167487684729064,
-.46363574096303250549055974261136725544930,    .6289926289926289926289926289926289926290,
-.46608972992459918316399125615134835243230,    .6274509803921568627450980392156862745098,
-.46853771156323925639597405279346276074650,    .6259168704156479217603911980440097799511,
-.47097971521879100631480241645476780831830,    .6243902439024390243902439024390243902439,
-.47341577001667212165614273544633761048330,    .6228710462287104622871046228710462287105,
-.47584590486996386493601107758877333253630,    .6213592233009708737864077669902912621359,
-.47827014848147025860569669930555392056700,    .6198547215496368038740920096852300242131,
-.48068852934575190261057286988943815231330,    .6183574879227053140096618357487922705314,
-.48310107575113581113157579238759353756900,    .6168674698795180722891566265060240963855,
-.48550781578170076890899053978500887751580,    .6153846153846153846153846153846153846154,
-.48790877731923892879351001283794175833480,    .6139088729016786570743405275779376498801,
-.49030398804519381705802061333088204264650,    .6124401913875598086124401913875598086124,
-.49269347544257524607047571407747454941280,    .6109785202863961813842482100238663484487,
-.49507726679785146739476431321236304938800,    .6095238095238095238095238095238095238095,
-.49745538920281889838648226032091770321130,    .6080760095011876484560570071258907363420,
-.49982786955644931126130359189119189977650,    .6066350710900473933649289099526066350711,
-.50219473456671548383667413872899487614650,    .6052009456264775413711583924349881796690,
-.50455601075239520092452494282042607665050,    .6037735849056603773584905660377358490566,
-.50691172444485432801997148999362252652650,    .6023529411764705882352941176470588235294,
-.50926190178980790257412536448100581765150,    .6009389671361502347417840375586854460094,
-.51160656874906207391973111953120678663250,    .5995316159250585480093676814988290398126,
-.51394575110223428282552049495279788970950,    .5981308411214953271028037383177570093458,
-.51627947444845445623684554448118433356300,    .5967365967365967365967365967365967365967,
-.51860776420804555186805373523384332656850,    .5953488372093023255813953488372093023256,
-.52093064562418522900344441950437612831600,    .5939675174013921113689095127610208816705,
-.52324814376454775732838697877014055848100,    .5925925925925925925925925925925925925926,
-.52556028352292727401362526507000438869000,    .5912240184757505773672055427251732101617,
-.52786708962084227803046587723656557500350,    .5898617511520737327188940092165898617512,
-.53016858660912158374145519701414741575700,    .5885057471264367816091954022988505747126,
-.53246479886947173376654518506256863474850,    .5871559633027522935779816513761467889908,
-.53475575061602764748158733709715306758900,    .5858123569794050343249427917620137299771,
-.53704146589688361856929077475797384977350,    .5844748858447488584474885844748858447489,
-.53932196859560876944783558428753167390800,    .5831435079726651480637813211845102505695,
-.54159728243274429804188230264117009937750,    .5818181818181818181818181818181818181818,
-.54386743096728351609669971367111429572100,    .5804988662131519274376417233560090702948,
-.54613243759813556721383065450936555862450,    .5791855203619909502262443438914027149321,
-.54839232556557315767520321969641372561450,    .5778781038374717832957110609480812641084,
-.55064711795266219063194057525834068655950,    .5765765765765765765765765765765765765766,
-.55289683768667763352766542084282264113450,    .5752808988764044943820224719101123595506,
-.55514150754050151093110798683483153581600,    .5739910313901345291479820627802690582960,
-.55738115013400635344709144192165695130850,    .5727069351230425055928411633109619686801,
-.55961578793542265941596269840374588966350,    .5714285714285714285714285714285714285714,
-.56184544326269181269140062795486301183700,    .5701559020044543429844097995545657015590,
-.56407013828480290218436721261241473257550,    .5688888888888888888888888888888888888889,
-.56628989502311577464155334382667206227800,    .5676274944567627494456762749445676274945,
-.56850473535266865532378233183408156037350,    .5663716814159292035398230088495575221239,
-.57071468100347144680739575051120482385150,    .5651214128035320088300220750551876379691,
-.57291975356178548306473885531886480748650,    .5638766519823788546255506607929515418502,
-.57511997447138785144460371157038025558000,    .5626373626373626373626373626373626373626,
-.57731536503482350219940144597785547375700,    .5614035087719298245614035087719298245614,
-.57950594641464214795689713355386629700650,    .5601750547045951859956236323851203501094,
-.58169173963462239562716149521293118596100,    .5589519650655021834061135371179039301310,
-.58387276558098266665552955601015128195300,    .5577342047930283224400871459694989106754,
-.58604904500357812846544902640744112432000,    .5565217391304347826086956521739130434783,
-.58822059851708596855957011939608491957200,    .5553145336225596529284164859002169197397,
-.59038744660217634674381770309992134571100,    .5541125541125541125541125541125541125541,
-.59254960960667157898740242671919986605650,    .5529157667386609071274298056155507559395,
-.59470710774669277576265358220553025603300,    .5517241379310344827586206896551724137931,
-.59685996110779382384237123915227130055450,    .5505376344086021505376344086021505376344,
-.59900818964608337768851242799428291618800,    .5493562231759656652360515021459227467811,
-.60115181318933474940990890900138765573500,    .5481798715203426124197002141327623126338,
-.60329085143808425240052883964381180703650,    .5470085470085470085470085470085470085470,
-.60542532396671688843525771517306566238400,    .5458422174840085287846481876332622601279,
-.60755525022454170969155029524699784815300,    .5446808510638297872340425531914893617021,
-.60968064953685519036241657886421307921400,    .5435244161358811040339702760084925690021,
-.61180154110599282990534675263916142284850,    .5423728813559322033898305084745762711864,
-.61391794401237043121710712512140162289150,    .5412262156448202959830866807610993657505,
-.61602987721551394351138242200249806046500,    .5400843881856540084388185654008438818565,
-.61813735955507864705538167982012964785100,    .5389473684210526315789473684210526315789,
-.62024040975185745772080281312810257077200,    .5378151260504201680672268907563025210084,
-.62233904640877868441606324267922900617100,    .5366876310272536687631027253668763102725,
-.62443328801189346144440150965237990021700,    .5355648535564853556485355648535564853556,
-.62652315293135274476554741340805776417250,    .5344467640918580375782881002087682672234,
-.62860865942237409420556559780379757285100,    .5333333333333333333333333333333333333333,
-.63068982562619868570408243613201193511500,    .5322245322245322245322245322245322245322,
-.63276666957103777644277897707070223987100,    .5311203319502074688796680497925311203320,
-.63483920917301017716738442686619237065300,    .5300207039337474120082815734989648033126,
-.63690746223706917739093569252872839570050,    .5289256198347107438016528925619834710744,
-.63897144645792069983514238629140891134750,    .5278350515463917525773195876288659793814,
-.64103117942093124081992527862894348800200,    .5267489711934156378600823045267489711934,
-.64308667860302726193566513757104985415950,    .5256673511293634496919917864476386036961,
-.64513796137358470073053240412264131009600,    .5245901639344262295081967213114754098361,
-.64718504499530948859131740391603671014300,    .5235173824130879345603271983640081799591,
-.64922794662510974195157587018911726772800,    .5224489795918367346938775510204081632653,
-.65126668331495807251485530287027359008800,    .5213849287169042769857433808553971486762,
-.65330127201274557080523663898929953575150,    .5203252032520325203252032520325203252033,
-.65533172956312757406749369692988693714150,    .5192697768762677484787018255578093306288,
-.65735807270835999727154330685152672231200,    .5182186234817813765182186234817813765182,
-.65938031808912778153342060249997302889800,    .5171717171717171717171717171717171717172,
-.66139848224536490484126716182800009846700,    .5161290322580645161290322580645161290323,
-.66341258161706617713093692145776003599150,    .5150905432595573440643863179074446680080,
-.66542263254509037562201001492212526500250,    .5140562248995983935742971887550200803213,
-.66742865127195616370414654738851822912700,    .5130260521042084168336673346693386773547,
-.66943065394262923906154583164607174694550,    .5120000000000000000000000000000000000000,
-.67142865660530226534774556057527661323550,    .5109780439121756487025948103792415169661,
-.67342267521216669923234121597488410770900,    .5099601593625498007968127490039840637450,
-.67541272562017662384192817626171745359900,    .5089463220675944333996023856858846918489,
-.67739882359180603188519853574689477682100,    .5079365079365079365079365079365079365079,
-.67938098479579733801614338517538271844400,    .5069306930693069306930693069306930693069,
-.68135922480790300781450241629499942064300,    .5059288537549407114624505928853754940711,
-.68333355911162063645036823800182901322850,    .5049309664694280078895463510848126232742,
-.68530400309891936760919861626462079584600,    .5039370078740157480314960629921259842520,
-.68727057207096020619019327568821609020250,    .5029469548133595284872298624754420432220,
-.68923328123880889251040571252815425395950,    .5019607843137254901960784313725490196078,
-.69314718055994530941723212145818, 5.0e-01,
-};
-
-
-
-#define LOGTAB_TRANSLATE(x,h) (((x) - 1.)*icvLogTab[(h)+1])
-static const double ln_2 = 0.69314718055994530941723212145818;
-
-static void Log_32f( const float *_x, float *y, int n )
-{
-    static const float shift[] = { 0, -1.f/512 };
-    static const float
-        A0 = 0.3333333333333333333333333f,
-        A1 = -0.5f,
-        A2 = 1.f;
-
-    #undef LOGPOLY
-    #define LOGPOLY(x) (((A0*(x) + A1)*(x) + A2)*(x))
-
-    int i = 0;
-    Cv32suf buf[4];
-    const int* x = (const int*)_x;
-
-#if CV_SSE2
-    if( USE_SSE2 )
-    {
-        static const __m128d ln2_2 = _mm_set1_pd(ln_2);
-        static const __m128 _1_4 = _mm_set1_ps(1.f);
-        static const __m128 shift4 = _mm_set1_ps(-1.f/512);
-
-        static const __m128 mA0 = _mm_set1_ps(A0);
-        static const __m128 mA1 = _mm_set1_ps(A1);
-        static const __m128 mA2 = _mm_set1_ps(A2);
-
-        int CV_DECL_ALIGNED(16) idx[4];
-
-        for( ; i <= n - 4; i += 4 )
-        {
-            __m128i h0 = _mm_loadu_si128((const __m128i*)(x + i));
-            __m128i yi0 = _mm_sub_epi32(_mm_and_si128(_mm_srli_epi32(h0, 23), _mm_set1_epi32(255)), _mm_set1_epi32(127));
-            __m128d yd0 = _mm_mul_pd(_mm_cvtepi32_pd(yi0), ln2_2);
-            __m128d yd1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_unpackhi_epi64(yi0,yi0)), ln2_2);
-
-            __m128i xi0 = _mm_or_si128(_mm_and_si128(h0, _mm_set1_epi32(LOGTAB_MASK2_32F)), _mm_set1_epi32(127 << 23));
-
-            h0 = _mm_and_si128(_mm_srli_epi32(h0, 23 - LOGTAB_SCALE - 1), _mm_set1_epi32(LOGTAB_MASK*2));
-            _mm_store_si128((__m128i*)idx, h0);
-            h0 = _mm_cmpeq_epi32(h0, _mm_set1_epi32(510));
-
-            __m128d t0, t1, t2, t3, t4;
-            t0 = _mm_load_pd(icvLogTab + idx[0]);
-            t2 = _mm_load_pd(icvLogTab + idx[1]);
-            t1 = _mm_unpackhi_pd(t0, t2);
-            t0 = _mm_unpacklo_pd(t0, t2);
-            t2 = _mm_load_pd(icvLogTab + idx[2]);
-            t4 = _mm_load_pd(icvLogTab + idx[3]);
-            t3 = _mm_unpackhi_pd(t2, t4);
-            t2 = _mm_unpacklo_pd(t2, t4);
-
-            yd0 = _mm_add_pd(yd0, t0);
-            yd1 = _mm_add_pd(yd1, t2);
-
-            __m128 yf0 = _mm_movelh_ps(_mm_cvtpd_ps(yd0), _mm_cvtpd_ps(yd1));
-
-            __m128 xf0 = _mm_sub_ps(_mm_castsi128_ps(xi0), _1_4);
-            xf0 = _mm_mul_ps(xf0, _mm_movelh_ps(_mm_cvtpd_ps(t1), _mm_cvtpd_ps(t3)));
-            xf0 = _mm_add_ps(xf0, _mm_and_ps(_mm_castsi128_ps(h0), shift4));
-
-            __m128 zf0 = _mm_mul_ps(xf0, mA0);
-            zf0 = _mm_mul_ps(_mm_add_ps(zf0, mA1), xf0);
-            zf0 = _mm_mul_ps(_mm_add_ps(zf0, mA2), xf0);
-            yf0 = _mm_add_ps(yf0, zf0);
-
-            _mm_storeu_ps(y + i, yf0);
-        }
-    }
-    else
-#endif
-    for( ; i <= n - 4; i += 4 )
-    {
-        double x0, x1, x2, x3;
-        double y0, y1, y2, y3;
-        int h0, h1, h2, h3;
-
-        h0 = x[i];
-        h1 = x[i+1];
-        buf[0].i = (h0 & LOGTAB_MASK2_32F) | (127 << 23);
-        buf[1].i = (h1 & LOGTAB_MASK2_32F) | (127 << 23);
-
-        y0 = (((h0 >> 23) & 0xff) - 127) * ln_2;
-        y1 = (((h1 >> 23) & 0xff) - 127) * ln_2;
-
-        h0 = (h0 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
-        h1 = (h1 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
-
-        y0 += icvLogTab[h0];
-        y1 += icvLogTab[h1];
-
-        h2 = x[i+2];
-        h3 = x[i+3];
-
-        x0 = LOGTAB_TRANSLATE( buf[0].f, h0 );
-        x1 = LOGTAB_TRANSLATE( buf[1].f, h1 );
-
-        buf[2].i = (h2 & LOGTAB_MASK2_32F) | (127 << 23);
-        buf[3].i = (h3 & LOGTAB_MASK2_32F) | (127 << 23);
-
-        y2 = (((h2 >> 23) & 0xff) - 127) * ln_2;
-        y3 = (((h3 >> 23) & 0xff) - 127) * ln_2;
-
-        h2 = (h2 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
-        h3 = (h3 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
-
-        y2 += icvLogTab[h2];
-        y3 += icvLogTab[h3];
-
-        x2 = LOGTAB_TRANSLATE( buf[2].f, h2 );
-        x3 = LOGTAB_TRANSLATE( buf[3].f, h3 );
-
-        x0 += shift[h0 == 510];
-        x1 += shift[h1 == 510];
-        y0 += LOGPOLY( x0 );
-        y1 += LOGPOLY( x1 );
-
-        y[i] = (float) y0;
-        y[i + 1] = (float) y1;
-
-        x2 += shift[h2 == 510];
-        x3 += shift[h3 == 510];
-        y2 += LOGPOLY( x2 );
-        y3 += LOGPOLY( x3 );
-
-        y[i + 2] = (float) y2;
-        y[i + 3] = (float) y3;
-    }
-
-    for( ; i < n; i++ )
-    {
-        int h0 = x[i];
-        double y0;
-        float x0;
-
-        y0 = (((h0 >> 23) & 0xff) - 127) * ln_2;
-
-        buf[0].i = (h0 & LOGTAB_MASK2_32F) | (127 << 23);
-        h0 = (h0 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
-
-        y0 += icvLogTab[h0];
-        x0 = (float)LOGTAB_TRANSLATE( buf[0].f, h0 );
-        x0 += shift[h0 == 510];
-        y0 += LOGPOLY( x0 );
-
-        y[i] = (float)y0;
-    }
-}
-
-
-static void Log_64f( const double *x, double *y, int n )
-{
-    static const double shift[] = { 0, -1./512 };
-    static const double
-        A7 = 1.0,
-        A6 = -0.5,
-        A5 = 0.333333333333333314829616256247390992939472198486328125,
-        A4 = -0.25,
-        A3 = 0.2,
-        A2 = -0.1666666666666666574148081281236954964697360992431640625,
-        A1 = 0.1428571428571428769682682968777953647077083587646484375,
-        A0 = -0.125;
-
-    #undef LOGPOLY
-    #define LOGPOLY(x,k) ((x)+=shift[k], xq = (x)*(x),\
-        (((A0*xq + A2)*xq + A4)*xq + A6)*xq + \
-        (((A1*xq + A3)*xq + A5)*xq + A7)*(x))
-
-    int i = 0;
-    DBLINT buf[4];
-    DBLINT *X = (DBLINT *) x;
-
-#if CV_SSE2
-    if( USE_SSE2 )
-    {
-        static const __m128d ln2_2 = _mm_set1_pd(ln_2);
-        static const __m128d _1_2 = _mm_set1_pd(1.);
-        static const __m128d shift2 = _mm_set1_pd(-1./512);
-
-        static const __m128i log_and_mask2 = _mm_set_epi32(LOGTAB_MASK2, 0xffffffff, LOGTAB_MASK2, 0xffffffff);
-        static const __m128i log_or_mask2 = _mm_set_epi32(1023 << 20, 0, 1023 << 20, 0);
-
-        static const __m128d mA0 = _mm_set1_pd(A0);
-        static const __m128d mA1 = _mm_set1_pd(A1);
-        static const __m128d mA2 = _mm_set1_pd(A2);
-        static const __m128d mA3 = _mm_set1_pd(A3);
-        static const __m128d mA4 = _mm_set1_pd(A4);
-        static const __m128d mA5 = _mm_set1_pd(A5);
-        static const __m128d mA6 = _mm_set1_pd(A6);
-        static const __m128d mA7 = _mm_set1_pd(A7);
-
-        int CV_DECL_ALIGNED(16) idx[4];
-
-        for( ; i <= n - 4; i += 4 )
-        {
-            __m128i h0 = _mm_loadu_si128((const __m128i*)(x + i));
-            __m128i h1 = _mm_loadu_si128((const __m128i*)(x + i + 2));
-
-            __m128d xd0 = _mm_castsi128_pd(_mm_or_si128(_mm_and_si128(h0, log_and_mask2), log_or_mask2));
-            __m128d xd1 = _mm_castsi128_pd(_mm_or_si128(_mm_and_si128(h1, log_and_mask2), log_or_mask2));
-
-            h0 = _mm_unpackhi_epi32(_mm_unpacklo_epi32(h0, h1), _mm_unpackhi_epi32(h0, h1));
-
-            __m128i yi0 = _mm_sub_epi32(_mm_and_si128(_mm_srli_epi32(h0, 20),
-                                    _mm_set1_epi32(2047)), _mm_set1_epi32(1023));
-            __m128d yd0 = _mm_mul_pd(_mm_cvtepi32_pd(yi0), ln2_2);
-            __m128d yd1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_unpackhi_epi64(yi0, yi0)), ln2_2);
-
-            h0 = _mm_and_si128(_mm_srli_epi32(h0, 20 - LOGTAB_SCALE - 1), _mm_set1_epi32(LOGTAB_MASK * 2));
-            _mm_store_si128((__m128i*)idx, h0);
-            h0 = _mm_cmpeq_epi32(h0, _mm_set1_epi32(510));
-
-            __m128d t0, t1, t2, t3, t4;
-            t0 = _mm_load_pd(icvLogTab + idx[0]);
-            t2 = _mm_load_pd(icvLogTab + idx[1]);
-            t1 = _mm_unpackhi_pd(t0, t2);
-            t0 = _mm_unpacklo_pd(t0, t2);
-            t2 = _mm_load_pd(icvLogTab + idx[2]);
-            t4 = _mm_load_pd(icvLogTab + idx[3]);
-            t3 = _mm_unpackhi_pd(t2, t4);
-            t2 = _mm_unpacklo_pd(t2, t4);
-
-            yd0 = _mm_add_pd(yd0, t0);
-            yd1 = _mm_add_pd(yd1, t2);
-
-            xd0 = _mm_mul_pd(_mm_sub_pd(xd0, _1_2), t1);
-            xd1 = _mm_mul_pd(_mm_sub_pd(xd1, _1_2), t3);
-
-            xd0 = _mm_add_pd(xd0, _mm_and_pd(_mm_castsi128_pd(_mm_unpacklo_epi32(h0, h0)), shift2));
-            xd1 = _mm_add_pd(xd1, _mm_and_pd(_mm_castsi128_pd(_mm_unpackhi_epi32(h0, h0)), shift2));
-
-            __m128d zd0 = _mm_mul_pd(xd0, mA0);
-            __m128d zd1 = _mm_mul_pd(xd1, mA0);
-            zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA1), xd0);
-            zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA1), xd1);
-            zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA2), xd0);
-            zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA2), xd1);
-            zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA3), xd0);
-            zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA3), xd1);
-            zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA4), xd0);
-            zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA4), xd1);
-            zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA5), xd0);
-            zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA5), xd1);
-            zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA6), xd0);
-            zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA6), xd1);
-            zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA7), xd0);
-            zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA7), xd1);
-
-            yd0 = _mm_add_pd(yd0, zd0);
-            yd1 = _mm_add_pd(yd1, zd1);
-
-            _mm_storeu_pd(y + i, yd0);
-            _mm_storeu_pd(y + i + 2, yd1);
-        }
-    }
-    else
-#endif
-    for( ; i <= n - 4; i += 4 )
-    {
-        double xq;
-        double x0, x1, x2, x3;
-        double y0, y1, y2, y3;
-        int h0, h1, h2, h3;
-
-        h0 = X[i].i.lo;
-        h1 = X[i + 1].i.lo;
-        buf[0].i.lo = h0;
-        buf[1].i.lo = h1;
-
-        h0 = X[i].i.hi;
-        h1 = X[i + 1].i.hi;
-        buf[0].i.hi = (h0 & LOGTAB_MASK2) | (1023 << 20);
-        buf[1].i.hi = (h1 & LOGTAB_MASK2) | (1023 << 20);
-
-        y0 = (((h0 >> 20) & 0x7ff) - 1023) * ln_2;
-        y1 = (((h1 >> 20) & 0x7ff) - 1023) * ln_2;
-
-        h2 = X[i + 2].i.lo;
-        h3 = X[i + 3].i.lo;
-        buf[2].i.lo = h2;
-        buf[3].i.lo = h3;
-
-        h0 = (h0 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
-        h1 = (h1 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
-
-        y0 += icvLogTab[h0];
-        y1 += icvLogTab[h1];
-
-        h2 = X[i + 2].i.hi;
-        h3 = X[i + 3].i.hi;
-
-        x0 = LOGTAB_TRANSLATE( buf[0].d, h0 );
-        x1 = LOGTAB_TRANSLATE( buf[1].d, h1 );
-
-        buf[2].i.hi = (h2 & LOGTAB_MASK2) | (1023 << 20);
-        buf[3].i.hi = (h3 & LOGTAB_MASK2) | (1023 << 20);
-
-        y2 = (((h2 >> 20) & 0x7ff) - 1023) * ln_2;
-        y3 = (((h3 >> 20) & 0x7ff) - 1023) * ln_2;
-
-        h2 = (h2 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
-        h3 = (h3 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
-
-        y2 += icvLogTab[h2];
-        y3 += icvLogTab[h3];
-
-        x2 = LOGTAB_TRANSLATE( buf[2].d, h2 );
-        x3 = LOGTAB_TRANSLATE( buf[3].d, h3 );
-
-        y0 += LOGPOLY( x0, h0 == 510 );
-        y1 += LOGPOLY( x1, h1 == 510 );
-
-        y[i] = y0;
-        y[i + 1] = y1;
-
-        y2 += LOGPOLY( x2, h2 == 510 );
-        y3 += LOGPOLY( x3, h3 == 510 );
-
-        y[i + 2] = y2;
-        y[i + 3] = y3;
-    }
-
-    for( ; i < n; i++ )
-    {
-        int h0 = X[i].i.hi;
-        double xq;
-        double x0, y0 = (((h0 >> 20) & 0x7ff) - 1023) * ln_2;
-
-        buf[0].i.hi = (h0 & LOGTAB_MASK2) | (1023 << 20);
-        buf[0].i.lo = X[i].i.lo;
-        h0 = (h0 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
-
-        y0 += icvLogTab[h0];
-        x0 = LOGTAB_TRANSLATE( buf[0].d, h0 );
-        y0 += LOGPOLY( x0, h0 == 510 );
-        y[i] = y0;
-    }
-}
-
 #ifdef HAVE_IPP
 static void Log_32f_ipp(const float *x, float *y, int n)
 {
@@ -2256,7 +826,7 @@ static void Log_32f_ipp(const float *x, float *y, int n)
         }
         setIppErrorStatus();
     }
-    Log_32f(x, y, n);
+    hal::log(x, y, n);
 }
 
 static void Log_64f_ipp(const double *x, double *y, int n)
@@ -2270,11 +840,14 @@ static void Log_64f_ipp(const double *x, double *y, int n)
         }
         setIppErrorStatus();
     }
-    Log_64f(x, y, n);
+    hal::log(x, y, n);
 }
 
 #define Log_32f Log_32f_ipp
 #define Log_64f Log_64f_ipp
+#else
+#define Log_32f hal::log
+#define Log_64f hal::log
 #endif
 
 void log( InputArray _src, OutputArray _dst )
@@ -2651,6 +1224,11 @@ static bool ocl_pow(InputArray _src, double power, OutputArray _dst,
 
 #endif
 
+static void InvSqrt_32f(const float* src, float* dst, int n) { hal::invSqrt(src, dst, n); } // file-local forwarder: float arguments to hal::invSqrt
+static void InvSqrt_64f(const double* src, double* dst, int n) { hal::invSqrt(src, dst, n); } // file-local forwarder: double arguments to hal::invSqrt
+static void Sqrt_32f(const float* src, float* dst, int n) { hal::sqrt(src, dst, n); } // file-local forwarder: float arguments to hal::sqrt
+static void Sqrt_64f(const double* src, double* dst, int n) { hal::sqrt(src, dst, n); } // file-local forwarder: double arguments to hal::sqrt; NOTE(review): presumably these keep the old per-type names usable in pow() below - confirm
+
 void pow( InputArray _src, double power, OutputArray _dst )
 {
     int type = _src.type(), depth = CV_MAT_DEPTH(type),
@@ -3085,27 +1663,6 @@ void patchNaNs( InputOutputArray _a, double _val )
     }
 }
 
-
-void exp(const float* src, float* dst, int n)
-{
-    Exp_32f(src, dst, n);
-}
-
-void log(const float* src, float* dst, int n)
-{
-    Log_32f(src, dst, n);
-}
-
-void fastAtan2(const float* y, const float* x, float* dst, int n, bool angleInDegrees)
-{
-    FastAtan2_32f(y, x, dst, n, angleInDegrees);
-}
-
-void magnitude(const float* x, const float* y, float* dst, int n)
-{
-    Magnitude_32f(x, y, dst, n);
-}
-
 }
 
 CV_IMPL float cvCbrt(float value) { return cv::cubeRoot(value); }
diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp
index c1d2c34852..65b93890ec 100644
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -1113,7 +1113,7 @@ void scalarToRawData(const Scalar& s, void* _buf, int type, int unroll_to)
                                         Input/Output Array
 \*************************************************************************************************/
 
-Mat _InputArray::getMat(int i) const
+Mat _InputArray::getMat_(int i) const
 {
     int k = kind();
     int accessFlags = flags & ACCESS_MASK;
diff --git a/modules/core/src/out.cpp b/modules/core/src/out.cpp
index 89919715ec..343fd0a0e1 100644
--- a/modules/core/src/out.cpp
+++ b/modules/core/src/out.cpp
@@ -43,9 +43,9 @@
 
 #include "precomp.hpp"
 
-namespace
+namespace cv
 {
-    class FormattedImpl : public cv::Formatted
+    class FormattedImpl : public Formatted
     {
         enum { STATE_PROLOGUE, STATE_EPILOGUE, STATE_INTERLUDE,
                STATE_ROW_OPEN, STATE_ROW_CLOSE, STATE_CN_OPEN, STATE_CN_CLOSE, STATE_VALUE, STATE_FINISHED,
@@ -55,7 +55,7 @@ namespace
         char floatFormat[8];
         char buf[32];   // enough for double with precision up to 20
 
-        cv::Mat mtx;
+        Mat mtx;
         int mcn; // == mtx.channels()
         bool singleLine;
         bool alignOrder;    // true when cn first order
@@ -65,8 +65,8 @@ namespace
         int col;
         int cn;
 
-        cv::String prologue;
-        cv::String epilogue;
+        String prologue;
+        String epilogue;
         char braces[5];
 
         void (FormattedImpl::*valueToStr)();
@@ -81,7 +81,7 @@ namespace
 
     public:
 
-        FormattedImpl(cv::String pl, cv::String el, cv::Mat m, char br[5], bool sLine, bool aOrder, int precision)
+        FormattedImpl(String pl, String el, Mat m, char br[5], bool sLine, bool aOrder, int precision)
         {
             CV_Assert(m.dims <= 2);
 
@@ -253,7 +253,7 @@ namespace
         }
     };
 
-    class FormatterBase : public cv::Formatter
+    class FormatterBase : public Formatter
     {
     public:
         FormatterBase() : prec32f(8), prec64f(16), multiline(true) {}
@@ -278,14 +278,15 @@ namespace
         int prec64f;
         int multiline;
     };
+
     class DefaultFormatter : public FormatterBase
     {
     public:
 
-        cv::Ptr<cv::Formatted> format(const cv::Mat& mtx) const
+        Ptr<Formatted> format(const Mat& mtx) const
         {
             char braces[5] = {'\0', '\0', ';', '\0', '\0'};
-            return cv::makePtr<FormattedImpl>("[", "]", mtx, &*braces,
+            return makePtr<FormattedImpl>("[", "]", mtx, &*braces,
                 mtx.rows == 1 || !multiline, false, mtx.depth() == CV_64F ? prec64f : prec32f );
         }
     };
@@ -294,10 +295,10 @@ namespace
     {
     public:
 
-        cv::Ptr<cv::Formatted> format(const cv::Mat& mtx) const
+        Ptr<Formatted> format(const Mat& mtx) const
         {
             char braces[5] = {'\0', '\0', ';', '\0', '\0'};
-            return cv::makePtr<FormattedImpl>("", "", mtx, &*braces,
+            return makePtr<FormattedImpl>("", "", mtx, &*braces,
                 mtx.rows == 1 || !multiline, true, mtx.depth() == CV_64F ? prec64f : prec32f );
         }
     };
@@ -306,12 +307,12 @@ namespace
     {
     public:
 
-        cv::Ptr<cv::Formatted> format(const cv::Mat& mtx) const
+        Ptr<Formatted> format(const Mat& mtx) const
         {
-            char braces[5] = {'[', ']', '\0', '[', ']'};
+            char braces[5] = {'[', ']', ',', '[', ']'};
             if (mtx.cols == 1)
                 braces[0] = braces[1] = '\0';
-            return cv::makePtr<FormattedImpl>("[", "]", mtx, &*braces,
+            return makePtr<FormattedImpl>("[", "]", mtx, &*braces,
                 mtx.rows == 1 || !multiline, false, mtx.depth() == CV_64F ? prec64f : prec32f );
         }
     };
@@ -320,17 +321,17 @@ namespace
     {
     public:
 
-        cv::Ptr<cv::Formatted> format(const cv::Mat& mtx) const
+        Ptr<Formatted> format(const Mat& mtx) const
         {
             static const char* numpyTypes[] =
             {
                 "uint8", "int8", "uint16", "int16", "int32", "float32", "float64", "uint64"
             };
-            char braces[5] = {'[', ']', '\0', '[', ']'};
+            char braces[5] = {'[', ']', ',', '[', ']'};
             if (mtx.cols == 1)
                 braces[0] = braces[1] = '\0';
-            return cv::makePtr<FormattedImpl>("array([",
-                cv::format("], type='%s')", numpyTypes[mtx.depth()]), mtx, &*braces,
+            return makePtr<FormattedImpl>("array([",
+                cv::format("], dtype='%s')", numpyTypes[mtx.depth()]), mtx, &*braces,
                 mtx.rows == 1 || !multiline, false, mtx.depth() == CV_64F ? prec64f : prec32f );
         }
     };
@@ -339,11 +340,11 @@ namespace
     {
     public:
 
-        cv::Ptr<cv::Formatted> format(const cv::Mat& mtx) const
+        Ptr<Formatted> format(const Mat& mtx) const
         {
             char braces[5] = {'\0', '\0', '\0', '\0', '\0'};
-            return cv::makePtr<FormattedImpl>(cv::String(),
-                mtx.rows > 1 ? cv::String("\n") : cv::String(), mtx, &*braces,
+            return makePtr<FormattedImpl>(String(),
+                mtx.rows > 1 ? String("\n") : String(), mtx, &*braces,
                 mtx.rows == 1 || !multiline, false, mtx.depth() == CV_64F ? prec64f : prec32f );
         }
     };
@@ -352,19 +353,14 @@ namespace
     {
     public:
 
-        cv::Ptr<cv::Formatted> format(const cv::Mat& mtx) const
+        Ptr<Formatted> format(const Mat& mtx) const
         {
             char braces[5] = {'\0', '\0', ',', '\0', '\0'};
-            return cv::makePtr<FormattedImpl>("{", "}", mtx, &*braces,
+            return makePtr<FormattedImpl>("{", "}", mtx, &*braces,
                 mtx.rows == 1 || !multiline, false, mtx.depth() == CV_64F ? prec64f : prec32f );
         }
     };
 
-} // namespace
-
-
-namespace cv
-{
     Formatted::~Formatted() {}
     Formatter::~Formatter() {}
 
diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp
index fa61203710..e5007e5d17 100644
--- a/modules/core/src/precomp.hpp
+++ b/modules/core/src/precomp.hpp
@@ -55,6 +55,8 @@
 #include "opencv2/core/private.cuda.hpp"
 #include "opencv2/core/ocl.hpp"
 
+#include "opencv2/hal.hpp"
+
 #include <assert.h>
 #include <ctype.h>
 #include <float.h>
diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp
index 41218904f8..e43df94448 100644
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -2416,274 +2416,6 @@ void cv::minMaxLoc( InputArray _img, double* minVal, double* maxVal,
 namespace cv
 {
 
-float normL2Sqr_(const float* a, const float* b, int n)
-{
-    int j = 0; float d = 0.f;
-#if CV_SSE
-    if( USE_SSE2 )
-    {
-        float CV_DECL_ALIGNED(16) buf[4];
-        __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
-
-        for( ; j <= n - 8; j += 8 )
-        {
-            __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
-            __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
-            d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0));
-            d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1));
-        }
-        _mm_store_ps(buf, _mm_add_ps(d0, d1));
-        d = buf[0] + buf[1] + buf[2] + buf[3];
-    }
-    else
-#endif
-    {
-        for( ; j <= n - 4; j += 4 )
-        {
-            float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
-            d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
-        }
-    }
-
-    for( ; j < n; j++ )
-    {
-        float t = a[j] - b[j];
-        d += t*t;
-    }
-    return d;
-}
-
-
-float normL1_(const float* a, const float* b, int n)
-{
-    int j = 0; float d = 0.f;
-#if CV_SSE
-    if( USE_SSE2 )
-    {
-        float CV_DECL_ALIGNED(16) buf[4];
-        static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
-        __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
-        __m128 absmask = _mm_load_ps((const float*)absbuf);
-
-        for( ; j <= n - 8; j += 8 )
-        {
-            __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
-            __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
-            d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask));
-            d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask));
-        }
-        _mm_store_ps(buf, _mm_add_ps(d0, d1));
-        d = buf[0] + buf[1] + buf[2] + buf[3];
-    }
-    else
-#elif CV_NEON
-    float32x4_t v_sum = vdupq_n_f32(0.0f);
-    for ( ; j <= n - 4; j += 4)
-        v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + j), vld1q_f32(b + j)));
-
-    float CV_DECL_ALIGNED(16) buf[4];
-    vst1q_f32(buf, v_sum);
-    d = buf[0] + buf[1] + buf[2] + buf[3];
-#endif
-    {
-        for( ; j <= n - 4; j += 4 )
-        {
-            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
-                    std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
-        }
-    }
-
-    for( ; j < n; j++ )
-        d += std::abs(a[j] - b[j]);
-    return d;
-}
-
-int normL1_(const uchar* a, const uchar* b, int n)
-{
-    int j = 0, d = 0;
-#if CV_SSE
-    if( USE_SSE2 )
-    {
-        __m128i d0 = _mm_setzero_si128();
-
-        for( ; j <= n - 16; j += 16 )
-        {
-            __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
-            __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));
-
-            d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
-        }
-
-        for( ; j <= n - 4; j += 4 )
-        {
-            __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
-            __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));
-
-            d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
-        }
-        d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
-    }
-    else
-#elif CV_NEON
-    uint32x4_t v_sum = vdupq_n_u32(0.0f);
-    for ( ; j <= n - 16; j += 16)
-    {
-        uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
-        uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst));
-        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high)));
-        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));
-    }
-
-    uint CV_DECL_ALIGNED(16) buf[4];
-    vst1q_u32(buf, v_sum);
-    d = buf[0] + buf[1] + buf[2] + buf[3];
-#endif
-    {
-        for( ; j <= n - 4; j += 4 )
-        {
-            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
-                    std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
-        }
-    }
-    for( ; j < n; j++ )
-        d += std::abs(a[j] - b[j]);
-    return d;
-}
-
-static const uchar popCountTable[] =
-{
-    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
-};
-
-static const uchar popCountTable2[] =
-{
-    0, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
-    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
-    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
-    2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
-    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
-    2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
-    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
-    2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4
-};
-
-static const uchar popCountTable4[] =
-{
-    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
-};
-
-static int normHamming(const uchar* a, int n)
-{
-    int i = 0, result = 0;
-#if CV_NEON
-    {
-        uint32x4_t bits = vmovq_n_u32(0);
-        for (; i <= n - 16; i += 16) {
-            uint8x16_t A_vec = vld1q_u8 (a + i);
-            uint8x16_t bitsSet = vcntq_u8 (A_vec);
-            uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
-            uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
-            bits = vaddq_u32(bits, bitSet4);
-        }
-        uint64x2_t bitSet2 = vpaddlq_u32 (bits);
-        result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
-        result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
-    }
-#endif
-        for( ; i <= n - 4; i += 4 )
-            result += popCountTable[a[i]] + popCountTable[a[i+1]] +
-            popCountTable[a[i+2]] + popCountTable[a[i+3]];
-    for( ; i < n; i++ )
-        result += popCountTable[a[i]];
-    return result;
-}
-
-int normHamming(const uchar* a, const uchar* b, int n)
-{
-    int i = 0, result = 0;
-#if CV_NEON
-    {
-        uint32x4_t bits = vmovq_n_u32(0);
-        for (; i <= n - 16; i += 16) {
-            uint8x16_t A_vec = vld1q_u8 (a + i);
-            uint8x16_t B_vec = vld1q_u8 (b + i);
-            uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
-            uint8x16_t bitsSet = vcntq_u8 (AxorB);
-            uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
-            uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
-            bits = vaddq_u32(bits, bitSet4);
-        }
-        uint64x2_t bitSet2 = vpaddlq_u32 (bits);
-        result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
-        result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
-    }
-#endif
-        for( ; i <= n - 4; i += 4 )
-            result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +
-                    popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
-    for( ; i < n; i++ )
-        result += popCountTable[a[i] ^ b[i]];
-    return result;
-}
-
-static int normHamming(const uchar* a, int n, int cellSize)
-{
-    if( cellSize == 1 )
-        return normHamming(a, n);
-    const uchar* tab = 0;
-    if( cellSize == 2 )
-        tab = popCountTable2;
-    else if( cellSize == 4 )
-        tab = popCountTable4;
-    else
-        CV_Error( CV_StsBadSize, "bad cell size (not 1, 2 or 4) in normHamming" );
-    int i = 0, result = 0;
-#if CV_ENABLE_UNROLLED
-    for( ; i <= n - 4; i += 4 )
-        result += tab[a[i]] + tab[a[i+1]] + tab[a[i+2]] + tab[a[i+3]];
-#endif
-    for( ; i < n; i++ )
-        result += tab[a[i]];
-    return result;
-}
-
-int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
-{
-    if( cellSize == 1 )
-        return normHamming(a, b, n);
-    const uchar* tab = 0;
-    if( cellSize == 2 )
-        tab = popCountTable2;
-    else if( cellSize == 4 )
-        tab = popCountTable4;
-    else
-        CV_Error( CV_StsBadSize, "bad cell size (not 1, 2 or 4) in normHamming" );
-    int i = 0, result = 0;
-    #if CV_ENABLE_UNROLLED
-    for( ; i <= n - 4; i += 4 )
-        result += tab[a[i] ^ b[i]] + tab[a[i+1] ^ b[i+1]] +
-                tab[a[i+2] ^ b[i+2]] + tab[a[i+3] ^ b[i+3]];
-    #endif
-    for( ; i < n; i++ )
-        result += tab[a[i] ^ b[i]];
-    return result;
-}
-
-
 template<typename T, typename ST> int
 normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn)
 {
@@ -2698,7 +2430,7 @@ normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn)
             if( mask[i] )
             {
                 for( int k = 0; k < cn; k++ )
-                    result = std::max(result, ST(std::abs(src[k])));
+                    result = std::max(result, ST(cv_abs(src[k])));
             }
     }
     *_result = result;
@@ -2719,7 +2451,7 @@ normL1_(const T* src, const uchar* mask, ST* _result, int len, int cn)
             if( mask[i] )
             {
                 for( int k = 0; k < cn; k++ )
-                    result += std::abs(src[k]);
+                    result += cv_abs(src[k]);
             }
     }
     *_result = result;
@@ -2816,6 +2548,10 @@ normDiffL2_(const T* src1, const T* src2, const uchar* mask, ST* _result, int le
     return 0;
 }
 
+Hamming::ResultType Hamming::operator()( const unsigned char* a, const unsigned char* b, int size ) const // out-of-line definition of the Hamming distance functor call
+{
+    return cv::hal::normHamming(a, b, size); // forwards all three arguments unchanged to the HAL routine and returns its result
+}
 
 #define CV_DEF_NORM_FUNC(L, suffix, type, ntype) \
     static int norm##L##_##suffix(const type* src, const uchar* mask, ntype* r, int len, int cn) \
@@ -3164,10 +2900,14 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
                 const uchar* data = src.ptr<uchar>();
 
                 if( normType == NORM_HAMMING )
-                    return normHamming(data, (int)len);
+                {
+                    return hal::normHamming(data, (int)len);
+                }
 
                 if( normType == NORM_HAMMING2 )
-                    return normHamming(data, (int)len, 2);
+                {
+                    return hal::normHamming(data, (int)len, 2);
+                }
             }
         }
     }
@@ -3191,7 +2931,9 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
         int result = 0;
 
         for( size_t i = 0; i < it.nplanes; i++, ++it )
-            result += normHamming(ptrs[0], total, cellSize);
+        {
+            result += hal::normHamming(ptrs[0], total, cellSize);
+        }
 
         return result;
     }
@@ -3673,7 +3415,9 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
         int result = 0;
 
         for( size_t i = 0; i < it.nplanes; i++, ++it )
-            result += normHamming(ptrs[0], ptrs[1], total, cellSize);
+        {
+            result += hal::normHamming(ptrs[0], ptrs[1], total, cellSize);
+        }
 
         return result;
     }
@@ -3810,13 +3554,18 @@ static void batchDistHamming(const uchar* src1, const uchar* src2, size_t step2,
     if( !mask )
     {
         for( int i = 0; i < nvecs; i++ )
-            dist[i] = normHamming(src1, src2 + step2*i, len);
+            dist[i] = hal::normHamming(src1, src2 + step2*i, len);
     }
     else
     {
         int val0 = INT_MAX;
         for( int i = 0; i < nvecs; i++ )
-            dist[i] = mask[i] ? normHamming(src1, src2 + step2*i, len) : val0;
+        {
+            if (mask[i])
+                dist[i] = hal::normHamming(src1, src2 + step2*i, len);
+            else
+                dist[i] = val0; // entry disabled by the mask: store INT_MAX instead of a distance
+        }
     }
 }
 
@@ -3827,13 +3576,18 @@ static void batchDistHamming2(const uchar* src1, const uchar* src2, size_t step2
     if( !mask )
     {
         for( int i = 0; i < nvecs; i++ )
-            dist[i] = normHamming(src1, src2 + step2*i, len, 2);
+            dist[i] = hal::normHamming(src1, src2 + step2*i, len, 2);
     }
     else
     {
         int val0 = INT_MAX;
         for( int i = 0; i < nvecs; i++ )
-            dist[i] = mask[i] ? normHamming(src1, src2 + step2*i, len, 2) : val0;
+        {
+            if (mask[i])
+                dist[i] = hal::normHamming(src1, src2 + step2*i, len, 2);
+            else
+                dist[i] = val0; // entry disabled by the mask: store INT_MAX instead of a distance
+        }
     }
 }
 
diff --git a/modules/core/test/test_io.cpp b/modules/core/test/test_io.cpp
index 37bff62151..0c401f8ebd 100644
--- a/modules/core/test/test_io.cpp
+++ b/modules/core/test/test_io.cpp
@@ -144,7 +144,11 @@ protected:
 
             depth = cvtest::randInt(rng) % (CV_64F+1);
             cn = cvtest::randInt(rng) % 4 + 1;
-            int sz[] = {cvtest::randInt(rng)%10+1, cvtest::randInt(rng)%10+1, cvtest::randInt(rng)%10+1};
+            int sz[] = {
+                static_cast<int>(cvtest::randInt(rng)%10+1),
+                static_cast<int>(cvtest::randInt(rng)%10+1),
+                static_cast<int>(cvtest::randInt(rng)%10+1),
+            };
             MatND test_mat_nd(3, sz, CV_MAKETYPE(depth, cn));
 
             rng0.fill(test_mat_nd, CV_RAND_UNI, Scalar::all(ranges[depth][0]), Scalar::all(ranges[depth][1]));
@@ -156,8 +160,12 @@ protected:
                 multiply(test_mat_nd, test_mat_scale, test_mat_nd);
             }
 
-            int ssz[] = {cvtest::randInt(rng)%10+1, cvtest::randInt(rng)%10+1,
-                cvtest::randInt(rng)%10+1,cvtest::randInt(rng)%10+1};
+            int ssz[] = {
+                static_cast<int>(cvtest::randInt(rng)%10+1),
+                static_cast<int>(cvtest::randInt(rng)%10+1),
+                static_cast<int>(cvtest::randInt(rng)%10+1),
+                static_cast<int>(cvtest::randInt(rng)%10+1),
+            };
             SparseMat test_sparse_mat = cvTsGetRandomSparseMat(4, ssz, cvtest::randInt(rng)%(CV_64F+1),
                                                                cvtest::randInt(rng) % 10000, 0, 100, rng);
 
diff --git a/modules/cudawarping/perf/perf_warping.cpp b/modules/cudawarping/perf/perf_warping.cpp
index 36662418c3..6ce547e60e 100644
--- a/modules/cudawarping/perf/perf_warping.cpp
+++ b/modules/cudawarping/perf/perf_warping.cpp
@@ -253,7 +253,7 @@ PERF_TEST_P(Sz_Depth_Cn_Inter_Border, WarpAffine,
     const double aplha = CV_PI / 4;
     const double mat[2 * 3] =
     {
-        std::cos(aplha), -std::sin(aplha), src.cols / 2,
+        std::cos(aplha), -std::sin(aplha), static_cast<double>(src.cols) / 2.0,
         std::sin(aplha),  std::cos(aplha), 0
     };
     const cv::Mat M(2, 3, CV_64F, (void*) mat);
@@ -301,7 +301,7 @@ PERF_TEST_P(Sz_Depth_Cn_Inter_Border, WarpPerspective,
     declare.in(src, WARMUP_RNG);
 
     const double aplha = CV_PI / 4;
-    double mat[3][3] = { {std::cos(aplha), -std::sin(aplha), src.cols / 2},
+    double mat[3][3] = { {std::cos(aplha), -std::sin(aplha), static_cast<double>(src.cols) / 2.0},
                          {std::sin(aplha),  std::cos(aplha), 0},
                          {0.0,              0.0,             1.0}};
     const cv::Mat M(3, 3, CV_64F, (void*) mat);
diff --git a/modules/features2d/include/opencv2/features2d.hpp b/modules/features2d/include/opencv2/features2d.hpp
index f942c23d66..cf95e7d343 100644
--- a/modules/features2d/include/opencv2/features2d.hpp
+++ b/modules/features2d/include/opencv2/features2d.hpp
@@ -730,38 +730,6 @@ struct CV_EXPORTS L1
     }
 };
 
-/*
- * Hamming distance functor - counts the bit differences between two strings - useful for the Brief descriptor
- * bit count of A exclusive XOR'ed with B
- */
-struct CV_EXPORTS Hamming
-{
-    enum { normType = NORM_HAMMING };
-    typedef unsigned char ValueType;
-    typedef int ResultType;
-
-    /** this will count the bits in a ^ b
-     */
-    ResultType operator()( const unsigned char* a, const unsigned char* b, int size ) const
-    {
-        return normHamming(a, b, size);
-    }
-};
-
-typedef Hamming HammingLUT;
-
-template<int cellsize> struct HammingMultilevel
-{
-    enum { normType = NORM_HAMMING + (cellsize>1) };
-    typedef unsigned char ValueType;
-    typedef int ResultType;
-
-    ResultType operator()( const unsigned char* a, const unsigned char* b, int size ) const
-    {
-        return normHamming(a, b, size, cellsize);
-    }
-};
-
 /****************************************************************************************\
 *                                  DescriptorMatcher                                     *
 \****************************************************************************************/
diff --git a/modules/features2d/src/kaze/AKAZEFeatures.cpp b/modules/features2d/src/kaze/AKAZEFeatures.cpp
index fd15345b29..d12656e994 100644
--- a/modules/features2d/src/kaze/AKAZEFeatures.cpp
+++ b/modules/features2d/src/kaze/AKAZEFeatures.cpp
@@ -812,7 +812,7 @@ void AKAZEFeatures::Compute_Main_Orientation(KeyPoint& kpt, const std::vector<TE
       }
     }
   }
-  fastAtan2(resY, resX, Ang, ang_size, false);
+  hal::fastAtan2(resY, resX, Ang, ang_size, false);
   // Loop slides pi/3 window around feature point
   for (ang1 = 0; ang1 < (float)(2.0 * CV_PI); ang1 += 0.15f) {
     ang2 = (ang1 + (float)(CV_PI / 3.0) >(float)(2.0*CV_PI) ? ang1 - (float)(5.0*CV_PI / 3.0) : ang1 + (float)(CV_PI / 3.0));
diff --git a/modules/flann/include/opencv2/flann/miniflann.hpp b/modules/flann/include/opencv2/flann/miniflann.hpp
index f2acc23bff..02fa236d3a 100644
--- a/modules/flann/include/opencv2/flann/miniflann.hpp
+++ b/modules/flann/include/opencv2/flann/miniflann.hpp
@@ -89,13 +89,13 @@ struct CV_EXPORTS LinearIndexParams : public IndexParams
 struct CV_EXPORTS CompositeIndexParams : public IndexParams
 {
     CompositeIndexParams(int trees = 4, int branching = 32, int iterations = 11,
-                         cvflann::flann_centers_init_t centers_init = cvflann::FLANN_CENTERS_RANDOM, float cb_index = 0.2 );
+                         cvflann::flann_centers_init_t centers_init = cvflann::FLANN_CENTERS_RANDOM, float cb_index = 0.2f );
 };
 
 struct CV_EXPORTS AutotunedIndexParams : public IndexParams
 {
-    AutotunedIndexParams(float target_precision = 0.8, float build_weight = 0.01,
-                         float memory_weight = 0, float sample_fraction = 0.1);
+    AutotunedIndexParams(float target_precision = 0.8f, float build_weight = 0.01f,
+                         float memory_weight = 0, float sample_fraction = 0.1f);
 };
 
 struct CV_EXPORTS HierarchicalClusteringIndexParams : public IndexParams
@@ -107,7 +107,7 @@ struct CV_EXPORTS HierarchicalClusteringIndexParams : public IndexParams
 struct CV_EXPORTS KMeansIndexParams : public IndexParams
 {
     KMeansIndexParams(int branching = 32, int iterations = 11,
-                      cvflann::flann_centers_init_t centers_init = cvflann::FLANN_CENTERS_RANDOM, float cb_index = 0.2 );
+                      cvflann::flann_centers_init_t centers_init = cvflann::FLANN_CENTERS_RANDOM, float cb_index = 0.2f );
 };
 
 struct CV_EXPORTS LshIndexParams : public IndexParams
diff --git a/modules/hal/CMakeLists.txt b/modules/hal/CMakeLists.txt
new file mode 100644
index 0000000000..b5b2abb81e
--- /dev/null
+++ b/modules/hal/CMakeLists.txt
@@ -0,0 +1,12 @@
+set(the_description "The Hardware Acceleration Layer (HAL) module")
+
+set(OPENCV_MODULE_TYPE STATIC)  # module type is forced to STATIC for hal
+# set(OPENCV_MODULE_IS_PART_OF_WORLD FALSE)
+
+if(UNIX)
+  if(CMAKE_COMPILER_IS_GNUCXX OR CV_ICC)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")  # emit position-independent code; NOTE(review): presumably needed so this static library can be linked into shared OpenCV libraries - confirm
+  endif()
+endif()
+
+ocv_define_module(hal)  # registers the module under the name "hal"
diff --git a/modules/hal/include/opencv2/hal.hpp b/modules/hal/include/opencv2/hal.hpp
new file mode 100644
index 0000000000..95d1ac66c3
--- /dev/null
+++ b/modules/hal/include/opencv2/hal.hpp
@@ -0,0 +1,98 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_HAL_HPP__
+#define __OPENCV_HAL_HPP__
+
+#include "opencv2/hal/defs.h"
+
+/**
+  @defgroup hal Hardware Acceleration Layer
+*/
+
+namespace cv { namespace hal {
+
+namespace Error {
+
+enum
+{
+    Ok = 0,
+    Unknown = -1
+};
+
+}
+
+int normHamming(const uchar* a, int n);
+int normHamming(const uchar* a, const uchar* b, int n);
+
+int normHamming(const uchar* a, int n, int cellSize);
+int normHamming(const uchar* a, const uchar* b, int n, int cellSize);
+
+//////////////////////////////// low-level functions ////////////////////////////////
+
+int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n);
+int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n);
+bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n);
+bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n);
+
+int normL1_(const uchar* a, const uchar* b, int n);
+float normL1_(const float* a, const float* b, int n);
+float normL2Sqr_(const float* a, const float* b, int n);
+
+void exp(const float* src, float* dst, int n);
+void exp(const double* src, double* dst, int n);
+void log(const float* src, float* dst, int n);
+void log(const double* src, double* dst, int n);
+
+void fastAtan2(const float* y, const float* x, float* dst, int n, bool angleInDegrees);
+void magnitude(const float* x, const float* y, float* dst, int n);
+void magnitude(const double* x, const double* y, double* dst, int n);
+void sqrt(const float* src, float* dst, int len);
+void sqrt(const double* src, double* dst, int len);
+void invSqrt(const float* src, float* dst, int len);
+void invSqrt(const double* src, double* dst, int len);
+
+}} //cv::hal
+
+#endif //__OPENCV_HAL_HPP__
diff --git a/modules/hal/include/opencv2/hal/defs.h b/modules/hal/include/opencv2/hal/defs.h
new file mode 100644
index 0000000000..197533993b
--- /dev/null
+++ b/modules/hal/include/opencv2/hal/defs.h
@@ -0,0 +1,675 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_DEF_H__
+#define __OPENCV_DEF_H__
+
+#if !defined _CRT_SECURE_NO_DEPRECATE && defined _MSC_VER && _MSC_VER > 1300
+#  define _CRT_SECURE_NO_DEPRECATE /* to avoid multiple Visual Studio warnings */
+#endif
+
+#include <limits.h>
+
+#if defined __ICL
+#  define CV_ICC   __ICL
+#elif defined __ICC
+#  define CV_ICC   __ICC
+#elif defined __ECL
+#  define CV_ICC   __ECL
+#elif defined __ECC
+#  define CV_ICC   __ECC
+#elif defined __INTEL_COMPILER
+#  define CV_ICC   __INTEL_COMPILER
+#endif
+
+#ifndef CV_INLINE
+#  if defined __cplusplus
+#    define CV_INLINE static inline
+#  elif defined _MSC_VER
+#    define CV_INLINE __inline
+#  else
+#    define CV_INLINE static
+#  endif
+#endif
+
+#if defined CV_ICC && !defined CV_ENABLE_UNROLLED
+#  define CV_ENABLE_UNROLLED 0 /* unrolled loops are off by default for Intel compilers */
+#elif !defined CV_ENABLE_UNROLLED /* keep a value supplied by the build instead of redefining it */
+#  define CV_ENABLE_UNROLLED 1
+#endif
+
+#ifdef __GNUC__
+#  define CV_DECL_ALIGNED(x) __attribute__ ((aligned (x)))
+#elif defined _MSC_VER
+#  define CV_DECL_ALIGNED(x) __declspec(align(x))
+#else
+#  define CV_DECL_ALIGNED(x)
+#endif
+
+/* CPU features and intrinsics support */
+#define CV_CPU_NONE             0
+#define CV_CPU_MMX              1
+#define CV_CPU_SSE              2
+#define CV_CPU_SSE2             3
+#define CV_CPU_SSE3             4
+#define CV_CPU_SSSE3            5
+#define CV_CPU_SSE4_1           6
+#define CV_CPU_SSE4_2           7
+#define CV_CPU_POPCNT           8
+
+#define CV_CPU_AVX              10
+#define CV_CPU_AVX2             11
+#define CV_CPU_FMA3             12
+
+#define CV_CPU_AVX_512F         13
+#define CV_CPU_AVX_512BW        14
+#define CV_CPU_AVX_512CD        15
+#define CV_CPU_AVX_512DQ        16
+#define CV_CPU_AVX_512ER        17
+#define CV_CPU_AVX_512IFMA512   18
+#define CV_CPU_AVX_512PF        19
+#define CV_CPU_AVX_512VBMI      20
+#define CV_CPU_AVX_512VL        21
+
+#define CV_CPU_NEON   100
+
+// when adding to this list remember to update the enum in core/utility.cpp
+#define CV_HARDWARE_MAX_FEATURE 255
+
+// do not include SSE/AVX/NEON headers for NVCC compiler
+#ifndef __CUDACC__
+
+#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2)
+#  include <emmintrin.h>
+#  define CV_MMX 1
+#  define CV_SSE 1
+#  define CV_SSE2 1
+#  if defined __SSE3__ || (defined _MSC_VER && _MSC_VER >= 1500)
+#    include <pmmintrin.h>
+#    define CV_SSE3 1
+#  endif
+#  if defined __SSSE3__  || (defined _MSC_VER && _MSC_VER >= 1500)
+#    include <tmmintrin.h>
+#    define CV_SSSE3 1
+#  endif
+#  if defined __SSE4_1__ || (defined _MSC_VER && _MSC_VER >= 1500)
+#    include <smmintrin.h>
+#    define CV_SSE4_1 1
+#  endif
+#  if defined __SSE4_2__ || (defined _MSC_VER && _MSC_VER >= 1500)
+#    include <nmmintrin.h>
+#    define CV_SSE4_2 1
+#  endif
+#  if defined __POPCNT__ || (defined _MSC_VER && _MSC_VER >= 1500)
+#    ifdef _MSC_VER
+#      include <nmmintrin.h>
+#    else
+#      include <popcntintrin.h>
+#    endif
+#    define CV_POPCNT 1
+#  endif
+#  if defined __AVX__ || (defined _MSC_VER && _MSC_VER >= 1600 && 0)
+// MS Visual Studio 2010 (2012?) has no macro pre-defined to identify the use of /arch:AVX
+// See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32
+#    include <immintrin.h>
+#    define CV_AVX 1
+#    if defined(_XCR_XFEATURE_ENABLED_MASK)
+#      define __xgetbv() _xgetbv(_XCR_XFEATURE_ENABLED_MASK)
+#    else
+#      define __xgetbv() 0
+#    endif
+#  endif
+#  if defined __AVX2__ || (defined _MSC_VER && _MSC_VER >= 1800 && 0)
+#    include <immintrin.h>
+#    define CV_AVX2 1
+#    if defined __FMA__
+#      define CV_FMA3 1
+#    endif
+#  endif
+#endif
+
+#if (defined WIN32 || defined _WIN32) && defined(_M_ARM)
+# include <Intrin.h>
+# include "arm_neon.h"
+# define CV_NEON 1
+# define CPU_HAS_NEON_FEATURE (true)
+#elif defined(__ARM_NEON__) || (defined (__ARM_NEON) && defined(__aarch64__))
+#  include <arm_neon.h>
+#  define CV_NEON 1
+#endif
+
+#if defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__)
+#  define CV_VFP 1
+#endif
+
+#endif // __CUDACC__
+
+#ifndef CV_POPCNT
+#define CV_POPCNT 0
+#endif
+#ifndef CV_MMX
+#  define CV_MMX 0
+#endif
+#ifndef CV_SSE
+#  define CV_SSE 0
+#endif
+#ifndef CV_SSE2
+#  define CV_SSE2 0
+#endif
+#ifndef CV_SSE3
+#  define CV_SSE3 0
+#endif
+#ifndef CV_SSSE3
+#  define CV_SSSE3 0
+#endif
+#ifndef CV_SSE4_1
+#  define CV_SSE4_1 0
+#endif
+#ifndef CV_SSE4_2
+#  define CV_SSE4_2 0
+#endif
+#ifndef CV_AVX
+#  define CV_AVX 0
+#endif
+#ifndef CV_AVX2
+#  define CV_AVX2 0
+#endif
+#ifndef CV_FMA3
+#  define CV_FMA3 0
+#endif
+#ifndef CV_AVX_512F
+#  define CV_AVX_512F 0
+#endif
+#ifndef CV_AVX_512BW
+#  define CV_AVX_512BW 0
+#endif
+#ifndef CV_AVX_512CD
+#  define CV_AVX_512CD 0
+#endif
+#ifndef CV_AVX_512DQ
+#  define CV_AVX_512DQ 0
+#endif
+#ifndef CV_AVX_512ER
+#  define CV_AVX_512ER 0
+#endif
+#ifndef CV_AVX_512IFMA512
+#  define CV_AVX_512IFMA512 0
+#endif
+#ifndef CV_AVX_512PF
+#  define CV_AVX_512PF 0
+#endif
+#ifndef CV_AVX_512VBMI
+#  define CV_AVX_512VBMI 0
+#endif
+#ifndef CV_AVX_512VL
+#  define CV_AVX_512VL 0
+#endif
+
+#ifndef CV_NEON
+#  define CV_NEON 0
+#endif
+
+#ifndef CV_VFP
+#  define CV_VFP 0
+#endif
+
+/* primitive types */
+/*
+  schar  - signed 1 byte integer
+  uchar  - unsigned 1 byte integer
+  short  - signed 2 byte integer
+  ushort - unsigned 2 byte integer
+  int    - signed 4 byte integer
+  uint   - unsigned 4 byte integer
+  int64  - signed 8 byte integer
+  uint64 - unsigned 8 byte integer
+*/
+
+#if !defined _MSC_VER && !defined __BORLANDC__
+#  if defined __cplusplus && __cplusplus >= 201103L
+#    include <cstdint>
+     typedef std::uint32_t uint;
+#  else
+#    include <stdint.h>
+     typedef uint32_t uint;
+#  endif
+#else
+   typedef unsigned uint;
+#endif
+
+typedef signed char schar;
+
+#ifndef __IPL_H__
+   typedef unsigned char uchar;
+   typedef unsigned short ushort;
+#endif
+
+#if defined _MSC_VER || defined __BORLANDC__
+   typedef __int64 int64;
+   typedef unsigned __int64 uint64;
+#  define CV_BIG_INT(n)   n##I64
+#  define CV_BIG_UINT(n)  n##UI64
+#else
+   typedef int64_t int64;
+   typedef uint64_t uint64;
+#  define CV_BIG_INT(n)   n##LL
+#  define CV_BIG_UINT(n)  n##ULL
+#endif
+
+/* fundamental constants */
+#define CV_PI   3.1415926535897932384626433832795
+#define CV_2PI 6.283185307179586476925286766559
+#define CV_LOG2 0.69314718055994530941723212145818
+
+typedef union Cv32suf
+{
+    int i;
+    unsigned u;
+    float f;
+}
+Cv32suf;
+
+typedef union Cv64suf
+{
+    int64 i;
+    uint64 u;
+    double f;
+}
+Cv64suf;
+
+
+/****************************************************************************************\
+*                                      fast math                                         *
+\****************************************************************************************/
+
+#if defined __BORLANDC__
+#  include <fastmath.h>
+#elif defined __cplusplus
+#  include <cmath>
+#else
+#  include <math.h>
+#endif
+
+#ifdef HAVE_TEGRA_OPTIMIZATION
+#  include "tegra_round.hpp"
+#endif
+
+//! @addtogroup core_utils
+//! @{
+
+#if CV_VFP
+    // 1. general scheme
+    #define ARM_ROUND(_value, _asm_string) \
+        int res; \
+        float temp; \
+        asm(_asm_string : [res] "=r" (res), [temp] "=w" (temp) : [value] "w" (_value)); \
+        return res
+    // 2. version for double
+    #ifdef __clang__
+        #define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %[value] \n vmov %[res], %[temp]")
+    #else
+        #define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %P[value] \n vmov %[res], %[temp]")
+    #endif
+    // 3. version for float
+    #define ARM_ROUND_FLT(value) ARM_ROUND(value, "vcvtr.s32.f32 %[temp], %[value]\n vmov %[res], %[temp]")
+#endif // CV_VFP
+
+/** @brief Rounds floating-point number to the nearest integer
+
+ @param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
+ result is not defined.
+ */
+CV_INLINE int
+cvRound( double value )
+{
+#if ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ \
+    && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
+    __m128d t = _mm_set_sd( value );
+    return _mm_cvtsd_si32(t);
+#elif defined _MSC_VER && defined _M_IX86
+    int t;
+    __asm
+    {
+        fld value;
+        fistp t;
+    }
+    return t;
+#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \
+        defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
+    TEGRA_ROUND_DBL(value);
+#elif defined CV_ICC || defined __GNUC__
+# if CV_VFP
+    ARM_ROUND_DBL(value);
+# else
+    return (int)lrint(value);
+# endif
+#else
+    /* it's ok if round does not comply with IEEE754 standard;
+       the tests should allow +/-1 difference when the tested functions use round */
+    return (int)(value + (value >= 0 ? 0.5 : -0.5));
+#endif
+}
+
+
+/** @brief Rounds floating-point number to the nearest integer not larger than the original.
+
+ The function computes an integer i such that:
+ \f[i \le \texttt{value} < i+1\f]
+ @param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
+ result is not defined.
+ */
+CV_INLINE int cvFloor( double value )
+{
+#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
+    __m128d t = _mm_set_sd( value );
+    int i = _mm_cvtsd_si32(t);
+    return i - _mm_movemask_pd(_mm_cmplt_sd(t, _mm_cvtsi32_sd(t,i)));
+#elif defined __GNUC__
+    int i = (int)value;
+    return i - (i > value);
+#else
+    int i = cvRound(value);
+    double diff = value - i; /* stay in double: a float cast flushes tiny negative differences to -0 */
+    return i - (diff < 0);
+#endif
+}
+
+/** @brief Rounds floating-point number to the nearest integer not smaller than the original.
+
+ The function computes an integer i such that:
+ \f[i-1 < \texttt{value} \le i\f]
+ @param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
+ result is not defined.
+ */
+CV_INLINE int cvCeil( double value )
+{
+#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__&& !defined __APPLE__)) && !defined(__CUDACC__)
+    __m128d t = _mm_set_sd( value );
+    int i = _mm_cvtsd_si32(t);
+    return i + _mm_movemask_pd(_mm_cmplt_sd(_mm_cvtsi32_sd(t,i), t));
+#elif defined __GNUC__
+    int i = (int)value;
+    return i + (i < value);
+#else
+    int i = cvRound(value);
+    double diff = i - value; /* stay in double: a float cast flushes tiny negative differences to -0 */
+    return i + (diff < 0);
+#endif
+}
+
+/** @brief Determines if the argument is Not A Number.
+
+ @param value The input floating-point value
+
+ The function returns 1 if the argument is Not A Number (as defined by IEEE754 standard), 0
+ otherwise. */
+CV_INLINE int cvIsNaN( double value )
+{
+    Cv64suf ieee754;
+    ieee754.f = value;
+    return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) +
+           ((unsigned)ieee754.u != 0) > 0x7ff00000;
+}
+
+/** @brief Determines if the argument is Infinity.
+
+ @param value The input floating-point value
+
+ The function returns 1 if the argument is a plus or minus infinity (as defined by IEEE754 standard)
+ and 0 otherwise. */
+CV_INLINE int cvIsInf( double value )
+{
+    Cv64suf ieee754;
+    ieee754.f = value;
+    return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) == 0x7ff00000 &&
+            (unsigned)ieee754.u == 0;
+}
+
+#ifdef __cplusplus
+
+/** @overload */
+CV_INLINE int cvRound(float value)
+{
+#if ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ && \
+      defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
+    __m128 t = _mm_set_ss( value );
+    return _mm_cvtss_si32(t);
+#elif defined _MSC_VER && defined _M_IX86
+    int t;
+    __asm
+    {
+        fld value;
+        fistp t;
+    }
+    return t;
+#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \
+        defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
+    TEGRA_ROUND_FLT(value);
+#elif defined CV_ICC || defined __GNUC__
+# if CV_VFP
+    ARM_ROUND_FLT(value);
+# else
+    return (int)lrintf(value);
+# endif
+#else
+    /* it's ok if round does not comply with IEEE754 standard;
+     the tests should allow +/-1 difference when the tested functions use round */
+    return (int)(value + (value >= 0 ? 0.5f : -0.5f));
+#endif
+}
+
+/** @overload */
+CV_INLINE int cvRound( int value )
+{
+    return value;
+}
+
+/** @overload */
+CV_INLINE int cvFloor( float value )
+{
+#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
+    __m128 t = _mm_set_ss( value );
+    int i = _mm_cvtss_si32(t);
+    return i - _mm_movemask_ps(_mm_cmplt_ss(t, _mm_cvtsi32_ss(t,i)));
+#elif defined __GNUC__
+    int i = (int)value;
+    return i - (i > value);
+#else
+    int i = cvRound(value);
+    float diff = (float)(value - i);
+    return i - (diff < 0);
+#endif
+}
+
+/** @overload */
+CV_INLINE int cvFloor( int value )
+{
+    return value;
+}
+
+/** @overload */
+CV_INLINE int cvCeil( float value )
+{
+#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__&& !defined __APPLE__)) && !defined(__CUDACC__)
+    __m128 t = _mm_set_ss( value );
+    int i = _mm_cvtss_si32(t);
+    return i + _mm_movemask_ps(_mm_cmplt_ss(_mm_cvtsi32_ss(t,i), t));
+#elif defined __GNUC__
+    int i = (int)value;
+    return i + (i < value);
+#else
+    int i = cvRound(value);
+    float diff = (float)(i - value);
+    return i + (diff < 0);
+#endif
+}
+
+/** @overload */
+CV_INLINE int cvCeil( int value )
+{
+    return value;
+}
+
+/** @overload */
+CV_INLINE int cvIsNaN( float value )
+{
+    Cv32suf ieee754;
+    ieee754.f = value;
+    return (ieee754.u & 0x7fffffff) > 0x7f800000;
+}
+
+/** @overload */
+CV_INLINE int cvIsInf( float value )
+{
+    Cv32suf ieee754;
+    ieee754.f = value;
+    return (ieee754.u & 0x7fffffff) == 0x7f800000;
+}
+
+#include <algorithm>
+
+namespace cv
+{
+
+/////////////// saturate_cast (used in image & signal processing) ///////////////////
+
+/**
+ Template function for accurate conversion from one primitive type to another.
+
+ The functions saturate_cast resemble the standard C++ cast operations, such as static_cast\<T\>()
+ and others. They perform an efficient and accurate conversion from one primitive type to another
+ (see the introduction chapter). saturate in the name means that when the input value v is out of the
+ range of the target type, the result is not formed just by taking low bits of the input, but instead
+ the value is clipped. For example:
+ @code
+ uchar a = saturate_cast<uchar>(-100); // a = 0 (UCHAR_MIN)
+ short b = saturate_cast<short>(33333.33333); // b = 32767 (SHRT_MAX)
+ @endcode
+ Such clipping is done when the target type is unsigned char , signed char , unsigned short or
+ signed short . For 32-bit integers, no clipping is done.
+
+ When the parameter is a floating-point value and the target type is an integer (8-, 16- or 32-bit),
+ the floating-point value is first rounded to the nearest integer and then clipped if needed (when
+ the target type is 8- or 16-bit).
+
+ This operation is used in the simplest or most complex image processing functions in OpenCV.
+
+ @param v Function parameter.
+ @sa add, subtract, multiply, divide, Mat::convertTo
+ */
+template<typename _Tp> static inline _Tp saturate_cast(uchar v)    { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(schar v)    { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(ushort v)   { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(short v)    { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(unsigned v) { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(int v)      { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(float v)    { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(double v)   { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(int64 v)    { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(uint64 v)   { return _Tp(v); }
+
+//! @cond IGNORED
+
+template<> inline uchar saturate_cast<uchar>(schar v)        { return (uchar)std::max((int)v, 0); }
+template<> inline uchar saturate_cast<uchar>(ushort v)       { return (uchar)std::min((unsigned)v, (unsigned)UCHAR_MAX); }
+template<> inline uchar saturate_cast<uchar>(int v)          { return (uchar)((unsigned)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
+template<> inline uchar saturate_cast<uchar>(short v)        { return saturate_cast<uchar>((int)v); }
+template<> inline uchar saturate_cast<uchar>(unsigned v)     { return (uchar)std::min(v, (unsigned)UCHAR_MAX); }
+template<> inline uchar saturate_cast<uchar>(float v)        { int iv = cvRound(v); return saturate_cast<uchar>(iv); }
+template<> inline uchar saturate_cast<uchar>(double v)       { int iv = cvRound(v); return saturate_cast<uchar>(iv); }
+template<> inline uchar saturate_cast<uchar>(int64 v)        { return (uchar)((uint64)v <= (uint64)UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
+template<> inline uchar saturate_cast<uchar>(uint64 v)       { return (uchar)std::min(v, (uint64)UCHAR_MAX); }
+
+template<> inline schar saturate_cast<schar>(uchar v)        { return (schar)std::min((int)v, SCHAR_MAX); }
+template<> inline schar saturate_cast<schar>(ushort v)       { return (schar)std::min((unsigned)v, (unsigned)SCHAR_MAX); }
+template<> inline schar saturate_cast<schar>(int v)          { return (schar)((unsigned)v - (unsigned)SCHAR_MIN <= (unsigned)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); } // offset in unsigned math: no signed overflow
+template<> inline schar saturate_cast<schar>(short v)        { return saturate_cast<schar>((int)v); }
+template<> inline schar saturate_cast<schar>(unsigned v)     { return (schar)std::min(v, (unsigned)SCHAR_MAX); }
+template<> inline schar saturate_cast<schar>(float v)        { int iv = cvRound(v); return saturate_cast<schar>(iv); }
+template<> inline schar saturate_cast<schar>(double v)       { int iv = cvRound(v); return saturate_cast<schar>(iv); }
+template<> inline schar saturate_cast<schar>(int64 v)        { return (schar)((uint64)v - (uint64)SCHAR_MIN <= (uint64)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); } // offset in unsigned math: no signed overflow
+template<> inline schar saturate_cast<schar>(uint64 v)       { return (schar)std::min(v, (uint64)SCHAR_MAX); }
+
+template<> inline ushort saturate_cast<ushort>(schar v)      { return (ushort)std::max((int)v, 0); }
+template<> inline ushort saturate_cast<ushort>(short v)      { return (ushort)std::max((int)v, 0); }
+template<> inline ushort saturate_cast<ushort>(int v)        { return (ushort)((unsigned)v <= (unsigned)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
+template<> inline ushort saturate_cast<ushort>(unsigned v)   { return (ushort)std::min(v, (unsigned)USHRT_MAX); }
+template<> inline ushort saturate_cast<ushort>(float v)      { int iv = cvRound(v); return saturate_cast<ushort>(iv); }
+template<> inline ushort saturate_cast<ushort>(double v)     { int iv = cvRound(v); return saturate_cast<ushort>(iv); }
+template<> inline ushort saturate_cast<ushort>(int64 v)      { return (ushort)((uint64)v <= (uint64)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
+template<> inline ushort saturate_cast<ushort>(uint64 v)     { return (ushort)std::min(v, (uint64)USHRT_MAX); }
+
+template<> inline short saturate_cast<short>(ushort v)       { return (short)std::min((int)v, SHRT_MAX); }
+template<> inline short saturate_cast<short>(int v)          { return (short)((unsigned)v - (unsigned)SHRT_MIN <= (unsigned)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); } // offset in unsigned math: no signed overflow
+template<> inline short saturate_cast<short>(unsigned v)     { return (short)std::min(v, (unsigned)SHRT_MAX); }
+template<> inline short saturate_cast<short>(float v)        { int iv = cvRound(v); return saturate_cast<short>(iv); }
+template<> inline short saturate_cast<short>(double v)       { int iv = cvRound(v); return saturate_cast<short>(iv); }
+template<> inline short saturate_cast<short>(int64 v)        { return (short)((uint64)v - (uint64)SHRT_MIN <= (uint64)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); } // offset in unsigned math: no signed overflow
+template<> inline short saturate_cast<short>(uint64 v)       { return (short)std::min(v, (uint64)SHRT_MAX); }
+
+template<> inline int saturate_cast<int>(float v)            { return cvRound(v); }
+template<> inline int saturate_cast<int>(double v)           { return cvRound(v); }
+
+// we intentionally do not clip negative numbers, to make -1 become 0xffffffff etc.
+template<> inline unsigned saturate_cast<unsigned>(float v)  { return cvRound(v); }
+template<> inline unsigned saturate_cast<unsigned>(double v) { return cvRound(v); }
+
+//! @endcond
+
+}
+
+#endif // __cplusplus
+
+//! @} core_utils
+
+#endif //__OPENCV_DEF_H__
diff --git a/modules/hal/include/opencv2/hal/intrin.hpp b/modules/hal/include/opencv2/hal/intrin.hpp
new file mode 100644
index 0000000000..439a04c74e
--- /dev/null
+++ b/modules/hal/include/opencv2/hal/intrin.hpp
@@ -0,0 +1,292 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_HAL_INTRIN_HPP__
+#define __OPENCV_HAL_INTRIN_HPP__
+
+#include <cmath>
+#include <float.h>
+#include <stdlib.h>
+
+#define OPENCV_HAL_ADD(a, b) ((a) + (b))  // sum of the two arguments
+#define OPENCV_HAL_AND(a, b) ((a) & (b))  // bitwise AND of the two arguments
+#define OPENCV_HAL_NOP(a) (a)             // passes its argument through unchanged
+#define OPENCV_HAL_1ST(a, b) (a)          // keeps the first argument, drops the second
+
+// Unlike the HAL API, which lives in cv::hal,
+// the intrinsics are put into the cv namespace so that
+// they are easier to use from within OpenCV code.
+namespace cv {
+
+// Fallback lane traits: every associated type is _Tp itself and the reinterpret helpers are identity.
+template<typename _Tp> struct V_TypeTraits
+{
+    typedef _Tp int_type;
+    typedef _Tp uint_type;
+    typedef _Tp abs_type;
+    typedef _Tp sum_type;
+    enum { delta = 0, shift = 0 };
+    static int_type reinterpret_int(_Tp x) { return x; }
+    static uint_type reinterpret_uint(_Tp x) { return x; }  // spelling now matches the integer specializations
+    static uint_type reinterpet_uint(_Tp x) { return reinterpret_uint(x); }  // old misspelled name, kept for existing callers
+    static _Tp reinterpret_from_int(int_type x) { return (_Tp)x; }
+};
+
+// Lane traits for uchar: integer views of a lane, the v_abs / v_reduce_sum result types and the widened type.
+template<> struct V_TypeTraits<uchar>
+{
+    typedef uchar value_type;
+    typedef uchar uint_type;
+    typedef schar int_type;
+    typedef uchar abs_type;
+    typedef int sum_type;
+    typedef ushort w_type;
+
+    enum { shift = 8, delta = 128 };
+
+    static value_type reinterpret_from_int(int_type v) { return static_cast<value_type>(v); }
+    static uint_type reinterpret_uint(value_type v) { return static_cast<uint_type>(v); }
+    static int_type reinterpret_int(value_type v) { return static_cast<int_type>(v); }
+};
+
+// Lane traits for schar: integer views of a lane, the v_abs / v_reduce_sum result types and the widened type.
+template<> struct V_TypeTraits<schar>
+{
+    typedef schar value_type;
+    typedef uchar uint_type;
+    typedef schar int_type;
+    typedef uchar abs_type;
+    typedef int sum_type;
+    typedef short w_type;
+
+    enum { shift = 8, delta = 128 };
+
+    static value_type reinterpret_from_int(int_type v) { return static_cast<value_type>(v); }
+    static uint_type reinterpret_uint(value_type v) { return static_cast<uint_type>(v); }
+    static int_type reinterpret_int(value_type v) { return static_cast<int_type>(v); }
+};
+
+// Lane traits for ushort: integer views, v_abs / v_reduce_sum result types, widened (w) and narrowed-unsigned (nu) types.
+template<> struct V_TypeTraits<ushort>
+{
+    typedef ushort value_type;
+    typedef ushort uint_type;
+    typedef short int_type;
+    typedef ushort abs_type;
+    typedef int sum_type;
+    typedef uchar nu_type;
+    typedef unsigned w_type;
+
+    enum { shift = 16, delta = 32768 };
+
+    static value_type reinterpret_from_int(int_type v) { return static_cast<value_type>(v); }
+    static uint_type reinterpret_uint(value_type v) { return static_cast<uint_type>(v); }
+    static int_type reinterpret_int(value_type v) { return static_cast<int_type>(v); }
+};
+
+// Lane traits for short: integer views, v_abs / v_reduce_sum result types, widened (w) and narrowed (n, nu) types.
+template<> struct V_TypeTraits<short>
+{
+    typedef short value_type;
+    typedef ushort uint_type;
+    typedef short int_type;
+    typedef ushort abs_type;
+    typedef int sum_type;
+    typedef schar n_type;
+    typedef uchar nu_type;
+    typedef int w_type;
+
+    enum { shift = 8, delta = 128 };  // NOTE(review): ushort uses 16 / 32768; confirm 8 / 128 is intended for short
+
+    static value_type reinterpret_from_int(int_type v) { return static_cast<value_type>(v); }
+    static uint_type reinterpret_uint(value_type v) { return static_cast<uint_type>(v); }
+    static int_type reinterpret_int(value_type v) { return static_cast<int_type>(v); }
+};
+
+// Lane traits for unsigned: integer views, v_abs / v_reduce_sum result types, widened (w) and narrowed-unsigned (nu) types.
+template<> struct V_TypeTraits<unsigned>
+{
+    typedef unsigned value_type;
+    typedef unsigned uint_type;
+    typedef int int_type;
+    typedef unsigned abs_type;
+    typedef unsigned sum_type;
+    typedef ushort nu_type;
+    typedef uint64 w_type;
+
+    static value_type reinterpret_from_int(int_type v) { return static_cast<value_type>(v); }
+    static uint_type reinterpret_uint(value_type v) { return static_cast<uint_type>(v); }
+    static int_type reinterpret_int(value_type v) { return static_cast<int_type>(v); }
+};
+
+// Lane traits for int: integer views, v_abs / v_reduce_sum result types, widened (w) and narrowed (n, nu) types.
+template<> struct V_TypeTraits<int>
+{
+    typedef int value_type;
+    typedef unsigned uint_type;
+    typedef int int_type;
+    typedef unsigned abs_type;
+    typedef int sum_type;
+    typedef ushort nu_type;
+    typedef short n_type;
+    typedef int64 w_type;
+
+    static value_type reinterpret_from_int(int_type v) { return static_cast<value_type>(v); }
+    static uint_type reinterpret_uint(value_type v) { return static_cast<uint_type>(v); }
+    static int_type reinterpret_int(value_type v) { return static_cast<int_type>(v); }
+};
+
+// Lane traits for uint64: integer views, v_abs / v_reduce_sum result types and the narrowed-unsigned (nu) type.
+template<> struct V_TypeTraits<uint64>
+{
+    typedef uint64 value_type;
+    typedef uint64 uint_type;
+    typedef int64 int_type;
+    typedef uint64 abs_type;
+    typedef uint64 sum_type;
+    typedef unsigned nu_type;
+
+    static value_type reinterpret_from_int(int_type v) { return static_cast<value_type>(v); }
+    static uint_type reinterpret_uint(value_type v) { return static_cast<uint_type>(v); }
+    static int_type reinterpret_int(value_type v) { return static_cast<int_type>(v); }
+};
+
+// Lane traits for int64: integer views and the v_abs / v_reduce_sum result types.
+template<> struct V_TypeTraits<int64>
+{
+    typedef int64 value_type;
+    typedef uint64 uint_type;
+    typedef int64 int_type;
+    typedef uint64 abs_type;
+    typedef int64 sum_type;
+    typedef int nu_type;  // NOTE(review): named nu_type but is the signed int; confirm n_type was not meant
+
+    static value_type reinterpret_from_int(int_type v) { return static_cast<value_type>(v); }
+    static uint_type reinterpret_uint(value_type v) { return static_cast<uint_type>(v); }
+    static int_type reinterpret_int(value_type v) { return static_cast<int_type>(v); }
+};
+
+
+// Lane traits for float: the reinterpret helpers go through the Cv32suf union instead of converting the value.
+template<> struct V_TypeTraits<float>
+{
+    typedef float value_type;
+    typedef int int_type;
+    typedef unsigned uint_type;
+    typedef float abs_type;
+    typedef float sum_type;
+
+    typedef double w_type;
+
+    // Stores x in the union's f member and reads back the i member.
+    static int_type reinterpret_int(value_type x)
+    {
+        Cv32suf u; u.f = x;
+        return u.i;
+    }
+    // Stores x in the union's f member and reads back the u member. The name used to be
+    // misspelled "reinterpet_uint", so v_reinterpret_as_uint() could not find it for float.
+    static uint_type reinterpret_uint(value_type x)
+    {
+        Cv32suf u; u.f = x;
+        return u.u;
+    }
+    // Old misspelled name, kept so existing callers still compile.
+    static uint_type reinterpet_uint(value_type x) { return reinterpret_uint(x); }
+    // Stores x in the union's i member and reads back the f member.
+    static value_type reinterpret_from_int(int_type x) { Cv32suf u; u.i = x; return u.f; }
+};
+
+// Lane traits for double: the reinterpret helpers go through the Cv64suf union instead of converting the value.
+template<> struct V_TypeTraits<double>
+{
+    typedef double value_type;
+    typedef int64 int_type;
+    typedef uint64 uint_type;
+    typedef double abs_type;
+    typedef double sum_type;
+
+    // Stores x in the union's f member and reads back the i member.
+    static int_type reinterpret_int(value_type x)
+    {
+        Cv64suf u; u.f = x;
+        return u.i;
+    }
+    // Stores x in the union's f member and reads back the u member. The name used to be
+    // misspelled "reinterpet_uint", so v_reinterpret_as_uint() could not find it for double.
+    static uint_type reinterpret_uint(value_type x)
+    {
+        Cv64suf u; u.f = x;
+        return u.u;
+    }
+    // Old misspelled name, kept so existing callers still compile.
+    static uint_type reinterpet_uint(value_type x) { return reinterpret_uint(x); }
+    static value_type reinterpret_from_int(int_type x) { Cv64suf u; u.i = x; return u.f; }
+};
+
+}
+
+#if CV_SSE2  // include exactly one intrinsics implementation header
+
+#include "opencv2/hal/intrin_sse.hpp"
+
+#elif CV_NEON
+
+#include "opencv2/hal/intrin_neon.hpp"
+
+#else  // neither CV_SSE2 nor CV_NEON is set: use the plain C++ implementation
+
+#include "opencv2/hal/intrin_cpp.hpp"
+
+#endif
+
+#ifndef CV_SIMD128  // default to 0 when the included header did not define it
+#define CV_SIMD128 0
+#endif
+
+#ifndef CV_SIMD128_64F  // default to 0 when the included header did not define it
+#define CV_SIMD128_64F 0
+#endif
+
+#endif
diff --git a/modules/hal/include/opencv2/hal/intrin_cpp.hpp b/modules/hal/include/opencv2/hal/intrin_cpp.hpp
new file mode 100644
index 0000000000..e0140a8632
--- /dev/null
+++ b/modules/hal/include/opencv2/hal/intrin_cpp.hpp
@@ -0,0 +1,811 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_HAL_INTRIN_CPP_HPP__
+#define __OPENCV_HAL_INTRIN_CPP_HPP__
+
+namespace cv
+{
+
+// Portable stand-in for a SIMD register: n lanes of _Tp kept in a plain array.
+template<typename _Tp, int n> struct v_reg
+{
+    typedef _Tp lane_type;
+    typedef v_reg<typename V_TypeTraits<_Tp>::abs_type, n> abs_vec;
+    typedef v_reg<typename V_TypeTraits<_Tp>::int_type, n> int_vec;
+    enum { nlanes = n };
+
+    // Constructors: default (lanes left unset), copy, load of n values from ptr,
+    // and explicit lists of 2, 4, 8 or 16 lane values.
+    v_reg() {}
+    v_reg(const v_reg<_Tp, n> & other) { for( int k = 0; k < n; ++k ) s[k] = other.s[k]; }
+    explicit v_reg(const _Tp* ptr) { for( int k = 0; k < n; ++k ) s[k] = ptr[k]; }
+    v_reg(_Tp s0, _Tp s1) { s[0] = s0; s[1] = s1; }
+    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) { s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; }
+    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
+          _Tp s4, _Tp s5, _Tp s6, _Tp s7)
+    {
+        const _Tp vals[8] = { s0, s1, s2, s3, s4, s5, s6, s7 };
+        for( int k = 0; k < 8; ++k ) s[k] = vals[k];
+    }
+    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
+          _Tp s4, _Tp s5, _Tp s6, _Tp s7,
+          _Tp s8, _Tp s9, _Tp s10, _Tp s11,
+          _Tp s12, _Tp s13, _Tp s14, _Tp s15)
+    {
+        const _Tp vals[16] = { s0, s1, s2, s3, s4, s5, s6, s7,
+                               s8, s9, s10, s11, s12, s13, s14, s15 };
+        for( int k = 0; k < 16; ++k )
+            s[k] = vals[k];
+    }
+
+    _Tp get(const int i) const { return s[i]; }
+    _Tp get0() const { return s[0]; }
+    // Returns a register whose low half holds this register's high half; the rest is zeroed.
+    v_reg<_Tp, n> high() const
+    {
+        const int half = n/2;
+        v_reg<_Tp, n> upper;
+        for( int k = 0; k < half; ++k )
+        {
+            upper.s[k] = s[k + half];
+            upper.s[k + half] = 0;
+        }
+        return upper;
+    }
+
+    static v_reg<_Tp, n> zero()
+    {
+        v_reg<_Tp, n> z;
+        for( int k = 0; k < n; ++k )
+            z.s[k] = (_Tp)0;
+        return z;
+    }
+
+    static v_reg<_Tp, n> all(_Tp s)
+    {
+        v_reg<_Tp, n> filled;
+        for( int k = 0; k < n; ++k )
+            filled.s[k] = s;
+        return filled;
+    }
+
+    // Byte-wise copy into a register of another lane type; only the smaller of the two sizes is copied.
+    template<typename _Tp2, int n2> v_reg<_Tp2, n2> reinterpret_as() const
+    {
+        const size_t src_bytes = sizeof(_Tp)*n, dst_bytes = sizeof(_Tp2)*n2;
+        v_reg<_Tp2, n2> out;
+        memcpy(&out.s[0], &s[0], dst_bytes < src_bytes ? dst_bytes : src_bytes);
+        return out;
+    }
+
+    _Tp s[n];
+};
+
+// Generates lane-wise "a op b" and "a op= b" for v_reg; every lane result goes through saturate_cast<_Tp>.
+#define OPENCV_HAL_IMPL_BIN_OP(bin_op) \
+template<typename _Tp, int n> inline v_reg<_Tp, n>& \
+    operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    for( int k = 0; k < n; ++k ) \
+        a.s[k] = saturate_cast<_Tp>(a.s[k] bin_op b.s[k]); \
+    return a; \
+} \
+template<typename _Tp, int n> inline v_reg<_Tp, n> \
+    operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    v_reg<_Tp, n> res; \
+    for( int k = 0; k < n; ++k ) \
+        res.s[k] = saturate_cast<_Tp>(a.s[k] bin_op b.s[k]); \
+    return res; \
+}
+OPENCV_HAL_IMPL_BIN_OP(+)
+OPENCV_HAL_IMPL_BIN_OP(-)
+OPENCV_HAL_IMPL_BIN_OP(*)
+OPENCV_HAL_IMPL_BIN_OP(/)
+
+// Generates lane-wise "a op b" and "a op= b" for bitwise operators; the operator is applied to the
+// lanes' integer representation (reinterpret_int) and the result is turned back with reinterpret_from_int.
+#define OPENCV_HAL_IMPL_BIT_OP(bit_op) \
+template<typename _Tp, int n> inline v_reg<_Tp, n> operator bit_op \
+    (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    typedef V_TypeTraits<_Tp> traits; \
+    typedef typename traits::int_type itype; \
+    v_reg<_Tp, n> res; \
+    for( int k = 0; k < n; ++k ) \
+        res.s[k] = traits::reinterpret_from_int((itype)(traits::reinterpret_int(a.s[k]) bit_op traits::reinterpret_int(b.s[k]))); \
+    return res; \
+} \
+template<typename _Tp, int n> inline v_reg<_Tp, n>& operator \
+    bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    typedef V_TypeTraits<_Tp> traits; \
+    typedef typename traits::int_type itype; \
+    for( int k = 0; k < n; ++k ) \
+        a.s[k] = traits::reinterpret_from_int((itype)(traits::reinterpret_int(a.s[k]) bit_op traits::reinterpret_int(b.s[k]))); \
+    return a; \
+}
+OPENCV_HAL_IMPL_BIT_OP(&)
+OPENCV_HAL_IMPL_BIT_OP(|)
+OPENCV_HAL_IMPL_BIT_OP(^)
+
+template<typename _Tp, int n> inline v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a)
+{
+    v_reg<_Tp, n> inverted;  // each lane: complement of the integer representation of a's lane
+    for( int lane = 0; lane < n; ++lane )
+        inverted.s[lane] = V_TypeTraits<_Tp>::reinterpret_from_int(~V_TypeTraits<_Tp>::reinterpret_int(a.s[lane]));
+    return inverted;
+}
+
+// Generates func(v_reg) that applies cfunc to every lane; _Tp2 is the lane type of the result.
+#define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \
+template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \
+{ \
+    v_reg<_Tp2, n> res; \
+    for( int k = 0; k < n; ++k ) \
+        res.s[k] = cfunc(a.s[k]); \
+    return res; \
+}
+OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp)
+OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
+OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
+OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
+OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
+OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs,
+                          typename V_TypeTraits<_Tp>::abs_type)
+OPENCV_HAL_IMPL_MATH_FUNC(v_round, cvRound, int)
+OPENCV_HAL_IMPL_MATH_FUNC(v_floor, cvFloor, int)
+OPENCV_HAL_IMPL_MATH_FUNC(v_ceil, cvCeil, int)
+OPENCV_HAL_IMPL_MATH_FUNC(v_trunc, int, int)
+
+// Generates hfunc(a), which folds cfunc over all lanes of one register, and func(a, b), its lane-wise form.
+#define OPENCV_HAL_IMPL_MINMAX_FUNC(func, hfunc, cfunc) \
+template<typename _Tp, int n> inline _Tp hfunc(const v_reg<_Tp, n>& a) \
+{ \
+    _Tp best = a.s[0]; \
+    for( int k = 1; k < n; ++k ) \
+        best = cfunc(best, a.s[k]); \
+    return best; \
+} \
+template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    v_reg<_Tp, n> res; \
+    for( int k = 0; k < n; ++k ) \
+        res.s[k] = cfunc(a.s[k], b.s[k]); \
+    return res; \
+}
+OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, v_reduce_min, std::min)
+OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, v_reduce_max, std::max)
+
+// Writes the lane-wise minimum of a and b to minval and the lane-wise maximum to maxval.
+template<typename _Tp, int n> inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
+                                                    v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval )
+{
+    for( int lane = 0; lane < n; ++lane )
+    {
+        minval.s[lane] = std::min(a.s[lane], b.s[lane]);
+        maxval.s[lane] = std::max(a.s[lane], b.s[lane]);
+    }
+}
+
+
+// Generates lane-wise comparisons; a lane of the result is reinterpret_from_int(-1) where the test holds, else of 0.
+#define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \
+template<typename _Tp, int n> \
+inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    typedef typename V_TypeTraits<_Tp>::int_type itype; \
+    v_reg<_Tp, n> mask; \
+    for( int k = 0; k < n; ++k ) \
+        mask.s[k] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)((a.s[k] cmp_op b.s[k]) ? -1 : 0)); \
+    return mask; \
+}
+OPENCV_HAL_IMPL_CMP_OP(<)
+OPENCV_HAL_IMPL_CMP_OP(>)
+OPENCV_HAL_IMPL_CMP_OP(<=)
+OPENCV_HAL_IMPL_CMP_OP(>=)
+OPENCV_HAL_IMPL_CMP_OP(==)
+OPENCV_HAL_IMPL_CMP_OP(!=)
+
+// Generates func(a, b) = cast_op(a bin_op b) per lane, with lane type _Tp2 (visible to cast_op as "rtype").
+#define OPENCV_HAL_IMPL_ADD_SUB_OP(func, bin_op, cast_op, _Tp2) \
+template<typename _Tp, int n> \
+inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    typedef _Tp2 rtype; \
+    v_reg<rtype, n> res; \
+    for( int k = 0; k < n; ++k ) \
+        res.s[k] = cast_op(a.s[k] bin_op b.s[k]); \
+    return res; \
+}
+OPENCV_HAL_IMPL_ADD_SUB_OP(v_add_wrap, +, (_Tp), _Tp)
+OPENCV_HAL_IMPL_ADD_SUB_OP(v_sub_wrap, -, (_Tp), _Tp)
+OPENCV_HAL_IMPL_ADD_SUB_OP(v_absdiff, -, (rtype)std::abs, typename V_TypeTraits<_Tp>::abs_type)
+
+// Lane-wise reciprocal square root: result[k] = 1 / sqrt(a[k]).
+template<typename _Tp, int n> inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a)
+{
+    v_reg<_Tp, n> res;
+    for( int k = 0; k < n; ++k )
+        res.s[k] = 1.f/std::sqrt(a.s[k]);
+    return res;
+}
+
+// Lane-wise Euclidean length: result[k] = sqrt(a[k]^2 + b[k]^2).
+template<typename _Tp, int n> inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    v_reg<_Tp, n> res;
+    for( int k = 0; k < n; ++k )
+        res.s[k] = std::sqrt(a.s[k]*a.s[k] + b.s[k]*b.s[k]);
+    return res;
+}
+
+
+// Lane-wise squared length: result[k] = a[k]^2 + b[k]^2 (no square root taken).
+template<typename _Tp, int n> inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    v_reg<_Tp, n> res;
+    for( int k = 0; k < n; ++k )
+        res.s[k] = a.s[k]*a.s[k] + b.s[k]*b.s[k];
+    return res;
+}
+
+// Lane-wise multiply-add: result[k] = a[k]*b[k] + c[k].
+template<typename _Tp, int n> inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
+                                                            const v_reg<_Tp, n>& c)
+{
+    v_reg<_Tp, n> res;
+    for( int k = 0; k < n; ++k )
+        res.s[k] = a.s[k]*b.s[k] + c.s[k];
+    return res;
+}
+
+template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
+    v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    typedef typename V_TypeTraits<_Tp>::w_type wide_t;
+    v_reg<wide_t, n/2> acc;  // acc[k] = a[2k]*b[2k] + a[2k+1]*b[2k+1], computed in the widened type
+    for( int k = 0; k < (n/2); ++k )
+        acc.s[k] = (wide_t)a.s[2*k]*b.s[2*k] + (wide_t)a.s[2*k+1]*b.s[2*k+1];
+    return acc;
+}
+
+// Widening multiply: c receives the products of the low n/2 lanes of a and b, d those of the high n/2 lanes.
+template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
+        v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c, v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& d)
+{
+    typedef typename V_TypeTraits<_Tp>::w_type w_type;
+    for( int i = 0; i < (n/2); i++ )
+    {
+        c.s[i] = (w_type)a.s[i]*b.s[i];  // a stray "*2" used to double the low-half products only
+        d.s[i] = (w_type)a.s[i+(n/2)]*b.s[i+(n/2)];
+    }
+}
+
+// Pairwise horizontal add in the widened type: c[k] = a[2k] + a[2k+1] for k < n/2.
+// c is an output parameter; nothing is returned.
+template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
+                                                 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c)
+{
+    typedef typename V_TypeTraits<_Tp>::w_type wide_t;
+    for( int k = 0; k < (n/2); ++k )
+        c.s[k] = (wide_t)a.s[2*k] + a.s[2*k+1];
+}
+
+// Generates "reg op imm": every lane is shifted by the same run-time amount and cast back to _Tp.
+#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \
+template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
+{ \
+    v_reg<_Tp, n> res; \
+    for( int k = 0; k < n; ++k ) \
+        res.s[k] = (_Tp)(a.s[k] shift_op imm); \
+    return res; \
+}
+OPENCV_HAL_IMPL_SHIFT_OP(<<)
+OPENCV_HAL_IMPL_SHIFT_OP(>>)
+
+// Adds up all lanes of a, accumulating in V_TypeTraits<_Tp>::sum_type.
+template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a)
+{
+    typename V_TypeTraits<_Tp>::sum_type total = a.s[0];
+    for( int k = 1; k < n; ++k ) total += a.s[k];
+    return total;
+}
+
+// Builds an int whose bit k is set when the integer representation of lane k is negative.
+template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a)
+{
+    int bits = 0;
+    for( int k = 0; k < n; ++k ) bits |= ((V_TypeTraits<_Tp>::reinterpret_int(a.s[k]) < 0) ? 1 : 0) << k;
+    return bits;
+}
+
+// True only when the integer representation of every lane is negative.
+template<typename _Tp, int n> inline bool v_check_all(const v_reg<_Tp, n>& a)
+{
+    for( int k = 0; k < n; ++k )
+        if( !(V_TypeTraits<_Tp>::reinterpret_int(a.s[k]) < 0) ) return false;
+    return true;
+}
+
+// True as soon as the integer representation of at least one lane is negative.
+template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a)
+{
+    for( int k = 0; k < n; ++k )
+        if( !(V_TypeTraits<_Tp>::reinterpret_int(a.s[k]) >= 0) ) return true;
+    return false;
+}
+
+// NOTE(review): a lane whose mask is negative takes b, every other lane takes a; confirm the SSE/NEON versions agree.
+template<typename _Tp, int n> inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask,
+                                                           const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    v_reg<_Tp, n> picked(a);
+    for( int k = 0; k < n; ++k ) if( V_TypeTraits<_Tp>::reinterpret_int(mask.s[k]) < 0 ) picked.s[k] = b.s[k];
+    return picked;
+}
+
+// Copies the low n/2 lanes of a into b0 and the high n/2 lanes into b1, both in the widened lane type.
+template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
+                            v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b0,
+                            v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b1)
+{
+    for( int k = 0; k < (n/2); ++k )
+        b0.s[k] = a.s[k];
+    for( int k = 0; k < (n/2); ++k )
+        b1.s[k] = a.s[k+(n/2)];
+}
+
+// Applies V_TypeTraits<_Tp>::reinterpret_int to every lane.
+template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::int_type, n>
+    v_reinterpret_as_int(const v_reg<_Tp, n>& a)
+{
+    v_reg<typename V_TypeTraits<_Tp>::int_type, n> res;
+    for( int k = 0; k < n; ++k ) res.s[k] = V_TypeTraits<_Tp>::reinterpret_int(a.s[k]);
+    return res;
+}
+
+// Applies V_TypeTraits<_Tp>::reinterpret_uint to every lane.
+template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::uint_type, n>
+    v_reinterpret_as_uint(const v_reg<_Tp, n>& a)
+{
+    v_reg<typename V_TypeTraits<_Tp>::uint_type, n> res;
+    for( int k = 0; k < n; ++k ) res.s[k] = V_TypeTraits<_Tp>::reinterpret_uint(a.s[k]);
+    return res;
+}
+
+// Interleaves a0 and a1: b0 = (a0[0], a1[0], a0[1], a1[1], ...) from the low halves, b1 likewise from the high halves.
+template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
+                                               v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 )
+{
+    for( int k = 0; k < n/2; ++k )
+    {
+        b0.s[2*k] = a0.s[k];
+        b0.s[2*k+1] = a1.s[k];
+    }
+    for( int k = n/2; k < n; ++k )
+    {
+        b1.s[2*k-n] = a0.s[k];
+        b1.s[2*k-n+1] = a1.s[k];
+    }
+}
+
+// Loads n consecutive values starting at ptr into a register.
+template<typename _Tp, int n>
+inline v_reg<_Tp, n> v_load(const _Tp* ptr)
+{ return v_reg<_Tp, n>(ptr); }
+
+// Same element-wise load as v_load; this implementation does not check ptr's alignment.
+template<typename _Tp, int n>
+inline v_reg<_Tp, n> v_load_aligned(const _Tp* ptr)
+{ return v_reg<_Tp, n>(ptr); }
+
+// Low n/2 lanes come from loptr, high n/2 lanes from hiptr. The return type used to be void although c is returned.
+template<typename _Tp, int n> inline v_reg<_Tp, n> v_load_halves(const _Tp* loptr, const _Tp* hiptr)
+{
+    v_reg<_Tp, n> c;
+    for( int i = 0; i < n/2; i++ )
+        c.s[i] = loptr[i];
+    for( int i = 0; i < n/2; i++ )
+        c.s[i+n/2] = hiptr[i];
+    return c;
+}
+
+// Reads n values from ptr and stores each one in a lane of the widened type w_type.
+// The result has n lanes (not n/2).
+template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n> v_load_expand(const _Tp* ptr)
+{
+    typedef typename V_TypeTraits<_Tp>::w_type wide_t;
+    v_reg<wide_t, n> res;
+    for( int k = 0; k < n; ++k )
+        res.s[k] = ptr[k];
+    return res;
+}
+
+// Reads n values from ptr and stores each one in a lane of the twice-widened type
+// (the w_type of _Tp's w_type). The result has n lanes.
+template<typename _Tp, int n> inline v_reg<typename
+    V_TypeTraits<typename V_TypeTraits<_Tp>::w_type>::w_type, n> v_load_expand_q(const _Tp* ptr)
+{
+    typedef typename V_TypeTraits<typename V_TypeTraits<_Tp>::w_type>::w_type wide2_t;
+    v_reg<wide2_t, n> res;
+    for( int k = 0; k < n; ++k )
+        res.s[k] = ptr[k];
+    return res;
+}
+
+// Splits 3*n interleaved values (a0 b0 c0 a1 b1 c1 ...) read from ptr into the registers a, b and c.
+template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
+                                                            v_reg<_Tp, n>& b, v_reg<_Tp, n>& c)
+{
+    for( int k = 0; k < n; ++k )
+    {
+        a.s[k] = ptr[3*k];
+        b.s[k] = ptr[3*k+1];
+        c.s[k] = ptr[3*k+2];
+    }
+}
+
+// Splits 4*n interleaved values (a0 b0 c0 d0 a1 ...) read from ptr into the registers a, b, c and d.
+template<typename _Tp, int n>
+inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
+                                v_reg<_Tp, n>& b, v_reg<_Tp, n>& c,
+                                v_reg<_Tp, n>& d)
+{
+    for( int k = 0; k < n; ++k )
+    {
+        a.s[k] = ptr[4*k];
+        b.s[k] = ptr[4*k+1];
+        c.s[k] = ptr[4*k+2];
+        d.s[k] = ptr[4*k+3];
+    }
+}
+
+// Writes the lanes of a, b and c to ptr in interleaved order (a0 b0 c0 a1 b1 c1 ...), 3*n values in total.
+template<typename _Tp, int n>
+inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
+                                const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c)
+{
+    for( int k = 0; k < n; ++k )
+    {
+        ptr[3*k] = a.s[k];
+        ptr[3*k+1] = b.s[k];
+        ptr[3*k+2] = c.s[k];
+    }
+}
+
+// Writes the lanes of a, b, c and d to ptr in interleaved order (a0 b0 c0 d0 a1 ...), 4*n values in total.
+template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
+                                                            const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
+                                                            const v_reg<_Tp, n>& d)
+{
+    for( int k = 0; k < n; ++k )
+    {
+        ptr[4*k] = a.s[k];
+        ptr[4*k+1] = b.s[k];
+        ptr[4*k+2] = c.s[k];
+        ptr[4*k+3] = d.s[k];
+    }
+}
+
+// Writes all n lanes of a to ptr[0..n-1].
+template<typename _Tp, int n> inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a)
+{
+    for( int k = 0; k < n; ++k )
+        ptr[k] = a.s[k];
+}
+
+// Writes the low n/2 lanes of a to ptr[0..n/2-1].
+template<typename _Tp, int n> inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a)
+{
+    for( int k = 0; k < (n/2); ++k )
+        ptr[k] = a.s[k];
+}
+
+// Writes the high n/2 lanes of a to ptr[0..n/2-1].
+template<typename _Tp, int n> inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a)
+{
+    for( int k = 0; k < (n/2); ++k )
+        ptr[k] = a.s[k+(n/2)];
+}
+
+// Same element-wise store as v_store; this implementation does not check ptr's alignment.
+template<typename _Tp, int n> inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
+{
+    for( int k = 0; k < n; ++k )
+        ptr[k] = a.s[k];
+}
+
+// Result = (low half of a, low half of b). The "return c;" was missing, so the function returned garbage.
+template<typename _Tp, int n> inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    v_reg<_Tp, n> c;
+    for( int i = 0; i < (n/2); i++ )
+        c.s[i] = a.s[i];
+    for( int i = 0; i < (n/2); i++ )
+        c.s[i+(n/2)] = b.s[i];
+    return c;
+}
+
+// Result = (high half of a, high half of b). The "return c;" was missing, so the function returned garbage.
+template<typename _Tp, int n> inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    v_reg<_Tp, n> c;
+    for( int i = 0; i < (n/2); i++ )
+        c.s[i] = a.s[i+(n/2)];
+    for( int i = 0; i < (n/2); i++ )
+        c.s[i+(n/2)] = b.s[i+(n/2)];
+    return c;
+}
+
+// low = (low half of a, low half of b); high = (high half of a, high half of b).
+template<typename _Tp, int n> inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
+                                                      v_reg<_Tp, n>& low, v_reg<_Tp, n>& high)
+{
+    for( int k = 0; k < (n/2); ++k )
+    {
+        low.s[k] = a.s[k];
+        low.s[k+(n/2)] = b.s[k];
+        high.s[k] = a.s[k+(n/2)];
+        high.s[k+(n/2)] = b.s[k+(n/2)];
+    }
+}
+
+// float -> int per lane using cvRound.
+template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
+{
+    v_reg<int, n> res;
+    for( int k = 0; k < n; ++k ) res.s[k] = cvRound(a.s[k]);
+    return res;
+}
+
+// float -> int per lane using cvFloor.
+template<int n> inline v_reg<int, n> v_floor(const v_reg<float, n>& a)
+{
+    v_reg<int, n> res;
+    for( int k = 0; k < n; ++k ) res.s[k] = cvFloor(a.s[k]);
+    return res;
+}
+
+// float -> int per lane using cvCeil.
+template<int n> inline v_reg<int, n> v_ceil(const v_reg<float, n>& a)
+{
+    v_reg<int, n> res;
+    for( int k = 0; k < n; ++k ) res.s[k] = cvCeil(a.s[k]);
+    return res;
+}
+
+// float -> int per lane using a plain (int) conversion, i.e. truncation toward zero.
+template<int n> inline v_reg<int, n> v_trunc(const v_reg<float, n>& a)
+{
+    v_reg<int, n> res;
+    for( int k = 0; k < n; ++k ) res.s[k] = (int)(a.s[k]);
+    return res;
+}
+
+// Rounds n doubles with cvRound; the result has 2n int lanes, the upper n of which are set to 0.
+template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a)
+{
+    v_reg<int, n*2> res;
+    for( int k = 0; k < n; ++k )
+        res.s[k] = cvRound(a.s[k]);
+    for( int k = n; k < n*2; ++k )
+        res.s[k] = 0;
+    return res;
+}
+
+// cvFloor of n doubles into the low n of 2n int lanes; upper lanes are 0. c had only n lanes, so c.s[i+n] overran it.
+template<int n> inline v_reg<int, n*2> v_floor(const v_reg<double, n>& a)
+{
+    v_reg<int, n*2> c;
+    for( int i = 0; i < n; i++ )
+        c.s[i] = cvFloor(a.s[i]);
+    for( int i = n; i < n*2; i++ )
+        c.s[i] = 0;
+    return c;
+}
+
+// cvCeil of n doubles into the low n of 2n int lanes; upper lanes are 0. c had only n lanes, so c.s[i+n] overran it.
+template<int n> inline v_reg<int, n*2> v_ceil(const v_reg<double, n>& a)
+{
+    v_reg<int, n*2> c;
+    for( int i = 0; i < n; i++ )
+        c.s[i] = cvCeil(a.s[i]);
+    for( int i = n; i < n*2; i++ )
+        c.s[i] = 0;
+    return c;
+}
+
+// Truncates n doubles toward zero (was cvCeil) into the low n of 2n int lanes; c is now 2n wide (was n, overrun).
+template<int n> inline v_reg<int, n*2> v_trunc(const v_reg<double, n>& a)
+{
+    v_reg<int, n*2> c;
+    for( int i = 0; i < n; i++ )
+        c.s[i] = (int)(a.s[i]);
+    for( int i = n; i < n*2; i++ )
+        c.s[i] = 0;
+    return c;
+}
+
+// int -> float conversion of every lane.
+template<int n> inline v_reg<float, n> v_cvt_f32(const v_reg<int, n>& a)
+{
+    v_reg<float, n> res;
+    for( int k = 0; k < n; ++k ) res.s[k] = (float)a.s[k];
+    return res;
+}
+
+// Converts the low n int lanes to double. NOTE(review): n is not deducible from n*2; callers presumably pass it explicitly.
+template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<int, n*2>& a)
+{
+    v_reg<double, n> res;
+    for( int k = 0; k < n; ++k ) res.s[k] = (double)a.s[k];
+    return res;
+}
+
+// Converts the low n float lanes to double. NOTE(review): n is not deducible from n*2; callers presumably pass it explicitly.
+template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<float, n*2>& a)
+{
+    v_reg<double, n> res;
+    for( int k = 0; k < n; ++k ) res.s[k] = (double)a.s[k];
+    return res;
+}
+
+// 4x4 transpose: bK = lane K of a0..a3. a0 is now const like a1..a3; outputs must not alias inputs (b0 is written first).
+template<typename _Tp> inline void v_transpose4x4( const v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1,
+                            const v_reg<_Tp, 4>& a2, const v_reg<_Tp, 4>& a3,
+                            v_reg<_Tp, 4>& b0, v_reg<_Tp, 4>& b1,
+                            v_reg<_Tp, 4>& b2, v_reg<_Tp, 4>& b3 )
+{
+    b0 = v_reg<_Tp, 4>(a0.s[0], a1.s[0], a2.s[0], a3.s[0]);
+    b1 = v_reg<_Tp, 4>(a0.s[1], a1.s[1], a2.s[1], a3.s[1]);
+    b2 = v_reg<_Tp, 4>(a0.s[2], a1.s[2], a2.s[2], a3.s[2]);
+    b3 = v_reg<_Tp, 4>(a0.s[3], a1.s[3], a2.s[3], a3.s[3]);
+}
+
+typedef v_reg<uchar, 16> v_uint8x16;   // register aliases, named <lane type><bits>x<lane count>
+typedef v_reg<schar, 16> v_int8x16;
+typedef v_reg<ushort, 8> v_uint16x8;
+typedef v_reg<short, 8> v_int16x8;
+typedef v_reg<unsigned, 4> v_uint32x4;
+typedef v_reg<int, 4> v_int32x4;
+typedef v_reg<float, 4> v_float32x4;
+typedef v_reg<float, 8> v_float32x8;   // NOTE(review): has twice the lanes of v_float32x4 and gets no C_INIT helpers below
+typedef v_reg<double, 2> v_float64x2;
+typedef v_reg<uint64, 2> v_uint64x2;
+typedef v_reg<int64, 2> v_int64x2;
+
+// Generates v_setzero_<suffix>(), v_setall_<suffix>(val) and v_reinterpret_as_<suffix>(reg) for one register alias.
+#define OPENCV_HAL_IMPL_C_INIT(_Tpvec, _Tp, suffix) \
+inline _Tpvec v_setzero_##suffix() { return _Tpvec::zero(); } \
+inline _Tpvec v_setall_##suffix(_Tp val) { return _Tpvec::all(val); } \
+template<typename _Tp0, int n0> inline _Tpvec \
+    v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \
+{ return a.template reinterpret_as<_Tp, _Tpvec::nlanes>(); } // reinterpret_as() takes no argument; "(a)" was passed before
+OPENCV_HAL_IMPL_C_INIT(v_uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_C_INIT(v_int8x16, schar, s8)
+OPENCV_HAL_IMPL_C_INIT(v_uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_C_INIT(v_int16x8, short, s16)
+OPENCV_HAL_IMPL_C_INIT(v_uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_C_INIT(v_int32x4, int, s32)
+OPENCV_HAL_IMPL_C_INIT(v_float32x4, float, f32)
+OPENCV_HAL_IMPL_C_INIT(v_float64x2, double, f64)
+OPENCV_HAL_IMPL_C_INIT(v_uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_C_INIT(v_int64x2, int64, s64) // was v_uint64x2, which does not match the int64 lane type
+
+// Generates compile-time-count shifts v_shr<n>, v_shl<n> and v_rshr<n>, which adds 1 << (n-1) before shifting right.
+#define OPENCV_HAL_IMPL_C_SHIFT(_Tpvec, _Tp) \
+template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
+{ return a >> n; } \
+template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
+{ return a << n; } \
+template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
+{ \
+    _Tpvec res; \
+    for( int k = 0; k < _Tpvec::nlanes; ++k ) \
+        res.s[k] = (_Tp)((a.s[k] + ((_Tp)1 << (n - 1))) >> n); \
+    return res; \
+}
+OPENCV_HAL_IMPL_C_SHIFT(v_uint16x8, ushort)
+OPENCV_HAL_IMPL_C_SHIFT(v_int16x8, short)
+OPENCV_HAL_IMPL_C_SHIFT(v_uint32x4, unsigned)
+OPENCV_HAL_IMPL_C_SHIFT(v_int32x4, int)
+OPENCV_HAL_IMPL_C_SHIFT(v_uint64x2, uint64)
+OPENCV_HAL_IMPL_C_SHIFT(v_int64x2, int64)
+
+
+// Generates the narrowing helpers for one (wide, narrow) register pair: pack, rounding-shift pack, and their store forms.
+#define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \
+inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    _Tpnvec res; \
+    for( int k = 0; k < _Tpvec::nlanes; ++k ) \
+    { \
+        res.s[k] = saturate_cast<_Tpn>(a.s[k]); \
+        res.s[k+_Tpvec::nlanes] = saturate_cast<_Tpn>(b.s[k]); \
+    } \
+    return res; \
+} \
+template<int n> inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    _Tpnvec res; \
+    for( int k = 0; k < _Tpvec::nlanes; ++k ) \
+    { \
+        res.s[k] = saturate_cast<_Tpn>((a.s[k] + ((_Tp)1 << (n - 1))) >> n); \
+        res.s[k+_Tpvec::nlanes] = saturate_cast<_Tpn>((b.s[k] + ((_Tp)1 << (n - 1))) >> n); \
+    } \
+    return res; \
+} \
+inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
+{ \
+    for( int k = 0; k < _Tpvec::nlanes; ++k ) \
+        ptr[k] = saturate_cast<_Tpn>(a.s[k]); \
+} \
+template<int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
+{ \
+    for( int k = 0; k < _Tpvec::nlanes; ++k ) \
+        ptr[k] = saturate_cast<_Tpn>((a.s[k] + ((_Tp)1 << (n - 1))) >> n); \
+}
+OPENCV_HAL_IMPL_C_PACK(v_uint16x8, ushort, v_uint8x16, uchar, pack)
+OPENCV_HAL_IMPL_C_PACK(v_int16x8, short, v_int8x16, schar, pack)
+OPENCV_HAL_IMPL_C_PACK(v_int16x8, short, v_uint8x16, uchar, pack_u)
+OPENCV_HAL_IMPL_C_PACK(v_uint32x4, unsigned, v_uint16x8, ushort, pack)
+OPENCV_HAL_IMPL_C_PACK(v_int32x4, int, v_int16x8, short, pack)
+OPENCV_HAL_IMPL_C_PACK(v_int32x4, int, v_uint16x8, ushort, pack_u)
+OPENCV_HAL_IMPL_C_PACK(v_uint64x2, uint64, v_uint32x4, unsigned, pack)
+OPENCV_HAL_IMPL_C_PACK(v_int64x2, int64, v_int32x4, int, pack)
+
+// Linear combination of four registers: result = v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3, lane by lane.
+inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, const v_float32x4& m1,
+                            const v_float32x4& m2, const v_float32x4& m3)
+{
+    v_float32x4 res;
+    for( int k = 0; k < 4; ++k )
+        res.s[k] = v.s[0]*m0.s[k] + v.s[1]*m1.s[k] + v.s[2]*m2.s[k] + v.s[3]*m3.s[k];
+    return res;
+}
+
+}
+
+#endif
diff --git a/modules/hal/include/opencv2/hal/intrin_neon.hpp b/modules/hal/include/opencv2/hal/intrin_neon.hpp
new file mode 100644
index 0000000000..ab6aa86315
--- /dev/null
+++ b/modules/hal/include/opencv2/hal/intrin_neon.hpp
@@ -0,0 +1,823 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_HAL_INTRIN_NEON_HPP__
+#define __OPENCV_HAL_INTRIN_NEON_HPP__
+
+namespace cv
+{
+
+#define CV_SIMD128 1
+
+struct v_uint8x16
+{
+    typedef uchar lane_type; // scalar type held in each lane
+    enum { nlanes = 16 }; // lane count
+
+    v_uint8x16() {} // val is left uninitialized
+    explicit v_uint8x16(uint8x16_t reg) : val(reg) {} // wrap an existing NEON register
+    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
+               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
+    {
+        const uchar lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
+        val = vld1q_u8(lanes); // v0 ends up in lane 0
+    }
+    uchar get0() const // value of lane 0
+    {
+        return vgetq_lane_u8(val, 0);
+    }
+
+    uint8x16_t val;
+};
+
+struct v_int8x16
+{
+    typedef schar lane_type; // scalar type held in each lane
+    enum { nlanes = 16 }; // lane count
+
+    v_int8x16() {} // val is left uninitialized
+    explicit v_int8x16(int8x16_t reg) : val(reg) {} // wrap an existing NEON register
+    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
+               schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
+    {
+        const schar lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
+        val = vld1q_s8(lanes); // v0 ends up in lane 0
+    }
+    schar get0() const // value of lane 0
+    {
+        return vgetq_lane_s8(val, 0);
+    }
+
+    int8x16_t val;
+};
+
+struct v_uint16x8
+{
+    typedef ushort lane_type; // scalar type held in each lane
+    enum { nlanes = 8 }; // lane count
+
+    v_uint16x8() {} // val is left uninitialized
+    explicit v_uint16x8(uint16x8_t reg) : val(reg) {} // wrap an existing NEON register
+    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
+    {
+        const ushort lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        val = vld1q_u16(lanes); // v0 ends up in lane 0
+    }
+    ushort get0() const // value of lane 0
+    {
+        return vgetq_lane_u16(val, 0);
+    }
+
+    uint16x8_t val;
+};
+
+struct v_int16x8
+{
+    typedef short lane_type; // scalar type held in each lane
+    enum { nlanes = 8 }; // lane count
+
+    v_int16x8() {} // val is left uninitialized
+    explicit v_int16x8(int16x8_t reg) : val(reg) {} // wrap an existing NEON register
+    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
+    {
+        const short lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        val = vld1q_s16(lanes); // v0 ends up in lane 0
+    }
+    short get0() const // value of lane 0
+    {
+        return vgetq_lane_s16(val, 0);
+    }
+
+    int16x8_t val;
+};
+
+struct v_uint32x4
+{
+    typedef unsigned lane_type; // scalar type held in each lane
+    enum { nlanes = 4 }; // lane count
+
+    v_uint32x4() {} // val is left uninitialized
+    explicit v_uint32x4(uint32x4_t reg) : val(reg) {} // wrap an existing NEON register
+    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
+    {
+        const unsigned lanes[] = {v0, v1, v2, v3};
+        val = vld1q_u32(lanes); // v0 ends up in lane 0
+    }
+    unsigned get0() const // value of lane 0
+    {
+        return vgetq_lane_u32(val, 0);
+    }
+
+    uint32x4_t val;
+};
+
+struct v_int32x4
+{
+    typedef int lane_type; // scalar type held in each lane
+    enum { nlanes = 4 }; // lane count
+
+    v_int32x4() {} // val is left uninitialized
+    explicit v_int32x4(int32x4_t reg) : val(reg) {} // wrap an existing NEON register
+    v_int32x4(int v0, int v1, int v2, int v3)
+    {
+        const int lanes[] = {v0, v1, v2, v3};
+        val = vld1q_s32(lanes); // v0 ends up in lane 0
+    }
+    int get0() const // value of lane 0
+    {
+        return vgetq_lane_s32(val, 0);
+    }
+    int32x4_t val;
+};
+
+struct v_float32x4
+{
+    typedef float lane_type; // scalar type held in each lane
+    enum { nlanes = 4 }; // lane count
+
+    v_float32x4() {} // val is left uninitialized
+    explicit v_float32x4(float32x4_t reg) : val(reg) {} // wrap an existing NEON register
+    v_float32x4(float v0, float v1, float v2, float v3)
+    {
+        const float lanes[] = {v0, v1, v2, v3};
+        val = vld1q_f32(lanes); // v0 ends up in lane 0
+    }
+    float get0() const // value of lane 0
+    {
+        return vgetq_lane_f32(val, 0);
+    }
+    float32x4_t val;
+};
+
+struct v_uint64x2
+{
+    typedef uint64 lane_type; // scalar type held in each lane
+    enum { nlanes = 2 }; // lane count
+
+    v_uint64x2() {} // val is left uninitialized
+    explicit v_uint64x2(uint64x2_t v) : val(v) {} // wrap an existing NEON register
+    v_uint64x2(uint64 v0, uint64 v1) // full-width lane values; 32-bit parameters would truncate them
+    {
+        uint64 v[] = {v0, v1};
+        val = vld1q_u64(v); // v0 ends up in lane 0
+    }
+    uint64 get0() const // value of lane 0
+    {
+        return vgetq_lane_u64(val, 0);
+    }
+    uint64x2_t val;
+};
+
+struct v_int64x2
+{
+    typedef int64 lane_type; // scalar type held in each lane
+    enum { nlanes = 2 }; // lane count
+
+    v_int64x2() {} // val is left uninitialized
+    explicit v_int64x2(int64x2_t v) : val(v) {} // wrap an existing NEON register
+    v_int64x2(int64 v0, int64 v1) // full-width lane values; 32-bit parameters would truncate them
+    {
+        int64 v[] = {v0, v1};
+        val = vld1q_s64(v); // v0 ends up in lane 0
+    }
+    int64 get0() const // value of lane 0
+    {
+        return vgetq_lane_s64(val, 0);
+    }
+    int64x2_t val;
+};
+
+#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) /* generates v_setzero_, v_setall_ and the v_reinterpret_as_ family for v_<_Tpv> */ \
+inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
+inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } /* broadcast v to every lane */ \
+inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } /* identity overload: gives the same-type v_reinterpret_as_ line below something to call */ \
+inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \
+inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \
+inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(vreinterpretq_u16_##suffix(v.val)); } \
+inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpretq_s16_##suffix(v.val)); } \
+inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(vreinterpretq_u32_##suffix(v.val)); } \
+inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(vreinterpretq_s32_##suffix(v.val)); } \
+inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(vreinterpretq_u64_##suffix(v.val)); } \
+inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(vreinterpretq_s64_##suffix(v.val)); } \
+inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(vreinterpretq_f32_##suffix(v.val)); }
+
+OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, u8) // arguments: register type stem, lane scalar type, intrinsic suffix
+OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, s8)
+OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, s16)
+OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32)
+OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, s64)
+OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32)
+
+#define OPENCV_HAL_IMPL_NEON_PACK(_Tpvec, _Tp, hreg, suffix, _Tpwvec, wsuffix, pack, op) /* narrowing packs from _Tpwvec to _Tpvec; hreg is the 64-bit half-register type */ \
+inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
+{ \
+    hreg a1 = vqmov##op##_##wsuffix(a.val), b1 = vqmov##op##_##wsuffix(b.val); /* saturating narrow of each input */ \
+    return _Tpvec(vcombine_##suffix(a1, b1)); /* a fills the low half, b the high half */ \
+} \
+inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) /* narrows a and writes one half-register to ptr */ \
+{ \
+    hreg a1 = vqmov##op##_##wsuffix(a.val); \
+    vst1_##suffix(ptr, a1); \
+} \
+template<int n> inline \
+_Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) /* like v_pack, after a rounding right shift by the compile-time n */ \
+{ \
+    hreg a1 = vqrshr##op##_n_##wsuffix(a.val, n); \
+    hreg b1 = vqrshr##op##_n_##wsuffix(b.val, n); \
+    return _Tpvec(vcombine_##suffix(a1, b1)); \
+} \
+template<int n> inline \
+void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) /* like v_pack_store, after a rounding right shift by n */ \
+{ \
+    hreg a1 = vqrshr##op##_n_##wsuffix(a.val, n); \
+    vst1_##suffix(ptr, a1); \
+}
+
+OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_uint16x8, u16, pack, n) // op = n selects vqmovn / vqrshrn
+OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, s16, pack_u, un) // op = un selects vqmovun / vqrshrun (signed -> unsigned)
+OPENCV_HAL_IMPL_NEON_PACK(v_int8x16, schar, int8x8_t, s8, v_int16x8, s16, pack, n)
+OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, u32, pack, n)
+OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, s32, pack_u, un)
+OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, s32, pack, n)
+OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, u64, pack, n)
+OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, s64, pack, n)
+
+inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
+                            const v_float32x4& m1, const v_float32x4& m2,
+                            const v_float32x4& m3)
+{
+    const float32x2_t v01 = vget_low_f32(v.val), v23 = vget_high_f32(v.val);
+    float32x4_t acc = vmulq_lane_f32(m0.val, v01, 0); // v[0] * m0
+    acc = vmlaq_lane_f32(acc, m1.val, v01, 1);        // + v[1] * m1
+    acc = vmlaq_lane_f32(acc, m2.val, v23, 0);        // + v[2] * m2
+    acc = vmlaq_lane_f32(acc, m3.val, v23, 1);        // + v[3] * m3
+    return v_float32x4(acc);
+}
+
+#define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) /* defines operator bin_op and its compound-assignment form via intrin */ \
+inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val)); \
+} \
+inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
+{ \
+    a.val = intrin(a.val, b.val); \
+    return a; \
+}
+
+OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint8x16, vqaddq_u8) // 8- and 16-bit + and - use the saturating vqadd/vqsub forms
+OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint8x16, vqsubq_u8)
+OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8)
+OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8)
+OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16)
+OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16)
+OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint16x8, vmulq_u16) // multiplication is the plain (non-saturating) vmulq
+OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16)
+OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16)
+OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int16x8, vmulq_s16)
+OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32) // 32- and 64-bit integers use the plain vadd/vsub forms
+OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float32x4, vaddq_f32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float32x4, vsubq_f32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float32x4, vmulq_f32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int64x2, vaddq_s64)
+OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int64x2, vsubq_s64)
+OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint64x2, vaddq_u64)
+OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint64x2, vsubq_u64)
+
+inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
+{
+    float32x4_t inv_b = vrecpeq_f32(b.val); // hardware estimate of 1/b
+    for( int step = 0; step < 2; step++ ) // two refinement steps on the estimate
+        inv_b = vmulq_f32(vrecpsq_f32(b.val, inv_b), inv_b);
+    return v_float32x4(vmulq_f32(a.val, inv_b)); // a * (1/b)
+}
+inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
+{
+    float32x4_t inv_b = vrecpeq_f32(b.val); // hardware estimate of 1/b
+    for( int step = 0; step < 2; step++ ) // two refinement steps on the estimate
+        inv_b = vmulq_f32(vrecpsq_f32(b.val, inv_b), inv_b);
+    a.val = vmulq_f32(a.val, inv_b); // a * (1/b), stored back into a
+    return a;
+}
+
+inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
+                         v_int32x4& c, v_int32x4& d)
+{
+    c = v_int32x4(vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val)));   // widened products of lanes 0..3
+    d = v_int32x4(vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val))); // widened products of lanes 4..7
+}
+
+inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
+                         v_uint32x4& c, v_uint32x4& d)
+{
+    c = v_uint32x4(vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val)));   // widened products of lanes 0..3
+    d = v_uint32x4(vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val))); // widened products of lanes 4..7
+}
+
+inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
+                         v_uint64x2& c, v_uint64x2& d)
+{
+    c = v_uint64x2(vmull_u32(vget_low_u32(a.val), vget_low_u32(b.val)));   // widened products of lanes 0..1
+    d = v_uint64x2(vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val))); // widened products of lanes 2..3
+}
+
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) // out[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1]
+{
+    int32x4_t c = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val)); // widened products of lanes 0..3
+    int32x4_t d = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val)); // widened products of lanes 4..7
+    int32x4x2_t cd = vuzpq_s32(c, d); // de-interleave: val[0] = {c0,c2,d0,d2}, val[1] = {c1,c3,d1,d3}; vtrnq would order the sums {c0+c1, d0+d1, c2+c3, d2+d3}
+    return v_int32x4(vaddq_s32(cd.val[0], cd.val[1])); // {c0+c1, c2+c3, d0+d1, d2+d3}
+}
+
+#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) /* bitwise &, |, ^ (with compound forms) and ~ for an integer vector type */ \
+    OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
+    OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
+    OPENCV_HAL_IMPL_NEON_BIN_OP(^, _Tpvec, veorq_##suffix) \
+    inline _Tpvec operator ~ (const _Tpvec& a) \
+    { \
+        return _Tpvec(vreinterpretq_##suffix##_u8(vmvnq_u8(vreinterpretq_u8_##suffix(a.val)))); /* NOT is done on the u8 view, so one form serves every lane width */ \
+    }
+
+OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint8x16, u8)
+OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int8x16, s8)
+OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint16x8, u16)
+OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int16x8, s16)
+OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint32x4, u32)
+OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int32x4, s32)
+OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64)
+OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64)
+
+#define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) /* bitwise operator on v_float32x4, applied to the s32 view of the raw bits */ \
+inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
+{ \
+    return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \
+} \
+inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
+{ \
+    a.val = vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val))); \
+    return a; \
+}
+
+OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(&, vandq_s32)
+OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(|, vorrq_s32)
+OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(^, veorq_s32)
+
+inline v_float32x4 operator ~ (const v_float32x4& a)
+{
+    return v_float32x4(vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(a.val)))); // flips every bit of the raw float pattern
+}
+
+inline v_float32x4 v_sqrt(const v_float32x4& x)
+{
+    const float32x4_t clamped = vmaxq_f32(x.val, vdupq_n_f32(FLT_MIN)); // the estimate never sees a value below FLT_MIN
+    float32x4_t rsqrt = vrsqrteq_f32(clamped); // hardware estimate of 1/sqrt
+    for( int step = 0; step < 2; step++ ) // two refinement steps on the estimate
+        rsqrt = vmulq_f32(vrsqrtsq_f32(vmulq_f32(clamped, rsqrt), rsqrt), rsqrt);
+    return v_float32x4(vmulq_f32(x.val, rsqrt)); // x * (1/sqrt(x)), using the unclamped x
+}
+
+inline v_float32x4 v_invsqrt(const v_float32x4& x)
+{
+    float32x4_t rsqrt = vrsqrteq_f32(x.val); // hardware estimate of 1/sqrt(x)
+    for( int step = 0; step < 2; step++ ) // two refinement steps on the estimate
+        rsqrt = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, rsqrt), rsqrt), rsqrt);
+    return v_float32x4(rsqrt);
+}
+
+inline v_float32x4 v_abs(v_float32x4 x)
+{ x.val = vabsq_f32(x.val); return x; } // x is a by-value copy, so it is updated in place and returned
+
+// TODO: exp, log, sin, cos
+
+#define OPENCV_HAL_IMPL_NEON_BIN_FUNC(_Tpvec, func, intrin) /* wraps a two-operand intrinsic as a named function func(a, b) */ \
+inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val)); \
+}
+
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_min, vminq_u8) // per-lane v_min / v_max for each supported type
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_max, vmaxq_u8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_min, vminq_s8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_max, vmaxq_s8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_min, vminq_u16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_max, vmaxq_u16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_min, vminq_s16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_max, vmaxq_s16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_min, vminq_u32)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_max, vmaxq_u32)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_min, vminq_s32)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_max, vmaxq_s32)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_min, vminq_f32)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_max, vmaxq_f32)
+
+
+#define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) /* six comparison operators; cast turns the unsigned result of the compare intrinsic back into _Tpvec's register type */ \
+inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \
+inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } /* != is the bitwise NOT of the == result */ \
+inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \
+inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \
+inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \
+inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); }
+
+OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8) // unsigned types pass OPENCV_HAL_NOP as the cast
+OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, vreinterpretq_s8_u8, s8, u8)
+OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16)
+OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16)
+OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32)
+OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32)
+OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32) // the float vector reuses this macro despite its INT name
+
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8) // v_add_wrap / v_sub_wrap use plain vadd/vsub, unlike the saturating + and - operators
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_add_wrap, vaddq_s16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16)
+
+// TODO: absdiff for signed integers
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8) // v_absdiff exists only for unsigned integers and float so far
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32)
+
+inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
+{
+    return v_sqrt(v_float32x4(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val))); // sqrt(a*a + b*b)
+}
+
+inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
+{
+    const float32x4_t a_sq = vmulq_f32(a.val, a.val); // a*a
+    return v_float32x4(vmlaq_f32(a_sq, b.val, b.val)); // a*a + b*b
+}
+
+inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
+{
+    return v_float32x4(vmlaq_f32(c.val, a.val, b.val)); // c + a*b
+}
+
+// trade efficiency for convenience
+#define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) /* run-time << and >> plus compile-time v_shl, v_shr, v_rshr; _Tps/ssuffix name the signed type of the shift-count vector */ \
+inline _Tpvec operator << (const _Tpvec& a, int n) \
+{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \
+inline _Tpvec operator >> (const _Tpvec& a, int n) \
+{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } /* right shift is vshlq with a negated count */ \
+template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
+{ return _Tpvec(vshlq_n_##suffix(a.val, n)); } \
+template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
+{ return _Tpvec(vshrq_n_##suffix(a.val, n)); } \
+template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
+{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); } /* rounding variant of v_shr */
+
+OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8)
+OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8)
+OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint16x8, u16, short, s16)
+OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int16x8, s16, short, s16)
+OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint32x4, u32, int, s32)
+OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, s32, int, s32)
+OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint64x2, u64, int64, s64)
+OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64)
+
+#define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) /* full-register and half-register loads and stores for _Tpvec */ \
+inline _Tpvec v_load(const _Tp* ptr) \
+{ return _Tpvec(vld1q_##suffix(ptr)); } \
+inline _Tpvec v_load_aligned(const _Tp* ptr) \
+{ return _Tpvec(vld1q_##suffix(ptr)); } /* same intrinsic as v_load */ \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
+{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } /* ptr0 fills the low half, ptr1 the high half */ \
+inline void v_store(_Tp* ptr, const _Tpvec& a) \
+{ vst1q_##suffix(ptr, a.val); } \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
+{ vst1q_##suffix(ptr, a.val); } /* same intrinsic as v_store */ \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
+{ vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
+{ vst1_##suffix(ptr, vget_high_##suffix(a.val)); }
+
+OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, s8)
+OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16)
+OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32)
+OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
+
+#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) /* v_reduce_<func>: folds the 4 lanes into one scalar with scalar_func */ \
+inline scalartype v_reduce_##func(const _Tpvec& a) \
+{ \
+    scalartype CV_DECL_ALIGNED(16) buf[4]; \
+    v_store_aligned(buf, a); /* spill the lanes to memory and reduce them in scalar code */ \
+    scalartype s0 = scalar_func(buf[0], buf[1]); \
+    scalartype s1 = scalar_func(buf[2], buf[3]); \
+    return scalar_func(s0, s1); \
+}
+
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, sum, OPENCV_HAL_ADD)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, sum, OPENCV_HAL_ADD)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, max, std::max)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, min, std::min)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, sum, OPENCV_HAL_ADD)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, max, std::max)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, min, std::min)
+
+inline int v_signmask(const v_uint8x16& a)
+{
+    const int8x8_t lane_shift = vcreate_s8(CV_BIG_UINT(0x0706050403020100)); // byte i of the constant is the shift count i
+    const uint8x16_t bits = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(lane_shift, lane_shift)); // top bit of each lane moved to bit (lane % 8)
+    const uint64x2_t halves = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(bits))); // one 8-bit mask per 64-bit half
+    return (int)vgetq_lane_u64(halves, 0) | ((int)vgetq_lane_u64(halves, 1) << 8); // the two 8-bit fields never overlap
+}
+inline int v_signmask(const v_int8x16& a)
+{ const v_uint8x16 bits = v_reinterpret_as_u8(a); return v_signmask(bits); }
+
+inline int v_signmask(const v_uint16x8& a)
+{
+    const int16x4_t lane_shift = vcreate_s16(CV_BIG_UINT(0x0003000200010000)); // 16-bit field i of the constant is the shift count i
+    const uint16x8_t bits = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(lane_shift, lane_shift)); // top bit of each lane moved to bit (lane % 4)
+    const uint64x2_t halves = vpaddlq_u32(vpaddlq_u16(bits)); // one 4-bit mask per 64-bit half
+    return (int)vgetq_lane_u64(halves, 0) | ((int)vgetq_lane_u64(halves, 1) << 4); // the two 4-bit fields never overlap
+}
+inline int v_signmask(const v_int16x8& a)
+{ const v_uint16x8 bits = v_reinterpret_as_u16(a); return v_signmask(bits); }
+
+inline int v_signmask(const v_uint32x4& a)
+{
+    const int32x2_t lane_shift = vcreate_s32(CV_BIG_UINT(0x0000000100000000)); // shift counts 0 and 1
+    const uint32x4_t bits = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(lane_shift, lane_shift)); // top bit of each lane moved to bit (lane % 2)
+    const uint64x2_t halves = vpaddlq_u32(bits); // one 2-bit mask per 64-bit half
+    return (int)vgetq_lane_u64(halves, 0) | ((int)vgetq_lane_u64(halves, 1) << 2); // the two 2-bit fields never overlap
+}
+inline int v_signmask(const v_int32x4& a)
+{ const v_uint32x4 bits = v_reinterpret_as_u32(a); return v_signmask(bits); }
+inline int v_signmask(const v_float32x4& a)
+{ const v_uint32x4 bits = v_reinterpret_as_u32(a); return v_signmask(bits); }
+
+#define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) /* v_check_all / v_check_any test the top bit of every lane; shift is lane width minus one */ \
+inline bool v_check_all(const v_##_Tpvec& a) \
+{ \
+    _Tpvec##_t v0 = vshrq_n_##suffix(vmvnq_##suffix(a.val), shift); /* per lane: 1 where the top bit of a is clear */ \
+    uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
+    return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) == 0; /* true when no lane had a clear top bit */ \
+} \
+inline bool v_check_any(const v_##_Tpvec& a) \
+{ \
+    _Tpvec##_t v0 = vshrq_n_##suffix(a.val, shift); /* per lane: 1 where the top bit of a is set */ \
+    uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
+    return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; /* true when at least one lane had its top bit set */ \
+}
+
+OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7)
+OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15)
+OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint32x4, u32, 31)
+
+inline bool v_check_all(const v_int8x16& a)
+{ const v_uint8x16 bits = v_reinterpret_as_u8(a); return v_check_all(bits); }
+inline bool v_check_all(const v_int16x8& a)
+{ const v_uint16x8 bits = v_reinterpret_as_u16(a); return v_check_all(bits); }
+inline bool v_check_all(const v_int32x4& a)
+{ const v_uint32x4 bits = v_reinterpret_as_u32(a); return v_check_all(bits); }
+inline bool v_check_all(const v_float32x4& a)
+{ const v_uint32x4 bits = v_reinterpret_as_u32(a); return v_check_all(bits); }
+
+inline bool v_check_any(const v_int8x16& a) // true when at least one lane has its top bit set
+{ return v_check_any(v_reinterpret_as_u8(a)); } // must forward to the unsigned v_check_any, not v_check_all
+inline bool v_check_any(const v_int16x8& a) // true when at least one lane has its top bit set
+{ return v_check_any(v_reinterpret_as_u16(a)); }
+inline bool v_check_any(const v_int32x4& a) // true when at least one lane has its top bit set
+{ return v_check_any(v_reinterpret_as_u32(a)); }
+inline bool v_check_any(const v_float32x4& a) // true when at least one lane has its sign bit set
+{ return v_check_any(v_reinterpret_as_u32(a)); }
+
+#define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) /* v_select: bitwise select between a and b under mask */ \
+inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(vbslq_##suffix(vreinterpretq_##usuffix##_##suffix(mask.val), a.val, b.val)); /* vbslq takes an unsigned mask: bits of a where it is set, bits of b elsewhere */ \
+}
+
+OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16, u8, u8)
+OPENCV_HAL_IMPL_NEON_SELECT(v_int8x16, s8, u8)
+OPENCV_HAL_IMPL_NEON_SELECT(v_uint16x8, u16, u16)
+OPENCV_HAL_IMPL_NEON_SELECT(v_int16x8, s16, u16)
+OPENCV_HAL_IMPL_NEON_SELECT(v_uint32x4, u32, u32)
+OPENCV_HAL_IMPL_NEON_SELECT(v_int32x4, s32, u32)
+OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32)
+
+#define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) /* widening conversions from _Tpvec lanes to the twice-as-wide _Tpwvec lanes */ \
+inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
+{ \
+    b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); /* low half of a */ \
+    b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); /* high half of a */ \
+} \
+inline _Tpwvec v_load_expand(const _Tp* ptr) \
+{ \
+    return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); /* loads one half-register from ptr and widens it */ \
+}
+
+OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8)
+OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8)
+OPENCV_HAL_IMPL_NEON_EXPAND(v_uint16x8, v_uint32x4, ushort, u16)
+OPENCV_HAL_IMPL_NEON_EXPAND(v_int16x8, v_int32x4, short, s16)
+
+inline v_uint32x4 v_load_expand_q(const uchar* ptr)
+{
+    const uint8x8_t bytes = vcreate_u8(*(const unsigned*)ptr); // reads 4 bytes at ptr as one unsigned; the upper 4 lanes are zero
+    const uint16x8_t wide16 = vmovl_u8(bytes);
+    return v_uint32x4(vmovl_u16(vget_low_u16(wide16))); // only the 4 loaded lanes are widened to 32 bits
+}
+
+inline v_int32x4 v_load_expand_q(const schar* ptr)
+{
+    const int8x8_t bytes = vcreate_s8(*(const unsigned*)ptr); // reads 4 bytes at ptr as one unsigned; the upper 4 lanes are zero
+    const int16x8_t wide16 = vmovl_s8(bytes);
+    return v_int32x4(vmovl_s16(vget_low_s16(wide16))); // only the 4 loaded lanes are sign-extended to 32 bits
+}
+
+#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) /* v_zip, v_combine_low, v_combine_high and v_recombine for v_<_Tpvec> */ \
+inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
+{ \
+    _Tpvec##x2_t p = vzipq_##suffix(a0.val, a1.val); /* interleaves the lanes of a0 and a1 */ \
+    b0.val = p.val[0]; \
+    b1.val = p.val[1]; \
+} \
+inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
+{ \
+    return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); /* { low(a), low(b) } */ \
+} \
+inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
+{ \
+    return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); /* { high(a), high(b) } */ \
+} \
+inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
+{ \
+    c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); /* same as v_combine_low */ \
+    d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); /* same as v_combine_high */ \
+}
+
+OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16, u8)
+OPENCV_HAL_IMPL_NEON_UNPACKS(int8x16, s8)
+OPENCV_HAL_IMPL_NEON_UNPACKS(uint16x8, u16)
+OPENCV_HAL_IMPL_NEON_UNPACKS(int16x8, s16)
+OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32)
+OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32)
+OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
+
+inline v_int32x4 v_round(const v_float32x4& a) // adds 0.5 carrying a's sign, then truncates toward zero
+{
+    const int32x4_t v_sign = vreinterpretq_s32_u32(vdupq_n_u32(0x80000000u)), // sign-bit mask, built without shifting into the sign bit
+        v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f)); // plain locals: no static-initialization guard inside an inline header function
+
+    int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val))); // bit pattern of 0.5f with a's sign bit
+    return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition))));
+}
+
+inline v_int32x4 v_floor(const v_float32x4& a)
+{
+    const int32x4_t truncated = vcvtq_s32_f32(a.val); // float -> int conversion of each lane
+    const uint32x4_t overshoot = vcgtq_f32(vcvtq_f32_s32(truncated), a.val); // all-ones (-1) where the converted value is above a
+    return v_int32x4(vaddq_s32(truncated, vreinterpretq_s32_u32(overshoot))); // adding -1 steps those lanes down by one
+}
+
+inline v_int32x4 v_ceil(const v_float32x4& a)
+{
+    const int32x4_t truncated = vcvtq_s32_f32(a.val); // float -> int conversion of each lane
+    const uint32x4_t undershoot = vcgtq_f32(a.val, vcvtq_f32_s32(truncated)); // all-ones (-1) where the converted value is below a
+    return v_int32x4(vsubq_s32(truncated, vreinterpretq_s32_u32(undershoot))); // subtracting -1 steps those lanes up by one
+}
+
+inline v_int32x4 v_trunc(const v_float32x4& a)
+{ const int32x4_t converted = vcvtq_s32_f32(a.val); return v_int32x4(converted); }
+
+#define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) /* transposes the 4x4 matrix with rows a0..a3 into rows b0..b3 */ \
+inline void transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
+                         const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
+                         v_##_Tpvec& b0, v_##_Tpvec& b1, \
+                         v_##_Tpvec& b2, v_##_Tpvec& b3) \
+{ \
+    /* m00 m01 m02 m03 */ \
+    /* m10 m11 m12 m13 */ \
+    /* m20 m21 m22 m23 */ \
+    /* m30 m31 m32 m33 */ \
+    _Tpvec##x2_t t0 = vtrnq_##suffix(a0.val, a1.val); \
+    _Tpvec##x2_t t1 = vtrnq_##suffix(a2.val, a3.val); \
+    /* m00 m10 m02 m12 */ \
+    /* m01 m11 m03 m13 */ \
+    /* m20 m30 m22 m32 */ \
+    /* m21 m31 m23 m33 */ \
+    b0.val = vcombine_##suffix(vget_low_##suffix(t0.val[0]), vget_low_##suffix(t1.val[0])); /* m00 m10 m20 m30 */ \
+    b1.val = vcombine_##suffix(vget_low_##suffix(t0.val[1]), vget_low_##suffix(t1.val[1])); /* m01 m11 m21 m31 */ \
+    b2.val = vcombine_##suffix(vget_high_##suffix(t0.val[0]), vget_high_##suffix(t1.val[0])); /* m02 m12 m22 m32 */ \
+    b3.val = vcombine_##suffix(vget_high_##suffix(t0.val[1]), vget_high_##suffix(t1.val[1])); /* m03 m13 m23 m33 */ \
+}
+
+OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u32)
+OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
+OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
+
+#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) /* 3- and 4-channel de-interleaving loads and interleaving stores */ \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
+{ \
+    _Tpvec##x3_t v = vld3q_##suffix(ptr); /* vld3q splits 3-way interleaved data into three registers */ \
+    a.val = v.val[0]; \
+    b.val = v.val[1]; \
+    c.val = v.val[2]; \
+} \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
+                                v_##_Tpvec& c, v_##_Tpvec& d) \
+{ \
+    _Tpvec##x4_t v = vld4q_##suffix(ptr); /* 4-way variant */ \
+    a.val = v.val[0]; \
+    b.val = v.val[1]; \
+    c.val = v.val[2]; \
+    d.val = v.val[3]; \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, const v_##_Tpvec& c) \
+{ \
+    _Tpvec##x3_t v; \
+    v.val[0] = a.val; \
+    v.val[1] = b.val; \
+    v.val[2] = c.val; \
+    vst3q_##suffix(ptr, v); /* vst3q writes the three registers back 3-way interleaved */ \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
+                               const v_##_Tpvec& c, const v_##_Tpvec& d) \
+{ \
+    _Tpvec##x4_t v; \
+    v.val[0] = a.val; \
+    v.val[1] = b.val; \
+    v.val[2] = c.val; \
+    v.val[3] = d.val; \
+    vst4q_##suffix(ptr, v); /* 4-way variant */ \
+}
+
+OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8)
+OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16)
+OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32)
+OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32)
+
+inline v_float32x4 v_cvt_f32(const v_int32x4& a)
+{
+    const float32x4_t as_float = vcvtq_f32_s32(a.val); return v_float32x4(as_float); // per-lane int32 -> float32
+}
+
+}
+
+#endif
diff --git a/modules/hal/include/opencv2/hal/intrin_sse.hpp b/modules/hal/include/opencv2/hal/intrin_sse.hpp
new file mode 100644
index 0000000000..3b77a11542
--- /dev/null
+++ b/modules/hal/include/opencv2/hal/intrin_sse.hpp
@@ -0,0 +1,1544 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_HAL_SSE_HPP__
+#define __OPENCV_HAL_SSE_HPP__
+
+#define CV_SIMD128 1
+#define CV_SIMD128_64F 1
+
+namespace cv
+{
+
+struct v_uint8x16
+{
+    typedef uchar lane_type; // scalar type of one lane
+    enum { nlanes = 16 };    // number of lanes in the 128-bit register
+
+    v_uint8x16() {} // val is left uninitialized
+    explicit v_uint8x16(__m128i v) : val(v) {}
+    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
+               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
+        : val(_mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
+                            (char)v4, (char)v5, (char)v6, (char)v7,
+                            (char)v8, (char)v9, (char)v10, (char)v11,
+                            (char)v12, (char)v13, (char)v14, (char)v15))
+    {}
+    uchar get0() const
+    {
+        const int low32 = _mm_cvtsi128_si32(val); // lowest 32 bits of the register
+        return (uchar)low32;                      // truncate to the first lane
+    }
+
+    __m128i val;
+};
+
+struct v_int8x16
+{
+    typedef schar lane_type; // scalar type of one lane
+    enum { nlanes = 16 };    // number of lanes in the 128-bit register
+
+    v_int8x16() {} // val is left uninitialized
+    explicit v_int8x16(__m128i v) : val(v) {}
+    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
+              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
+        : val(_mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
+                            (char)v4, (char)v5, (char)v6, (char)v7,
+                            (char)v8, (char)v9, (char)v10, (char)v11,
+                            (char)v12, (char)v13, (char)v14, (char)v15))
+    {}
+    schar get0() const
+    {
+        const int low32 = _mm_cvtsi128_si32(val); // lowest 32 bits of the register
+        return (schar)low32;                      // truncate to the first lane
+    }
+
+    __m128i val;
+};
+
+struct v_uint16x8
+{
+    typedef ushort lane_type; // scalar type of one lane
+    enum { nlanes = 8 };      // number of lanes in the 128-bit register
+
+    v_uint16x8() {} // val is left uninitialized
+    explicit v_uint16x8(__m128i v) : val(v) {}
+    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
+        : val(_mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
+                             (short)v4, (short)v5, (short)v6, (short)v7))
+    {}
+    ushort get0() const
+    {
+        const int low32 = _mm_cvtsi128_si32(val); // lowest 32 bits of the register
+        return (ushort)low32;                     // truncate to the first lane
+    }
+
+    __m128i val;
+};
+
+struct v_int16x8
+{
+    typedef short lane_type; // scalar type of one lane
+    enum { nlanes = 8 };     // number of lanes in the 128-bit register
+
+    v_int16x8() {} // val is left uninitialized
+    explicit v_int16x8(__m128i v) : val(v) {}
+    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
+        : val(_mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
+                             (short)v4, (short)v5, (short)v6, (short)v7))
+    {}
+    short get0() const
+    {
+        const int low32 = _mm_cvtsi128_si32(val); // lowest 32 bits of the register
+        return (short)low32;                      // truncate to the first lane
+    }
+    __m128i val;
+};
+
+struct v_uint32x4
+{
+    typedef unsigned lane_type; // scalar type of one lane
+    enum { nlanes = 4 };        // number of lanes in the 128-bit register
+
+    v_uint32x4() {} // val is left uninitialized
+    explicit v_uint32x4(__m128i v) : val(v) {}
+    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
+        : val(_mm_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3))
+    {}
+    unsigned get0() const
+    {
+        const int low32 = _mm_cvtsi128_si32(val); // lowest 32 bits of the register
+        return (unsigned)low32;
+    }
+    __m128i val;
+};
+
+struct v_int32x4
+{
+    typedef int lane_type; // scalar type of one lane
+    enum { nlanes = 4 };   // number of lanes in the 128-bit register
+
+    v_int32x4() {} // val is left uninitialized
+    explicit v_int32x4(__m128i v) : val(v) {}
+    v_int32x4(int v0, int v1, int v2, int v3)
+        : val(_mm_setr_epi32(v0, v1, v2, v3))
+    {}
+    int get0() const
+    {
+        const int lane0 = _mm_cvtsi128_si32(val); // lowest 32 bits of the register
+        return lane0;
+    }
+    __m128i val;
+};
+
+struct v_float32x4
+{
+    typedef float lane_type; // scalar type of one lane
+    enum { nlanes = 4 };     // number of lanes in the 128-bit register
+
+    v_float32x4() {} // val is left uninitialized
+    explicit v_float32x4(__m128 v) : val(v) {}
+    v_float32x4(float v0, float v1, float v2, float v3)
+        : val(_mm_setr_ps(v0, v1, v2, v3))
+    {}
+    float get0() const
+    {
+        const float lane0 = _mm_cvtss_f32(val); // lowest float of the register
+        return lane0;
+    }
+    __m128 val;
+};
+
+struct v_uint64x2
+{
+    typedef uint64 lane_type; // scalar type of one lane
+    enum { nlanes = 2 };      // number of lanes in the 128-bit register
+
+    v_uint64x2() {} // val is left uninitialized
+    explicit v_uint64x2(__m128i v) : val(v) {}
+    v_uint64x2(uint64 v0, uint64 v1)
+        : val(_mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32)))
+    {}
+    // Rebuilds the first 64-bit lane from its two 32-bit halves.
+    uint64 get0() const
+    {
+        const unsigned lo = (unsigned)_mm_cvtsi128_si32(val);
+        const unsigned hi = (unsigned)_mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
+        return lo | ((uint64)hi << 32);
+    }
+    __m128i val;
+};
+
+struct v_int64x2
+{
+    typedef int64 lane_type; // scalar type of one lane
+    enum { nlanes = 2 };     // number of lanes in the 128-bit register
+
+    v_int64x2() {} // val is left uninitialized
+    explicit v_int64x2(__m128i v) : val(v) {}
+    v_int64x2(int64 v0, int64 v1)
+        : val(_mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32)))
+    {}
+    // Rebuilds the first 64-bit lane from its two 32-bit halves.
+    int64 get0() const
+    {
+        const unsigned lo = (unsigned)_mm_cvtsi128_si32(val);
+        const unsigned hi = (unsigned)_mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
+        return (int64)(lo | ((uint64)hi << 32));
+    }
+    __m128i val;
+};
+
+struct v_float64x2
+{
+    typedef double lane_type; // scalar type of one lane
+    enum { nlanes = 2 };      // number of lanes in the 128-bit register
+
+    v_float64x2() {} // val is left uninitialized
+    explicit v_float64x2(__m128d v) : val(v) {}
+    v_float64x2(double v0, double v1)
+        : val(_mm_setr_pd(v0, v1))
+    {}
+    double get0() const
+    {
+        const double lane0 = _mm_cvtsd_f64(val); // lowest double of the register
+        return lane0;
+    }
+    __m128d val;
+};
+
+#define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) /* emits v_setzero_*, v_setall_* and v_reinterpret_as_* for one vector type */ \
+inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } /* all-zero vector */ \
+inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } /* broadcast v, cast to the intrinsic's scalar type _Tps */ \
+template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) /* rewrap a.val, passing it through `cast` */ \
+{ return _Tpvec(cast(a.val)); }
+
+OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, char, OPENCV_HAL_NOP) // integer vectors pass OPENCV_HAL_NOP as the cast
+OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, char, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_INITVEC(v_uint16x8, ushort, u16, si128, epi16, short, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_INITVEC(v_int16x8, short, s16, si128, epi16, short, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_INITVEC(v_uint32x4, unsigned, u32, si128, epi32, int, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_INITVEC(v_int32x4, int, s32, si128, epi32, int, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_INITVEC(v_float32x4, float, f32, ps, ps, float, _mm_castsi128_ps) // float vectors cast from the __m128i representation
+OPENCV_HAL_IMPL_SSE_INITVEC(v_float64x2, double, f64, pd, pd, double, _mm_castsi128_pd)
+
+inline v_uint64x2 v_setzero_u64() { return v_uint64x2(_mm_setzero_si128()); } // all-zero 64-bit vectors
+inline v_int64x2 v_setzero_s64() { return v_int64x2(_mm_setzero_si128()); }
+inline v_uint64x2 v_setall_u64(uint64 val) { return v_uint64x2(val, val); } // broadcast through the two-scalar constructor
+inline v_int64x2 v_setall_s64(int64 val) { return v_int64x2(val, val); }
+
+template<typename _Tpvec> inline // generic form: a.val is handed directly to the explicit __m128i constructor
+v_uint64x2 v_reinterpret_as_u64(const _Tpvec& a) { return v_uint64x2(a.val); }
+template<typename _Tpvec> inline // signed counterpart of the template above
+v_int64x2 v_reinterpret_as_s64(const _Tpvec& a) { return v_int64x2(a.val); }
+inline v_float32x4 v_reinterpret_as_f32(const v_uint64x2& a) // bit-cast 64-bit integer lanes to float32 lanes
+{ return v_float32x4(_mm_castsi128_ps(a.val)); }
+inline v_float32x4 v_reinterpret_as_f32(const v_int64x2& a)
+{ return v_float32x4(_mm_castsi128_ps(a.val)); }
+inline v_float64x2 v_reinterpret_as_f64(const v_uint64x2& a) // bit-cast 64-bit integer lanes to float64 lanes
+{ return v_float64x2(_mm_castsi128_pd(a.val)); }
+inline v_float64x2 v_reinterpret_as_f64(const v_int64x2& a)
+{ return v_float64x2(_mm_castsi128_pd(a.val)); }
+
+#define OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(_Tpvec, suffix) /* non-template overloads that bit-cast float vectors into integer vector _Tpvec */ \
+inline _Tpvec v_reinterpret_as_##suffix(const v_float32x4& a) \
+{ return _Tpvec(_mm_castps_si128(a.val)); } \
+inline _Tpvec v_reinterpret_as_##suffix(const v_float64x2& a) \
+{ return _Tpvec(_mm_castpd_si128(a.val)); }
+
+OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint8x16, u8) // one pair of overloads per integer vector type
+OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int8x16, s8)
+OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint16x8, u16)
+OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int16x8, s16)
+OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint32x4, u32)
+OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int32x4, s32)
+OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint64x2, u64)
+OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int64x2, s64)
+
+//////////////// PACK ///////////////
+inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
+{
+    const __m128i u8max = _mm_set1_epi16(255); // x - sat_sub(x, 255) == min(x, 255) for unsigned x
+    return v_uint8x16(_mm_packus_epi16(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, u8max)),
+                                       _mm_subs_epu16(b.val, _mm_subs_epu16(b.val, u8max))));
+}
+
+inline void v_pack_store(uchar* ptr, const v_uint16x8& a)
+{
+    const __m128i u8max = _mm_set1_epi16(255);
+    const __m128i clamped = _mm_subs_epu16(a.val, _mm_subs_epu16(a.val, u8max)); // min(a, 255)
+    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(clamped, clamped)); // stores the low 64 bits (8 bytes)
+}
+
+inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b) // signed 16-bit -> unsigned 8-bit via _mm_packus_epi16 (a first, then b)
+{ return v_uint8x16(_mm_packus_epi16(a.val, b.val)); }
+
+inline void v_pack_u_store(uchar* ptr, const v_int16x8& a) // same conversion for one vector; stores the low 64 bits (8 bytes) at ptr
+{ _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a.val, a.val)); }
+
+template<int n> inline
+v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
+{
+    // n > 0 is assumed, so every shifted lane fits in 15 bits and may be fed to the signed-input pack.
+    const __m128i half = _mm_set1_epi16((short)(1 << (n-1))); // rounding term 2^(n-1)
+    return v_uint8x16(_mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(a.val, half), n),
+                                       _mm_srli_epi16(_mm_adds_epu16(b.val, half), n)));
+}
+
+template<int n> inline
+void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
+{
+    const __m128i half = _mm_set1_epi16((short)(1 << (n-1))); // rounding term 2^(n-1)
+    const __m128i rounded = _mm_srli_epi16(_mm_adds_epu16(a.val, half), n);
+    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(rounded, rounded)); // stores the low 64 bits (8 bytes)
+}
+
+template<int n> inline
+v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
+{
+    const __m128i half = _mm_set1_epi16((short)(1 << (n-1))); // rounding term 2^(n-1), added with signed saturation
+    return v_uint8x16(_mm_packus_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, half), n),
+                                       _mm_srai_epi16(_mm_adds_epi16(b.val, half), n)));
+}
+
+template<int n> inline
+void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
+{
+    const __m128i half = _mm_set1_epi16((short)(1 << (n-1))); // rounding term 2^(n-1)
+    const __m128i rounded = _mm_srai_epi16(_mm_adds_epi16(a.val, half), n);
+    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(rounded, rounded)); // stores the low 64 bits (8 bytes)
+}
+
+inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b) // signed 16-bit -> signed 8-bit via _mm_packs_epi16 (a first, then b)
+{ return v_int8x16(_mm_packs_epi16(a.val, b.val)); }
+
+inline void v_pack_store(schar* ptr, const v_int16x8& a) // packs `a` to 8 signed bytes and stores them at ptr; `a` is only read, hence const&
+{ _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); }
+
+template<int n> inline
+v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
+{
+    // n > 0 is assumed; the rounding term is added with signed saturation before the arithmetic shift.
+    const __m128i half = _mm_set1_epi16((short)(1 << (n-1)));
+    return v_int8x16(_mm_packs_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, half), n),
+                                     _mm_srai_epi16(_mm_adds_epi16(b.val, half), n)));
+}
+template<int n> inline
+void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
+{
+    // n > 0 is assumed; the rounding term is added with signed saturation before the arithmetic shift.
+    const __m128i half = _mm_set1_epi16((short)(1 << (n-1)));
+    const __m128i rounded = _mm_srai_epi16(_mm_adds_epi16(a.val, half), n);
+    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(rounded, rounded)); // stores the low 64 bits (8 bytes)
+}
+
+
+// Bit-wise select: each result bit is taken from a where mask is 1 and from b where mask is 0.
+inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b)
+{
+    return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
+}
+
+inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
+{
+    const __m128i zero = _mm_setzero_si128(), u16max = _mm_set1_epi32(65535), bias = _mm_set1_epi32(32768);
+    const __m128i lo = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(zero, a.val), u16max, a.val), bias); // lanes negative as int32 become 65535
+    const __m128i hi = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(zero, b.val), u16max, b.val), bias);
+    const __m128i packed = _mm_packs_epi32(lo, hi); // signed saturation of the biased values
+    return v_uint16x8(_mm_sub_epi16(packed, _mm_set1_epi16(-32768))); // remove the bias again
+}
+
+inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
+{
+    const __m128i zero = _mm_setzero_si128(), u16max = _mm_set1_epi32(65535), bias = _mm_set1_epi32(32768);
+    const __m128i biased = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(zero, a.val), u16max, a.val), bias);
+    const __m128i packed = _mm_packs_epi32(biased, biased); // signed saturation of the biased values
+    _mm_storel_epi64((__m128i*)ptr, _mm_sub_epi16(packed, _mm_set1_epi16(-32768))); // remove the bias, store 4 ushorts
+}
+
+template<int n> inline
+v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
+{
+    const __m128i half = _mm_set1_epi32(1 << (n-1)), bias = _mm_set1_epi32(32768); // rounding term and signed-range bias
+    const __m128i lo = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, half), n), bias);
+    const __m128i hi = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(b.val, half), n), bias);
+    return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(lo, hi), _mm_set1_epi16(-32768))); // pack, then remove the bias
+}
+
+template<int n> inline
+void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
+{
+    const __m128i half = _mm_set1_epi32(1 << (n-1)), bias = _mm_set1_epi32(32768); // rounding term and signed-range bias
+    const __m128i biased = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, half), n), bias);
+    const __m128i packed = _mm_sub_epi16(_mm_packs_epi32(biased, biased), _mm_set1_epi16(-32768));
+    _mm_storel_epi64((__m128i*)ptr, packed); // stores the low 64 bits (4 ushorts)
+}
+
+inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b) // signed 32-bit -> unsigned 16-bit with saturation (a first, then b)
+{   // Negative lanes are zeroed first (andnot with the sign mask) so that `x - 32768` can never wrap below INT_MIN.
+    __m128i a1 = _mm_andnot_si128(_mm_srai_epi32(a.val, 31), a.val), b1 = _mm_andnot_si128(_mm_srai_epi32(b.val, 31), b.val);
+    __m128i r = _mm_packs_epi32(_mm_sub_epi32(a1, _mm_set1_epi32(32768)), _mm_sub_epi32(b1, _mm_set1_epi32(32768)));
+    return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768))); // remove the bias that mapped [0, 65535] onto the signed range
+}
+
+inline void v_pack_u_store(ushort* ptr, const v_int32x4& a) // signed 32-bit -> unsigned 16-bit with saturation; stores 4 ushorts at ptr
+{
+    __m128i delta32 = _mm_set1_epi32(32768), nonneg = _mm_andnot_si128(_mm_srai_epi32(a.val, 31), a.val); // negative lanes -> 0
+    __m128i a1 = _mm_sub_epi32(nonneg, delta32); // cannot wrap now that the input is >= 0
+    __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768)); // signed pack, then remove the bias
+    _mm_storel_epi64((__m128i*)ptr, r);
+}
+
+template<int n> inline
+void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
+{
+    const __m128i half = _mm_set1_epi32(1 << (n-1)), bias = _mm_set1_epi32(32768); // rounding term and signed-range bias
+    const __m128i biased = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, half), n), bias);
+    const __m128i packed = _mm_sub_epi16(_mm_packs_epi32(biased, biased), _mm_set1_epi16(-32768));
+    _mm_storel_epi64((__m128i*)ptr, packed); // stores the low 64 bits (4 ushorts)
+}
+
+inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b) // signed 32-bit -> signed 16-bit via _mm_packs_epi32 (a first, then b)
+{ return v_int16x8(_mm_packs_epi32(a.val, b.val)); }
+
+inline void v_pack_store(short* ptr, const v_int32x4& a) // same conversion for one vector
+{
+    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a.val, a.val)); // stores the low 64 bits (4 shorts)
+}
+
+template<int n> inline
+v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
+{
+    const __m128i half = _mm_set1_epi32(1 << (n-1)); // rounding term 2^(n-1)
+    return v_int16x8(_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, half), n),
+                                     _mm_srai_epi32(_mm_add_epi32(b.val, half), n)));
+}
+
+template<int n> inline
+void v_rshr_pack_store(short* ptr, const v_int32x4& a) // rounding shift right by n, saturate to int16, store 4 shorts at ptr
+{
+    __m128i delta = _mm_set1_epi32(1 << (n-1)); // rounding term 2^(n-1)
+    __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n);
+    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1)); // pack to 16-bit first; storing a1 directly wrote two raw 32-bit lanes
+}
+
+
+// Truncating pack: returns [a0, a1, b0, b1], where each 64-bit lane is reduced to its low 32 bits.
+inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
+{
+    __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0.lo b0.lo a0.hi b0.hi
+    __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // a1.lo b1.lo a1.hi b1.hi
+    return v_uint32x4(_mm_unpacklo_epi32(v0, v1)); // a0.lo a1.lo b0.lo b1.lo (a 64-bit unpack would give a0 b0 a1 b1)
+}
+
+inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
+{
+    const __m128i lows = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0)); // dwords 0 and 2 (low halves of both lanes) move to the bottom
+    _mm_storel_epi64((__m128i*)ptr, lows); // stores the low 64 bits (2 unsigned values)
+}
+
+// Truncating pack: returns [a0, a1, b0, b1], where each 64-bit lane is reduced to its low 32 bits.
+inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
+{
+    __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0.lo b0.lo a0.hi b0.hi
+    __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // a1.lo b1.lo a1.hi b1.hi
+    return v_int32x4(_mm_unpacklo_epi32(v0, v1)); // a0.lo a1.lo b0.lo b1.lo (a 64-bit unpack would give a0 b0 a1 b1)
+}
+
+inline void v_pack_store(int* ptr, const v_int64x2& a)
+{
+    const __m128i lows = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0)); // dwords 0 and 2 (low halves of both lanes) move to the bottom
+    _mm_storel_epi64((__m128i*)ptr, lows); // stores the low 64 bits (2 ints)
+}
+
+template<int n> inline
+v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b) // rounding shift right by n, keep low 32 bits: [a0, a1, b0, b1]
+{
+    uint64 delta = (uint64)1 << (n-1); // rounding term 2^(n-1)
+    v_uint64x2 delta2(delta, delta);
+    __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
+    __m128i b1 = _mm_srli_epi64(_mm_add_epi64(b.val, delta2.val), n);
+    __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0.lo b0.lo a0.hi b0.hi
+    __m128i v1 = _mm_unpackhi_epi32(a1, b1); // a1.lo b1.lo a1.hi b1.hi
+    return v_uint32x4(_mm_unpacklo_epi32(v0, v1)); // a0.lo a1.lo b0.lo b1.lo (a 64-bit unpack would give a0 b0 a1 b1)
+}
+
+template<int n> inline
+void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
+{
+    const uint64 half = (uint64)1 << (n-1); // rounding term 2^(n-1)
+    const v_uint64x2 half2(half, half);
+    const __m128i rounded = _mm_srli_epi64(_mm_add_epi64(a.val, half2.val), n);
+    const __m128i lows = _mm_shuffle_epi32(rounded, _MM_SHUFFLE(0, 2, 2, 0)); // low dword of each lane moves to the bottom
+    _mm_storel_epi64((__m128i*)ptr, lows);
+}
+
+inline __m128i v_sign_epi64(__m128i a) // per 64-bit lane: all ones if the lane is negative, all zeros otherwise
+{   // srai by 31 spreads each dword's sign bit; the shuffle copies dwords 1 and 3 (the high halves) over both halves of their lane
+    return _mm_shuffle_epi32(_mm_srai_epi32(a, 31), _MM_SHUFFLE(3, 3, 1, 1)); // x m0 | x m1
+}
+
+inline __m128i v_srai_epi64(__m128i a, int imm)
+{
+    const __m128i sign = v_sign_epi64(a); // all ones in negative lanes
+    return _mm_xor_si128(_mm_srli_epi64(_mm_xor_si128(a, sign), imm), sign); // flip negatives, logical shift, flip back
+}
+
+template<int n> inline
+v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b) // rounding arithmetic shift right by n, keep low 32 bits: [a0, a1, b0, b1]
+{
+    int64 delta = (int64)1 << (n-1); // rounding term 2^(n-1)
+    v_int64x2 delta2(delta, delta);
+    __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
+    __m128i b1 = v_srai_epi64(_mm_add_epi64(b.val, delta2.val), n);
+    __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0.lo b0.lo a0.hi b0.hi
+    __m128i v1 = _mm_unpackhi_epi32(a1, b1); // a1.lo b1.lo a1.hi b1.hi
+    return v_int32x4(_mm_unpacklo_epi32(v0, v1)); // a0.lo a1.lo b0.lo b1.lo (a 64-bit unpack would give a0 b0 a1 b1)
+}
+
+template<int n> inline
+void v_rshr_pack_store(int* ptr, const v_int64x2& a)
+{
+    const int64 half = (int64)1 << (n-1); // rounding term 2^(n-1)
+    const v_int64x2 half2(half, half);
+    const __m128i rounded = v_srai_epi64(_mm_add_epi64(a.val, half2.val), n);
+    const __m128i lows = _mm_shuffle_epi32(rounded, _MM_SHUFFLE(0, 2, 2, 0)); // low dword of each lane moves to the bottom
+    _mm_storel_epi64((__m128i*)ptr, lows);
+}
+
+inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
+                            const v_float32x4& m1, const v_float32x4& m2,
+                            const v_float32x4& m3)
+{
+    const __m128 t0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val); // v[0] * m0
+    const __m128 t1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val); // v[1] * m1
+    const __m128 t2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val); // v[2] * m2
+    const __m128 t3 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(3, 3, 3, 3)), m3.val); // v[3] * m3
+
+    return v_float32x4(_mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3))); // (t0 + t1) + (t2 + t3)
+}
+
+
+#define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) /* emits `a bin_op b` and `a bin_op= b` for _Tpvec, both forwarding to intrin */ \
+    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return _Tpvec(intrin(a.val, b.val)); \
+    } \
+    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
+    { \
+        a.val = intrin(a.val, b.val); /* updates a in place */ \
+        return a; \
+    }
+
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8) // 8- and 16-bit + and - map to the saturating adds/subs intrinsics
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16)
+OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint16x8, _mm_mullo_epi16) // keeps the low 16 bits of each product
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int16x8, _mm_mullo_epi16) // keeps the low 16 bits of each product
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32) // 32- and 64-bit + and - map to the plain add/sub intrinsics
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps) // floating-point types get all four arithmetic operators
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps)
+OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps)
+OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd)
+OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd)
+OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint64x2, _mm_add_epi64)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64)
+
+inline v_uint32x4 operator * (const v_uint32x4& a, const v_uint32x4& b)
+{
+    const __m128i even = _mm_mul_epu32(a.val, b.val); // 64-bit products of lanes 0 and 2
+    const __m128i odd = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); // lanes 1 and 3
+    const __m128i lo = _mm_unpacklo_epi32(even, odd); // p0.lo p1.lo p0.hi p1.hi
+    const __m128i hi = _mm_unpackhi_epi32(even, odd); // p2.lo p3.lo p2.hi p3.hi
+    return v_uint32x4(_mm_unpacklo_epi64(lo, hi));    // low 32 bits of p0..p3
+}
+inline v_int32x4 operator * (const v_int32x4& a, const v_int32x4& b)
+{
+    const __m128i even = _mm_mul_epu32(a.val, b.val); // 64-bit products of lanes 0 and 2
+    const __m128i odd = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); // lanes 1 and 3
+    const __m128i lo = _mm_unpacklo_epi32(even, odd); // p0.lo p1.lo p0.hi p1.hi
+    const __m128i hi = _mm_unpackhi_epi32(even, odd); // p2.lo p3.lo p2.hi p3.hi
+    return v_int32x4(_mm_unpacklo_epi64(lo, hi));     // low 32 bits of p0..p3
+}
+
+inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, // widening multiply: full 32-bit products of the 16-bit lanes
+                         v_int32x4& c, v_int32x4& d)             // c receives products 0..3, d receives products 4..7
+{
+    __m128i v0 = _mm_mullo_epi16(a.val, b.val); // low 16 bits of every product
+    __m128i v1 = _mm_mulhi_epi16(a.val, b.val); // high 16 bits of every signed product
+    c.val = _mm_unpacklo_epi16(v0, v1); // 16-bit interleave glues each lo half to its own hi half (a 32-bit interleave would mix products)
+    d.val = _mm_unpackhi_epi16(v0, v1);
+}
+
+inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, // widening multiply: full 32-bit products of the 16-bit lanes
+                         v_uint32x4& c, v_uint32x4& d)             // c receives products 0..3, d receives products 4..7
+{
+    __m128i v0 = _mm_mullo_epi16(a.val, b.val); // low 16 bits of every product
+    __m128i v1 = _mm_mulhi_epu16(a.val, b.val); // high 16 bits of every unsigned product
+    c.val = _mm_unpacklo_epi16(v0, v1); // 16-bit interleave glues each lo half to its own hi half (a 32-bit interleave would mix products)
+    d.val = _mm_unpackhi_epi16(v0, v1);
+}
+
+inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
+                         v_uint64x2& c, v_uint64x2& d)
+{
+    const __m128i even = _mm_mul_epu32(a.val, b.val); // 64-bit products of lanes 0 and 2
+    const __m128i odd = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); // lanes 1 and 3
+    c.val = _mm_unpacklo_epi64(even, odd); // products 0 and 1
+    d.val = _mm_unpackhi_epi64(even, odd); // products 2 and 3
+}
+
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
+{   // _mm_madd_epi16: multiplies 16-bit lanes and adds adjacent pairs of products into 32-bit lanes.
+    return v_int32x4(_mm_madd_epi16(a.val, b.val));
+}
+
+#define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) /* emits &, |, ^ (with compound forms) and ~ for _Tpvec */ \
+    OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
+    OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \
+    OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \
+    inline _Tpvec operator ~ (const _Tpvec& a) \
+    { \
+        return _Tpvec(_mm_xor_##suffix(a.val, not_const)); /* XOR with not_const (all ones in every instantiation here) inverts every bit */ \
+    }
+
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint8x16, si128, _mm_set1_epi32(-1)) // not_const is the all-ones pattern
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int8x16, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint16x8, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int16x8, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint32x4, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int32x4, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint64x2, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int64x2, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float32x4, ps, _mm_castsi128_ps(_mm_set1_epi32(-1))) // same bit pattern, cast to the float register type
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float64x2, pd, _mm_castsi128_pd(_mm_set1_epi32(-1)))
+
+inline v_float32x4 v_sqrt(const v_float32x4& x) // lane-wise square root via _mm_sqrt_ps
+{ return v_float32x4(_mm_sqrt_ps(x.val)); }
+
+inline v_float32x4 v_invsqrt(const v_float32x4& x) // lane-wise 1/sqrt(x): hardware estimate refined by one Newton-Raphson step
+{
+    const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f); // plain locals: no function-static init guard on every call
+    __m128 t = x.val;
+    __m128 h = _mm_mul_ps(t, _0_5); // h = x / 2
+    t = _mm_rsqrt_ps(t);            // approximate reciprocal square root
+    t = _mm_mul_ps(t, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t, t), h))); // t * (1.5 - h * t * t)
+    return v_float32x4(t);
+}
+
+inline v_float64x2 v_sqrt(const v_float64x2& x) // lane-wise square root via _mm_sqrt_pd
+{ return v_float64x2(_mm_sqrt_pd(x.val)); }
+
+inline v_float64x2 v_invsqrt(const v_float64x2& x) // lane-wise 1/sqrt(x), computed as a division by _mm_sqrt_pd
+{
+    const __m128d v_1 = _mm_set1_pd(1.); // plain local: no function-static init guard on every call
+    return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val)));
+}
+
+inline v_float32x4 v_abs(const v_float32x4& x) // clears bit 31 of every 32-bit lane by AND-ing with 0x7fffffff
+{ return v_float32x4(_mm_and_ps(x.val, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); }
+inline v_float64x2 v_abs(const v_float64x2& x) // same idea for 64-bit lanes
+{
+    return v_float64x2(_mm_and_pd(x.val,
+        _mm_castsi128_pd(_mm_srli_epi64(_mm_set1_epi32(-1), 1)))); // all ones shifted right by one: every bit set except bit 63 of each lane
+}
+
+// TODO: exp, log, sin, cos
+
+#define OPENCV_HAL_IMPL_SSE_BIN_FUNC(_Tpvec, func, intrin) /* emits func(a, b) for _Tpvec as a direct call to intrin */ \
+inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val)); \
+}
+
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_min, _mm_min_epu8) // min/max for the types that have a direct intrinsic here
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_max, _mm_max_epu8)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_min, _mm_min_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_max, _mm_max_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_min, _mm_min_ps)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_max, _mm_max_ps)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_min, _mm_min_pd)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_max, _mm_max_pd)
+
+inline v_int8x16 v_min(const v_int8x16& a, const v_int8x16& b)
+{
+    const __m128i signbit = _mm_set1_epi8((char)-128); // flipping the sign bit lets the unsigned min order signed values
+    return v_int8x16(_mm_xor_si128(signbit, _mm_min_epu8(_mm_xor_si128(a.val, signbit),
+                                                         _mm_xor_si128(b.val, signbit))));
+}
+inline v_int8x16 v_max(const v_int8x16& a, const v_int8x16& b)
+{
+    const __m128i signbit = _mm_set1_epi8((char)-128); // flipping the sign bit lets the unsigned max order signed values
+    return v_int8x16(_mm_xor_si128(signbit, _mm_max_epu8(_mm_xor_si128(a.val, signbit),
+                                                         _mm_xor_si128(b.val, signbit))));
+}
+inline v_uint16x8 v_min(const v_uint16x8& a, const v_uint16x8& b)
+{   // a - sat_sub(a, b): subtracts (a - b) when a > b, and 0 otherwise, giving min(a, b)
+    return v_uint16x8(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, b.val)));
+}
+inline v_uint16x8 v_max(const v_uint16x8& a, const v_uint16x8& b)
+{   // sat_sub(a, b) + b: adds (a - b) to b when a > b, and 0 otherwise, giving max(a, b)
+    return v_uint16x8(_mm_adds_epu16(_mm_subs_epu16(a.val, b.val), b.val));
+}
+inline v_uint32x4 v_min(const v_uint32x4& a, const v_uint32x4& b)
+{
+    const __m128i signbit = _mm_set1_epi32((int)0x80000000); // sign-bit flip turns the signed compare into an unsigned one
+    const __m128i a_gt_b = _mm_cmpgt_epi32(_mm_xor_si128(a.val, signbit), _mm_xor_si128(b.val, signbit));
+    return v_uint32x4(v_select_si128(a_gt_b, b.val, a.val)); // b where a > b, otherwise a
+}
+inline v_uint32x4 v_max(const v_uint32x4& a, const v_uint32x4& b)
+{
+    const __m128i signbit = _mm_set1_epi32((int)0x80000000); // sign-bit flip turns the signed compare into an unsigned one
+    const __m128i a_gt_b = _mm_cmpgt_epi32(_mm_xor_si128(a.val, signbit), _mm_xor_si128(b.val, signbit));
+    return v_uint32x4(v_select_si128(a_gt_b, a.val, b.val)); // a where a > b, otherwise b
+}
+inline v_int32x4 v_min(const v_int32x4& a, const v_int32x4& b)
+{   // the a > b mask selects b in those lanes and a elsewhere
+    return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), b.val, a.val));
+}
+inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b)
+{   // the a > b mask selects a in those lanes and b elsewhere
+    return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), a.val, b.val));
+}
+
+#define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) /* lane-wise comparisons for an unsigned/signed vector pair; sbit is the per-lane sign-bit constant */ \
+inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
+{ return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
+inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \
+{ \
+    __m128i not_mask = _mm_set1_epi32(-1); /* all ones: XOR with it inverts the equality mask */ \
+    return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
+} \
+inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
+{ return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
+inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \
+{ \
+    __m128i not_mask = _mm_set1_epi32(-1); \
+    return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
+} \
+inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \
+{ \
+    __m128i smask = _mm_set1_##suffix(sbit); /* XOR with the sign bit so the signed cmpgt orders unsigned values */ \
+    return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \
+} \
+inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
+{ \
+    __m128i smask = _mm_set1_##suffix(sbit); \
+    return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \
+} \
+inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \
+{ \
+    __m128i smask = _mm_set1_##suffix(sbit); \
+    __m128i not_mask = _mm_set1_epi32(-1); \
+    __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); /* a > b */ \
+    return _Tpuvec(_mm_xor_si128(res, not_mask)); /* a <= b is the inverse of a > b */ \
+} \
+inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \
+{ \
+    __m128i smask = _mm_set1_##suffix(sbit); \
+    __m128i not_mask = _mm_set1_epi32(-1); \
+    __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); /* b > a */ \
+    return _Tpuvec(_mm_xor_si128(res, not_mask)); /* a >= b is the inverse of b > a */ \
+} \
+inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \
+{ \
+    return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); /* a < b expressed as b > a */ \
+} \
+inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
+{ \
+    return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \
+} \
+inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \
+{ \
+    __m128i not_mask = _mm_set1_epi32(-1); \
+    return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); /* inverse of a > b */ \
+} \
+inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \
+{ \
+    __m128i not_mask = _mm_set1_epi32(-1); \
+    return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); /* inverse of b > a */ \
+}
+
+OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint8x16, v_int8x16, epi8, (char)-128) // sbit is the sign bit of one lane
+OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768)
+OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000)
+
+#define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) /* six lane-wise comparison operators, each a direct _mm_cmp*_<suffix> call */ \
+inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
+inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \
+inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \
+inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \
+inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \
+inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); }
+
+OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps) // the result mask is returned in the same vector type as the operands
+OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd)
+
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8) // v_add_wrap: plain (non-saturating) lane-wise add
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8) // same intrinsic for signed and unsigned lanes
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_add_wrap, _mm_add_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_sub_wrap, _mm_sub_epi8) // v_sub_wrap: plain (non-saturating) lane-wise subtract
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16) // NOTE(review): OPENCV_HAL_IMPL_SSE_BIN_FUNC is defined outside this excerpt; presumably it emits func(a, b) forwarding to the intrinsic
+
+#define OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(_Tpuvec, _Tpsvec, bits, smask32) /* |a - b| per 8/16-bit lane; both overloads return the unsigned vector type */ \
+inline _Tpuvec v_absdiff(const _Tpuvec& a, const _Tpuvec& b) \
+{ /* of the two saturating differences one is 0 and the other is |a - b|, so their sum is the answer */ \
+    return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a.val, b.val), _mm_subs_epu##bits(b.val, a.val))); \
+} \
+inline _Tpuvec v_absdiff(const _Tpsvec& a, const _Tpsvec& b) \
+{ /* signed inputs: flip the sign bit of both operands, then reuse the unsigned formula */ \
+    __m128i smask = _mm_set1_epi32(smask32); /* smask32 carries the sign bit of every lane that fits in 32 bits */ \
+    __m128i a1 = _mm_xor_si128(a.val, smask); \
+    __m128i b1 = _mm_xor_si128(b.val, smask); \
+    return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a1, b1), _mm_subs_epu##bits(b1, a1))); \
+}
+
+OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (int)0x80808080) // bit 7 of each of four bytes
+OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (int)0x80008000) // bit 15 of each of two 16-bit words
+
+#define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) /* float helpers; _Tp is not referenced by the body below */ \
+inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
+{ /* |a - b|: subtract, then AND with absmask_vec reinterpreted as a float register */ \
+    _Tpreg absmask = _mm_castsi128_##suffix(absmask_vec); \
+    return _Tpvec(_mm_and_##suffix(_mm_sub_##suffix(a.val, b.val), absmask)); \
+} \
+inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
+{ /* sqrt(a*a + b*b) per lane */ \
+    _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \
+    return _Tpvec(_mm_sqrt_##suffix(res)); \
+} \
+inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
+{ /* a*a + b*b per lane (v_magnitude without the square root) */ \
+    _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \
+    return _Tpvec(res); \
+} \
+inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
+{ /* a*b + c per lane, as a separate multiply followed by an add */ \
+    return _Tpvec(_mm_add_##suffix(_mm_mul_##suffix(a.val, b.val), c.val)); \
+}
+
+OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff)) // mask: every bit except bit 31 of each 32-bit lane
+OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1)) // mask: all-ones shifted right by one, i.e. every bit except bit 63 of each 64-bit lane
+
+#define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) /* lane-wise shifts; srai is the right-shift routine used for the signed type */ \
+inline _Tpuvec operator << (const _Tpuvec& a, int imm) /* run-time shift count */ \
+{ \
+    return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
+} \
+inline _Tpsvec operator << (const _Tpsvec& a, int imm) /* left shift uses the same intrinsic for signed lanes */ \
+{ \
+    return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
+} \
+inline _Tpuvec operator >> (const _Tpuvec& a, int imm) /* unsigned right shift: _mm_srli_* */ \
+{ \
+    return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
+} \
+inline _Tpsvec operator >> (const _Tpsvec& a, int imm) /* signed right shift: delegated to srai */ \
+{ \
+    return _Tpsvec(srai(a.val, imm)); \
+} \
+template<int imm> /* v_shl / v_shr: same operations with the count as a template argument */ \
+inline _Tpuvec v_shl(const _Tpuvec& a) \
+{ \
+    return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
+} \
+template<int imm> \
+inline _Tpsvec v_shl(const _Tpsvec& a) \
+{ \
+    return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
+} \
+template<int imm> \
+inline _Tpuvec v_shr(const _Tpuvec& a) \
+{ \
+    return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
+} \
+template<int imm> \
+inline _Tpsvec v_shr(const _Tpsvec& a) \
+{ \
+    return _Tpsvec(srai(a.val, imm)); \
+}
+
+OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint16x8, v_int16x8, epi16, _mm_srai_epi16)
+OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32, _mm_srai_epi32)
+OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint64x2, v_int64x2, epi64, v_srai_epi64) // v_srai_epi64 is a helper defined outside this excerpt, used in place of an _mm_srai_* intrinsic
+
+#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) /* memory <-> register transfers for an integer vector type with element type _Tp */ \
+inline _Tpvec v_load(const _Tp* ptr) /* full 128-bit load via the unaligned intrinsic */ \
+{ return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \
+inline _Tpvec v_load_aligned(const _Tp* ptr) /* full 128-bit load via the aligned intrinsic */ \
+{ return _Tpvec(_mm_load_si128((const __m128i*)ptr)); } \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) /* 64 bits from ptr0 -> low half, 64 bits from ptr1 -> high half */ \
+{ \
+    return _Tpvec(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
+                                     _mm_loadl_epi64((const __m128i*)ptr1))); \
+} \
+inline void v_store(_Tp* ptr, const _Tpvec& a) /* full 128-bit store via the unaligned intrinsic */ \
+{ _mm_storeu_si128((__m128i*)ptr, a.val); } \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) /* full 128-bit store via the aligned intrinsic */ \
+{ _mm_store_si128((__m128i*)ptr, a.val); } \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) /* writes only the low 64 bits */ \
+{ _mm_storel_epi64((__m128i*)ptr, a.val); } \
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) /* moves the high 64 bits down, then writes 64 bits */ \
+{ _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a.val, a.val)); }
+
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint8x16, uchar) // one instantiation per integer vector type
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int8x16, schar)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint16x8, ushort)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int16x8, short)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint32x4, unsigned)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int32x4, int)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint64x2, uint64)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int64x2, int64)
+
+#define OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(_Tpvec, _Tp, suffix) /* memory <-> register transfers for a floating-point vector type */ \
+inline _Tpvec v_load(const _Tp* ptr) /* full load via the unaligned intrinsic */ \
+{ return _Tpvec(_mm_loadu_##suffix(ptr)); } \
+inline _Tpvec v_load_aligned(const _Tp* ptr) /* full load via the aligned intrinsic */ \
+{ return _Tpvec(_mm_load_##suffix(ptr)); } \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) /* 64 bits from ptr0 -> low half, 64 bits from ptr1 -> high half */ \
+{ \
+    return _Tpvec(_mm_castsi128_##suffix( /* assembled in the integer domain, then reinterpreted as floats */ \
+        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
+                           _mm_loadl_epi64((const __m128i*)ptr1)))); \
+} \
+inline void v_store(_Tp* ptr, const _Tpvec& a) /* full store via the unaligned intrinsic */ \
+{ _mm_storeu_##suffix(ptr, a.val); } \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) /* full store via the aligned intrinsic */ \
+{ _mm_store_##suffix(ptr, a.val); } \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) /* writes only the low 64 bits */ \
+{ _mm_storel_epi64((__m128i*)ptr, _mm_cast##suffix##_si128(a.val)); } \
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) /* writes only the high 64 bits */ \
+{ \
+    __m128i a1 = _mm_cast##suffix##_si128(a.val); \
+    _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a1, a1)); /* high half duplicated into the low half before the 64-bit store */ \
+}
+
+OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)
+
+#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) /* defines v_reduce_<func>: folds the 4 lanes of a vector into one scalar */ \
+inline scalartype v_reduce_##func(const _Tpvec& a) \
+{ \
+    scalartype CV_DECL_ALIGNED(16) buf[4]; /* 16-byte aligned scratch buffer, as required by v_store_aligned */ \
+    v_store_aligned(buf, a); \
+    scalartype s0 = scalar_func(buf[0], buf[1]); /* reduction is done in scalar code, pairwise: (0,1) and (2,3) ... */ \
+    scalartype s1 = scalar_func(buf[2], buf[3]); \
+    return scalar_func(s0, s1); /* ... then the two partial results */ \
+}
+
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, sum, OPENCV_HAL_ADD) // OPENCV_HAL_ADD is defined outside this excerpt
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, sum, OPENCV_HAL_ADD)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, max, std::max)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, sum, OPENCV_HAL_ADD)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min)
+
+#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) /* sign-bit queries built on _mm_movemask_<suffix> */ \
+inline int v_signmask(const _Tpvec& a) /* integer bit mask derived from the lanes' sign bits */ \
+{ \
+    return and_op(_mm_movemask_##suffix(pack_op(a.val)), signmask); /* pack_op pre-processes the register; and_op/signmask post-process the movemask result */ \
+} \
+inline bool v_check_all(const _Tpvec& a) /* true when every movemask bit selected by allmask is set */ \
+{ return and_op(_mm_movemask_##suffix(a.val), allmask) == allmask; } \
+inline bool v_check_any(const _Tpvec& a) /* true when at least one movemask bit selected by allmask is set */ \
+{ return and_op(_mm_movemask_##suffix(a.val), allmask) != 0; }
+
+#define OPENCV_HAL_PACKS(a) _mm_packs_epi16(a, a) // 16-bit lanes -> 8-bit lanes; the register is packed with itself
+inline __m128i v_packq_epi32(__m128i a) // 32-bit lanes -> 8-bit lanes through two successive self-packs
+{
+    const __m128i words = _mm_packs_epi32(a, a);
+    return _mm_packs_epi16(words, words);
+}
+
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535) // 16 byte lanes -> 16 mask bits; NOP/1ST/AND helpers are defined outside this excerpt
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535)
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa) // 0xaaaa: odd-numbered movemask bits, i.e. the upper byte of each 16-bit lane
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa) // 255: low 8 mask bits after packing 8 lanes to bytes
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888) // 0x8888: movemask bits 3, 7, 11, 15, i.e. the top byte of each 32-bit lane
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888) // 15: low 4 mask bits after packing 4 lanes to bytes
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 15, 15) // _mm_movemask_ps: 4 mask bits
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 3, 3) // _mm_movemask_pd: 2 mask bits
+
+#define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, suffix) /* bitwise blend of two vectors under a mask */ \
+inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+{ /* b ^ ((b ^ a) & mask): bits of a where mask is 1, bits of b where mask is 0 */ \
+    return _Tpvec(_mm_xor_##suffix(b.val, _mm_and_##suffix(_mm_xor_##suffix(b.val, a.val), mask.val))); \
+}
+
+OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, si128) // all integer types share the si128 bitwise intrinsics
+OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, si128)
+OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, si128)
+OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, si128)
+OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, si128)
+OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128)
+OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, si128)
+OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128)
+OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps)
+OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd)
+
+#define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpuvec, _Tpwuvec, _Tpu, _Tpsvec, _Tpwsvec, _Tps, suffix, wsuffix, shift) /* widening to lanes of twice the width */ \
+inline void v_expand(const _Tpuvec& a, _Tpwuvec& b0, _Tpwuvec& b1) /* unsigned: b0 = widened low half of a, b1 = widened high half */ \
+{ \
+    __m128i z = _mm_setzero_si128(); \
+    b0.val = _mm_unpacklo_##suffix(a.val, z); /* interleaving with zero puts 0 in the upper half of every wide lane */ \
+    b1.val = _mm_unpackhi_##suffix(a.val, z); \
+} \
+inline _Tpwuvec v_load_expand(const _Tpu* ptr) /* loads 64 bits from ptr and zero-extends them to a full register */ \
+{ \
+    __m128i z = _mm_setzero_si128(); \
+    return _Tpwuvec(_mm_unpacklo_##suffix(_mm_loadl_epi64((const __m128i*)ptr), z)); \
+} \
+inline void v_expand(const _Tpsvec& a, _Tpwsvec& b0, _Tpwsvec& b1) /* signed: same split, with sign extension */ \
+{ \
+    b0.val = _mm_srai_##wsuffix(_mm_unpacklo_##suffix(a.val, a.val), shift); /* duplicate each lane, then arithmetic shift right by the narrow lane width */ \
+    b1.val = _mm_srai_##wsuffix(_mm_unpackhi_##suffix(a.val, a.val), shift); \
+} \
+inline _Tpwsvec v_load_expand(const _Tps* ptr) /* loads 64 bits from ptr and sign-extends them to a full register */ \
+{ \
+    __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \
+    return _Tpwsvec(_mm_srai_##wsuffix(_mm_unpacklo_##suffix(a, a), shift)); \
+}
+
+OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, v_int8x16, v_int16x8, schar, epi8, epi16, 8) // 8-bit -> 16-bit
+OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, v_int16x8, v_int32x4, short, epi16, epi32, 16) // 16-bit -> 32-bit
+
+inline void v_expand(const v_uint32x4& a, v_uint64x2& b0, v_uint64x2& b1)
+{   // Zero-extension: every 32-bit lane is paired with a zero upper word to form a 64-bit lane.
+    const __m128i src = a.val, zero = _mm_setzero_si128();
+    b0.val = _mm_unpacklo_epi32(src, zero);   // lanes 0 and 1
+    b1.val = _mm_unpackhi_epi32(src, zero);   // lanes 2 and 3
+}
+inline v_uint64x2 v_load_expand(const unsigned* ptr)
+{   // Reads 64 bits (two unsigned values) from ptr and zero-extends each into a 64-bit lane.
+    const __m128i pair = _mm_loadl_epi64((const __m128i*)ptr);
+    return v_uint64x2(_mm_unpacklo_epi32(pair, _mm_setzero_si128()));
+}
+inline void v_expand(const v_int32x4& a, v_int64x2& b0, v_int64x2& b1)
+{   // Sign-extension: the upper word of every 64-bit lane is the source lane shifted right arithmetically by 31.
+    const __m128i src = a.val, sign = _mm_srai_epi32(src, 31);
+    b0.val = _mm_unpacklo_epi32(src, sign);   // lanes 0 and 1
+    b1.val = _mm_unpackhi_epi32(src, sign);   // lanes 2 and 3
+}
+inline v_int64x2 v_load_expand(const int* ptr)
+{   // Reads 64 bits (two ints) from ptr and sign-extends each into a 64-bit lane.
+    const __m128i pair = _mm_loadl_epi64((const __m128i*)ptr);
+    const __m128i sign = _mm_srai_epi32(pair, 31);   // per-lane copy of the sign bit
+    return v_int64x2(_mm_unpacklo_epi32(pair, sign));
+}
+
+inline v_uint32x4 v_load_expand_q(const uchar* ptr)
+{   // Reads 4 bytes from ptr (as one int) and zero-extends each byte to a 32-bit lane.
+    const __m128i zero = _mm_setzero_si128();
+    const __m128i words = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)ptr), zero);   // 8 -> 16 bit
+    return v_uint32x4(_mm_unpacklo_epi16(words, zero));                                   // 16 -> 32 bit
+}
+
+inline v_int32x4 v_load_expand_q(const schar* ptr)
+{   // Reads 4 bytes from ptr (as one int) and sign-extends each byte to a 32-bit lane.
+    const __m128i quad = _mm_cvtsi32_si128(*(const int*)ptr);
+    const __m128i x2 = _mm_unpacklo_epi8(quad, quad);   // every byte duplicated
+    const __m128i x4 = _mm_unpacklo_epi8(x2, x2);       // every byte now fills a whole 32-bit lane
+    return v_int32x4(_mm_srai_epi32(x4, 24));           // arithmetic shift keeps the top byte, sign-extended
+}
+
+#define OPENCV_HAL_IMPL_SSE_UNPACKS(_Tpvec, suffix, cast_from, cast_to) /* lane interleaving and 64-bit half recombination */ \
+inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) /* b0 = interleaved low halves, b1 = interleaved high halves */ \
+{ \
+    b0.val = _mm_unpacklo_##suffix(a0.val, a1.val); \
+    b1.val = _mm_unpackhi_##suffix(a0.val, a1.val); \
+} \
+inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) /* result = (low 64 bits of a, low 64 bits of b) */ \
+{ \
+    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); /* cast_from / cast_to convert between the native register type and __m128i */ \
+    return _Tpvec(cast_to(_mm_unpacklo_epi64(a1, b1))); \
+} \
+inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) /* result = (high 64 bits of a, high 64 bits of b) */ \
+{ \
+    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
+    return _Tpvec(cast_to(_mm_unpackhi_epi64(a1, b1))); \
+} \
+inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) /* c = low halves of (a, b), d = high halves of (a, b) */ \
+{ \
+    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); /* both inputs are read before c or d is written */ \
+    c.val = cast_to(_mm_unpacklo_epi64(a1, b1)); \
+    d.val = cast_to(_mm_unpackhi_epi64(a1, b1)); \
+}
+
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP) // integer types need no register cast
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_int16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps) // float types go through the integer domain for the 64-bit unpacks
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd)
+
+inline v_int32x4 v_round(const v_float32x4& a) // float32 -> int32 lanes via _mm_cvtps_epi32 (rounding conversion)
+{ const __m128i rounded = _mm_cvtps_epi32(a.val); return v_int32x4(rounded); }
+
+inline v_int32x4 v_floor(const v_float32x4& a)
+{   // Round first, then step down by one in every lane where the rounded value exceeds the input.
+    const __m128i rounded = _mm_cvtps_epi32(a.val);
+    const __m128 overshoot = _mm_cmpgt_ps(_mm_cvtepi32_ps(rounded), a.val);   // compare mask, added below as an integer (-1 where set)
+    return v_int32x4(_mm_add_epi32(rounded, _mm_castps_si128(overshoot)));
+}
+
+inline v_int32x4 v_ceil(const v_float32x4& a)
+{   // Round first, then step up by one in every lane where the input exceeds the rounded value.
+    const __m128i rounded = _mm_cvtps_epi32(a.val);
+    const __m128 undershoot = _mm_cmpgt_ps(a.val, _mm_cvtepi32_ps(rounded));   // compare mask, subtracted below as an integer (-1 where set)
+    return v_int32x4(_mm_sub_epi32(rounded, _mm_castps_si128(undershoot)));
+}
+
+inline v_int32x4 v_trunc(const v_float32x4& a) // float32 -> int32 lanes via _mm_cvttps_epi32 (truncating conversion)
+{ const __m128i truncated = _mm_cvttps_epi32(a.val); return v_int32x4(truncated); }
+
+inline v_int32x4 v_round(const v_float64x2& a) // two float64 lanes -> int32 via _mm_cvtpd_epi32 (rounding conversion)
+{ const __m128i rounded = _mm_cvtpd_epi32(a.val); return v_int32x4(rounded); }
+
+inline v_int32x4 v_floor(const v_float64x2& a)
+{   // Round first, then step down by one where the rounded value exceeds the input.
+    const __m128i rounded = _mm_cvtpd_epi32(a.val);
+    const __m128i wide = _mm_castpd_si128(_mm_cmpgt_pd(_mm_cvtepi32_pd(rounded), a.val));   // 64-bit wide masks
+    const __m128i narrow = _mm_srli_si128(_mm_slli_si128(wide, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
+    return v_int32x4(_mm_add_epi32(rounded, narrow));
+}
+
+inline v_int32x4 v_ceil(const v_float64x2& a)
+{   // Round first, then step up by one where the input exceeds the rounded value.
+    const __m128i rounded = _mm_cvtpd_epi32(a.val);
+    const __m128i wide = _mm_castpd_si128(_mm_cmpgt_pd(a.val, _mm_cvtepi32_pd(rounded)));   // 64-bit wide masks
+    const __m128i narrow = _mm_srli_si128(_mm_slli_si128(wide, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
+    return v_int32x4(_mm_sub_epi32(rounded, narrow));
+}
+
+inline v_int32x4 v_trunc(const v_float64x2& a) // two float64 lanes -> int32 via _mm_cvttpd_epi32 (truncating conversion)
+{ const __m128i truncated = _mm_cvttpd_epi32(a.val); return v_int32x4(truncated); }
+
+#define OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) /* transposes a 4x4 matrix of 32-bit lanes: rows a0..a3 in, rows b0..b3 out */ \
+inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
+                           const _Tpvec& a2, const _Tpvec& a3, \
+                           _Tpvec& b0, _Tpvec& b1, \
+                           _Tpvec& b2, _Tpvec& b3) \
+{ /* writing aRC for lane C of input row R: */ \
+    __m128i t0 = cast_from(_mm_unpacklo_##suffix(a0.val, a1.val)); /* a00 a10 a01 a11 */ \
+    __m128i t1 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); /* a20 a30 a21 a31 */ \
+    __m128i t2 = cast_from(_mm_unpackhi_##suffix(a0.val, a1.val)); /* a02 a12 a03 a13 */ \
+    __m128i t3 = cast_from(_mm_unpackhi_##suffix(a2.val, a3.val)); /* a22 a32 a23 a33 */ \
+\
+    b0.val = cast_to(_mm_unpacklo_epi64(t0, t1)); /* a00 a10 a20 a30 (column 0) */ \
+    b1.val = cast_to(_mm_unpackhi_epi64(t0, t1)); /* a01 a11 a21 a31 (column 1) */ \
+    b2.val = cast_to(_mm_unpacklo_epi64(t2, t3)); /* a02 a12 a22 a32 (column 2) */ \
+    b3.val = cast_to(_mm_unpackhi_epi64(t2, t3)); /* a03 a13 a23 a33 (column 3) */ \
+}
+
+OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps) // float rows: unpack in the float domain, 64-bit unpacks in the integer domain
+
+// adopted from sse_utils.hpp
+inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
+{ // Reads 48 bytes of packed (a, b, c) triplets from ptr and splits them into one vector per channel.
+    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr); // bytes 0..15:  a0 b0 c0 a1 b1 c1 ...
+    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // bytes 16..31
+    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 32)); // bytes 32..47
+
+    __m128i t10 = _mm_unpacklo_epi8(t00, _mm_unpackhi_epi64(t01, t01)); // Stage 1 of 4. Every stage is a perfect shuffle of the
+    __m128i t11 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t00, t00), t02); // 48-byte sequence held in the three registers: the first
+    __m128i t12 = _mm_unpacklo_epi8(t01, _mm_unpackhi_epi64(t02, t02)); // 24 bytes are interleaved with the last 24 bytes.
+
+    __m128i t20 = _mm_unpacklo_epi8(t10, _mm_unpackhi_epi64(t11, t11)); // Stage 2
+    __m128i t21 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t10, t10), t12);
+    __m128i t22 = _mm_unpacklo_epi8(t11, _mm_unpackhi_epi64(t12, t12));
+
+    __m128i t30 = _mm_unpacklo_epi8(t20, _mm_unpackhi_epi64(t21, t21)); // Stage 3
+    __m128i t31 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t20, t20), t22);
+    __m128i t32 = _mm_unpacklo_epi8(t21, _mm_unpackhi_epi64(t22, t22));
+
+    a.val = _mm_unpacklo_epi8(t30, _mm_unpackhi_epi64(t31, t31)); // Stage 4: a0 a1 ... a15
+    b.val = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t30, t30), t32); //          b0 b1 ... b15
+    c.val = _mm_unpacklo_epi8(t31, _mm_unpackhi_epi64(t32, t32)); //          c0 c1 ... c15
+}
+
+inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
+{ // Reads 64 bytes of packed (a, b, c, d) quadruplets from ptr and splits them into one vector per channel.
+    __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1 ...
+    __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
+    __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 32)); // a8 b8 c8 d8 ...
+    __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 48)); // a12 b12 c12 d12 ...
+
+    __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ...
+    __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ...
+    __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ...
+    __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b6 b14 ...
+
+    u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 b0 b4 b8 b12 c0 ... d0 ...
+    u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ...
+    u2 = _mm_unpackhi_epi8(v0, v2); // a1 a5 a9 a13 ...
+    u3 = _mm_unpackhi_epi8(v1, v3); // a3 a7 a11 a15 ...
+
+    v0 = _mm_unpacklo_epi8(u0, u1); // a0 a2 a4 ... a14 b0 b2 b4 ... b14
+    v1 = _mm_unpacklo_epi8(u2, u3); // a1 a3 a5 ... a15 b1 b3 b5 ... b15
+    v2 = _mm_unpackhi_epi8(u0, u1); // c0 c2 c4 ... c14 d0 d2 d4 ... d14
+    v3 = _mm_unpackhi_epi8(u2, u3); // c1 c3 c5 ... c15 d1 d3 d5 ... d15
+
+    a.val = _mm_unpacklo_epi8(v0, v1); // a0 a1 ... a15
+    b.val = _mm_unpackhi_epi8(v0, v1); // b0 b1 ... b15 (b lives in the high half of v0/v1)
+    c.val = _mm_unpacklo_epi8(v2, v3); // c0 c1 ... c15
+    d.val = _mm_unpackhi_epi8(v2, v3); // d0 d1 ... d15 (d lives in the high half of v2/v3)
+}
+
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
+{ // Reads 24 packed 16-bit values (a, b, c triplets) from ptr and splits them into one vector per channel.
+    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr); // elements 0..7:  a0 b0 c0 a1 b1 c1 a2 b2
+    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 8)); // elements 8..15
+    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // elements 16..23
+
+    __m128i t10 = _mm_unpacklo_epi16(t00, _mm_unpackhi_epi64(t01, t01)); // Stage 1 of 3. Every stage is a perfect shuffle of the
+    __m128i t11 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t00, t00), t02); // 24-element sequence: the first 12 elements are
+    __m128i t12 = _mm_unpacklo_epi16(t01, _mm_unpackhi_epi64(t02, t02)); // interleaved with the last 12.
+
+    __m128i t20 = _mm_unpacklo_epi16(t10, _mm_unpackhi_epi64(t11, t11)); // Stage 2
+    __m128i t21 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t10, t10), t12);
+    __m128i t22 = _mm_unpacklo_epi16(t11, _mm_unpackhi_epi64(t12, t12));
+
+    a.val = _mm_unpacklo_epi16(t20, _mm_unpackhi_epi64(t21, t21)); // Stage 3: a0 a1 ... a7
+    b.val = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t20, t20), t22); //          b0 b1 ... b7
+    c.val = _mm_unpacklo_epi16(t21, _mm_unpackhi_epi64(t22, t22)); //          c0 c1 ... c7
+}
+
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
+{   // Reads 32 packed 16-bit values ((a, b, c, d) quadruplets) and splits them into one vector per channel.
+    const __m128i px01 = _mm_loadu_si128((const __m128i*)ptr);        // a0 b0 c0 d0 a1 b1 c1 d1
+    const __m128i px23 = _mm_loadu_si128((const __m128i*)(ptr + 8));  // a2 b2 c2 d2 a3 b3 c3 d3
+    const __m128i px45 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 a5 b5 c5 d5
+    const __m128i px67 = _mm_loadu_si128((const __m128i*)(ptr + 24)); // a6 b6 c6 d6 a7 b7 c7 d7
+
+    const __m128i s0 = _mm_unpacklo_epi16(px01, px45); // a0 a4 b0 b4 c0 c4 d0 d4
+    const __m128i s1 = _mm_unpackhi_epi16(px01, px45); // a1 a5 b1 b5 c1 c5 d1 d5
+    const __m128i s2 = _mm_unpacklo_epi16(px23, px67); // a2 a6 b2 b6 c2 c6 d2 d6
+    const __m128i s3 = _mm_unpackhi_epi16(px23, px67); // a3 a7 b3 b7 c3 c7 d3 d7
+
+    const __m128i ab_even = _mm_unpacklo_epi16(s0, s2); // a0 a2 a4 a6 b0 b2 b4 b6
+    const __m128i ab_odd  = _mm_unpacklo_epi16(s1, s3); // a1 a3 a5 a7 b1 b3 b5 b7
+    const __m128i cd_even = _mm_unpackhi_epi16(s0, s2); // c0 c2 c4 c6 d0 d2 d4 d6
+    const __m128i cd_odd  = _mm_unpackhi_epi16(s1, s3); // c1 c3 c5 c7 d1 d3 d5 d7
+
+    a.val = _mm_unpacklo_epi16(ab_even, ab_odd);
+    b.val = _mm_unpackhi_epi16(ab_even, ab_odd);
+    c.val = _mm_unpacklo_epi16(cd_even, cd_odd);
+    d.val = _mm_unpackhi_epi16(cd_even, cd_odd);
+}
+
+inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
+{ // Reads 12 packed 32-bit values ((a, b, c) triplets) from ptr and splits them into one vector per channel.
+    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 a1
+    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // b1 c1 a2 b2
+    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 8)); // c2 a3 b3 c3
+
+    __m128i t10 = _mm_unpacklo_epi32(t00, _mm_unpackhi_epi64(t01, t01)); // a0 a2 b0 b2
+    __m128i t11 = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t00, t00), t02); // c0 c2 a1 a3
+    __m128i t12 = _mm_unpacklo_epi32(t01, _mm_unpackhi_epi64(t02, t02)); // b1 b3 c1 c3
+
+    a.val = _mm_unpacklo_epi32(t10, _mm_unpackhi_epi64(t11, t11)); // a0 a1 a2 a3
+    b.val = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t10, t10), t12); // b0 b1 b2 b3
+    c.val = _mm_unpacklo_epi32(t11, _mm_unpackhi_epi64(t12, t12)); // c0 c1 c2 c3
+}
+
+inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
+{   // Four consecutive (a, b, c, d) pixels form a 4x4 matrix; its transpose is the four channel vectors.
+    const v_uint32x4 px0(_mm_loadu_si128((const __m128i*)(ptr + 0)));  // a0 b0 c0 d0
+    const v_uint32x4 px1(_mm_loadu_si128((const __m128i*)(ptr + 4)));  // a1 b1 c1 d1
+    const v_uint32x4 px2(_mm_loadu_si128((const __m128i*)(ptr + 8)));  // a2 b2 c2 d2
+    const v_uint32x4 px3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3
+
+    v_transpose4x4(px0, px1, px2, px3, a, b, c, d);
+}
+
+inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
+                                const v_uint8x16& c )
+{ // Writes 16 packed (a, b, c) byte triplets (48 bytes) to ptr.
+    __m128i z = _mm_setzero_si128();
+    __m128i ab0 = _mm_unpacklo_epi8(a.val, b.val); // a0 b0 a1 b1 ... a7 b7
+    __m128i ab1 = _mm_unpackhi_epi8(a.val, b.val); // a8 b8 ... a15 b15
+    __m128i c0 = _mm_unpacklo_epi8(c.val, z); // c0 0 c1 0 ... c7 0
+    __m128i c1 = _mm_unpackhi_epi8(c.val, z); // c8 0 ... c15 0
+
+    __m128i p00 = _mm_unpacklo_epi16(ab0, c0); // padded 4-byte groups gN = (aN bN cN 0): g0 g1 g2 g3
+    __m128i p01 = _mm_unpackhi_epi16(ab0, c0); // g4 g5 g6 g7
+    __m128i p02 = _mm_unpacklo_epi16(ab1, c1); // g8 g9 g10 g11
+    __m128i p03 = _mm_unpackhi_epi16(ab1, c1); // g12 g13 g14 g15
+
+    __m128i p10 = _mm_unpacklo_epi32(p00, p01); // g0 g4 g1 g5
+    __m128i p11 = _mm_unpackhi_epi32(p00, p01); // g2 g6 g3 g7
+    __m128i p12 = _mm_unpacklo_epi32(p02, p03); // g8 g12 g9 g13
+    __m128i p13 = _mm_unpackhi_epi32(p02, p03); // g10 g14 g11 g15
+
+    __m128i p20 = _mm_unpacklo_epi64(p10, p11); // g0 g4 g2 g6   (even groups)
+    __m128i p21 = _mm_unpackhi_epi64(p10, p11); // g1 g5 g3 g7   (odd groups)
+    __m128i p22 = _mm_unpacklo_epi64(p12, p13); // g8 g12 g10 g14
+    __m128i p23 = _mm_unpackhi_epi64(p12, p13); // g9 g13 g11 g15
+
+    p20 = _mm_slli_si128(p20, 1); // even groups become (0 aN bN cN): the pad byte moves in front
+    p22 = _mm_slli_si128(p22, 1);
+
+    __m128i p30 = _mm_slli_epi64(_mm_unpacklo_epi32(p20, p21), 8); // 0 0 a0 b0 c0 a1 b1 c1 | 0 0 a4 b4 c4 a5 b5 c5
+    __m128i p31 = _mm_srli_epi64(_mm_unpackhi_epi32(p20, p21), 8); // a2 b2 c2 a3 b3 c3 0 0 | a6 b6 c6 a7 b7 c7 0 0
+    __m128i p32 = _mm_slli_epi64(_mm_unpacklo_epi32(p22, p23), 8); // same layout for pixels 8,9 | 12,13
+    __m128i p33 = _mm_srli_epi64(_mm_unpackhi_epi32(p22, p23), 8); // same layout for pixels 10,11 | 14,15
+
+    __m128i p40 = _mm_unpacklo_epi64(p30, p31); // 0 0 [pixels 0..3, 12 bytes] 0 0
+    __m128i p41 = _mm_unpackhi_epi64(p30, p31); // 0 0 [pixels 4..7] 0 0
+    __m128i p42 = _mm_unpacklo_epi64(p32, p33); // 0 0 [pixels 8..11] 0 0
+    __m128i p43 = _mm_unpackhi_epi64(p32, p33); // 0 0 [pixels 12..15] 0 0
+
+    __m128i v0 = _mm_or_si128(_mm_srli_si128(p40, 2), _mm_slli_si128(p41, 10)); // 12 bytes of p40 + first 4 payload bytes of p41
+    __m128i v1 = _mm_or_si128(_mm_srli_si128(p41, 6), _mm_slli_si128(p42, 6)); // last 8 payload bytes of p41 + first 8 of p42
+    __m128i v2 = _mm_or_si128(_mm_srli_si128(p42, 10), _mm_slli_si128(p43, 2)); // last 4 payload bytes of p42 + 12 bytes of p43
+
+    _mm_storeu_si128((__m128i*)(ptr), v0);
+    _mm_storeu_si128((__m128i*)(ptr + 16), v1);
+    _mm_storeu_si128((__m128i*)(ptr + 32), v2);
+}
+
+inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
+                                const v_uint8x16& c, const v_uint8x16& d)
+{
+    // Two rounds of byte interleaving turn the four channel vectors into
+    // 16 packed (a, b, c, d) quadruplets, written to the 64 bytes at ptr.
+    //
+    // Round 1: pair channel a with c, and channel b with d.
+    const __m128i ac_lo = _mm_unpacklo_epi8(a.val, c.val); // a0 c0 a1 c1 ...
+    const __m128i ac_hi = _mm_unpackhi_epi8(a.val, c.val); // a8 c8 a9 c9 ...
+    const __m128i bd_lo = _mm_unpacklo_epi8(b.val, d.val); // b0 d0 b1 d1 ...
+    const __m128i bd_hi = _mm_unpackhi_epi8(b.val, d.val); // b8 d8 b9 d9 ...
+    // Round 2: interleave the (a, c) pairs with the (b, d) pairs.
+    const __m128i quad0 = _mm_unpacklo_epi8(ac_lo, bd_lo); // a0 b0 c0 d0 ...
+    const __m128i quad1 = _mm_unpackhi_epi8(ac_lo, bd_lo); // a4 b4 c4 d4 ...
+    const __m128i quad2 = _mm_unpacklo_epi8(ac_hi, bd_hi); // a8 b8 c8 d8 ...
+    const __m128i quad3 = _mm_unpackhi_epi8(ac_hi, bd_hi); // a12 b12 c12 d12 ...
+
+    _mm_storeu_si128((__m128i*)ptr, quad0);
+    _mm_storeu_si128((__m128i*)(ptr + 16), quad1);
+    _mm_storeu_si128((__m128i*)(ptr + 32), quad2);
+    _mm_storeu_si128((__m128i*)(ptr + 48), quad3);
+}
+
+inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
+                                const v_uint16x8& b,
+                                const v_uint16x8& c )
+{ // Writes 8 packed (a, b, c) triplets of 16-bit values (24 elements) to ptr.
+    __m128i z = _mm_setzero_si128();
+    __m128i ab0 = _mm_unpacklo_epi16(a.val, b.val); // a0 b0 a1 b1 a2 b2 a3 b3
+    __m128i ab1 = _mm_unpackhi_epi16(a.val, b.val); // a4 b4 ... a7 b7
+    __m128i c0 = _mm_unpacklo_epi16(c.val, z); // c0 0 c1 0 c2 0 c3 0
+    __m128i c1 = _mm_unpackhi_epi16(c.val, z); // c4 0 ... c7 0
+
+    __m128i p10 = _mm_unpacklo_epi32(ab0, c0); // padded groups gN = (aN bN cN 0): g0 g1
+    __m128i p11 = _mm_unpackhi_epi32(ab0, c0); // g2 g3
+    __m128i p12 = _mm_unpacklo_epi32(ab1, c1); // g4 g5
+    __m128i p13 = _mm_unpackhi_epi32(ab1, c1); // g6 g7
+
+    __m128i p20 = _mm_unpacklo_epi64(p10, p11); // g0 g2   (even groups)
+    __m128i p21 = _mm_unpackhi_epi64(p10, p11); // g1 g3   (odd groups)
+    __m128i p22 = _mm_unpacklo_epi64(p12, p13); // g4 g6
+    __m128i p23 = _mm_unpackhi_epi64(p12, p13); // g5 g7
+
+    p20 = _mm_slli_si128(p20, 2); // even groups become (0 aN bN cN): the pad element moves in front
+    p22 = _mm_slli_si128(p22, 2);
+
+    __m128i p30 = _mm_unpacklo_epi64(p20, p21); // 0 a0 b0 c0 a1 b1 c1 0
+    __m128i p31 = _mm_unpackhi_epi64(p20, p21); // 0 a2 b2 c2 a3 b3 c3 0
+    __m128i p32 = _mm_unpacklo_epi64(p22, p23); // 0 a4 b4 c4 a5 b5 c5 0
+    __m128i p33 = _mm_unpackhi_epi64(p22, p23); // 0 a6 b6 c6 a7 b7 c7 0
+
+    __m128i v0 = _mm_or_si128(_mm_srli_si128(p30, 2), _mm_slli_si128(p31, 10)); // a0 b0 c0 a1 b1 c1 a2 b2
+    __m128i v1 = _mm_or_si128(_mm_srli_si128(p31, 6), _mm_slli_si128(p32, 6)); // c2 a3 b3 c3 a4 b4 c4 a5
+    __m128i v2 = _mm_or_si128(_mm_srli_si128(p32, 10), _mm_slli_si128(p33, 2)); // b5 c5 a6 b6 c6 a7 b7 c7
+
+    _mm_storeu_si128((__m128i*)(ptr), v0);
+    _mm_storeu_si128((__m128i*)(ptr + 8), v1);
+    _mm_storeu_si128((__m128i*)(ptr + 16), v2);
+}
+
+inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
+                                const v_uint16x8& c, const v_uint16x8& d)
+{
+    // Two rounds of 16-bit interleaving turn the four channel vectors into
+    // 8 packed (a, b, c, d) quadruplets, written to the 32 elements at ptr.
+    //
+    // Round 1: pair channel a with c, and channel b with d.
+    const __m128i ac_lo = _mm_unpacklo_epi16(a.val, c.val); // a0 c0 a1 c1 ...
+    const __m128i ac_hi = _mm_unpackhi_epi16(a.val, c.val); // a4 c4 a5 c5 ...
+    const __m128i bd_lo = _mm_unpacklo_epi16(b.val, d.val); // b0 d0 b1 d1 ...
+    const __m128i bd_hi = _mm_unpackhi_epi16(b.val, d.val); // b4 d4 b5 d5 ...
+    // Round 2: interleave the (a, c) pairs with the (b, d) pairs.
+    const __m128i quad0 = _mm_unpacklo_epi16(ac_lo, bd_lo); // a0 b0 c0 d0 ...
+    const __m128i quad1 = _mm_unpackhi_epi16(ac_lo, bd_lo); // a2 b2 c2 d2 ...
+    const __m128i quad2 = _mm_unpacklo_epi16(ac_hi, bd_hi); // a4 b4 c4 d4 ...
+    const __m128i quad3 = _mm_unpackhi_epi16(ac_hi, bd_hi); // a6 b6 c6 d6 ...
+
+    _mm_storeu_si128((__m128i*)ptr, quad0);
+    _mm_storeu_si128((__m128i*)(ptr + 8), quad1);
+    _mm_storeu_si128((__m128i*)(ptr + 16), quad2);
+    _mm_storeu_si128((__m128i*)(ptr + 24), quad3);
+}
+
+inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
+                                const v_uint32x4& c )
+{   // Transpose with an all-zero fourth channel, then squeeze the zero lane out of every pixel.
+    v_uint32x4 pad = v_setzero_u32(), px0, px1, px2, px3;   // after the transpose: pxN = aN bN cN 0
+    v_transpose4x4(a, b, c, pad, px0, px1, px2, px3);
+
+    const __m128i out0 = _mm_or_si128(px0.val, _mm_slli_si128(px1.val, 12));                        // a0 b0 c0 a1
+    const __m128i out1 = _mm_or_si128(_mm_srli_si128(px1.val, 4), _mm_slli_si128(px2.val, 8));      // b1 c1 a2 b2
+    const __m128i out2 = _mm_or_si128(_mm_srli_si128(px2.val, 8), _mm_slli_si128(px3.val, 4));      // c2 a3 b3 c3
+
+    _mm_storeu_si128((__m128i*)(ptr + 0), out0);
+    _mm_storeu_si128((__m128i*)(ptr + 4), out1);
+    _mm_storeu_si128((__m128i*)(ptr + 8), out2);
+}
+
+inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
+                               const v_uint32x4& c, const v_uint32x4& d)
+{   // Transposing the four channel vectors yields one register per packed (a, b, c, d) pixel.
+    v_uint32x4 px0, px1, px2, px3;
+    v_transpose4x4(a, b, c, d, px0, px1, px2, px3);
+    v_store(ptr + 0, px0);    // a0 b0 c0 d0
+    v_store(ptr + 4, px1);    // a1 b1 c1 d1
+    v_store(ptr + 8, px2);    // a2 b2 c2 d2
+    v_store(ptr + 12, px3);   // a3 b3 c3 d3
+}
+
+#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) /* 3/4-channel (de)interleave for _Tpvec, delegating to the same-width unsigned overloads */ \
+inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
+                                 _Tpvec& b0, _Tpvec& c0 ) \
+{ /* 3 channels: load through the unsigned overload, then reinterpret each result */ \
+    _Tpuvec a1, b1, c1; \
+    v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1); \
+    a0 = v_reinterpret_as_##suffix(a1); \
+    b0 = v_reinterpret_as_##suffix(b1); \
+    c0 = v_reinterpret_as_##suffix(c1); \
+} \
+inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
+                                 _Tpvec& b0, _Tpvec& c0, _Tpvec& d0 ) \
+{ /* 4 channels: same scheme */ \
+    _Tpuvec a1, b1, c1, d1; \
+    v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1, d1); \
+    a0 = v_reinterpret_as_##suffix(a1); \
+    b0 = v_reinterpret_as_##suffix(b1); \
+    c0 = v_reinterpret_as_##suffix(c1); \
+    d0 = v_reinterpret_as_##suffix(d1); \
+} \
+inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, \
+                               const _Tpvec& b0, const _Tpvec& c0 ) \
+{ /* 3 channels: reinterpret each input as unsigned, then store through the unsigned overload */ \
+    _Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \
+    _Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \
+    _Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \
+    v_store_interleave((_Tpu*)ptr, a1, b1, c1); \
+} \
+inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, const _Tpvec& b0, \
+                               const _Tpvec& c0, const _Tpvec& d0 ) \
+{ /* 4 channels: same scheme */ \
+    _Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \
+    _Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \
+    _Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \
+    _Tpuvec d1 = v_reinterpret_as_##usuffix(d0); \
+    v_store_interleave((_Tpu*)ptr, a1, b1, c1, d1); \
+}
+
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float32x4, float, f32, v_uint32x4, unsigned, u32) // floats are shuffled as raw 32-bit patterns
+
+// Converts the four signed 32-bit integer lanes of `a` to single-precision
+// floats with _mm_cvtepi32_ps.
+inline v_float32x4 v_cvt_f32(const v_int32x4& a)
+{ return v_float32x4(_mm_cvtepi32_ps(a.val)); }
+
+// Narrows the two double-precision lanes of `a` to single precision with
+// _mm_cvtpd_ps; the two results occupy the low lanes of the returned vector.
+inline v_float32x4 v_cvt_f32(const v_float64x2& a)
+{ return v_float32x4(_mm_cvtpd_ps(a.val)); }
+
+// Converts the two low signed 32-bit integer lanes of `a` to double precision
+// with _mm_cvtepi32_pd; the two upper lanes of `a` are not used.
+inline v_float64x2 v_cvt_f64(const v_int32x4& a)
+{ return v_float64x2(_mm_cvtepi32_pd(a.val)); }
+
+// Widens the two low single-precision lanes of `a` to double precision with
+// _mm_cvtps_pd; the two upper lanes of `a` are not used.
+inline v_float64x2 v_cvt_f64(const v_float32x4& a)
+{ return v_float64x2(_mm_cvtps_pd(a.val)); }
+
+}
+
+#endif
diff --git a/modules/hal/src/arithm.cpp b/modules/hal/src/arithm.cpp
new file mode 100644
index 0000000000..a3f69facca
--- /dev/null
+++ b/modules/hal/src/arithm.cpp
@@ -0,0 +1,47 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+// This translation unit currently defines nothing inside cv::hal; the
+// namespace is opened and closed empty.
+namespace cv { namespace hal {
+
+}}
diff --git a/modules/hal/src/color.cpp b/modules/hal/src/color.cpp
new file mode 100644
index 0000000000..a3f69facca
--- /dev/null
+++ b/modules/hal/src/color.cpp
@@ -0,0 +1,47 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+// This translation unit currently defines nothing inside cv::hal; the
+// namespace is opened and closed empty.
+namespace cv { namespace hal {
+
+}}
diff --git a/modules/hal/src/filter.cpp b/modules/hal/src/filter.cpp
new file mode 100644
index 0000000000..a3f69facca
--- /dev/null
+++ b/modules/hal/src/filter.cpp
@@ -0,0 +1,47 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+// This translation unit currently defines nothing inside cv::hal; the
+// namespace is opened and closed empty.
+namespace cv { namespace hal {
+
+}}
diff --git a/modules/hal/src/mathfuncs.cpp b/modules/hal/src/mathfuncs.cpp
new file mode 100644
index 0000000000..7d0199f00c
--- /dev/null
+++ b/modules/hal/src/mathfuncs.cpp
@@ -0,0 +1,1352 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+namespace cv { namespace hal {
+
+///////////////////////////////////// ATAN2 ////////////////////////////////////
+// Coefficients of the odd polynomial p1*c + p3*c^3 + p5*c^5 + p7*c^7 that
+// fastAtan2() evaluates for a ratio c. Each one is pre-multiplied by 180/pi,
+// so the polynomial yields an angle in degrees.
+static const float atan2_p1 = 0.9997878412794807f*(float)(180/CV_PI);
+static const float atan2_p3 = -0.3258083974640975f*(float)(180/CV_PI);
+static const float atan2_p5 = 0.1555786518463281f*(float)(180/CV_PI);
+static const float atan2_p7 = -0.04432655554792128f*(float)(180/CV_PI);
+
+#if CV_NEON
+// Per-lane reciprocal 1/val for NEON: starts from the vrecpeq_f32 estimate and
+// refines it twice, each time multiplying the current estimate by the
+// vrecpsq_f32 correction term (a Newton-Raphson step).
+static inline float32x4_t cv_vrecpq_f32(float32x4_t val)
+{
+    float32x4_t estimate = vrecpeq_f32(val);
+    for( int step = 0; step < 2; ++step )
+        estimate = vmulq_f32(vrecpsq_f32(val, estimate), estimate);
+    return estimate;
+}
+#endif
+
+// Computes, for each of the `len` element pairs, the angle of the vector
+// (X[i], Y[i]) using a 7th-order odd polynomial approximation of atan, and
+// writes it to angle[i].
+//   Y, X           - input coordinate arrays (note the y-first argument order)
+//   angle          - output array
+//   len            - number of elements to process
+//   angleInDegrees - true: keep the result in degrees; false: multiply by
+//                    pi/180 to obtain radians
+// The SSE2 / NEON blocks handle four elements per iteration; the scalar loop
+// at the end processes whatever remains (or everything when neither block is
+// compiled in).
+void fastAtan2(const float *Y, const float *X, float *angle, int len, bool angleInDegrees )
+{
+    int i = 0;
+    // The polynomial coefficients already contain the 180/pi factor, so the raw
+    // result is in degrees; `scale` converts it to radians on request.
+    float scale = angleInDegrees ? 1 : (float)(CV_PI/180);
+
+#ifdef HAVE_TEGRA_OPTIMIZATION
+    // Optional Tegra implementation; if it reports success nothing is left to do.
+    if (tegra::useTegra() && tegra::FastAtan2_32f(Y, X, angle, len, scale))
+        return;
+#endif
+
+#if CV_SSE2
+    // 0x7fffffff clears the IEEE sign bit, i.e. AND-ing with it yields |v|.
+    Cv32suf iabsmask; iabsmask.i = 0x7fffffff;
+    __m128 eps = _mm_set1_ps((float)DBL_EPSILON), absmask = _mm_set1_ps(iabsmask.f);
+    __m128 _90 = _mm_set1_ps(90.f), _180 = _mm_set1_ps(180.f), _360 = _mm_set1_ps(360.f);
+    __m128 z = _mm_setzero_ps(), scale4 = _mm_set1_ps(scale);
+    __m128 p1 = _mm_set1_ps(atan2_p1), p3 = _mm_set1_ps(atan2_p3);
+    __m128 p5 = _mm_set1_ps(atan2_p5), p7 = _mm_set1_ps(atan2_p7);
+
+    for( ; i <= len - 4; i += 4 )
+    {
+        __m128 x = _mm_loadu_ps(X + i), y = _mm_loadu_ps(Y + i);
+        __m128 ax = _mm_and_ps(x, absmask), ay = _mm_and_ps(y, absmask);
+        // mask: lanes where |x| < |y| (the angle must be reflected about 45 degrees).
+        __m128 mask = _mm_cmplt_ps(ax, ay);
+        __m128 tmin = _mm_min_ps(ax, ay), tmax = _mm_max_ps(ax, ay);
+        // c = min/max; eps keeps the denominator non-zero when both inputs are 0.
+        __m128 c = _mm_div_ps(tmin, _mm_add_ps(tmax, eps));
+        __m128 c2 = _mm_mul_ps(c, c);
+        // Horner evaluation of ((p7*c2 + p5)*c2 + p3)*c2 + p1, then times c.
+        __m128 a = _mm_mul_ps(c2, p7);
+        a = _mm_mul_ps(_mm_add_ps(a, p5), c2);
+        a = _mm_mul_ps(_mm_add_ps(a, p3), c2);
+        a = _mm_mul_ps(_mm_add_ps(a, p1), c);
+
+        // a ^ ((a ^ b) & mask) is a branchless select: b where mask is set, a
+        // elsewhere. Here: a = 90 - a where |x| < |y|.
+        __m128 b = _mm_sub_ps(_90, a);
+        a = _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(a, b), mask));
+
+        // a = 180 - a where x < 0.
+        b = _mm_sub_ps(_180, a);
+        mask = _mm_cmplt_ps(x, z);
+        a = _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(a, b), mask));
+
+        // a = 360 - a where y < 0.
+        b = _mm_sub_ps(_360, a);
+        mask = _mm_cmplt_ps(y, z);
+        a = _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(a, b), mask));
+
+        a = _mm_mul_ps(a, scale4);
+        _mm_storeu_ps(angle + i, a);
+    }
+#elif CV_NEON
+    float32x4_t eps = vdupq_n_f32((float)DBL_EPSILON);
+    float32x4_t _90 = vdupq_n_f32(90.f), _180 = vdupq_n_f32(180.f), _360 = vdupq_n_f32(360.f);
+    float32x4_t z = vdupq_n_f32(0.0f), scale4 = vdupq_n_f32(scale);
+    float32x4_t p1 = vdupq_n_f32(atan2_p1), p3 = vdupq_n_f32(atan2_p3);
+    float32x4_t p5 = vdupq_n_f32(atan2_p5), p7 = vdupq_n_f32(atan2_p7);
+
+    for( ; i <= len - 4; i += 4 )
+    {
+        float32x4_t x = vld1q_f32(X + i), y = vld1q_f32(Y + i);
+        float32x4_t ax = vabsq_f32(x), ay = vabsq_f32(y);
+        float32x4_t tmin = vminq_f32(ax, ay), tmax = vmaxq_f32(ax, ay);
+        // Division is done as a multiplication by the refined reciprocal.
+        float32x4_t c = vmulq_f32(tmin, cv_vrecpq_f32(vaddq_f32(tmax, eps)));
+        float32x4_t c2 = vmulq_f32(c, c);
+        float32x4_t a = vmulq_f32(c2, p7);
+        a = vmulq_f32(vaddq_f32(a, p5), c2);
+        a = vmulq_f32(vaddq_f32(a, p3), c2);
+        a = vmulq_f32(vaddq_f32(a, p1), c);
+
+        // Same three fix-ups as the SSE2 path, expressed with vbslq_f32 selects.
+        a = vbslq_f32(vcgeq_f32(ax, ay), a, vsubq_f32(_90, a));
+        a = vbslq_f32(vcltq_f32(x, z), vsubq_f32(_180, a), a);
+        a = vbslq_f32(vcltq_f32(y, z), vsubq_f32(_360, a), a);
+
+        vst1q_f32(angle + i, vmulq_f32(a, scale4));
+    }
+#endif
+
+    // Scalar path for the remaining elements.
+    for( ; i < len; i++ )
+    {
+        float x = X[i], y = Y[i];
+        float ax = std::abs(x), ay = std::abs(y);
+        float a, c, c2;
+        // Always divide the smaller magnitude by the larger one so that the
+        // polynomial argument c stays within [0, 1].
+        if( ax >= ay )
+        {
+            c = ay/(ax + (float)DBL_EPSILON);
+            c2 = c*c;
+            a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;
+        }
+        else
+        {
+            c = ax/(ay + (float)DBL_EPSILON);
+            c2 = c*c;
+            a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;
+        }
+        // Reflect according to the signs of x and y.
+        if( x < 0 )
+            a = 180.f - a;
+        if( y < 0 )
+            a = 360.f - a;
+        angle[i] = (float)(a*scale);
+    }
+}
+
+
+// Element-wise Euclidean magnitude: mag[k] = sqrt(x[k]^2 + y[k]^2) for
+// k in [0, len), single precision.
+void magnitude(const float* x, const float* y, float* mag, int len)
+{
+    int pos = 0;
+
+#if CV_SIMD128
+    // Vector path: 8 floats (two 4-lane registers) per iteration.
+    while( pos <= len - 8 )
+    {
+        v_float32x4 xlo = v_load(x + pos), xhi = v_load(x + pos + 4);
+        v_float32x4 ylo = v_load(y + pos), yhi = v_load(y + pos + 4);
+        v_float32x4 mlo = v_sqrt(v_muladd(xlo, xlo, ylo*ylo));
+        v_float32x4 mhi = v_sqrt(v_muladd(xhi, xhi, yhi*yhi));
+        v_store(mag + pos, mlo);
+        v_store(mag + pos + 4, mhi);
+        pos += 8;
+    }
+#endif
+
+    // Scalar path for the remaining elements.
+    while( pos < len )
+    {
+        float xv = x[pos], yv = y[pos];
+        mag[pos] = std::sqrt(xv*xv + yv*yv);
+        ++pos;
+    }
+}
+
+// Element-wise Euclidean magnitude: mag[k] = sqrt(x[k]^2 + y[k]^2) for
+// k in [0, len), double precision.
+void magnitude(const double* x, const double* y, double* mag, int len)
+{
+    int pos = 0;
+
+#if CV_SIMD128_64F
+    // Vector path: 4 doubles (two 2-lane registers) per iteration.
+    while( pos <= len - 4 )
+    {
+        v_float64x2 xlo = v_load(x + pos), xhi = v_load(x + pos + 2);
+        v_float64x2 ylo = v_load(y + pos), yhi = v_load(y + pos + 2);
+        v_float64x2 mlo = v_sqrt(v_muladd(xlo, xlo, ylo*ylo));
+        v_float64x2 mhi = v_sqrt(v_muladd(xhi, xhi, yhi*yhi));
+        v_store(mag + pos, mlo);
+        v_store(mag + pos + 2, mhi);
+        pos += 4;
+    }
+#endif
+
+    // Scalar path for the remaining elements.
+    while( pos < len )
+    {
+        double xv = x[pos], yv = y[pos];
+        mag[pos] = std::sqrt(xv*xv + yv*yv);
+        ++pos;
+    }
+}
+
+
+// Element-wise inverse square root: dst[k] = 1/sqrt(src[k]) for k in [0, len),
+// single precision.
+void invSqrt(const float* src, float* dst, int len)
+{
+    int pos = 0;
+
+#if CV_SIMD128
+    // Vector path: 8 floats (two 4-lane registers) per iteration.
+    while( pos <= len - 8 )
+    {
+        v_float32x4 lo = v_load(src + pos);
+        v_float32x4 hi = v_load(src + pos + 4);
+        lo = v_invsqrt(lo);
+        hi = v_invsqrt(hi);
+        v_store(dst + pos, lo);
+        v_store(dst + pos + 4, hi);
+        pos += 8;
+    }
+#endif
+
+    // Scalar path for the remaining elements.
+    while( pos < len )
+    {
+        dst[pos] = 1/std::sqrt(src[pos]);
+        ++pos;
+    }
+}
+
+
+// Element-wise inverse square root: dst[k] = 1/sqrt(src[k]) for k in [0, len),
+// double precision.
+void invSqrt(const double* src, double* dst, int len)
+{
+    int pos = 0;
+
+#if CV_SSE2
+    // Vector path: 2 doubles per iteration, computed as 1.0 / sqrt(v).
+    const __m128d ones = _mm_set1_pd(1.0);
+    while( pos <= len - 2 )
+    {
+        const __m128d values = _mm_loadu_pd(src + pos);
+        const __m128d roots = _mm_sqrt_pd(values);
+        _mm_storeu_pd(dst + pos, _mm_div_pd(ones, roots));
+        pos += 2;
+    }
+#endif
+
+    // Scalar path for the remaining elements.
+    while( pos < len )
+    {
+        dst[pos] = 1/std::sqrt(src[pos]);
+        ++pos;
+    }
+}
+
+
+// Element-wise square root: dst[k] = sqrt(src[k]) for k in [0, len),
+// single precision.
+void sqrt(const float* src, float* dst, int len)
+{
+    int pos = 0;
+
+#if CV_SIMD128
+    // Vector path: 8 floats (two 4-lane registers) per iteration.
+    while( pos <= len - 8 )
+    {
+        v_float32x4 lo = v_load(src + pos);
+        v_float32x4 hi = v_load(src + pos + 4);
+        lo = v_sqrt(lo);
+        hi = v_sqrt(hi);
+        v_store(dst + pos, lo);
+        v_store(dst + pos + 4, hi);
+        pos += 8;
+    }
+#endif
+
+    // Scalar path for the remaining elements.
+    while( pos < len )
+    {
+        dst[pos] = std::sqrt(src[pos]);
+        ++pos;
+    }
+}
+
+
+// Element-wise square root: dst[k] = sqrt(src[k]) for k in [0, len),
+// double precision.
+void sqrt(const double* src, double* dst, int len)
+{
+    int pos = 0;
+
+#if CV_SIMD128_64F
+    // Vector path: 4 doubles (two 2-lane registers) per iteration.
+    while( pos <= len - 4 )
+    {
+        v_float64x2 lo = v_load(src + pos);
+        v_float64x2 hi = v_load(src + pos + 2);
+        lo = v_sqrt(lo);
+        hi = v_sqrt(hi);
+        v_store(dst + pos, lo);
+        v_store(dst + pos + 2, hi);
+        pos += 4;
+    }
+#endif
+
+    // Scalar path for the remaining elements.
+    while( pos < len )
+    {
+        dst[pos] = std::sqrt(src[pos]);
+        ++pos;
+    }
+}
+
+////////////////////////////////////// EXP /////////////////////////////////////
+
+// Overlays a double with a pair of ints so the two halves of its bit pattern
+// can be accessed separately. The member order is swapped on big-endian
+// builds so that `lo` / `hi` keep referring to the low / high half.
+// NOTE(review): this relies on int being 32 bits wide - confirm for all
+// supported targets.
+typedef union
+{
+    struct {
+#if ( defined( WORDS_BIGENDIAN ) && !defined( OPENCV_UNIVERSAL_BUILD ) ) || defined( __BIG_ENDIAN__ )
+        int hi;
+        int lo;
+#else
+        int lo;
+        int hi;
+#endif
+    } i;
+    double d;
+}
+DBLINT;
+
+// The exp() lookup table has 2^EXPTAB_SCALE (= 64) entries; EXPTAB_MASK
+// extracts the table index from the low bits of the rounded, prescaled argument.
+#define EXPTAB_SCALE 6
+#define EXPTAB_MASK  ((1 << EXPTAB_SCALE) - 1)
+
+// Common factor folded into every expTab entry; the exp() polynomial
+// coefficients are divided by the same value.
+#define EXPPOLY_32F_A0 .9670371139572337719125840413672004409288e-2
+
+// expTab[k] = 2^(k/64) * EXPPOLY_32F_A0 for k = 0..63
+// (e.g. entry 32 is sqrt(2) * EXPPOLY_32F_A0).
+static const double expTab[] = {
+    1.0 * EXPPOLY_32F_A0,
+    1.0108892860517004600204097905619 * EXPPOLY_32F_A0,
+    1.0218971486541166782344801347833 * EXPPOLY_32F_A0,
+    1.0330248790212284225001082839705 * EXPPOLY_32F_A0,
+    1.0442737824274138403219664787399 * EXPPOLY_32F_A0,
+    1.0556451783605571588083413251529 * EXPPOLY_32F_A0,
+    1.0671404006768236181695211209928 * EXPPOLY_32F_A0,
+    1.0787607977571197937406800374385 * EXPPOLY_32F_A0,
+    1.0905077326652576592070106557607 * EXPPOLY_32F_A0,
+    1.1023825833078409435564142094256 * EXPPOLY_32F_A0,
+    1.1143867425958925363088129569196 * EXPPOLY_32F_A0,
+    1.126521618608241899794798643787 * EXPPOLY_32F_A0,
+    1.1387886347566916537038302838415 * EXPPOLY_32F_A0,
+    1.151189229952982705817759635202 * EXPPOLY_32F_A0,
+    1.1637248587775775138135735990922 * EXPPOLY_32F_A0,
+    1.1763969916502812762846457284838 * EXPPOLY_32F_A0,
+    1.1892071150027210667174999705605 * EXPPOLY_32F_A0,
+    1.2021567314527031420963969574978 * EXPPOLY_32F_A0,
+    1.2152473599804688781165202513388 * EXPPOLY_32F_A0,
+    1.2284805361068700056940089577928 * EXPPOLY_32F_A0,
+    1.2418578120734840485936774687266 * EXPPOLY_32F_A0,
+    1.2553807570246910895793906574423 * EXPPOLY_32F_A0,
+    1.2690509571917332225544190810323 * EXPPOLY_32F_A0,
+    1.2828700160787782807266697810215 * EXPPOLY_32F_A0,
+    1.2968395546510096659337541177925 * EXPPOLY_32F_A0,
+    1.3109612115247643419229917863308 * EXPPOLY_32F_A0,
+    1.3252366431597412946295370954987 * EXPPOLY_32F_A0,
+    1.3396675240533030053600306697244 * EXPPOLY_32F_A0,
+    1.3542555469368927282980147401407 * EXPPOLY_32F_A0,
+    1.3690024229745906119296011329822 * EXPPOLY_32F_A0,
+    1.3839098819638319548726595272652 * EXPPOLY_32F_A0,
+    1.3989796725383111402095281367152 * EXPPOLY_32F_A0,
+    1.4142135623730950488016887242097 * EXPPOLY_32F_A0,
+    1.4296133383919700112350657782751 * EXPPOLY_32F_A0,
+    1.4451808069770466200370062414717 * EXPPOLY_32F_A0,
+    1.4609177941806469886513028903106 * EXPPOLY_32F_A0,
+    1.476826145939499311386907480374 * EXPPOLY_32F_A0,
+    1.4929077282912648492006435314867 * EXPPOLY_32F_A0,
+    1.5091644275934227397660195510332 * EXPPOLY_32F_A0,
+    1.5255981507445383068512536895169 * EXPPOLY_32F_A0,
+    1.5422108254079408236122918620907 * EXPPOLY_32F_A0,
+    1.5590044002378369670337280894749 * EXPPOLY_32F_A0,
+    1.5759808451078864864552701601819 * EXPPOLY_32F_A0,
+    1.5931421513422668979372486431191 * EXPPOLY_32F_A0,
+    1.6104903319492543081795206673574 * EXPPOLY_32F_A0,
+    1.628027421857347766848218522014 * EXPPOLY_32F_A0,
+    1.6457554781539648445187567247258 * EXPPOLY_32F_A0,
+    1.6636765803267364350463364569764 * EXPPOLY_32F_A0,
+    1.6817928305074290860622509524664 * EXPPOLY_32F_A0,
+    1.7001063537185234695013625734975 * EXPPOLY_32F_A0,
+    1.7186192981224779156293443764563 * EXPPOLY_32F_A0,
+    1.7373338352737062489942020818722 * EXPPOLY_32F_A0,
+    1.7562521603732994831121606193753 * EXPPOLY_32F_A0,
+    1.7753764925265212525505592001993 * EXPPOLY_32F_A0,
+    1.7947090750031071864277032421278 * EXPPOLY_32F_A0,
+    1.8142521755003987562498346003623 * EXPPOLY_32F_A0,
+    1.8340080864093424634870831895883 * EXPPOLY_32F_A0,
+    1.8539791250833855683924530703377 * EXPPOLY_32F_A0,
+    1.8741676341102999013299989499544 * EXPPOLY_32F_A0,
+    1.8945759815869656413402186534269 * EXPPOLY_32F_A0,
+    1.9152065613971472938726112702958 * EXPPOLY_32F_A0,
+    1.9360617934922944505980559045667 * EXPPOLY_32F_A0,
+    1.9571441241754002690183222516269 * EXPPOLY_32F_A0,
+    1.9784560263879509682582499181312 * EXPPOLY_32F_A0,
+};
+
+
+// the code below uses _mm_cast* intrinsics, which are not available on VS2005
+#if (defined _MSC_VER && _MSC_VER < 1500) || \
+(!defined __APPLE__ && defined __GNUC__ && __GNUC__*100 + __GNUC_MINOR__ < 402)
+#undef CV_SSE2
+#define CV_SSE2 0
+#endif
+
+// exp_prescale  = log2(e) * 2^EXPTAB_SCALE: multiplying an argument by it
+//                 expresses the argument in units of 1/64 of a power of two.
+// exp_postscale = 2^-EXPTAB_SCALE: applied to the rounding remainder before
+//                 the polynomial is evaluated.
+// exp_max_val   = magnitude to which prescaled arguments are saturated.
+static const double exp_prescale = 1.4426950408889634073599246810019 * (1 << EXPTAB_SCALE);
+static const double exp_postscale = 1./(1 << EXPTAB_SCALE);
+static const double exp_max_val = 3000.*(1 << EXPTAB_SCALE); // log10(DBL_MAX) < 3000
+
+// Computes y[i] = exp(x[i]) for i in [0, n), single precision.
+//
+// Method, as implemented below: the argument is multiplied by exp_prescale and
+// rounded to an integer `val`. The bits of `val` above EXPTAB_SCALE become the
+// exponent of a float that is assembled directly from its bit pattern
+// ((t + 127) << 23), the low EXPTAB_SCALE bits index expTab, and the rounding
+// remainder (scaled by exp_postscale) is fed to the polynomial EXPPOLY. The
+// result is the product of those three factors.
+//
+//   _x - input array of n floats
+//   y  - output array of n floats
+//   n  - number of elements
+void exp( const float *_x, float *y, int n )
+{
+    // Polynomial coefficients, each divided by EXPPOLY_32F_A0; the expTab
+    // entries carry the matching EXPPOLY_32F_A0 factor.
+    static const float
+    A4 = (float)(1.000000000000002438532970795181890933776 / EXPPOLY_32F_A0),
+    A3 = (float)(.6931471805521448196800669615864773144641 / EXPPOLY_32F_A0),
+    A2 = (float)(.2402265109513301490103372422686535526573 / EXPPOLY_32F_A0),
+    A1 = (float)(.5550339366753125211915322047004666939128e-1 / EXPPOLY_32F_A0);
+
+// Horner form of the degree-4 remainder polynomial; its leading coefficient is 1.
+#undef EXPPOLY
+#define EXPPOLY(x)  \
+(((((x) + A1)*(x) + A2)*(x) + A3)*(x) + A4)
+
+    int i = 0;
+    // View the inputs through Cv32suf so that both the float value (.f) and
+    // its raw bits (.i) can be read.
+    const Cv32suf* x = (const Cv32suf*)_x;
+    // Scratch used to build the power-of-two factors from raw bit patterns.
+    Cv32suf buf[4];
+
+#if CV_SSE2
+    // The vector path processes 8 elements per iteration and is only entered
+    // when at least 8 elements are available.
+    if( n >= 8 )
+    {
+        static const __m128d prescale2 = _mm_set1_pd(exp_prescale);
+        static const __m128 postscale4 = _mm_set1_ps((float)exp_postscale);
+        static const __m128 maxval4 = _mm_set1_ps((float)(exp_max_val/exp_prescale));
+        static const __m128 minval4 = _mm_set1_ps((float)(-exp_max_val/exp_prescale));
+
+        static const __m128 mA1 = _mm_set1_ps(A1);
+        static const __m128 mA2 = _mm_set1_ps(A2);
+        static const __m128 mA3 = _mm_set1_ps(A3);
+        static const __m128 mA4 = _mm_set1_ps(A4);
+        // Aligned stores are used only if the output pointer is 16-byte aligned.
+        bool y_aligned = (size_t)(void*)y % 16 == 0;
+
+        // Table indices are spilled here so they can be used for scalar lookups.
+        ushort CV_DECL_ALIGNED(16) tab_idx[8];
+
+        for( ; i <= n - 8; i += 8 )
+        {
+            __m128 xf0, xf1;
+            xf0 = _mm_loadu_ps(&x[i].f);
+            xf1 = _mm_loadu_ps(&x[i+4].f);
+            __m128i xi0, xi1, xi2, xi3;
+
+            // Saturate the inputs to [-exp_max_val, exp_max_val] / exp_prescale.
+            xf0 = _mm_min_ps(_mm_max_ps(xf0, minval4), maxval4);
+            xf1 = _mm_min_ps(_mm_max_ps(xf1, minval4), maxval4);
+
+            // Widen to double: xd0/xd2 hold the low/high pairs of xf0,
+            // xd1/xd3 the low/high pairs of xf1.
+            __m128d xd0 = _mm_cvtps_pd(xf0);
+            __m128d xd2 = _mm_cvtps_pd(_mm_movehl_ps(xf0, xf0));
+            __m128d xd1 = _mm_cvtps_pd(xf1);
+            __m128d xd3 = _mm_cvtps_pd(_mm_movehl_ps(xf1, xf1));
+
+            xd0 = _mm_mul_pd(xd0, prescale2);
+            xd2 = _mm_mul_pd(xd2, prescale2);
+            xd1 = _mm_mul_pd(xd1, prescale2);
+            xd3 = _mm_mul_pd(xd3, prescale2);
+
+            // Convert the prescaled values to 32-bit integers.
+            xi0 = _mm_cvtpd_epi32(xd0);
+            xi2 = _mm_cvtpd_epi32(xd2);
+
+            xi1 = _mm_cvtpd_epi32(xd1);
+            xi3 = _mm_cvtpd_epi32(xd3);
+
+            // Remainder = prescaled value minus its integer conversion.
+            xd0 = _mm_sub_pd(xd0, _mm_cvtepi32_pd(xi0));
+            xd2 = _mm_sub_pd(xd2, _mm_cvtepi32_pd(xi2));
+            xd1 = _mm_sub_pd(xd1, _mm_cvtepi32_pd(xi1));
+            xd3 = _mm_sub_pd(xd3, _mm_cvtepi32_pd(xi3));
+
+            // Narrow the remainders back to float, restoring the lane order.
+            xf0 = _mm_movelh_ps(_mm_cvtpd_ps(xd0), _mm_cvtpd_ps(xd2));
+            xf1 = _mm_movelh_ps(_mm_cvtpd_ps(xd1), _mm_cvtpd_ps(xd3));
+
+            xf0 = _mm_mul_ps(xf0, postscale4);
+            xf1 = _mm_mul_ps(xf1, postscale4);
+
+            // Gather the eight integers into one register of saturated 16-bit lanes.
+            xi0 = _mm_unpacklo_epi64(xi0, xi2);
+            xi1 = _mm_unpacklo_epi64(xi1, xi3);
+            xi0 = _mm_packs_epi32(xi0, xi1);
+
+            // Low EXPTAB_SCALE bits -> table indices.
+            _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi16(EXPTAB_MASK)));
+
+            // Remaining bits + bias 127 -> float exponent field, clamped to
+            // [0, 255] and widened back to 32-bit lanes.
+            xi0 = _mm_add_epi16(_mm_srai_epi16(xi0, EXPTAB_SCALE), _mm_set1_epi16(127));
+            xi0 = _mm_max_epi16(xi0, _mm_setzero_si128());
+            xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(255));
+            xi1 = _mm_unpackhi_epi16(xi0, _mm_setzero_si128());
+            xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128());
+
+            // Table lookups, two doubles per register.
+            __m128d yd0 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[0]), _mm_load_sd(expTab + tab_idx[1]));
+            __m128d yd1 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[2]), _mm_load_sd(expTab + tab_idx[3]));
+            __m128d yd2 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[4]), _mm_load_sd(expTab + tab_idx[5]));
+            __m128d yd3 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[6]), _mm_load_sd(expTab + tab_idx[7]));
+
+            __m128 yf0 = _mm_movelh_ps(_mm_cvtpd_ps(yd0), _mm_cvtpd_ps(yd1));
+            __m128 yf1 = _mm_movelh_ps(_mm_cvtpd_ps(yd2), _mm_cvtpd_ps(yd3));
+
+            // Shifting the exponent field into bit 23 and reinterpreting the
+            // bits as float yields the power-of-two factor.
+            yf0 = _mm_mul_ps(yf0, _mm_castsi128_ps(_mm_slli_epi32(xi0, 23)));
+            yf1 = _mm_mul_ps(yf1, _mm_castsi128_ps(_mm_slli_epi32(xi1, 23)));
+
+            // Horner evaluation of EXPPOLY on the remainders.
+            __m128 zf0 = _mm_add_ps(xf0, mA1);
+            __m128 zf1 = _mm_add_ps(xf1, mA1);
+
+            zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA2);
+            zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA2);
+
+            zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA3);
+            zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA3);
+
+            zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA4);
+            zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA4);
+
+            zf0 = _mm_mul_ps(zf0, yf0);
+            zf1 = _mm_mul_ps(zf1, yf1);
+
+            if( y_aligned )
+            {
+                _mm_store_ps(y + i, zf0);
+                _mm_store_ps(y + i + 4, zf1);
+            }
+            else
+            {
+                _mm_storeu_ps(y + i, zf0);
+                _mm_storeu_ps(y + i + 4, zf1);
+            }
+        }
+    }
+    else
+#endif
+        // NOTE: when CV_SSE2 is enabled, the `else` above binds to this loop,
+        // so it only runs for n < 8; otherwise it runs unconditionally.
+        // Scalar path unrolled by four.
+        for( ; i <= n - 4; i += 4 )
+        {
+            double x0 = x[i].f * exp_prescale;
+            double x1 = x[i + 1].f * exp_prescale;
+            double x2 = x[i + 2].f * exp_prescale;
+            double x3 = x[i + 3].f * exp_prescale;
+            int val0, val1, val2, val3, t;
+
+            // If the biased exponent field of the input exceeds 127 + 10, the
+            // prescaled value is replaced by +/-exp_max_val according to the
+            // sign of the raw bits.
+            if( ((x[i].i >> 23) & 255) > 127 + 10 )
+                x0 = x[i].i < 0 ? -exp_max_val : exp_max_val;
+
+            if( ((x[i+1].i >> 23) & 255) > 127 + 10 )
+                x1 = x[i+1].i < 0 ? -exp_max_val : exp_max_val;
+
+            if( ((x[i+2].i >> 23) & 255) > 127 + 10 )
+                x2 = x[i+2].i < 0 ? -exp_max_val : exp_max_val;
+
+            if( ((x[i+3].i >> 23) & 255) > 127 + 10 )
+                x3 = x[i+3].i < 0 ? -exp_max_val : exp_max_val;
+
+            val0 = cvRound(x0);
+            val1 = cvRound(x1);
+            val2 = cvRound(x2);
+            val3 = cvRound(x3);
+
+            // Rounding remainders, scaled by exp_postscale.
+            x0 = (x0 - val0)*exp_postscale;
+            x1 = (x1 - val1)*exp_postscale;
+            x2 = (x2 - val2)*exp_postscale;
+            x3 = (x3 - val3)*exp_postscale;
+
+            // Exponent field: (val >> EXPTAB_SCALE) + 127, clamped to [0, 255]
+            // and placed at bit 23 of the float bit pattern.
+            t = (val0 >> EXPTAB_SCALE) + 127;
+            t = !(t & ~255) ? t : t < 0 ? 0 : 255;
+            buf[0].i = t << 23;
+
+            t = (val1 >> EXPTAB_SCALE) + 127;
+            t = !(t & ~255) ? t : t < 0 ? 0 : 255;
+            buf[1].i = t << 23;
+
+            t = (val2 >> EXPTAB_SCALE) + 127;
+            t = !(t & ~255) ? t : t < 0 ? 0 : 255;
+            buf[2].i = t << 23;
+
+            t = (val3 >> EXPTAB_SCALE) + 127;
+            t = !(t & ~255) ? t : t < 0 ? 0 : 255;
+            buf[3].i = t << 23;
+
+            // power-of-two factor * table entry * polynomial of the remainder
+            x0 = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 );
+            x1 = buf[1].f * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 );
+
+            y[i] = (float)x0;
+            y[i + 1] = (float)x1;
+
+            x2 = buf[2].f * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 );
+            x3 = buf[3].f * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 );
+
+            y[i + 2] = (float)x2;
+            y[i + 3] = (float)x3;
+        }
+
+    // One-at-a-time path for the elements the loops above left over; same
+    // steps as the unrolled scalar loop.
+    for( ; i < n; i++ )
+    {
+        double x0 = x[i].f * exp_prescale;
+        int val0, t;
+
+        if( ((x[i].i >> 23) & 255) > 127 + 10 )
+            x0 = x[i].i < 0 ? -exp_max_val : exp_max_val;
+
+        val0 = cvRound(x0);
+        t = (val0 >> EXPTAB_SCALE) + 127;
+        t = !(t & ~255) ? t : t < 0 ? 0 : 255;
+
+        buf[0].i = t << 23;
+        x0 = (x0 - val0)*exp_postscale;
+
+        y[i] = (float)(buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY(x0));
+    }
+}
+
+void exp( const double *_x, double *y, int n )
+{
+    static const double
+    A5 = .99999999999999999998285227504999 / EXPPOLY_32F_A0,
+    A4 = .69314718055994546743029643825322 / EXPPOLY_32F_A0,
+    A3 = .24022650695886477918181338054308 / EXPPOLY_32F_A0,
+    A2 = .55504108793649567998466049042729e-1 / EXPPOLY_32F_A0,
+    A1 = .96180973140732918010002372686186e-2 / EXPPOLY_32F_A0,
+    A0 = .13369713757180123244806654839424e-2 / EXPPOLY_32F_A0;
+
+#undef EXPPOLY
+#define EXPPOLY(x)  (((((A0*(x) + A1)*(x) + A2)*(x) + A3)*(x) + A4)*(x) + A5)
+
+    int i = 0;
+    Cv64suf buf[4];
+    const Cv64suf* x = (const Cv64suf*)_x;
+
+#if CV_SSE2
+    static const __m128d prescale2 = _mm_set1_pd(exp_prescale);
+    static const __m128d postscale2 = _mm_set1_pd(exp_postscale);
+    static const __m128d maxval2 = _mm_set1_pd(exp_max_val);
+    static const __m128d minval2 = _mm_set1_pd(-exp_max_val);
+
+    static const __m128d mA0 = _mm_set1_pd(A0);
+    static const __m128d mA1 = _mm_set1_pd(A1);
+    static const __m128d mA2 = _mm_set1_pd(A2);
+    static const __m128d mA3 = _mm_set1_pd(A3);
+    static const __m128d mA4 = _mm_set1_pd(A4);
+    static const __m128d mA5 = _mm_set1_pd(A5);
+
+    int CV_DECL_ALIGNED(16) tab_idx[4];
+
+    for( ; i <= n - 4; i += 4 )
+    {
+        __m128d xf0 = _mm_loadu_pd(&x[i].f), xf1 = _mm_loadu_pd(&x[i+2].f);
+        __m128i xi0, xi1;
+        xf0 = _mm_min_pd(_mm_max_pd(xf0, minval2), maxval2);
+        xf1 = _mm_min_pd(_mm_max_pd(xf1, minval2), maxval2);
+        xf0 = _mm_mul_pd(xf0, prescale2);
+        xf1 = _mm_mul_pd(xf1, prescale2);
+
+        xi0 = _mm_cvtpd_epi32(xf0);
+        xi1 = _mm_cvtpd_epi32(xf1);
+        xf0 = _mm_mul_pd(_mm_sub_pd(xf0, _mm_cvtepi32_pd(xi0)), postscale2);
+        xf1 = _mm_mul_pd(_mm_sub_pd(xf1, _mm_cvtepi32_pd(xi1)), postscale2);
+
+        xi0 = _mm_unpacklo_epi64(xi0, xi1);
+        _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi32(EXPTAB_MASK)));
+
+        xi0 = _mm_add_epi32(_mm_srai_epi32(xi0, EXPTAB_SCALE), _mm_set1_epi32(1023));
+        xi0 = _mm_packs_epi32(xi0, xi0);
+        xi0 = _mm_max_epi16(xi0, _mm_setzero_si128());
+        xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(2047));
+        xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128());
+        xi1 = _mm_unpackhi_epi32(xi0, _mm_setzero_si128());
+        xi0 = _mm_unpacklo_epi32(xi0, _mm_setzero_si128());
+
+        __m128d yf0 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[0]), _mm_load_sd(expTab + tab_idx[1]));
+        __m128d yf1 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[2]), _mm_load_sd(expTab + tab_idx[3]));
+        yf0 = _mm_mul_pd(yf0, _mm_castsi128_pd(_mm_slli_epi64(xi0, 52)));
+        yf1 = _mm_mul_pd(yf1, _mm_castsi128_pd(_mm_slli_epi64(xi1, 52)));
+
+        __m128d zf0 = _mm_add_pd(_mm_mul_pd(mA0, xf0), mA1);
+        __m128d zf1 = _mm_add_pd(_mm_mul_pd(mA0, xf1), mA1);
+
+        zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA2);
+        zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA2);
+
+        zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA3);
+        zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA3);
+
+        zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA4);
+        zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA4);
+
+        zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA5);
+        zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA5);
+
+        zf0 = _mm_mul_pd(zf0, yf0);
+        zf1 = _mm_mul_pd(zf1, yf1);
+
+        _mm_storeu_pd(y + i, zf0);
+        _mm_storeu_pd(y + i + 2, zf1);
+    }
+#endif
+    for( ; i <= n - 4; i += 4 )
+    {
+        double x0 = x[i].f * exp_prescale;
+        double x1 = x[i + 1].f * exp_prescale;
+        double x2 = x[i + 2].f * exp_prescale;
+        double x3 = x[i + 3].f * exp_prescale;
+
+        double y0, y1, y2, y3;
+        int val0, val1, val2, val3, t;
+
+        t = (int)(x[i].i >> 52);
+        if( (t & 2047) > 1023 + 10 )
+            x0 = t < 0 ? -exp_max_val : exp_max_val;
+
+        t = (int)(x[i+1].i >> 52);
+        if( (t & 2047) > 1023 + 10 )
+            x1 = t < 0 ? -exp_max_val : exp_max_val;
+
+        t = (int)(x[i+2].i >> 52);
+        if( (t & 2047) > 1023 + 10 )
+            x2 = t < 0 ? -exp_max_val : exp_max_val;
+
+        t = (int)(x[i+3].i >> 52);
+        if( (t & 2047) > 1023 + 10 )
+            x3 = t < 0 ? -exp_max_val : exp_max_val;
+
+        val0 = cvRound(x0);
+        val1 = cvRound(x1);
+        val2 = cvRound(x2);
+        val3 = cvRound(x3);
+
+        x0 = (x0 - val0)*exp_postscale;
+        x1 = (x1 - val1)*exp_postscale;
+        x2 = (x2 - val2)*exp_postscale;
+        x3 = (x3 - val3)*exp_postscale;
+
+        t = (val0 >> EXPTAB_SCALE) + 1023;
+        t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
+        buf[0].i = (int64)t << 52;
+
+        t = (val1 >> EXPTAB_SCALE) + 1023;
+        t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
+        buf[1].i = (int64)t << 52;
+
+        t = (val2 >> EXPTAB_SCALE) + 1023;
+        t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
+        buf[2].i = (int64)t << 52;
+
+        t = (val3 >> EXPTAB_SCALE) + 1023;
+        t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
+        buf[3].i = (int64)t << 52;
+
+        y0 = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 );
+        y1 = buf[1].f * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 );
+
+        y[i] = y0;
+        y[i + 1] = y1;
+
+        y2 = buf[2].f * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 );
+        y3 = buf[3].f * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 );
+
+        y[i + 2] = y2;
+        y[i + 3] = y3;
+    }
+
+    for( ; i < n; i++ )
+    {
+        double x0 = x[i].f * exp_prescale;
+        int val0, t;
+
+        t = (int)(x[i].i >> 52);
+        if( (t & 2047) > 1023 + 10 )
+            x0 = t < 0 ? -exp_max_val : exp_max_val;
+
+        val0 = cvRound(x0);
+        t = (val0 >> EXPTAB_SCALE) + 1023;
+        t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
+
+        buf[0].i = (int64)t << 52;
+        x0 = (x0 - val0)*exp_postscale;
+
+        y[i] = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 );
+    }
+}
+
+#undef EXPTAB_SCALE   // retire the exp() lookup-table constants used by the code above
+#undef EXPTAB_MASK
+#undef EXPPOLY_32F_A0 // NOTE(review): its definition is outside this view; presumably an exp() polynomial coefficient - confirm
+
+/////////////////////////////////////////// LOG ///////////////////////////////////////
+
+#define LOGTAB_SCALE    8   // number of leading mantissa bits used to index icvLogTab
+#define LOGTAB_MASK         ((1 << LOGTAB_SCALE) - 1)   // 0xff; doubled (*2) at the use sites because the table stores pairs
+#define LOGTAB_MASK2        ((1 << (20 - LOGTAB_SCALE)) - 1)   // low 12 bits; applied to the high 32-bit word of each double in log(double)
+#define LOGTAB_MASK2_32F    ((1 << (23 - LOGTAB_SCALE)) - 1)   // low 15 bits of a float word, i.e. the mantissa bits left after the table index
+
+static const double CV_DECL_ALIGNED(16) icvLogTab[] = {   // flattened pairs { ln(1 + k/256), 1/(1 + k/256) }, k = 0..254; the final row (k = 255) is special
+    0.0000000000000000000000000000000000000000,    1.000000000000000000000000000000000000000,
+    .00389864041565732288852075271279318258166,    .9961089494163424124513618677042801556420,
+    .00778214044205494809292034119607706088573,    .9922480620155038759689922480620155038760,
+    .01165061721997527263705585198749759001657,    .9884169884169884169884169884169884169884,
+    .01550418653596525274396267235488267033361,    .9846153846153846153846153846153846153846,
+    .01934296284313093139406447562578250654042,    .9808429118773946360153256704980842911877,
+    .02316705928153437593630670221500622574241,    .9770992366412213740458015267175572519084,
+    .02697658769820207233514075539915211265906,    .9733840304182509505703422053231939163498,
+    .03077165866675368732785500469617545604706,    .9696969696969696969696969696969696969697,
+    .03455238150665972812758397481047722976656,    .9660377358490566037735849056603773584906,
+    .03831886430213659461285757856785494368522,    .9624060150375939849624060150375939849624,
+    .04207121392068705056921373852674150839447,    .9588014981273408239700374531835205992509,
+    .04580953603129420126371940114040626212953,    .9552238805970149253731343283582089552239,
+    .04953393512227662748292900118940451648088,    .9516728624535315985130111524163568773234,
+    .05324451451881227759255210685296333394944,    .9481481481481481481481481481481481481481,
+    .05694137640013842427411105973078520037234,    .9446494464944649446494464944649446494465,
+    .06062462181643483993820353816772694699466,    .9411764705882352941176470588235294117647,
+    .06429435070539725460836422143984236754475,    .9377289377289377289377289377289377289377,
+    .06795066190850773679699159401934593915938,    .9343065693430656934306569343065693430657,
+    .07159365318700880442825962290953611955044,    .9309090909090909090909090909090909090909,
+    .07522342123758751775142172846244648098944,    .9275362318840579710144927536231884057971,
+    .07884006170777602129362549021607264876369,    .9241877256317689530685920577617328519856,
+    .08244366921107458556772229485432035289706,    .9208633093525179856115107913669064748201,
+    .08603433734180314373940490213499288074675,    .9175627240143369175627240143369175627240,
+    .08961215868968712416897659522874164395031,    .9142857142857142857142857142857142857143,
+    .09317722485418328259854092721070628613231,    .9110320284697508896797153024911032028470,
+    .09672962645855109897752299730200320482256,    .9078014184397163120567375886524822695035,
+    .10026945316367513738597949668474029749630,    .9045936395759717314487632508833922261484,
+    .10379679368164355934833764649738441221420,    .9014084507042253521126760563380281690141,
+    .10731173578908805021914218968959175981580,    .8982456140350877192982456140350877192982,
+    .11081436634029011301105782649756292812530,    .8951048951048951048951048951048951048951,
+    .11430477128005862852422325204315711744130,    .8919860627177700348432055749128919860627,
+    .11778303565638344185817487641543266363440,    .8888888888888888888888888888888888888889,
+    .12124924363286967987640707633545389398930,    .8858131487889273356401384083044982698962,
+    .12470347850095722663787967121606925502420,    .8827586206896551724137931034482758620690,
+    .12814582269193003360996385708858724683530,    .8797250859106529209621993127147766323024,
+    .13157635778871926146571524895989568904040,    .8767123287671232876712328767123287671233,
+    .13499516453750481925766280255629681050780,    .8737201365187713310580204778156996587031,
+    .13840232285911913123754857224412262439730,    .8707482993197278911564625850340136054422,
+    .14179791186025733629172407290752744302150,    .8677966101694915254237288135593220338983,
+    .14518200984449788903951628071808954700830,    .8648648648648648648648648648648648648649,
+    .14855469432313711530824207329715136438610,    .8619528619528619528619528619528619528620,
+    .15191604202584196858794030049466527998450,    .8590604026845637583892617449664429530201,
+    .15526612891112392955683674244937719777230,    .8561872909698996655518394648829431438127,
+    .15860503017663857283636730244325008243330,    .8533333333333333333333333333333333333333,
+    .16193282026931324346641360989451641216880,    .8504983388704318936877076411960132890365,
+    .16524957289530714521497145597095368430010,    .8476821192052980132450331125827814569536,
+    .16855536102980664403538924034364754334090,    .8448844884488448844884488448844884488449,
+    .17185025692665920060697715143760433420540,    .8421052631578947368421052631578947368421,
+    .17513433212784912385018287750426679849630,    .8393442622950819672131147540983606557377,
+    .17840765747281828179637841458315961062910,    .8366013071895424836601307189542483660131,
+    .18167030310763465639212199675966985523700,    .8338762214983713355048859934853420195440,
+    .18492233849401198964024217730184318497780,    .8311688311688311688311688311688311688312,
+    .18816383241818296356839823602058459073300,    .8284789644012944983818770226537216828479,
+    .19139485299962943898322009772527962923050,    .8258064516129032258064516129032258064516,
+    .19461546769967164038916962454095482826240,    .8231511254019292604501607717041800643087,
+    .19782574332991986754137769821682013571260,    .8205128205128205128205128205128205128205,
+    .20102574606059073203390141770796617493040,    .8178913738019169329073482428115015974441,
+    .20421554142869088876999228432396193966280,    .8152866242038216560509554140127388535032,
+    .20739519434607056602715147164417430758480,    .8126984126984126984126984126984126984127,
+    .21056476910734961416338251183333341032260,    .8101265822784810126582278481012658227848,
+    .21372432939771812687723695489694364368910,    .8075709779179810725552050473186119873817,
+    .21687393830061435506806333251006435602900,    .8050314465408805031446540880503144654088,
+    .22001365830528207823135744547471404075630,    .8025078369905956112852664576802507836991,
+    .22314355131420973710199007200571941211830,    .8000000000000000000000000000000000000000,
+    .22626367865045338145790765338460914790630,    .7975077881619937694704049844236760124611,
+    .22937410106484582006380890106811420992010,    .7950310559006211180124223602484472049689,
+    .23247487874309405442296849741978803649550,    .7925696594427244582043343653250773993808,
+    .23556607131276688371634975283086532726890,    .7901234567901234567901234567901234567901,
+    .23864773785017498464178231643018079921600,    .7876923076923076923076923076923076923077,
+    .24171993688714515924331749374687206000090,    .7852760736196319018404907975460122699387,
+    .24478272641769091566565919038112042471760,    .7828746177370030581039755351681957186544,
+    .24783616390458124145723672882013488560910,    .7804878048780487804878048780487804878049,
+    .25088030628580937353433455427875742316250,    .7781155015197568389057750759878419452888,
+    .25391520998096339667426946107298135757450,    .7757575757575757575757575757575757575758,
+    .25694093089750041913887912414793390780680,    .7734138972809667673716012084592145015106,
+    .25995752443692604627401010475296061486000,    .7710843373493975903614457831325301204819,
+    .26296504550088134477547896494797896593800,    .7687687687687687687687687687687687687688,
+    .26596354849713793599974565040611196309330,    .7664670658682634730538922155688622754491,
+    .26895308734550393836570947314612567424780,    .7641791044776119402985074626865671641791,
+    .27193371548364175804834985683555714786050,    .7619047619047619047619047619047619047619,
+    .27490548587279922676529508862586226314300,    .7596439169139465875370919881305637982196,
+    .27786845100345625159121709657483734190480,    .7573964497041420118343195266272189349112,
+    .28082266290088775395616949026589281857030,    .7551622418879056047197640117994100294985,
+    .28376817313064456316240580235898960381750,    .7529411764705882352941176470588235294118,
+    .28670503280395426282112225635501090437180,    .7507331378299120234604105571847507331378,
+    .28963329258304265634293983566749375313530,    .7485380116959064327485380116959064327485,
+    .29255300268637740579436012922087684273730,    .7463556851311953352769679300291545189504,
+    .29546421289383584252163927885703742504130,    .7441860465116279069767441860465116279070,
+    .29836697255179722709783618483925238251680,    .7420289855072463768115942028985507246377,
+    .30126133057816173455023545102449133992200,    .7398843930635838150289017341040462427746,
+    .30414733546729666446850615102448500692850,    .7377521613832853025936599423631123919308,
+    .30702503529491181888388950937951449304830,    .7356321839080459770114942528735632183908,
+    .30989447772286465854207904158101882785550,    .7335243553008595988538681948424068767908,
+    .31275571000389684739317885942000430077330,    .7314285714285714285714285714285714285714,
+    .31560877898630329552176476681779604405180,    .7293447293447293447293447293447293447293,
+    .31845373111853458869546784626436419785030,    .7272727272727272727272727272727272727273,
+    .32129061245373424782201254856772720813750,    .7252124645892351274787535410764872521246,
+    .32411946865421192853773391107097268104550,    .7231638418079096045197740112994350282486,
+    .32694034499585328257253991068864706903700,    .7211267605633802816901408450704225352113,
+    .32975328637246797969240219572384376078850,    .7191011235955056179775280898876404494382,
+    .33255833730007655635318997155991382896900,    .7170868347338935574229691876750700280112,
+    .33535554192113781191153520921943709254280,    .7150837988826815642458100558659217877095,
+    .33814494400871636381467055798566434532400,    .7130919220055710306406685236768802228412,
+    .34092658697059319283795275623560883104800,    .7111111111111111111111111111111111111111,
+    .34370051385331840121395430287520866841080,    .7091412742382271468144044321329639889197,
+    .34646676734620857063262633346312213689100,    .7071823204419889502762430939226519337017,
+    .34922538978528827602332285096053965389730,    .7052341597796143250688705234159779614325,
+    .35197642315717814209818925519357435405250,    .7032967032967032967032967032967032967033,
+    .35471990910292899856770532096561510115850,    .7013698630136986301369863013698630136986,
+    .35745588892180374385176833129662554711100,    .6994535519125683060109289617486338797814,
+    .36018440357500774995358483465679455548530,    .6975476839237057220708446866485013623978,
+    .36290549368936841911903457003063522279280,    .6956521739130434782608695652173913043478,
+    .36561919956096466943762379742111079394830,    .6937669376693766937669376693766937669377,
+    .36832556115870762614150635272380895912650,    .6918918918918918918918918918918918918919,
+    .37102461812787262962487488948681857436900,    .6900269541778975741239892183288409703504,
+    .37371640979358405898480555151763837784530,    .6881720430107526881720430107526881720430,
+    .37640097516425302659470730759494472295050,    .6863270777479892761394101876675603217158,
+    .37907835293496944251145919224654790014030,    .6844919786096256684491978609625668449198,
+    .38174858149084833769393299007788300514230,    .6826666666666666666666666666666666666667,
+    .38441169891033200034513583887019194662580,    .6808510638297872340425531914893617021277,
+    .38706774296844825844488013899535872042180,    .6790450928381962864721485411140583554377,
+    .38971675114002518602873692543653305619950,    .6772486772486772486772486772486772486772,
+    .39235876060286384303665840889152605086580,    .6754617414248021108179419525065963060686,
+    .39499380824086893770896722344332374632350,    .6736842105263157894736842105263157894737,
+    .39762193064713846624158577469643205404280,    .6719160104986876640419947506561679790026,
+    .40024316412701266276741307592601515352730,    .6701570680628272251308900523560209424084,
+    .40285754470108348090917615991202183067800,    .6684073107049608355091383812010443864230,
+    .40546510810816432934799991016916465014230,    .6666666666666666666666666666666666666667,
+    .40806588980822172674223224930756259709600,    .6649350649350649350649350649350649350649,
+    .41065992498526837639616360320360399782650,    .6632124352331606217616580310880829015544,
+    .41324724855021932601317757871584035456180,    .6614987080103359173126614987080103359173,
+    .41582789514371093497757669865677598863850,    .6597938144329896907216494845360824742268,
+    .41840189913888381489925905043492093682300,    .6580976863753213367609254498714652956298,
+    .42096929464412963239894338585145305842150,    .6564102564102564102564102564102564102564,
+    .42353011550580327293502591601281892508280,    .6547314578005115089514066496163682864450,
+    .42608439531090003260516141381231136620050,    .6530612244897959183673469387755102040816,
+    .42863216738969872610098832410585600882780,    .6513994910941475826972010178117048346056,
+    .43117346481837132143866142541810404509300,    .6497461928934010152284263959390862944162,
+    .43370832042155937902094819946796633303180,    .6481012658227848101265822784810126582278,
+    .43623676677491801667585491486534010618930,    .6464646464646464646464646464646464646465,
+    .43875883620762790027214350629947148263450,    .6448362720403022670025188916876574307305,
+    .44127456080487520440058801796112675219780,    .6432160804020100502512562814070351758794,
+    .44378397241030093089975139264424797147500,    .6416040100250626566416040100250626566416,
+    .44628710262841947420398014401143882423650,    .6400000000000000000000000000000000000000,
+    .44878398282700665555822183705458883196130,    .6384039900249376558603491271820448877805,
+    .45127464413945855836729492693848442286250,    .6368159203980099502487562189054726368159,
+    .45375911746712049854579618113348260521900,    .6352357320099255583126550868486352357320,
+    .45623743348158757315857769754074979573500,    .6336633663366336633663366336633663366337,
+    .45870962262697662081833982483658473938700,    .6320987654320987654320987654320987654321,
+    .46117571512217014895185229761409573256980,    .6305418719211822660098522167487684729064,
+    .46363574096303250549055974261136725544930,    .6289926289926289926289926289926289926290,
+    .46608972992459918316399125615134835243230,    .6274509803921568627450980392156862745098,
+    .46853771156323925639597405279346276074650,    .6259168704156479217603911980440097799511,
+    .47097971521879100631480241645476780831830,    .6243902439024390243902439024390243902439,
+    .47341577001667212165614273544633761048330,    .6228710462287104622871046228710462287105,
+    .47584590486996386493601107758877333253630,    .6213592233009708737864077669902912621359,
+    .47827014848147025860569669930555392056700,    .6198547215496368038740920096852300242131,
+    .48068852934575190261057286988943815231330,    .6183574879227053140096618357487922705314,
+    .48310107575113581113157579238759353756900,    .6168674698795180722891566265060240963855,
+    .48550781578170076890899053978500887751580,    .6153846153846153846153846153846153846154,
+    .48790877731923892879351001283794175833480,    .6139088729016786570743405275779376498801,
+    .49030398804519381705802061333088204264650,    .6124401913875598086124401913875598086124,
+    .49269347544257524607047571407747454941280,    .6109785202863961813842482100238663484487,
+    .49507726679785146739476431321236304938800,    .6095238095238095238095238095238095238095,
+    .49745538920281889838648226032091770321130,    .6080760095011876484560570071258907363420,
+    .49982786955644931126130359189119189977650,    .6066350710900473933649289099526066350711,
+    .50219473456671548383667413872899487614650,    .6052009456264775413711583924349881796690,
+    .50455601075239520092452494282042607665050,    .6037735849056603773584905660377358490566,
+    .50691172444485432801997148999362252652650,    .6023529411764705882352941176470588235294,
+    .50926190178980790257412536448100581765150,    .6009389671361502347417840375586854460094,
+    .51160656874906207391973111953120678663250,    .5995316159250585480093676814988290398126,
+    .51394575110223428282552049495279788970950,    .5981308411214953271028037383177570093458,
+    .51627947444845445623684554448118433356300,    .5967365967365967365967365967365967365967,
+    .51860776420804555186805373523384332656850,    .5953488372093023255813953488372093023256,
+    .52093064562418522900344441950437612831600,    .5939675174013921113689095127610208816705,
+    .52324814376454775732838697877014055848100,    .5925925925925925925925925925925925925926,
+    .52556028352292727401362526507000438869000,    .5912240184757505773672055427251732101617,
+    .52786708962084227803046587723656557500350,    .5898617511520737327188940092165898617512,
+    .53016858660912158374145519701414741575700,    .5885057471264367816091954022988505747126,
+    .53246479886947173376654518506256863474850,    .5871559633027522935779816513761467889908,
+    .53475575061602764748158733709715306758900,    .5858123569794050343249427917620137299771,
+    .53704146589688361856929077475797384977350,    .5844748858447488584474885844748858447489,
+    .53932196859560876944783558428753167390800,    .5831435079726651480637813211845102505695,
+    .54159728243274429804188230264117009937750,    .5818181818181818181818181818181818181818,
+    .54386743096728351609669971367111429572100,    .5804988662131519274376417233560090702948,
+    .54613243759813556721383065450936555862450,    .5791855203619909502262443438914027149321,
+    .54839232556557315767520321969641372561450,    .5778781038374717832957110609480812641084,
+    .55064711795266219063194057525834068655950,    .5765765765765765765765765765765765765766,
+    .55289683768667763352766542084282264113450,    .5752808988764044943820224719101123595506,
+    .55514150754050151093110798683483153581600,    .5739910313901345291479820627802690582960,
+    .55738115013400635344709144192165695130850,    .5727069351230425055928411633109619686801,
+    .55961578793542265941596269840374588966350,    .5714285714285714285714285714285714285714,
+    .56184544326269181269140062795486301183700,    .5701559020044543429844097995545657015590,
+    .56407013828480290218436721261241473257550,    .5688888888888888888888888888888888888889,
+    .56628989502311577464155334382667206227800,    .5676274944567627494456762749445676274945,
+    .56850473535266865532378233183408156037350,    .5663716814159292035398230088495575221239,
+    .57071468100347144680739575051120482385150,    .5651214128035320088300220750551876379691,
+    .57291975356178548306473885531886480748650,    .5638766519823788546255506607929515418502,
+    .57511997447138785144460371157038025558000,    .5626373626373626373626373626373626373626,
+    .57731536503482350219940144597785547375700,    .5614035087719298245614035087719298245614,
+    .57950594641464214795689713355386629700650,    .5601750547045951859956236323851203501094,
+    .58169173963462239562716149521293118596100,    .5589519650655021834061135371179039301310,
+    .58387276558098266665552955601015128195300,    .5577342047930283224400871459694989106754,
+    .58604904500357812846544902640744112432000,    .5565217391304347826086956521739130434783,
+    .58822059851708596855957011939608491957200,    .5553145336225596529284164859002169197397,
+    .59038744660217634674381770309992134571100,    .5541125541125541125541125541125541125541,
+    .59254960960667157898740242671919986605650,    .5529157667386609071274298056155507559395,
+    .59470710774669277576265358220553025603300,    .5517241379310344827586206896551724137931,
+    .59685996110779382384237123915227130055450,    .5505376344086021505376344086021505376344,
+    .59900818964608337768851242799428291618800,    .5493562231759656652360515021459227467811,
+    .60115181318933474940990890900138765573500,    .5481798715203426124197002141327623126338,
+    .60329085143808425240052883964381180703650,    .5470085470085470085470085470085470085470,
+    .60542532396671688843525771517306566238400,    .5458422174840085287846481876332622601279,
+    .60755525022454170969155029524699784815300,    .5446808510638297872340425531914893617021,
+    .60968064953685519036241657886421307921400,    .5435244161358811040339702760084925690021,
+    .61180154110599282990534675263916142284850,    .5423728813559322033898305084745762711864,
+    .61391794401237043121710712512140162289150,    .5412262156448202959830866807610993657505,
+    .61602987721551394351138242200249806046500,    .5400843881856540084388185654008438818565,
+    .61813735955507864705538167982012964785100,    .5389473684210526315789473684210526315789,
+    .62024040975185745772080281312810257077200,    .5378151260504201680672268907563025210084,
+    .62233904640877868441606324267922900617100,    .5366876310272536687631027253668763102725,
+    .62443328801189346144440150965237990021700,    .5355648535564853556485355648535564853556,
+    .62652315293135274476554741340805776417250,    .5344467640918580375782881002087682672234,
+    .62860865942237409420556559780379757285100,    .5333333333333333333333333333333333333333,
+    .63068982562619868570408243613201193511500,    .5322245322245322245322245322245322245322,
+    .63276666957103777644277897707070223987100,    .5311203319502074688796680497925311203320,
+    .63483920917301017716738442686619237065300,    .5300207039337474120082815734989648033126,
+    .63690746223706917739093569252872839570050,    .5289256198347107438016528925619834710744,
+    .63897144645792069983514238629140891134750,    .5278350515463917525773195876288659793814,
+    .64103117942093124081992527862894348800200,    .5267489711934156378600823045267489711934,
+    .64308667860302726193566513757104985415950,    .5256673511293634496919917864476386036961,
+    .64513796137358470073053240412264131009600,    .5245901639344262295081967213114754098361,
+    .64718504499530948859131740391603671014300,    .5235173824130879345603271983640081799591,
+    .64922794662510974195157587018911726772800,    .5224489795918367346938775510204081632653,
+    .65126668331495807251485530287027359008800,    .5213849287169042769857433808553971486762,
+    .65330127201274557080523663898929953575150,    .5203252032520325203252032520325203252033,
+    .65533172956312757406749369692988693714150,    .5192697768762677484787018255578093306288,
+    .65735807270835999727154330685152672231200,    .5182186234817813765182186234817813765182,
+    .65938031808912778153342060249997302889800,    .5171717171717171717171717171717171717172,
+    .66139848224536490484126716182800009846700,    .5161290322580645161290322580645161290323,
+    .66341258161706617713093692145776003599150,    .5150905432595573440643863179074446680080,
+    .66542263254509037562201001492212526500250,    .5140562248995983935742971887550200803213,
+    .66742865127195616370414654738851822912700,    .5130260521042084168336673346693386773547,
+    .66943065394262923906154583164607174694550,    .5120000000000000000000000000000000000000,
+    .67142865660530226534774556057527661323550,    .5109780439121756487025948103792415169661,
+    .67342267521216669923234121597488410770900,    .5099601593625498007968127490039840637450,
+    .67541272562017662384192817626171745359900,    .5089463220675944333996023856858846918489,
+    .67739882359180603188519853574689477682100,    .5079365079365079365079365079365079365079,
+    .67938098479579733801614338517538271844400,    .5069306930693069306930693069306930693069,
+    .68135922480790300781450241629499942064300,    .5059288537549407114624505928853754940711,
+    .68333355911162063645036823800182901322850,    .5049309664694280078895463510848126232742,
+    .68530400309891936760919861626462079584600,    .5039370078740157480314960629921259842520,
+    .68727057207096020619019327568821609020250,    .5029469548133595284872298624754420432220,
+    .68923328123880889251040571252815425395950,    .5019607843137254901960784313725490196078,
+    .69314718055994530941723212145818, 5.0e-01,   // k = 255 is stored as { ln 2, 1/2 }; log() offsets its residual by -1/512 when the table index is 510
+};
+
+
+
+#define LOGTAB_TRANSLATE(x,h) (((x) - 1.)*icvLogTab[(h)+1])   // (x - 1) scaled by the second element of the table pair starting at offset h
+static const double ln_2 = 0.69314718055994530941723212145818;   // natural logarithm of 2; multiplies the unbiased exponent
+
+void log( const float *_x, float *y, int n )   // writes y[i] = natural log of _x[i] for i in [0, n)
+{   // Each result is assembled as e*ln_2 + icvLogTab[h] + LOGPOLY(r): exponent term, table term, small residual r.
+    static const float shift[] = { 0, -1.f/512 };   // residual correction, non-zero only when the table index is 510
+    static const float
+    A0 = 0.3333333333333333333333333f,
+    A1 = -0.5f,
+    A2 = 1.f;
+    // LOGPOLY(r) = r - r^2/2 + r^3/3 in Horner form, i.e. the first three Taylor terms of ln(1 + r).
+#undef LOGPOLY
+#define LOGPOLY(x) (((A0*(x) + A1)*(x) + A2)*(x))
+    // The input is read as raw 32-bit words; only the exponent and mantissa fields are used and there are no special-case branches.
+    int i = 0;
+    Cv32suf buf[4];
+    const int* x = (const int*)_x;
+
+#if CV_SSE2
+    static const __m128d ln2_2 = _mm_set1_pd(ln_2);
+    static const __m128 _1_4 = _mm_set1_ps(1.f);
+    static const __m128 shift4 = _mm_set1_ps(-1.f/512);
+
+    static const __m128 mA0 = _mm_set1_ps(A0);
+    static const __m128 mA1 = _mm_set1_ps(A1);
+    static const __m128 mA2 = _mm_set1_ps(A2);
+
+    int CV_DECL_ALIGNED(16) idx[4];   // table offsets stored from the vector register so they can index icvLogTab
+    // SSE2 path: four floats per iteration; the scalar loops below pick up whatever remains.
+    for( ; i <= n - 4; i += 4 )
+    {
+        __m128i h0 = _mm_loadu_si128((const __m128i*)(x + i));
+        __m128i yi0 = _mm_sub_epi32(_mm_and_si128(_mm_srli_epi32(h0, 23), _mm_set1_epi32(255)), _mm_set1_epi32(127));   // unbiased exponent per lane
+        __m128d yd0 = _mm_mul_pd(_mm_cvtepi32_pd(yi0), ln2_2);   // e * ln 2 in double, lanes 0-1
+        __m128d yd1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_unpackhi_epi64(yi0,yi0)), ln2_2);   // same for lanes 2-3
+        // xi0: keep the low mantissa bits and force the exponent field to 127, giving a float in [1, 1 + 1/256)
+        __m128i xi0 = _mm_or_si128(_mm_and_si128(h0, _mm_set1_epi32(LOGTAB_MASK2_32F)), _mm_set1_epi32(127 << 23));
+        // h0 becomes 2 * (top LOGTAB_SCALE mantissa bits): an even offset into the pair table icvLogTab
+        h0 = _mm_and_si128(_mm_srli_epi32(h0, 23 - LOGTAB_SCALE - 1), _mm_set1_epi32(LOGTAB_MASK*2));
+        _mm_store_si128((__m128i*)idx, h0);
+        h0 = _mm_cmpeq_epi32(h0, _mm_set1_epi32(510));   // all-ones in lanes that hit the last table entry
+        // Load the four table pairs, then unpack: t0/t2 collect the first elements (log terms), t1/t3 the second (reciprocals)
+        __m128d t0, t1, t2, t3, t4;
+        t0 = _mm_load_pd(icvLogTab + idx[0]);
+        t2 = _mm_load_pd(icvLogTab + idx[1]);
+        t1 = _mm_unpackhi_pd(t0, t2);
+        t0 = _mm_unpacklo_pd(t0, t2);
+        t2 = _mm_load_pd(icvLogTab + idx[2]);
+        t4 = _mm_load_pd(icvLogTab + idx[3]);
+        t3 = _mm_unpackhi_pd(t2, t4);
+        t2 = _mm_unpacklo_pd(t2, t4);
+
+        yd0 = _mm_add_pd(yd0, t0);
+        yd1 = _mm_add_pd(yd1, t2);
+
+        __m128 yf0 = _mm_movelh_ps(_mm_cvtpd_ps(yd0), _mm_cvtpd_ps(yd1));   // narrow the four double sums to floats
+        // residual r = (mantissa - 1) * reciprocal, plus the -1/512 shift in the lanes flagged by h0
+        __m128 xf0 = _mm_sub_ps(_mm_castsi128_ps(xi0), _1_4);
+        xf0 = _mm_mul_ps(xf0, _mm_movelh_ps(_mm_cvtpd_ps(t1), _mm_cvtpd_ps(t3)));
+        xf0 = _mm_add_ps(xf0, _mm_and_ps(_mm_castsi128_ps(h0), shift4));
+        // the same cubic as LOGPOLY, evaluated in single precision
+        __m128 zf0 = _mm_mul_ps(xf0, mA0);
+        zf0 = _mm_mul_ps(_mm_add_ps(zf0, mA1), xf0);
+        zf0 = _mm_mul_ps(_mm_add_ps(zf0, mA2), xf0);
+        yf0 = _mm_add_ps(yf0, zf0);
+
+        _mm_storeu_ps(y + i, yf0);
+    }
+#endif
+    for( ; i <= n - 4; i += 4 )   // scalar form of the same computation, unrolled by four; residuals and sums are doubles here
+    {
+        double x0, x1, x2, x3;
+        double y0, y1, y2, y3;
+        int h0, h1, h2, h3;
+        // elements i and i+1: split each word into mantissa remainder (buf), exponent term (y) and table offset (h)
+        h0 = x[i];
+        h1 = x[i+1];
+        buf[0].i = (h0 & LOGTAB_MASK2_32F) | (127 << 23);
+        buf[1].i = (h1 & LOGTAB_MASK2_32F) | (127 << 23);
+
+        y0 = (((h0 >> 23) & 0xff) - 127) * ln_2;
+        y1 = (((h1 >> 23) & 0xff) - 127) * ln_2;
+        // reduce h to the even table offset 2 * (top LOGTAB_SCALE mantissa bits)
+        h0 = (h0 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
+        h1 = (h1 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
+
+        y0 += icvLogTab[h0];
+        y1 += icvLogTab[h1];
+        // elements i+2 and i+3 go through the same steps, interleaved with the first pair
+        h2 = x[i+2];
+        h3 = x[i+3];
+
+        x0 = LOGTAB_TRANSLATE( buf[0].f, h0 );
+        x1 = LOGTAB_TRANSLATE( buf[1].f, h1 );
+
+        buf[2].i = (h2 & LOGTAB_MASK2_32F) | (127 << 23);
+        buf[3].i = (h3 & LOGTAB_MASK2_32F) | (127 << 23);
+
+        y2 = (((h2 >> 23) & 0xff) - 127) * ln_2;
+        y3 = (((h3 >> 23) & 0xff) - 127) * ln_2;
+
+        h2 = (h2 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
+        h3 = (h3 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
+
+        y2 += icvLogTab[h2];
+        y3 += icvLogTab[h3];
+
+        x2 = LOGTAB_TRANSLATE( buf[2].f, h2 );
+        x3 = LOGTAB_TRANSLATE( buf[3].f, h3 );
+        // finish the first pair: apply the last-entry shift, add the polynomial term, narrow to float
+        x0 += shift[h0 == 510];
+        x1 += shift[h1 == 510];
+        y0 += LOGPOLY( x0 );
+        y1 += LOGPOLY( x1 );
+
+        y[i] = (float) y0;
+        y[i + 1] = (float) y1;
+        // finish the second pair the same way
+        x2 += shift[h2 == 510];
+        x3 += shift[h3 == 510];
+        y2 += LOGPOLY( x2 );
+        y3 += LOGPOLY( x3 );
+
+        y[i + 2] = (float) y2;
+        y[i + 3] = (float) y3;
+    }
+    // tail: the remaining 0..3 elements, one at a time
+    for( ; i < n; i++ )
+    {
+        int h0 = x[i];
+        double y0;
+        float x0;   // the residual is a float here, whereas the unrolled loop above keeps it in a double
+
+        y0 = (((h0 >> 23) & 0xff) - 127) * ln_2;
+
+        buf[0].i = (h0 & LOGTAB_MASK2_32F) | (127 << 23);
+        h0 = (h0 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
+
+        y0 += icvLogTab[h0];
+        x0 = (float)LOGTAB_TRANSLATE( buf[0].f, h0 );
+        x0 += shift[h0 == 510];
+        y0 += LOGPOLY( x0 );
+
+        y[i] = (float)y0;
+    }
+}
+
+void log( const double *x, double *y, int n )
+{ // Writes an approximation of log(x[i]) to y[i] for i in [0, n): exponent * ln_2 + table entry + polynomial.
+    static const double shift[] = { 0, -1./512 }; // shift[1] is added to x when the table index equals 510
+    static const double // A7..A0 = 1, -1/2, 1/3, ..., -1/8: the terms of x - x^2/2 + x^3/3 - ... - x^8/8
+    A7 = 1.0,
+    A6 = -0.5,
+    A5 = 0.333333333333333314829616256247390992939472198486328125,
+    A4 = -0.25,
+    A3 = 0.2,
+    A2 = -0.1666666666666666574148081281236954964697360992431640625,
+    A1 = 0.1428571428571428769682682968777953647077083587646484375,
+    A0 = -0.125;
+    // LOGPOLY(x,k): adds shift[k] to x in place, sets xq = x*x, then sums the even- and odd-power halves of the polynomial.
+#undef LOGPOLY
+#define LOGPOLY(x,k) ((x)+=shift[k], xq = (x)*(x),\
+(((A0*xq + A2)*xq + A4)*xq + A6)*xq + \
+(((A1*xq + A3)*xq + A5)*xq + A7)*(x))
+    // i is shared by all loops below, so each loop resumes where the previous one stopped.
+    int i = 0;
+    DBLINT buf[4];
+    DBLINT *X = (DBLINT *) x;
+    // X exposes the two 32-bit halves (.i.lo / .i.hi) of each input double.
+#if CV_SSE2
+    static const __m128d ln2_2 = _mm_set1_pd(ln_2);
+    static const __m128d _1_2 = _mm_set1_pd(1.);
+    static const __m128d shift2 = _mm_set1_pd(-1./512);
+    // and-mask: keeps each low word and the LOGTAB_MASK2 bits of each high word; or-mask: sets the exponent field to 1023.
+    static const __m128i log_and_mask2 = _mm_set_epi32(LOGTAB_MASK2, 0xffffffff, LOGTAB_MASK2, 0xffffffff);
+    static const __m128i log_or_mask2 = _mm_set_epi32(1023 << 20, 0, 1023 << 20, 0);
+    // broadcast copies of the polynomial coefficients
+    static const __m128d mA0 = _mm_set1_pd(A0);
+    static const __m128d mA1 = _mm_set1_pd(A1);
+    static const __m128d mA2 = _mm_set1_pd(A2);
+    static const __m128d mA3 = _mm_set1_pd(A3);
+    static const __m128d mA4 = _mm_set1_pd(A4);
+    static const __m128d mA5 = _mm_set1_pd(A5);
+    static const __m128d mA6 = _mm_set1_pd(A6);
+    static const __m128d mA7 = _mm_set1_pd(A7);
+
+    int CV_DECL_ALIGNED(16) idx[4];
+    // SSE2 path: four doubles per iteration, held as two __m128d pairs.
+    for( ; i <= n - 4; i += 4 )
+    {
+        __m128i h0 = _mm_loadu_si128((const __m128i*)(x + i));
+        __m128i h1 = _mm_loadu_si128((const __m128i*)(x + i + 2));
+        // xd0/xd1: the inputs with their exponent field forced to 1023
+        __m128d xd0 = _mm_castsi128_pd(_mm_or_si128(_mm_and_si128(h0, log_and_mask2), log_or_mask2));
+        __m128d xd1 = _mm_castsi128_pd(_mm_or_si128(_mm_and_si128(h1, log_and_mask2), log_or_mask2));
+        // gather the high 32-bit word of each of the four inputs into h0, in input order
+        h0 = _mm_unpackhi_epi32(_mm_unpacklo_epi32(h0, h1), _mm_unpackhi_epi32(h0, h1));
+        // yi0 = (biased exponent - 1023) per input; yd0/yd1 = that exponent times ln_2
+        __m128i yi0 = _mm_sub_epi32(_mm_and_si128(_mm_srli_epi32(h0, 20),
+                                                  _mm_set1_epi32(2047)), _mm_set1_epi32(1023));
+        __m128d yd0 = _mm_mul_pd(_mm_cvtepi32_pd(yi0), ln2_2);
+        __m128d yd1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_unpackhi_epi64(yi0, yi0)), ln2_2);
+        // table indices (always even because of the * 2 mask) go to idx[]; h0 then becomes a lane mask for index == 510
+        h0 = _mm_and_si128(_mm_srli_epi32(h0, 20 - LOGTAB_SCALE - 1), _mm_set1_epi32(LOGTAB_MASK * 2));
+        _mm_store_si128((__m128i*)idx, h0);
+        h0 = _mm_cmpeq_epi32(h0, _mm_set1_epi32(510));
+        // each load fetches the pair icvLogTab[idx], icvLogTab[idx+1]; _mm_load_pd is the aligned load, so this relies on that pair being 16-byte aligned
+        __m128d t0, t1, t2, t3, t4;
+        t0 = _mm_load_pd(icvLogTab + idx[0]);
+        t2 = _mm_load_pd(icvLogTab + idx[1]);
+        t1 = _mm_unpackhi_pd(t0, t2);
+        t0 = _mm_unpacklo_pd(t0, t2);
+        t2 = _mm_load_pd(icvLogTab + idx[2]);
+        t4 = _mm_load_pd(icvLogTab + idx[3]);
+        t3 = _mm_unpackhi_pd(t2, t4);
+        t2 = _mm_unpacklo_pd(t2, t4);
+        // first entry of each table pair is added to y
+        yd0 = _mm_add_pd(yd0, t0);
+        yd1 = _mm_add_pd(yd1, t2);
+        // x = (x - 1) * second entry of each table pair
+        xd0 = _mm_mul_pd(_mm_sub_pd(xd0, _1_2), t1);
+        xd1 = _mm_mul_pd(_mm_sub_pd(xd1, _1_2), t3);
+        // add shift2 (-1/512) only in the lanes whose table index was 510
+        xd0 = _mm_add_pd(xd0, _mm_and_pd(_mm_castsi128_pd(_mm_unpacklo_epi32(h0, h0)), shift2));
+        xd1 = _mm_add_pd(xd1, _mm_and_pd(_mm_castsi128_pd(_mm_unpackhi_epi32(h0, h0)), shift2));
+        // Horner evaluation of A0*x^8 + A1*x^7 + ... + A7*x
+        __m128d zd0 = _mm_mul_pd(xd0, mA0);
+        __m128d zd1 = _mm_mul_pd(xd1, mA0);
+        zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA1), xd0);
+        zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA1), xd1);
+        zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA2), xd0);
+        zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA2), xd1);
+        zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA3), xd0);
+        zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA3), xd1);
+        zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA4), xd0);
+        zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA4), xd1);
+        zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA5), xd0);
+        zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA5), xd1);
+        zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA6), xd0);
+        zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA6), xd1);
+        zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA7), xd0);
+        zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA7), xd1);
+
+        yd0 = _mm_add_pd(yd0, zd0);
+        yd1 = _mm_add_pd(yd1, zd1);
+
+        _mm_storeu_pd(y + i, yd0);
+        _mm_storeu_pd(y + i + 2, yd1);
+    }
+#endif
+    for( ; i <= n - 4; i += 4 )
+    {
+        double xq;
+        double x0, x1, x2, x3;
+        double y0, y1, y2, y3;
+        int h0, h1, h2, h3;
+        // scalar path, four elements per iteration; the work on elements 0/1 and 2/3 is interleaved
+        h0 = X[i].i.lo;
+        h1 = X[i + 1].i.lo;
+        buf[0].i.lo = h0;
+        buf[1].i.lo = h1;
+
+        h0 = X[i].i.hi;
+        h1 = X[i + 1].i.hi;
+        buf[0].i.hi = (h0 & LOGTAB_MASK2) | (1023 << 20);
+        buf[1].i.hi = (h1 & LOGTAB_MASK2) | (1023 << 20);
+        // exponent contribution: (biased exponent - 1023) * ln_2
+        y0 = (((h0 >> 20) & 0x7ff) - 1023) * ln_2;
+        y1 = (((h1 >> 20) & 0x7ff) - 1023) * ln_2;
+
+        h2 = X[i + 2].i.lo;
+        h3 = X[i + 3].i.lo;
+        buf[2].i.lo = h2;
+        buf[3].i.lo = h3;
+        // from here h0/h1 hold the icvLogTab index: high-word bits masked with LOGTAB_MASK * 2, hence always even
+        h0 = (h0 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
+        h1 = (h1 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
+
+        y0 += icvLogTab[h0];
+        y1 += icvLogTab[h1];
+
+        h2 = X[i + 2].i.hi;
+        h3 = X[i + 3].i.hi;
+
+        x0 = LOGTAB_TRANSLATE( buf[0].d, h0 );
+        x1 = LOGTAB_TRANSLATE( buf[1].d, h1 );
+
+        buf[2].i.hi = (h2 & LOGTAB_MASK2) | (1023 << 20);
+        buf[3].i.hi = (h3 & LOGTAB_MASK2) | (1023 << 20);
+
+        y2 = (((h2 >> 20) & 0x7ff) - 1023) * ln_2;
+        y3 = (((h3 >> 20) & 0x7ff) - 1023) * ln_2;
+
+        h2 = (h2 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
+        h3 = (h3 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
+
+        y2 += icvLogTab[h2];
+        y3 += icvLogTab[h3];
+
+        x2 = LOGTAB_TRANSLATE( buf[2].d, h2 );
+        x3 = LOGTAB_TRANSLATE( buf[3].d, h3 );
+        // polynomial term; the second macro argument selects shift[1] when the index is 510
+        y0 += LOGPOLY( x0, h0 == 510 );
+        y1 += LOGPOLY( x1, h1 == 510 );
+
+        y[i] = y0;
+        y[i + 1] = y1;
+
+        y2 += LOGPOLY( x2, h2 == 510 );
+        y3 += LOGPOLY( x3, h3 == 510 );
+
+        y[i + 2] = y2;
+        y[i + 3] = y3;
+    }
+    // remaining elements, one per iteration, using the same three-term sum
+    for( ; i < n; i++ )
+    {
+        int h0 = X[i].i.hi;
+        double xq;
+        double x0, y0 = (((h0 >> 20) & 0x7ff) - 1023) * ln_2;
+
+        buf[0].i.hi = (h0 & LOGTAB_MASK2) | (1023 << 20);
+        buf[0].i.lo = X[i].i.lo;
+        h0 = (h0 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2;
+
+        y0 += icvLogTab[h0];
+        x0 = LOGTAB_TRANSLATE( buf[0].d, h0 );
+        y0 += LOGPOLY( x0, h0 == 510 );
+        y[i] = y0;
+    }
+}
+
+}}
diff --git a/modules/hal/src/matrix.cpp b/modules/hal/src/matrix.cpp
new file mode 100644
index 0000000000..9506aaf478
--- /dev/null
+++ b/modules/hal/src/matrix.cpp
@@ -0,0 +1,208 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+namespace cv { namespace hal {
+
+/****************************************************************************************\
+*                     LU & Cholesky implementation for small matrices                    *
+\****************************************************************************************/
+
+// LU decomposition with partial (row) pivoting of the m x m matrix A, done in place.
+// If b is non-null, the n-column system A*x = b is solved and x overwrites b.
+// astep/bstep are row strides in bytes. Returns 0 when the best pivot magnitude is
+// below epsilon, otherwise +1 or -1 depending on the parity of the row swaps.
+template<typename _Tp> static inline int
+LUImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n)
+{
+    const size_t lda = astep / sizeof(A[0]);
+    const size_t ldb = bstep / sizeof(b[0]);
+    int sign = 1;
+
+    for( int col = 0; col < m; col++ )
+    {
+        // choose the row at or below the diagonal with the largest entry in this column
+        int piv = col;
+        for( int r = col + 1; r < m; r++ )
+            if( std::abs(A[r*lda + col]) > std::abs(A[piv*lda + col]) )
+                piv = r;
+
+        if( std::abs(A[piv*lda + col]) < std::numeric_limits<_Tp>::epsilon() )
+            return 0;
+
+        if( piv != col )
+        {
+            for( int c = col; c < m; c++ )
+                std::swap(A[col*lda + c], A[piv*lda + c]);
+            for( int c = 0; b && c < n; c++ )
+                std::swap(b[col*ldb + c], b[piv*ldb + c]);
+            sign = -sign;
+        }
+
+        const _Tp negInv = -1/A[col*lda + col];
+        for( int r = col + 1; r < m; r++ )
+        {
+            const _Tp factor = A[r*lda + col]*negInv;
+            for( int c = col + 1; c < m; c++ )
+                A[r*lda + c] += factor*A[col*lda + c];
+            for( int c = 0; b && c < n; c++ )
+                b[r*ldb + c] += factor*b[col*ldb + c];
+        }
+
+        A[col*lda + col] = -negInv;  // the diagonal keeps the reciprocal of the pivot
+    }
+
+    if( !b )
+        return sign;
+
+    // back substitution, last row first; multiply by the stored reciprocal pivot
+    for( int row = m - 1; row >= 0; row-- )
+        for( int c = 0; c < n; c++ )
+        {
+            _Tp acc = b[row*ldb + c];
+            for( int k = row + 1; k < m; k++ )
+                acc -= A[row*lda + k]*b[k*ldb + c];
+            b[row*ldb + c] = acc*A[row*lda + row];
+        }
+
+    return sign;
+}
+
+
+int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n)
+{   // single-precision entry point: forwards its arguments unchanged to the shared LUImpl template
+    return LUImpl<float>(A, astep, m, b, bstep, n);
+}
+
+
+int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n)
+{   // double-precision entry point: forwards its arguments unchanged to the shared LUImpl template
+    return LUImpl<double>(A, astep, m, b, bstep, n);
+}
+
+
+// In-place Cholesky factorization A = L*Lt of the m x m matrix A (only the lower triangle
+// is read), optionally followed by solving L*Lt*x = b for n columns (x overwrites b).
+// astep/bstep are row strides in bytes. Returns false as soon as a diagonal term
+// falls below epsilon; b == NULL requests the factorization only.
+template<typename _Tp> static inline bool
+CholImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n)
+{
+    _Tp* const L = A;  // the factor overwrites the lower triangle of A
+    const size_t lda = astep / sizeof(A[0]);
+    const size_t ldb = bstep / sizeof(b[0]);
+    double acc;
+
+    for( int row = 0; row < m; row++ )
+    {
+        // off-diagonal entries of this row; L[col][col] already holds 1/sqrt(pivot)
+        for( int col = 0; col < row; col++ )
+        {
+            acc = A[row*lda + col];
+            for( int k = 0; k < col; k++ )
+                acc -= L[row*lda + k]*L[col*lda + k];
+            L[row*lda + col] = (_Tp)(acc*L[col*lda + col]);
+        }
+
+        // diagonal entry: subtract the squared row prefix, then store the reciprocal root
+        acc = A[row*lda + row];
+        for( int k = 0; k < row; k++ )
+        {
+            const double v = L[row*lda + k];
+            acc -= v*v;
+        }
+        if( acc < std::numeric_limits<_Tp>::epsilon() )
+            return false;
+        L[row*lda + row] = (_Tp)(1./std::sqrt(acc));
+    }
+
+    if( b == 0 )
+        return true;
+
+    /*
+     Solve L*Lt*x = b in two triangular sweeps:
+       1) forward:  L*y  = b   (rows top to bottom)
+       2) backward: Lt*x = y   (rows bottom to top)
+
+     [ L00             ]  y0   b0        [ L00 L10 L20 L30 ]  x0   y0
+     [ L10 L11         ]  y1 = b1        [     L11 L21 L31 ]  x1 = y1
+     [ L20 L21 L22     ]  y2   b2        [         L22 L32 ]  x2   y2
+     [ L30 L31 L32 L33 ]  y3   b3        [             L33 ]  x3   y3
+
+     Every division is a multiplication by the reciprocal kept on the diagonal.
+     */
+
+    // forward substitution
+    for( int row = 0; row < m; row++ )
+        for( int c = 0; c < n; c++ )
+        {
+            acc = b[row*ldb + c];
+            for( int k = 0; k < row; k++ )
+                acc -= L[row*lda + k]*b[k*ldb + c];
+            b[row*ldb + c] = (_Tp)(acc*L[row*lda + row]);
+        }
+
+    for( int row = m - 1; row >= 0; row-- )
+        for( int c = 0; c < n; c++ )
+        {
+            acc = b[row*ldb + c];
+            for( int k = m - 1; k > row; k-- )
+                acc -= L[k*lda + row]*b[k*ldb + c];
+            b[row*ldb + c] = (_Tp)(acc*L[row*lda + row]);
+        }
+
+    return true;
+}
+
+
+bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n)
+{   // single-precision entry point: forwards its arguments unchanged to the shared CholImpl template
+    return CholImpl<float>(A, astep, m, b, bstep, n);
+}
+
+bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n)
+{   // double-precision entry point: forwards its arguments unchanged to the shared CholImpl template
+    return CholImpl<double>(A, astep, m, b, bstep, n);
+}
+
+}}
diff --git a/modules/hal/src/precomp.hpp b/modules/hal/src/precomp.hpp
new file mode 100644
index 0000000000..630565bec3
--- /dev/null
+++ b/modules/hal/src/precomp.hpp
@@ -0,0 +1,49 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "opencv2/hal.hpp"
+#include "opencv2/hal/intrin.hpp"
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <limits>
+#include <float.h>
diff --git a/modules/hal/src/resize.cpp b/modules/hal/src/resize.cpp
new file mode 100644
index 0000000000..a3f69facca
--- /dev/null
+++ b/modules/hal/src/resize.cpp
@@ -0,0 +1,47 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+namespace cv { namespace hal {
+
+}}
diff --git a/modules/hal/src/stat.cpp b/modules/hal/src/stat.cpp
new file mode 100644
index 0000000000..ec3b8db5a1
--- /dev/null
+++ b/modules/hal/src/stat.cpp
@@ -0,0 +1,306 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+namespace cv { namespace hal {
+
+static const uchar popCountTable[] = // popCountTable[v] = number of set bits in the byte value v
+{ // 256 entries, 32 per row
+    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
+};
+
+static const uchar popCountTable2[] = // popCountTable2[v] = number of non-zero 2-bit cells in the byte value v (0..4)
+{ // 256 entries, 32 per row
+    0, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
+    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
+    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+    2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+    2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+    2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4
+};
+
+static const uchar popCountTable4[] = // popCountTable4[v] = number of non-zero 4-bit cells in the byte value v (0..2)
+{ // 256 entries, 32 per row
+    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+};
+
+// Counts the set bits in the n bytes starting at a.
+int normHamming(const uchar* a, int n)
+{
+    int pos = 0, total = 0;
+#if CV_NEON
+    {
+        // 16 bytes per step: per-byte popcount, pairwise widening into four 32-bit sums
+        uint32x4_t acc32 = vmovq_n_u32(0);
+        for( ; pos <= n - 16; pos += 16 )
+        {
+            const uint16x8_t cnt16 = vpaddlq_u8(vcntq_u8(vld1q_u8(a + pos)));
+            acc32 = vaddq_u32(acc32, vpaddlq_u16(cnt16));
+        }
+        const uint64x2_t folded = vpaddlq_u32(acc32);
+        total = vgetq_lane_s32(vreinterpretq_s32_u64(folded), 0) +
+                vgetq_lane_s32(vreinterpretq_s32_u64(folded), 2);
+    }
+#endif
+    // scalar remainder: four table lookups per step, then single bytes
+    for( ; pos <= n - 4; pos += 4 )
+        total += popCountTable[a[pos]] + popCountTable[a[pos+1]] +
+                 popCountTable[a[pos+2]] + popCountTable[a[pos+3]];
+    for( ; pos < n; pos++ )
+        total += popCountTable[a[pos]];
+    return total;
+}
+
+// Counts the bits that differ between the n-byte strings a and b (set bits of a XOR b).
+int normHamming(const uchar* a, const uchar* b, int n)
+{
+    int pos = 0;
+    int total = 0;
+#if CV_NEON
+    {
+        // 16 bytes per step: XOR, per-byte popcount, pairwise widening into four 32-bit sums
+        uint32x4_t acc32 = vmovq_n_u32(0);
+        for( ; pos <= n - 16; pos += 16 )
+        {
+            const uint8x16_t diff = veorq_u8(vld1q_u8(a + pos), vld1q_u8(b + pos));
+            const uint16x8_t cnt16 = vpaddlq_u8(vcntq_u8(diff));
+            acc32 = vaddq_u32(acc32, vpaddlq_u16(cnt16));
+        }
+        const uint64x2_t folded = vpaddlq_u32(acc32);
+        total = vgetq_lane_s32(vreinterpretq_s32_u64(folded), 0) +
+                vgetq_lane_s32(vreinterpretq_s32_u64(folded), 2);
+    }
+#endif
+    // scalar remainder: four table lookups per step, then single bytes
+    for( ; pos <= n - 4; pos += 4 )
+        total += popCountTable[a[pos] ^ b[pos]] + popCountTable[a[pos+1] ^ b[pos+1]] +
+                 popCountTable[a[pos+2] ^ b[pos+2]] + popCountTable[a[pos+3] ^ b[pos+3]];
+    for( ; pos < n; pos++ )
+        total += popCountTable[a[pos] ^ b[pos]];
+    return total;
+}
+
+// Hamming norm with a selectable cell width: cellSize 1 defers to the bitwise overload,
+// 2 and 4 sum popCountTable2 / popCountTable4 per byte; any other cellSize returns -1.
+int normHamming(const uchar* a, int n, int cellSize)
+{
+    const uchar* lut = 0;
+    switch( cellSize )
+    {
+    case 1:  return normHamming(a, n);
+    case 2:  lut = popCountTable2; break;
+    case 4:  lut = popCountTable4; break;
+    default: return -1;
+    }
+    int pos = 0, total = 0;
+#if CV_ENABLE_UNROLLED
+    for( ; pos <= n - 4; pos += 4 )
+        total += lut[a[pos]] + lut[a[pos+1]] + lut[a[pos+2]] + lut[a[pos+3]];
+#endif
+    for( ; pos < n; pos++ )
+        total += lut[a[pos]];
+    return total;
+}
+
+// Two-input Hamming norm with a selectable cell width, looked up per byte of a XOR b:
+// cellSize 1 defers to the bitwise overload, 2 and 4 pick a table, anything else returns -1.
+int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
+{
+    const uchar* lut = 0;
+    switch( cellSize )
+    {
+    case 1:  return normHamming(a, b, n);
+    case 2:  lut = popCountTable2; break;
+    case 4:  lut = popCountTable4; break;
+    default: return -1;
+    }
+    int pos = 0, total = 0;
+#if CV_ENABLE_UNROLLED
+    for( ; pos <= n - 4; pos += 4 )
+        total += lut[a[pos] ^ b[pos]] + lut[a[pos+1] ^ b[pos+1]] +
+                 lut[a[pos+2] ^ b[pos+2]] + lut[a[pos+3] ^ b[pos+3]];
+#endif
+    for( ; pos < n; pos++ )
+        total += lut[a[pos] ^ b[pos]];
+    return total;
+}
+
+// Sum of squared differences (a[k] - b[k])^2 over k in [0, n), accumulated in float.
+float normL2Sqr_(const float* a, const float* b, int n)
+{
+    int k = 0;
+    float sum = 0.f;
+#if CV_SSE
+    // eight floats per step in two independent accumulators, folded once after the loop
+    __m128 accLo = _mm_setzero_ps(), accHi = _mm_setzero_ps();
+    for( ; k <= n - 8; k += 8 )
+    {
+        const __m128 dLo = _mm_sub_ps(_mm_loadu_ps(a + k), _mm_loadu_ps(b + k));
+        const __m128 dHi = _mm_sub_ps(_mm_loadu_ps(a + k + 4), _mm_loadu_ps(b + k + 4));
+        accLo = _mm_add_ps(accLo, _mm_mul_ps(dLo, dLo));
+        accHi = _mm_add_ps(accHi, _mm_mul_ps(dHi, dHi));
+    }
+    float CV_DECL_ALIGNED(16) lanes[4];
+    _mm_store_ps(lanes, _mm_add_ps(accLo, accHi));
+    sum = lanes[0] + lanes[1] + lanes[2] + lanes[3];
+#endif
+    for( ; k <= n - 4; k += 4 )  // scalar remainder in blocks of four
+    {
+        const float e0 = a[k] - b[k], e1 = a[k+1] - b[k+1];
+        const float e2 = a[k+2] - b[k+2], e3 = a[k+3] - b[k+3];
+        sum += e0*e0 + e1*e1 + e2*e2 + e3*e3;
+    }
+    for( ; k < n; k++ )
+    {
+        const float e = a[k] - b[k];
+        sum += e*e;
+    }
+    return sum;
+}
+
+
+// Sum of absolute differences |a[k] - b[k]| over k in [0, n), accumulated in float.
+float normL1_(const float* a, const float* b, int n)
+{
+    int k = 0;
+    float sum = 0.f;
+#if CV_SSE
+    // clearing the sign bit of every lane yields the absolute value
+    static const int CV_DECL_ALIGNED(16) signClear[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
+    const __m128 absMask = _mm_load_ps((const float*)signClear);
+    __m128 accLo = _mm_setzero_ps(), accHi = _mm_setzero_ps();
+    for( ; k <= n - 8; k += 8 )
+    {
+        const __m128 dLo = _mm_sub_ps(_mm_loadu_ps(a + k), _mm_loadu_ps(b + k));
+        const __m128 dHi = _mm_sub_ps(_mm_loadu_ps(a + k + 4), _mm_loadu_ps(b + k + 4));
+        accLo = _mm_add_ps(accLo, _mm_and_ps(dLo, absMask));
+        accHi = _mm_add_ps(accHi, _mm_and_ps(dHi, absMask));
+    }
+    float CV_DECL_ALIGNED(16) lanes[4];
+    _mm_store_ps(lanes, _mm_add_ps(accLo, accHi));
+    sum = lanes[0] + lanes[1] + lanes[2] + lanes[3];
+#elif CV_NEON
+    float32x4_t acc = vdupq_n_f32(0.0f);
+    for( ; k <= n - 4; k += 4 )
+        acc = vaddq_f32(acc, vabdq_f32(vld1q_f32(a + k), vld1q_f32(b + k)));
+
+    float CV_DECL_ALIGNED(16) lanes[4];
+    vst1q_f32(lanes, acc);
+    sum = lanes[0] + lanes[1] + lanes[2] + lanes[3];
+#endif
+    // scalar remainder: blocks of four, then single elements
+    for( ; k <= n - 4; k += 4 )
+    {
+        sum += std::abs(a[k] - b[k]) + std::abs(a[k+1] - b[k+1]) +
+               std::abs(a[k+2] - b[k+2]) + std::abs(a[k+3] - b[k+3]);
+    }
+    for( ; k < n; k++ )
+        sum += std::abs(a[k] - b[k]);
+    return sum;
+}
+
+// Sum of absolute differences |a[k] - b[k]| over n bytes, accumulated in int.
+int normL1_(const uchar* a, const uchar* b, int n)
+{
+    int k = 0, sum = 0;
+#if CV_SSE
+    // the two 64-bit halves of acc are added together once, after both SIMD loops
+    __m128i acc = _mm_setzero_si128();
+
+    for( ; k <= n - 16; k += 16 )
+    {
+        const __m128i va = _mm_loadu_si128((const __m128i*)(a + k));
+        const __m128i vb = _mm_loadu_si128((const __m128i*)(b + k));
+        acc = _mm_add_epi32(acc, _mm_sad_epu8(va, vb));
+    }
+
+    // remaining groups of four bytes are loaded as one int and go through the same instruction
+    for( ; k <= n - 4; k += 4 )
+    {
+        const __m128i va = _mm_cvtsi32_si128(*(const int*)(a + k));
+        const __m128i vb = _mm_cvtsi32_si128(*(const int*)(b + k));
+        acc = _mm_add_epi32(acc, _mm_sad_epu8(va, vb));
+    }
+    sum = _mm_cvtsi128_si32(_mm_add_epi32(acc, _mm_unpackhi_epi64(acc, acc)));
+#elif CV_NEON
+    uint32x4_t acc = vdupq_n_u32(0.0f);
+    for( ; k <= n - 16; k += 16 )
+    {
+        const uint8x16_t diff = vabdq_u8(vld1q_u8(a + k), vld1q_u8(b + k));
+        const uint16x8_t lo = vmovl_u8(vget_low_u8(diff)), hi = vmovl_u8(vget_high_u8(diff));
+        acc = vaddq_u32(acc, vaddl_u16(vget_low_u16(lo), vget_low_u16(hi)));
+        acc = vaddq_u32(acc, vaddl_u16(vget_high_u16(lo), vget_high_u16(hi)));
+    }
+
+    uint CV_DECL_ALIGNED(16) lanes[4];
+    vst1q_u32(lanes, acc);
+    sum = lanes[0] + lanes[1] + lanes[2] + lanes[3];
+#endif
+    // scalar remainder: blocks of four, then single bytes
+    for( ; k <= n - 4; k += 4 )
+    {
+        sum += std::abs(a[k] - b[k]) + std::abs(a[k+1] - b[k+1]) +
+               std::abs(a[k+2] - b[k+2]) + std::abs(a[k+3] - b[k+3]);
+    }
+    for( ; k < n; k++ )
+        sum += std::abs(a[k] - b[k]);
+    return sum;
+}
+
+}} //cv::hal
diff --git a/modules/hal/src/warp.cpp b/modules/hal/src/warp.cpp
new file mode 100644
index 0000000000..a3f69facca
--- /dev/null
+++ b/modules/hal/src/warp.cpp
@@ -0,0 +1,47 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+namespace cv { namespace hal {
+
+}}
diff --git a/modules/highgui/src/window_QT.cpp b/modules/highgui/src/window_QT.cpp
index c89918ee26..d0aad919ee 100644
--- a/modules/highgui/src/window_QT.cpp
+++ b/modules/highgui/src/window_QT.cpp
@@ -117,7 +117,7 @@ CV_IMPL void cvAddText(const CvArr* img, const char* text, CvPoint org, CvFont*
         "putText",
         autoBlockingConnection(),
         Q_ARG(void*, (void*) img),
-        Q_ARG(QString,QString(text)),
+        Q_ARG(QString,QString::fromUtf8(text)),
         Q_ARG(QPoint, QPoint(org.x,org.y)),
         Q_ARG(void*,(void*) font));
 }
@@ -418,12 +418,14 @@ static CvBar* icvFindBarByName(QBoxLayout* layout, QString name_bar, typeBar typ
 static CvTrackbar* icvFindTrackBarByName(const char* name_trackbar, const char* name_window, QBoxLayout* layout = NULL)
 {
     QString nameQt(name_trackbar);
-    if ((!name_window || !name_window[0]) && global_control_panel) //window name is null and we have a control panel
+    QString nameWinQt(name_window);
+
+    if (nameWinQt.isEmpty() && global_control_panel) //window name is null and we have a control panel
         layout = global_control_panel->myLayout;
 
     if (!layout)
     {
-        QPointer<CvWindow> w = icvFindWindowByName(QLatin1String(name_window));
+        QPointer<CvWindow> w = icvFindWindowByName(nameWinQt);
 
         if (!w)
             CV_Error(CV_StsNullPtr, "NULL window handler");
@@ -1875,7 +1877,7 @@ bool CvWindow::isOpenGl()
 
 void CvWindow::setViewportSize(QSize _size)
 {
-    myView->getWidget()->resize(_size);
+    resize(_size);
     myView->setSize(_size);
 }
 
diff --git a/modules/imgcodecs/CMakeLists.txt b/modules/imgcodecs/CMakeLists.txt
index 50e2d5da64..6d565217a0 100644
--- a/modules/imgcodecs/CMakeLists.txt
+++ b/modules/imgcodecs/CMakeLists.txt
@@ -1,7 +1,3 @@
-if(WINRT)
-  ocv_module_disable(imgcodecs)
-endif()
-
 set(the_description "Image codecs")
 ocv_add_module(imgcodecs opencv_imgproc WRAP java python)
 
diff --git a/modules/imgcodecs/include/opencv2/imgcodecs.hpp b/modules/imgcodecs/include/opencv2/imgcodecs.hpp
index b0c942172c..30846efeac 100644
--- a/modules/imgcodecs/include/opencv2/imgcodecs.hpp
+++ b/modules/imgcodecs/include/opencv2/imgcodecs.hpp
@@ -185,13 +185,14 @@ compression parameters :
 
     void createAlphaMat(Mat &mat)
     {
+        CV_Assert(mat.channels() == 4);
         for (int i = 0; i < mat.rows; ++i) {
             for (int j = 0; j < mat.cols; ++j) {
-                Vec4b& rgba = mat.at<Vec4b>(i, j);
-                rgba[0] = UCHAR_MAX;
-                rgba[1] = saturate_cast<uchar>((float (mat.cols - j)) / ((float)mat.cols) * UCHAR_MAX);
-                rgba[2] = saturate_cast<uchar>((float (mat.rows - i)) / ((float)mat.rows) * UCHAR_MAX);
-                rgba[3] = saturate_cast<uchar>(0.5 * (rgba[1] + rgba[2]));
+                Vec4b& bgra = mat.at<Vec4b>(i, j);
+                bgra[0] = UCHAR_MAX; // Blue
+                bgra[1] = saturate_cast<uchar>((float (mat.cols - j)) / ((float)mat.cols) * UCHAR_MAX); // Green
+                bgra[2] = saturate_cast<uchar>((float (mat.rows - i)) / ((float)mat.rows) * UCHAR_MAX); // Red
+                bgra[3] = saturate_cast<uchar>(0.5 * (bgra[1] + bgra[2])); // Alpha
             }
         }
     }
diff --git a/modules/imgcodecs/src/grfmt_jpeg2000.cpp b/modules/imgcodecs/src/grfmt_jpeg2000.cpp
index 83fd55a594..e499c58b89 100644
--- a/modules/imgcodecs/src/grfmt_jpeg2000.cpp
+++ b/modules/imgcodecs/src/grfmt_jpeg2000.cpp
@@ -45,6 +45,7 @@
 #ifdef HAVE_JASPER
 
 #include "grfmt_jpeg2000.hpp"
+#include "opencv2/imgproc.hpp"
 
 #ifdef WIN32
 #define JAS_WIN_MSVC_BUILD 1
@@ -159,6 +160,21 @@ bool  Jpeg2KDecoder::readData( Mat& img )
     jas_stream_t* stream = (jas_stream_t*)m_stream;
     jas_image_t* image = (jas_image_t*)m_image;
 
+#ifndef WIN32
+    // At least on some Linux instances the
+    // system libjasper segfaults when
+    // converting color to grey.
+    // We do this conversion manually at the end.
+    Mat clr;
+    if (CV_MAT_CN(img.type()) < CV_MAT_CN(this->type()))
+    {
+        clr.create(img.size().height, img.size().width, this->type());
+        color = true;
+        data = clr.ptr();
+        step = (int)clr.step;
+    }
+#endif
+
     if( stream && image )
     {
         bool convert;
@@ -171,7 +187,7 @@ bool  Jpeg2KDecoder::readData( Mat& img )
         else
         {
             convert = (jas_clrspc_fam( jas_image_clrspc( image ) ) != JAS_CLRSPC_FAM_GRAY);
-            colorspace = JAS_CLRSPC_SGRAY; // TODO GENGRAY or SGRAY?
+            colorspace = JAS_CLRSPC_SGRAY; // TODO GENGRAY or SGRAY? (GENGRAY fails on Win.)
         }
 
         // convert to the desired colorspace
@@ -256,6 +272,13 @@ bool  Jpeg2KDecoder::readData( Mat& img )
 
     close();
 
+#ifndef WIN32
+    if (!clr.empty())
+    {
+        cv::cvtColor(clr, img, COLOR_BGR2GRAY);
+    }
+#endif
+
     return result;
 }
 
diff --git a/modules/imgcodecs/src/loadsave.cpp b/modules/imgcodecs/src/loadsave.cpp
index 8526a4a3f0..383c25a2b3 100644
--- a/modules/imgcodecs/src/loadsave.cpp
+++ b/modules/imgcodecs/src/loadsave.cpp
@@ -374,15 +374,8 @@ imreadmulti_(const String& filename, int flags, std::vector<Mat>& mats)
                 type = CV_MAKETYPE(CV_MAT_DEPTH(type), 1);
         }
 
-        // established the required input image size.
-        CvSize size;
-        size.width = decoder->width();
-        size.height = decoder->height();
-
-        Mat mat;
-        mat.create(size.height, size.width, type);
-
         // read the image data
+        Mat mat(decoder->height(), decoder->width(), type);
         if (!decoder->readData(mat))
         {
             break;
diff --git a/modules/imgcodecs/test/test_drawing.cpp b/modules/imgcodecs/test/test_drawing.cpp
index f4e157fb20..d6d76822b5 100644
--- a/modules/imgcodecs/test/test_drawing.cpp
+++ b/modules/imgcodecs/test/test_drawing.cpp
@@ -448,3 +448,81 @@ protected:
 };
 
 TEST(Imgcodecs_Drawing, fillconvexpoly_clipping) { CV_FillConvexPolyTest test; test.safe_run(); }
+
+// Smoke test: renders ASCII, Cyrillic, near-Cyrillic and malformed UTF-8 strings with
+// every Hershey face (upright and italic), stacks the renderings into one image and
+// checks that the image can be PNG-encoded. No pixel values are compared.
+class CV_DrawingTest_UTF8 : public cvtest::BaseTest
+{
+public:
+    CV_DrawingTest_UTF8() {}
+    ~CV_DrawingTest_UTF8() {}
+protected:
+    void run(int)
+    {
+        vector<string> lines;
+        lines.push_back("abcdefghijklmnopqrstuvwxyz1234567890");
+        // cyrillic letters small
+        lines.push_back("\xD0\xB0\xD0\xB1\xD0\xB2\xD0\xB3\xD0\xB4\xD0\xB5\xD1\x91\xD0\xB6\xD0\xB7"
+                        "\xD0\xB8\xD0\xB9\xD0\xBA\xD0\xBB\xD0\xBC\xD0\xBD\xD0\xBE\xD0\xBF\xD1\x80"
+                        "\xD1\x81\xD1\x82\xD1\x83\xD1\x84\xD1\x85\xD1\x86\xD1\x87\xD1\x88\xD1\x89"
+                        "\xD1\x8A\xD1\x8B\xD1\x8C\xD1\x8D\xD1\x8E\xD1\x8F");
+        // cyrillic letters capital
+        lines.push_back("\xD0\x90\xD0\x91\xD0\x92\xD0\x93\xD0\x94\xD0\x95\xD0\x81\xD0\x96\xD0\x97"
+                        "\xD0\x98\xD0\x99\xD0\x9A\xD0\x9B\xD0\x9C\xD0\x9D\xD0\x9E\xD0\x9F\xD0\xA0"
+                        "\xD0\xA1\xD0\xA2\xD0\xA3\xD0\xA4\xD0\xA5\xD0\xA6\xD0\xA7\xD0\xA8\xD0\xA9"
+                        "\xD0\xAA\xD0\xAB\xD0\xAC\xD0\xAD\xD0\xAE\xD0\xAF");
+        // bounds
+        lines.push_back("-\xD0\x80-\xD0\x8E-\xD0\x8F-");
+        lines.push_back("-\xD1\x90-\xD1\x91-\xD1\xBF-");
+        // bad utf8
+        lines.push_back("-\x81-\x82-\x83-");
+        lines.push_back("--\xF0--");
+        lines.push_back("-\xF0");
+
+        const int faces[] = { FONT_HERSHEY_SIMPLEX, FONT_HERSHEY_PLAIN, FONT_HERSHEY_DUPLEX,
+                              FONT_HERSHEY_COMPLEX, FONT_HERSHEY_TRIPLEX, FONT_HERSHEY_COMPLEX_SMALL,
+                              FONT_HERSHEY_SCRIPT_SIMPLEX, FONT_HERSHEY_SCRIPT_COMPLEX };
+        const vector<int> fonts(faces, faces + sizeof(faces) / sizeof(faces[0]));
+
+        vector<Mat> results;
+        Size bigSize(0, 0);
+        for (vector<int>::const_iterator font = fonts.begin(); font != fonts.end(); ++font)
+        {
+            for (int italic = 0; italic <= FONT_ITALIC; italic += FONT_ITALIC)
+            {
+                for (vector<string>::const_iterator line = lines.begin(); line != lines.end(); ++line)
+                {
+                    const float fontScale = 1;
+                    const int thickness = 1;
+                    const Scalar color(20,20,20);
+                    int baseline = 0;
+                    Size textSize = getTextSize(*line, *font | italic, fontScale, thickness, &baseline);
+                    Point textOrg(0, textSize.height + 2);
+                    Mat img(textSize + Size(0, baseline), CV_8UC3, Scalar(255, 255, 255));
+                    putText(img, *line, textOrg, *font | italic, fontScale, color, thickness, CV_AA);
+
+                    results.push_back(img);
+                    bigSize.width = max(bigSize.width, img.size().width);
+                    bigSize.height += img.size().height + 1;
+                }
+            }
+        }
+
+        int shift = 0;
+        Mat result(bigSize, CV_8UC3, Scalar(100, 100, 100));
+        for (vector<Mat>::const_iterator img = results.begin(); img != results.end(); ++img)
+        {
+            Rect roi(Point(0, shift), img->size());
+            Mat sub(result, roi);
+            img->copyTo(sub);
+            shift += img->size().height + 1;
+        }
+        // Encode in memory rather than writing to a fixed "/tmp" path, which is not
+        // portable and leaves a stray file behind; a failed encode now fails the test.
+        vector<uchar> png;
+        ASSERT_TRUE(imencode(".png", result, png));
+    }
+};
+
+TEST(Highgui_Drawing, utf8_support) { CV_DrawingTest_UTF8 test; test.safe_run(); } // NOTE(review): the test just above registers under Imgcodecs_Drawing - confirm the Highgui_ prefix is intended
diff --git a/modules/imgcodecs/test/test_grfmt.cpp b/modules/imgcodecs/test/test_grfmt.cpp
index 423d030a0c..92238a95f0 100644
--- a/modules/imgcodecs/test/test_grfmt.cpp
+++ b/modules/imgcodecs/test/test_grfmt.cpp
@@ -87,6 +87,9 @@ TEST(Imgcodecs_imread, regression)
 {
     const char* const filenames[] =
     {
+#ifdef HAVE_JASPER
+        "Rome.jp2",
+#endif
         "color_palette_alpha.png",
         "multipage.tif",
         "rle.hdr",
@@ -99,16 +102,32 @@ TEST(Imgcodecs_imread, regression)
 
     for (size_t i = 0; i < sizeof(filenames) / sizeof(filenames[0]); ++i)
     {
-        ASSERT_TRUE(imread_compare(folder + string(filenames[i]), IMREAD_UNCHANGED));
-        ASSERT_TRUE(imread_compare(folder + string(filenames[i]), IMREAD_GRAYSCALE));
-        ASSERT_TRUE(imread_compare(folder + string(filenames[i]), IMREAD_COLOR));
-        ASSERT_TRUE(imread_compare(folder + string(filenames[i]), IMREAD_ANYDEPTH));
-        ASSERT_TRUE(imread_compare(folder + string(filenames[i]), IMREAD_ANYCOLOR));
-        if (i != 2) // GDAL does not support hdr
-            ASSERT_TRUE(imread_compare(folder + string(filenames[i]), IMREAD_LOAD_GDAL));
+        const string path = folder + string(filenames[i]);
+        ASSERT_TRUE(imread_compare(path, IMREAD_UNCHANGED));
+        ASSERT_TRUE(imread_compare(path, IMREAD_GRAYSCALE));
+        ASSERT_TRUE(imread_compare(path, IMREAD_COLOR));
+        ASSERT_TRUE(imread_compare(path, IMREAD_ANYDEPTH));
+        ASSERT_TRUE(imread_compare(path, IMREAD_ANYCOLOR));
+        if (path.substr(path.length() - 3) != "hdr")
+        {
+            // GDAL does not support hdr
+            ASSERT_TRUE(imread_compare(path, IMREAD_LOAD_GDAL));
+        }
     }
 }
 
+#ifdef HAVE_JASPER
+TEST(Imgcodecs_jasper, regression)
+{
+    // Each JPEG-2000 sample is read first as colour, then as greyscale.
+    const string dir = string(cvtest::TS::ptr()->get_data_path()) + "/readwrite/";
+    const char* const samples[] = { "Bretagne2.jp2", "Grey.jp2" };
+    for (size_t k = 0; k < sizeof(samples) / sizeof(samples[0]); ++k)
+        for (int pass = 0; pass < 2; ++pass)
+            ASSERT_TRUE(imread_compare(dir + samples[k], pass == 0 ? IMREAD_COLOR : IMREAD_GRAYSCALE));
+}
+#endif
+
 class CV_GrfmtWriteBigImageTest : public cvtest::BaseTest
 {
 public:
diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp
index 3db822db3d..ac93e45809 100644
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@@ -3494,7 +3494,7 @@ CV_EXPORTS_W double contourArea( InputArray contour, bool oriented = false );
 
 The function calculates and returns the minimum-area bounding rectangle (possibly rotated) for a
 specified point set. See the OpenCV sample minarea.cpp . Developer should keep in mind that the
-returned rotatedRect can contain negative indices when data is close the the containing Mat element
+returned rotatedRect can contain negative indices when data is close to the containing Mat element
 boundary.
 
 @param points Input vector of 2D points, stored in std::vector\<\> or Mat
diff --git a/modules/imgproc/perf/perf_moments.cpp b/modules/imgproc/perf/perf_moments.cpp
index 9b3c5428f3..e5a9f036c9 100644
--- a/modules/imgproc/perf/perf_moments.cpp
+++ b/modules/imgproc/perf/perf_moments.cpp
@@ -34,5 +34,11 @@ PERF_TEST_P(MomentsFixture_val, Moments1,
 
     TEST_CYCLE() m = cv::moments(src, binaryImage);
 
-    SANITY_CHECK_MOMENTS(m, 1e-4, ERROR_RELATIVE);
+    int len = (int)sizeof(cv::Moments) / sizeof(double);
+    cv::Mat mat(1, len, CV_64F, (void*)&m);
+    // add 1 to every moment so that the relative-error check does not fail spuriously on values close to 0
+    mat += 1;
+
+
+    SANITY_CHECK_MOMENTS(m, 2e-4, ERROR_RELATIVE);
 }
diff --git a/modules/imgproc/src/drawing.cpp b/modules/imgproc/src/drawing.cpp
index 94aef348f0..e52312a3b8 100644
--- a/modules/imgproc/src/drawing.cpp
+++ b/modules/imgproc/src/drawing.cpp
@@ -1941,7 +1941,11 @@ static const int HersheyComplex[] = {
 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025, 2026, 2223, 2084,
 2224, 2247, 587, 2249, 2101, 2102, 2103, 2104, 2105, 2106, 2107, 2108, 2109, 2110, 2111,
 2112, 2113, 2114, 2115, 2116, 2117, 2118, 2119, 2120, 2121, 2122, 2123, 2124, 2125, 2126,
-2225, 2229, 2226, 2246 };
+2225, 2229, 2226, 2246, 2801, 2802, 2803, 2804, 2805, 2806, 2807, 2808, 2809, 2810, 2811,
+2812, 2813, 2814, 2815, 2816, 2817, 2818, 2819, 2820, 2821, 2822, 2823, 2824, 2825, 2826,
+2827, 2828, 2829, 2830, 2831, 2832, 2901, 2902, 2903, 2904, 2905, 2906, 2907, 2908, 2909,
+2910, 2911, 2912, 2913, 2914, 2915, 2916, 2917, 2918, 2919, 2920, 2921, 2922, 2923, 2924,
+2925, 2926, 2927, 2928, 2929, 2930, 2931, 2932};
 
 static const int HersheyComplexItalic[] = {
 (9 + 12*16) + FONT_ITALIC_ALPHA + FONT_ITALIC_DIGIT + FONT_ITALIC_PUNCT +
@@ -2033,6 +2037,50 @@ static const int* getFontData(int fontFace)
     return ascii;
 }
 
+// Translates the byte at text[i] into a glyph code for the selected font table.
+//   c        in: (uchar)text[i]; out: glyph code, or '?' when no glyph is available
+//   i        advanced past any further bytes consumed from a multi-byte UTF-8 sequence
+//   fontFace Cyrillic U+0410..U+044F is decoded only when this is exactly
+//            FONT_HERSHEY_COMPLEX; otherwise every byte outside [' ', 127) becomes '?'
+// NOTE(review): text[i + 1] is read even for the last byte - assumes String yields 0 at size().
+inline void readCheck(int &c, int &i, const String &text, int fontFace)
+{
+    int leftBoundary = ' ', rightBoundary = 127;
+
+    if(c >= 0x80 && fontFace == FONT_HERSHEY_COMPLEX)
+    {
+        const int next = (uchar)text[i + 1];
+        if(c == 0xD0 && next >= 0x90 && next <= 0xBF)      // U+0410..U+043F
+        {
+            c = next - 17;                                   // codes 127..174
+            ++i;
+            leftBoundary = 127;
+            rightBoundary = 175;
+        }
+        else if(c == 0xD1 && next >= 0x80 && next <= 0x8F) // U+0440..U+044F
+        {
+            c = next + 47;                                   // codes 175..190
+            ++i;
+            leftBoundary = 175;
+            rightBoundary = 191;
+        }
+        else
+        {
+            // Unsupported or malformed sequence: draw a single '?'. Skip only genuine
+            // continuation bytes (10xxxxxx) so that a truncated sequence does not
+            // swallow the valid characters that follow it.
+            int extra = c >= 0xFC ? 5 : c >= 0xF8 ? 4 : c >= 0xF0 ? 3 : c >= 0xE0 ? 2 : c >= 0xC0 ? 1 : 0;
+            while(extra-- > 0 && ((uchar)text[i + 1] & 0xC0) == 0x80)
+                i++;
+
+            c = '?';
+        }
+    }
+
+    if(c >= rightBoundary || c < leftBoundary)
+        c = '?';
+}
+
 extern const char* g_HersheyGlyphs[];
 
 void putText( InputOutputArray _img, const String& text, Point org,
@@ -2066,8 +2114,7 @@ void putText( InputOutputArray _img, const String& text, Point org,
         int c = (uchar)text[i];
         Point p;
 
-        if( c >= 127 || c < ' ' )
-            c = '?';
+        readCheck(c, i, text, fontFace);
 
         const char* ptr = faces[ascii[(c-' ')+1]];
         p.x = (uchar)ptr[0] - 'R';
@@ -2114,8 +2161,7 @@ Size getTextSize( const String& text, int fontFace, double fontScale, int thickn
         int c = (uchar)text[i];
         Point p;
 
-        if( c >= 127 || c < ' ' )
-            c = '?';
+        readCheck(c, i, text, fontFace);
 
         const char* ptr = faces[ascii[(c-' ')+1]];
         p.x = (uchar)ptr[0] - 'R';
@@ -2183,7 +2229,10 @@ void cv::polylines(InputOutputArray _img, InputArrayOfArrays pts,
     {
         Mat p = pts.getMat(manyContours ? i : -1);
         if( p.total() == 0 )
+        {
+            npts[i] = 0;
             continue;
+        }
         CV_Assert(p.checkVector(2, CV_32S) >= 0);
         ptsptr[i] = p.ptr<Point>();
         npts[i] = p.rows*p.cols*p.channels()/2;
diff --git a/modules/imgproc/src/morph.cpp b/modules/imgproc/src/morph.cpp
index 44eb3adfc0..f2d971bea3 100644
--- a/modules/imgproc/src/morph.cpp
+++ b/modules/imgproc/src/morph.cpp
@@ -1820,9 +1820,14 @@ static bool ocl_morphologyEx(InputArray _src, OutputArray _dst, int op,
 #endif
 
 void cv::morphologyEx( InputArray _src, OutputArray _dst, int op,
-                       InputArray kernel, Point anchor, int iterations,
+                       InputArray _kernel, Point anchor, int iterations,
                        int borderType, const Scalar& borderValue )
 {
+    Mat kernel = _kernel.getMat();
+    if (kernel.empty())
+    {
+        kernel = getStructuringElement(MORPH_RECT, Size(3,3), Point(1,1));
+    }
 #ifdef HAVE_OPENCL
     Size ksize = kernel.size();
     anchor = normalizeAnchor(anchor, ksize);
diff --git a/modules/imgproc/src/opencl/gftt.cl b/modules/imgproc/src/opencl/gftt.cl
index 584ab41af2..736802b1be 100644
--- a/modules/imgproc/src/opencl/gftt.cl
+++ b/modules/imgproc/src/opencl/gftt.cl
@@ -64,7 +64,7 @@ __kernel void maxEigenVal(__global const uchar * srcptr, int src_step, int src_o
         int src_index = mad24(id / cols, src_step, mad24((id % cols), (int)sizeof(float), src_offset));
 #ifdef HAVE_MASK
         int mask_index = mad24(id / cols, mask_step, id % cols + mask_offset);
-        if (mask[mask_index])
+        if (maskptr[mask_index])
 #endif
             maxval = max(maxval, *(__global const float *)(srcptr + src_index));
     }
diff --git a/modules/imgproc/src/shapedescr.cpp b/modules/imgproc/src/shapedescr.cpp
index 5e0c432d9c..205aecb018 100644
--- a/modules/imgproc/src/shapedescr.cpp
+++ b/modules/imgproc/src/shapedescr.cpp
@@ -446,10 +446,9 @@ cv::RotatedRect cv::fitEllipse( InputArray _points )
 
     // store angle and radii
     rp[4] = -0.5 * atan2(gfp[2], gfp[1] - gfp[0]); // convert from APP angle usage
-    t = sin(-2.0 * rp[4]);
-    if( fabs(t) > fabs(gfp[2])*min_eps )
-        t = gfp[2]/t;
-    else
+    if( fabs(gfp[2]) > min_eps )
+        t = gfp[2]/sin(-2.0 * rp[4]);
+    else // ellipse is rotated by an integer multiple of pi/2
         t = gfp[1] - gfp[0];
     rp[2] = fabs(gfp[0] + gfp[1] - t);
     if( rp[2] > min_eps )
diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp
index 90840cdaa5..dbe8a6315a 100644
--- a/modules/imgproc/src/smooth.cpp
+++ b/modules/imgproc/src/smooth.cpp
@@ -2770,7 +2770,7 @@ public:
         #if CV_SSE3
         int CV_DECL_ALIGNED(16) buf[4];
         float CV_DECL_ALIGNED(16) bufSum[4];
-        static const int CV_DECL_ALIGNED(16) bufSignMask[] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
+        static const unsigned int CV_DECL_ALIGNED(16) bufSignMask[] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
         bool haveSSE3 = checkHardwareSupport(CV_CPU_SSE3);
         #endif
 
@@ -3152,7 +3152,7 @@ public:
         #if CV_SSE3
         int CV_DECL_ALIGNED(16) idxBuf[4];
         float CV_DECL_ALIGNED(16) bufSum32[4];
-        static const int CV_DECL_ALIGNED(16) bufSignMask[] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
+        static const unsigned int CV_DECL_ALIGNED(16) bufSignMask[] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
         bool haveSSE3 = checkHardwareSupport(CV_CPU_SSE3);
         #endif
 
diff --git a/modules/imgproc/test/test_contours.cpp b/modules/imgproc/test/test_contours.cpp
index 6c5c3f0ebb..b0b8c4fbb5 100644
--- a/modules/imgproc/test/test_contours.cpp
+++ b/modules/imgproc/test/test_contours.cpp
@@ -410,4 +410,23 @@ TEST(Core_Drawing, _914)
     ASSERT_EQ( (3*rows + cols)*3 - 3*9, pixelsDrawn);
 }
 
+TEST(Core_Drawing, polylines_empty)
+{
+    // Drawing an empty point list must leave every pixel of the canvas at zero.
+    Mat canvas(100, 100, CV_8UC1, Scalar(0));
+    vector<Point> noPoints;
+    polylines(canvas, noPoints, false, Scalar(255));
+    ASSERT_EQ(countNonZero(canvas), 0);
+}
+
+TEST(Core_Drawing, polylines)
+{
+    // An open polyline from (0,0) to (20,0) is expected to set exactly 21 pixels.
+    Mat canvas(100, 100, CV_8UC1, Scalar(0));
+    const Point ends[] = { Point(0, 0), Point(20, 0) };
+    vector<Point> segment(ends, ends + 2);
+    polylines(canvas, segment, false, Scalar(255));
+    const int lit = countNonZero(canvas);
+    ASSERT_EQ(lit, 21);
+}
 /* End of file. */
diff --git a/modules/imgproc/test/test_convhull.cpp b/modules/imgproc/test/test_convhull.cpp
index 6b5144f924..e7b2886d37 100644
--- a/modules/imgproc/test/test_convhull.cpp
+++ b/modules/imgproc/test/test_convhull.cpp
@@ -1239,7 +1239,6 @@ void CV_FitEllipseTest::run_func()
         box = (CvBox2D)cv::fitEllipse(cv::cvarrToMat(points));
 }
 
-
 int CV_FitEllipseTest::validate_test_results( int test_case_idx )
 {
     int code = CV_BaseShapeDescrTest::validate_test_results( test_case_idx );
@@ -1354,6 +1353,64 @@ protected:
     }
 };
 
+
+// Regression test for incorrect fitEllipse result reported in Bug #3989
+// Check edge cases for rotation angles of ellipse ([-180, -90, 0, 90, 180] degrees)
+class CV_FitEllipseParallelTest : public CV_FitEllipseTest
+{
+public:
+    CV_FitEllipseParallelTest();
+    ~CV_FitEllipseParallelTest();
+protected:
+    void generate_point_set( void* points ); // draws the rotated ellipse and stores its contour in pointsMat
+    void run_func(void);                     // runs cv::fitEllipse on pointsMat
+    Mat pointsMat;                           // contour points, converted to CV_32F
+};
+
+CV_FitEllipseParallelTest::CV_FitEllipseParallelTest()
+{
+    min_ellipse_size = 5; // lower bound applied to both semi-axes in generate_point_set
+}
+
+void CV_FitEllipseParallelTest::generate_point_set( void* )
+{
+    // Draws an ellipse turned by a multiple of 90 degrees; fills box0 (expected) and pointsMat.
+    RNG& rng = ts->get_rng();
+    int longAxis = (int)(MAX(high.val[0] - low.val[0], min_ellipse_size));
+    int shortAxis = (int)(MAX(high.val[1] - low.val[1], min_ellipse_size));
+    const int angle = ( (cvtest::randInt(rng) % 5) - 2 ) * 90;
+    const int dim = max(longAxis, shortAxis);
+    const Point center(dim*2, dim*2);
+
+    if( shortAxis > longAxis ) // keep the larger value in longAxis
+    {
+        int tmp;
+        CV_SWAP( shortAxis, longAxis, tmp );
+    }
+
+    Mat canvas = Mat::zeros(dim*4, dim*4, CV_8UC1); // centre at 2*dim, semi-axes <= dim
+    ellipse(canvas, center, Size(longAxis, shortAxis), angle, 0, 360, Scalar(255, 0, 0), 1, 8);
+
+    box0.angle = (float)angle;
+    box0.size.height = (float)longAxis*2;
+    box0.size.width = (float)shortAxis*2;
+    box0.center.y = (float)center.y;
+    box0.center.x = (float)center.x;
+
+    vector<vector<Point> > outlines;
+    findContours(canvas, outlines,  RETR_EXTERNAL,  CHAIN_APPROX_NONE);
+    Mat(outlines[0]).convertTo(pointsMat, CV_32F);
+}
+
+void CV_FitEllipseParallelTest::run_func()
+{
+    box = (CvBox2D)cv::fitEllipse(pointsMat); // fit on the contour stored by generate_point_set
+}
+
+CV_FitEllipseParallelTest::~CV_FitEllipseParallelTest(){
+    pointsMat.release(); // NOTE(review): presumably redundant - Mat's own destructor should free the data; confirm
+}
+
 /****************************************************************************************\
 *                                   FitLine Test                                         *
 \****************************************************************************************/
@@ -1377,7 +1434,7 @@ protected:
 
 CV_FitLineTest::CV_FitLineTest()
 {
-    min_log_size = 5; // for robust ellipse fitting a dozen of points is needed at least
+    min_log_size = 5; // for robust line fitting a dozen of points is needed at least
     max_log_size = 10;
     max_noise = 0.05;
 }
@@ -1866,6 +1923,7 @@ TEST(Imgproc_MinTriangle, accuracy) { CV_MinTriangleTest test; test.safe_run();
 TEST(Imgproc_MinCircle, accuracy) { CV_MinCircleTest test; test.safe_run(); }
 TEST(Imgproc_ContourPerimeter, accuracy) { CV_PerimeterTest test; test.safe_run(); }
 TEST(Imgproc_FitEllipse, accuracy) { CV_FitEllipseTest test; test.safe_run(); }
+TEST(Imgproc_FitEllipse, parallel) { CV_FitEllipseParallelTest test; test.safe_run(); }
 TEST(Imgproc_FitLine, accuracy) { CV_FitLineTest test; test.safe_run(); }
 TEST(Imgproc_ContourMoments, accuracy) { CV_ContourMomentsTest test; test.safe_run(); }
 TEST(Imgproc_ContourPerimeterSlice, accuracy) { CV_PerimeterAreaSliceTest test; test.safe_run(); }
diff --git a/modules/java/CMakeLists.txt b/modules/java/CMakeLists.txt
index 38f410e64c..ce24daf79f 100644
--- a/modules/java/CMakeLists.txt
+++ b/modules/java/CMakeLists.txt
@@ -10,6 +10,7 @@ endif()
 set(the_description "The java bindings")
 ocv_add_module(java BINDINGS opencv_core opencv_imgproc)
 ocv_module_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/generator/src/cpp")
+ocv_module_include_directories("${OpenCV_SOURCE_DIR}/include")
 
 if(NOT ANDROID)
   include_directories(${JNI_INCLUDE_DIRS})
@@ -173,8 +174,8 @@ endforeach()
 file(REMOVE_RECURSE "${probe_dir}")
 
 if(NOT ANDROID)
-  ocv_list_filterout(handwritten_java_sources "/(engine|android)\\\\+")
-  ocv_list_filterout(handwritten_aidl_sources "/(engine|android)\\\\+")
+  ocv_list_filterout(handwritten_java_sources "/(engine3|android)\\\\+")
+  ocv_list_filterout(handwritten_aidl_sources "/(engine3|android)\\\\+")
 else()
   file(GLOB_RECURSE handwrittren_lib_project_files_rel RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/android_lib/" "${CMAKE_CURRENT_SOURCE_DIR}/android_lib/*")
   list(REMOVE_ITEM handwrittren_lib_project_files_rel "${ANDROID_MANIFEST_FILE}")
diff --git a/modules/java/generator/src/java/android+AsyncServiceHelper.java b/modules/java/generator/src/java/android+AsyncServiceHelper.java
index 568f3da170..9882d60b58 100644
--- a/modules/java/generator/src/java/android+AsyncServiceHelper.java
+++ b/modules/java/generator/src/java/android+AsyncServiceHelper.java
@@ -4,7 +4,7 @@ import java.io.File;
 import java.util.StringTokenizer;
 
 import org.opencv.core.Core;
-import org.opencv.engine.OpenCVEngineInterface;
+import org.opencv.engine3.OpenCVEngineInterface;
 
 import android.content.ComponentName;
 import android.content.Context;
@@ -21,8 +21,9 @@ class AsyncServiceHelper
             final LoaderCallbackInterface Callback)
     {
         AsyncServiceHelper helper = new AsyncServiceHelper(Version, AppContext, Callback);
-        if (AppContext.bindService(new Intent("org.opencv.engine.BIND"),
-                helper.mServiceConnection, Context.BIND_AUTO_CREATE))
+        Intent intent = new Intent("org.opencv.engine3.BIND");
+        intent.setPackage("org.opencv.engine3");
+        if (AppContext.bindService(intent, helper.mServiceConnection, Context.BIND_AUTO_CREATE))
         {
             return true;
         }
@@ -76,7 +77,7 @@ class AsyncServiceHelper
                 private LoaderCallbackInterface mUserAppCallback = Callback;
                 public String getPackageName()
                 {
-                    return "OpenCV Manager";
+                    return "OpenCV3 Manager";
                 }
                 public void install() {
                     Log.d(TAG, "Trying to install OpenCV Manager via Google Play");
@@ -122,7 +123,7 @@ class AsyncServiceHelper
                 private LoaderCallbackInterface mUserAppCallback = Callback;
                 public String getPackageName()
                 {
-                    return "OpenCV Manager";
+                    return "OpenCV3 Manager";
                 }
                 public void install()
                 {
@@ -150,7 +151,7 @@ class AsyncServiceHelper
     /**
      *  URL of OpenCV Manager page on Google Play Market.
      */
-    protected static final String OPEN_CV_SERVICE_URL = "market://details?id=org.opencv.engine";
+    protected static final String OPEN_CV_SERVICE_URL = "market://details?id=org.opencv.engine3";
 
     protected ServiceConnection mServiceConnection = new ServiceConnection()
     {
diff --git a/modules/java/generator/src/java/engine+OpenCVEngineInterface.aidl b/modules/java/generator/src/java/engine3+OpenCVEngineInterface.aidl
similarity index 97%
rename from modules/java/generator/src/java/engine+OpenCVEngineInterface.aidl
rename to modules/java/generator/src/java/engine3+OpenCVEngineInterface.aidl
index 21fe5f716b..b84eaaafb0 100644
--- a/modules/java/generator/src/java/engine+OpenCVEngineInterface.aidl
+++ b/modules/java/generator/src/java/engine3+OpenCVEngineInterface.aidl
@@ -1,4 +1,4 @@
-package org.opencv.engine;
+package org.opencv.engine3;
 
 /**
 * Class provides a Java interface for OpenCV Engine Service. It's synchronous with native OpenCVEngine class.
diff --git a/modules/ml/include/opencv2/ml.hpp b/modules/ml/include/opencv2/ml.hpp
index a611583aef..19e26d3bbc 100644
--- a/modules/ml/include/opencv2/ml.hpp
+++ b/modules/ml/include/opencv2/ml.hpp
@@ -297,11 +297,12 @@ public:
         COMPRESSED_INPUT=2,
         PREPROCESSED_INPUT=4
     };
-    CV_WRAP virtual void clear();
 
     /** @brief Returns the number of variables in training samples */
     CV_WRAP virtual int getVarCount() const = 0;
 
+    CV_WRAP virtual bool empty() const;
+
     /** @brief Returns true if the model is trained */
     CV_WRAP virtual bool isTrained() const = 0;
     /** @brief Returns true if the model is classifier */
@@ -347,40 +348,6 @@ public:
      */
     CV_WRAP virtual float predict( InputArray samples, OutputArray results=noArray(), int flags=0 ) const = 0;
 
-    /** @brief Loads model from the file
-
-    This is static template method of StatModel. It's usage is following (in the case of SVM):
-    @code
-    Ptr<SVM> svm = StatModel::load<SVM>("my_svm_model.xml");
-    @endcode
-    In order to make this method work, the derived class must overwrite Algorithm::read(const
-    FileNode& fn).
-     */
-    template<typename _Tp> static Ptr<_Tp> load(const String& filename)
-    {
-        FileStorage fs(filename, FileStorage::READ);
-        Ptr<_Tp> model = _Tp::create();
-        model->read(fs.getFirstTopLevelNode());
-        return model->isTrained() ? model : Ptr<_Tp>();
-    }
-
-    /** @brief Loads model from a String
-
-    @param strModel The string variable containing the model you want to load.
-
-    This is static template method of StatModel. It's usage is following (in the case of SVM):
-    @code
-    Ptr<SVM> svm = StatModel::loadFromString<SVM>(myStringModel);
-    @endcode
-     */
-    template<typename _Tp> static Ptr<_Tp> loadFromString(const String& strModel)
-    {
-        FileStorage fs(strModel, FileStorage::READ + FileStorage::MEMORY);
-        Ptr<_Tp> model = _Tp::create();
-        model->read(fs.getFirstTopLevelNode());
-        return model->isTrained() ? model : Ptr<_Tp>();
-    }
-
     /** @brief Create and train model with default parameters
 
     The class must implement static `create()` method with no parameters or with all default parameter values
@@ -390,14 +357,6 @@ public:
         Ptr<_Tp> model = _Tp::create();
         return !model.empty() && model->train(data, flags) ? model : Ptr<_Tp>();
     }
-
-    /** Saves the model to a file.
-    In order to make this method work, the derived class must implement Algorithm::write(FileStorage& fs). */
-    CV_WRAP virtual void save(const String& filename) const;
-
-    /** Returns model string identifier.
-    This string is used as top level xml/yml node tag when model is saved to a file or string. */
-    CV_WRAP virtual String getDefaultModelName() const = 0;
 };
 
 /****************************************************************************************\
@@ -939,7 +898,7 @@ public:
 
     /** Creates empty %EM model.
     The model should be trained then using StatModel::train(traindata, flags) method. Alternatively, you
-    can use one of the EM::train\* methods or load it from file using StatModel::load\<EM\>(filename).
+    can use one of the EM::train\* methods or load it from file using Algorithm::load\<EM\>(filename).
      */
     CV_WRAP static Ptr<EM> create();
 };
@@ -1127,7 +1086,7 @@ public:
 
     The static method creates empty decision tree with the specified parameters. It should be then
     trained using train method (see StatModel::train). Alternatively, you can load the model from
-    file using StatModel::load\<DTrees\>(filename).
+    file using Algorithm::load\<DTrees\>(filename).
      */
     CV_WRAP static Ptr<DTrees> create();
 };
@@ -1181,7 +1140,7 @@ public:
 
     /** Creates the empty model.
     Use StatModel::train to train the model, StatModel::train to create and train the model,
-    StatModel::load to load the pre-trained model.
+    Algorithm::load to load the pre-trained model.
      */
     CV_WRAP static Ptr<RTrees> create();
 };
@@ -1231,7 +1190,7 @@ public:
     };
 
     /** Creates the empty model.
-    Use StatModel::train to train the model, StatModel::load\<Boost\>(filename) to load the pre-trained model. */
+    Use StatModel::train to train the model, Algorithm::load\<Boost\>(filename) to load the pre-trained model. */
     CV_WRAP static Ptr<Boost> create();
 };
 
@@ -1416,7 +1375,7 @@ public:
 
     /** @brief Creates empty model
 
-    Use StatModel::train to train the model, StatModel::load\<ANN_MLP\>(filename) to load the pre-trained model.
+    Use StatModel::train to train the model, Algorithm::load\<ANN_MLP\>(filename) to load the pre-trained model.
     Note that the train method has optional flags: ANN_MLP::TrainFlags.
      */
     static Ptr<ANN_MLP> create();
diff --git a/modules/ml/src/ann_mlp.cpp b/modules/ml/src/ann_mlp.cpp
index 2b29519cef..89dfa38444 100644
--- a/modules/ml/src/ann_mlp.cpp
+++ b/modules/ml/src/ann_mlp.cpp
@@ -1294,7 +1294,7 @@ public:
         return layer_sizes.empty() ? 0 : layer_sizes[0];
     }
 
-    String getDefaultModelName() const
+    String getDefaultName() const
     {
         return "opencv_ml_ann_mlp";
     }
diff --git a/modules/ml/src/boost.cpp b/modules/ml/src/boost.cpp
index 5694ff1051..d7b6c61dd4 100644
--- a/modules/ml/src/boost.cpp
+++ b/modules/ml/src/boost.cpp
@@ -465,7 +465,7 @@ public:
     CV_WRAP_SAME_PROPERTY(float, RegressionAccuracy, impl.params)
     CV_WRAP_SAME_PROPERTY_S(cv::Mat, Priors, impl.params)
 
-    String getDefaultModelName() const { return "opencv_ml_boost"; }
+    String getDefaultName() const { return "opencv_ml_boost"; }
 
     bool train( const Ptr<TrainData>& trainData, int flags )
     {
diff --git a/modules/ml/src/em.cpp b/modules/ml/src/em.cpp
index c84be84b9c..59c1352e42 100644
--- a/modules/ml/src/em.cpp
+++ b/modules/ml/src/em.cpp
@@ -227,7 +227,7 @@ public:
         return means.cols;
     }
 
-    String getDefaultModelName() const
+    String getDefaultName() const
     {
         return "opencv_ml_em";
     }
diff --git a/modules/ml/src/inner_functions.cpp b/modules/ml/src/inner_functions.cpp
index 561abbaeb8..52f368874d 100644
--- a/modules/ml/src/inner_functions.cpp
+++ b/modules/ml/src/inner_functions.cpp
@@ -50,7 +50,7 @@ ParamGrid::ParamGrid(double _minVal, double _maxVal, double _logStep)
     logStep = std::max(_logStep, 1.);
 }
 
-void StatModel::clear() {}
+bool StatModel::empty() const { return !isTrained(); }
 
 int StatModel::getVarCount() const { return 0; }
 
@@ -111,15 +111,6 @@ float StatModel::calcError( const Ptr<TrainData>& data, bool testerr, OutputArra
     return (float)(err / n * (isclassifier ? 100 : 1));
 }
 
-void StatModel::save(const String& filename) const
-{
-    FileStorage fs(filename, FileStorage::WRITE);
-    fs << getDefaultModelName() << "{";
-    fs << "format" << (int)3;
-    write(fs);
-    fs << "}";
-}
-
 /* Calculates upper triangular matrix S, where A is a symmetrical matrix A=S'*S */
 static void Cholesky( const Mat& A, Mat& S )
 {
diff --git a/modules/ml/src/knearest.cpp b/modules/ml/src/knearest.cpp
index 70e178e6e2..99477cd12d 100644
--- a/modules/ml/src/knearest.cpp
+++ b/modules/ml/src/knearest.cpp
@@ -496,7 +496,7 @@ public:
         return impl->train(data, flags);
     }
 
-    String getDefaultModelName() const { return impl->getModelName(); }
+    String getDefaultName() const { return impl->getModelName(); }
 
 protected:
     void initImpl(int algorithmType)
diff --git a/modules/ml/src/lr.cpp b/modules/ml/src/lr.cpp
index 5a057a205f..24fc29f67c 100644
--- a/modules/ml/src/lr.cpp
+++ b/modules/ml/src/lr.cpp
@@ -104,7 +104,7 @@ public:
     virtual int getVarCount() const { return learnt_thetas.cols; }
     virtual bool isTrained() const { return !learnt_thetas.empty(); }
     virtual bool isClassifier() const { return true; }
-    virtual String getDefaultModelName() const { return "opencv_ml_lr"; }
+    virtual String getDefaultName() const { return "opencv_ml_lr"; }
 protected:
     Mat calc_sigmoid(const Mat& data) const;
     double compute_cost(const Mat& _data, const Mat& _labels, const Mat& _init_theta);
diff --git a/modules/ml/src/nbayes.cpp b/modules/ml/src/nbayes.cpp
index 9fc0d833ba..5ca74acd91 100644
--- a/modules/ml/src/nbayes.cpp
+++ b/modules/ml/src/nbayes.cpp
@@ -443,7 +443,7 @@ public:
     bool isTrained() const { return !avg.empty(); }
     bool isClassifier() const { return true; }
     int getVarCount() const { return nallvars; }
-    String getDefaultModelName() const { return "opencv_ml_nbayes"; }
+    String getDefaultName() const { return "opencv_ml_nbayes"; }
 
     int nallvars;
     Mat var_idx, cls_labels, c;
diff --git a/modules/ml/src/precomp.hpp b/modules/ml/src/precomp.hpp
index 77700a05a2..84821988b6 100644
--- a/modules/ml/src/precomp.hpp
+++ b/modules/ml/src/precomp.hpp
@@ -290,7 +290,7 @@ namespace ml
         virtual ~DTreesImpl();
         virtual void clear();
 
-        String getDefaultModelName() const { return "opencv_ml_dtree"; }
+        String getDefaultName() const { return "opencv_ml_dtree"; }
         bool isTrained() const { return !roots.empty(); }
         bool isClassifier() const { return _isClassifier; }
         int getVarCount() const { return varType.empty() ? 0 : (int)(varType.size() - 1); }
diff --git a/modules/ml/src/rtrees.cpp b/modules/ml/src/rtrees.cpp
index f5e2b21bdb..4da34992dd 100644
--- a/modules/ml/src/rtrees.cpp
+++ b/modules/ml/src/rtrees.cpp
@@ -375,7 +375,7 @@ public:
     RTreesImpl() {}
     virtual ~RTreesImpl() {}
 
-    String getDefaultModelName() const { return "opencv_ml_rtrees"; }
+    String getDefaultName() const { return "opencv_ml_rtrees"; }
 
     bool train( const Ptr<TrainData>& trainData, int flags )
     {
diff --git a/modules/ml/src/svm.cpp b/modules/ml/src/svm.cpp
index 8bed117639..93180856ec 100644
--- a/modules/ml/src/svm.cpp
+++ b/modules/ml/src/svm.cpp
@@ -538,6 +538,8 @@ public:
                 {
                     kr.idx = cache_size;
                     cache_size++;
+                    if (!lru_last)
+                        lru_last = i1+1;
                 }
                 else
                 {
@@ -546,6 +548,8 @@ public:
                     last.idx = -1;
                     lru_cache[last.prev].next = 0;
                     lru_last = last.prev;
+                    last.prev = 0;
+                    last.next = 0;
                 }
                 kernel->calc( sample_count, var_count, samples.ptr<float>(),
                               samples.ptr<float>(i1), lru_cache_data.ptr<Qfloat>(kr.idx) );
@@ -561,6 +565,8 @@ public:
                 else
                     lru_first = kr.next;
             }
+            if (lru_first)
+                lru_cache[lru_first].prev = i1+1;
             kr.next = lru_first;
             kr.prev = 0;
             lru_first = i1+1;
@@ -1669,13 +1675,13 @@ public:
         Mat samples = data->getTrainSamples();
         Mat responses;
         bool is_classification = false;
-        Mat class_labels0 = class_labels;
         int class_count = (int)class_labels.total();
 
         if( svmType == C_SVC || svmType == NU_SVC )
         {
             responses = data->getTrainNormCatResponses();
             class_labels = data->getClassLabels();
+            class_count = (int)class_labels.total();
             is_classification = true;
 
             vector<int> temp_class_labels;
@@ -1755,8 +1761,9 @@ public:
         Mat temp_train_responses(train_sample_count, 1, rtype);
         Mat temp_test_responses;
 
+        // If grid.minVal == grid.maxVal, this will allow one and only one pass through the loop with params.var = grid.minVal.
         #define FOR_IN_GRID(var, grid) \
-            for( params.var = grid.minVal; params.var == grid.minVal || params.var < grid.maxVal; params.var *= grid.logStep )
+            for( params.var = grid.minVal; params.var == grid.minVal || params.var < grid.maxVal; params.var = (grid.minVal == grid.maxVal) ? grid.maxVal + 1 : params.var * grid.logStep )
 
         FOR_IN_GRID(C, C_grid)
         FOR_IN_GRID(gamma, gamma_grid)
@@ -1786,7 +1793,7 @@ public:
                 if( !do_train( temp_train_samples, temp_train_responses ))
                     continue;
 
-                for( i = 0; i < test_sample_count; i++ )
+                for( i = 0; i < train_sample_count; i++ )
                 {
                     j = sidx[(i+start+train_sample_count) % sample_count];
                     memcpy(temp_train_samples.ptr(i), samples.ptr(j), sample_size);
@@ -1814,7 +1821,6 @@ public:
         }
 
         params = best_params;
-        class_labels = class_labels0;
         return do_train( samples, responses );
     }
 
@@ -2008,7 +2014,7 @@ public:
         return var_count;
     }
 
-    String getDefaultModelName() const
+    String getDefaultName() const
     {
         return "opencv_ml_svm";
     }
diff --git a/modules/ml/test/test_emknearestkmeans.cpp b/modules/ml/test/test_emknearestkmeans.cpp
index a079be22f2..65371755ab 100644
--- a/modules/ml/test/test_emknearestkmeans.cpp
+++ b/modules/ml/test/test_emknearestkmeans.cpp
@@ -576,7 +576,7 @@ protected:
         // Read in
         try
         {
-            em = StatModel::load<EM>(filename);
+            em = Algorithm::load<EM>(filename);
         }
         catch(...)
         {
diff --git a/modules/ml/test/test_lr.cpp b/modules/ml/test/test_lr.cpp
index e0da01cfb9..1bca9051df 100644
--- a/modules/ml/test/test_lr.cpp
+++ b/modules/ml/test/test_lr.cpp
@@ -179,7 +179,7 @@ void CV_LRTest_SaveLoad::run( int /*start_from*/ )
     // and load to another
     try
     {
-        Ptr<LogisticRegression> lr2 = StatModel::load<LogisticRegression>(filename);
+        Ptr<LogisticRegression> lr2 = Algorithm::load<LogisticRegression>(filename);
         lr2->predict(tdata->getSamples(), responses2);
         learnt_mat2 = lr2->get_learnt_thetas();
     }
diff --git a/modules/ml/test/test_mltests2.cpp b/modules/ml/test/test_mltests2.cpp
index cfaf0f2491..919fae6ce4 100644
--- a/modules/ml/test/test_mltests2.cpp
+++ b/modules/ml/test/test_mltests2.cpp
@@ -472,19 +472,19 @@ void CV_MLBaseTest::save( const char* filename )
 void CV_MLBaseTest::load( const char* filename )
 {
     if( modelName == CV_NBAYES )
-        model = StatModel::load<NormalBayesClassifier>( filename );
+        model = Algorithm::load<NormalBayesClassifier>( filename );
     else if( modelName == CV_KNEAREST )
-        model = StatModel::load<KNearest>( filename );
+        model = Algorithm::load<KNearest>( filename );
     else if( modelName == CV_SVM )
-        model = StatModel::load<SVM>( filename );
+        model = Algorithm::load<SVM>( filename );
     else if( modelName == CV_ANN )
-        model = StatModel::load<ANN_MLP>( filename );
+        model = Algorithm::load<ANN_MLP>( filename );
     else if( modelName == CV_DTREE )
-        model = StatModel::load<DTrees>( filename );
+        model = Algorithm::load<DTrees>( filename );
     else if( modelName == CV_BOOST )
-        model = StatModel::load<Boost>( filename );
+        model = Algorithm::load<Boost>( filename );
     else if( modelName == CV_RTREES )
-        model = StatModel::load<RTrees>( filename );
+        model = Algorithm::load<RTrees>( filename );
     else
         CV_Error( CV_StsNotImplemented, "invalid stat model name");
 }
diff --git a/modules/ml/test/test_save_load.cpp b/modules/ml/test/test_save_load.cpp
index 606079b818..2d6f144bb9 100644
--- a/modules/ml/test/test_save_load.cpp
+++ b/modules/ml/test/test_save_load.cpp
@@ -190,17 +190,17 @@ protected:
         bool isTree = modelName == CV_BOOST || modelName == CV_DTREE || modelName == CV_RTREES;
         Ptr<StatModel> model;
         if (modelName == CV_BOOST)
-            model = StatModel::load<Boost>(filename);
+            model = Algorithm::load<Boost>(filename);
         else if (modelName == CV_ANN)
-            model = StatModel::load<ANN_MLP>(filename);
+            model = Algorithm::load<ANN_MLP>(filename);
         else if (modelName == CV_DTREE)
-            model = StatModel::load<DTrees>(filename);
+            model = Algorithm::load<DTrees>(filename);
         else if (modelName == CV_NBAYES)
-            model = StatModel::load<NormalBayesClassifier>(filename);
+            model = Algorithm::load<NormalBayesClassifier>(filename);
         else if (modelName == CV_SVM)
-            model = StatModel::load<SVM>(filename);
+            model = Algorithm::load<SVM>(filename);
         else if (modelName == CV_RTREES)
-            model = StatModel::load<RTrees>(filename);
+            model = Algorithm::load<RTrees>(filename);
         if (!model)
         {
             code = cvtest::TS::FAIL_INVALID_TEST_DATA;
@@ -273,11 +273,11 @@ TEST(DISABLED_ML_SVM, linear_save_load)
 {
     Ptr<cv::ml::SVM> svm1, svm2, svm3;
 
-    svm1 = StatModel::load<SVM>("SVM45_X_38-1.xml");
-    svm2 = StatModel::load<SVM>("SVM45_X_38-2.xml");
+    svm1 = Algorithm::load<SVM>("SVM45_X_38-1.xml");
+    svm2 = Algorithm::load<SVM>("SVM45_X_38-2.xml");
     string tname = tempfile("a.xml");
     svm2->save(tname);
-    svm3 = StatModel::load<SVM>(tname);
+    svm3 = Algorithm::load<SVM>(tname);
 
     ASSERT_EQ(svm1->getVarCount(), svm2->getVarCount());
     ASSERT_EQ(svm1->getVarCount(), svm3->getVarCount());
diff --git a/modules/ml/test/test_svmtrainauto.cpp b/modules/ml/test/test_svmtrainauto.cpp
new file mode 100644
index 0000000000..918d2b7117
--- /dev/null
+++ b/modules/ml/test/test_svmtrainauto.cpp
@@ -0,0 +1,89 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+
+using namespace cv;
+using namespace std;
+using cv::ml::SVM;
+using cv::ml::TrainData;
+
+//--------------------------------------------------------------------------------------------
+class CV_SVMTrainAutoTest : public cvtest::BaseTest {
+public:
+    CV_SVMTrainAutoTest() {}
+protected:
+    virtual void run( int start_from );
+};
+
+void CV_SVMTrainAutoTest::run( int /*start_from*/ )
+{
+    int datasize = 100;
+    cv::Mat samples = cv::Mat::zeros( datasize, 2, CV_32FC1 );
+    cv::Mat responses = cv::Mat::zeros( datasize, 1, CV_32S );
+
+    RNG rng(0);
+    for (int i = 0; i < datasize; ++i)
+    {
+        int response = rng.uniform(0, 2);  // Random from {0, 1}.
+        samples.at<float>( i, 0 ) = rng.uniform(0.f, 0.5f) + response * 0.5f;
+        samples.at<float>( i, 1 ) = rng.uniform(0.f, 0.5f) + response * 0.5f;
+        responses.at<int>( i, 0 ) = response;
+    }
+
+    cv::Ptr<TrainData> data = TrainData::create( samples, cv::ml::ROW_SAMPLE, responses );
+    cv::Ptr<SVM> svm = SVM::create();
+    svm->trainAuto( data, 10 );  // 10-fold cross validation.
+
+    float test_data0[2] = {0.25f, 0.25f};
+    cv::Mat test_point0 = cv::Mat( 1, 2, CV_32FC1, test_data0 );
+    float result0 = svm->predict( test_point0 );
+    float test_data1[2] = {0.75f, 0.75f};
+    cv::Mat test_point1 = cv::Mat( 1, 2, CV_32FC1, test_data1 );
+    float result1 = svm->predict( test_point1 );
+
+    if ( fabs( result0 - 0 ) > 0.001 || fabs( result1 - 1 ) > 0.001 )
+    {
+        ts->set_failed_test_info( cvtest::TS::FAIL_BAD_ACCURACY );
+    }
+}
+
+TEST(ML_SVM, trainauto) { CV_SVMTrainAutoTest test; test.safe_run(); }
diff --git a/modules/objdetect/src/cascadedetect.cpp b/modules/objdetect/src/cascadedetect.cpp
index 46a1d60f2d..bc1fedc80b 100644
--- a/modules/objdetect/src/cascadedetect.cpp
+++ b/modules/objdetect/src/cascadedetect.cpp
@@ -618,8 +618,7 @@ Ptr<FeatureEvaluator> HaarEvaluator::clone() const
 void HaarEvaluator::computeChannels(int scaleIdx, InputArray img)
 {
     const ScaleData& s = scaleData->at(scaleIdx);
-    tofs = (int)sbufSize.area();
-    sqofs = hasTiltedFeatures ? tofs*2 : tofs;
+    sqofs = hasTiltedFeatures ? sbufSize.area() * 2 : sbufSize.area();
 
     if (img.isUMat())
     {
@@ -660,6 +659,9 @@ void HaarEvaluator::computeChannels(int scaleIdx, InputArray img)
 
 void HaarEvaluator::computeOptFeatures()
 {
+    if (hasTiltedFeatures)
+        tofs = sbufSize.area();
+
     int sstep = sbufSize.width;
     CV_SUM_OFS( nofs[0], nofs[1], nofs[2], nofs[3], 0, normrect, sstep );
 
diff --git a/modules/photo/src/arrays.hpp b/modules/photo/src/arrays.hpp
index 4aec5f7a1e..cdd59a3286 100644
--- a/modules/photo/src/arrays.hpp
+++ b/modules/photo/src/arrays.hpp
@@ -44,6 +44,9 @@
 #ifndef __OPENCV_DENOISING_ARRAYS_HPP__
 #define __OPENCV_DENOISING_ARRAYS_HPP__
 
+namespace cv
+{
+
 template <class T>
 struct Array2d
 {
@@ -176,4 +179,6 @@ struct Array4d
     }
 };
 
+}
+
 #endif
diff --git a/modules/python/src2/cv2.cpp b/modules/python/src2/cv2.cpp
index ff53d733c4..974545994b 100644
--- a/modules/python/src2/cv2.cpp
+++ b/modules/python/src2/cv2.cpp
@@ -109,12 +109,18 @@ typedef std::vector<std::vector<Point2f> > vector_vector_Point2f;
 typedef std::vector<std::vector<Point3f> > vector_vector_Point3f;
 typedef std::vector<std::vector<DMatch> > vector_vector_DMatch;
 
+#ifdef HAVE_OPENCV_FEATURES2D
 typedef SimpleBlobDetector::Params SimpleBlobDetector_Params;
+#endif
 
+#ifdef HAVE_OPENCV_FLANN
 typedef cvflann::flann_distance_t cvflann_flann_distance_t;
 typedef cvflann::flann_algorithm_t cvflann_flann_algorithm_t;
+#endif
 
+#ifdef HAVE_OPENCV_STITCHING
 typedef Stitcher::Status Status;
+#endif
 
 static PyObject* failmsgp(const char *fmt, ...)
 {
@@ -220,7 +226,7 @@ static bool pyopencv_to(PyObject* o, Mat& m, const ArgInfo info)
 
     if( PyInt_Check(o) )
     {
-        double v[] = {(double)PyInt_AsLong((PyObject*)o), 0., 0., 0.};
+        double v[] = {static_cast<double>(PyInt_AsLong((PyObject*)o)), 0., 0., 0.};
         m = Mat(4, 1, CV_64F, v).clone();
         return true;
     }
@@ -445,11 +451,13 @@ PyObject* pyopencv_from(const bool& value)
     return PyBool_FromLong(value);
 }
 
+#ifdef HAVE_OPENCV_STITCHING
 template<>
 PyObject* pyopencv_from(const Status& value)
 {
     return PyInt_FromLong(value);
 }
+#endif
 
 template<>
 bool pyopencv_to(PyObject* obj, bool& value, const char* name)
@@ -486,6 +494,7 @@ PyObject* pyopencv_from(const int& value)
     return PyInt_FromLong(value);
 }
 
+#ifdef HAVE_OPENCV_FLANN
 template<>
 PyObject* pyopencv_from(const cvflann_flann_algorithm_t& value)
 {
@@ -497,6 +506,7 @@ PyObject* pyopencv_from(const cvflann_flann_distance_t& value)
 {
     return PyInt_FromLong(int(value));
 }
+#endif
 
 template<>
 bool pyopencv_to(PyObject* obj, int& value, const char* name)
@@ -1004,6 +1014,7 @@ PyObject* pyopencv_from(const Moments& m)
                          "nu30", m.nu30, "nu21", m.nu21, "nu12", m.nu12, "nu03", m.nu03);
 }
 
+#ifdef HAVE_OPENCV_FLANN
 template<>
 bool pyopencv_to(PyObject *o, cv::flann::IndexParams& p, const char *name)
 {
@@ -1057,6 +1068,7 @@ bool pyopencv_to(PyObject* obj, cv::flann::SearchParams & value, const char * na
 {
     return pyopencv_to<cv::flann::IndexParams>(obj, value, name);
 }
+#endif
 
 template <typename T>
 bool pyopencv_to(PyObject *o, Ptr<T>& p, const char *name)
@@ -1065,6 +1077,7 @@ bool pyopencv_to(PyObject *o, Ptr<T>& p, const char *name)
     return pyopencv_to(o, *p, name);
 }
 
+#ifdef HAVE_OPENCV_FLANN
 template<>
 bool pyopencv_to(PyObject *o, cvflann::flann_distance_t& dist, const char *name)
 {
@@ -1073,6 +1086,7 @@ bool pyopencv_to(PyObject *o, cvflann::flann_distance_t& dist, const char *name)
     dist = (cvflann::flann_distance_t)d;
     return ok;
 }
+#endif
 
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -1120,6 +1134,7 @@ static void OnMouse(int event, int x, int y, int flags, void* param)
     PyGILState_Release(gstate);
 }
 
+#ifdef HAVE_OPENCV_HIGHGUI
 static PyObject *pycvSetMouseCallback(PyObject*, PyObject *args, PyObject *kw)
 {
     const char *keywords[] = { "window_name", "on_mouse", "param", NULL };
@@ -1139,6 +1154,7 @@ static PyObject *pycvSetMouseCallback(PyObject*, PyObject *args, PyObject *kw)
     ERRWRAP2(setMouseCallback(name, OnMouse, Py_BuildValue("OO", on_mouse, param)));
     Py_RETURN_NONE;
 }
+#endif
 
 static void OnChange(int pos, void *param)
 {
@@ -1154,6 +1170,7 @@ static void OnChange(int pos, void *param)
     PyGILState_Release(gstate);
 }
 
+#ifdef HAVE_OPENCV_HIGHGUI
 static PyObject *pycvCreateTrackbar(PyObject*, PyObject *args)
 {
     PyObject *on_change;
@@ -1171,6 +1188,7 @@ static PyObject *pycvCreateTrackbar(PyObject*, PyObject *args)
     ERRWRAP2(createTrackbar(trackbar_name, window_name, value, count, OnChange, Py_BuildValue("OO", on_change, Py_None)));
     Py_RETURN_NONE;
 }
+#endif
 
 ///////////////////////////////////////////////////////////////////////////////////////
 
@@ -1200,8 +1218,10 @@ static int convert_to_char(PyObject *o, char *dst, const char *name = "no_name")
 #include "pyopencv_generated_funcs.h"
 
 static PyMethodDef special_methods[] = {
+#ifdef HAVE_OPENCV_HIGHGUI
   {"createTrackbar", pycvCreateTrackbar, METH_VARARGS, "createTrackbar(trackbarName, windowName, value, count, onChange) -> None"},
   {"setMouseCallback", (PyCFunction)pycvSetMouseCallback, METH_VARARGS | METH_KEYWORDS, "setMouseCallback(windowName, onMouse [, param]) -> None"},
+#endif
   {NULL, NULL},
 };
 
diff --git a/modules/stitching/include/opencv2/stitching/detail/seam_finders.hpp b/modules/stitching/include/opencv2/stitching/detail/seam_finders.hpp
index 37029215e3..4ff22c40c0 100644
--- a/modules/stitching/include/opencv2/stitching/detail/seam_finders.hpp
+++ b/modules/stitching/include/opencv2/stitching/detail/seam_finders.hpp
@@ -106,6 +106,8 @@ protected:
 class CV_EXPORTS VoronoiSeamFinder : public PairwiseSeamFinder
 {
 public:
+    virtual void find(const std::vector<UMat> &src, const std::vector<Point> &corners,
+                      std::vector<UMat> &masks);
     virtual void find(const std::vector<Size> &size, const std::vector<Point> &corners,
                       std::vector<UMat> &masks);
 private:
diff --git a/modules/stitching/include/opencv2/stitching/detail/warpers.hpp b/modules/stitching/include/opencv2/stitching/detail/warpers.hpp
index 19dff8e1f0..ee8e824cbf 100644
--- a/modules/stitching/include/opencv2/stitching/detail/warpers.hpp
+++ b/modules/stitching/include/opencv2/stitching/detail/warpers.hpp
@@ -186,14 +186,18 @@ public:
      */
     PlaneWarper(float scale = 1.f) { projector_.scale = scale; }
 
+    Point2f warpPoint(const Point2f &pt, InputArray K, InputArray R);
     Point2f warpPoint(const Point2f &pt, InputArray K, InputArray R, InputArray T);
 
     virtual Rect buildMaps(Size src_size, InputArray K, InputArray R, InputArray T, OutputArray xmap, OutputArray ymap);
     Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap);
 
+    Point warp(InputArray src, InputArray K, InputArray R,
+               int interp_mode, int border_mode, OutputArray dst);
     virtual Point warp(InputArray src, InputArray K, InputArray R, InputArray T, int interp_mode, int border_mode,
                OutputArray dst);
 
+    Rect warpRoi(Size src_size, InputArray K, InputArray R);
     Rect warpRoi(Size src_size, InputArray K, InputArray R, InputArray T);
 
 protected:
diff --git a/modules/stitching/src/autocalib.cpp b/modules/stitching/src/autocalib.cpp
index 56a9df57b8..91244bde15 100644
--- a/modules/stitching/src/autocalib.cpp
+++ b/modules/stitching/src/autocalib.cpp
@@ -49,7 +49,7 @@ namespace {
 template<typename _Tp> static inline bool
 decomposeCholesky(_Tp* A, size_t astep, int m)
 {
-    if (!Cholesky(A, astep, m, 0, 0, 0))
+    if (!hal::Cholesky(A, astep, m, 0, 0, 0))
         return false;
     astep /= sizeof(A[0]);
     for (int i = 0; i < m; ++i)
diff --git a/modules/stitching/src/seam_finders.cpp b/modules/stitching/src/seam_finders.cpp
index 8a673ede0c..fc91135fa6 100644
--- a/modules/stitching/src/seam_finders.cpp
+++ b/modules/stitching/src/seam_finders.cpp
@@ -82,6 +82,11 @@ void PairwiseSeamFinder::run()
     }
 }
 
+void VoronoiSeamFinder::find(const std::vector<UMat> &src, const std::vector<Point> &corners,
+                             std::vector<UMat> &masks)
+{
+    PairwiseSeamFinder::find(src, corners, masks);
+}
 
 void VoronoiSeamFinder::find(const std::vector<Size> &sizes, const std::vector<Point> &corners,
                              std::vector<UMat> &masks)
diff --git a/modules/stitching/src/warpers.cpp b/modules/stitching/src/warpers.cpp
index 4b6185f4e6..141fdec357 100644
--- a/modules/stitching/src/warpers.cpp
+++ b/modules/stitching/src/warpers.cpp
@@ -87,6 +87,13 @@ Point2f PlaneWarper::warpPoint(const Point2f &pt, InputArray K, InputArray R, In
     return uv;
 }
 
+Point2f PlaneWarper::warpPoint(const Point2f &pt, InputArray K, InputArray R)
+{
+    float tz[] = {0.f, 0.f, 0.f};
+    Mat_<float> T(3, 1, tz);
+    return warpPoint(pt, K, R, T);
+}
+
 Rect PlaneWarper::buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap)
 {
     return buildMaps(src_size, K, R, Mat::zeros(3, 1, CV_32FC1), xmap, ymap);
@@ -155,6 +162,13 @@ Point PlaneWarper::warp(InputArray src, InputArray K, InputArray R, InputArray T
     return dst_roi.tl();
 }
 
+Point PlaneWarper::warp(InputArray src, InputArray K, InputArray R,
+                        int interp_mode, int border_mode, OutputArray dst)
+{
+    float tz[] = {0.f, 0.f, 0.f};
+    Mat_<float> T(3, 1, tz);
+    return warp(src, K, R, T, interp_mode, border_mode, dst);
+}
 
 Rect PlaneWarper::warpRoi(Size src_size, InputArray K, InputArray R, InputArray T)
 {
@@ -166,6 +180,13 @@ Rect PlaneWarper::warpRoi(Size src_size, InputArray K, InputArray R, InputArray
     return Rect(dst_tl, Point(dst_br.x + 1, dst_br.y + 1));
 }
 
+Rect PlaneWarper::warpRoi(Size src_size, InputArray K, InputArray R)
+{
+    float tz[] = {0.f, 0.f, 0.f};
+    Mat_<float> T(3, 1, tz);
+    return warpRoi(src_size, K, R, T);
+}
+
 
 void PlaneWarper::detectResultRoi(Size src_size, Point &dst_tl, Point &dst_br)
 {
diff --git a/modules/superres/src/frame_source.cpp b/modules/superres/src/frame_source.cpp
index 216e869c14..f7b9685ab0 100644
--- a/modules/superres/src/frame_source.cpp
+++ b/modules/superres/src/frame_source.cpp
@@ -126,7 +126,7 @@ namespace
         else
         {
             // should never get here
-            CV_Assert(0);
+            CV_Error(Error::StsBadArg, "Failed to detect input frame kind" );
         }
     }
 
diff --git a/modules/superres/src/optical_flow.cpp b/modules/superres/src/optical_flow.cpp
index df6725b72b..25a10af3f8 100644
--- a/modules/superres/src/optical_flow.cpp
+++ b/modules/superres/src/optical_flow.cpp
@@ -224,7 +224,7 @@ namespace
 
     void Farneback::impl(InputArray input0, InputArray input1, OutputArray dst)
     {
-        calcOpticalFlowFarneback(input0, input1, (InputOutputArray)dst, pyrScale_,
+        calcOpticalFlowFarneback(input0, input1, InputOutputArray(dst), pyrScale_,
                                  numLevels_, winSize_, numIters_,
                                  polyN_, polySigma_, flags_);
     }
diff --git a/modules/ts/src/ts_perf.cpp b/modules/ts/src/ts_perf.cpp
index f2eae265ac..f5ba1d81e6 100644
--- a/modules/ts/src/ts_perf.cpp
+++ b/modules/ts/src/ts_perf.cpp
@@ -438,9 +438,9 @@ static int countViolations(const cv::Mat& expected, const cv::Mat& actual, const
 
     if (v > 0 && max_violation != 0 && max_allowed != 0)
     {
-        int loc[10];
+        int loc[10] = {0};
         cv::minMaxIdx(maximum, 0, max_allowed, 0, loc, mask);
-        *max_violation = diff64f.at<double>(loc[1], loc[0]);
+        *max_violation = diff64f.at<double>(loc[0], loc[1]);
     }
 
     return v;
diff --git a/modules/videoio/include/opencv2/videoio.hpp b/modules/videoio/include/opencv2/videoio.hpp
index ae958bcee8..b0245b1df0 100644
--- a/modules/videoio/include/opencv2/videoio.hpp
+++ b/modules/videoio/include/opencv2/videoio.hpp
@@ -528,27 +528,27 @@ public:
     /** @brief Sets a property in the VideoCapture.
 
     @param propId Property identifier. It can be one of the following:
-     -   **CV_CAP_PROP_POS_MSEC** Current position of the video file in milliseconds.
-     -   **CV_CAP_PROP_POS_FRAMES** 0-based index of the frame to be decoded/captured next.
-     -   **CV_CAP_PROP_POS_AVI_RATIO** Relative position of the video file: 0 - start of the
+     -   **CAP_PROP_POS_MSEC** Current position of the video file in milliseconds.
+     -   **CAP_PROP_POS_FRAMES** 0-based index of the frame to be decoded/captured next.
+     -   **CAP_PROP_POS_AVI_RATIO** Relative position of the video file: 0 - start of the
          film, 1 - end of the film.
-     -   **CV_CAP_PROP_FRAME_WIDTH** Width of the frames in the video stream.
-     -   **CV_CAP_PROP_FRAME_HEIGHT** Height of the frames in the video stream.
-     -   **CV_CAP_PROP_FPS** Frame rate.
-     -   **CV_CAP_PROP_FOURCC** 4-character code of codec.
-     -   **CV_CAP_PROP_FRAME_COUNT** Number of frames in the video file.
-     -   **CV_CAP_PROP_FORMAT** Format of the Mat objects returned by retrieve() .
-     -   **CV_CAP_PROP_MODE** Backend-specific value indicating the current capture mode.
-     -   **CV_CAP_PROP_BRIGHTNESS** Brightness of the image (only for cameras).
-     -   **CV_CAP_PROP_CONTRAST** Contrast of the image (only for cameras).
-     -   **CV_CAP_PROP_SATURATION** Saturation of the image (only for cameras).
-     -   **CV_CAP_PROP_HUE** Hue of the image (only for cameras).
-     -   **CV_CAP_PROP_GAIN** Gain of the image (only for cameras).
-     -   **CV_CAP_PROP_EXPOSURE** Exposure (only for cameras).
-     -   **CV_CAP_PROP_CONVERT_RGB** Boolean flags indicating whether images should be converted
+     -   **CAP_PROP_FRAME_WIDTH** Width of the frames in the video stream.
+     -   **CAP_PROP_FRAME_HEIGHT** Height of the frames in the video stream.
+     -   **CAP_PROP_FPS** Frame rate.
+     -   **CAP_PROP_FOURCC** 4-character code of codec.
+     -   **CAP_PROP_FRAME_COUNT** Number of frames in the video file.
+     -   **CAP_PROP_FORMAT** Format of the Mat objects returned by retrieve() .
+     -   **CAP_PROP_MODE** Backend-specific value indicating the current capture mode.
+     -   **CAP_PROP_BRIGHTNESS** Brightness of the image (only for cameras).
+     -   **CAP_PROP_CONTRAST** Contrast of the image (only for cameras).
+     -   **CAP_PROP_SATURATION** Saturation of the image (only for cameras).
+     -   **CAP_PROP_HUE** Hue of the image (only for cameras).
+     -   **CAP_PROP_GAIN** Gain of the image (only for cameras).
+     -   **CAP_PROP_EXPOSURE** Exposure (only for cameras).
+     -   **CAP_PROP_CONVERT_RGB** Boolean flags indicating whether images should be converted
          to RGB.
-     -   **CV_CAP_PROP_WHITE_BALANCE** Currently unsupported
-     -   **CV_CAP_PROP_RECTIFICATION** Rectification flag for stereo cameras (note: only supported
+     -   **CAP_PROP_WHITE_BALANCE** Currently unsupported
+     -   **CAP_PROP_RECTIFICATION** Rectification flag for stereo cameras (note: only supported
          by DC1394 v 2.x backend currently)
     @param value Value of the property.
      */
@@ -557,31 +557,31 @@ public:
     /** @brief Returns the specified VideoCapture property
 
     @param propId Property identifier. It can be one of the following:
-     -   **CV_CAP_PROP_POS_MSEC** Current position of the video file in milliseconds or video
+     -   **CAP_PROP_POS_MSEC** Current position of the video file in milliseconds or video
          capture timestamp.
-     -   **CV_CAP_PROP_POS_FRAMES** 0-based index of the frame to be decoded/captured next.
-     -   **CV_CAP_PROP_POS_AVI_RATIO** Relative position of the video file: 0 - start of the
+     -   **CAP_PROP_POS_FRAMES** 0-based index of the frame to be decoded/captured next.
+     -   **CAP_PROP_POS_AVI_RATIO** Relative position of the video file: 0 - start of the
          film, 1 - end of the film.
-     -   **CV_CAP_PROP_FRAME_WIDTH** Width of the frames in the video stream.
-     -   **CV_CAP_PROP_FRAME_HEIGHT** Height of the frames in the video stream.
-     -   **CV_CAP_PROP_FPS** Frame rate.
-     -   **CV_CAP_PROP_FOURCC** 4-character code of codec.
-     -   **CV_CAP_PROP_FRAME_COUNT** Number of frames in the video file.
-     -   **CV_CAP_PROP_FORMAT** Format of the Mat objects returned by retrieve() .
-     -   **CV_CAP_PROP_MODE** Backend-specific value indicating the current capture mode.
-     -   **CV_CAP_PROP_BRIGHTNESS** Brightness of the image (only for cameras).
-     -   **CV_CAP_PROP_CONTRAST** Contrast of the image (only for cameras).
-     -   **CV_CAP_PROP_SATURATION** Saturation of the image (only for cameras).
-     -   **CV_CAP_PROP_HUE** Hue of the image (only for cameras).
-     -   **CV_CAP_PROP_GAIN** Gain of the image (only for cameras).
-     -   **CV_CAP_PROP_EXPOSURE** Exposure (only for cameras).
-     -   **CV_CAP_PROP_CONVERT_RGB** Boolean flags indicating whether images should be converted
+     -   **CAP_PROP_FRAME_WIDTH** Width of the frames in the video stream.
+     -   **CAP_PROP_FRAME_HEIGHT** Height of the frames in the video stream.
+     -   **CAP_PROP_FPS** Frame rate.
+     -   **CAP_PROP_FOURCC** 4-character code of codec.
+     -   **CAP_PROP_FRAME_COUNT** Number of frames in the video file.
+     -   **CAP_PROP_FORMAT** Format of the Mat objects returned by retrieve() .
+     -   **CAP_PROP_MODE** Backend-specific value indicating the current capture mode.
+     -   **CAP_PROP_BRIGHTNESS** Brightness of the image (only for cameras).
+     -   **CAP_PROP_CONTRAST** Contrast of the image (only for cameras).
+     -   **CAP_PROP_SATURATION** Saturation of the image (only for cameras).
+     -   **CAP_PROP_HUE** Hue of the image (only for cameras).
+     -   **CAP_PROP_GAIN** Gain of the image (only for cameras).
+     -   **CAP_PROP_EXPOSURE** Exposure (only for cameras).
+     -   **CAP_PROP_CONVERT_RGB** Boolean flags indicating whether images should be converted
          to RGB.
-     -   **CV_CAP_PROP_WHITE_BALANCE** Currently not supported
-     -   **CV_CAP_PROP_RECTIFICATION** Rectification flag for stereo cameras (note: only supported
+     -   **CAP_PROP_WHITE_BALANCE** Currently not supported
+     -   **CAP_PROP_RECTIFICATION** Rectification flag for stereo cameras (note: only supported
          by DC1394 v 2.x backend currently)
 
-    **Note**: When querying a property that is not supported by the backend used by the VideoCapture
+    @note When querying a property that is not supported by the backend used by the VideoCapture
     class, value 0 is returned.
      */
     CV_WRAP virtual double get(int propId) const;
@@ -659,7 +659,7 @@ public:
      -   **VIDEOWRITER_PROP_QUALITY** Current quality of the encoded videostream.
      -   **VIDEOWRITER_PROP_FRAMEBYTES** (Read-only) Size of just encoded video frame; note that the encoding order may be different from representation order.
 
-     **Note**: When querying a property that is not supported by the backend used by the VideoWriter
+     @note When querying a property that is not supported by the backend used by the VideoWriter
      class, value 0 is returned.
      */
     CV_WRAP virtual double get(int propId) const;
diff --git a/modules/videoio/src/cap.cpp b/modules/videoio/src/cap.cpp
index b5a44da3f7..1fedb0b66d 100644
--- a/modules/videoio/src/cap.cpp
+++ b/modules/videoio/src/cap.cpp
@@ -552,6 +552,20 @@ static Ptr<IVideoCapture> IVideoCapture_create(int index)
 }
 
 
+static Ptr<IVideoCapture> IVideoCapture_create(const String& filename)
+{
+    Ptr<IVideoCapture> capture;
+    // try the Motion JPEG capture; it is kept only if it reports the file as opened
+    capture = createMotionJpegCapture(filename);
+    if (capture && capture->isOpened())
+    {
+        return capture;
+    }
+
+    // failed to open the file: return an empty Ptr
+    return Ptr<IVideoCapture>();
+}
+
 static Ptr<IVideoWriter> IVideoWriter_create(const String& filename, int _fourcc, double fps, Size frameSize, bool isColor)
 {
     Ptr<IVideoWriter> iwriter;
@@ -582,6 +596,10 @@ VideoCapture::~VideoCapture()
 bool VideoCapture::open(const String& filename)
 {
     if (isOpened()) release();
+    icap = IVideoCapture_create(filename);
+    if (!icap.empty())
+        return true;
+    // IVideoCapture_create returned an empty Ptr: fall back to cvCreateFileCapture
     cap.reset(cvCreateFileCapture(filename.c_str()));
     return isOpened();
 }
diff --git a/modules/videoio/src/cap_ffmpeg_impl.hpp b/modules/videoio/src/cap_ffmpeg_impl.hpp
index 5e371d21ee..f49301a8a8 100644
--- a/modules/videoio/src/cap_ffmpeg_impl.hpp
+++ b/modules/videoio/src/cap_ffmpeg_impl.hpp
@@ -657,13 +657,13 @@ bool CvCapture_FFMPEG::grabFrame()
         frame_number > ic->streams[video_stream]->nb_frames )
         return false;
 
-    av_free_packet (&packet);
-
     picture_pts = AV_NOPTS_VALUE_;
 
     // get the next frame
     while (!valid)
     {
+
+        av_free_packet (&packet);
         int ret = av_read_frame(ic, &packet);
         if (ret == AVERROR(EAGAIN)) continue;
 
@@ -706,8 +706,6 @@ bool CvCapture_FFMPEG::grabFrame()
             if (count_errs > max_number_of_attempts)
                 break;
         }
-
-        av_free_packet (&packet);
     }
 
     if( valid && first_frame_number < 0 )
@@ -788,7 +786,9 @@ double CvCapture_FFMPEG::getProperty( int property_id ) const
     case CV_FFMPEG_CAP_PROP_FRAME_HEIGHT:
         return (double)frame.height;
     case CV_FFMPEG_CAP_PROP_FPS:
-#if LIBAVCODEC_BUILD > 4753
+#if LIBAVCODEC_BUILD >= CALC_FFMPEG_VERSION(54, 1, 0)
+        return av_q2d(video_st->avg_frame_rate);
+#elif LIBAVCODEC_BUILD > 4753
         return av_q2d(video_st->r_frame_rate);
 #else
         return (double)video_st->codec.frame_rate
@@ -836,7 +836,11 @@ int CvCapture_FFMPEG::get_bitrate() const
 
 double CvCapture_FFMPEG::get_fps() const
 {
+#if LIBAVCODEC_BUILD >= CALC_FFMPEG_VERSION(54, 1, 0)
+    double fps = r2d(ic->streams[video_stream]->avg_frame_rate);
+#else
     double fps = r2d(ic->streams[video_stream]->r_frame_rate);
+#endif
 
 #if LIBAVFORMAT_BUILD >= CALC_FFMPEG_VERSION(52, 111, 0)
     if (fps < eps_zero)
@@ -997,6 +1001,7 @@ struct CvVideoWriter_FFMPEG
     int               input_pix_fmt;
     Image_FFMPEG      temp_image;
     int               frame_width, frame_height;
+    int               frame_idx;
     bool              ok;
     struct SwsContext *img_convert_ctx;
 };
@@ -1074,6 +1079,7 @@ void CvVideoWriter_FFMPEG::init()
     memset(&temp_image, 0, sizeof(temp_image));
     img_convert_ctx = 0;
     frame_width = frame_height = 0;
+    frame_idx = 0;
     ok = false;
 }
 
@@ -1225,7 +1231,7 @@ static AVStream *icv_add_video_stream_FFMPEG(AVFormatContext *oc,
      and qmin since they will be set to reasonable defaults by the libx264
      preset system. Also, use a crf encode with the default quality rating,
      this seems easier than finding an appropriate default bitrate. */
-    if (c->codec_id == CODEC_ID_H264) {
+    if (c->codec_id == AV_CODEC_ID_H264) {
       c->gop_size = -1;
       c->qmin = -1;
       c->bit_rate = 0;
@@ -1246,15 +1252,20 @@ static AVStream *icv_add_video_stream_FFMPEG(AVFormatContext *oc,
 
 static const int OPENCV_NO_FRAMES_WRITTEN_CODE = 1000;
 
-static int icv_av_write_frame_FFMPEG( AVFormatContext * oc, AVStream * video_st, uint8_t * outbuf, uint32_t outbuf_size, AVFrame * picture )
+static int icv_av_write_frame_FFMPEG( AVFormatContext * oc, AVStream * video_st,
+#if LIBAVCODEC_BUILD >= CALC_FFMPEG_VERSION(54, 1, 0)
+                                      uint8_t *, uint32_t,
+#else
+                                      uint8_t * outbuf, uint32_t outbuf_size,
+#endif
+                                      AVFrame * picture )
 {
 #if LIBAVFORMAT_BUILD > 4628
     AVCodecContext * c = video_st->codec;
 #else
     AVCodecContext * c = &(video_st->codec);
 #endif
-    int out_size;
-    int ret = 0;
+    int ret = OPENCV_NO_FRAMES_WRITTEN_CODE;
 
     if (oc->oformat->flags & AVFMT_RAWPICTURE) {
         /* raw video case. The API will change slightly in the near
@@ -1274,12 +1285,32 @@ static int icv_av_write_frame_FFMPEG( AVFormatContext * oc, AVStream * video_st,
         ret = av_write_frame(oc, &pkt);
     } else {
         /* encode the image */
-        out_size = avcodec_encode_video(c, outbuf, outbuf_size, picture);
+        AVPacket pkt;
+        av_init_packet(&pkt);
+#if LIBAVCODEC_BUILD >= CALC_FFMPEG_VERSION(54, 1, 0)
+        int got_output = 0;
+        pkt.data = NULL;
+        pkt.size = 0;
+        ret = avcodec_encode_video2(c, &pkt, picture, &got_output);
+        if (ret < 0)
+            ;
+        else if (got_output) {
+            if (pkt.pts != (int64_t)AV_NOPTS_VALUE)
+                pkt.pts = av_rescale_q(pkt.pts, c->time_base, video_st->time_base);
+            if (pkt.dts != (int64_t)AV_NOPTS_VALUE)
+                pkt.dts = av_rescale_q(pkt.dts, c->time_base, video_st->time_base);
+            if (pkt.duration)
+                pkt.duration = av_rescale_q(pkt.duration, c->time_base, video_st->time_base);
+            pkt.stream_index= video_st->index;
+            ret = av_write_frame(oc, &pkt);
+            av_free_packet(&pkt);
+        }
+        else
+            ret = OPENCV_NO_FRAMES_WRITTEN_CODE;
+#else
+        int out_size = avcodec_encode_video(c, outbuf, outbuf_size, picture);
         /* if zero size, it means the image was buffered */
         if (out_size > 0) {
-            AVPacket pkt;
-            av_init_packet(&pkt);
-
 #if LIBAVFORMAT_BUILD > 4752
             if(c->coded_frame->pts != (int64_t)AV_NOPTS_VALUE)
                 pkt.pts = av_rescale_q(c->coded_frame->pts, c->time_base, video_st->time_base);
@@ -1294,9 +1325,8 @@ static int icv_av_write_frame_FFMPEG( AVFormatContext * oc, AVStream * video_st,
 
             /* write the compressed frame in the media file */
             ret = av_write_frame(oc, &pkt);
-        } else {
-            ret = OPENCV_NO_FRAMES_WRITTEN_CODE;
         }
+#endif
     }
     return ret;
 }
@@ -1403,7 +1433,9 @@ bool CvVideoWriter_FFMPEG::writeFrame( const unsigned char* data, int step, int
                        (PixelFormat)input_pix_fmt, width, height);
     }
 
+    picture->pts = frame_idx;
     ret = icv_av_write_frame_FFMPEG( oc, video_st, outbuf, outbuf_size, picture) >= 0;
+    frame_idx++;
 
     return ret;
 }
@@ -1715,6 +1747,7 @@ bool CvVideoWriter_FFMPEG::open( const char * filename, int fourcc,
     }
     frame_width = width;
     frame_height = height;
+    frame_idx = 0;
     ok = true;
 
     return true;
diff --git a/modules/videoio/src/cap_mjpeg_decoder.cpp b/modules/videoio/src/cap_mjpeg_decoder.cpp
index 11a86b2ab3..53557fa9de 100644
--- a/modules/videoio/src/cap_mjpeg_decoder.cpp
+++ b/modules/videoio/src/cap_mjpeg_decoder.cpp
@@ -40,13 +40,881 @@
 //M*/
 
 #include "precomp.hpp"
+#include <deque>
+#include <stdint.h>
 
 namespace cv
 {
 
-Ptr<IVideoCapture> createMotionJpegCapture(const String&)
+const uint32_t RIFF_CC = CV_FOURCC('R','I','F','F'); // four-character codes of the RIFF/AVI elements handled in this file
+const uint32_t LIST_CC = CV_FOURCC('L','I','S','T'); // id expected at the start of every list element
+const uint32_t HDRL_CC = CV_FOURCC('h','d','r','l'); // list type: headers
+const uint32_t AVIH_CC = CV_FOURCC('a','v','i','h'); // chunk: main AVI header
+const uint32_t STRL_CC = CV_FOURCC('s','t','r','l'); // list type: one per stream
+const uint32_t STRH_CC = CV_FOURCC('s','t','r','h'); // chunk: stream header
+const uint32_t VIDS_CC = CV_FOURCC('v','i','d','s'); // stream header fccType of a video stream
+const uint32_t MJPG_CC = CV_FOURCC('M','J','P','G'); // stream header fccHandler of an MJPEG stream
+const uint32_t MOVI_CC = CV_FOURCC('m','o','v','i'); // list type: frame data
+const uint32_t IDX1_CC = CV_FOURCC('i','d','x','1'); // chunk: AVI index
+const uint32_t AVI_CC  = CV_FOURCC('A','V','I',' ');
+const uint32_t AVIX_CC = CV_FOURCC('A','V','I','X');
+const uint32_t JUNK_CC = CV_FOURCC('J','U','N','K'); // padding that must be ignored
+const uint32_t INFO_CC = CV_FOURCC('I','N','F','O'); // list type: optional INFO section
+
+String fourccToString(uint32_t fourcc);
+//Formats a FOURCC as four characters, taking the lowest byte first.
+String fourccToString(uint32_t fourcc)
+{
+    return format("%c%c%c%c", fourcc & 255, (fourcc >> 8) & 255, (fourcc >> 16) & 255, (fourcc >> 24) & 255);
+}
+
+#ifndef DWORD // NOTE(review): #ifndef only detects macros; a typedef named DWORD would not be seen - confirm there is no clash on Windows
+typedef uint32_t DWORD;
+#endif
+#ifndef WORD // same caveat as for DWORD
+typedef uint16_t WORD;
+#endif
+#ifndef LONG // same caveat as for DWORD
+typedef int32_t  LONG;
+#endif
+
+#pragma pack(push, 1)
+struct AviMainHeader             // Payload of the 'avih' chunk; filled by reading sizeof(AviMainHeader) bytes from the file
+{
+    DWORD dwMicroSecPerFrame;    //  The period between video frames
+    DWORD dwMaxBytesPerSec;      //  Maximum data rate of the file
+    DWORD dwReserved1;           // 0
+    DWORD dwFlags;               //  0x10 AVIF_HASINDEX: The AVI file has an idx1 chunk containing an index at the end of the file.
+    DWORD dwTotalFrames;         // Total number of frames of data in the file.
+    DWORD dwInitialFrames;       // Is used for interleaved files
+    DWORD dwStreams;             // Specifies the number of streams in the file.
+    DWORD dwSuggestedBufferSize; // Suggested buffer size for reading the file
+    DWORD dwWidth;               // Width of the AVI file in pixels.
+    DWORD dwHeight;              // Height of the AVI file in pixels.
+    DWORD dwReserved[4];         // 0, 0, 0, 0
+};
+
+struct AviStreamHeader           // Payload of the 'strh' chunk; filled by reading sizeof(AviStreamHeader) bytes from the file
+{
+    uint32_t fccType;              // 'vids', 'auds', 'txts'...
+    uint32_t fccHandler;           // "cvid", "DIB "
+    DWORD dwFlags;               // 0
+    DWORD dwPriority;            // 0
+    DWORD dwInitialFrames;       // 0
+    DWORD dwScale;               // 1 (divisor of dwRate when the frame rate is computed)
+    DWORD dwRate;                // Fps (dwRate - frame rate for video streams)
+    DWORD dwStart;               // 0
+    DWORD dwLength;              // Frames number (playing time of AVI file as defined by scale and rate)
+    DWORD dwSuggestedBufferSize; // For reading the stream
+    DWORD dwQuality;             // -1 (encoding quality. If set to -1, drivers use the default quality value)
+    DWORD dwSampleSize;          // 0 means that each frame is in its own chunk
+    struct {
+        short int left;
+        short int top;
+        short int right;
+        short int bottom;
+    } rcFrame;                // If stream has a different size than dwWidth*dwHeight(unused)
+};
+
+struct AviIndex                  // One entry of the 'idx1' chunk; filled by reading sizeof(AviIndex) bytes from the file
+{
+    DWORD ckid;                  // chunk id; compared with the id of the selected video stream
+    DWORD dwFlags;
+    DWORD dwChunkOffset;         // added to the start of the 'movi' section to get the absolute frame position
+    DWORD dwChunkLength;         // stored as the frame length in the frame list
+};
+
+struct BitmapInfoHeader          // Bitmap info header; filled by reading sizeof(BitmapInfoHeader) bytes from the file
+{
+    DWORD biSize;                // Write header size of BITMAPINFO header structure
+    LONG  biWidth;               // width in pixels
+    LONG  biHeight;              // height in pixels
+    WORD  biPlanes;              // Number of color planes in which the data is stored
+    WORD  biBitCount;            // Number of bits per pixel
+    DWORD biCompression;         // Type of compression used (uncompressed: NO_COMPRESSION=0)
+    DWORD biSizeImage;           // Image Buffer. Quicktime needs 3 bytes also for 8-bit png
+                                 //   (biCompression==NO_COMPRESSION)?0:xDim*yDim*bytesPerPixel;
+    LONG  biXPelsPerMeter;       // Horizontal resolution in pixels per meter
+    LONG  biYPelsPerMeter;       // Vertical resolution in pixels per meter
+    DWORD biClrUsed;             // 256 (color table size; for 8-bit only)
+    DWORD biClrImportant;        // Specifies that the first x colors of the color table are important to the DIB.
+};
+
+struct RiffChunk                 // Header of a plain chunk, read as 8 raw bytes
+{
+    uint32_t m_four_cc;          // chunk id
+    uint32_t m_size;             // the parsers skip m_size bytes past this header to reach the next element
+};
+
+struct RiffList                  // Header of a list element, read as 12 raw bytes
+{
+    uint32_t m_riff_or_list_cc;  // checked against LIST_CC by the parsers
+    uint32_t m_size;             // includes the 4-byte m_list_type_cc field (parsers skip m_size - 4 after this header)
+    uint32_t m_list_type_cc;     // list type, e.g. 'hdrl', 'strl', 'INFO', 'movi'
+};
+
+#pragma pack(pop)
+
+class MjpegInputStream
+{
+public:
+    MjpegInputStream();                       // starts closed and invalid
+    MjpegInputStream(const String& filename); // tries to open filename immediately
+    ~MjpegInputStream();                      // closes the file
+    MjpegInputStream& read(char*, uint64_t);  // a read that returns fewer bytes than requested invalidates the stream
+    MjpegInputStream& seekg(uint64_t);        // absolute position from the start of the file
+    uint64_t tellg();
+    bool isOpened() const;
+    bool open(const String& filename);
+    void close();
+    operator bool();                          // validity flag maintained by open/read/seekg/close
+    // NOTE(review): the destructor closes m_f but copying is not disabled - confirm instances are never copied
+private:
+    bool    m_is_valid;
+    FILE*   m_f;
+};
+//Creates a closed stream: no file handle, validity flag false.
+MjpegInputStream::MjpegInputStream(): m_is_valid(false), m_f(0)
+{
+}
+//Creates a stream and tries to open filename right away; the outcome is visible through isOpened() / operator bool().
+MjpegInputStream::MjpegInputStream(const String& filename): m_is_valid(false), m_f(0)
+{
+    open(filename);
+}
+//True while a file handle is held (set by open(), cleared by close()).
+bool MjpegInputStream::isOpened() const
+{
+    return m_f != 0;
+}
+//Closes any file currently held, then opens filename in "rb" mode. Returns true if fopen succeeded.
+bool MjpegInputStream::open(const String& filename)
+{
+    close();
+
+    m_f = fopen(filename.c_str(), "rb");
+
+    m_is_valid = isOpened();
+
+    return m_is_valid;
+}
+
+//Closes the held file, if any, and marks the stream invalid. Does nothing when already closed.
+void MjpegInputStream::close()
+{
+    if(!isOpened())
+        return;
+
+    m_is_valid = false;
+    fclose(m_f);
+    m_f = 0;
+}
+
+//Reads count bytes into buf; the stream stays valid only if all of them were read.
+MjpegInputStream& MjpegInputStream::read(char* buf, uint64_t count)
+{
+    if(!isOpened())
+        return *this; //closed stream: the validity flag is left as it was
+    const size_t bytes_read = fread((void*)buf, 1, (size_t)count, m_f);
+    m_is_valid = (count == bytes_read);
+    return *this;
+}
+
+//Moves to absolute offset pos. A closed stream is only marked invalid (fseek never sees a null FILE*), matching read().
+MjpegInputStream& MjpegInputStream::seekg(uint64_t pos)
+{
+    m_is_valid = isOpened() && (fseek(m_f, (long)pos, SEEK_SET) == 0); //NOTE(review): (long) caps the offset where long is 32-bit
+    return *this;
+}
+//Returns ftell() of the held file as uint64_t. NOTE(review): no isOpened() check - confirm it is never called on a closed stream
+uint64_t MjpegInputStream::tellg()
+{
+    return ftell(m_f);
+}
+//Yields the validity flag: the outcome of the latest open(), read() on an open stream, or seekg(); false after close().
+MjpegInputStream::operator bool()
+{
+    return m_is_valid;
+}
+//Closes the file if it is still open.
+MjpegInputStream::~MjpegInputStream()
+{
+    close();
+}
+//Raw extraction operators: each reads sizeof(struct) bytes from the stream straight into the given struct.
+MjpegInputStream& operator >> (MjpegInputStream& is, AviMainHeader& avih);
+MjpegInputStream& operator >> (MjpegInputStream& is, AviStreamHeader& strh);
+MjpegInputStream& operator >> (MjpegInputStream& is, BitmapInfoHeader& bmph);
+MjpegInputStream& operator >> (MjpegInputStream& is, RiffList& riff_list);
+MjpegInputStream& operator >> (MjpegInputStream& is, RiffChunk& riff_chunk);
+MjpegInputStream& operator >> (MjpegInputStream& is, AviIndex& idx1);
+
+//Fills avih with the next sizeof(AviMainHeader) bytes of the stream.
+MjpegInputStream& operator >> (MjpegInputStream& is, AviMainHeader& avih)
+{
+    return is.read((char*)(&avih), sizeof(AviMainHeader));
+}
+
+//Fills strh with the next sizeof(AviStreamHeader) bytes of the stream.
+MjpegInputStream& operator >> (MjpegInputStream& is, AviStreamHeader& strh)
+{
+    return is.read((char*)(&strh), sizeof(AviStreamHeader));
+}
+
+//Fills bmph with the next sizeof(BitmapInfoHeader) bytes of the stream.
+MjpegInputStream& operator >> (MjpegInputStream& is, BitmapInfoHeader& bmph)
+{
+    return is.read((char*)(&bmph), sizeof(BitmapInfoHeader));
+}
+
+//Fills riff_list with the next sizeof(RiffList) bytes of the stream.
+MjpegInputStream& operator >> (MjpegInputStream& is, RiffList& riff_list)
+{
+    return is.read((char*)(&riff_list), sizeof(RiffList));
+}
+
+//Fills riff_chunk with the next sizeof(RiffChunk) bytes of the stream.
+MjpegInputStream& operator >> (MjpegInputStream& is, RiffChunk& riff_chunk)
+{
+    return is.read((char*)(&riff_chunk), sizeof(RiffChunk));
+}
+
+//Fills idx1 with the next sizeof(AviIndex) bytes of the stream.
+MjpegInputStream& operator >> (MjpegInputStream& is, AviIndex& idx1)
+{
+    return is.read((char*)(&idx1), sizeof(AviIndex));
+}
+
+/*
+AVI struct:
+
+RIFF ('AVI '
+      LIST ('hdrl'
+            'avih'(<Main AVI Header>)
+            LIST ('strl'
+                  'strh'(<Stream header>)
+                  'strf'(<Stream format>)
+                  [ 'strd'(<Additional header data>) ]
+                  [ 'strn'(<Stream name>) ]
+                  [ 'indx'(<Odml index data>) ]
+                  ...
+                 )
+            [LIST ('strl' ...)]
+            [LIST ('strl' ...)]
+            ...
+            [LIST ('odml'
+                  'dmlh'(<ODML header data>)
+                  ...
+                 )
+            ]
+            ...
+           )
+      [LIST ('INFO' ...)]
+      [JUNK]
+      LIST ('movi'
+            {{xxdb|xxdc|xxpc|xxwb}(<Data>) | LIST ('rec '
+                              {xxdb|xxdc|xxpc|xxwb}(<Data>)
+                              {xxdb|xxdc|xxpc|xxwb}(<Data>)
+                              ...
+                             )
+               ...
+            }
+            ...
+           )
+      ['idx1' (<AVI Index>) ]
+     )
+
+     {xxdb|xxdc|xxpc|xxwb}
+     xx - stream number: 00, 01, 02, ...
+     db - uncompressed video frame
+     dc - compressed video frame
+     pc - palette change
+     wb - audio frame
+
+     JUNK section may pad any data section and must be ignored
+*/
+
+typedef std::deque< std::pair<uint64_t, uint32_t> > frame_list; //one (absolute file position, chunk length) pair per frame
+typedef frame_list::iterator frame_iterator;
+
+//Represents a single MJPEG video stream within a single AVI/AVIX entry
+//Multiple video streams within a single AVI/AVIX entry are not supported
+//ODML index is not supported
+class AviMjpegStream
+{
+public:
+    AviMjpegStream();
+    //stores the frames found in m_frame_list, which can be accessed via getFrames
+    bool parseAvi(MjpegInputStream& in_str);
+    //stores the frames found in in_frame_list. getFrames() would return an empty list
+    bool parseAvi(MjpegInputStream& in_str, frame_list& in_frame_list);
+    size_t getFramesCount();
+    frame_list& getFrames();
+    uint32_t getWidth();
+    uint32_t getHeight();
+    double getFps();
+
+protected:
+    //parsing helpers; each one reads from the current position of in_str
+    bool parseAviWithFrameList(MjpegInputStream& in_str, frame_list& in_frame_list);
+    void skipJunk(RiffChunk& chunk, MjpegInputStream& in_str);
+    void skipJunk(RiffList& list, MjpegInputStream& in_str);
+    bool parseHdrlList(MjpegInputStream& in_str);
+    bool parseIndex(MjpegInputStream& in_str, uint32_t index_size, frame_list& in_frame_list);
+    bool parseMovi(MjpegInputStream& in_str, frame_list& in_frame_list);
+    bool parseStrl(MjpegInputStream& in_str, uint8_t stream_id);
+    bool parseInfo(MjpegInputStream& in_str);
+    void printError(MjpegInputStream& in_str, RiffList& list, uint32_t expected_fourcc);
+    void printError(MjpegInputStream& in_str, RiffChunk& chunk, uint32_t expected_fourcc);
+    //state filled in while parsing
+    uint32_t   m_stream_id;       //FOURCC "NNdc" of the selected video stream; 0 while none is selected
+    uint64_t   m_movi_start;      //base position that index chunk offsets are added to
+    uint64_t   m_movi_end;        //frame positions at or beyond this value are rejected
+    frame_list m_frame_list;
+    uint32_t   m_width;
+    uint32_t   m_height;
+    double     m_fps;             //dwRate / dwScale of the selected stream
+    bool       m_is_indx_present; //bit 0x10 of the main header flags
+};
+//Starts with no stream selected (m_stream_id == 0), zero size and fps, and an empty frame list.
+AviMjpegStream::AviMjpegStream(): m_stream_id(0), m_movi_start(0), m_movi_end(0), m_width(0), m_height(0), m_fps(0), m_is_indx_present(false)
+{
+}
+//Number of entries currently held in m_frame_list.
+size_t AviMjpegStream::getFramesCount()
+{
+    return m_frame_list.size();
+}
+//Mutable access to the internal frame list.
+frame_list& AviMjpegStream::getFrames()
+{
+    return m_frame_list;
+}
+//Stored frame width (m_width).
+uint32_t AviMjpegStream::getWidth()
+{
+    return m_width;
+}
+//Stored frame height (m_height).
+uint32_t AviMjpegStream::getHeight()
+{
+    return m_height;
+}
+//Stored frame rate (m_fps).
+double AviMjpegStream::getFps()
+{
+    return m_fps;
+}
+
+//Prints to stderr why a list of type expected_fourcc was not found: invalid stream, not a LIST, or wrong list type.
+void AviMjpegStream::printError(MjpegInputStream& in_str, RiffList& list, uint32_t expected_fourcc)
+{
+    if(!in_str)
+    {
+        fprintf(stderr, "Unexpected end of file while searching for %s list\n", fourccToString(expected_fourcc).c_str());
+        return;
+    }
+    if(list.m_riff_or_list_cc != LIST_CC)
+    {
+        fprintf(stderr, "Unexpected element. Expected: %s. Got: %s.\n", fourccToString(LIST_CC).c_str(), fourccToString(list.m_riff_or_list_cc).c_str());
+        return;
+    }
+    fprintf(stderr, "Unexpected list type. Expected: %s. Got: %s.\n", fourccToString(expected_fourcc).c_str(), fourccToString(list.m_list_type_cc).c_str());
+}
+
+//Prints to stderr why a chunk with id expected_fourcc was not found: invalid stream or a different chunk id.
+void AviMjpegStream::printError(MjpegInputStream& in_str, RiffChunk& chunk, uint32_t expected_fourcc)
+{
+    const String expected = fourccToString(expected_fourcc);
+    if(in_str)
+    {
+        fprintf(stderr, "Unexpected element. Expected: %s. Got: %s.\n", expected.c_str(), fourccToString(chunk.m_four_cc).c_str());
+        return;
+    }
+    fprintf(stderr, "Unexpected end of file while searching for %s chunk\n", expected.c_str());
+}
+
+//Stub: scanning the 'movi' list is not implemented; ignores its arguments and returns true.
+bool AviMjpegStream::parseMovi(MjpegInputStream&, frame_list&)
+{
+    //not implemented
+    return true;
+}
+//Stub: parsing the INFO list is not implemented; ignores its argument and returns true.
+bool AviMjpegStream::parseInfo(MjpegInputStream&)
+{
+    //not implemented
+    return true;
+}
+
+//Reads index entries until index_size bytes are consumed or the stream fails, appending
+//(absolute position, length) to in_frame_list for every entry of the selected stream.
+//Returns true if at least one index entry was read successfully.
+bool AviMjpegStream::parseIndex(MjpegInputStream& in_str, uint32_t index_size, frame_list& in_frame_list)
+{
+    uint64_t index_end = in_str.tellg();
+    index_end += index_size;
+    bool result = false;
+    while(in_str && (in_str.tellg() < index_end))
+    {
+        AviIndex idx1;
+        in_str >> idx1;
+        //a failed or short read leaves idx1 uninitialized - never interpret it
+        if(!in_str) break;
+        if(idx1.ckid == m_stream_id)
+        {
+            uint64_t absolute_pos = m_movi_start + idx1.dwChunkOffset;
+            if(absolute_pos < m_movi_end)
+            {
+                in_frame_list.push_back(std::make_pair(absolute_pos, idx1.dwChunkLength));
+            }
+            else
+            {
+                //unsupported case
+                fprintf(stderr, "Frame offset points outside movi section.\n");
+            }
+        }
+        result = true;
+    }
+    return result;
+}
+
+//Parses one 'strl' list: reads its 'strh' chunk and, when the stream is MJPG video,
+//remembers its "NNdc" chunk id and frame rate (for the first such stream only).
+//Returns true only if the stream header was read and describes an MJPG video stream.
+bool AviMjpegStream::parseStrl(MjpegInputStream& in_str, uint8_t stream_id)
+{
+    RiffChunk strh;
+    in_str >> strh;
+
+    if(!in_str || strh.m_four_cc != STRH_CC)
+        return false;
+
+    AviStreamHeader strm_hdr;
+    in_str >> strm_hdr;
+
+    //a failed or short read leaves strm_hdr uninitialized - do not inspect it
+    if(!in_str || strm_hdr.fccType != VIDS_CC || strm_hdr.fccHandler != MJPG_CC)
+        return false;
+
+    uint8_t first_digit = (stream_id/10) + '0';
+    uint8_t second_digit = (stream_id%10) + '0';
+
+    if(m_stream_id == 0)
+    {
+        m_stream_id = CV_FOURCC(first_digit, second_digit, 'd', 'c');
+        //dwScale comes from the file; dividing by a zero would yield inf/NaN, so report 0 fps instead
+        m_fps = strm_hdr.dwScale ? double(strm_hdr.dwRate)/strm_hdr.dwScale : 0.;
+    }
+    else
+    {
+        //second mjpeg video stream found which is not supported
+        fprintf(stderr, "More than one video stream found within AVI/AVIX list. Stream %c%cdc would be ignored\n", first_digit, second_digit);
+    }
+
+    return true;
+}
+
+//If chunk is a JUNK chunk, jumps over its payload and reads the next chunk header into chunk.
+void AviMjpegStream::skipJunk(RiffChunk& chunk, MjpegInputStream& in_str)
+{
+    if(chunk.m_four_cc != JUNK_CC)
+        return;
+    in_str.seekg(in_str.tellg() + chunk.m_size);
+    in_str >> chunk;
+}
+
+//If list turns out to hold a JUNK chunk header, jumps over the junk and reads the next list header into list.
+void AviMjpegStream::skipJunk(RiffList& list, MjpegInputStream& in_str)
+{
+    if(list.m_riff_or_list_cc != JUNK_CC)
+        return;
+    //JUNK chunk is 4 bytes less than LIST
+    in_str.seekg(in_str.tellg() + list.m_size - 4);
+    in_str >> list;
+}
+
+//Parses the content of the 'hdrl' list: the 'avih' main header followed by one
+//'strl' list per stream. Stores frame width/height and the has-index flag from the main header.
+//Returns what parseStrl returned for the last 'strl' list parsed (false if none was parsed).
+bool AviMjpegStream::parseHdrlList(MjpegInputStream& in_str)
+{
+    bool result = false;
+    RiffChunk avih;
+    in_str >> avih;
+    if(in_str && avih.m_four_cc == AVIH_CC)
+    {
+        uint64_t next_strl_list = in_str.tellg();
+        next_strl_list += avih.m_size;
+
+        AviMainHeader avi_hdr;
+        in_str >> avi_hdr;
+
+        if(in_str)
+        {
+            m_is_indx_present = ((avi_hdr.dwFlags & 0x10) != 0); //0x10: AVIF_HASINDEX
+            DWORD number_of_streams = avi_hdr.dwStreams;
+            m_width = avi_hdr.dwWidth;
+            m_height = avi_hdr.dwHeight;
+            //the number of strl lists must be equal to number of streams specified in main avi header
+            for(DWORD i = 0; i < number_of_streams; ++i)
+            {
+                in_str.seekg(next_strl_list);
+                RiffList strl_list;
+                in_str >> strl_list;
+
+                if( in_str && strl_list.m_riff_or_list_cc == LIST_CC && strl_list.m_list_type_cc == STRL_CC )
+                {
+                    next_strl_list = in_str.tellg();
+                    //RiffList::m_size includes fourCC field which we have already read
+                    next_strl_list += (strl_list.m_size - 4);
+
+                    result = parseStrl(in_str, (uint8_t)i);
+                }
+                else
+                {
+                    printError(in_str, strl_list, STRL_CC);
+                }
+            }
+        }
+    }
+    else
+    {
+        printError(in_str, avih, AVIH_CC);
+    }
+
+    return result;
+}
+
+bool AviMjpegStream::parseAviWithFrameList(MjpegInputStream& in_str, frame_list& in_frame_list)
+{
+    //Walks hdrl -> optional INFO -> optional JUNK -> movi -> idx1 and collects frames into in_frame_list.
+    RiffList hdrl_list;
+    in_str >> hdrl_list;
+    if( in_str && hdrl_list.m_riff_or_list_cc == LIST_CC && hdrl_list.m_list_type_cc == HDRL_CC )
+    {
+        uint64_t next_list = in_str.tellg();
+        //RiffList::m_size includes fourCC field which we have already read
+        next_list += (hdrl_list.m_size - 4);
+        //parseHdrlList sets m_is_indx_present flag which would be used later
+        if(parseHdrlList(in_str))
+        {
+            in_str.seekg(next_list);
+
+            RiffList some_list;
+            in_str >> some_list;
+
+            //an optional section INFO
+            if(in_str && some_list.m_riff_or_list_cc == LIST_CC && some_list.m_list_type_cc == INFO_CC)
+            {
+                next_list = in_str.tellg();
+                //RiffList::m_size includes fourCC field which we have already read
+                next_list += (some_list.m_size - 4);
+                parseInfo(in_str);
+
+                in_str.seekg(next_list);
+                in_str >> some_list;
+            }
+
+            //an optional section JUNK
+            skipJunk(some_list, in_str);
+
+            //the movi list is expected here; it must be present in an AVI file
+            if(in_str && some_list.m_riff_or_list_cc == LIST_CC && some_list.m_list_type_cc == MOVI_CC)
+            {
+                bool is_index_found = false;
+
+                m_movi_start = in_str.tellg();
+                m_movi_start -= 4; //step back over the list-type fourCC that was already read
+
+                m_movi_end = m_movi_start + some_list.m_size;
+                //if m_is_indx_present is set to true we should find index
+                if(m_is_indx_present)
+                {
+                    //the index section is expected right after the movi list; the offset stays 64-bit so it cannot wrap past 4 GiB
+                    uint64_t indx_pos = m_movi_start + 4;
+                    indx_pos += (some_list.m_size - 4);
+                    in_str.seekg(indx_pos);
+
+                    RiffChunk index_chunk;
+                    in_str >> index_chunk;
+
+                    if(in_str && index_chunk.m_four_cc == IDX1_CC)
+                    {
+                        is_index_found = parseIndex(in_str, index_chunk.m_size, in_frame_list);
+                        //we are not going anywhere else
+                    }
+                    else
+                    {
+                        printError(in_str, index_chunk, IDX1_CC);
+                    }
+                }
+                //index not present or we were not able to find it
+                //parsing movi list
+                if(!is_index_found)
+                {
+                    //not implemented
+                    parseMovi(in_str, in_frame_list);
+
+                    fprintf(stderr, "Failed to parse avi: index was not found\n");
+                    //we are not going anywhere else
+                }
+            }
+            else
+            {
+                printError(in_str, some_list, MOVI_CC);
+            }
+        }
+    }
+    else
+    {
+        printError(in_str, hdrl_list, HDRL_CC);
+    }
+
+    return in_frame_list.size() > 0; //success means the list holds at least one frame on exit
+}
+
+bool AviMjpegStream::parseAvi(MjpegInputStream& in_str, frame_list& in_frame_list)
+{
+    return parseAviWithFrameList(in_str, in_frame_list); //frames go into the caller-supplied list
+}
+
+bool AviMjpegStream::parseAvi(MjpegInputStream& in_str)
+{
+    return parseAviWithFrameList(in_str, m_frame_list); //frames go into this stream's own m_frame_list
+}
+
+//IVideoCapture implementation for MJPEG streams: open() indexes the frames via parseRiff(), retrieveFrame() decodes them.
+class MotionJpegCapture: public IVideoCapture
+{
+public:
+    virtual ~MotionJpegCapture();
+    virtual double getProperty(int) const;
+    virtual bool setProperty(int, double);
+    virtual bool grabFrame();
+    virtual bool retrieveFrame(int, OutputArray);
+    virtual bool isOpened() const;
+    virtual int getCaptureDomain() { return CAP_ANY; } // Return the type of the capture object: CAP_VFW, etc...
+    MotionJpegCapture(const String&);
+
+    bool open(const String&);
+    void close();
+protected:
+
+    bool parseRiff(MjpegInputStream& in_str);
+
+    inline uint64_t getFramePos() const;
+    std::vector<char> readFrame(frame_iterator it);
+
+    MjpegInputStream m_file_stream;
+    bool             m_is_first_frame;   //true until the first grabFrame() after open() or a seek to position 0
+    frame_list       m_mjpeg_frames;     //frame list filled by parseRiff()
+
+    frame_iterator   m_frame_iterator;   //currently selected frame; end() when none is selected
+    Mat              m_current_frame;    //last image decoded by retrieveFrame()
+
+    //Frame width/height and fps could be different for
+    //each frame/stream. At the moment we assume that they
+    //stay the same within a single avi file.
+    uint32_t         m_frame_width;
+    uint32_t         m_frame_height;
+    double           m_fps;
+};
+
+uint64_t MotionJpegCapture::getFramePos() const
+{ //position of the current frame, counted from 1; 0 before the first grab
+    if(m_is_first_frame)
+        return 0;
+    //an iterator at end() reports the total number of frames
+    if(m_frame_iterator == m_mjpeg_frames.end())
+        return m_mjpeg_frames.size();
+
+    return m_frame_iterator - m_mjpeg_frames.begin() + 1;
+}
+
+bool MotionJpegCapture::setProperty(int property, double value)
+{ //only CAP_PROP_POS_FRAMES (seeking) is handled; returns false for anything else or an out-of-range position
+    if(property == CAP_PROP_POS_FRAMES)
+    {
+        if(int(value) == 0)
+        {
+            m_is_first_frame = true; //rewind: the next grabFrame() starts again from begin()
+            m_frame_iterator = m_mjpeg_frames.end();
+            return true;
+        }
+        else if(value > 0 && m_mjpeg_frames.size() > value) //negative positions would give begin() + negative offset
+        {
+            m_frame_iterator = m_mjpeg_frames.begin() + int(value - 1);
+            m_is_first_frame = false;
+            return true;
+        }
+    }
+
+    return false;
+}
+
+double MotionJpegCapture::getProperty(int property) const
+{ //returns the value of the requested capture property; ids not listed below yield 0
+    switch(property)
+    {
+        case CAP_PROP_POS_FRAMES:
+            return (double)getFramePos();
+        case CAP_PROP_POS_AVI_RATIO:
+            return double(getFramePos())/m_mjpeg_frames.size(); //frame position relative to the frame count
+        case CAP_PROP_FRAME_WIDTH:
+            return (double)m_frame_width;
+        case CAP_PROP_FRAME_HEIGHT:
+            return (double)m_frame_height;
+        case CAP_PROP_FPS:
+            return m_fps;
+        case CAP_PROP_FOURCC:
+            return (double)CV_FOURCC('M','J','P','G'); //fixed value, not read from the file
+        case CAP_PROP_FRAME_COUNT:
+            return (double)m_mjpeg_frames.size();
+        case CAP_PROP_FORMAT:
+            return 0;
+        default:
+            return 0;
+    }
+}
+
+std::vector<char> MotionJpegCapture::readFrame(frame_iterator it)
+{
+    //Returns the payload of the chunk stored at file offset it->first; empty if its header cannot be read.
+    m_file_stream.seekg(it->first);
+    RiffChunk chunk;
+    m_file_stream >> chunk;
+
+    std::vector<char> result;
+    //after a failed header read chunk.m_size cannot be trusted, so do not size the buffer from it
+    if(!m_file_stream)
+        return result;
+
+    result.resize(chunk.m_size); //resize() already allocates; the former reserve() call was redundant
+    m_file_stream.read(result.data(), chunk.m_size);
+    return result;
+}
+
+bool MotionJpegCapture::grabFrame()
+{ //advances to the next frame (or to the first one after open()/rewind)
+    if(isOpened())
+    {
+        if(m_is_first_frame)
+        {
+            m_is_first_frame = false;
+            m_frame_iterator = m_mjpeg_frames.begin();
+        }
+        else if(m_frame_iterator != m_mjpeg_frames.end()) //incrementing end() is undefined, so stay there
+        {
+            ++m_frame_iterator;
+        }
+    }
+    //true while the iterator still designates a frame
+    return m_frame_iterator != m_mjpeg_frames.end();
+}
+
+bool MotionJpegCapture::retrieveFrame(int, OutputArray output_frame)
+{ //decodes the currently selected frame into output_frame; the int argument is unused
+    if(m_frame_iterator != m_mjpeg_frames.end())
+    {
+        std::vector<char> data = readFrame(m_frame_iterator);
+        //decode only when bytes were read; otherwise m_current_frame keeps its previous contents
+        if(data.size())
+        {
+            m_current_frame = imdecode(data, CV_LOAD_IMAGE_ANYDEPTH | CV_LOAD_IMAGE_COLOR);
+        }
+
+        m_current_frame.copyTo(output_frame);
+
+        return true;
+    }
+
+    return false; //no frame is currently selected
+}
+
+MotionJpegCapture::~MotionJpegCapture()
+{
+    close(); //closes the file stream and resets the frame iterator
+}
+
+MotionJpegCapture::MotionJpegCapture(const String& filename)
+{
+    open(filename); //the result is ignored here; callers check isOpened() afterwards
+}
+
+bool MotionJpegCapture::isOpened() const
+{
+    return m_mjpeg_frames.size() > 0; //"opened" means the frame list is non-empty
+}
+
+void MotionJpegCapture::close()
+{ //NOTE(review): m_mjpeg_frames is not cleared here, so isOpened() keeps returning true after close() - confirm this is intended
+    m_file_stream.close();
+    m_frame_iterator = m_mjpeg_frames.end(); //no frame selected
+}
+
+bool MotionJpegCapture::open(const String& filename)
+{ //opens filename and indexes its frames; returns isOpened()
+    close();
+    m_mjpeg_frames.clear(); //drop frames indexed from a previously opened file so offsets never mix
+    m_file_stream.open(filename);
+
+    m_frame_iterator = m_mjpeg_frames.end(); //clear() invalidated the iterator that close() had set
+    m_is_first_frame = true;
+
+    if(!parseRiff(m_file_stream))
+    {
+        close();
+    }
+
+    return isOpened();
+}
+
+
+bool MotionJpegCapture::parseRiff(MjpegInputStream& in_str)
+{
+    bool result = false; //becomes true once at least one RIFF list yields frames
+    while(in_str)
+    {
+        //each iteration consumes one top-level RIFF list of type AVI_CC or AVIX_CC
+        RiffList riff_list;
+        in_str >> riff_list;
+
+        if( in_str && riff_list.m_riff_or_list_cc == RIFF_CC &&
+            ((riff_list.m_list_type_cc == AVI_CC) || (riff_list.m_list_type_cc == AVIX_CC)) )
+        {
+            uint64_t next_riff = in_str.tellg();
+            //RiffList::m_size includes fourCC field which we have already read
+            next_riff += (riff_list.m_size - 4);
+
+            AviMjpegStream mjpeg_video_stream;
+            bool is_parsed = mjpeg_video_stream.parseAvi(in_str, m_mjpeg_frames);
+            result = result || is_parsed;
+
+            if(is_parsed)
+            {
+                m_frame_width = mjpeg_video_stream.getWidth();
+                m_frame_height = mjpeg_video_stream.getHeight();
+                m_fps = mjpeg_video_stream.getFps();
+            }
+
+            in_str.seekg(next_riff); //continue with whatever follows this RIFF list
+        }
+        else
+        {
+            break; //read failure or not a RIFF AVI/AVIX list: stop scanning
+        }
+    }
+
+    return result;
+}
+
+Ptr<IVideoCapture> createMotionJpegCapture(const String& filename)
 {
-    return Ptr<IVideoCapture>();
+    Ptr<MotionJpegCapture> mjdecoder(new MotionJpegCapture(filename)); //the constructor calls open(filename)
+    if( mjdecoder->isOpened() )
+        return mjdecoder;
+    return Ptr<MotionJpegCapture>(); //empty pointer when the capture did not open
 }
 
 }
diff --git a/modules/videoio/src/cap_mjpeg_encoder.cpp b/modules/videoio/src/cap_mjpeg_encoder.cpp
index 7856fd416f..2a8e0b9645 100644
--- a/modules/videoio/src/cap_mjpeg_encoder.cpp
+++ b/modules/videoio/src/cap_mjpeg_encoder.cpp
@@ -248,6 +248,25 @@ public:
             writeBlock();
     }
 
+    void jflush(unsigned currval, int bitIdx)
+    {   // writes the bits of currval above position bitIdx to the buffer, filling the unused low bits with 1s
+        uchar v;
+        uchar* ptr = m_current;
+        if( bitIdx < 32 ) currval |= (1u << bitIdx) - 1u; // unsigned shift; with bitIdx == 32 nothing is emitted and a 32-bit shift is undefined
+        while( bitIdx < 32 )
+        {
+            v = (uchar)(currval >> 24); // emit the most significant byte first
+            *ptr++ = v;
+            if( v == 255 )
+                *ptr++ = 0; // a 0xFF data byte is followed by a stuffed zero byte
+            currval <<= 8;
+            bitIdx += 8;
+        }
+        m_current = ptr;
+        if( m_current >= m_end )
+            writeBlock();
+    }
+
     static bool createEncodeHuffmanTable( const int* src, unsigned* table, int max_size )
     {
         int  i, k;
@@ -1440,7 +1459,7 @@ void MotionJpegWriter::writeFrameData( const uchar* data, int step, int colorspa
     }
 
     // Flush
-    JPUT_BITS((unsigned)-1, bit_idx & 31);
+    strm.jflush(currval, bit_idx);
     strm.jputShort( 0xFFD9 ); // EOI marker
     /*printf("total dct = %.1fms, total cvt = %.1fms\n",
      total_dct*1000./cv::getTickFrequency(),
diff --git a/modules/videoio/src/cap_msmf.hpp b/modules/videoio/src/cap_msmf.hpp
index 4fdf41fb31..0987c704f5 100644
--- a/modules/videoio/src/cap_msmf.hpp
+++ b/modules/videoio/src/cap_msmf.hpp
@@ -603,11 +603,6 @@ public:
     ComPtr() throw()
     {
     }
-    ComPtr(int nNull) throw()
-    {
-        assert(nNull == 0);
-        p = NULL;
-    }
     ComPtr(T* lp) throw()
     {
         p = lp;
@@ -638,13 +633,6 @@ public:
     {
         return p.operator==(pT);
     }
-    // For comparison to NULL
-    bool operator==(int nNull) const
-    {
-        assert(nNull == 0);
-        return p.operator==(NULL);
-    }
-
     bool operator!=(_In_opt_ T* pT) const throw()
     {
         return p.operator!=(pT);
@@ -3123,7 +3111,7 @@ public:
         HRESULT hr = CheckShutdown();
 
         if (SUCCEEDED(hr)) {
-            if (m_spClock == NULL) {
+            if (!m_spClock) {
                 hr = MF_E_NO_CLOCK; // There is no presentation clock.
             } else {
                 // Return the pointer to the caller.
diff --git a/modules/videoio/src/cap_ximea.cpp b/modules/videoio/src/cap_ximea.cpp
index 8356b4d92a..ccf49e45b5 100644
--- a/modules/videoio/src/cap_ximea.cpp
+++ b/modules/videoio/src/cap_ximea.cpp
@@ -52,7 +52,15 @@ CvCapture* cvCreateCameraCapture_XIMEA( int index )
 // Enumerate connected devices
 void CvCaptureCAM_XIMEA::init()
 {
+#if defined WIN32 || defined _WIN32
     xiGetNumberDevices( &numDevices);
+#else
+    // try second re-enumeration if first one fails
+    if (xiGetNumberDevices( &numDevices) != XI_OK)
+    {
+        xiGetNumberDevices( &numDevices);
+    }
+#endif
     hmv = NULL;
     frame = NULL;
     timeout = 0;
@@ -73,8 +81,17 @@ bool CvCaptureCAM_XIMEA::open( int wIndex )
 
     if((mvret = xiOpenDevice( wIndex, &hmv)) != XI_OK)
     {
+#if defined WIN32 || defined _WIN32
         errMsg("Open XI_DEVICE failed", mvret);
         return false;
+#else
+        // try opening second time if first fails
+        if((mvret = xiOpenDevice( wIndex, &hmv))  != XI_OK)
+        {
+            errMsg("Open XI_DEVICE failed", mvret);
+            return false;
+        }
+#endif
     }
 
     int width   = 0;
@@ -260,7 +277,7 @@ double CvCaptureCAM_XIMEA::getProperty( int property_id ) const
     case CV_CAP_PROP_XI_AUTO_WB       : xiGetParamInt( hmv, XI_PRM_AUTO_WB, &ival); return ival;
     case CV_CAP_PROP_XI_AEAG          : xiGetParamInt( hmv, XI_PRM_AEAG, &ival); return ival;
     case CV_CAP_PROP_XI_EXP_PRIORITY  : xiGetParamFloat( hmv, XI_PRM_EXP_PRIORITY, &fval); return fval;
-    case CV_CAP_PROP_XI_AE_MAX_LIMIT  : xiGetParamInt( hmv, XI_PRM_AE_MAX_LIMIT, &ival); return ival;
+    case CV_CAP_PROP_XI_AE_MAX_LIMIT  : xiGetParamInt( hmv, XI_PRM_AE_MAX_LIMIT, &ival); return ival;
     case CV_CAP_PROP_XI_AG_MAX_LIMIT  : xiGetParamFloat( hmv, XI_PRM_AG_MAX_LIMIT, &fval); return fval;
     case CV_CAP_PROP_XI_AEAG_LEVEL    : xiGetParamInt( hmv, XI_PRM_AEAG_LEVEL, &ival); return ival;
     case CV_CAP_PROP_XI_TIMEOUT       : return timeout;
@@ -293,7 +310,7 @@ bool CvCaptureCAM_XIMEA::setProperty( int property_id, double value )
     case CV_CAP_PROP_XI_OFFSET_Y      : mvret = xiSetParamInt( hmv, XI_PRM_OFFSET_Y, ival); break;
     case CV_CAP_PROP_XI_TRG_SOURCE    : mvret = xiSetParamInt( hmv, XI_PRM_TRG_SOURCE, ival); break;
     case CV_CAP_PROP_XI_GPI_SELECTOR  : mvret = xiSetParamInt( hmv, XI_PRM_GPI_SELECTOR, ival); break;
-    case CV_CAP_PROP_XI_TRG_SOFTWARE  : mvret = xiSetParamInt( hmv, XI_PRM_TRG_SOFTWARE, 1); break;
+    case CV_CAP_PROP_XI_TRG_SOFTWARE  : mvret = xiSetParamInt( hmv, XI_PRM_TRG_SOFTWARE, 1); break;
     case CV_CAP_PROP_XI_GPI_MODE      : mvret = xiSetParamInt( hmv, XI_PRM_GPI_MODE, ival); break;
     case CV_CAP_PROP_XI_GPI_LEVEL     : mvret = xiSetParamInt( hmv, XI_PRM_GPI_LEVEL, ival); break;
     case CV_CAP_PROP_XI_GPO_SELECTOR  : mvret = xiSetParamInt( hmv, XI_PRM_GPO_SELECTOR, ival); break;
@@ -301,10 +318,10 @@ bool CvCaptureCAM_XIMEA::setProperty( int property_id, double value )
     case CV_CAP_PROP_XI_LED_SELECTOR  : mvret = xiSetParamInt( hmv, XI_PRM_LED_SELECTOR, ival); break;
     case CV_CAP_PROP_XI_LED_MODE      : mvret = xiSetParamInt( hmv, XI_PRM_LED_MODE, ival); break;
     case CV_CAP_PROP_XI_AUTO_WB       : mvret = xiSetParamInt( hmv, XI_PRM_AUTO_WB, ival); break;
-    case CV_CAP_PROP_XI_MANUAL_WB     : mvret = xiSetParamInt( hmv, XI_PRM_MANUAL_WB, ival); break;
+    case CV_CAP_PROP_XI_MANUAL_WB     : mvret = xiSetParamInt( hmv, XI_PRM_MANUAL_WB, ival); break;
     case CV_CAP_PROP_XI_AEAG          : mvret = xiSetParamInt( hmv, XI_PRM_AEAG, ival); break;
     case CV_CAP_PROP_XI_EXP_PRIORITY  : mvret = xiSetParamFloat( hmv, XI_PRM_EXP_PRIORITY, fval); break;
-    case CV_CAP_PROP_XI_AE_MAX_LIMIT  : mvret = xiSetParamInt( hmv, XI_PRM_AE_MAX_LIMIT, ival); break;
+    case CV_CAP_PROP_XI_AE_MAX_LIMIT  : mvret = xiSetParamInt( hmv, XI_PRM_AE_MAX_LIMIT, ival); break;
     case CV_CAP_PROP_XI_AG_MAX_LIMIT  : mvret = xiSetParamFloat( hmv, XI_PRM_AG_MAX_LIMIT, fval); break;
     case CV_CAP_PROP_XI_AEAG_LEVEL    : mvret = xiSetParamInt( hmv, XI_PRM_AEAG_LEVEL, ival); break;
     case CV_CAP_PROP_XI_TIMEOUT       : timeout = ival; break;
diff --git a/modules/videoio/src/ffmpeg_codecs.hpp b/modules/videoio/src/ffmpeg_codecs.hpp
index 5bdd4cd227..e8c661aaac 100644
--- a/modules/videoio/src/ffmpeg_codecs.hpp
+++ b/modules/videoio/src/ffmpeg_codecs.hpp
@@ -94,160 +94,223 @@ typedef struct AVCodecTag {
     unsigned int tag;
 } AVCodecTag;
 
+#if (LIBAVCODEC_VERSION_INT <= AV_VERSION_INT(54, 51, 100)) // alias the AV_CODEC_ID_* names used by codec_bmp_tags to CODEC_ID_*
+#define AV_CODEC_ID_H264 CODEC_ID_H264
+#define AV_CODEC_ID_H263 CODEC_ID_H263
+#define AV_CODEC_ID_H263P CODEC_ID_H263P // NOTE(review): codec_bmp_tags also uses AV_CODEC_ID_H263I, which has no alias in this list
+#define AV_CODEC_ID_H261 CODEC_ID_H261
+#define AV_CODEC_ID_MPEG4 CODEC_ID_MPEG4
+#define AV_CODEC_ID_MSMPEG4V3 CODEC_ID_MSMPEG4V3
+#define AV_CODEC_ID_MSMPEG4V2 CODEC_ID_MSMPEG4V2
+#define AV_CODEC_ID_MSMPEG4V1 CODEC_ID_MSMPEG4V1
+#define AV_CODEC_ID_WMV1 CODEC_ID_WMV1
+#define AV_CODEC_ID_WMV2 CODEC_ID_WMV2
+#define AV_CODEC_ID_DVVIDEO CODEC_ID_DVVIDEO
+#define AV_CODEC_ID_MPEG1VIDEO CODEC_ID_MPEG1VIDEO
+#define AV_CODEC_ID_MPEG2VIDEO CODEC_ID_MPEG2VIDEO
+#define AV_CODEC_ID_MJPEG CODEC_ID_MJPEG
+#define AV_CODEC_ID_LJPEG CODEC_ID_LJPEG
+#define AV_CODEC_ID_HUFFYUV CODEC_ID_HUFFYUV
+#define AV_CODEC_ID_FFVHUFF CODEC_ID_FFVHUFF
+#define AV_CODEC_ID_CYUV CODEC_ID_CYUV
+#define AV_CODEC_ID_RAWVIDEO CODEC_ID_RAWVIDEO
+#define AV_CODEC_ID_INDEO3 CODEC_ID_INDEO3
+#define AV_CODEC_ID_VP3 CODEC_ID_VP3
+#define AV_CODEC_ID_ASV1 CODEC_ID_ASV1
+#define AV_CODEC_ID_ASV2 CODEC_ID_ASV2
+#define AV_CODEC_ID_VCR1 CODEC_ID_VCR1
+#define AV_CODEC_ID_FFV1 CODEC_ID_FFV1
+#define AV_CODEC_ID_XAN_WC4 CODEC_ID_XAN_WC4
+#define AV_CODEC_ID_MSRLE CODEC_ID_MSRLE
+#define AV_CODEC_ID_MSVIDEO1 CODEC_ID_MSVIDEO1
+#define AV_CODEC_ID_CINEPAK CODEC_ID_CINEPAK
+#define AV_CODEC_ID_TRUEMOTION1 CODEC_ID_TRUEMOTION1
+#define AV_CODEC_ID_MSZH CODEC_ID_MSZH
+#define AV_CODEC_ID_ZLIB CODEC_ID_ZLIB
+#define AV_CODEC_ID_SNOW CODEC_ID_SNOW
+#define AV_CODEC_ID_4XM CODEC_ID_4XM
+#define AV_CODEC_ID_FLV1 CODEC_ID_FLV1
+#define AV_CODEC_ID_SVQ1 CODEC_ID_SVQ1
+#define AV_CODEC_ID_TSCC CODEC_ID_TSCC
+#define AV_CODEC_ID_ULTI CODEC_ID_ULTI
+#define AV_CODEC_ID_VIXL CODEC_ID_VIXL
+#define AV_CODEC_ID_QPEG CODEC_ID_QPEG
+#define AV_CODEC_ID_WMV3 CODEC_ID_WMV3
+#define AV_CODEC_ID_LOCO CODEC_ID_LOCO
+#define AV_CODEC_ID_THEORA CODEC_ID_THEORA
+#define AV_CODEC_ID_WNV1 CODEC_ID_WNV1
+#define AV_CODEC_ID_AASC CODEC_ID_AASC
+#define AV_CODEC_ID_INDEO2 CODEC_ID_INDEO2
+#define AV_CODEC_ID_FRAPS CODEC_ID_FRAPS
+#define AV_CODEC_ID_TRUEMOTION2 CODEC_ID_TRUEMOTION2
+#define AV_CODEC_ID_FLASHSV CODEC_ID_FLASHSV
+#define AV_CODEC_ID_JPEGLS CODEC_ID_JPEGLS
+#define AV_CODEC_ID_VC1 CODEC_ID_VC1
+#define AV_CODEC_ID_CSCD CODEC_ID_CSCD
+#define AV_CODEC_ID_ZMBV CODEC_ID_ZMBV
+#define AV_CODEC_ID_KMVC CODEC_ID_KMVC
+#define AV_CODEC_ID_VP5 CODEC_ID_VP5
+#define AV_CODEC_ID_VP6 CODEC_ID_VP6
+#define AV_CODEC_ID_VP6F CODEC_ID_VP6F
+#define AV_CODEC_ID_JPEG2000 CODEC_ID_JPEG2000
+#define AV_CODEC_ID_VMNC CODEC_ID_VMNC
+#define AV_CODEC_ID_TARGA CODEC_ID_TARGA
+#define AV_CODEC_ID_NONE CODEC_ID_NONE
+#endif
+
 const AVCodecTag codec_bmp_tags[] = {
-    { CODEC_ID_H264, MKTAG('H', '2', '6', '4') },
-    { CODEC_ID_H264, MKTAG('h', '2', '6', '4') },
-    { CODEC_ID_H264, MKTAG('X', '2', '6', '4') },
-    { CODEC_ID_H264, MKTAG('x', '2', '6', '4') },
-    { CODEC_ID_H264, MKTAG('a', 'v', 'c', '1') },
-    { CODEC_ID_H264, MKTAG('V', 'S', 'S', 'H') },
-
-    { CODEC_ID_H263, MKTAG('H', '2', '6', '3') },
-    { CODEC_ID_H263P, MKTAG('H', '2', '6', '3') },
-    { CODEC_ID_H263I, MKTAG('I', '2', '6', '3') }, /* intel h263 */
-    { CODEC_ID_H261, MKTAG('H', '2', '6', '1') },
+    { AV_CODEC_ID_H264, MKTAG('H', '2', '6', '4') },
+    { AV_CODEC_ID_H264, MKTAG('h', '2', '6', '4') },
+    { AV_CODEC_ID_H264, MKTAG('X', '2', '6', '4') },
+    { AV_CODEC_ID_H264, MKTAG('x', '2', '6', '4') },
+    { AV_CODEC_ID_H264, MKTAG('a', 'v', 'c', '1') },
+    { AV_CODEC_ID_H264, MKTAG('V', 'S', 'S', 'H') },
+
+    { AV_CODEC_ID_H263, MKTAG('H', '2', '6', '3') },
+    { AV_CODEC_ID_H263P, MKTAG('H', '2', '6', '3') },
+    { AV_CODEC_ID_H263I, MKTAG('I', '2', '6', '3') }, /* intel h263 */
+    { AV_CODEC_ID_H261, MKTAG('H', '2', '6', '1') },
 
     /* added based on MPlayer */
-    { CODEC_ID_H263P, MKTAG('U', '2', '6', '3') },
-    { CODEC_ID_H263P, MKTAG('v', 'i', 'v', '1') },
+    { AV_CODEC_ID_H263P, MKTAG('U', '2', '6', '3') },
+    { AV_CODEC_ID_H263P, MKTAG('v', 'i', 'v', '1') },
 
-    { CODEC_ID_MPEG4, MKTAG('F', 'M', 'P', '4') },
-    { CODEC_ID_MPEG4, MKTAG('D', 'I', 'V', 'X') },
-    { CODEC_ID_MPEG4, MKTAG('D', 'X', '5', '0') },
-    { CODEC_ID_MPEG4, MKTAG('X', 'V', 'I', 'D') },
-    { CODEC_ID_MPEG4, MKTAG('M', 'P', '4', 'S') },
-    { CODEC_ID_MPEG4, MKTAG('M', '4', 'S', '2') },
-    { CODEC_ID_MPEG4, MKTAG(0x04, 0, 0, 0) }, /* some broken avi use this */
+    { AV_CODEC_ID_MPEG4, MKTAG('F', 'M', 'P', '4') },
+    { AV_CODEC_ID_MPEG4, MKTAG('D', 'I', 'V', 'X') },
+    { AV_CODEC_ID_MPEG4, MKTAG('D', 'X', '5', '0') },
+    { AV_CODEC_ID_MPEG4, MKTAG('X', 'V', 'I', 'D') },
+    { AV_CODEC_ID_MPEG4, MKTAG('M', 'P', '4', 'S') },
+    { AV_CODEC_ID_MPEG4, MKTAG('M', '4', 'S', '2') },
+    { AV_CODEC_ID_MPEG4, MKTAG(0x04, 0, 0, 0) }, /* some broken avi use this */
 
     /* added based on MPlayer */
-    { CODEC_ID_MPEG4, MKTAG('D', 'I', 'V', '1') },
-    { CODEC_ID_MPEG4, MKTAG('B', 'L', 'Z', '0') },
-    { CODEC_ID_MPEG4, MKTAG('m', 'p', '4', 'v') },
-    { CODEC_ID_MPEG4, MKTAG('U', 'M', 'P', '4') },
-    { CODEC_ID_MPEG4, MKTAG('W', 'V', '1', 'F') },
-    { CODEC_ID_MPEG4, MKTAG('S', 'E', 'D', 'G') },
+    { AV_CODEC_ID_MPEG4, MKTAG('D', 'I', 'V', '1') },
+    { AV_CODEC_ID_MPEG4, MKTAG('B', 'L', 'Z', '0') },
+    { AV_CODEC_ID_MPEG4, MKTAG('m', 'p', '4', 'v') },
+    { AV_CODEC_ID_MPEG4, MKTAG('U', 'M', 'P', '4') },
+    { AV_CODEC_ID_MPEG4, MKTAG('W', 'V', '1', 'F') },
+    { AV_CODEC_ID_MPEG4, MKTAG('S', 'E', 'D', 'G') },
 
-    { CODEC_ID_MPEG4, MKTAG('R', 'M', 'P', '4') },
+    { AV_CODEC_ID_MPEG4, MKTAG('R', 'M', 'P', '4') },
 
-    { CODEC_ID_MSMPEG4V3, MKTAG('D', 'I', 'V', '3') }, /* default signature when using MSMPEG4 */
-    { CODEC_ID_MSMPEG4V3, MKTAG('M', 'P', '4', '3') },
+    { AV_CODEC_ID_MSMPEG4V3, MKTAG('D', 'I', 'V', '3') }, /* default signature when using MSMPEG4 */
+    { AV_CODEC_ID_MSMPEG4V3, MKTAG('M', 'P', '4', '3') },
 
     /* added based on MPlayer */
-    { CODEC_ID_MSMPEG4V3, MKTAG('M', 'P', 'G', '3') },
-    { CODEC_ID_MSMPEG4V3, MKTAG('D', 'I', 'V', '5') },
-    { CODEC_ID_MSMPEG4V3, MKTAG('D', 'I', 'V', '6') },
-    { CODEC_ID_MSMPEG4V3, MKTAG('D', 'I', 'V', '4') },
-    { CODEC_ID_MSMPEG4V3, MKTAG('A', 'P', '4', '1') },
-    { CODEC_ID_MSMPEG4V3, MKTAG('C', 'O', 'L', '1') },
-    { CODEC_ID_MSMPEG4V3, MKTAG('C', 'O', 'L', '0') },
+    { AV_CODEC_ID_MSMPEG4V3, MKTAG('M', 'P', 'G', '3') },
+    { AV_CODEC_ID_MSMPEG4V3, MKTAG('D', 'I', 'V', '5') },
+    { AV_CODEC_ID_MSMPEG4V3, MKTAG('D', 'I', 'V', '6') },
+    { AV_CODEC_ID_MSMPEG4V3, MKTAG('D', 'I', 'V', '4') },
+    { AV_CODEC_ID_MSMPEG4V3, MKTAG('A', 'P', '4', '1') },
+    { AV_CODEC_ID_MSMPEG4V3, MKTAG('C', 'O', 'L', '1') },
+    { AV_CODEC_ID_MSMPEG4V3, MKTAG('C', 'O', 'L', '0') },
 
-    { CODEC_ID_MSMPEG4V2, MKTAG('M', 'P', '4', '2') },
+    { AV_CODEC_ID_MSMPEG4V2, MKTAG('M', 'P', '4', '2') },
 
     /* added based on MPlayer */
-    { CODEC_ID_MSMPEG4V2, MKTAG('D', 'I', 'V', '2') },
+    { AV_CODEC_ID_MSMPEG4V2, MKTAG('D', 'I', 'V', '2') },
 
-    { CODEC_ID_MSMPEG4V1, MKTAG('M', 'P', 'G', '4') },
+    { AV_CODEC_ID_MSMPEG4V1, MKTAG('M', 'P', 'G', '4') },
 
-    { CODEC_ID_WMV1, MKTAG('W', 'M', 'V', '1') },
+    { AV_CODEC_ID_WMV1, MKTAG('W', 'M', 'V', '1') },
 
     /* added based on MPlayer */
-    { CODEC_ID_WMV2, MKTAG('W', 'M', 'V', '2') },
-    { CODEC_ID_DVVIDEO, MKTAG('d', 'v', 's', 'd') },
-    { CODEC_ID_DVVIDEO, MKTAG('d', 'v', 'h', 'd') },
-    { CODEC_ID_DVVIDEO, MKTAG('d', 'v', 's', 'l') },
-    { CODEC_ID_DVVIDEO, MKTAG('d', 'v', '2', '5') },
-    { CODEC_ID_MPEG1VIDEO, MKTAG('m', 'p', 'g', '1') },
-    { CODEC_ID_MPEG1VIDEO, MKTAG('m', 'p', 'g', '2') },
-    { CODEC_ID_MPEG2VIDEO, MKTAG('m', 'p', 'g', '2') },
-    { CODEC_ID_MPEG2VIDEO, MKTAG('M', 'P', 'E', 'G') },
-    { CODEC_ID_MPEG1VIDEO, MKTAG('P', 'I', 'M', '1') },
-    { CODEC_ID_MPEG1VIDEO, MKTAG('V', 'C', 'R', '2') },
-    { CODEC_ID_MPEG1VIDEO, 0x10000001 },
-    { CODEC_ID_MPEG2VIDEO, 0x10000002 },
-    { CODEC_ID_MPEG2VIDEO, MKTAG('D', 'V', 'R', ' ') },
-    { CODEC_ID_MPEG2VIDEO, MKTAG('M', 'M', 'E', 'S') },
-    { CODEC_ID_MJPEG, MKTAG('M', 'J', 'P', 'G') },
-    { CODEC_ID_MJPEG, MKTAG('L', 'J', 'P', 'G') },
-    { CODEC_ID_LJPEG, MKTAG('L', 'J', 'P', 'G') },
-    { CODEC_ID_MJPEG, MKTAG('J', 'P', 'G', 'L') }, /* Pegasus lossless JPEG */
-    { CODEC_ID_MJPEG, MKTAG('M', 'J', 'L', 'S') }, /* JPEG-LS custom FOURCC for avi - decoder */
-    { CODEC_ID_MJPEG, MKTAG('j', 'p', 'e', 'g') },
-    { CODEC_ID_MJPEG, MKTAG('I', 'J', 'P', 'G') },
-    { CODEC_ID_MJPEG, MKTAG('A', 'V', 'R', 'n') },
-    { CODEC_ID_HUFFYUV, MKTAG('H', 'F', 'Y', 'U') },
-    { CODEC_ID_FFVHUFF, MKTAG('F', 'F', 'V', 'H') },
-    { CODEC_ID_CYUV, MKTAG('C', 'Y', 'U', 'V') },
-    { CODEC_ID_RAWVIDEO, 0 },
-    { CODEC_ID_RAWVIDEO, MKTAG('I', '4', '2', '0') },
-    { CODEC_ID_RAWVIDEO, MKTAG('Y', 'U', 'Y', '2') },
-    { CODEC_ID_RAWVIDEO, MKTAG('Y', '4', '2', '2') },
-    { CODEC_ID_RAWVIDEO, MKTAG('Y', 'V', '1', '2') },
-    { CODEC_ID_RAWVIDEO, MKTAG('U', 'Y', 'V', 'Y') },
-    { CODEC_ID_RAWVIDEO, MKTAG('I', 'Y', 'U', 'V') },
-    { CODEC_ID_RAWVIDEO, MKTAG('Y', '8', '0', '0') },
-    { CODEC_ID_RAWVIDEO, MKTAG('H', 'D', 'Y', 'C') },
-    { CODEC_ID_INDEO3, MKTAG('I', 'V', '3', '1') },
-    { CODEC_ID_INDEO3, MKTAG('I', 'V', '3', '2') },
-    { CODEC_ID_VP3, MKTAG('V', 'P', '3', '1') },
-    { CODEC_ID_VP3, MKTAG('V', 'P', '3', '0') },
-    { CODEC_ID_ASV1, MKTAG('A', 'S', 'V', '1') },
-    { CODEC_ID_ASV2, MKTAG('A', 'S', 'V', '2') },
-    { CODEC_ID_VCR1, MKTAG('V', 'C', 'R', '1') },
-    { CODEC_ID_FFV1, MKTAG('F', 'F', 'V', '1') },
-    { CODEC_ID_XAN_WC4, MKTAG('X', 'x', 'a', 'n') },
-    { CODEC_ID_MSRLE, MKTAG('m', 'r', 'l', 'e') },
-    { CODEC_ID_MSRLE, MKTAG(0x1, 0x0, 0x0, 0x0) },
-    { CODEC_ID_MSVIDEO1, MKTAG('M', 'S', 'V', 'C') },
-    { CODEC_ID_MSVIDEO1, MKTAG('m', 's', 'v', 'c') },
-    { CODEC_ID_MSVIDEO1, MKTAG('C', 'R', 'A', 'M') },
-    { CODEC_ID_MSVIDEO1, MKTAG('c', 'r', 'a', 'm') },
-    { CODEC_ID_MSVIDEO1, MKTAG('W', 'H', 'A', 'M') },
-    { CODEC_ID_MSVIDEO1, MKTAG('w', 'h', 'a', 'm') },
-    { CODEC_ID_CINEPAK, MKTAG('c', 'v', 'i', 'd') },
-    { CODEC_ID_TRUEMOTION1, MKTAG('D', 'U', 'C', 'K') },
-    { CODEC_ID_MSZH, MKTAG('M', 'S', 'Z', 'H') },
-    { CODEC_ID_ZLIB, MKTAG('Z', 'L', 'I', 'B') },
-    { CODEC_ID_SNOW, MKTAG('S', 'N', 'O', 'W') },
-    { CODEC_ID_4XM, MKTAG('4', 'X', 'M', 'V') },
-    { CODEC_ID_FLV1, MKTAG('F', 'L', 'V', '1') },
-    { CODEC_ID_SVQ1, MKTAG('s', 'v', 'q', '1') },
-    { CODEC_ID_TSCC, MKTAG('t', 's', 'c', 'c') },
-    { CODEC_ID_ULTI, MKTAG('U', 'L', 'T', 'I') },
-    { CODEC_ID_VIXL, MKTAG('V', 'I', 'X', 'L') },
-    { CODEC_ID_QPEG, MKTAG('Q', 'P', 'E', 'G') },
-    { CODEC_ID_QPEG, MKTAG('Q', '1', '.', '0') },
-    { CODEC_ID_QPEG, MKTAG('Q', '1', '.', '1') },
-    { CODEC_ID_WMV3, MKTAG('W', 'M', 'V', '3') },
-    { CODEC_ID_LOCO, MKTAG('L', 'O', 'C', 'O') },
-    { CODEC_ID_THEORA, MKTAG('t', 'h', 'e', 'o') },
+    { AV_CODEC_ID_WMV2, MKTAG('W', 'M', 'V', '2') },
+    { AV_CODEC_ID_DVVIDEO, MKTAG('d', 'v', 's', 'd') },
+    { AV_CODEC_ID_DVVIDEO, MKTAG('d', 'v', 'h', 'd') },
+    { AV_CODEC_ID_DVVIDEO, MKTAG('d', 'v', 's', 'l') },
+    { AV_CODEC_ID_DVVIDEO, MKTAG('d', 'v', '2', '5') },
+    { AV_CODEC_ID_MPEG1VIDEO, MKTAG('m', 'p', 'g', '1') },
+    { AV_CODEC_ID_MPEG1VIDEO, MKTAG('m', 'p', 'g', '2') },
+    { AV_CODEC_ID_MPEG2VIDEO, MKTAG('m', 'p', 'g', '2') },
+    { AV_CODEC_ID_MPEG2VIDEO, MKTAG('M', 'P', 'E', 'G') },
+    { AV_CODEC_ID_MPEG1VIDEO, MKTAG('P', 'I', 'M', '1') },
+    { AV_CODEC_ID_MPEG1VIDEO, MKTAG('V', 'C', 'R', '2') },
+    { AV_CODEC_ID_MPEG1VIDEO, 0x10000001 },
+    { AV_CODEC_ID_MPEG2VIDEO, 0x10000002 },
+    { AV_CODEC_ID_MPEG2VIDEO, MKTAG('D', 'V', 'R', ' ') },
+    { AV_CODEC_ID_MPEG2VIDEO, MKTAG('M', 'M', 'E', 'S') },
+    { AV_CODEC_ID_MJPEG, MKTAG('M', 'J', 'P', 'G') },
+    { AV_CODEC_ID_MJPEG, MKTAG('L', 'J', 'P', 'G') },
+    { AV_CODEC_ID_LJPEG, MKTAG('L', 'J', 'P', 'G') },
+    { AV_CODEC_ID_MJPEG, MKTAG('J', 'P', 'G', 'L') }, /* Pegasus lossless JPEG */
+    { AV_CODEC_ID_MJPEG, MKTAG('M', 'J', 'L', 'S') }, /* JPEG-LS custom FOURCC for avi - decoder */
+    { AV_CODEC_ID_MJPEG, MKTAG('j', 'p', 'e', 'g') },
+    { AV_CODEC_ID_MJPEG, MKTAG('I', 'J', 'P', 'G') },
+    { AV_CODEC_ID_MJPEG, MKTAG('A', 'V', 'R', 'n') },
+    { AV_CODEC_ID_HUFFYUV, MKTAG('H', 'F', 'Y', 'U') },
+    { AV_CODEC_ID_FFVHUFF, MKTAG('F', 'F', 'V', 'H') },
+    { AV_CODEC_ID_CYUV, MKTAG('C', 'Y', 'U', 'V') },
+    { AV_CODEC_ID_RAWVIDEO, 0 },
+    { AV_CODEC_ID_RAWVIDEO, MKTAG('I', '4', '2', '0') },
+    { AV_CODEC_ID_RAWVIDEO, MKTAG('Y', 'U', 'Y', '2') },
+    { AV_CODEC_ID_RAWVIDEO, MKTAG('Y', '4', '2', '2') },
+    { AV_CODEC_ID_RAWVIDEO, MKTAG('Y', 'V', '1', '2') },
+    { AV_CODEC_ID_RAWVIDEO, MKTAG('U', 'Y', 'V', 'Y') },
+    { AV_CODEC_ID_RAWVIDEO, MKTAG('I', 'Y', 'U', 'V') },
+    { AV_CODEC_ID_RAWVIDEO, MKTAG('Y', '8', '0', '0') },
+    { AV_CODEC_ID_RAWVIDEO, MKTAG('H', 'D', 'Y', 'C') },
+    { AV_CODEC_ID_INDEO3, MKTAG('I', 'V', '3', '1') },
+    { AV_CODEC_ID_INDEO3, MKTAG('I', 'V', '3', '2') },
+    { AV_CODEC_ID_VP3, MKTAG('V', 'P', '3', '1') },
+    { AV_CODEC_ID_VP3, MKTAG('V', 'P', '3', '0') },
+    { AV_CODEC_ID_ASV1, MKTAG('A', 'S', 'V', '1') },
+    { AV_CODEC_ID_ASV2, MKTAG('A', 'S', 'V', '2') },
+    { AV_CODEC_ID_VCR1, MKTAG('V', 'C', 'R', '1') },
+    { AV_CODEC_ID_FFV1, MKTAG('F', 'F', 'V', '1') },
+    { AV_CODEC_ID_XAN_WC4, MKTAG('X', 'x', 'a', 'n') },
+    { AV_CODEC_ID_MSRLE, MKTAG('m', 'r', 'l', 'e') },
+    { AV_CODEC_ID_MSRLE, MKTAG(0x1, 0x0, 0x0, 0x0) },
+    { AV_CODEC_ID_MSVIDEO1, MKTAG('M', 'S', 'V', 'C') },
+    { AV_CODEC_ID_MSVIDEO1, MKTAG('m', 's', 'v', 'c') },
+    { AV_CODEC_ID_MSVIDEO1, MKTAG('C', 'R', 'A', 'M') },
+    { AV_CODEC_ID_MSVIDEO1, MKTAG('c', 'r', 'a', 'm') },
+    { AV_CODEC_ID_MSVIDEO1, MKTAG('W', 'H', 'A', 'M') },
+    { AV_CODEC_ID_MSVIDEO1, MKTAG('w', 'h', 'a', 'm') },
+    { AV_CODEC_ID_CINEPAK, MKTAG('c', 'v', 'i', 'd') },
+    { AV_CODEC_ID_TRUEMOTION1, MKTAG('D', 'U', 'C', 'K') },
+    { AV_CODEC_ID_MSZH, MKTAG('M', 'S', 'Z', 'H') },
+    { AV_CODEC_ID_ZLIB, MKTAG('Z', 'L', 'I', 'B') },
+    { AV_CODEC_ID_4XM, MKTAG('4', 'X', 'M', 'V') },
+    { AV_CODEC_ID_FLV1, MKTAG('F', 'L', 'V', '1') },
+    { AV_CODEC_ID_SVQ1, MKTAG('s', 'v', 'q', '1') },
+    { AV_CODEC_ID_TSCC, MKTAG('t', 's', 'c', 'c') },
+    { AV_CODEC_ID_ULTI, MKTAG('U', 'L', 'T', 'I') },
+    { AV_CODEC_ID_VIXL, MKTAG('V', 'I', 'X', 'L') },
+    { AV_CODEC_ID_QPEG, MKTAG('Q', 'P', 'E', 'G') },
+    { AV_CODEC_ID_QPEG, MKTAG('Q', '1', '.', '0') },
+    { AV_CODEC_ID_QPEG, MKTAG('Q', '1', '.', '1') },
+    { AV_CODEC_ID_WMV3, MKTAG('W', 'M', 'V', '3') },
+    { AV_CODEC_ID_LOCO, MKTAG('L', 'O', 'C', 'O') },
+    { AV_CODEC_ID_THEORA, MKTAG('t', 'h', 'e', 'o') },
 #if LIBAVCODEC_VERSION_INT>0x000409
-    { CODEC_ID_WNV1, MKTAG('W', 'N', 'V', '1') },
-    { CODEC_ID_AASC, MKTAG('A', 'A', 'S', 'C') },
-    { CODEC_ID_INDEO2, MKTAG('R', 'T', '2', '1') },
-    { CODEC_ID_FRAPS, MKTAG('F', 'P', 'S', '1') },
-    { CODEC_ID_TRUEMOTION2, MKTAG('T', 'M', '2', '0') },
+    { AV_CODEC_ID_WNV1, MKTAG('W', 'N', 'V', '1') },
+    { AV_CODEC_ID_AASC, MKTAG('A', 'A', 'S', 'C') },
+    { AV_CODEC_ID_INDEO2, MKTAG('R', 'T', '2', '1') },
+    { AV_CODEC_ID_FRAPS, MKTAG('F', 'P', 'S', '1') },
+    { AV_CODEC_ID_TRUEMOTION2, MKTAG('T', 'M', '2', '0') },
 #endif
 #if LIBAVCODEC_VERSION_INT>((50<<16)+(1<<8)+0)
-    { CODEC_ID_FLASHSV, MKTAG('F', 'S', 'V', '1') },
-    { CODEC_ID_JPEGLS,MKTAG('M', 'J', 'L', 'S') }, /* JPEG-LS custom FOURCC for avi - encoder */
-    { CODEC_ID_VC1, MKTAG('W', 'V', 'C', '1') },
-    { CODEC_ID_VC1, MKTAG('W', 'M', 'V', 'A') },
-    { CODEC_ID_CSCD, MKTAG('C', 'S', 'C', 'D') },
-    { CODEC_ID_ZMBV, MKTAG('Z', 'M', 'B', 'V') },
-    { CODEC_ID_KMVC, MKTAG('K', 'M', 'V', 'C') },
+    { AV_CODEC_ID_FLASHSV, MKTAG('F', 'S', 'V', '1') },
+    { AV_CODEC_ID_JPEGLS,MKTAG('M', 'J', 'L', 'S') }, /* JPEG-LS custom FOURCC for avi - encoder */
+    { AV_CODEC_ID_VC1, MKTAG('W', 'V', 'C', '1') },
+    { AV_CODEC_ID_VC1, MKTAG('W', 'M', 'V', 'A') },
+    { AV_CODEC_ID_CSCD, MKTAG('C', 'S', 'C', 'D') },
+    { AV_CODEC_ID_ZMBV, MKTAG('Z', 'M', 'B', 'V') },
+    { AV_CODEC_ID_KMVC, MKTAG('K', 'M', 'V', 'C') },
 #endif
 #if LIBAVCODEC_VERSION_INT>((51<<16)+(11<<8)+0)
-    { CODEC_ID_VP5, MKTAG('V', 'P', '5', '0') },
-    { CODEC_ID_VP6, MKTAG('V', 'P', '6', '0') },
-    { CODEC_ID_VP6, MKTAG('V', 'P', '6', '1') },
-    { CODEC_ID_VP6, MKTAG('V', 'P', '6', '2') },
-    { CODEC_ID_VP6F, MKTAG('V', 'P', '6', 'F') },
-    { CODEC_ID_JPEG2000, MKTAG('M', 'J', '2', 'C') },
-    { CODEC_ID_VMNC, MKTAG('V', 'M', 'n', 'c') },
+    { AV_CODEC_ID_VP5, MKTAG('V', 'P', '5', '0') },
+    { AV_CODEC_ID_VP6, MKTAG('V', 'P', '6', '0') },
+    { AV_CODEC_ID_VP6, MKTAG('V', 'P', '6', '1') },
+    { AV_CODEC_ID_VP6, MKTAG('V', 'P', '6', '2') },
+    { AV_CODEC_ID_VP6F, MKTAG('V', 'P', '6', 'F') },
+    { AV_CODEC_ID_JPEG2000, MKTAG('M', 'J', '2', 'C') },
+    { AV_CODEC_ID_VMNC, MKTAG('V', 'M', 'n', 'c') },
 #endif
 #if LIBAVCODEC_VERSION_INT>=((51<<16)+(49<<8)+0)
 // this tag seems not to exist in older versions of FFMPEG
-    { CODEC_ID_TARGA, MKTAG('t', 'g', 'a', ' ') },
+    { AV_CODEC_ID_TARGA, MKTAG('t', 'g', 'a', ' ') },
 #endif
-    { CODEC_ID_NONE, 0 },
+    { AV_CODEC_ID_NONE, 0 },
 };
diff --git a/modules/videoio/test/test_ffmpeg.cpp b/modules/videoio/test/test_ffmpeg.cpp
index 2f95cb21d1..353ca19de6 100644
--- a/modules/videoio/test/test_ffmpeg.cpp
+++ b/modules/videoio/test/test_ffmpeg.cpp
@@ -132,6 +132,7 @@ public:
                         writer << img;
                     }
 
+                    writer.release();
                     if (!created) created = true;
                     else remove(filename.c_str());
                 }
diff --git a/modules/viz/src/vizimpl.cpp b/modules/viz/src/vizimpl.cpp
index ab621ad16d..b2ec7603fc 100644
--- a/modules/viz/src/vizimpl.cpp
+++ b/modules/viz/src/vizimpl.cpp
@@ -85,7 +85,7 @@ void cv::viz::Viz3d::VizImpl::TimerCallback::Execute(vtkObject* caller, unsigned
 
 void cv::viz::Viz3d::VizImpl::ExitCallback::Execute(vtkObject*, unsigned long event_id, void*)
 {
-    if (event_id == vtkCommand::ExitEvent)
+    if (event_id == vtkCommand::ExitEvent && viz->interactor_)
     {
         viz->interactor_->TerminateApp();
         viz->interactor_ = 0;
diff --git a/modules/world/CMakeLists.txt b/modules/world/CMakeLists.txt
index ea0df5bc98..8a4170a79b 100644
--- a/modules/world/CMakeLists.txt
+++ b/modules/world/CMakeLists.txt
@@ -9,6 +9,10 @@ else()
   set(OPENCV_WORLD_FLAGS_PROPERTY LINK_FLAGS)
 endif()
 
+function(include_one_module m)  # Includes the CMakeLists.txt found under OPENCV_MODULE_<m>_LOCATION for module m. NOTE(review): presumably wrapped in a function so the include runs in its own variable scope -- confirm.
+  include("${OPENCV_MODULE_${m}_LOCATION}/CMakeLists.txt")
+endfunction()
+
 if(NOT OPENCV_INITIAL_PASS)
   project(opencv_world)
 
@@ -18,7 +22,7 @@ if(NOT OPENCV_INITIAL_PASS)
       message(STATUS "    module ${m}...")
       set(CMAKE_CURRENT_SOURCE_DIR ${OPENCV_MODULE_${m}_LOCATION})
       #add_subdirectory("${OPENCV_MODULE_${m}_LOCATION}" ${CMAKE_CURRENT_BINARY_DIR}/${m})
-      include("${OPENCV_MODULE_${m}_LOCATION}/CMakeLists.txt")
+      include_one_module(${m})
     endif()
   endforeach()
   message(STATUS "Processing WORLD modules... DONE")
diff --git a/platforms/android/service/engine/AndroidManifest.xml b/platforms/android/service/engine/AndroidManifest.xml
index 4f78c314a8..40adb98d74 100644
--- a/platforms/android/service/engine/AndroidManifest.xml
+++ b/platforms/android/service/engine/AndroidManifest.xml
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="utf-8"?>
 <manifest xmlns:android="http://schemas.android.com/apk/res/android"
-    package="org.opencv.engine"
+    package="org.opencv.engine3"
     android:versionCode="300@ANDROID_PLATFORM_VERSION_CODE@"
     android:versionName="3.00" >
 
@@ -13,12 +13,12 @@
 
     <service android:exported="true" android:name="OpenCVEngineService" android:process=":OpenCVEngineProcess">
         <intent-filter>
-            <action android:name="org.opencv.engine.BIND"></action>
+            <action android:name="org.opencv.engine3.BIND"></action>
         </intent-filter>
     </service>
 
     <activity
-        android:name="org.opencv.engine.manager.ManagerActivity"
+        android:name="org.opencv.engine3.manager.ManagerActivity"
         android:label="@string/app_name"
         android:screenOrientation="portrait">
         <intent-filter>
diff --git a/platforms/android/service/engine/build.xml b/platforms/android/service/engine/build.xml
index 98ddc3eac1..47a283d8f4 100644
--- a/platforms/android/service/engine/build.xml
+++ b/platforms/android/service/engine/build.xml
@@ -1,5 +1,5 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<project name="OpenCV Manager" default="help">
+<project name="OpenCV3 Manager" default="help">
 
     <!-- The local.properties file is created and updated by the 'android' tool.
          It contains the path to the SDK. It should *NOT* be checked into
diff --git a/platforms/android/service/engine/jni/JNIWrapper/HardwareDetector_jni.cpp b/platforms/android/service/engine/jni/JNIWrapper/HardwareDetector_jni.cpp
index 3e490a3160..0145abb8ce 100644
--- a/platforms/android/service/engine/jni/JNIWrapper/HardwareDetector_jni.cpp
+++ b/platforms/android/service/engine/jni/JNIWrapper/HardwareDetector_jni.cpp
@@ -3,23 +3,23 @@
 #include <jni.h>
 #include <string>
 
-JNIEXPORT jint JNICALL Java_org_opencv_engine_HardwareDetector_GetCpuID(JNIEnv* , jclass)
+JNIEXPORT jint JNICALL Java_org_opencv_engine3_HardwareDetector_GetCpuID(JNIEnv* , jclass) // JNI export for HardwareDetector.GetCpuID in package org.opencv.engine3; ignores both JNI arguments and returns the result of the native GetCpuID().
 {
     return GetCpuID();
 }
 
-JNIEXPORT jstring JNICALL Java_org_opencv_engine_HardwareDetector_GetPlatformName(JNIEnv* env, jclass)
+JNIEXPORT jstring JNICALL Java_org_opencv_engine3_HardwareDetector_GetPlatformName(JNIEnv* env, jclass) // JNI export for HardwareDetector.GetPlatformName in package org.opencv.engine3; copies the std::string from the native GetPlatformName() into a new Java string via env->NewStringUTF.
 {
     std::string hardware_name = GetPlatformName();
     return env->NewStringUTF(hardware_name.c_str());
 }
 
-JNIEXPORT jint JNICALL Java_org_opencv_engine_HardwareDetector_GetProcessorCount(JNIEnv* , jclass)
+JNIEXPORT jint JNICALL Java_org_opencv_engine3_HardwareDetector_GetProcessorCount(JNIEnv* , jclass) // JNI export for HardwareDetector.GetProcessorCount in package org.opencv.engine3; ignores both JNI arguments and returns the result of the native GetProcessorCount().
 {
     return GetProcessorCount();
 }
 
-JNIEXPORT jint JNICALL Java_org_opencv_engine_HardwareDetector_DetectKnownPlatforms(JNIEnv* , jclass)
+JNIEXPORT jint JNICALL Java_org_opencv_engine3_HardwareDetector_DetectKnownPlatforms(JNIEnv* , jclass) // JNI export for HardwareDetector.DetectKnownPlatforms in package org.opencv.engine3; ignores both JNI arguments and returns the result of the native DetectKnownPlatforms().
 {
     return DetectKnownPlatforms();
 }
diff --git a/platforms/android/service/engine/jni/JNIWrapper/HardwareDetector_jni.h b/platforms/android/service/engine/jni/JNIWrapper/HardwareDetector_jni.h
index 43fad33c86..61f294e99f 100644
--- a/platforms/android/service/engine/jni/JNIWrapper/HardwareDetector_jni.h
+++ b/platforms/android/service/engine/jni/JNIWrapper/HardwareDetector_jni.h
@@ -14,7 +14,7 @@ extern "C" {
  * Method:    GetCpuID
  * Signature: ()I
  */
-JNIEXPORT jint JNICALL Java_org_opencv_engine_HardwareDetector_GetCpuID
+JNIEXPORT jint JNICALL Java_org_opencv_engine3_HardwareDetector_GetCpuID
 (JNIEnv *, jclass);
 
 /*
@@ -22,7 +22,7 @@ JNIEXPORT jint JNICALL Java_org_opencv_engine_HardwareDetector_GetCpuID
  * Method:    GetPlatformName
  * Signature: ()Ljava/lang/String;
  */
-JNIEXPORT jstring JNICALL Java_org_opencv_engine_HardwareDetector_GetPlatformName
+JNIEXPORT jstring JNICALL Java_org_opencv_engine3_HardwareDetector_GetPlatformName
 (JNIEnv *, jclass);
 
 /*
@@ -30,7 +30,7 @@ JNIEXPORT jstring JNICALL Java_org_opencv_engine_HardwareDetector_GetPlatformNam
  * Method:    GetProcessorCount
  * Signature: ()I
  */
-JNIEXPORT jint JNICALL Java_org_opencv_engine_HardwareDetector_GetProcessorCount
+JNIEXPORT jint JNICALL Java_org_opencv_engine3_HardwareDetector_GetProcessorCount
 (JNIEnv *, jclass);
 
 /*
@@ -38,7 +38,7 @@ JNIEXPORT jint JNICALL Java_org_opencv_engine_HardwareDetector_GetProcessorCount
  * Method:    DetectKnownPlatforms
  * Signature: ()I
  */
-JNIEXPORT jint JNICALL Java_org_opencv_engine_HardwareDetector_DetectKnownPlatforms
+JNIEXPORT jint JNICALL Java_org_opencv_engine3_HardwareDetector_DetectKnownPlatforms
 (JNIEnv *, jclass);
 
 #ifdef __cplusplus
diff --git a/platforms/android/service/engine/jni/JNIWrapper/OpenCVEngine_jni.cpp b/platforms/android/service/engine/jni/JNIWrapper/OpenCVEngine_jni.cpp
index dac4916563..1dd038125a 100644
--- a/platforms/android/service/engine/jni/JNIWrapper/OpenCVEngine_jni.cpp
+++ b/platforms/android/service/engine/jni/JNIWrapper/OpenCVEngine_jni.cpp
@@ -15,7 +15,7 @@ using namespace android;
 sp<IBinder> OpenCVEngineBinder = NULL;
 IPackageManager* PackageManager = NULL;
 
-JNIEXPORT jobject JNICALL Java_org_opencv_engine_BinderConnector_Connect(JNIEnv* env, jobject)
+JNIEXPORT jobject JNICALL Java_org_opencv_engine3_BinderConnector_Connect(JNIEnv* env, jobject)
 {
     LOGI("Creating new component");
     if (NULL != OpenCVEngineBinder.get())
@@ -30,7 +30,7 @@ JNIEXPORT jobject JNICALL Java_org_opencv_engine_BinderConnector_Connect(JNIEnv*
     return javaObjectForIBinder(env, OpenCVEngineBinder);
 }
 
-JNIEXPORT jboolean JNICALL Java_org_opencv_engine_BinderConnector_Init(JNIEnv* env, jobject , jobject market)
+JNIEXPORT jboolean JNICALL Java_org_opencv_engine3_BinderConnector_Init(JNIEnv* env, jobject , jobject market)
 {
     LOGD("Java_org_opencv_engine_BinderConnector_Init");
 
@@ -58,7 +58,7 @@ JNIEXPORT jboolean JNICALL Java_org_opencv_engine_BinderConnector_Init(JNIEnv* e
     }
 }
 
-JNIEXPORT void JNICALL Java_org_opencv_engine_BinderConnector_Final(JNIEnv *, jobject)
+JNIEXPORT void JNICALL Java_org_opencv_engine3_BinderConnector_Final(JNIEnv *, jobject)
 {
     LOGD("Java_org_opencv_engine_BinderConnector_Final");
 
diff --git a/platforms/android/service/engine/jni/JNIWrapper/OpenCVEngine_jni.h b/platforms/android/service/engine/jni/JNIWrapper/OpenCVEngine_jni.h
index cd0734eb07..bfeafb0498 100644
--- a/platforms/android/service/engine/jni/JNIWrapper/OpenCVEngine_jni.h
+++ b/platforms/android/service/engine/jni/JNIWrapper/OpenCVEngine_jni.h
@@ -12,7 +12,7 @@ extern "C" {
  * Method:    Connect
  * Signature: ()Landroid/os/IBinder;
  */
-JNIEXPORT jobject JNICALL Java_org_opencv_engine_BinderConnector_Connect
+JNIEXPORT jobject JNICALL Java_org_opencv_engine3_BinderConnector_Connect
   (JNIEnv *, jobject);
 
 /*
@@ -20,7 +20,7 @@ JNIEXPORT jobject JNICALL Java_org_opencv_engine_BinderConnector_Connect
  * Method:    Init
  * Signature: (Lorg/opencv/engine/MarketConnector;)Z
  */
-JNIEXPORT jboolean JNICALL Java_org_opencv_engine_BinderConnector_Init
+JNIEXPORT jboolean JNICALL Java_org_opencv_engine3_BinderConnector_Init
   (JNIEnv *, jobject, jobject);
 
 /*
@@ -28,7 +28,7 @@ JNIEXPORT jboolean JNICALL Java_org_opencv_engine_BinderConnector_Init
  * Method:    Final
  * Signature: ()V
  */
-JNIEXPORT void JNICALL Java_org_opencv_engine_BinderConnector_Final
+JNIEXPORT void JNICALL Java_org_opencv_engine3_BinderConnector_Final
   (JNIEnv *, jobject);
 
 #ifdef __cplusplus
diff --git a/platforms/android/service/engine/jni/JNIWrapper/OpenCVLibraryInfo.cpp b/platforms/android/service/engine/jni/JNIWrapper/OpenCVLibraryInfo.cpp
index e7dc6d2f14..f1c5ec19a8 100644
--- a/platforms/android/service/engine/jni/JNIWrapper/OpenCVLibraryInfo.cpp
+++ b/platforms/android/service/engine/jni/JNIWrapper/OpenCVLibraryInfo.cpp
@@ -3,7 +3,7 @@
 #include <utils/Log.h>
 #include <dlfcn.h>
 
-JNIEXPORT jlong JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_open
+JNIEXPORT jlong JNICALL Java_org_opencv_engine3_OpenCVLibraryInfo_open
   (JNIEnv * env, jobject, jstring str)
 {
     const char* infoLibPath = env->GetStringUTFChars(str, NULL);
@@ -21,7 +21,7 @@ JNIEXPORT jlong JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_open
     return (jlong)handle;
 }
 
-JNIEXPORT jstring JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_getPackageName
+JNIEXPORT jstring JNICALL Java_org_opencv_engine3_OpenCVLibraryInfo_getPackageName
   (JNIEnv* env, jobject, jlong handle)
 {
     InfoFunctionType info_func;
@@ -41,7 +41,7 @@ JNIEXPORT jstring JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_getPackageNam
     return env->NewStringUTF(result);
 }
 
-JNIEXPORT jstring JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_getLibraryList
+JNIEXPORT jstring JNICALL Java_org_opencv_engine3_OpenCVLibraryInfo_getLibraryList
   (JNIEnv* env, jobject, jlong handle)
 {
     InfoFunctionType info_func;
@@ -61,7 +61,7 @@ JNIEXPORT jstring JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_getLibraryLis
     return env->NewStringUTF(result);
 }
 
-JNIEXPORT jstring JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_getVersionName
+JNIEXPORT jstring JNICALL Java_org_opencv_engine3_OpenCVLibraryInfo_getVersionName
   (JNIEnv* env, jobject, jlong handle)
 {
     InfoFunctionType info_func;
@@ -81,7 +81,7 @@ JNIEXPORT jstring JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_getVersionNam
     return env->NewStringUTF(result);
 }
 
-JNIEXPORT void JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_close
+JNIEXPORT void JNICALL Java_org_opencv_engine3_OpenCVLibraryInfo_close
   (JNIEnv*, jobject, jlong handle)
 {
     dlclose((void*)handle);
diff --git a/platforms/android/service/engine/jni/JNIWrapper/OpenCVLibraryInfo.h b/platforms/android/service/engine/jni/JNIWrapper/OpenCVLibraryInfo.h
index b02050ffde..574f0b4e0b 100644
--- a/platforms/android/service/engine/jni/JNIWrapper/OpenCVLibraryInfo.h
+++ b/platforms/android/service/engine/jni/JNIWrapper/OpenCVLibraryInfo.h
@@ -6,19 +6,19 @@
 extern "C" {
 #endif
 
-JNIEXPORT jlong JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_open
+JNIEXPORT jlong JNICALL Java_org_opencv_engine3_OpenCVLibraryInfo_open
   (JNIEnv *, jobject, jstring);
 
-JNIEXPORT jstring JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_getPackageName
+JNIEXPORT jstring JNICALL Java_org_opencv_engine3_OpenCVLibraryInfo_getPackageName
   (JNIEnv *, jobject, jlong);
 
-JNIEXPORT jstring JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_getLibraryList
+JNIEXPORT jstring JNICALL Java_org_opencv_engine3_OpenCVLibraryInfo_getLibraryList
   (JNIEnv *, jobject, jlong);
 
-JNIEXPORT jstring JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_getVersionName
+JNIEXPORT jstring JNICALL Java_org_opencv_engine3_OpenCVLibraryInfo_getVersionName
   (JNIEnv *, jobject, jlong);
 
-JNIEXPORT void JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_close
+JNIEXPORT void JNICALL Java_org_opencv_engine3_OpenCVLibraryInfo_close
   (JNIEnv *, jobject, jlong);
 
 #ifdef __cplusplus
diff --git a/platforms/android/service/engine/jni/NativeService/PackageInfo.cpp b/platforms/android/service/engine/jni/NativeService/PackageInfo.cpp
index d831bf7a5c..d95f9f943e 100644
--- a/platforms/android/service/engine/jni/NativeService/PackageInfo.cpp
+++ b/platforms/android/service/engine/jni/NativeService/PackageInfo.cpp
@@ -27,7 +27,7 @@ map<int, string> PackageInfo::InitPlatformNameMap()
 
 const map<int, string> PackageInfo::PlatformNameMap = InitPlatformNameMap();
 const string PackageInfo::BasePackageName = "org.opencv.lib";
-const string  DEFAULT_ENGINE_INSTALL_PATH = "/data/data/org.opencv.engine";
+const string  DEFAULT_ENGINE_INSTALL_PATH = "/data/data/org.opencv.engine3";
 
 inline string JoinARMFeatures(int cpu_id)
 {
diff --git a/platforms/android/service/engine/jni/include/EngineCommon.h b/platforms/android/service/engine/jni/include/EngineCommon.h
index a03f02c68d..2948db8c1a 100644
--- a/platforms/android/service/engine/jni/include/EngineCommon.h
+++ b/platforms/android/service/engine/jni/include/EngineCommon.h
@@ -13,9 +13,9 @@
 #define LIB_OPENCV_INFO_NAME "libopencv_info.so"
 
 // OpenCV Manager package name
-#define OPENCV_ENGINE_PACKAGE "org.opencv.engine"
+#define OPENCV_ENGINE_PACKAGE "org.opencv.engine3"
 // Class name of OpenCV engine binder object. Is needned for connection to service
-#define OPECV_ENGINE_CLASSNAME "org.opencv.engine.OpenCVEngineInterface"
+#define OPECV_ENGINE_CLASSNAME "org.opencv.engine3.OpenCVEngineInterface"
 
 typedef const char* (*InfoFunctionType)();
 
diff --git a/platforms/android/service/engine/res/values/strings.xml b/platforms/android/service/engine/res/values/strings.xml
index a13432623f..d05333607a 100644
--- a/platforms/android/service/engine/res/values/strings.xml
+++ b/platforms/android/service/engine/res/values/strings.xml
@@ -1,4 +1,4 @@
 <?xml version="1.0" encoding="utf-8"?>
 <resources>
-    <string name="app_name">OpenCV Manager</string>
-</resources>
\ No newline at end of file
+    <string name="app_name">OpenCV3 Manager</string>
+</resources>
diff --git a/platforms/android/service/engine/src/org/opencv/engine/BinderConnector.java b/platforms/android/service/engine/src/org/opencv/engine3/BinderConnector.java
similarity index 97%
rename from platforms/android/service/engine/src/org/opencv/engine/BinderConnector.java
rename to platforms/android/service/engine/src/org/opencv/engine3/BinderConnector.java
index bde54d5b96..a54843354e 100644
--- a/platforms/android/service/engine/src/org/opencv/engine/BinderConnector.java
+++ b/platforms/android/service/engine/src/org/opencv/engine3/BinderConnector.java
@@ -1,4 +1,4 @@
-package org.opencv.engine;
+package org.opencv.engine3;
 
 import android.os.IBinder;
 
diff --git a/platforms/android/service/engine/src/org/opencv/engine/HardwareDetector.java b/platforms/android/service/engine/src/org/opencv/engine3/HardwareDetector.java
similarity index 98%
rename from platforms/android/service/engine/src/org/opencv/engine/HardwareDetector.java
rename to platforms/android/service/engine/src/org/opencv/engine3/HardwareDetector.java
index f115070aad..65ee243d3f 100644
--- a/platforms/android/service/engine/src/org/opencv/engine/HardwareDetector.java
+++ b/platforms/android/service/engine/src/org/opencv/engine3/HardwareDetector.java
@@ -1,4 +1,4 @@
-package org.opencv.engine;
+package org.opencv.engine3;
 
 public class HardwareDetector
 {
diff --git a/platforms/android/service/engine/src/org/opencv/engine/MarketConnector.java b/platforms/android/service/engine/src/org/opencv/engine3/MarketConnector.java
similarity index 97%
rename from platforms/android/service/engine/src/org/opencv/engine/MarketConnector.java
rename to platforms/android/service/engine/src/org/opencv/engine3/MarketConnector.java
index da595915f2..4e5f51acc5 100644
--- a/platforms/android/service/engine/src/org/opencv/engine/MarketConnector.java
+++ b/platforms/android/service/engine/src/org/opencv/engine3/MarketConnector.java
@@ -1,4 +1,4 @@
-package org.opencv.engine;
+package org.opencv.engine3;
 
 import java.util.ArrayList;
 import java.util.Iterator;
@@ -99,7 +99,7 @@ public class MarketConnector
         List<PackageInfo> AllPackages = mContext.getPackageManager().getInstalledPackages(PackageManager.GET_CONFIGURATIONS);
         List<PackageInfo> OpenCVPackages = new ArrayList<PackageInfo>();
         try {
-            OpenCVPackages.add(mContext.getPackageManager().getPackageInfo("org.opencv.engine", PackageManager.GET_CONFIGURATIONS));
+            OpenCVPackages.add(mContext.getPackageManager().getPackageInfo("org.opencv.engine3", PackageManager.GET_CONFIGURATIONS));
         } catch (NameNotFoundException e) {
             Log.e(TAG, "OpenCV Manager package info was not found!");
             e.printStackTrace();
diff --git a/platforms/android/service/engine/src/org/opencv/engine/OpenCVEngineInterface.aidl b/platforms/android/service/engine/src/org/opencv/engine3/OpenCVEngineInterface.aidl
similarity index 97%
rename from platforms/android/service/engine/src/org/opencv/engine/OpenCVEngineInterface.aidl
rename to platforms/android/service/engine/src/org/opencv/engine3/OpenCVEngineInterface.aidl
index 13e0f7f84f..2b957d4b07 100644
--- a/platforms/android/service/engine/src/org/opencv/engine/OpenCVEngineInterface.aidl
+++ b/platforms/android/service/engine/src/org/opencv/engine3/OpenCVEngineInterface.aidl
@@ -1,4 +1,4 @@
-package org.opencv.engine;
+package org.opencv.engine3;
 
 /**
 * Class provides Java interface to OpenCV Engine Service. Is synchronious with native OpenCVEngine class.
diff --git a/platforms/android/service/engine/src/org/opencv/engine/OpenCVEngineService.java b/platforms/android/service/engine/src/org/opencv/engine3/OpenCVEngineService.java
similarity index 98%
rename from platforms/android/service/engine/src/org/opencv/engine/OpenCVEngineService.java
rename to platforms/android/service/engine/src/org/opencv/engine3/OpenCVEngineService.java
index b3c4ea0575..c7df4a8117 100644
--- a/platforms/android/service/engine/src/org/opencv/engine/OpenCVEngineService.java
+++ b/platforms/android/service/engine/src/org/opencv/engine3/OpenCVEngineService.java
@@ -1,4 +1,4 @@
-package org.opencv.engine;
+package org.opencv.engine3;
 
 import android.app.Service;
 import android.content.Intent;
diff --git a/platforms/android/service/engine/src/org/opencv/engine/OpenCVLibraryInfo.java b/platforms/android/service/engine/src/org/opencv/engine3/OpenCVLibraryInfo.java
similarity index 97%
rename from platforms/android/service/engine/src/org/opencv/engine/OpenCVLibraryInfo.java
rename to platforms/android/service/engine/src/org/opencv/engine3/OpenCVLibraryInfo.java
index d0f67bfde0..cc36b152ad 100644
--- a/platforms/android/service/engine/src/org/opencv/engine/OpenCVLibraryInfo.java
+++ b/platforms/android/service/engine/src/org/opencv/engine3/OpenCVLibraryInfo.java
@@ -1,4 +1,4 @@
-package org.opencv.engine;
+package org.opencv.engine3;
 
 public class OpenCVLibraryInfo {
     public OpenCVLibraryInfo(String packagePath) {
diff --git a/platforms/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java b/platforms/android/service/engine/src/org/opencv/engine3/manager/ManagerActivity.java
similarity index 97%
rename from platforms/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java
rename to platforms/android/service/engine/src/org/opencv/engine3/manager/ManagerActivity.java
index b4e0be5a91..7308e848eb 100644
--- a/platforms/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java
+++ b/platforms/android/service/engine/src/org/opencv/engine3/manager/ManagerActivity.java
@@ -1,15 +1,15 @@
-package org.opencv.engine.manager;
+package org.opencv.engine3.manager;
 
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.StringTokenizer;
 
-import org.opencv.engine.HardwareDetector;
-import org.opencv.engine.MarketConnector;
-import org.opencv.engine.OpenCVEngineInterface;
-import org.opencv.engine.OpenCVEngineService;
-import org.opencv.engine.OpenCVLibraryInfo;
-import org.opencv.engine.R;
+import org.opencv.engine3.HardwareDetector;
+import org.opencv.engine3.MarketConnector;
+import org.opencv.engine3.OpenCVEngineInterface;
+import org.opencv.engine3.OpenCVEngineService;
+import org.opencv.engine3.OpenCVLibraryInfo;
+import org.opencv.engine3.R;
 import android.annotation.TargetApi;
 import android.app.Activity;
 import android.app.AlertDialog;
@@ -161,7 +161,7 @@ public class ManagerActivity extends Activity
         mUpdateEngineButton.setOnClickListener(new OnClickListener() {
 
             public void onClick(View v) {
-                if (!mMarket.InstallAppFromMarket("org.opencv.engine"))
+                if (!mMarket.InstallAppFromMarket("org.opencv.engine3"))
                 {
                     Toast toast = Toast.makeText(getApplicationContext(), "Google Play is not avaliable", Toast.LENGTH_SHORT);
                     toast.show();
@@ -207,7 +207,7 @@ public class ManagerActivity extends Activity
 
             public void onItemClick(AdapterView<?> adapter, View view, int position, long id) {
                 //if (!mListViewItems.get((int) id).get("Name").equals("Built-in OpenCV library"));
-                if (!mInstalledPackageInfo[(int) id].packageName.equals("org.opencv.engine"))
+                if (!mInstalledPackageInfo[(int) id].packageName.equals("org.opencv.engine3"))
                 {
                     mInstalledPackageView.setTag(Integer.valueOf((int)id));
                     mActionDialog.show();
@@ -221,7 +221,7 @@ public class ManagerActivity extends Activity
             public void onReceive(Context context, Intent intent) {
                 Log.d("OpenCVManager/Receiver", "Broadcast message " + intent.getAction() + " receiver");
                 Log.d("OpenCVManager/Receiver", "Filling package list on broadcast message");
-                if (!bindService(new Intent("org.opencv.engine.BIND"),
+                if (!bindService(new Intent("org.opencv.engine3.BIND"),
                      new OpenCVEngineServiceConnection(), Context.BIND_AUTO_CREATE))
                 {
                     TextView EngineVersionView = (TextView)findViewById(R.id.EngineVersionValue);
@@ -350,7 +350,7 @@ public class ManagerActivity extends Activity
                 else
                     NativeLibDir = "/data/data/" + mInstalledPackageInfo[i].packageName + "/lib";
 
-                if (PackageName.equals("org.opencv.engine"))
+                if (PackageName.equals("org.opencv.engine3"))
                 {
                     OpenCVLibraryInfo NativeInfo = new OpenCVLibraryInfo(NativeLibDir);
                     if (NativeInfo.status())
diff --git a/platforms/android/service/engine/src/org/opencv/engine/manager/PackageListAdapter.java b/platforms/android/service/engine/src/org/opencv/engine3/manager/PackageListAdapter.java
similarity index 96%
rename from platforms/android/service/engine/src/org/opencv/engine/manager/PackageListAdapter.java
rename to platforms/android/service/engine/src/org/opencv/engine3/manager/PackageListAdapter.java
index 17707efd36..4a929ae952 100644
--- a/platforms/android/service/engine/src/org/opencv/engine/manager/PackageListAdapter.java
+++ b/platforms/android/service/engine/src/org/opencv/engine3/manager/PackageListAdapter.java
@@ -1,4 +1,4 @@
-package org.opencv.engine.manager;
+package org.opencv.engine3.manager;
 
 import java.util.List;
 import java.util.Map;
diff --git a/platforms/android/service/engine_test/AndroidManifest.xml b/platforms/android/service/engine_test/AndroidManifest.xml
index 5779d90a62..5ee354225a 100644
--- a/platforms/android/service/engine_test/AndroidManifest.xml
+++ b/platforms/android/service/engine_test/AndroidManifest.xml
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="utf-8"?>
 <manifest xmlns:android="http://schemas.android.com/apk/res/android"
-    package="org.opencv.engine.test"
+    package="org.opencv.engine3.test"
     android:versionCode="1"
     android:versionName="1.0" >
 
@@ -8,7 +8,7 @@
 
     <instrumentation
         android:name="android.test.InstrumentationTestRunner"
-        android:targetPackage="org.opencv.engine" />
+        android:targetPackage="org.opencv.engine3" />
 
     <application
         android:icon="@drawable/ic_launcher"
@@ -17,4 +17,4 @@
 
     </application>
 
-</manifest>
\ No newline at end of file
+</manifest>
diff --git a/platforms/android/service/engine_test/src/org/opencv/engine/test/EngineInterfaceTest.java b/platforms/android/service/engine_test/src/org/opencv/engine3/test/EngineInterfaceTest.java
similarity index 91%
rename from platforms/android/service/engine_test/src/org/opencv/engine/test/EngineInterfaceTest.java
rename to platforms/android/service/engine_test/src/org/opencv/engine3/test/EngineInterfaceTest.java
index d67f22faf2..b45af98a37 100644
--- a/platforms/android/service/engine_test/src/org/opencv/engine/test/EngineInterfaceTest.java
+++ b/platforms/android/service/engine_test/src/org/opencv/engine3/test/EngineInterfaceTest.java
@@ -1,7 +1,7 @@
-package org.opencv.engine.test;
+package org.opencv.engine3.test;
 
-import org.opencv.engine.OpenCVEngineInterface;
-import org.opencv.engine.OpenCVEngineService;
+import org.opencv.engine3.OpenCVEngineInterface;
+import org.opencv.engine3.OpenCVEngineService;
 
 import android.content.Intent;
 import android.os.IBinder;
@@ -18,7 +18,7 @@ public class EngineInterfaceTest extends ServiceTestCase<OpenCVEngineService>
 
     public void testVersion() throws RemoteException
     {
-        IBinder ServiceBinder = bindService(new Intent("org.opencv.engine.BIND"));
+        IBinder ServiceBinder = bindService(new Intent("org.opencv.engine3.BIND"));
         assertNotNull(ServiceBinder);
         OpenCVEngineInterface ServiceObj = OpenCVEngineInterface.Stub.asInterface(ServiceBinder);
         assertNotNull(ServiceObj);
@@ -28,7 +28,7 @@ public class EngineInterfaceTest extends ServiceTestCase<OpenCVEngineService>
 
     public void testInstallVersion() throws RemoteException
     {
-        IBinder ServiceBinder = bindService(new Intent("org.opencv.engine"));
+        IBinder ServiceBinder = bindService(new Intent("org.opencv.engine3"));
         assertNotNull(ServiceBinder);
         OpenCVEngineInterface ServiceObj = OpenCVEngineInterface.Stub.asInterface(ServiceBinder);
         assertNotNull(ServiceObj);
@@ -37,7 +37,7 @@ public class EngineInterfaceTest extends ServiceTestCase<OpenCVEngineService>
 
     public void testGetPathForExistVersion() throws RemoteException
     {
-        IBinder ServiceBinder = bindService(new Intent("org.opencv.engine"));
+        IBinder ServiceBinder = bindService(new Intent("org.opencv.engine3"));
         assertNotNull(ServiceBinder);
         OpenCVEngineInterface ServiceObj = OpenCVEngineInterface.Stub.asInterface(ServiceBinder);
         assertNotNull(ServiceObj);
@@ -46,7 +46,7 @@ public class EngineInterfaceTest extends ServiceTestCase<OpenCVEngineService>
 
     public void testGetPathForUnExistVersion() throws RemoteException
     {
-        IBinder ServiceBinder = bindService(new Intent("org.opencv.engine"));
+        IBinder ServiceBinder = bindService(new Intent("org.opencv.engine3"));
         assertNotNull(ServiceBinder);
         OpenCVEngineInterface ServiceObj = OpenCVEngineInterface.Stub.asInterface(ServiceBinder);
         assertNotNull(ServiceObj);
@@ -55,7 +55,7 @@ public class EngineInterfaceTest extends ServiceTestCase<OpenCVEngineService>
 
     public void testInstallAndGetVersion() throws RemoteException
     {
-        IBinder ServiceBinder = bindService(new Intent("org.opencv.engine"));
+        IBinder ServiceBinder = bindService(new Intent("org.opencv.engine3"));
         assertNotNull(ServiceBinder);
         OpenCVEngineInterface ServiceObj = OpenCVEngineInterface.Stub.asInterface(ServiceBinder);
         assertNotNull(ServiceObj);
diff --git a/samples/cpp/CMakeLists.txt b/samples/cpp/CMakeLists.txt
index 4dfb65d21f..ff5caa9fa8 100644
--- a/samples/cpp/CMakeLists.txt
+++ b/samples/cpp/CMakeLists.txt
@@ -15,7 +15,7 @@ if(BUILD_EXAMPLES AND OCV_DEPENDENCIES_FOUND)
   project(cpp_samples)
 
   ocv_include_directories("${OpenCV_SOURCE_DIR}/include")#for opencv.hpp
-  ocv_include_modules(${OPENCV_CPP_SAMPLES_REQUIRED_DEPS})
+  ocv_include_modules_recurse(${OPENCV_CPP_SAMPLES_REQUIRED_DEPS})
 
   if(HAVE_opencv_cudaoptflow)
     ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/cudaoptflow/include")
diff --git a/samples/cpp/example_cmake/CMakeLists.txt b/samples/cpp/example_cmake/CMakeLists.txt
new file mode 100644
index 0000000000..fe7e629812
--- /dev/null
+++ b/samples/cpp/example_cmake/CMakeLists.txt
@@ -0,0 +1,28 @@
+# cmake needs this line
+cmake_minimum_required(VERSION 2.8)
+
+# Define project name
+project(opencv_example_project)
+
+# Find OpenCV. You may need to set the OpenCV_DIR variable
+# to the absolute path of the directory containing the OpenCVConfig.cmake file,
+# via the command line or the GUI
+find_package(OpenCV REQUIRED)
+
+# If the package has been found, several variables will
+# be set; you can find the full list with descriptions
+# in the OpenCVConfig.cmake file.
+# Print a status message showing some of them
+message(STATUS "OpenCV library status:")
+message(STATUS "    version: ${OpenCV_VERSION}")
+message(STATUS "    libraries: ${OpenCV_LIBS}")
+message(STATUS "    include path: ${OpenCV_INCLUDE_DIRS}")
+
+# Add OpenCV headers location to your include paths
+include_directories(${OpenCV_INCLUDE_DIRS})
+
+# Declare the executable target built from your sources
+add_executable(opencv_example example.cpp)
+
+# Link your application with OpenCV libraries
+target_link_libraries(opencv_example ${OpenCV_LIBS})
diff --git a/samples/cpp/example_cmake/example.cpp b/samples/cpp/example_cmake/example.cpp
new file mode 100644
index 0000000000..cac5050b2d
--- /dev/null
+++ b/samples/cpp/example_cmake/example.cpp
@@ -0,0 +1,50 @@
+#include "opencv2/core.hpp"
+#include "opencv2/imgproc.hpp"
+#include "opencv2/highgui.hpp"
+#include "opencv2/videoio.hpp"
+#include <iostream>
+
+using namespace cv;
+using namespace std;
+
+void drawText(Mat & image);
+
+int main() // Demo entry point: shows captioned frames from capture source 0 in a "Sample" window, or a captioned blank image when that source cannot be opened.
+{
+    cout << "Built with OpenCV " << CV_VERSION << endl; // report the OpenCV version this binary was compiled against
+    Mat image;
+    VideoCapture capture;
+    capture.open(0); // try to open the capture source with index 0
+    if(capture.isOpened())
+    {
+        cout << "Capture is opened" << endl;
+        for(;;) // runs until one of the two breaks below fires
+        {
+            capture >> image; // fetch the next frame into image
+            if(image.empty()) // no frame was delivered: leave the loop
+                break;
+            drawText(image);
+            imshow("Sample", image);
+            if(waitKey(10) >= 0) // a non-negative result ends the loop (presumably a key press during the 10 ms wait -- confirm against the waitKey docs)
+                break;
+        }
+    }
+    else
+    {
+        cout << "No capture" << endl;
+        image = Mat::zeros(480, 640, CV_8UC1); // fallback: zero-filled 480-row by 640-column CV_8UC1 image
+        drawText(image);
+        imshow("Sample", image);
+        waitKey(0); // NOTE(review): presumably blocks until a key is pressed so the window stays visible -- confirm
+    }
+    return 0; // both branches finish with exit status 0
+}
+
+void drawText(Mat & image) // Draws the fixed caption "Hello OpenCV" onto the given image in place via putText.
+{
+    putText(image, "Hello OpenCV",
+            Point(20, 50), // position argument for the text (presumably its origin in pixel coordinates -- confirm against putText docs)
+            FONT_HERSHEY_COMPLEX, 1, // font face and scale
+            Scalar(255, 255, 255), // white
+            1, LINE_AA); // line thickness and type
+}
diff --git a/samples/cpp/openni_capture.cpp b/samples/cpp/openni_capture.cpp
index 64aa90bc69..09f1d21e09 100644
--- a/samples/cpp/openni_capture.cpp
+++ b/samples/cpp/openni_capture.cpp
@@ -13,14 +13,14 @@ static void help()
                         "The user gets some of the supported output images.\n"
             "\nAll supported output map types:\n"
             "1.) Data given from depth generator\n"
-            "   CV_CAP_OPENNI_DEPTH_MAP            - depth values in mm (CV_16UC1)\n"
-            "   CV_CAP_OPENNI_POINT_CLOUD_MAP      - XYZ in meters (CV_32FC3)\n"
-            "   CV_CAP_OPENNI_DISPARITY_MAP        - disparity in pixels (CV_8UC1)\n"
-            "   CV_CAP_OPENNI_DISPARITY_MAP_32F    - disparity in pixels (CV_32FC1)\n"
-            "   CV_CAP_OPENNI_VALID_DEPTH_MASK     - mask of valid pixels (not ocluded, not shaded etc.) (CV_8UC1)\n"
+            "   CAP_OPENNI_DEPTH_MAP            - depth values in mm (CV_16UC1)\n"
+            "   CAP_OPENNI_POINT_CLOUD_MAP      - XYZ in meters (CV_32FC3)\n"
+            "   CAP_OPENNI_DISPARITY_MAP        - disparity in pixels (CV_8UC1)\n"
+            "   CAP_OPENNI_DISPARITY_MAP_32F    - disparity in pixels (CV_32FC1)\n"
+            "   CAP_OPENNI_VALID_DEPTH_MASK     - mask of valid pixels (not ocluded, not shaded etc.) (CV_8UC1)\n"
             "2.) Data given from RGB image generator\n"
-            "   CV_CAP_OPENNI_BGR_IMAGE            - color image (CV_8UC3)\n"
-            "   CV_CAP_OPENNI_GRAY_IMAGE           - gray image (CV_8UC1)\n"
+            "   CAP_OPENNI_BGR_IMAGE            - color image (CV_8UC3)\n"
+            "   CAP_OPENNI_GRAY_IMAGE           - gray image (CV_8UC1)\n"
          << endl;
 }
 
@@ -89,8 +89,8 @@ static void printCommandLineParams()
 {
     cout << "-cd       Colorized disparity? (0 or 1; 1 by default) Ignored if disparity map is not selected to show." << endl;
     cout << "-fmd      Fixed max disparity? (0 or 1; 0 by default) Ignored if disparity map is not colorized (-cd 0)." << endl;
-    cout << "-mode     image mode: resolution and fps, supported three values:  0 - CV_CAP_OPENNI_VGA_30HZ, 1 - CV_CAP_OPENNI_SXGA_15HZ," << endl;
-    cout << "          2 - CV_CAP_OPENNI_SXGA_30HZ (0 by default). Ignored if rgb image or gray image are not selected to show." << endl;
+    cout << "-mode     image mode: resolution and fps, supported three values:  0 - CAP_OPENNI_VGA_30HZ, 1 - CAP_OPENNI_SXGA_15HZ," << endl;
+    cout << "          2 - CAP_OPENNI_SXGA_30HZ (0 by default). Ignored if rgb image or gray image are not selected to show." << endl;
     cout << "-m        Mask to set which output images are need. It is a string of size 5. Each element of this is '0' or '1' and" << endl;
     cout << "          determine: is depth map, disparity map, valid pixels mask, rgb image, gray image need or not (correspondently)?" << endl ;
     cout << "          By default -m 01010 i.e. disparity map and rgb image will be shown." << endl ;
diff --git a/samples/cpp/tutorial_code/ShapeDescriptors/findContours_demo.cpp b/samples/cpp/tutorial_code/ShapeDescriptors/findContours_demo.cpp
index cd29b1c2e7..6a6de95394 100644
--- a/samples/cpp/tutorial_code/ShapeDescriptors/findContours_demo.cpp
+++ b/samples/cpp/tutorial_code/ShapeDescriptors/findContours_demo.cpp
@@ -27,8 +27,13 @@ void thresh_callback(int, void* );
  */
 int main( int, char** argv )
 {
-  /// Load source image and convert it to gray
-  src = imread( argv[1], 1 );
+  /// Load source image
+  src = imread(argv[1]);
+  if (src.empty())
+  {
+    cerr << "No image supplied ..." << endl;
+    return -1;
+  }
 
   /// Convert image to gray and blur it
   cvtColor( src, src_gray, COLOR_BGR2GRAY );
diff --git a/samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp b/samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp
index 34e2504c6e..8059a4aec5 100644
--- a/samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp
+++ b/samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp
@@ -34,7 +34,8 @@ public:
 
     void write(FileStorage& fs) const                        //Write serialization for this class
     {
-        fs << "{" << "BoardSize_Width"  << boardSize.width
+        fs << "{"
+                  << "BoardSize_Width"  << boardSize.width
                   << "BoardSize_Height" << boardSize.height
                   << "Square_Size"         << squareSize
                   << "Calibrate_Pattern" << patternToUse
@@ -43,8 +44,8 @@ public:
                   << "Calibrate_AssumeZeroTangentialDistortion" << calibZeroTangentDist
                   << "Calibrate_FixPrincipalPointAtTheCenter" << calibFixPrincipalPoint
 
-                  << "Write_DetectedFeaturePoints" << bwritePoints
-                  << "Write_extrinsicParameters"   << bwriteExtrinsics
+                  << "Write_DetectedFeaturePoints" << writePoints
+                  << "Write_extrinsicParameters"   << writeExtrinsics
                   << "Write_outputFileName"  << outputFileName
 
                   << "Show_UndistortedImage" << showUndistorsed
@@ -62,8 +63,8 @@ public:
         node["Square_Size"]  >> squareSize;
         node["Calibrate_NrOfFrameToUse"] >> nrFrames;
         node["Calibrate_FixAspectRatio"] >> aspectRatio;
-        node["Write_DetectedFeaturePoints"] >> bwritePoints;
-        node["Write_extrinsicParameters"] >> bwriteExtrinsics;
+        node["Write_DetectedFeaturePoints"] >> writePoints;
+        node["Write_extrinsicParameters"] >> writeExtrinsics;
         node["Write_outputFileName"] >> outputFileName;
         node["Calibrate_AssumeZeroTangentialDistortion"] >> calibZeroTangentDist;
         node["Calibrate_FixPrincipalPointAtTheCenter"] >> calibFixPrincipalPoint;
@@ -71,9 +72,9 @@ public:
         node["Show_UndistortedImage"] >> showUndistorsed;
         node["Input"] >> input;
         node["Input_Delay"] >> delay;
-        interprate();
+        validate();
     }
-    void interprate()
+    void validate()
     {
         goodInput = true;
         if (boardSize.width <= 0 || boardSize.height <= 0)
@@ -105,10 +106,10 @@ public:
             else
             {
                 if (readStringList(input, imageList))
-                    {
-                        inputType = IMAGE_LIST;
-                        nrFrames = (nrFrames < (int)imageList.size()) ? nrFrames : (int)imageList.size();
-                    }
+                {
+                    inputType = IMAGE_LIST;
+                    nrFrames = (nrFrames < (int)imageList.size()) ? nrFrames : (int)imageList.size();
+                }
                 else
                     inputType = VIDEO_FILE;
             }
@@ -121,7 +122,7 @@ public:
         }
         if (inputType == INVALID)
         {
-            cerr << " Inexistent input: " << input;
+            cerr << " Input does not exist: " << input;
             goodInput = false;
         }
 
@@ -136,10 +137,10 @@ public:
         if (!patternToUse.compare("CIRCLES_GRID")) calibrationPattern = CIRCLES_GRID;
         if (!patternToUse.compare("ASYMMETRIC_CIRCLES_GRID")) calibrationPattern = ASYMMETRIC_CIRCLES_GRID;
         if (calibrationPattern == NOT_EXISTING)
-            {
-                cerr << " Inexistent camera calibration mode: " << patternToUse << endl;
-                goodInput = false;
-            }
+        {
+            cerr << " Camera calibration mode does not exist: " << patternToUse << endl;
+            goodInput = false;
+        }
         atImageList = 0;
 
     }
@@ -152,7 +153,7 @@ public:
             inputCapture >> view0;
             view0.copyTo(result);
         }
-        else if( atImageList < (int)imageList.size() )
+        else if( atImageList < imageList.size() )
             result = imread(imageList[atImageList++], IMREAD_COLOR);
 
         return result;
@@ -173,26 +174,24 @@ public:
         return true;
     }
 public:
-    Size boardSize;            // The size of the board -> Number of items by width and height
-    Pattern calibrationPattern;// One of the Chessboard, circles, or asymmetric circle pattern
-    float squareSize;          // The size of a square in your defined unit (point, millimeter,etc).
-    int nrFrames;              // The number of frames to use from the input for calibration
-    float aspectRatio;         // The aspect ratio
-    int delay;                 // In case of a video input
-    bool bwritePoints;         //  Write detected feature points
-    bool bwriteExtrinsics;     // Write extrinsic parameters
-    bool calibZeroTangentDist; // Assume zero tangential distortion
-    bool calibFixPrincipalPoint;// Fix the principal point at the center
-    bool flipVertical;          // Flip the captured images around the horizontal axis
-    string outputFileName;      // The name of the file where to write
-    bool showUndistorsed;       // Show undistorted images after calibration
-    string input;               // The input ->
-
-
+    Size boardSize;              // The size of the board -> Number of items by width and height
+    Pattern calibrationPattern;  // One of the Chessboard, circles, or asymmetric circle pattern
+    float squareSize;            // The size of a square in your defined unit (point, millimeter,etc).
+    int nrFrames;                // The number of frames to use from the input for calibration
+    float aspectRatio;           // The aspect ratio
+    int delay;                   // In case of a video input
+    bool writePoints;            // Write detected feature points
+    bool writeExtrinsics;        // Write extrinsic parameters
+    bool calibZeroTangentDist;   // Assume zero tangential distortion
+    bool calibFixPrincipalPoint; // Fix the principal point at the center
+    bool flipVertical;           // Flip the captured images around the horizontal axis
+    string outputFileName;       // The name of the file where to write
+    bool showUndistorsed;        // Show undistorted images after calibration
+    string input;                // The input ->
 
     int cameraID;
     vector<string> imageList;
-    int atImageList;
+    size_t atImageList;
     VideoCapture inputCapture;
     InputType inputType;
     bool goodInput;
@@ -204,7 +203,7 @@ private:
 
 };
 
-static void read(const FileNode& node, Settings& x, const Settings& default_value = Settings())
+static inline void read(const FileNode& node, Settings& x, const Settings& default_value = Settings())
 {
     if(node.empty())
         x = default_value;
@@ -212,6 +211,11 @@ static void read(const FileNode& node, Settings& x, const Settings& default_valu
         x.read(node);
 }
 
+static inline void write(FileStorage& fs, const String&, const Settings& s ) // the String argument is unnamed and unused
+{
+    s.write(fs); // forwards to Settings::write, which emits the settings into fs
+}
+
 enum { DETECTION = 0, CAPTURING = 1, CALIBRATED = 2 };
 
 bool runCalibrationAndSave(Settings& s, Size imageSize, Mat&  cameraMatrix, Mat& distCoeffs,
@@ -220,6 +224,8 @@ bool runCalibrationAndSave(Settings& s, Size imageSize, Mat&  cameraMatrix, Mat&
 int main(int argc, char* argv[])
 {
     help();
+
+    //! [file_read]
     Settings s;
     const string inputSettingsFile = argc > 1 ? argv[1] : "default.xml";
     FileStorage fs(inputSettingsFile, FileStorage::READ); // Read the settings
@@ -230,6 +236,10 @@ int main(int argc, char* argv[])
     }
     fs["Settings"] >> s;
     fs.release();                                         // close Settings file
+    //! [file_read]
+
+    //FileStorage fout("settings.yml", FileStorage::WRITE); // write config as YAML
+    //fout << "Settings" << s;
 
     if (!s.goodInput)
     {
@@ -245,32 +255,35 @@ int main(int argc, char* argv[])
     const Scalar RED(0,0,255), GREEN(0,255,0);
     const char ESC_KEY = 27;
 
-    for(int i = 0;;++i)
+    //! [get_input]
+    for(;;)
     {
-      Mat view;
-      bool blinkOutput = false;
+        Mat view;
+        bool blinkOutput = false;
 
-      view = s.nextImage();
+        view = s.nextImage();
 
-      //-----  If no more image, or got enough, then stop calibration and show result -------------
-      if( mode == CAPTURING && imagePoints.size() >= (unsigned)s.nrFrames )
-      {
+        //-----  If no more image, or got enough, then stop calibration and show result -------------
+        if( mode == CAPTURING && imagePoints.size() >= (size_t)s.nrFrames )
+        {
           if( runCalibrationAndSave(s, imageSize,  cameraMatrix, distCoeffs, imagePoints))
               mode = CALIBRATED;
           else
               mode = DETECTION;
-      }
-      if(view.empty())          // If no more images then run calibration, save and stop loop.
-      {
-            if( imagePoints.size() > 0 )
+        }
+        if(view.empty())          // If there are no more images stop the loop
+        {
+            // if calibration threshold was not reached yet, calibrate now
+            if( mode != CALIBRATED && !imagePoints.empty() )
                 runCalibrationAndSave(s, imageSize,  cameraMatrix, distCoeffs, imagePoints);
             break;
-      }
-
+        }
+        //! [get_input]
 
         imageSize = view.size();  // Format input image.
         if( s.flipVertical )    flip( view, view, 0 );
 
+        //! [find_pattern]
         vector<Point2f> pointBuf;
 
         bool found;
@@ -290,7 +303,8 @@ int main(int argc, char* argv[])
             found = false;
             break;
         }
-
+        //! [find_pattern]
+        //! [pattern_found]
         if ( found)                // If done with success,
         {
               // improve the found corners' coordinate accuracy for chessboard
@@ -313,8 +327,9 @@ int main(int argc, char* argv[])
                 // Draw the corners.
                 drawChessboardCorners( view, s.boardSize, Mat(pointBuf), found );
         }
-
+        //! [pattern_found]
         //----------------------------- Output Text ------------------------------------------------
+        //! [output_text]
         string msg = (mode == CAPTURING) ? "100/100" :
                       mode == CALIBRATED ? "Calibrated" : "Press 'g' to start";
         int baseLine = 0;
@@ -333,15 +348,17 @@ int main(int argc, char* argv[])
 
         if( blinkOutput )
             bitwise_not(view, view);
-
+        //! [output_text]
         //------------------------- Video capture  output  undistorted ------------------------------
+        //! [output_undistorted]
         if( mode == CALIBRATED && s.showUndistorsed )
         {
             Mat temp = view.clone();
             undistort(temp, view, cameraMatrix, distCoeffs);
         }
-
+        //! [output_undistorted]
         //------------------------------ Show image and check for input commands -------------------
+        //! [await_input]
         imshow("Image View", view);
         char key = (char)waitKey(s.inputCapture.isOpened() ? 50 : s.delay);
 
@@ -356,9 +373,11 @@ int main(int argc, char* argv[])
             mode = CAPTURING;
             imagePoints.clear();
         }
+        //! [await_input]
     }
 
     // -----------------------Show the undistorted image for the image list ------------------------
+    //! [show_results]
     if( s.inputType == Settings::IMAGE_LIST && s.showUndistorsed )
     {
         Mat view, rview, map1, map2;
@@ -366,7 +385,7 @@ int main(int argc, char* argv[])
             getOptimalNewCameraMatrix(cameraMatrix, distCoeffs, imageSize, 1, imageSize, 0),
             imageSize, CV_16SC2, map1, map2);
 
-        for(int i = 0; i < (int)s.imageList.size(); i++ )
+        for(size_t i = 0; i < s.imageList.size(); i++ )
         {
             view = imread(s.imageList[i], 1);
             if(view.empty())
@@ -378,11 +397,12 @@ int main(int argc, char* argv[])
                 break;
         }
     }
-
+    //! [show_results]
 
     return 0;
 }
 
+//! [compute_errors]
 static double computeReprojectionErrors( const vector<vector<Point3f> >& objectPoints,
                                          const vector<vector<Point2f> >& imagePoints,
                                          const vector<Mat>& rvecs, const vector<Mat>& tvecs,
@@ -390,17 +410,16 @@ static double computeReprojectionErrors( const vector<vector<Point3f> >& objectP
                                          vector<float>& perViewErrors)
 {
     vector<Point2f> imagePoints2;
-    int i, totalPoints = 0;
+    size_t totalPoints = 0;
     double totalErr = 0, err;
     perViewErrors.resize(objectPoints.size());
 
-    for( i = 0; i < (int)objectPoints.size(); ++i )
+    for(size_t i = 0; i < objectPoints.size(); ++i )
     {
-        projectPoints( Mat(objectPoints[i]), rvecs[i], tvecs[i], cameraMatrix,
-                       distCoeffs, imagePoints2);
-        err = norm(Mat(imagePoints[i]), Mat(imagePoints2), NORM_L2);
+        projectPoints(objectPoints[i], rvecs[i], tvecs[i], cameraMatrix, distCoeffs, imagePoints2);
+        err = norm(imagePoints[i], imagePoints2, NORM_L2);
 
-        int n = (int)objectPoints[i].size();
+        size_t n = objectPoints[i].size();
         perViewErrors[i] = (float) std::sqrt(err*err/n);
         totalErr        += err*err;
         totalPoints     += n;
@@ -408,7 +427,8 @@ static double computeReprojectionErrors( const vector<vector<Point3f> >& objectP
 
     return std::sqrt(totalErr/totalPoints);
 }
-
+//! [compute_errors]
+//! [board_corners]
 static void calcBoardCornerPositions(Size boardSize, float squareSize, vector<Point3f>& corners,
                                      Settings::Pattern patternType /*= Settings::CHESSBOARD*/)
 {
@@ -420,28 +440,28 @@ static void calcBoardCornerPositions(Size boardSize, float squareSize, vector<Po
     case Settings::CIRCLES_GRID:
         for( int i = 0; i < boardSize.height; ++i )
             for( int j = 0; j < boardSize.width; ++j )
-                corners.push_back(Point3f(float( j*squareSize ), float( i*squareSize ), 0));
+                corners.push_back(Point3f(j*squareSize, i*squareSize, 0));
         break;
 
     case Settings::ASYMMETRIC_CIRCLES_GRID:
         for( int i = 0; i < boardSize.height; i++ )
             for( int j = 0; j < boardSize.width; j++ )
-                corners.push_back(Point3f(float((2*j + i % 2)*squareSize), float(i*squareSize), 0));
+                corners.push_back(Point3f((2*j + i % 2)*squareSize, i*squareSize, 0));
         break;
     default:
         break;
     }
 }
-
+//! [board_corners]
 static bool runCalibration( Settings& s, Size& imageSize, Mat& cameraMatrix, Mat& distCoeffs,
                             vector<vector<Point2f> > imagePoints, vector<Mat>& rvecs, vector<Mat>& tvecs,
                             vector<float>& reprojErrs,  double& totalAvgErr)
 {
-
+    //! [fixed_aspect]
     cameraMatrix = Mat::eye(3, 3, CV_64F);
     if( s.flag & CALIB_FIX_ASPECT_RATIO )
-        cameraMatrix.at<double>(0,0) = 1.0;
-
+        cameraMatrix.at<double>(0,0) = s.aspectRatio;
+    //! [fixed_aspect]
     distCoeffs = Mat::zeros(8, 1, CV_64F);
 
     vector<vector<Point3f> > objectPoints(1);
@@ -475,49 +495,48 @@ static void saveCameraParams( Settings& s, Size& imageSize, Mat& cameraMatrix, M
     time( &tm );
     struct tm *t2 = localtime( &tm );
     char buf[1024];
-    strftime( buf, sizeof(buf)-1, "%c", t2 );
+    strftime( buf, sizeof(buf), "%c", t2 );
 
-    fs << "calibration_Time" << buf;
+    fs << "calibration_time" << buf;
 
     if( !rvecs.empty() || !reprojErrs.empty() )
-        fs << "nrOfFrames" << (int)std::max(rvecs.size(), reprojErrs.size());
-    fs << "image_Width" << imageSize.width;
-    fs << "image_Height" << imageSize.height;
-    fs << "board_Width" << s.boardSize.width;
-    fs << "board_Height" << s.boardSize.height;
-    fs << "square_Size" << s.squareSize;
+        fs << "nr_of_frames" << (int)std::max(rvecs.size(), reprojErrs.size());
+    fs << "image_width" << imageSize.width;
+    fs << "image_height" << imageSize.height;
+    fs << "board_width" << s.boardSize.width;
+    fs << "board_height" << s.boardSize.height;
+    fs << "square_size" << s.squareSize;
 
     if( s.flag & CALIB_FIX_ASPECT_RATIO )
-        fs << "FixAspectRatio" << s.aspectRatio;
+        fs << "fix_aspect_ratio" << s.aspectRatio;
 
-    if( s.flag )
+    if (s.flag)
     {
-        sprintf( buf, "flags: %s%s%s%s",
-            s.flag & CALIB_USE_INTRINSIC_GUESS ? " +use_intrinsic_guess" : "",
-            s.flag & CALIB_FIX_ASPECT_RATIO ? " +fix_aspectRatio" : "",
-            s.flag & CALIB_FIX_PRINCIPAL_POINT ? " +fix_principal_point" : "",
-            s.flag & CALIB_ZERO_TANGENT_DIST ? " +zero_tangent_dist" : "" );
-        //cvWriteComment( *fs, buf, 0 );
-
+        sprintf(buf, "flags: %s%s%s%s",
+                s.flag & CALIB_USE_INTRINSIC_GUESS ? " +use_intrinsic_guess" : "",
+                s.flag & CALIB_FIX_ASPECT_RATIO ? " +fix_aspect_ratio" : "",
+                s.flag & CALIB_FIX_PRINCIPAL_POINT ? " +fix_principal_point" : "",
+                s.flag & CALIB_ZERO_TANGENT_DIST ? " +zero_tangent_dist" : "");
+        cvWriteComment(*fs, buf, 0);
     }
 
-    fs << "flagValue" << s.flag;
+    fs << "flags" << s.flag;
 
-    fs << "Camera_Matrix" << cameraMatrix;
-    fs << "Distortion_Coefficients" << distCoeffs;
+    fs << "camera_matrix" << cameraMatrix;
+    fs << "distortion_coefficients" << distCoeffs;
 
-    fs << "Avg_Reprojection_Error" << totalAvgErr;
-    if( !reprojErrs.empty() )
-        fs << "Per_View_Reprojection_Errors" << Mat(reprojErrs);
+    fs << "avg_reprojection_error" << totalAvgErr;
+    if (s.writeExtrinsics && !reprojErrs.empty())
+        fs << "per_view_reprojection_errors" << Mat(reprojErrs);
 
-    if( !rvecs.empty() && !tvecs.empty() )
+    if(s.writeExtrinsics && !rvecs.empty() && !tvecs.empty() )
     {
         CV_Assert(rvecs[0].type() == tvecs[0].type());
         Mat bigmat((int)rvecs.size(), 6, rvecs[0].type());
-        for( int i = 0; i < (int)rvecs.size(); i++ )
+        for( size_t i = 0; i < rvecs.size(); i++ )
         {
-            Mat r = bigmat(Range(i, i+1), Range(0,3));
-            Mat t = bigmat(Range(i, i+1), Range(3,6));
+            Mat r = bigmat(Range(int(i), int(i+1)), Range(0,3));
+            Mat t = bigmat(Range(int(i), int(i+1)), Range(3,6));
 
             CV_Assert(rvecs[i].rows == 3 && rvecs[i].cols == 1);
             CV_Assert(tvecs[i].rows == 3 && tvecs[i].cols == 1);
@@ -526,35 +545,38 @@ static void saveCameraParams( Settings& s, Size& imageSize, Mat& cameraMatrix, M
             t = tvecs[i].t();
         }
         //cvWriteComment( *fs, "a set of 6-tuples (rotation vector + translation vector) for each view", 0 );
-        fs << "Extrinsic_Parameters" << bigmat;
+        fs << "extrinsic_parameters" << bigmat;
     }
 
-    if( !imagePoints.empty() )
+    if(s.writePoints && !imagePoints.empty() )
     {
         Mat imagePtMat((int)imagePoints.size(), (int)imagePoints[0].size(), CV_32FC2);
-        for( int i = 0; i < (int)imagePoints.size(); i++ )
+        for( size_t i = 0; i < imagePoints.size(); i++ )
         {
-            Mat r = imagePtMat.row(i).reshape(2, imagePtMat.cols);
+            Mat r = imagePtMat.row(int(i)).reshape(2, imagePtMat.cols);
             Mat imgpti(imagePoints[i]);
             imgpti.copyTo(r);
         }
-        fs << "Image_points" << imagePtMat;
+        fs << "image_points" << imagePtMat;
     }
 }
 
-bool runCalibrationAndSave(Settings& s, Size imageSize, Mat&  cameraMatrix, Mat& distCoeffs,vector<vector<Point2f> > imagePoints )
+//! [run_and_save]
+bool runCalibrationAndSave(Settings& s, Size imageSize, Mat& cameraMatrix, Mat& distCoeffs,
+                           vector<vector<Point2f> > imagePoints)
 {
     vector<Mat> rvecs, tvecs;
     vector<float> reprojErrs;
     double totalAvgErr = 0;
 
-    bool ok = runCalibration(s,imageSize, cameraMatrix, distCoeffs, imagePoints, rvecs, tvecs,
-                             reprojErrs, totalAvgErr);
+    bool ok = runCalibration(s, imageSize, cameraMatrix, distCoeffs, imagePoints, rvecs, tvecs, reprojErrs,
+                             totalAvgErr);
     cout << (ok ? "Calibration succeeded" : "Calibration failed")
-        << ". avg re projection error = "  << totalAvgErr ;
+         << ". avg re projection error = " << totalAvgErr << endl;
 
-    if( ok )
-        saveCameraParams( s, imageSize, cameraMatrix, distCoeffs, rvecs ,tvecs, reprojErrs,
-                            imagePoints, totalAvgErr);
+    if (ok)
+        saveCameraParams(s, imageSize, cameraMatrix, distCoeffs, rvecs, tvecs, reprojErrs, imagePoints,
+                         totalAvgErr);
     return ok;
 }
+//! [run_and_save]
diff --git a/samples/directx/CMakeLists.txt b/samples/directx/CMakeLists.txt
index 15a6575374..21ada16ed8 100644
--- a/samples/directx/CMakeLists.txt
+++ b/samples/directx/CMakeLists.txt
@@ -8,7 +8,7 @@ if(BUILD_EXAMPLES AND OCV_DEPENDENCIES_FOUND)
 
   project("${project}_samples")
 
-  ocv_include_modules(${OPENCV_DIRECTX_SAMPLES_REQUIRED_DEPS})
+  ocv_include_modules_recurse(${OPENCV_DIRECTX_SAMPLES_REQUIRED_DEPS})
 
   # ---------------------------------------------
   #      Define executable targets
diff --git a/samples/gpu/CMakeLists.txt b/samples/gpu/CMakeLists.txt
index 8741f11701..32c53ecf1f 100644
--- a/samples/gpu/CMakeLists.txt
+++ b/samples/gpu/CMakeLists.txt
@@ -13,7 +13,7 @@ if(BUILD_EXAMPLES AND OCV_DEPENDENCIES_FOUND)
 
   project("${project}_samples")
 
-  ocv_include_modules(${OPENCV_CUDA_SAMPLES_REQUIRED_DEPS})
+  ocv_include_modules_recurse(${OPENCV_CUDA_SAMPLES_REQUIRED_DEPS})
   ocv_include_directories(
     "${OpenCV_SOURCE_DIR}/modules/gpu/src/nvidia"
     "${OpenCV_SOURCE_DIR}/modules/gpu/src/nvidia/core"
diff --git a/samples/gpu/video_writer.cpp b/samples/gpu/video_writer.cpp
index 5a48c69aab..6c5d1412d6 100644
--- a/samples/gpu/video_writer.cpp
+++ b/samples/gpu/video_writer.cpp
@@ -69,7 +69,8 @@ int main(int argc, const char* argv[])
         {
             std::cout << "Open CUDA Writer" << std::endl;
 
-            d_writer = cv::cudacodec::createVideoWriter("output_gpu.avi", frame.size(), FPS);
+            const cv::String outputFilename = "output_gpu.avi";
+            d_writer = cv::cudacodec::createVideoWriter(outputFilename, frame.size(), FPS);
         }
 
         d_frame.upload(frame);
diff --git a/samples/tapi/CMakeLists.txt b/samples/tapi/CMakeLists.txt
index 9c69ab0a25..6f8d873efe 100644
--- a/samples/tapi/CMakeLists.txt
+++ b/samples/tapi/CMakeLists.txt
@@ -8,7 +8,7 @@ if(BUILD_EXAMPLES AND OCV_DEPENDENCIES_FOUND)
 
   project("${project}_samples")
 
-  ocv_include_modules(${OPENCV_TAPI_SAMPLES_REQUIRED_DEPS})
+  ocv_include_modules_recurse(${OPENCV_TAPI_SAMPLES_REQUIRED_DEPS})
 
   # ---------------------------------------------
   #      Define executable targets
diff --git a/samples/winrt/OcvImageProcessing/OcvImageProcessing/MainPage.xaml.cpp b/samples/winrt/OcvImageProcessing/OcvImageProcessing/MainPage.xaml.cpp
index 2e91eb156b..fc7440fb29 100644
--- a/samples/winrt/OcvImageProcessing/OcvImageProcessing/MainPage.xaml.cpp
+++ b/samples/winrt/OcvImageProcessing/OcvImageProcessing/MainPage.xaml.cpp
@@ -10,6 +10,10 @@
 #include <Robuffer.h>
 #include <vector>
 #include <opencv2\imgproc\types_c.h>
+#include <opencv2\imgcodecs\imgcodecs.hpp>
+#include <opencv2\core\core.hpp>
+
+#include <windows.storage.h>
 
 using namespace OcvImageProcessing;
 
@@ -18,6 +22,7 @@ using namespace concurrency;
 using namespace Platform;
 using namespace Windows::Foundation;
 using namespace Windows::Storage::Streams;
+using namespace Windows::Storage;
 using namespace Windows::UI::Xaml::Media::Imaging;
 using namespace Windows::Graphics::Imaging;
 using namespace Windows::Foundation::Collections;
@@ -37,6 +42,17 @@ MainPage::MainPage()
 {
     InitializeComponent();
 
+#ifdef __OPENCV_IMGCODECS_HPP__
+
+    // Image loading OpenCV way ... way more simple
+    cv::Mat image = cv::imread("Assets/Lena.png");
+    Lena = cv::Mat(image.rows, image.cols, CV_8UC4);
+    cvtColor(image, Lena, CV_BGR2BGRA);
+    UpdateImage(Lena);
+
+#else
+
+    // Image loading WinRT way
     RandomAccessStreamReference^ streamRef = RandomAccessStreamReference::CreateFromUri(InputImageUri);
 
     task<IRandomAccessStreamWithContentType^> (streamRef->OpenReadAsync()).
@@ -68,6 +84,67 @@ MainPage::MainPage()
         memcpy(Lena.data, srcPixels->Data, 4*frameWidth*frameHeight);
         UpdateImage(Lena);
     });
+
+#endif
+}
+
+/// <summary>Temporary file creation example. Will be created in WinRT application temporary directory
+/// which usually is "C:\Users\{username}\AppData\Local\Packages\{package_id}\TempState\{random_name}.{suffix}"
+/// </summary>
+/// <param name="suffix">Temporary file suffix, e.g. "tmp"</param>
+/// <returns>The string produced by cv::tempfile for the given suffix</returns>
+std::string OcvImageProcessing::MainPage::CreateTempFile(const std::string &suffix) {
+    return cv::tempfile(suffix.c_str());
+}
+
+/// <summary>
+/// Creating/writing a file in the application local directory
+/// </summary>
+/// <param name="image">Image to save</param>
+/// <returns>Result of cv::imwrite when writing "Lena.png" into the local folder</returns>
+bool OcvImageProcessing::MainPage::SaveImage(cv::Mat image) {
+    StorageFolder^ localFolderRT = ApplicationData::Current->LocalFolder;
+    cv::String localFile = ConvertPath(localFolderRT->Path) + "\\Lena.png";
+    return cv::imwrite(localFile, image);
+}
+
+/// <summary>
+/// Converts a managed Platform::String path to a narrow cv::String via std::wstring.
+/// Provides an example of three ways to do it.
+/// Can't use this one: https://msdn.microsoft.com/en-us/library/bb384865.aspx, not available on WinRT.
+/// </summary>
+/// <param name="path">Path to be converted</param>
+cv::String OcvImageProcessing::MainPage::ConvertPath(Platform::String^ path) {
+    std::wstring localPathW(path->Begin());
+
+    // Opt #1
+    //std::string localPath(localPathW.begin(), localPathW.end());
+
+    // Opt #2
+    //std::string localPath(StrToWStr(localPathW));
+
+    // Opt #3
+    // A wide character may need up to MB_CUR_MAX bytes; the vector owns the buffer, so nothing leaks.
+    size_t outSize = localPathW.length() * MB_CUR_MAX + 1;
+    std::vector<char> localPathC(outSize, '\0');
+    size_t charsConverted = 0;
+    wcstombs_s(&charsConverted, &localPathC[0], outSize, localPathW.c_str(), _TRUNCATE);
+
+    // The buffer starts zero-filled, so it holds a terminated C string whatever wcstombs_s wrote
+    return cv::String(&localPathC[0]);
+}
+
+// Despite its name, converts a wide string into a UTF-8 (CP_UTF8) encoded std::string.
+std::string OcvImageProcessing::MainPage::StrToWStr(const std::wstring &input) {
+    if (input.empty()) return std::string();
+
+    const wchar_t* wide = &input[0];
+    const int wideLen = (int)input.size();
+    // The first call only measures the required byte count; the second call fills the buffer.
+    const int utf8Len = WideCharToMultiByte(CP_UTF8, 0, wide, wideLen, NULL, 0, NULL, NULL);
+    std::string utf8(utf8Len, 0);
+    WideCharToMultiByte(CP_UTF8, 0, wide, wideLen, &utf8[0], utf8Len, NULL, NULL);
+    return utf8;
+}
 
 /// <summary>
@@ -91,15 +168,16 @@ void OcvImageProcessing::MainPage::UpdateImage(const cv::Mat& image)
 
     // Obtain IBufferByteAccess
     ComPtr<IBufferByteAccess> pBufferByteAccess;
-    ComPtr<IUnknown> pBuffer((IUnknown*)buffer);
+    ComPtr<IInspectable> pBuffer((IInspectable*)buffer);
     pBuffer.As(&pBufferByteAccess);
 
     // Get pointer to pixel bytes
     pBufferByteAccess->Buffer(&dstPixels);
-    memcpy(dstPixels, image.data, 4*image.cols*image.rows);
+    memcpy(dstPixels, image.data, image.step.buf[1]*image.cols*image.rows);
 
     // Set the bitmap to the Image element
-    PreviewWidget->Source = bitmap;}
+    PreviewWidget->Source = bitmap;
+}
 
 
 cv::Mat OcvImageProcessing::MainPage::ApplyGrayFilter(const cv::Mat& image)
diff --git a/samples/winrt/OcvImageProcessing/OcvImageProcessing/MainPage.xaml.h b/samples/winrt/OcvImageProcessing/OcvImageProcessing/MainPage.xaml.h
index 79c1ac74c6..bb7c4c33d5 100644
--- a/samples/winrt/OcvImageProcessing/OcvImageProcessing/MainPage.xaml.h
+++ b/samples/winrt/OcvImageProcessing/OcvImageProcessing/MainPage.xaml.h
@@ -39,6 +39,11 @@ namespace OcvImageProcessing
         cv::Mat ApplySepiaFilter(const cv::Mat& image);
 
         void UpdateImage(const cv::Mat& image);
+        std::string CreateTempFile(const std::string &suffix);
+        bool SaveImage(cv::Mat image);
+
+        std::string StrToWStr(const std::wstring &wstr);
+        cv::String ConvertPath(Platform::String^ path);
 
         cv::Mat Lena;
         unsigned int frameWidth, frameHeight;
diff --git a/samples/winrt/OcvImageProcessing/OcvImageProcessing/opencv.props b/samples/winrt/OcvImageProcessing/OcvImageProcessing/opencv.props
index 40eaffd1f0..64b0ac98aa 100644
--- a/samples/winrt/OcvImageProcessing/OcvImageProcessing/opencv.props
+++ b/samples/winrt/OcvImageProcessing/OcvImageProcessing/opencv.props
@@ -17,6 +17,9 @@
     <None Include="$(OpenCV_Bin)opencv_imgproc300$(DebugSuffix).dll">
       <DeploymentContent>true</DeploymentContent>
     </None>
+    <None Include="$(OpenCV_Bin)opencv_imgcodecs300$(DebugSuffix).dll">
+      <DeploymentContent>true</DeploymentContent>
+    </None>
     <None Include="$(OpenCV_Bin)opencv_features2d300$(DebugSuffix).dll">
       <DeploymentContent>true</DeploymentContent>
     </None>
@@ -33,7 +36,7 @@
     </ClCompile>
     <Link>
       <!--Add required OpenCV libs here-->
-      <AdditionalDependencies>opencv_core300$(DebugSuffix).lib;opencv_imgproc300$(DebugSuffix).lib;opencv_features2d300$(DebugSuffix).lib;opencv_flann300$(DebugSuffix).lib;opencv_ml300$(DebugSuffix).lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>opencv_core300$(DebugSuffix).lib;opencv_imgproc300$(DebugSuffix).lib;opencv_features2d300$(DebugSuffix).lib;opencv_flann300$(DebugSuffix).lib;opencv_ml300$(DebugSuffix).lib;opencv_imgcodecs300$(DebugSuffix).lib;%(AdditionalDependencies)</AdditionalDependencies>
       <AdditionalLibraryDirectories>$(OpenCV_Lib);%(AdditionalLibraryDirectories);</AdditionalLibraryDirectories>
     </Link>
   </ItemDefinitionGroup>